Linux-HyperV List
 help / color / mirror / Atom feed
* Re: [PATCH net, v2] net: mana: Fix crash from unvalidated SHM offset read from BAR0 during FLR
From: Dipayaan Roy @ 2026-05-03  3:03 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
	ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260501185324.0f02dc72@kernel.org>

On Fri, May 01, 2026 at 06:53:24PM -0700, Jakub Kicinski wrote:
> On Wed, 29 Apr 2026 11:57:55 -0700 Dipayaan Roy wrote:
> > During Function Level Reset recovery, the MANA driver reads
> > hardware BAR0 registers that may temporarily contain garbage values.
> > The SHM (Shared Memory) offset read from GDMA_REG_SHM_OFFSET is used
> > to compute gc->shm_base, which is later dereferenced via readl() in
> > mana_smc_poll_register(). If the hardware returns an unaligned or
> > out-of-range value, the driver must not blindly use it, as this would
> > propagate the hardware error into a kernel crash.
> > 
> > The following crash was observed on an arm64 Hyper-V guest running
> > kernel 6.17.0-3013-azure during VF reset recovery triggered by HWC
> > timeout.
> > 
> > [13291.785274] Unable to handle kernel paging request at virtual address ffff8000a200001b
> > [13291.785311] Mem abort info:
> > [13291.785332]   ESR = 0x0000000096000021
> > [13291.785343]   EC = 0x25: DABT (current EL), IL = 32 bits
> > [13291.785355]   SET = 0, FnV = 0
> > [13291.785363]   EA = 0, S1PTW = 0
> > [13291.785372]   FSC = 0x21: alignment fault
> > [13291.785382] Data abort info:
> > [13291.785391]   ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000
> > [13291.785404]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > [13291.785412]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > [13291.785421] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000014df3a1000
> > [13291.785432] [ffff8000a200001b] pgd=1000000100438403, p4d=1000000100438403, pud=1000000100439403, pmd=0068000fc2000711
> > [13291.785703] Internal error: Oops: 0000000096000021 [#1]  SMP
> > [13291.830975] Modules linked in: tls qrtr mana_ib ib_uverbs ib_core xt_owner xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables cfg80211 8021q garp mrp stp llc binfmt_misc joydev serio_raw nls_iso8859_1 hid_generic aes_ce_blk aes_ce_cipher polyval_ce ghash_ce sm4_ce_gcm sm4_ce_ccm sm4_ce sm4_ce_cipher hid_hyperv sm4 sm3_ce sha3_ce hv_netvsc hid vmgenid hyperv_keyboard hyperv_drm sch_fq_codel nvme_fabrics efi_pstore dm_multipath nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vmw_vmci vsock dmi_sysfs ip_tables x_tables autofs4
> > [13291.862630] CPU: 122 UID: 0 PID: 61796 Comm: kworker/122:2 Tainted: G        W           6.17.0-3013-azure #13-Ubuntu VOLUNTARY
> > [13291.869902] Tainted: [W]=WARN
> > [13291.871901] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 01/08/2026
> > [13291.878086] Workqueue: events mana_serv_func
> > [13291.880718] pstate: 62400005 (nZCv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--)
> > [13291.884835] pc : mana_smc_poll_register+0x48/0xb0
> > [13291.887902] lr : mana_smc_setup_hwc+0x70/0x1c0
> > [13291.890493] sp : ffff8000ab79bbb0
> > [13291.892364] x29: ffff8000ab79bbb0 x28: ffff00410c8b5900 x27: ffff00410d630680
> > [13291.896252] x26: ffff004171f9fd80 x25: 000000016ed55000 x24: 000000017f37e000
> > [13291.899990] x23: 0000000000000000 x22: 000000016ed55000 x21: 0000000000000000
> > [13291.904497] x20: ffff8000a200001b x19: 0000000000004e20 x18: ffff8000a6183050
> > [13291.908308] x17: 0000000000000000 x16: 0000000000000000 x15: 000000000000000a
> > [13291.912542] x14: 0000000000000004 x13: 0000000000000000 x12: 0000000000000000
> > [13291.916298] x11: 0000000000000000 x10: 0000000000000001 x9 : ffffc45006af1bd8
> > [13291.920945] x8 : ffff000151129000 x7 : 0000000000000000 x6 : 0000000000000000
> > [13291.925293] x5 : 000000015f214000 x4 : 000000017217a000 x3 : 000000016ed50000
> > [13291.930436] x2 : 000000016ed55000 x1 : 0000000000000000 x0 : ffff8000a1ffffff
> > [13291.934342] Call trace:
> > [13291.935736]  mana_smc_poll_register+0x48/0xb0 (P)
> > [13291.938611]  mana_smc_setup_hwc+0x70/0x1c0
> > [13291.941113]  mana_hwc_create_channel+0x1a0/0x3a0
> > [13291.944283]  mana_gd_setup+0x16c/0x398
> > [13291.946584]  mana_gd_resume+0x24/0x70
> > [13291.948917]  mana_do_service+0x13c/0x1d0
> > [13291.951583]  mana_serv_func+0x34/0x68
> > [13291.953732]  process_one_work+0x168/0x3d0
> > [13291.956745]  worker_thread+0x2ac/0x480
> > [13291.959104]  kthread+0xf8/0x110
> > [13291.961026]  ret_from_fork+0x10/0x20
> > [13291.963560] Code: d2807d00 9417c551 71000673 54000220 (b9400281)
> > [13291.967299] ---[ end trace 0000000000000000 ]---
> > 
> > Disassembly of mana_smc_poll_register() around the crash site:
> > 
> > Disassembly of section .text:
> > 
> > 00000000000047c8 <mana_smc_poll_register>:
> >     47c8: d503201f        nop
> >     47cc: d503201f        nop
> >     47d0: d503233f        paciasp
> >     47d4: f800865e        str     x30, [x18], #8
> >     47d8: a9bd7bfd        stp     x29, x30, [sp, #-48]!
> >     47dc: 910003fd        mov     x29, sp
> >     47e0: a90153f3        stp     x19, x20, [sp, #16]
> >     47e4: 91007014        add     x20, x0, #0x1c
> >     47e8: 5289c413        mov     w19, #0x4e20
> >     47ec: f90013f5        str     x21, [sp, #32]
> >     47f0: 12001c35        and     w21, w1, #0xff
> >     47f4: 14000008        b       4814 <mana_smc_poll_register+0x4c>
> >     47f8: 36f801e1  tbz  w1, #31, 4834 <mana_smc_poll_register+0x6c>
> >     47fc: 52800042        mov     w2, #0x2
> >     4800: d280fa01        mov     x1, #0x7d0
> >     4804: d2807d00        mov     x0, #0x3e8
> >     4808: 94000000        bl      0 <usleep_range_state>
> >     480c: 71000673        subs    w19, w19, #0x1
> >     4810: 54000200        b.eq    4850 <mana_smc_poll_register+0x88>
> >     4814: b9400281      ldr   w1, [x20] <-- **** CRASHED HERE *****
> >     4818: d50331bf        dmb     oshld
> >     481c: 2a0103e2        mov     w2, w1
> >     ...
> > 
> > From the crash signature x20 = ffff8000a200001b, this address
> > ends in 0x1b which is not 4-byte aligned, so the 'ldr w1, [x20]'
> > instruction (readl) triggers the arm64 alignment fault (FSC = 0x21).
> > 
> > The root cause is in mana_gd_init_vf_regs(), which computes:
> > 
> >   gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
> > 
> > The offset is used without any validation.  The same problem exists
> > in mana_gd_init_pf_regs() for sriov_base_off and sriov_shm_off.
> > 
> > Fix this by validating all offsets before use:
> > 
> > - VF: check shm_off is within BAR0, properly aligned to 4 bytes
> >   (readl requirement), and leaves room for the full 256-bit
> >   (32-byte) SMC aperture.
> > 
> > - PF: check sriov_base_off is within BAR0, aligned to 8 bytes
> >   (readq requirement), and leaves room to safely read the
> >   sriov_shm_off register at sriov_base_off + GDMA_PF_REG_SHM_OFF.
> >   Then check sriov_shm_off leaves room for the full SMC aperture.
> >   All arithmetic uses subtraction rather than addition to avoid
> >   integer overflow on garbage firmware values.
> > 
> > without validating the offset read from hardware. If the register
> > returns a garbage value that is neither within bar 0 bounds nor aligned
> > to the 4-byte granularity, thus causing the alignment fault.
> > 
> > Define SMC_APERTURE_SIZE (32 bytes, derived from the 256-bit aperture
> > width)
> > 
> > Return -EPROTO on invalid values.  The existing recovery path in
> > mana_serv_reset() already handles -EPROTO by falling through to PCI
> > device rescan, giving the hardware another chance to present valid
> > register values after reset.
> > 
> > Fixes: 9bf66036d686 ("net: mana: Handle hardware recovery events when probing the device")
> > Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
> > 
> > ---
> > Changes in v2:
> > - Fix sriov_base_off alignment check: sizeof(u32) to sizeof(u64), since
> >   mana_gd_r64() (readq) requires 8-byte alignment on arm64.
> > - Fix sriov_base_off bounds: also verify enough space remains in BAR0
> >   to safely read sriov_shm_off at offset GDMA_PF_REG_SHM_OFF + 8 bytes.
> > - Fix integer overflow: rewrite bounds checks using subtraction
> >   (remaining = bar0_size - base) instead of addition.
> > - Fix SMC aperture size: add gc->bar0_size - shm_off < SMC_APERTURE_SIZE
> >   checks in both VF and PF paths; previously only the start address was
> >   validated, but mana_smc_poll_register() accesses up to shm_base + 0x1c
> >   (28 bytes from base, 32 bytes total).
> > - Export SMC_APERTURE_SIZE to shm_channel.h.
> > ---
> >  .../net/ethernet/microsoft/mana/gdma_main.c   | 40 ++++++++++++++++---
> >  include/net/mana/shm_channel.h                |  6 +++
> >  2 files changed, 41 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 098fbda0d128..d8e816882f02 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -43,8 +43,9 @@ static u64 mana_gd_r64(struct gdma_context *g, u64 offset)
> >  static int mana_gd_init_pf_regs(struct pci_dev *pdev)
> >  {
> >  	struct gdma_context *gc = pci_get_drvdata(pdev);
> > -	void __iomem *sriov_base_va;
> > +	u64 remaining_barsize;
> >  	u64 sriov_base_off;
> > +	u64 sriov_shm_off;
> >  
> >  	gc->db_page_size = mana_gd_r32(gc, GDMA_PF_REG_DB_PAGE_SIZE) & 0xFFFF;
> >  
> > @@ -73,10 +74,28 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev)
> >  	gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off;
> >  
> >  	sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF);
> > +	if (sriov_base_off >= gc->bar0_size ||
> > +	    gc->bar0_size - sriov_base_off <
> > +		GDMA_PF_REG_SHM_OFF + sizeof(u64) ||
> 
> nit: fits on a single line, I think?
>
It goes beyond the limit of 80, hence did it this way.
 
> > +	    !IS_ALIGNED(sriov_base_off, sizeof(u64))) {
> > +		dev_err(gc->dev,
> > +			"SRIOV base offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
> > +			sriov_base_off, (u64)gc->bar0_size);
> > +		return -EPROTO;
> > +	}
> >  
> > -	sriov_base_va = gc->bar0_va + sriov_base_off;
> > -	gc->shm_base = sriov_base_va +
> > -			mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF);
> > +	remaining_barsize = gc->bar0_size - sriov_base_off;
> > +	sriov_shm_off = mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF);
> > +	if (sriov_shm_off >= remaining_barsize ||
> > +	    remaining_barsize - sriov_shm_off < SMC_APERTURE_SIZE ||
> > +	    !IS_ALIGNED(sriov_shm_off, sizeof(u32))) {
> > +		dev_err(gc->dev,
> > +			"SRIOV SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
> > +			sriov_shm_off, (u64)gc->bar0_size);
> > +		return -EPROTO;
> > +	}
> > +
> > +	gc->shm_base = gc->bar0_va + sriov_base_off + sriov_shm_off;
> >  
> >  	return 0;
> >  }
> > @@ -84,6 +103,7 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev)
> >  static int mana_gd_init_vf_regs(struct pci_dev *pdev)
> >  {
> >  	struct gdma_context *gc = pci_get_drvdata(pdev);
> > +	u64 shm_off;
> >  
> >  	gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF;
> >  
> > @@ -111,7 +131,17 @@ static int mana_gd_init_vf_regs(struct pci_dev *pdev)
> >  	gc->db_page_base = gc->bar0_va + gc->db_page_off;
> >  	gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off;
> >  
> > -	gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
> > +	shm_off = mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
> > +	if (shm_off >= gc->bar0_size ||
> > +	    gc->bar0_size - shm_off < SMC_APERTURE_SIZE ||
> > +	    !IS_ALIGNED(shm_off, sizeof(u32))) {
> > +		dev_err(gc->dev,
> > +			"SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
> > +			shm_off, (u64)gc->bar0_size);
> > +		return -EPROTO;
> > +	}
> > +
> > +	gc->shm_base = gc->bar0_va + shm_off;
> >  
> >  	return 0;
> >  }
> > diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h
> > index 5199b41497ff..dbabcfb95daf 100644
> > --- a/include/net/mana/shm_channel.h
> > +++ b/include/net/mana/shm_channel.h
> > @@ -4,6 +4,12 @@
> >  #ifndef _SHM_CHANNEL_H
> >  #define _SHM_CHANNEL_H
> >  
> > +#define SMC_APERTURE_BITS 256
> > +#define SMC_BASIC_UNIT (sizeof(u32))
> > +#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8))
> > +#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1)
> > +#define SMC_APERTURE_SIZE  (SMC_APERTURE_BITS / 8)
> 
> AI bots complain that we're redefining this.
> Since it's a fix I think it's better to remove the existing definition
> even if it lives in a driver that goes via a different tree.
> 
Ack, removed this in the next version.
> >  struct shm_channel {
> >  	struct device *dev;
> >  	void __iomem *base;
> -- 
> pw-bot: cr

Hi Jakub,

Thanks for the comments, I have shared v3 addressing it.

Regards
Dipayaan Roy


^ permalink raw reply

* Re: [PATCH rc 00/15] Various bug fixes for RDMA drivers in the uapi functions
From: Jason Gunthorpe @ 2026-05-02 18:39 UTC (permalink / raw)
  To: Andrew Lunn, Broadcom internal kernel review list, Bryan Tan,
	Eric Dumazet, Junxian Huang, Konstantin Taranov, Jakub Kicinski,
	Leon Romanovsky, linux-hyperv, linux-rdma, netdev, Paolo Abeni,
	Selvin Xavier, Chengchang Tang, Tariq Toukan, Vishnu Dasa,
	Yishai Hadas
  Cc: Abhijit Gangurde, Adit Ranadive, Allen Hubbe, Andrew Boyer,
	Aditya Sarwade, Brad Spengler, Bryan Tan, David S. Miller,
	Dexuan Cui, Doug Ledford, George Zhang, Jorgen Hansen, Jianbo Liu,
	Kai Aizen, Leon Romanovsky, Leon Romanovsky, Yixian Liu, Long Li,
	Lijun Ou, Parav Pandit, patches, Roland Dreier, Roland Dreier,
	Sagi Grimberg, Ajay Sharma, stable, Tariq Toukan, Wei Hu (Xavier),
	Shaobo Xu, Nenglong Zhao
In-Reply-To: <0-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com>

On Tue, Apr 28, 2026 at 01:17:33PM -0300, Jason Gunthorpe wrote:
> All were found by Sashiko or Claude AI tools. They vary in severity, but
> are all things that shouldn't be present.
> 
> Jason Gunthorpe (15):
>   RDMA/ionic: Fix typo in format string
>   RDMA/mlx5: Restore zero-init to mlx5_ib_modify_qp() ucmd
>   RDMA/mlx5: Add missing store/release for lock elision pattern
>   RDMA/mana: Validate rx_hash_key_len
>   RDMA/mana: Remove user triggerable WARN_ON() in
>     mana_ib_create_qp_rss()
>   RDMA/mana: Fix mana_destroy_wq_obj() cleanup in
>     mana_ib_create_qp_rss()
>   RDMA/mana: Fix error unwind in mana_ib_create_qp_rss()
>   RDMA/ocrdma: Clarify the mm_head searching
>   RDMA/ocrdma: Don't NULL deref uctx on errors in ocrdma_copy_pd_uresp()
>   RDMA/vmw_pvrdma: Fix double free on pvrdma_alloc_ucontext() error path
>   RDMA/mlx4: Fix resource leak on error in mlx4_ib_create_srq()
>   RDMA/mlx4: Fix mis-use of RCU in mlx4_srq_event()
>   RDMA/hns: Fix xarray race in hns_roce_create_srq()
>   RDMA/hns: Fix xarray race in hns_roce_create_qp_common()
>   RDMA/hns: Fix unlocked call to hns_roce_qp_remove()
> 
>  drivers/infiniband/hw/hns/hns_roce_qp.c         | 13 ++++++++++---
>  drivers/infiniband/hw/hns/hns_roce_srq.c        | 12 ++++++------
>  drivers/infiniband/hw/ionic/ionic_ibdev.c       |  2 +-
>  drivers/infiniband/hw/mana/cq.c                 |  5 +++--
>  drivers/infiniband/hw/mana/qp.c                 | 16 ++++++++++------
>  drivers/infiniband/hw/mlx4/srq.c                |  4 +++-
>  drivers/infiniband/hw/mlx5/main.c               |  8 ++++----
>  drivers/infiniband/hw/mlx5/qp.c                 |  2 +-
>  drivers/infiniband/hw/mlx5/umr.c                |  4 ++--
>  drivers/infiniband/hw/ocrdma/ocrdma_verbs.c     |  8 ++++----
>  drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c |  2 +-
>  drivers/net/ethernet/mellanox/mlx4/srq.c        | 13 +++++++------
>  12 files changed, 52 insertions(+), 37 deletions(-)

Applied to for-rc

Jason

^ permalink raw reply

* Re: [PATCH net v2] net: mana: Optimize irq affinity for low vcpu configs
From: Yury Norov @ 2026-05-02 17:15 UTC (permalink / raw)
  To: Shradha Gupta
  Cc: Dexuan Cui, Wei Liu, Haiyang Zhang, K. Y. Srinivasan, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
	Dipayaan Roy, Shiraz Saleem, Michael Kelley, Long Li, Yury Norov,
	linux-hyperv, linux-kernel, netdev, Paul Rosswurm, Shradha Gupta,
	Saurabh Singh Sengar, stable
In-Reply-To: <afYMN6vbiX7Rzss+@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>

On Sat, May 02, 2026 at 07:37:43AM -0700, Shradha Gupta wrote:
> On Fri, May 01, 2026 at 12:22:20PM -0400, Yury Norov wrote:
> > On Wed, Apr 29, 2026 at 02:06:37AM -0700, Shradha Gupta wrote:
> > > In mana driver, the number of IRQs allocated is capped by the
> > > min(num_cpu + 1, queue count). In cases, where the IRQ count is greater
> > > than the vcpu count, we want to utilize all the vCPUs, irrespective of
> > > their NUMA/core bindings.
> > > 
> > > This is important, especially in the envs where number of vCPUs are so
> > > few that the softIRQ handling overhead on two IRQs on the same vCPU is
> > > much more than their overheads if they were spread across sibling vCPUs.
> > > 
> > > This behaviour is more evident with dynamic IRQ allocation. Since MANA
> > > IRQs are assigned at a later stage compared to static allocation, other
> > > device IRQs may already be affinitized to the vCPUs. As a result, IRQ
> > > weights become imbalanced, causing multiple MANA IRQs to land on the
> > > same vCPU, while some vCPUs have none.
> > > 
> > > In such cases when many parallel TCP connections are tested, the
> > > throughput drops significantly.
> > > 
> > > Test envs:
> > > =======================================================
> > > Case 1: without this patch
> > > =======================================================
> > > 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
> > > 
> > > 	TYPE		effective vCPU aff
> > > =======================================================
> > > IRQ0:	HWC		0
> > > IRQ1:	mana_q1		0
> > > IRQ2:	mana_q2		2
> > > IRQ3:	mana_q3		0
> > > IRQ4:	mana_q4		3
> > > 
> > > %soft on each vCPU(mpstat -P ALL 1) on receiver
> > > vCPU		0	1	2	3
> > > =======================================================
> > > pass 1:		38.85	0.03	24.89	24.65
> > > pass 2:		39.15	0.03	24.57	25.28
> > > pass 3:		40.36	0.03	23.20	23.17
> > > 
> > > =======================================================
> > > Case 2: with this patch
> > > =======================================================
> > > 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
> > > 
> > >         TYPE            effective vCPU aff
> > > =======================================================
> > > IRQ0:   HWC             0
> > > IRQ1:   mana_q1         0
> > > IRQ2:   mana_q2         1
> > > IRQ3:   mana_q3         2
> > > IRQ4:   mana_q4         3
> > > 
> > > %soft on each vCPU(mpstat -P ALL 1) on receiver
> > > vCPU            0       1       2       3
> > > =======================================================
> > > pass 1:         15.42	15.85	14.99	14.51
> > > pass 2:         15.53	15.94	15.81	15.93
> > > pass 3:         16.41	16.35	16.40	16.36
> > > 
> > > =======================================================
> > > Throughput Impact(in Gbps, same env)
> > > =======================================================
> > > TCP conn	with patch	w/o patch
> > > 20480		15.65		7.73
> > > 10240		15.63		8.93
> > > 8192		15.64		9.69
> > > 6144		15.64		13.16
> > > 4096		15.69		15.75
> > > 2048		15.69		15.83
> > > 1024		15.71		15.28
> > > 
> > > Fixes: 755391121038 ("net: mana: Allocate MSI-X vectors dynamically")
> > > Cc: stable@vger.kernel.org
> > > Co-developed-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> > > Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> > > Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > > Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> > > ---
> > > Changes in v2
> > >  * Removed the unused skip_first_cpu variable
> > >  * fixed exit condition in irq_setup_linear() with len == 0
> > >  * changed return type of irq_setup_linear() as it will always be 0
> > >  * removed the unnecessary rcu_read_lock() in irq_setup_linear()
> > >  * added appropriate comments to indicate expected behaviour when
> > >    IRQs are more than or equal to num_online_cpus()
> > > ---
> > >  .../net/ethernet/microsoft/mana/gdma_main.c   | 47 ++++++++++++++++---
> > >  1 file changed, 40 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > index 098fbda0d128..d740d1dc43da 100644
> > > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > @@ -167,6 +167,8 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
> > >  	} else {
> > >  		/* If dynamic allocation is enabled we have already allocated
> > >  		 * hwc msi
> > > +		 * Also, we make sure in this case the following is always true
> > > +		 * (num_msix_usable - 1 HWC) <= num_online_cpus()
> > >  		 */
> > >  		gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1);
> > >  	}
> > > @@ -1672,11 +1674,24 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node,
> > >  	return 0;
> > >  }
> > >  
> > > +/* should be called with cpus_read_lock() held */
> > > +static void irq_setup_linear(unsigned int *irqs, unsigned int len)
> > > +{
> > > +	int cpu;
> > > +
> > > +	for_each_online_cpu(cpu) {
> > > +		if (len == 0)
> > > +			break;
> > > +
> > > +		irq_set_affinity_and_hint(*irqs++, cpumask_of(cpu));
> > > +		len--;
> > > +	}
> > > +}
> > > +
> > >  static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> > >  {
> > >  	struct gdma_context *gc = pci_get_drvdata(pdev);
> > >  	struct gdma_irq_context *gic;
> > > -	bool skip_first_cpu = false;
> > >  	int *irqs, irq, err, i;
> > >  
> > >  	irqs = kmalloc_objs(int, nvec);
> > 
> > So what about WARN_ON() and nvec adjustment before kmalloc?
> Hey Yury,
> 
> I am still a bit unsure about the WARN_ON() before kmalloc, as after
> that also, in the same function till we take the cpus_read_lock() the
> num_online_cpus() can change(or reduce). That's why I introduced the
> dev_dbg() to capture hot-remove edge case.

OK.
 
> Do you still think it adds more value?

It's your driver, so you know better. I just wonder because you said
it's good to add WARN_ON(), and then didn't do that.

> > 
> > > @@ -1722,13 +1737,31 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> > >  	 * first CPU sibling group since they are already affinitized to HWC IRQ
> > >  	 */
> > >  	cpus_read_lock();
> > > -	if (gc->num_msix_usable <= num_online_cpus())
> > > -		skip_first_cpu = true;
> > > +	if (gc->num_msix_usable <= num_online_cpus()) {
> > > +		err = irq_setup(irqs, nvec, gc->numa_node, true);
> > > +		if (err) {
> > > +			cpus_read_unlock();
> > > +			goto free_irq;
> > 
> > One thing puzzles me: if you skip first CPU with this 'true', and the
> > gc->num_msix_usable == num_online_cpus(), it's one more than you can
> > distribute. What do I miss?
> > 
> 
> Let me explain this case a bit better then,
> 
> - num_msix_usable = HWC IRQ + Queue IRQ
> - nvec in this functions is only Queue IRQ (HWC already setup)
> 
> When num_online_cpus == num_msix_usable:
> - nvec = num_online_cpus - 1
> - first CPU is already assigned to HWC IRQ, so skip it
> - Queue IRQs fit in the remaining CPUs
> 
> please let me know if I did not get your question right

Can you put that in a comment?

> > > +		}
> > > +	} else {
> > > +		/*
> > > +		 * When num_msix_usable are more than num_online_cpus, we try to
> > > +		 * make sure we are using all vcpus. In such a case NUMA or
> > > +		 * CPU core affinity does not matter.
> > 
> > If it doesn't matter, why don't you assign each IRQ to all CPUs then?
> > In theory, the system would have most of flexibility to balance them.
> > 
> 
> Okay, let me fix the comment and elaborate on this. It doesn't matter
> because in such a case we want to anyway exhaust and distribute the
> Queue IRQs to all vCPUs.
> We don't want to rely on the system's balancer in this case as it could
> be skewed by other devices' IRQ weights

I don't understand this. If I want to reserve some CPUs to solely
handle IRQs from my high-priority hardware, then I configure my system
accordingly. For example, assign all non-networking IRQs on CPU0, and
all networking IRQs to all CPUs.

In your case, you distribute IRQs evenly, which means you've no
preferred CPUs. So, assuming the system is only running your IRQ
driver, it's at max is as good as all-CPU distribution. In case of
heavy loading some particular CPU, your scheme could cause
corresponding IRQs to starve.

I recall, when we was working on irq_setup(), the original idea was to
distribute IRQs one-to-one, but than I suggested the 

        irq_set_affinity_and_hint(*irqs++, topology_sibling_cpumask(cpu));

and after experiments, you agreed on that.

Can you please run your throughput test for my suggested distribution
too? Would be also nice to see how each distribution works when some
CPUs are under stress.

Thanks,
Yury

^ permalink raw reply

* Re: [PATCH 0/3] net: mana: Fix mana_destroy_rxq() cleanup for partial RXQ init
From: Simon Horman @ 2026-05-02 16:52 UTC (permalink / raw)
  To: Dipayaan Roy
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, shradhagupta, ssengar,
	ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260430035935.1859220-1-dipayanroy@linux.microsoft.com>

On Wed, Apr 29, 2026 at 08:57:51PM -0700, Dipayaan Roy wrote:
> When mana_create_rxq() fails partway through initialization (e.g. the
> hardware rejects the WQ object creation), the error path calls
> mana_destroy_rxq() to tear down a partially-initialized RXQ.
> This exposed multiple issues in mana_destroy_rxq() path, as it assumed
> the RXQ was always fully initialized, leading to multiple issues:
> 
> 1. xdp_rxq_info_unreg() was called on an unregistered xdp_rxq,
>    triggering a WARN_ON ("Driver BUG") in net/core/xdp.c.
> 
> 2. mana_destroy_wq_obj() was called with INVALID_MANA_HANDLE,
>    sending a bogus destroy command to the hardware.
> 
> 3. mana_deinit_cq() was called twice — once inside mana_destroy_rxq()
>    and again in mana_create_rxq()'s error path — causing a
>    use-after-free since mana_destroy_rxq() frees the rxq first.
> 
> This was observed during ethtool ring parameter changes when the
> hardware returned an error creating the RXQ. This series makes
> mana_destroy_rxq() safe to call at any stage of RXQ initialization
> by guarding each teardown step, and removes the redundant cleanup
> in mana_create_rxq().

For the series:

Reviewed-by: Simon Horman <horms@kernel.org>

I don't think that you need to repost for this. But please keep in mind for
future submissions that fixes for code present in the net tree should be
targeted at that tree, like this:

Subject: [PATCH net vN/M] ...

Also, FTR, there is an AI generated review of this patch-set available
on sashiko.dev. It seems to me that the issues flagged there pre-date
this patch-set and should not block progress of it. But you may wish
to use that review as the basis of some follow-up.

^ permalink raw reply

* Re: [PATCH net-next v6 2/6] net: mana: Query device capabilities and configure MSI-X sharing for EQs
From: Simon Horman @ 2026-05-02 15:26 UTC (permalink / raw)
  To: longli
  Cc: 'Simon Horman', kotaranov, kuba, davem, pabeni, edumazet,
	andrew+netdev, jgg, leon, haiyangz, kys, wei.liu, decui, netdev,
	linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260429221625.1841150-3-longli@microsoft.com>

From: 'Simon Horman' <horms@kernel.org>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://netdev-ai.bots.linux.dev/sashiko/

I apologise for any overlap with this and the review of this
patchset available at sashiko.dev. They both seem relevant and
I'm unsure how to cleanly pass them both on.

Both Sashiko instances also flagged concerns in the remaining
patches of this patchset. But I am not forwarding them as they
seem to be lower priority, and I think I've already created
enough noise here.

---
net: mana: Query device capabilities and configure MSI-X sharing for EQs

This patch queries device capabilities early in GDMA probe and chooses
between dedicated MSI-X per vPort or a shared-MSI-X mode based on the
available vector count, introducing gc->max_num_queues_vport,
gc->msi_sharing and gc->msi_bitmap, and renaming mana_query_device_cfg()
to mana_gd_query_device_cfg() so it can be called from the GDMA layer.

> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 098fbda0d128..b96859e0aec9 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -149,6 +149,9 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
>  	struct gdma_context *gc = pci_get_drvdata(pdev);
>  	struct gdma_query_max_resources_resp resp = {};
>  	struct gdma_general_req req = {};
> +	unsigned int max_num_queues;
> +	u8 bm_hostmode;
> +	u16 num_ports;
>  	int err;
>  
>  	mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
> @@ -194,6 +197,40 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
>  	if (gc->max_num_queues > gc->num_msix_usable - 1)
>  		gc->max_num_queues = gc->num_msix_usable - 1;
>  
> +	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
> +				       MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
> +	if (err)
> +		return err;

Is this the same MANA_QUERY_DEV_CONFIG that mana_probe() issues shortly
after?  mana_probe() still contains:

	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
				       MANA_MICRO_VERSION, &num_ports, &bm_hostmode);

so the same HWC round-trip runs twice on every probe with identical
arguments, both populating gc->adapter_mtu.

The commit message frames this rename as consolidation ("used at GDMA
device probe time"), but the mana_probe() call-site is not removed.  Was
one of the two call-sites meant to go away?

The relocation of debugfs_create_u16("adapter-MTU", ...) from
mana_gd_query_device_cfg() into mana_probe() only makes sense if the
function is now invoked more than once, which hints at the same
duplication.

> +
> +	if (!num_ports)
> +		return -EINVAL;
> +
> +	/*
> +	 * Adjust gc->max_num_queues returned from the SOC to allow dedicated
> +	 * MSIx for each vPort. Clamp to no less than MANA_DEF_NUM_QUEUES.
> +	 */
> +	max_num_queues = (gc->num_msix_usable - 1) / num_ports;
> +	max_num_queues = roundup_pow_of_two(max(max_num_queues, 1U));
> +	if (max_num_queues < MANA_DEF_NUM_QUEUES)
> +		max_num_queues = MANA_DEF_NUM_QUEUES;
> +
> +	/*
> +	 * Use dedicated MSIx for EQs whenever possible, use MSIx sharing for
> +	 * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1)
> +	 */
> +	max_num_queues = min(gc->max_num_queues, max_num_queues);
> +	if (max_num_queues * num_ports > gc->num_msix_usable - 1)
> +		gc->msi_sharing = true;

Is gc->msi_sharing ever reset to false?  The only two writers are this
line and mana_gd_setup_hwc_irqs() (the !pci_msix_can_alloc_dyn branch),
and both only set it to true.  mana_gd_remove_irqs() frees msi_bitmap
and zeros max_num_msix / num_msix_usable, but does not clear
msi_sharing, and the gdma_context survives mana_gd_suspend() /
mana_gd_resume().

Once true is latched in any setup cycle, mana_gd_setup() will always
take the sharing branch on resume even if the recomputed
max_num_queues * num_ports fits within num_msix_usable - 1.  Is that
consistent with the commit message saying sharing "is only enabled when
there are not enough MSI-X vectors for dedicated allocation"?

> +
> +	/* If MSI is shared, use max allowed value */
> +	if (gc->msi_sharing)
> +		gc->max_num_queues_vport = min(gc->num_msix_usable - 1, gc->max_num_queues);
> +	else
> +		gc->max_num_queues_vport = max_num_queues;
> +
> +	dev_info(gc->dev, "MSI sharing mode %d max queues %d\n",
> +		 gc->msi_sharing, gc->max_num_queues);

Should this print gc->max_num_queues_vport rather than
gc->max_num_queues?  The block immediately above computes
gc->max_num_queues_vport and leaves gc->max_num_queues unchanged, and
mana_probe_port() sizes alloc_etherdev_mq() and apc->max_queues from
gc->max_num_queues_vport:

	ndev = alloc_etherdev_mq(sizeof(struct mana_port_context),
				 gc->max_num_queues_vport);
	...
	apc->max_queues = gc->max_num_queues_vport;
	apc->num_queues = min(gc->max_num_queues_vport, MANA_DEF_NUM_QUEUES);

so the logged value does not match the per-vPort queue count the driver
actually exposes.

> +
>  	return 0;
>  }
>  
> @@ -1856,6 +1893,7 @@ static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev)

[ ... ]

> @@ -1968,20 +2008,30 @@ static int mana_gd_setup(struct pci_dev *pdev)
>  	if (err)
>  		goto destroy_hwc;
>  
> -	err = mana_gd_query_max_resources(pdev);
> +	err = mana_gd_detect_devices(pdev);
>  	if (err)
>  		goto destroy_hwc;
>  
> -	err = mana_gd_setup_remaining_irqs(pdev);
> -	if (err) {
> -		dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err);
> -		goto destroy_hwc;
> -	}
> -
> -	err = mana_gd_detect_devices(pdev);
> +	err = mana_gd_query_max_resources(pdev);
>  	if (err)
>  		goto destroy_hwc;
>  
> +	if (!gc->msi_sharing) {
> +		gc->msi_bitmap = bitmap_zalloc(gc->num_msix_usable, GFP_KERNEL);
> +		if (!gc->msi_bitmap) {
> +			err = -ENOMEM;
> +			goto destroy_hwc;
> +		}
> +		/* Set bit for HWC */
> +		set_bit(0, gc->msi_bitmap);
> +	} else {
> +		err = mana_gd_setup_remaining_irqs(pdev);
> +		if (err) {
> +			dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err);
> +			goto destroy_hwc;
> +		}
> +	}
> +

Can the driver bring up any vPort after this change when the !msi_sharing
branch is taken?

In the dedicated branch, only gc->msi_bitmap is allocated and bit 0 is
set for HWC.  mana_gd_setup_remaining_irqs() is skipped, so no
gdma_irq_context is inserted into gc->irq_contexts for indices 1..
num_msix_usable-1.

Later, mana_create_eq() still assigns

	spec.eq.msix_index = (i + 1) % gc->num_msix_usable;

and mana_gd_register_irq() does:

	gic = xa_load(&gc->irq_contexts, msi_index);
	if (WARN_ON(!gic))
		return -EINVAL;

On a typical cloud SKU with, for example, num_msix_usable=32,
num_ports=1 and num_online_cpus=16, the new math keeps msi_sharing=false
(16 * 1 <= 31), so every EQ-create goes down this path and hits the
WARN_ON.  Doesn't that make every vPort open and every resume fail for
the common dedicated-MSI-X case?

The msi_bitmap allocated here is not consumed anywhere in this commit;
the on-demand allocation via mana_gd_get_gic() appears in the later
commit "net: mana: Allocate interrupt context for each EQ when creating
vPort" (dbbdf40a8974).  Should the bitmap and the new branch be
introduced in the same commit that actually uses them, so each commit in
the series is independently bootable?

>  	dev_dbg(&pdev->dev, "mana gdma setup successful\n");
>  	return 0;
>

^ permalink raw reply

* Re: [PATCH net-next v6 1/6] net: mana: Create separate EQs for each vPort
From: Simon Horman @ 2026-05-02 15:29 UTC (permalink / raw)
  To: longli
  Cc: kotaranov, kuba, davem, pabeni, edumazet, andrew+netdev, jgg,
	leon, haiyangz, kys, wei.liu, decui, netdev, linux-rdma,
	linux-hyperv, linux-kernel
In-Reply-To: <20260502152354.289044-2-horms@kernel.org>

On Sat, May 02, 2026 at 04:23:55PM +0100, Simon Horman wrote:
> From: 'Simon Horman' <horms@kernel.org>
> 
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> Full review at: https://sashiko.dev

Sorry about this, there was supposed to be some different text here.

This review is available at https://netdev-ai.bots.linux.dev/sashiko/
And I apologise that it overlaps with the review from https://sashiko.dev
which I also posted.

^ permalink raw reply

* Re: [PATCH net-next v6 1/6] net: mana: Create separate EQs for each vPort
From: Simon Horman @ 2026-05-02 15:23 UTC (permalink / raw)
  To: longli
  Cc: 'Simon Horman', kotaranov, kuba, davem, pabeni, edumazet,
	andrew+netdev, jgg, leon, haiyangz, kys, wei.liu, decui, netdev,
	linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260429221625.1841150-2-longli@microsoft.com>

From: 'Simon Horman' <horms@kernel.org>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
net: mana: Create separate EQs for each vPort

This change moves EQ ownership from struct mana_context to struct
mana_port_context so each vPort can have dedicated EQs, and exports
mana_create_eq()/mana_destroy_eq() for use by the MANA RDMA driver.

> diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
> index ac5e75dd3494..60cc02e4ad10 100644
> --- a/drivers/infiniband/hw/mana/main.c
> +++ b/drivers/infiniband/hw/mana/main.c
> @@ -20,8 +20,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
>  	pd->vport_use_count--;
>  	WARN_ON(pd->vport_use_count < 0);
>  
> -	if (!pd->vport_use_count)
> +	if (!pd->vport_use_count) {
> +		mana_destroy_eq(mpc);
>  		mana_uncfg_vport(mpc);
> +	}
>  
>  	mutex_unlock(&pd->vport_mutex);
>  }
> @@ -55,15 +57,21 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
>  		return err;
>  	}
>  
> -	mutex_unlock(&pd->vport_mutex);
>  
>  	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
>  	pd->tx_vp_offset = mpc->tx_vp_offset;
> +	err = mana_create_eq(mpc);
> +	if (err) {
> +		mana_uncfg_vport(mpc);
> +		pd->vport_use_count--;
> +	}
> +
> +	mutex_unlock(&pd->vport_mutex);
>  
>  	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
>  		  mpc->port_handle, pd->pdn, doorbell_id);

On the failure path, mana_uncfg_vport() is called and pd->vport_use_count
is decremented, but execution falls through to the ibdev_dbg() line that
prints the vport handle as if it were a success. Should this debug print
be skipped on the error path so logs are not ambiguous?

Also, pd->tx_shortform_allowed and pd->tx_vp_offset are assigned from mpc
before the mana_create_eq() attempt and are not cleared when that call
fails. Callers are expected to gate on pd->vport_use_count, but is it
worth resetting these fields on failure so the state matches the
pre-patch behavior?

The mutex_unlock(&pd->vport_mutex) has moved from before the field
assignments to after mana_create_eq(). That now holds pd->vport_mutex
across GFP_KERNEL allocations, debugfs directory creation, and multiple
mana_gd_create_mana_eq() firmware RPCs, and establishes the lock order
pd->vport_mutex -> apc->vport_mutex via mana_uncfg_vport() on the error
path. Is the extended critical section intentional?

>  
> -	return 0;
> +	return err;
>  }
>  
>  int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
> diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
> index 645581359cee..6f1043383e8c 100644
> --- a/drivers/infiniband/hw/mana/qp.c
> +++ b/drivers/infiniband/hw/mana/qp.c
> @@ -168,7 +168,15 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
>  		cq_spec.gdma_region = cq->queue.gdma_region;
>  		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
>  		cq_spec.modr_ctx_id = 0;
> -		eq = &mpc->ac->eqs[cq->comp_vector];
> +		/* EQs are created when a raw QP configures the vport.
> +		 * A raw QP must be created before creating rwq_ind_tbl.
> +		 */
> +		if (!mpc->eqs) {
> +			ret = -EINVAL;
> +			i--;
> +			goto fail;
> +		}
> +		eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];

Can the mpc->eqs read race with a concurrent free? mana_ib_create_qp_rss()
runs without pd->vport_mutex or RTNL, but mpc->eqs is freed by
mana_destroy_eq() from two paths:

  mana_ib_uncfg_vport()   (under pd->vport_mutex, on last raw-QP destroy)
  mana_dealloc_queues()   (under RTNL, on netdev down)

both of which do:

  kfree(apc->eqs);
  apc->eqs = NULL;

with no RCU grace period or reader-visible synchronization. If CPU-A
passes the !mpc->eqs check after CPU-B begins ip link set dev X down,
does CPU-A then dereference freed memory via mpc->eqs[...].eq->id?

Separately, what populates mpc->eqs for an RDMA-only RSS QP user on a
probed-but-idle Ethernet port? Pre-patch mana_probe() called
mana_create_eq(ac) unconditionally, so ac->eqs existed for the device
lifetime. After this patch the only creators are mana_alloc_queues()
(Ethernet up) and mana_ib_cfg_vport() (raw QP). An RDMA application that
uses only RSS QPs and never creates a raw QP will now get -EINVAL here
where it used to succeed. Is this intended, and should the commit log
mention it?

The adjacent comment:

   /* EQs are created when a raw QP configures the vport.
    * A raw QP must be created before creating rwq_ind_tbl.
    */

omits the Ethernet-up path that also populates mpc->eqs. Would it be
clearer to describe both creators?

There is also a behavior change in the index expression:

   eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];

Pre-patch this was ac->eqs[cq->comp_vector] sized by gc->max_num_queues.
Now comp_vector is folded modulo mpc->num_queues, which is tunable via
ethtool -L. Userspace that used distinct comp_vector values to hit
distinct EQs will silently alias when comp_vector >= num_queues. Should
this be documented or rejected with -EINVAL rather than silently
wrapped?

Can mpc->num_queues be 0 at this point? mana_set_channels() does not
reject new_count==0, and kzalloc_objs(struct mana_eq, 0) returns
ZERO_SIZE_PTR, which passes the !mpc->eqs check. During the window
between mana_create_eq() and the subsequent netif_set_real_num_tx_queues()
failing, a concurrent RDMA QP create would compute
cq->comp_vector % 0 here. Should mpc->num_queues be validated alongside
mpc->eqs?

The placement of the !mpc->eqs check is inside the per-iteration loop
over ind_tbl_size, but mpc->eqs cannot change across iterations, so the
check is only meaningful at i==0. It works today because i-- then makes
i = -1 and the cleanup while (i-- > 0) skips. Would hoisting the check
above the loop be clearer and less fragile against refactoring?

>  		cq_spec.attached_eq = eq->eq->id;
>  
>  		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
> @@ -317,7 +325,11 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
>  	cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE;
>  	cq_spec.modr_ctx_id = 0;
>  	eq_vec = send_cq->comp_vector;
> -	eq = &mpc->ac->eqs[eq_vec];
> +	if (!mpc->eqs) {
> +		err = -EINVAL;
> +		goto err_destroy_queue;
> +	}
> +	eq = &mpc->eqs[eq_vec % mpc->num_queues];

The same mpc->eqs TOCTOU concern applies here: the check and subsequent
dereference occur without pd->vport_mutex or RTNL held, so a concurrent
mana_dealloc_queues() on the Ethernet side can free mpc->eqs between the
check and the index access. Is there synchronization that prevents this?

The same comp_vector % num_queues aliasing and num_queues==0 divide
concerns apply here as well.

>  	cq_spec.attached_eq = eq->eq->id;
>  
>  	err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec,
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index a654b3699c4c..6c709f8b875d 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c

[ ... ]

> -static int mana_create_eq(struct mana_context *ac)
> +int mana_create_eq(struct mana_port_context *apc)
>  {
> -	struct gdma_dev *gd = ac->gdma_dev;
> +	struct gdma_dev *gd = apc->ac->gdma_dev;
>  	struct gdma_context *gc = gd->gdma_context;
>  	struct gdma_queue_spec spec = {};
>  	int err;
>  	int i;
>  
> -	ac->eqs = kzalloc_objs(struct mana_eq, gc->max_num_queues);
> -	if (!ac->eqs)
> +	WARN_ON(apc->eqs);
> +	apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
> +	if (!apc->eqs)
>  		return -ENOMEM;

WARN_ON(apc->eqs) is used as a defensive assertion but execution
proceeds to overwrite apc->eqs unconditionally. If the invariant is
violated, the previous array and all HW EQs it referenced are leaked.
Should this instead bail out with an error or call mana_destroy_eq()
first to keep the function idempotent against future callers?

[ ... ]

> @@ -3326,6 +3349,9 @@ static int mana_dealloc_queues(struct net_device *ndev)
>  	mana_fence_rqs(apc);
>  
>  	/* Even in err case, still need to cleanup the vPort */
> +	mana_destroy_rxqs(apc);
> +	mana_destroy_txq(apc);
> +	mana_destroy_eq(apc);
>  	mana_destroy_vport(apc);

What is the intended interaction with outstanding RDMA QPs here? An
RDMA user that created an RSS QP while the Ethernet netdev was up sees
mpc->eqs populated by mana_alloc_queues(). When the admin issues
ip link set dev N down, mana_dealloc_queues() -> mana_destroy_eq()
destroys those EQs and frees mpc->eqs while the RDMA QPs are still
alive, leaving the QPs with dangling attached_eq IDs at the hardware
level and stale kernel references.

Pre-patch ac->eqs lived for the full mana_context lifetime and was torn
down only in mana_remove(). Is unconditionally destroying the EQs on
netdev-down the intended new behavior, and if so how are concurrent
RDMA consumers expected to recover?

>  	return 0;
>  }

^ permalink raw reply

* Re: [PATCH net-next v6 2/6] net: mana: Query device capabilities and configure MSI-X sharing for EQs
From: Simon Horman @ 2026-05-02 15:08 UTC (permalink / raw)
  To: longli
  Cc: 'Simon Horman', kotaranov, kuba, davem, pabeni, edumazet,
	andrew+netdev, jgg, leon, haiyangz, kys, wei.liu, decui, netdev,
	linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260429221625.1841150-3-longli@microsoft.com>

From: 'Simon Horman' <horms@kernel.org>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
net: mana: Query device capabilities and configure MSI-X sharing for EQs

When querying the device, this adjusts the max number of queues to allow
dedicated MSI-X vectors for each vPort, clamping to no less than
MANA_DEF_NUM_QUEUES. MSI-X sharing among vPorts is disabled by default
and enabled only when there are not enough vectors. It also renames
mana_query_device_cfg() to mana_gd_query_device_cfg().

> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c

[ ... ]

> +	/*
> +	 * Adjust gc->max_num_queues returned from the SOC to allow dedicated
> +	 * MSIx for each vPort. Clamp to no less than MANA_DEF_NUM_QUEUES.
> +	 */
> +	max_num_queues = (gc->num_msix_usable - 1) / num_ports;
> +	max_num_queues = roundup_pow_of_two(max(max_num_queues, 1U));

Does rounding up here inflate max_num_queues and unnecessarily force MSI-X
sharing?

For example, if there are 63 usable MSIs and 2 ports, the division yields 31.
Rounding up gives 32. The subsequent check below:

> +	/*
> +	 * Use dedicated MSIx for EQs whenever possible, use MSIx sharing for
> +	 * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1)
> +	 */
> +	max_num_queues = min(gc->max_num_queues, max_num_queues);
> +	if (max_num_queues * num_ports > gc->num_msix_usable - 1)
> +		gc->msi_sharing = true;

would then evaluate to true (32 * 2 > 63) and force the driver into MSI-X
shared mode. This seems to contradict the intent to use dedicated MSI-X
whenever possible.

Would it be better to use rounddown_pow_of_two() instead to ensure the
calculated queues fit within the available dedicated vectors?

[ ... ]

> @@ -1968,20 +2008,30 @@ static int mana_gd_setup(struct pci_dev *pdev)
>  	if (err)
>  		goto destroy_hwc;
>  
> -	err = mana_gd_query_max_resources(pdev);
> +	err = mana_gd_detect_devices(pdev);
>  	if (err)
>  		goto destroy_hwc;
>  
> -	err = mana_gd_setup_remaining_irqs(pdev);
> -	if (err) {
> -		dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err);
> -		goto destroy_hwc;
> -	}
> -
> -	err = mana_gd_detect_devices(pdev);
> +	err = mana_gd_query_max_resources(pdev);
>  	if (err)
>  		goto destroy_hwc;
>  
> +	if (!gc->msi_sharing) {
> +		gc->msi_bitmap = bitmap_zalloc(gc->num_msix_usable, GFP_KERNEL);
> +		if (!gc->msi_bitmap) {
> +			err = -ENOMEM;
> +			goto destroy_hwc;
> +		}
> +		/* Set bit for HWC */
> +		set_bit(0, gc->msi_bitmap);
> +	} else {
> +		err = mana_gd_setup_remaining_irqs(pdev);
> +		if (err) {
> +			dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err);
> +			goto destroy_hwc;
> +		}
> +	}

If msi_sharing is disabled, we allocate the msi_bitmap but skip calling
mana_gd_setup_remaining_irqs(). 

Since mana_gd_setup_hwc_irqs() only allocates a single vector for the hardware
channel when dynamic allocation is supported, does this leave the device
without interrupts for its Ethernet queues?

If so, it seems this could lead to queue creation failures when the driver
attempts to map uninitialized vectors. I notice this is fixed in a later patch
in the series ("net: mana: Allocate interrupt context for each EQ when
creating vPort"), but does leaving it out here break bisectability?

^ permalink raw reply

* Re: [PATCH net-next v6 1/6] net: mana: Create separate EQs for each vPort
From: Simon Horman @ 2026-05-02 15:07 UTC (permalink / raw)
  To: longli
  Cc: 'Simon Horman', kotaranov, kuba, davem, pabeni, edumazet,
	andrew+netdev, jgg, leon, haiyangz, kys, wei.liu, decui, netdev,
	linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260429221625.1841150-2-longli@microsoft.com>

From: 'Simon Horman' <horms@kernel.org>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
net: mana: Create separate EQs for each vPort

This commit modifies the mana driver to prepare for assigning vPorts to
dedicated MSI-X vectors by removing EQ sharing among vPorts. It moves the EQ
definition from the mana_context to the mana_port_context and exports EQ
lifecycle functions for use by the MANA RDMA driver.

> diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
> index 645581359cee..6f1043383e8c 100644
> --- a/drivers/infiniband/hw/mana/qp.c
> +++ b/drivers/infiniband/hw/mana/qp.c
> @@ -168,7 +168,15 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
>  		cq_spec.gdma_region = cq->queue.gdma_region;
>  		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
>  		cq_spec.modr_ctx_id = 0;
> -		eq = &mpc->ac->eqs[cq->comp_vector];
> +		/* EQs are created when a raw QP configures the vport.
> +		 * A raw QP must be created before creating rwq_ind_tbl.
> +		 */
> +		if (!mpc->eqs) {
> +			ret = -EINVAL;
> +			i--;
> +			goto fail;
> +		}
> +		eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];

Is it possible for the EQs to be destroyed while this RSS QP is still actively
using them?

If the EQs are created by the Ethernet interface being brought up, or by
a RAW QP configuring the vport, this RSS QP will attach to them without
incrementing pd->vport_use_count or taking any vport reference count.

If the Ethernet interface is subsequently brought down, or the RAW QP is
destroyed, mana_destroy_eq() will be called, freeing the mpc->eqs array and
destroying the underlying DMA regions while this RSS QP remains active. This
regression could allow the hardware to DMA completion events into freed
EQ memory.

Additionally, since mpc->eqs is accessed here without holding pd->vport_mutex,
could a concurrent teardown of the EQs lead to a use-after-free when reading
eq->eq->id?

>  		cq_spec.attached_eq = eq->eq->id;
>  
>  		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,

^ permalink raw reply

* Re: [PATCH net-next v2] net: mana: hardening: Reject zero max_num_queues from GDMA_QUERY_MAX_RESOURCES
From: Shradha Gupta @ 2026-05-02 14:44 UTC (permalink / raw)
  To: Erni Sri Satya Vennela
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, horms, dipayanroy, yury.norov,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260430083627.1873757-1-ernis@linux.microsoft.com>

On Thu, Apr 30, 2026 at 01:36:21AM -0700, Erni Sri Satya Vennela wrote:
> In a CVM environment, hardware responses cannot be trusted. The
> GDMA_QUERY_MAX_RESOURCES command returns resource limits used to
> determine the maximum number of queues.
> 
> In mana_gd_query_max_resources(), gc->max_num_queues is initialized
> from num_online_cpus() and successively clamped by the hardware-reported
> max_eq, max_cq, max_sq, max_rq, and num_msix_usable values. If any of
> these hardware values is zero, gc->max_num_queues becomes zero and the
> function returns success. This leads to a confusing failure later when
> alloc_etherdev_mq() is called with zero queues, returning NULL and
> producing a misleading -ENOMEM error.
> 
> Add an explicit zero check for gc->max_num_queues after all clamping
> steps and return -ENOSPC for a clear early failure, consistent with the
> existing gc->num_msix_usable <= 1 guard.
> 
> Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> ---
> Changes in v2:
> * Rebase to latest main.
> ---
>  drivers/net/ethernet/microsoft/mana/gdma_main.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 098fbda0d128..f3316e929175 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -194,6 +194,9 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
>  	if (gc->max_num_queues > gc->num_msix_usable - 1)
>  		gc->max_num_queues = gc->num_msix_usable - 1;
>  
> +	if (gc->max_num_queues == 0)
> +		return -ENOSPC;
> +
>  	return 0;
>  }
>

Reviewed-by: Shradha Gupta <shradhagupta@linux.microsoft.com>  

> -- 
> 2.34.1

^ permalink raw reply

* Re: [PATCH net v2] net: mana: Optimize irq affinity for low vcpu configs
From: Shradha Gupta @ 2026-05-02 14:37 UTC (permalink / raw)
  To: Yury Norov
  Cc: Dexuan Cui, Wei Liu, Haiyang Zhang, K. Y. Srinivasan, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
	Dipayaan Roy, Shiraz Saleem, Michael Kelley, Long Li, Yury Norov,
	linux-hyperv, linux-kernel, netdev, Paul Rosswurm, Shradha Gupta,
	Saurabh Singh Sengar, stable
In-Reply-To: <afTTPLClWwIMWTOh@yury>

On Fri, May 01, 2026 at 12:22:20PM -0400, Yury Norov wrote:
> On Wed, Apr 29, 2026 at 02:06:37AM -0700, Shradha Gupta wrote:
> > In mana driver, the number of IRQs allocated is capped by the
> > min(num_cpu + 1, queue count). In cases, where the IRQ count is greater
> > than the vcpu count, we want to utilize all the vCPUs, irrespective of
> > their NUMA/core bindings.
> > 
> > This is important, especially in the envs where number of vCPUs are so
> > few that the softIRQ handling overhead on two IRQs on the same vCPU is
> > much more than their overheads if they were spread across sibling vCPUs.
> > 
> > This behaviour is more evident with dynamic IRQ allocation. Since MANA
> > IRQs are assigned at a later stage compared to static allocation, other
> > device IRQs may already be affinitized to the vCPUs. As a result, IRQ
> > weights become imbalanced, causing multiple MANA IRQs to land on the
> > same vCPU, while some vCPUs have none.
> > 
> > In such cases when many parallel TCP connections are tested, the
> > throughput drops significantly.
> > 
> > Test envs:
> > =======================================================
> > Case 1: without this patch
> > =======================================================
> > 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
> > 
> > 	TYPE		effective vCPU aff
> > =======================================================
> > IRQ0:	HWC		0
> > IRQ1:	mana_q1		0
> > IRQ2:	mana_q2		2
> > IRQ3:	mana_q3		0
> > IRQ4:	mana_q4		3
> > 
> > %soft on each vCPU(mpstat -P ALL 1) on receiver
> > vCPU		0	1	2	3
> > =======================================================
> > pass 1:		38.85	0.03	24.89	24.65
> > pass 2:		39.15	0.03	24.57	25.28
> > pass 3:		40.36	0.03	23.20	23.17
> > 
> > =======================================================
> > Case 2: with this patch
> > =======================================================
> > 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
> > 
> >         TYPE            effective vCPU aff
> > =======================================================
> > IRQ0:   HWC             0
> > IRQ1:   mana_q1         0
> > IRQ2:   mana_q2         1
> > IRQ3:   mana_q3         2
> > IRQ4:   mana_q4         3
> > 
> > %soft on each vCPU(mpstat -P ALL 1) on receiver
> > vCPU            0       1       2       3
> > =======================================================
> > pass 1:         15.42	15.85	14.99	14.51
> > pass 2:         15.53	15.94	15.81	15.93
> > pass 3:         16.41	16.35	16.40	16.36
> > 
> > =======================================================
> > Throughput Impact(in Gbps, same env)
> > =======================================================
> > TCP conn	with patch	w/o patch
> > 20480		15.65		7.73
> > 10240		15.63		8.93
> > 8192		15.64		9.69
> > 6144		15.64		13.16
> > 4096		15.69		15.75
> > 2048		15.69		15.83
> > 1024		15.71		15.28
> > 
> > Fixes: 755391121038 ("net: mana: Allocate MSI-X vectors dynamically")
> > Cc: stable@vger.kernel.org
> > Co-developed-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> > Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> > Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> > ---
> > Changes in v2
> >  * Removed the unused skip_first_cpu variable
> >  * fixed exit condition in irq_setup_linear() with len == 0
> >  * changed return type of irq_setup_linear() as it will always be 0
> >  * removed the unnecessary rcu_read_lock() in irq_setup_linear()
> >  * added appropriate comments to indicate expected behaviour when
> >    IRQs are more than or equal to num_online_cpus()
> > ---
> >  .../net/ethernet/microsoft/mana/gdma_main.c   | 47 ++++++++++++++++---
> >  1 file changed, 40 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 098fbda0d128..d740d1dc43da 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -167,6 +167,8 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
> >  	} else {
> >  		/* If dynamic allocation is enabled we have already allocated
> >  		 * hwc msi
> > +		 * Also, we make sure in this case the following is always true
> > +		 * (num_msix_usable - 1 HWC) <= num_online_cpus()
> >  		 */
> >  		gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1);
> >  	}
> > @@ -1672,11 +1674,24 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node,
> >  	return 0;
> >  }
> >  
> > +/* should be called with cpus_read_lock() held */
> > +static void irq_setup_linear(unsigned int *irqs, unsigned int len)
> > +{
> > +	int cpu;
> > +
> > +	for_each_online_cpu(cpu) {
> > +		if (len == 0)
> > +			break;
> > +
> > +		irq_set_affinity_and_hint(*irqs++, cpumask_of(cpu));
> > +		len--;
> > +	}
> > +}
> > +
> >  static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> >  {
> >  	struct gdma_context *gc = pci_get_drvdata(pdev);
> >  	struct gdma_irq_context *gic;
> > -	bool skip_first_cpu = false;
> >  	int *irqs, irq, err, i;
> >  
> >  	irqs = kmalloc_objs(int, nvec);
> 
> So what about WARN_ON() and nvec adjustment before kmalloc?
Hey Yury,

I am still a bit unsure about the WARN_ON() before kmalloc, as after
that also, in the same function till we take the cpus_read_lock() the
num_online_cpus() can change(or reduce). That's why I introduced the
dev_dbg() to capture hot-remove edge case.

Do you still think it adds more value?
> 
> > @@ -1722,13 +1737,31 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> >  	 * first CPU sibling group since they are already affinitized to HWC IRQ
> >  	 */
> >  	cpus_read_lock();
> > -	if (gc->num_msix_usable <= num_online_cpus())
> > -		skip_first_cpu = true;
> > +	if (gc->num_msix_usable <= num_online_cpus()) {
> > +		err = irq_setup(irqs, nvec, gc->numa_node, true);
> > +		if (err) {
> > +			cpus_read_unlock();
> > +			goto free_irq;
> 
> One thing puzzles me: if you skip first CPU with this 'true', and the
> gc->num_msix_usable == num_online_cpus(), it's one more than you can
> distribute. What do I miss?
> 

Let me explain this case a bit better then,

- num_msix_usable = HWC IRQ + Queue IRQ
- nvec in this functions is only Queue IRQ (HWC already setup)

When num_online_cpus == num_msix_usable:
- nvec = num_online_cpus - 1
- first CPU is already assigned to HWC IRQ, so skip it
- Queue IRQs fit in the remaining CPUs

please let me know if I did not get your question right

> > +		}
> > +	} else {
> > +		/*
> > +		 * When num_msix_usable are more than num_online_cpus, we try to
> > +		 * make sure we are using all vcpus. In such a case NUMA or
> > +		 * CPU core affinity does not matter.
> 
> If it doesn't matter, why don't you assign each IRQ to all CPUs then?
> In theory, the system would have most of flexibility to balance them.
> 

Okay, let me fix the comment and elaborate on this. It doesn't matter
because in such a case we want to anyway exhaust and distribute the
Queue IRQs to all vCPUs.
We don't want to rely on the system's balancer in this case as it could
be skewed by other devices' IRQ weights

> > +		 * Note: in this case the total mana IRQ should always be
> > +		 * num_online_cpus + 1. The first HWC IRQ is already handled
> > +		 * in HWC setup calls
> > +		 * However, if CPUs went offline since num_msix_usable was
> > +		 * computed, nvec count will be more than num_online_cpus().
> > +		 * In such cases remaining extra IRQs will retain their default
> > +		 * affinity.
> > +		 */
> > +		if (nvec > num_online_cpus())
> > +			dev_dbg(&pdev->dev,
> > +				"IRQ count %d exceeds online CPU count %d. Some IRQs will share CPU\n",
> 
> I'd better say 'some IRQs will share the default CPU', and in the
> perfect world, I'd like to see:
> 
>         'The IRQs #4-12 will share the default CPU #0'
> 
> type of message.

Sure, that makes more sense. Will make the change in next version
> 
> > +				nvec, num_online_cpus());
>                 
> It's not that straightforward as it should be. In one case 
> 
>         nvec > num_online_cpus()
> 
> is a problem, while in another - not. It looks already suspicious. So
> when you throw a warning, you should mention it, I believe.
> 
> In the
> 
>         gc->num_msix_usable <= num_online_cpus()
> 
> case, when nvec is too big, would'n 'some IRQs share some CPU' just
> as well? If so, you again should throw a message.

let me try to cleanly address this too, in the next version. May be even
a seperate patch.

Thanks Yury.
> 
> > -	err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu);
> > -	if (err) {
> > -		cpus_read_unlock();
> > -		goto free_irq;
> > +		irq_setup_linear(irqs, nvec);
> >  	}
> >  
> >  	cpus_read_unlock();
> > 
> > base-commit: e728258debd553c95d2e70f9cd97c9fde27c7130
> > -- 
> > 2.34.1

^ permalink raw reply

* Re: [PATCH] mshv: Add dedicated ioctl for GVA to GPA translation
From: kernel test robot @ 2026-05-02 13:12 UTC (permalink / raw)
  To: Stanislav Kinsburskii, kys, haiyangz, wei.liu, decui, longli
  Cc: oe-kbuild-all, linux-hyperv, linux-kernel
In-Reply-To: <177741648871.626779.11067281081219290277.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Hi Stanislav,

kernel test robot noticed the following build warnings:

[auto build test WARNING on linus/master]
[also build test WARNING on v7.1-rc1 next-20260430]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Stanislav-Kinsburskii/mshv-Add-dedicated-ioctl-for-GVA-to-GPA-translation/20260429-094326
base:   linus/master
patch link:    https://lore.kernel.org/r/177741648871.626779.11067281081219290277.stgit%40skinsburskii-cloud-desktop.internal.cloudapp.net
patch subject: [PATCH] mshv: Add dedicated ioctl for GVA to GPA translation
config: x86_64-randconfig-123-20260430 (https://download.01.org/0day-ci/archive/20260502/202605022145.mMqA1AEU-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
sparse: v0.6.5-rc1
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260502/202605022145.mMqA1AEU-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605022145.mMqA1AEU-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> drivers/hv/mshv_root_main.c:958:30: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected void [noderef] __user *to @@     got unsigned int enum hv_translate_gva_result_code *[addressable] result @@
   drivers/hv/mshv_root_main.c:958:30: sparse:     expected void [noderef] __user *to
   drivers/hv/mshv_root_main.c:958:30: sparse:     got unsigned int enum hv_translate_gva_result_code *[addressable] result
>> drivers/hv/mshv_root_main.c:961:30: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected void [noderef] __user *to @@     got unsigned long long [usertype] *[addressable] gpa @@
   drivers/hv/mshv_root_main.c:961:30: sparse:     expected void [noderef] __user *to
   drivers/hv/mshv_root_main.c:961:30: sparse:     got unsigned long long [usertype] *[addressable] gpa

vim +958 drivers/hv/mshv_root_main.c

   913	
   914	static long
   915	mshv_vp_ioctl_translate_gva(struct mshv_vp *vp, void __user *user_args)
   916	{
   917		struct mshv_partition *partition = vp->vp_partition;
   918		struct mshv_translate_gva args;
   919		struct hv_translate_gva_result_ex result;
   920		u64 gfn, gpa;
   921		int ret;
   922	
   923		if (copy_from_user(&args, user_args, sizeof(args)))
   924			return -EFAULT;
   925	
   926		do {
   927			ret = hv_call_translate_virtual_address_ex(vp->vp_index,
   928								   partition->pt_id,
   929								   args.flags, args.gva,
   930								   &gfn, &result);
   931			if (ret)
   932				return ret;
   933	
   934			if (mshv_gpa_fault_retryable(result.result_code)) {
   935				struct mshv_mem_region *region;
   936				bool faulted;
   937	
   938				region = mshv_partition_region_by_gfn_get(partition,
   939									  gfn);
   940				if (!region)
   941					return -EFAULT;
   942	
   943				faulted = false;
   944				if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
   945					faulted = mshv_region_handle_gfn_fault(region,
   946									       gfn);
   947				mshv_region_put(region);
   948	
   949				if (!faulted)
   950					return -EFAULT;
   951	
   952				cond_resched();
   953			}
   954		} while (mshv_gpa_fault_retryable(result.result_code));
   955	
   956		gpa = (gfn << PAGE_SHIFT) | (args.gva & ~PAGE_MASK);
   957	
 > 958	        if (copy_to_user(args.result, &result, sizeof(*args.result)))
   959	                return -EFAULT;
   960	
 > 961		if (copy_to_user(args.gpa, &gpa, sizeof(*args.gpa)))
   962			return -EFAULT;
   963	
   964		return 0;
   965	}
   966	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* [PATCH net-next v3 1/2] net: mana: Use per-queue allocation for tx_qp to reduce allocation size
From: Aditya Garg @ 2026-05-02  7:45 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, kotaranov, horms, ssengar, jacob.e.keller,
	dipayanroy, ernis, shirazsaleem, kees, sbhatta, leitao, netdev,
	linux-hyperv, linux-kernel, linux-rdma, bpf, gargaditya,
	gargaditya
In-Reply-To: <20260502074552.23857-1-gargaditya@linux.microsoft.com>

Convert tx_qp from a single contiguous array allocation to per-queue
individual allocations. Each mana_tx_qp struct is approximately 35KB.
With many queues (e.g., 32/64), the flat array requires a single
contiguous allocation that can fail under memory fragmentation.

Change mana_tx_qp *tx_qp to mana_tx_qp **tx_qp (array of pointers),
allocating each queue's mana_tx_qp individually via kvzalloc. This
reduces each allocation to ~35KB and provides vmalloc fallback,
avoiding allocation failure due to fragmentation.

Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 .../net/ethernet/microsoft/mana/mana_bpf.c    |  2 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 49 ++++++++++++-------
 .../ethernet/microsoft/mana/mana_ethtool.c    |  2 +-
 include/net/mana/mana.h                       |  2 +-
 4 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
index 7697c9b52ed3..b5e9bb184a1d 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
@@ -68,7 +68,7 @@ int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames,
 		count++;
 	}
 
-	tx_stats = &apc->tx_qp[q_idx].txq.stats;
+	tx_stats = &apc->tx_qp[q_idx]->txq.stats;
 
 	u64_stats_update_begin(&tx_stats->syncp);
 	tx_stats->xdp_xmit += count;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a654b3699c4c..8adf72b96145 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -355,9 +355,9 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	if (skb_cow_head(skb, MANA_HEADROOM))
 		goto tx_drop_count;
 
-	txq = &apc->tx_qp[txq_idx].txq;
+	txq = &apc->tx_qp[txq_idx]->txq;
 	gdma_sq = txq->gdma_sq;
-	cq = &apc->tx_qp[txq_idx].tx_cq;
+	cq = &apc->tx_qp[txq_idx]->tx_cq;
 	tx_stats = &txq->stats;
 
 	BUILD_BUG_ON(MAX_TX_WQE_SGL_ENTRIES != MANA_MAX_TX_WQE_SGL_ENTRIES);
@@ -614,7 +614,7 @@ static void mana_get_stats64(struct net_device *ndev,
 	}
 
 	for (q = 0; q < num_queues; q++) {
-		tx_stats = &apc->tx_qp[q].txq.stats;
+		tx_stats = &apc->tx_qp[q]->txq.stats;
 
 		do {
 			start = u64_stats_fetch_begin(&tx_stats->syncp);
@@ -2321,21 +2321,26 @@ static void mana_destroy_txq(struct mana_port_context *apc)
 		return;
 
 	for (i = 0; i < apc->num_queues; i++) {
-		debugfs_remove_recursive(apc->tx_qp[i].mana_tx_debugfs);
-		apc->tx_qp[i].mana_tx_debugfs = NULL;
+		if (!apc->tx_qp[i])
+			continue;
+
+		debugfs_remove_recursive(apc->tx_qp[i]->mana_tx_debugfs);
+		apc->tx_qp[i]->mana_tx_debugfs = NULL;
 
-		napi = &apc->tx_qp[i].tx_cq.napi;
-		if (apc->tx_qp[i].txq.napi_initialized) {
+		napi = &apc->tx_qp[i]->tx_cq.napi;
+		if (apc->tx_qp[i]->txq.napi_initialized) {
 			napi_synchronize(napi);
 			napi_disable_locked(napi);
 			netif_napi_del_locked(napi);
-			apc->tx_qp[i].txq.napi_initialized = false;
+			apc->tx_qp[i]->txq.napi_initialized = false;
 		}
-		mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object);
+		mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i]->tx_object);
 
-		mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq);
+		mana_deinit_cq(apc, &apc->tx_qp[i]->tx_cq);
 
-		mana_deinit_txq(apc, &apc->tx_qp[i].txq);
+		mana_deinit_txq(apc, &apc->tx_qp[i]->txq);
+
+		kvfree(apc->tx_qp[i]);
 	}
 
 	kfree(apc->tx_qp);
@@ -2344,7 +2349,7 @@ static void mana_destroy_txq(struct mana_port_context *apc)
 
 static void mana_create_txq_debugfs(struct mana_port_context *apc, int idx)
 {
-	struct mana_tx_qp *tx_qp = &apc->tx_qp[idx];
+	struct mana_tx_qp *tx_qp = apc->tx_qp[idx];
 	char qnum[32];
 
 	sprintf(qnum, "TX-%d", idx);
@@ -2383,7 +2388,7 @@ static int mana_create_txq(struct mana_port_context *apc,
 	int err;
 	int i;
 
-	apc->tx_qp = kzalloc_objs(struct mana_tx_qp, apc->num_queues);
+	apc->tx_qp = kzalloc_objs(struct mana_tx_qp *, apc->num_queues);
 	if (!apc->tx_qp)
 		return -ENOMEM;
 
@@ -2403,10 +2408,16 @@ static int mana_create_txq(struct mana_port_context *apc,
 	gc = gd->gdma_context;
 
 	for (i = 0; i < apc->num_queues; i++) {
-		apc->tx_qp[i].tx_object = INVALID_MANA_HANDLE;
+		apc->tx_qp[i] = kvzalloc_obj(*apc->tx_qp[i]);
+		if (!apc->tx_qp[i]) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		apc->tx_qp[i]->tx_object = INVALID_MANA_HANDLE;
 
 		/* Create SQ */
-		txq = &apc->tx_qp[i].txq;
+		txq = &apc->tx_qp[i]->txq;
 
 		u64_stats_init(&txq->stats.syncp);
 		txq->ndev = net;
@@ -2424,7 +2435,7 @@ static int mana_create_txq(struct mana_port_context *apc,
 			goto out;
 
 		/* Create SQ's CQ */
-		cq = &apc->tx_qp[i].tx_cq;
+		cq = &apc->tx_qp[i]->tx_cq;
 		cq->type = MANA_CQ_TYPE_TX;
 
 		cq->txq = txq;
@@ -2453,7 +2464,7 @@ static int mana_create_txq(struct mana_port_context *apc,
 
 		err = mana_create_wq_obj(apc, apc->port_handle, GDMA_SQ,
 					 &wq_spec, &cq_spec,
-					 &apc->tx_qp[i].tx_object);
+					 &apc->tx_qp[i]->tx_object);
 
 		if (err)
 			goto out;
@@ -3288,7 +3299,7 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	 */
 
 	for (i = 0; i < apc->num_queues; i++) {
-		txq = &apc->tx_qp[i].txq;
+		txq = &apc->tx_qp[i]->txq;
 		tsleep = 1000;
 		while (atomic_read(&txq->pending_sends) > 0 &&
 		       time_before(jiffies, timeout)) {
@@ -3307,7 +3318,7 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	}
 
 	for (i = 0; i < apc->num_queues; i++) {
-		txq = &apc->tx_qp[i].txq;
+		txq = &apc->tx_qp[i]->txq;
 		while ((skb = skb_dequeue(&txq->pending_skbs))) {
 			mana_unmap_skb(skb, apc);
 			dev_kfree_skb_any(skb);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 6a4b42fe0944..04350973e19e 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -260,7 +260,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	}
 
 	for (q = 0; q < num_queues; q++) {
-		tx_stats = &apc->tx_qp[q].txq.stats;
+		tx_stats = &apc->tx_qp[q]->txq.stats;
 
 		do {
 			start = u64_stats_fetch_begin(&tx_stats->syncp);
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 8f721cd4e4a7..aa90a858c8e3 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -507,7 +507,7 @@ struct mana_port_context {
 	bool tx_shortform_allowed;
 	u16 tx_vp_offset;
 
-	struct mana_tx_qp *tx_qp;
+	struct mana_tx_qp **tx_qp;
 
 	/* Indirection Table for RX & TX. The values are queue indexes */
 	u32 *indir_table;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v3 2/2] net: mana: Use kvmalloc for large RX queue and buffer allocations
From: Aditya Garg @ 2026-05-02  7:45 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, kotaranov, horms, ssengar, jacob.e.keller,
	dipayanroy, ernis, shirazsaleem, kees, sbhatta, leitao, netdev,
	linux-hyperv, linux-kernel, linux-rdma, bpf, gargaditya,
	gargaditya
In-Reply-To: <20260502074552.23857-1-gargaditya@linux.microsoft.com>

The RX path allocations for rxbufs_pre, das_pre, and rxq scale with
queue count and queue depth. With high queue counts and depth, these can
exceed what kmalloc can reliably provide from physically contiguous
memory under fragmentation.

Switch these from kmalloc to kvmalloc variants so the allocator
transparently falls back to vmalloc when contiguous memory is scarce,
and update the corresponding frees to kvfree.

Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 8adf72b96145..e1d8ac3417e8 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -685,11 +685,11 @@ void mana_pre_dealloc_rxbufs(struct mana_port_context *mpc)
 		put_page(virt_to_head_page(mpc->rxbufs_pre[i]));
 	}
 
-	kfree(mpc->das_pre);
+	kvfree(mpc->das_pre);
 	mpc->das_pre = NULL;
 
 out2:
-	kfree(mpc->rxbufs_pre);
+	kvfree(mpc->rxbufs_pre);
 	mpc->rxbufs_pre = NULL;
 
 out1:
@@ -806,11 +806,11 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_qu
 	num_rxb = num_queues * mpc->rx_queue_size;
 
 	WARN(mpc->rxbufs_pre, "mana rxbufs_pre exists\n");
-	mpc->rxbufs_pre = kmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
+	mpc->rxbufs_pre = kvmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
 	if (!mpc->rxbufs_pre)
 		goto error;
 
-	mpc->das_pre = kmalloc_objs(dma_addr_t, num_rxb);
+	mpc->das_pre = kvmalloc_objs(dma_addr_t, num_rxb);
 	if (!mpc->das_pre)
 		goto error;
 
@@ -2564,7 +2564,7 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 	if (rxq->gdma_rq)
 		mana_gd_destroy_queue(gc, rxq->gdma_rq);
 
-	kfree(rxq);
+	kvfree(rxq);
 }
 
 static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
@@ -2704,7 +2704,7 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 
 	gc = gd->gdma_context;
 
-	rxq = kzalloc_flex(*rxq, rx_oobs, apc->rx_queue_size);
+	rxq = kvzalloc_flex(*rxq, rx_oobs, apc->rx_queue_size);
 	if (!rxq)
 		return NULL;
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v3 0/2] net: mana: Avoid queue struct allocation failure under memory fragmentation
From: Aditya Garg @ 2026-05-02  7:45 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, kotaranov, horms, ssengar, jacob.e.keller,
	dipayanroy, ernis, shirazsaleem, kees, sbhatta, leitao, netdev,
	linux-hyperv, linux-kernel, linux-rdma, bpf, gargaditya,
	gargaditya

The MANA driver can fail to load on systems with high memory
utilization because several allocations in the queue setup paths
require large physically contiguous blocks via kmalloc. Under memory
fragmentation these high-order allocations may fail, preventing the
driver from creating queues when opening the interface or when
reconfiguring channels, ring parameters or MTU at runtime.

Allocation sizes that are problematic:

  mana_create_txq -> tx_qp flat array (sizeof(mana_tx_qp) = 35528):
    16 queues (default): 35528 * 16 =  ~555 KB contiguous
    64 queues (max):     35528 * 64 = ~2220 KB contiguous

  mana_create_rxq -> rxq struct with flex array
  (sizeof(mana_rxq) = 35712, rx_oobs=296 per entry):
    depth 1024 (default): 35712 + 296 * 1024 =  ~331 KB per queue
    depth 8192 (max):     35712 + 296 * 8192 = ~2403 KB per queue

  mana_pre_alloc_rxbufs -> rxbufs_pre and das_pre arrays:
    16 queues, depth 1024 (default): 16 * 1024 * 8 =  128 KB each
    64 queues, depth 8192 (max):     64 * 8192 * 8 = 4096 KB each

This series addresses the issue by:
  1. Converting the tx_qp flat array into an array of pointers with
     per-queue kvzalloc (~35 KB each), replacing a single contiguous
     allocation that can reach ~2.2 MB at 64 queues.
  2. Switching rxbufs_pre, das_pre, and rxq allocations to
     kvmalloc/kvzalloc so the allocator can fall back to vmalloc
     when contiguous memory is unavailable.

Throughput testing confirms no regression. Since kvmalloc falls
back to vmalloc under memory fragmentation, all kvmalloc calls
were temporarily replaced with vmalloc to simulate the fallback
path (iperf3, GBits/sec):

                 Physically contiguous         vmalloc region
  Connections      TX          RX              TX          RX
  --------------------------------------------------------------
  1                47.2        46.9            46.8        46.6
  16               181         181             181         181
  32               181         181             181         181
  64               181         181             181         181

---
Changes in v3:
  - Rebased to latest net-next (net-next reopened)

Changes in v2:
  - Rebased to latest net-next

Aditya Garg (2):
  net: mana: Use per-queue allocation for tx_qp to reduce allocation
    size
  net: mana: Use kvmalloc for large RX queue and buffer allocations

 .../net/ethernet/microsoft/mana/mana_bpf.c    |  2 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 61 +++++++++++--------
 .../ethernet/microsoft/mana/mana_ethtool.c    |  2 +-
 include/net/mana/mana.h                       |  2 +-
 4 files changed, 39 insertions(+), 28 deletions(-)

-- 
2.43.0


^ permalink raw reply

* [PATCH v2 18/18] mshv: Fix missing error code on VP allocation failure
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

In mshv_partition_ioctl_create_vp(), when kzalloc for the VP struct
fails, the code jumps to the cleanup path without setting ret. At that
point ret is 0 from the preceding successful mshv_vp_stats_map() call,
so the function returns success to userspace despite having failed to
create the VP. No fd is installed and no VP is registered in pt_vp_array,
but userspace has no way to know the operation failed.

Set ret to -ENOMEM before jumping to the cleanup path.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index f01bd0877aef1..c9371e049c42b 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1186,8 +1186,10 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 		goto unmap_ghcb_page;
 
 	vp = kzalloc_obj(*vp);
-	if (!vp)
+	if (!vp) {
+		ret = -ENOMEM;
 		goto unmap_stats_pages;
+	}
 
 	vp->vp_partition = mshv_partition_get(partition);
 	if (!vp->vp_partition) {



^ permalink raw reply related

* [PATCH v2 17/18] mshv: Validate scheduler message bounds from hypervisor
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

handle_pair_message() iterates up to msg->vp_count without verifying it
against HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT. Since vp_count is read
from untrusted hypervisor data, a malformed message with a large value
would cause out-of-bounds reads from the partition_ids and vp_indexes
arrays.

handle_bitset_message() iterates over set bits in valid_bank_mask (up to
64) and advances bank_contents for each one. However, the payload buffer
only has space for 16 bank entries. A valid_bank_mask with more than 16
bits set causes bank_contents to read beyond the message buffer.

Fix both by adding bounds validation:
- Clamp vp_count to HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT
- Track banks consumed and stop before exceeding buffer capacity

Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_synic.c |   20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index 5333b3889a408..3a71682ffe048 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -187,7 +187,9 @@ static void kick_vp(struct mshv_vp *vp)
 static void
 handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
 {
-	int bank_idx, vps_signaled = 0, bank_mask_size;
+	int bank_idx, vps_signaled = 0, bank_mask_size, banks_used = 0;
+	const int max_banks = sizeof(msg->vp_bitset.bitset_buffer) /
+			      sizeof(u64) - 2; /* subtract format + mask */
 	struct mshv_partition *partition;
 	const struct hv_vpset *vpset;
 	const u64 *bank_contents;
@@ -227,6 +229,11 @@ handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
 		if (bank_idx == bank_mask_size)
 			break;
 
+		if (unlikely(banks_used >= max_banks)) {
+			pr_debug("valid_bank_mask exceeds buffer capacity\n");
+			goto unlock_out;
+		}
+
 		while (true) {
 			struct mshv_vp *vp;
 
@@ -255,6 +262,7 @@ handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
 		}
 
 		bank_contents++;
+		banks_used++;
 	}
 
 unlock_out:
@@ -271,10 +279,18 @@ handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg)
 	struct mshv_partition *partition = NULL;
 	struct mshv_vp *vp;
 	int idx;
+	u8 vp_count = msg->vp_count;
+
+	if (unlikely(vp_count > HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT)) {
+		pr_debug("pair message vp_count %u exceeds max %lu\n",
+			 vp_count,
+			 (unsigned long)HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT);
+		return;
+	}
 
 	rcu_read_lock();
 
-	for (idx = 0; idx < msg->vp_count; idx++) {
+	for (idx = 0; idx < vp_count; idx++) {
 		u64 partition_id = msg->partition_ids[idx];
 		u32 vp_index = msg->vp_indexes[idx];
 



^ permalink raw reply related

* [PATCH v2 16/18] mshv: Add store/load ordering for VP array publish
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_vp_create() publishes a VP pointer to pt_vp_array with a plain
store. The ISR reads this array locklessly from interrupt context on
other CPUs. Without memory ordering, a reader may observe the non-NULL
pointer before all VP fields (e.g. vp_register_page, run.vp_suspend_queue)
are fully initialized, leading to use of uninitialized data.

Fix by using smp_store_release() when publishing the VP pointer and
smp_load_acquire() on all lockless ISR read sites. This guarantees that
all VP initialization is visible to readers before the pointer itself.

Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c |    3 ++-
 drivers/hv/mshv_synic.c     |    6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 2b7d56e108bad..f01bd0877aef1 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1224,7 +1224,8 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 
 	/* already exclusive with the partition mutex for all ioctls */
 	partition->pt_vp_count++;
-	partition->pt_vp_array[args.vp_index] = vp;
+	/* Ensure VP is fully initialized before visible to lockless ISR readers */
+	smp_store_release(&partition->pt_vp_array[args.vp_index], vp);
 
 	goto out;
 
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index d4d98fa771189..5333b3889a408 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -244,7 +244,7 @@ handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
 				goto unlock_out;
 			}
 
-			vp = partition->pt_vp_array[vp_index];
+			vp = smp_load_acquire(&partition->pt_vp_array[vp_index]);
 			if (unlikely(!vp)) {
 				pr_debug("failed to find VP %u\n", vp_index);
 				goto unlock_out;
@@ -293,7 +293,7 @@ handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg)
 			break;
 		}
 
-		vp = partition->pt_vp_array[vp_index];
+		vp = smp_load_acquire(&partition->pt_vp_array[vp_index]);
 		if (!vp) {
 			pr_debug("failed to find VP %u\n", vp_index);
 			break;
@@ -388,7 +388,7 @@ mshv_intercept_isr(struct hv_message *msg)
 		pr_debug("VP index %u out of bounds\n", vp_index);
 		goto unlock_out;
 	}
-	vp = partition->pt_vp_array[vp_index];
+	vp = smp_load_acquire(&partition->pt_vp_array[vp_index]);
 	if (unlikely(!vp)) {
 		pr_debug("failed to find VP %u\n", vp_index);
 		goto unlock_out;



^ permalink raw reply related

* [PATCH v2 15/18] mshv: Add missing vp_index bounds check in intercept ISR
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_intercept_isr() reads vp_index from the intercept message payload
and uses it directly to index into partition->pt_vp_array without
validating it against MSHV_MAX_VPS. A malformed or corrupted hypervisor
message with a vp_index beyond the array bounds would cause an
out-of-bounds memory access in interrupt context, likely crashing the
host.

Use READ_ONCE() when reading vp_index from the shared SynIC message
page to prevent the compiler from re-fetching the value between the
bounds check and the array access.

Both handle_bitset_message() and handle_pair_message() already validate
vp_index before use. Add the same check to mshv_intercept_isr() for
consistency.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_synic.c |    8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index 43f1bcbbf2d34..d4d98fa771189 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -382,8 +382,12 @@ mshv_intercept_isr(struct hv_message *msg)
 	 * (because the vp is only deleted when the partition is), no additional
 	 * locking is needed here
 	 */
-	vp_index =
-	       ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index;
+	vp_index = READ_ONCE(
+	       ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index);
+	if (unlikely(vp_index >= MSHV_MAX_VPS)) {
+		pr_debug("VP index %u out of bounds\n", vp_index);
+		goto unlock_out;
+	}
 	vp = partition->pt_vp_array[vp_index];
 	if (unlikely(!vp)) {
 		pr_debug("failed to find VP %u\n", vp_index);



^ permalink raw reply related

* [PATCH v2 14/18] mshv: Use kfree_rcu in mshv_portid_free
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_portid_free() uses synchronize_rcu() followed by kfree() to
reclaim port table entries. This blocks the caller until a full RCU
grace period elapses, which is unnecessary since the same module already
uses the non-blocking kfree_rcu() pattern in mshv_port_table_fini().

Replace with kfree_rcu() to avoid the blocking wait and keep the
reclamation strategy consistent across the file.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_portid_table.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c
index d6884c601b298..1ccbafe7aa596 100644
--- a/drivers/hv/mshv_portid_table.c
+++ b/drivers/hv/mshv_portid_table.c
@@ -62,8 +62,7 @@ mshv_portid_free(int port_id)
 	WARN_ON(!info);
 	idr_unlock(&port_table_idr);
 
-	synchronize_rcu();
-	kfree(info);
+	kfree_rcu(info, portbl_rcu);
 }
 
 int



^ permalink raw reply related

* [PATCH v2 13/18] mshv: Fix sleeping under spinlock in mshv_portid_alloc
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

idr_alloc() is called with GFP_KERNEL inside idr_lock(), which holds a
spinlock. GFP_KERNEL allows the allocator to sleep, triggering a
sleeping-while-atomic bug.

Fix by using idr_preload(GFP_KERNEL) before taking the lock to
pre-allocate memory in a sleepable context, then idr_alloc() with
GFP_NOWAIT inside the spinlock-protected section.

Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_portid_table.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c
index f1aaef69eb9b7..d6884c601b298 100644
--- a/drivers/hv/mshv_portid_table.c
+++ b/drivers/hv/mshv_portid_table.c
@@ -40,12 +40,14 @@ mshv_port_table_fini(void)
 int
 mshv_portid_alloc(struct port_table_info *info)
 {
-	int ret = 0;
+	int ret;
 
+	idr_preload(GFP_KERNEL);
 	idr_lock(&port_table_idr);
 	ret = idr_alloc(&port_table_idr, info, PORTID_MIN,
-			PORTID_MAX, GFP_KERNEL);
+			PORTID_MAX, GFP_NOWAIT);
 	idr_unlock(&port_table_idr);
+	idr_preload_end();
 
 	return ret;
 }



^ permalink raw reply related

* [PATCH v2 12/18] mshv: Fix use-after-RCU in mshv_portid_lookup
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_portid_lookup() drops the RCU read lock before copying the
port_table_info struct found by idr_find(). If mshv_portid_free() runs
concurrently on another CPU, it can remove the entry and free it (via
synchronize_rcu + kfree) before the copy at line *info = *_info
completes — resulting in a use-after-free.

Move rcu_read_unlock() after the struct copy so the object remains
protected for the entire duration of the read-side access.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_portid_table.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c
index c349af1f0aaac..f1aaef69eb9b7 100644
--- a/drivers/hv/mshv_portid_table.c
+++ b/drivers/hv/mshv_portid_table.c
@@ -72,12 +72,11 @@ mshv_portid_lookup(int port_id, struct port_table_info *info)
 
 	rcu_read_lock();
 	_info = idr_find(&port_table_idr, port_id);
-	rcu_read_unlock();
-
 	if (_info) {
 		*info = *_info;
 		ret = 0;
 	}
+	rcu_read_unlock();
 
 	return ret;
 }



^ permalink raw reply related

* [PATCH v2 11/18] mshv: Fix duplicate GSI detection for GSI 0
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

The duplicate routing entry check in mshv_update_routing_table() uses
guest_irq_num != 0 to detect whether a GSI slot is already occupied.
This fails for GSI 0 because its guest_irq_num is 0 both when the slot
is unused (zero-initialized) and when legitimately assigned. As a
result, duplicate entries for GSI 0 are silently accepted, with the
second entry overwriting the first — corrupting the routing table
without any error reported to userspace.

While GSI 0 (legacy timer) is unlikely to appear in MSI-based routing
in practice, the check is semantically wrong — it conflates
"uninitialized" with "GSI number 0." Use girq_entry_valid instead,
which is explicitly set to true when an entry is populated and remains
zero for unused slots regardless of the GSI number.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_irq.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c
index b3142c84dcbc2..65a4ffc82d566 100644
--- a/drivers/hv/mshv_irq.c
+++ b/drivers/hv/mshv_irq.c
@@ -51,7 +51,7 @@ int mshv_update_routing_table(struct mshv_partition *partition,
 		/*
 		 * Allow only one to one mapping between GSI and MSI routing.
 		 */
-		if (girq->guest_irq_num != 0) {
+		if (girq->girq_entry_valid) {
 			r = -EINVAL;
 			goto out;
 		}



^ permalink raw reply related

* [PATCH v2 10/18] mshv: Fix level-triggered check on uninitialized data
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

In mshv_irqfd_assign(), the level-triggered validation for resample
irqfds checks irqfd_lapic_irq.lapic_control.level_triggered before
mshv_irqfd_update() has populated the field. Since the irqfd struct is
zero-allocated, level_triggered is always 0 at that point, causing the
check to always reject resample irqfds with -EINVAL. This makes
level-triggered interrupt resampling — used to avoid interrupt storms
with assigned devices — completely non-functional.

Move the check after the mshv_irqfd_update() call, which resolves the
IRQ routing entry and populates irqfd_lapic_irq with the actual trigger
mode.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c |   25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index b92e7f05aa9cd..3165f787994fc 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -483,6 +483,19 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 	init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
 
 	spin_lock_irq(&pt->pt_irqfds_lock);
+	ret = 0;
+	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
+		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
+			continue;
+		/* This fd is used for another irq already. */
+		ret = -EBUSY;
+		spin_unlock_irq(&pt->pt_irqfds_lock);
+		goto fail;
+	}
+
+	idx = srcu_read_lock(&pt->pt_irq_srcu);
+	mshv_irqfd_update(pt, irqfd);
+
 #if IS_ENABLED(CONFIG_X86)
 	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
 	    !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
@@ -491,22 +504,12 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 		 * Otherwise return with failure
 		 */
 		spin_unlock_irq(&pt->pt_irqfds_lock);
+		srcu_read_unlock(&pt->pt_irq_srcu, idx);
 		ret = -EINVAL;
 		goto fail;
 	}
 #endif
-	ret = 0;
-	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
-		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
-			continue;
-		/* This fd is used for another irq already. */
-		ret = -EBUSY;
-		spin_unlock_irq(&pt->pt_irqfds_lock);
-		goto fail;
-	}
 
-	idx = srcu_read_lock(&pt->pt_irq_srcu);
-	mshv_irqfd_update(pt, irqfd);
 	hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
 	spin_unlock_irq(&pt->pt_irqfds_lock);
 



^ permalink raw reply related

* [PATCH v2 09/18] mshv: Consolidate irqfd interrupt injection paths
From: Stanislav Kinsburskii @ 2026-05-02  4:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

The irqfd interrupt injection had duplicated seqcount reads and
inconsistent validity checks between the fast and slow paths:

1. The wakeup handler snapshots irqfd_lapic_irq under seqcount, then on
   fast-path failure calls mshv_assert_irq_slow() which re-reads both
   girq_ent and lapic_irq under seqcount again — wasteful and confusing.

2. The validity check (girq_entry_valid) only existed in the slow path.
   The fast path would blindly accept a zero-initialized structure when
   routing was not configured, potentially injecting vector 0 to VP 0.

3. The condition 'girq_ent.guest_irq_num && !girq_ent.girq_entry_valid'
   short-circuits for GSI 0 (guest_irq_num == 0), bypassing the
   validity check entirely.

4. mshv_irqfd_resampler_ack() reads irqfd_lapic_irq.lapic_control
   without seqcount protection, allowing torn reads when racing with
   mshv_irqfd_update().

Consolidate by:
- Moving the seqcount snapshot and validity check into the wakeup
  handler, performed once before either injection path.
- Changing mshv_assert_irq_slow() to accept a pre-snapshotted
  const struct mshv_lapic_irq pointer, eliminating its internal
  seqcount read and SRCU lock/unlock.
- Using !girq_entry_valid alone as the validity condition, fixing the
  GSI 0 bypass.
- Adding seqcount protection in mshv_irqfd_resampler_ack() to prevent
  torn reads of interrupt_type.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c |   62 +++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 7275b9eaa7541..b92e7f05aa9cd 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -90,7 +90,15 @@ static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
 	hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list,
 				 irqfd_resampler_hnode,
 				 srcu_read_lock_held(&partition->pt_irq_srcu)) {
-		if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
+		struct mshv_lapic_irq irq;
+		unsigned int seq;
+
+		do {
+			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+			irq = irqfd->irqfd_lapic_irq;
+		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+
+		if (hv_should_clear_interrupt(irq.lapic_control.interrupt_type))
 			hv_call_clear_virtual_interrupt(partition->pt_id);
 
 		eventfd_signal(irqfd->irqfd_resamplefd);
@@ -198,36 +206,19 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd,
 }
 #endif
 
-static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
+static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd,
+				 const struct mshv_lapic_irq *irq)
 {
 	struct mshv_partition *partition = irqfd->irqfd_partn;
-	struct mshv_guest_irq_ent girq_ent;
-	struct mshv_lapic_irq irq;
-	unsigned int seq;
-	int idx;
-
-	idx = srcu_read_lock(&partition->pt_irq_srcu);
-
-	do {
-		seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
-		girq_ent = irqfd->irqfd_girq_ent;
-		irq = irqfd->irqfd_lapic_irq;
-	} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
-
-	if (girq_ent.guest_irq_num && !girq_ent.girq_entry_valid) {
-		srcu_read_unlock(&partition->pt_irq_srcu, idx);
-		return;
-	}
 
 #if IS_ENABLED(CONFIG_X86)
 	WARN_ON(irqfd->irqfd_resampler &&
-		!irq.lapic_control.level_triggered);
+		!irq->lapic_control.level_triggered);
 #endif
 
 	hv_call_assert_virtual_interrupt(partition->pt_id,
-					 irq.lapic_vector, irq.lapic_apic_id,
-					 irq.lapic_control);
-	srcu_read_unlock(&partition->pt_irq_srcu, idx);
+					 irq->lapic_vector, irq->lapic_apic_id,
+					 irq->lapic_control);
 }
 
 static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
@@ -316,6 +307,7 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
 	int ret = 0;
 
 	if (flags & EPOLLIN) {
+		struct mshv_guest_irq_ent girq_ent;
 		struct mshv_lapic_irq irq;
 		u64 cnt;
 
@@ -323,14 +315,18 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
 		idx = srcu_read_lock(&pt->pt_irq_srcu);
 		do {
 			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+			girq_ent = irqfd->irqfd_girq_ent;
 			irq = irqfd->irqfd_lapic_irq;
 		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
 
+		if (!girq_ent.girq_entry_valid)
+			goto out_unlock;
+
 		/* An event has been signaled, raise an interrupt */
-		ret = mshv_try_assert_irq_fast(irqfd, &irq);
-		if (ret)
-			mshv_assert_irq_slow(irqfd);
+		if (mshv_try_assert_irq_fast(irqfd, &irq))
+			mshv_assert_irq_slow(irqfd, &irq);
 
+out_unlock:
 		srcu_read_unlock(&pt->pt_irq_srcu, idx);
 
 		ret = 1;
@@ -520,8 +516,18 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 	 */
 	events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);
 
-	if (events & EPOLLIN)
-		mshv_assert_irq_slow(irqfd);
+	if (events & EPOLLIN) {
+		struct mshv_lapic_irq irq;
+		unsigned int seq;
+
+		do {
+			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+			irq = irqfd->irqfd_lapic_irq;
+		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+
+		if (irqfd->irqfd_girq_ent.girq_entry_valid)
+			mshv_assert_irq_slow(irqfd, &irq);
+	}
 
 	srcu_read_unlock(&pt->pt_irq_srcu, idx);
 	return 0;



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox