Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* [PATCH v1 1/4] page_reporting: add PAGE_REPORTING_DEFAULT_ORDER
From: Yuvraj Sakshith @ 2026-02-27 14:06 UTC (permalink / raw)
  To: akpm
  Cc: mst, david, kys, haiyangz, wei.liu, decui, longli, jasowang,
	xuanzhuo, eperezma, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
	surenb, mhocko, jackmanb, hannes, ziy, linux-hyperv,
	virtualization, linux-mm, linux-kernel
In-Reply-To: <20260227140655.360696-1-yuvraj.sakshith@oss.qualcomm.com>

Drivers can pass the order of pages to be reported while
registering themselves. Today, the default order is requested
with a magic number, 0.

Label this with PAGE_REPORTING_DEFAULT_ORDER and
check for it when the driver is being registered.

Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>
---
 include/linux/page_reporting.h | 1 +
 mm/page_reporting.c            | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
index fe648dfa3..a7e3e30f2 100644
--- a/include/linux/page_reporting.h
+++ b/include/linux/page_reporting.h
@@ -7,6 +7,7 @@
 
 /* This value should always be a power of 2, see page_reporting_cycle() */
 #define PAGE_REPORTING_CAPACITY		32
+#define PAGE_REPORTING_DEFAULT_ORDER	0
 
 struct page_reporting_dev_info {
 	/* function that alters pages to make them "reported" */
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index e4c428e61..9ad4fc3f8 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -370,7 +370,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
 	 */
 
 	if (page_reporting_order == -1) {
-		if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER)
+		if (prdev->order != PAGE_REPORTING_DEFAULT_ORDER &&
+			prdev->order <= MAX_PAGE_ORDER)
 			page_reporting_order = prdev->order;
 		else
 			page_reporting_order = pageblock_order;
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 2/4] virtio_balloon: set default page reporting order
From: Yuvraj Sakshith @ 2026-02-27 14:06 UTC (permalink / raw)
  To: akpm
  Cc: mst, david, kys, haiyangz, wei.liu, decui, longli, jasowang,
	xuanzhuo, eperezma, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
	surenb, mhocko, jackmanb, hannes, ziy, linux-hyperv,
	virtualization, linux-mm, linux-kernel
In-Reply-To: <20260227140655.360696-1-yuvraj.sakshith@oss.qualcomm.com>

The virtio_balloon page reporting order implicitly falls back to the
default (pageblock_order), as vb->pr_dev_info.order is never explicitly
initialised and is therefore left at zero.

Make the use of the default page order explicit by assigning the
PAGE_REPORTING_DEFAULT_ORDER fallback value.

Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>
---
 drivers/virtio/virtio_balloon.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 74fe59f5a..0616c03b2 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -1044,6 +1044,8 @@ static int virtballoon_probe(struct virtio_device *vdev)
 			goto out_unregister_oom;
 		}
 
+		vb->pr_dev_info.order = PAGE_REPORTING_DEFAULT_ORDER;
+
 		/*
 		 * The default page reporting order is @pageblock_order, which
 		 * corresponds to 512MB in size on ARM64 when 64KB base page
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 3/4] hv_balloon: set default page reporting order
From: Yuvraj Sakshith @ 2026-02-27 14:06 UTC (permalink / raw)
  To: akpm
  Cc: mst, david, kys, haiyangz, wei.liu, decui, longli, jasowang,
	xuanzhuo, eperezma, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
	surenb, mhocko, jackmanb, hannes, ziy, linux-hyperv,
	virtualization, linux-mm, linux-kernel
In-Reply-To: <20260227140655.360696-1-yuvraj.sakshith@oss.qualcomm.com>

Explicitly set the page reporting order to the default
by using the PAGE_REPORTING_DEFAULT_ORDER fallback value.

Reviewed-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>
---
 drivers/hv/hv_balloon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 2b4080e51..3d6bd9936 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -1663,7 +1663,7 @@ static void enable_page_reporting(void)
 	 * We let the page_reporting_order parameter decide the order
 	 * in the page_reporting code
 	 */
-	dm_device.pr_dev_info.order = 0;
+	dm_device.pr_dev_info.order = PAGE_REPORTING_DEFAULT_ORDER;
 	ret = page_reporting_register(&dm_device.pr_dev_info);
 	if (ret < 0) {
 		dm_device.pr_dev_info.report = NULL;
-- 
2.34.1


^ permalink raw reply related

* [PATCH v1 4/4] page_reporting: change PAGE_REPORTING_DEFAULT_ORDER to -1
From: Yuvraj Sakshith @ 2026-02-27 14:06 UTC (permalink / raw)
  To: akpm
  Cc: mst, david, kys, haiyangz, wei.liu, decui, longli, jasowang,
	xuanzhuo, eperezma, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
	surenb, mhocko, jackmanb, hannes, ziy, linux-hyperv,
	virtualization, linux-mm, linux-kernel
In-Reply-To: <20260227140655.360696-1-yuvraj.sakshith@oss.qualcomm.com>

PAGE_REPORTING_DEFAULT_ORDER is currently set to zero. This means that
pages of order zero cannot be reported to a client/driver, as zero
is used to signal a fallback to the default order (pageblock_order).

Change PAGE_REPORTING_DEFAULT_ORDER to (-1),
so that zero can be used as a valid order with which pages can
be reported.

Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>
---
 include/linux/page_reporting.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
index a7e3e30f2..3eb3e26d8 100644
--- a/include/linux/page_reporting.h
+++ b/include/linux/page_reporting.h
@@ -7,7 +7,7 @@
 
 /* This value should always be a power of 2, see page_reporting_cycle() */
 #define PAGE_REPORTING_CAPACITY		32
-#define PAGE_REPORTING_DEFAULT_ORDER	0
+#define PAGE_REPORTING_DEFAULT_ORDER	(-1)
 
 struct page_reporting_dev_info {
 	/* function that alters pages to make them "reported" */
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net-next] net: mana: Expose page_pool stats via ethtool
From: Andrew Lunn @ 2026-02-27 15:11 UTC (permalink / raw)
  To: Dipayaan Roy
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, dipayanroy
In-Reply-To: <aaFmRqjjOuPIEo5x@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>

> +static void mana_get_page_pool_stats(struct net_device *ndev, u64 *data)
> +{
> +#ifdef CONFIG_PAGE_POOL_STATS
> +	struct mana_port_context *apc = netdev_priv(ndev);
> +	unsigned int num_queues = apc->num_queues;
> +	struct page_pool_stats pp_stats = {};
> +	int q;
> +
> +	for (q = 0; q < num_queues; q++) {
> +		if (!apc->rxqs[q] || !apc->rxqs[q]->page_pool)
> +			continue;
> +
> +		page_pool_get_stats(apc->rxqs[q]->page_pool, &pp_stats);
> +	}
> +
> +	page_pool_ethtool_stats_get(data, &pp_stats);
> +#endif /* CONFIG_PAGE_POOL_STATS */

You should not need this #ifdef. The stubs should make the code do
sensible things if CONFIG_PAGE_POOL_STATS is not enabled.

	 Andrew

^ permalink raw reply

* Re: [PATCH] scsi: storvsc: Fix scheduling while atomic on PREEMPT_RT
From: Jan Kiszka @ 2026-02-27 15:55 UTC (permalink / raw)
  To: Martin K. Petersen, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, James E.J. Bottomley, linux-hyperv
  Cc: linux-scsi, Linux Kernel Mailing List, Florian Bezdeka, RT,
	Mitchell Levy
In-Reply-To: <177195161164.1154639.10246495163151300179.b4-ty@oracle.com>

On 24.02.26 17:47, Martin K. Petersen wrote:
> On Thu, 29 Jan 2026 15:30:39 +0100, Jan Kiszka wrote:
> 
>> This resolves the follow splat and lock-up when running with PREEMPT_RT
>> enabled on Hyper-V:
>>
>> [  415.140818] BUG: scheduling while atomic: stress-ng-iomix/1048/0x00000002
>> [  415.140822] INFO: lockdep is turned off.
>> [  415.140823] Modules linked in: intel_rapl_msr intel_rapl_common intel_uncore_frequency_common intel_pmc_core pmt_telemetry pmt_discovery pmt_class intel_pmc_ssram_telemetry intel_vsec ghash_clmulni_intel aesni_intel rapl binfmt_misc nls_ascii nls_cp437 vfat fat snd_pcm hyperv_drm snd_timer drm_client_lib drm_shmem_helper snd sg soundcore drm_kms_helper pcspkr hv_balloon hv_utils evdev joydev drm configfs efi_pstore nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vsock vmw_vmci efivarfs autofs4 ext4 crc16 mbcache jbd2 sr_mod sd_mod cdrom hv_storvsc serio_raw hid_generic scsi_transport_fc hid_hyperv scsi_mod hid hv_netvsc hyperv_keyboard scsi_common
>> [  415.140846] Preemption disabled at:
>> [  415.140847] [<ffffffffc0656171>] storvsc_queuecommand+0x2e1/0xbe0 [hv_storvsc]
>> [  415.140854] CPU: 8 UID: 0 PID: 1048 Comm: stress-ng-iomix Not tainted 6.19.0-rc7 #30 PREEMPT_{RT,(full)}
>> [  415.140856] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/04/2024
>> [  415.140857] Call Trace:
>> [  415.140861]  <TASK>
>> [  415.140861]  ? storvsc_queuecommand+0x2e1/0xbe0 [hv_storvsc]
>> [  415.140863]  dump_stack_lvl+0x91/0xb0
>> [  415.140870]  __schedule_bug+0x9c/0xc0
>> [  415.140875]  __schedule+0xdf6/0x1300
>> [  415.140877]  ? rtlock_slowlock_locked+0x56c/0x1980
>> [  415.140879]  ? rcu_is_watching+0x12/0x60
>> [  415.140883]  schedule_rtlock+0x21/0x40
>> [  415.140885]  rtlock_slowlock_locked+0x502/0x1980
>> [  415.140891]  rt_spin_lock+0x89/0x1e0
>> [  415.140893]  hv_ringbuffer_write+0x87/0x2a0
>> [  415.140899]  vmbus_sendpacket_mpb_desc+0xb6/0xe0
>> [  415.140900]  ? rcu_is_watching+0x12/0x60
>> [  415.140902]  storvsc_queuecommand+0x669/0xbe0 [hv_storvsc]
>> [  415.140904]  ? HARDIRQ_verbose+0x10/0x10
>> [  415.140908]  ? __rq_qos_issue+0x28/0x40
>> [  415.140911]  scsi_queue_rq+0x760/0xd80 [scsi_mod]
>> [  415.140926]  __blk_mq_issue_directly+0x4a/0xc0
>> [  415.140928]  blk_mq_issue_direct+0x87/0x2b0
>> [  415.140931]  blk_mq_dispatch_queue_requests+0x120/0x440
>> [  415.140933]  blk_mq_flush_plug_list+0x7a/0x1a0
>> [  415.140935]  __blk_flush_plug+0xf4/0x150
>> [  415.140940]  __submit_bio+0x2b2/0x5c0
>> [  415.140944]  ? submit_bio_noacct_nocheck+0x272/0x360
>> [  415.140946]  submit_bio_noacct_nocheck+0x272/0x360
>> [  415.140951]  ext4_read_bh_lock+0x3e/0x60 [ext4]
>> [  415.140995]  ext4_block_write_begin+0x396/0x650 [ext4]
>> [  415.141018]  ? __pfx_ext4_da_get_block_prep+0x10/0x10 [ext4]
>> [  415.141038]  ext4_da_write_begin+0x1c4/0x350 [ext4]
>> [  415.141060]  generic_perform_write+0x14e/0x2c0
>> [  415.141065]  ext4_buffered_write_iter+0x6b/0x120 [ext4]
>> [  415.141083]  vfs_write+0x2ca/0x570
>> [  415.141087]  ksys_write+0x76/0xf0
>> [  415.141089]  do_syscall_64+0x99/0x1490
>> [  415.141093]  ? rcu_is_watching+0x12/0x60
>> [  415.141095]  ? finish_task_switch.isra.0+0xdf/0x3d0
>> [  415.141097]  ? rcu_is_watching+0x12/0x60
>> [  415.141098]  ? lock_release+0x1f0/0x2a0
>> [  415.141100]  ? rcu_is_watching+0x12/0x60
>> [  415.141101]  ? finish_task_switch.isra.0+0xe4/0x3d0
>> [  415.141103]  ? rcu_is_watching+0x12/0x60
>> [  415.141104]  ? __schedule+0xb34/0x1300
>> [  415.141106]  ? hrtimer_try_to_cancel+0x1d/0x170
>> [  415.141109]  ? do_nanosleep+0x8b/0x160
>> [  415.141111]  ? hrtimer_nanosleep+0x89/0x100
>> [  415.141114]  ? __pfx_hrtimer_wakeup+0x10/0x10
>> [  415.141116]  ? xfd_validate_state+0x26/0x90
>> [  415.141118]  ? rcu_is_watching+0x12/0x60
>> [  415.141120]  ? do_syscall_64+0x1e0/0x1490
>> [  415.141121]  ? do_syscall_64+0x1e0/0x1490
>> [  415.141123]  ? rcu_is_watching+0x12/0x60
>> [  415.141124]  ? do_syscall_64+0x1e0/0x1490
>> [  415.141125]  ? do_syscall_64+0x1e0/0x1490
>> [  415.141127]  ? irqentry_exit+0x140/0x7e0
>> [  415.141129]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
>>
>> [...]
> 
> Applied to 7.0/scsi-fixes, thanks!
> 
> [1/1] scsi: storvsc: Fix scheduling while atomic on PREEMPT_RT
>       https://git.kernel.org/mkp/scsi/c/57297736c082
> 

Should it be here then already?

https://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git/log/?h=7.0/scsi-fixes

Sorry, just trying to understand the process.

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center

^ permalink raw reply

* Re: [PATCH net-next] net: mana: Expose page_pool stats via ethtool
From: Jakub Kicinski @ 2026-02-27 17:27 UTC (permalink / raw)
  To: Dipayaan Roy
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
	ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, dipayanroy
In-Reply-To: <aaFmRqjjOuPIEo5x@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>

On Fri, 27 Feb 2026 01:39:18 -0800 Dipayaan Roy wrote:
> MANA relies on page_pool for RX buffers, and the buffer refill paths
> can behave quite differently across architectures and configurations (e.g.
> base page size, fragment vs full-page usage). This makes it harder to
> understand and compare RX buffer behavior when investigating performance
> and memory differences across platforms.

Standard stats must not be duplicated in ethtool -S.
ynl and ynltool provide easy access to these stats:

# ynltool page-pool stats 
    eth0[2]	page pools: 44 (zombies: 0)
		refs: 495680 bytes: 2030305280 (refs: 0 bytes: 0)
		recycling: 100.0% (alloc: 7745:2097593009 recycle: 379301630:1717888312)

^ permalink raw reply

* Re: [PATCH net-next v4] net: mana: Add MAC address to vPort logs and clarify error messages
From: Erni Sri Satya Vennela @ 2026-02-27 19:06 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, dipayanroy, shirazsaleem, ssengar,
	shradhagupta, gargaditya, linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260225192252.943534-1-ernis@linux.microsoft.com>

On Wed, Feb 25, 2026 at 11:22:41AM -0800, Erni Sri Satya Vennela wrote:
> Add MAC address to vPort configuration success message and update error
> message to be more specific about HWC message errors in
> mana_send_request.
> 
> Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>

Gentle ping — I sent this patch on 25/02/2026 and would appreciate any
feedback when you have time.  
Happy to rebase or add more details if needed, thanks for your review.

Regards,
Vennela

^ permalink raw reply

* RE: [PATCH net-next, v2] net: mana: Trigger VF reset/recovery on health check failure due to HWC timeout
From: Haiyang Zhang @ 2026-02-27 19:24 UTC (permalink / raw)
  To: Dipayaan Roy, KY Srinivasan, wei.liu@kernel.org, Dexuan Cui,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, leon@kernel.org, Long Li,
	Konstantin Taranov, horms@kernel.org,
	shradhagupta@linux.microsoft.com, ssengar@linux.microsoft.com,
	ernis@linux.microsoft.com, Shiraz Saleem,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
	Dipayaan Roy
In-Reply-To: <aaFShvKnwR5FY8dH@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>



> -----Original Message-----
> From: Dipayaan Roy <dipayanroy@linux.microsoft.com>
> Sent: Friday, February 27, 2026 3:15 AM
> To: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> <haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> <DECUI@microsoft.com>; andrew+netdev@lunn.ch; davem@davemloft.net;
> edumazet@google.com; kuba@kernel.org; pabeni@redhat.com; leon@kernel.org;
> Long Li <longli@microsoft.com>; Konstantin Taranov
> <kotaranov@microsoft.com>; horms@kernel.org;
> shradhagupta@linux.microsoft.com; ssengar@linux.microsoft.com;
> ernis@linux.microsoft.com; Shiraz Saleem <shirazsaleem@microsoft.com>;
> linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; linux-rdma@vger.kernel.org; Dipayaan Roy
> <dipayanroy@microsoft.com>
> Subject: [PATCH net-next, v2] net: mana: Trigger VF reset/recovery on
> health check failure due to HWC timeout
> 
> The GF stats periodic query is used as mechanism to monitor HWC health
> check. If this HWC command times out, it is a strong indication that
> the device/SoC is in a faulty state and requires recovery.
> 
> Today, when a timeout is detected, the driver marks
> hwc_timeout_occurred, clears cached stats, and stops rescheduling the
> periodic work. However, the device itself is left in the same failing
> state.
> 
> Extend the timeout handling path to trigger the existing MANA VF
> recovery service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item.
> This is expected to initiate the appropriate recovery flow by suspende
> resume first and if it fails then trigger a bus rescan.
> 
> This change is intentionally limited to HWC command timeouts and does
> not trigger recovery for errors reported by the SoC as a normal command
> response.
> 
> Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
> ---
> Changes in v2:
>   - Added common helper, proper clearing of gc flags.
> ---

Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Thanks.


^ permalink raw reply

* Re: VFIO support on hyperv (vfio_pci_core_ioctl())
From: Alex Williamson @ 2026-02-27 19:29 UTC (permalink / raw)
  To: Mukesh R; +Cc: kvm, wei.liu@kernel.org, linux-hyperv@vger.kernel.org, alex
In-Reply-To: <1f50dae2-ec4a-7914-a14f-2ada803eb0e3@linux.microsoft.com>

On Wed, 25 Feb 2026 14:04:49 -0800
Mukesh R <mrathor@linux.microsoft.com> wrote:

> Hi Alex et al:
> 
> I've been looking at making pci passthru irq setup/remap work on hyperv
> for the latest (6.19) version using vfio core. Unfortunately, it's just
> not fitting well because in case of hyperv the irq remap is done by
> the hypervisor. Specifically, for a robust and proper solution, we need
> to override vfio_pci_set_msi_trigger(). As such, for the best way forward
> I am trying to figure how much flexibility there is to modify
> vfio_pci_intrs.c with "if (running_on_hyperv())" branches (putting hyperv
> code in separate file).
> 
> If none, then the alternative would be to create vfio-hyperv.c with
> vfio_device_ops.ioctl = hyperv_vfio_pci_core_ioctl(). But, then I'd
> be replicating code for other sub ioctls like vfio_pci_ioctl_get_info(),
> vfio_pci_ioctl_get_irq_info(), etc. Would it be acceptable to make them
> non static in this case?
> 
> Please let me know your thoughts or if you have other suggestions.

Hi Mukesh,

In general, littering the code with running_on_hyperv() tests is not
acceptable, but the presented alternative isn't really accurate either.
If you want to substitute in your own ioctl callback, you can still
call vfio_pci_core_ioctl() for all the unhandled ioctls, without extra
exports.  We can also look at whether vfio_pci_device_ops could have a
callback specifically addressing an alternative set_msi_trigger
handler.  Thanks,

Alex

^ permalink raw reply

* Re: [PATCH v1 5/6] x86/hyperv: Implement hypervisor ram collection into vmcore
From: Mukesh R @ 2026-02-27 20:05 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-hyperv, linux-kernel, linux-arch
  Cc: kys, haiyangz, wei.liu, decui, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, dave.hansen, x86, H . Peter Anvin, Arnd Bergmann
In-Reply-To: <eb1c44d7-2664-4269-8824-e90e5a8494b2@app.fastmail.com>

On 2/25/26 23:44, Ard Biesheuvel wrote:
> 
> On Wed, 25 Feb 2026, at 23:27, Mukesh R wrote:
>> On 2/21/26 08:43, Ard Biesheuvel wrote:
>>> Just spotted this code in v7.0-rc
>>>
>>> On Wed, 10 Sep 2025, at 02:10, Mukesh Rathor wrote:
>>> ...
>>>
>>>> +static asmlinkage void __noreturn hv_crash_c_entry(void)
>>>
>>> 'asmlinkage' means that the function may be called from another compilation unit written in assembler, but it doesn't actually evaluate to anything in most cases. Combining it with 'static' makes no sense whatsoever.
>>
>> 'static' means scope is limited to the file. Common in cases where function
>> pointers are used, like here in this file way below.
>>
>> Like the comment says:
>>       "This is the C entry point from the asm glue code after...."
>>
>> IOW, called from assembly function (asm == assembly).
>>
> 
> I wasn't asking you to explain what 'static' means. I was explaining to you that asmlinkage means 'external linkage' whereas 'static' means the opposite, and so combining them makes no sense.
> 
> 
>>>
>>>> +{
>>>> +	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
>>>> +
>>>> +	/* first thing, restore kernel gdt */
>>>> +	native_load_gdt(&ctxt->gdtr);
>>>> +
>>>> +	asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
>>>> +	asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
>>>> +
>>>
>>> This code is truly very broken. You cannot enter a C function without a stack, and assign RSP half way down the function. Especially after allocating local variables and/or calling other functions - it may happen to work in most cases, but it is very fragile. (Other architectures have the concept of 'naked' functions for this purpose but x86 does not)
>>
>> Local variable refers to static bss struct. IOW,
>>
>>         asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
>>
>> same as:
>>         asm volatile("movq %0, %%rsp" : : "m"(&hv_crash_ctxt.rsp));
>>
>>
> 
> No, it is *not* the same. In practice, the compiler might perform this substitution, but there is no guarantee that this happens.
> 
> 
>>> IOW, this whole function should be written in asm.
>>>> +	asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
>>>> +	asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
>>>> +	asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
>>>> +	asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
>>>> +
>>>> +	native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
>>>> +	asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
>>>> +
>>>> +	asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
>>>> +	asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
>>>> +	asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
>>>> +
>>>> +	native_load_idt(&ctxt->idtr);
>>>> +	native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
>>>> +	native_wrmsrq(MSR_EFER, ctxt->efer);
>>>> +
>>>> +	/* restore the original kernel CS now via far return */
>>>> +	asm volatile("movzwq %0, %%rax\n\t"
>>>> +		     "pushq %%rax\n\t"
>>>> +		     "pushq $1f\n\t"
>>>> +		     "lretq\n\t"
>>>> +		     "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
>>>> +
>>>> +	/* We are in asmlinkage without stack frame,
>>>
>>> You just switched to __KERNEL_CS via the stack.
>>
>> compiler doesn't know that.
>>
> 
> So? But does it means to 'be in asmlinkage' in your interpretation? Did you check what 'asmlinkage' actually evaluates to?
> 
> I am not asking you to justify why this broken code works in practice, I am asking you to fix it.


STOP bossing me! I am not your servant nor your slave. And you are not the
only genius around here.

Now, many people looked at this code before it was merged and no one really
thought any self respecting compiler in modern times would create an issue
here. Still, I see the remote possibility of that happening. All you had
to do was to show your concern and suggest using __naked here (which looks
like we all missed, or maybe it came after the code was written), and it
would have been addressed. This is x64 specific code for very special case
of hyperv or kernel-on-hyperv crashing.

In future if you choose to correspond, watch your tone!



>>>> hence make a C function
>>>> +	 * call which will buy stack frame to restore the tss or clear PT
>>>> entry.
>>>> +	 */
>>>
>>> Where does one buy a stack frame?
>>
>> A stack market :).  Callee will create stack frame now that rsp is
>> setup.
>>
> 
> This code is beyond broken. Please propose fixes rather than try to argue why carrying broken code like this is acceptable.


^ permalink raw reply

* Re: VFIO support on hyperv (vfio_pci_core_ioctl())
From: Mukesh R @ 2026-02-27 20:06 UTC (permalink / raw)
  To: Alex Williamson; +Cc: kvm, wei.liu@kernel.org, linux-hyperv@vger.kernel.org
In-Reply-To: <20260227122957.1e555024@shazbot.org>

On 2/27/26 11:29, Alex Williamson wrote:
> On Wed, 25 Feb 2026 14:04:49 -0800
> Mukesh R <mrathor@linux.microsoft.com> wrote:
> 
>> Hi Alex et al:
>>
>> I've been looking at making pci passthru irq setup/remap work on hyperv
>> for the latest (6.19) version using vfio core. Unfortunately, it's just
>> not fitting well because in case of hyperv the irq remap is done by
>> the hypervisor. Specifically, for a robust and proper solution, we need
>> to override vfio_pci_set_msi_trigger(). As such, for the best way forward
>> I am trying to figure how much flexibility there is to modify
>> vfio_pci_intrs.c with "if (running_on_hyperv())" branches (putting hyperv
>> code in separate file).
>>
>> If none, then the alternative would be to create vfio-hyperv.c with
>> vfio_device_ops.ioctl = hyperv_vfio_pci_core_ioctl(). But, then I'd
>> be replicating code for other sub ioctls like vfio_pci_ioctl_get_info(),
>> vfio_pci_ioctl_get_irq_info(), etc. Would it be acceptable to make them
>> non static in this case?
>>
>> Please let me know your thoughts or if you have other suggestions.
> 
> Hi Mukesh,
> 
> In general, littering the code with running_on_hyperv() tests is not
> acceptable, but the presented alternative isn't really accurate either.
> If you want to substitute in your own ioctl callback, you can still
> call vfio_pci_core_ioctl() for all the unhandled ioctls, without extra

Yes, I realized that after looking at how other callers were using it.

> exports.  We can also look at whether vfio_pci_device_ops could have a
> callback specifically addressing an alternative set_msi_trigger
> handler.  Thanks,

Sounds good. thanks as always,
-Mukesh


> Alex


^ permalink raw reply

* Re: [PATCH v1 0/4] Allow order zero pages in page reporting
From: David Hildenbrand (Arm) @ 2026-02-27 20:44 UTC (permalink / raw)
  To: Yuvraj Sakshith, akpm
  Cc: mst, kys, haiyangz, wei.liu, decui, longli, jasowang, xuanzhuo,
	eperezma, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
	mhocko, jackmanb, hannes, ziy, linux-hyperv, virtualization,
	linux-mm, linux-kernel
In-Reply-To: <20260227140655.360696-1-yuvraj.sakshith@oss.qualcomm.com>

On 2/27/26 15:06, Yuvraj Sakshith wrote:
> Today, page reporting sets page_reporting_order in two ways:
> 
> (1) page_reporting.page_reporting_order cmdline parameter
> (2) Driver can pass order while registering itself.
> 
> In both cases, order zero is ignored by free page reporting
> because it is used to set page_reporting_order to a default
> value, like MAX_PAGE_ORDER.
> 
> In some cases we might want page_reporting_order to be zero.
> 
> For instance, when virtio-balloon runs inside a guest with
> tiny memory (say, 16MB), it might not be able to find a order 1 page
> (or in the worst case order MAX_PAGE_ORDER page) after some uptime.
> Page reporting should be able to return order zero pages back for
> optimal memory relinquishment.
> 
> This patch changes the default fallback value from '0' to '-1' in
> all possible clients of free page reporting (hv_balloon and
> virtio-balloon) together with allowing '0' as a valid order in
> page_reporting_register().
> 
> Changes in v1:
> - Introduce PAGE_REPORTING_DEFAULT_ORDER macro (initially set to 0).
> - Make use of new macro in drivers (hv_balloon and virtio-balloon)
> 	working with page reporting.
> - Change PAGE_REPORTING_DEFAULT_ORDER to -1 as zero is a valid
> 	page order that can be requested.
> 
> Yuvraj Sakshith (3):
>   mm/page_reporting: Allow zero page_reporting_order
>   hv_balloon: Change default page reporting order
>   virtio_balloon: Set pr_dev.order to new default

These look like old stats :)

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v1 1/4] page_reporting: add PAGE_REPORTING_DEFAULT_ORDER
From: David Hildenbrand (Arm) @ 2026-02-27 20:45 UTC (permalink / raw)
  To: Yuvraj Sakshith, akpm
  Cc: mst, kys, haiyangz, wei.liu, decui, longli, jasowang, xuanzhuo,
	eperezma, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
	mhocko, jackmanb, hannes, ziy, linux-hyperv, virtualization,
	linux-mm, linux-kernel
In-Reply-To: <20260227140655.360696-2-yuvraj.sakshith@oss.qualcomm.com>

On 2/27/26 15:06, Yuvraj Sakshith wrote:
> Drivers can pass order of pages to be reported while
> registering itself. Today, this is a magic number, 0.
> 
> Label this with PAGE_REPORTING_DEFAULT_ORDER and
> check for it when the driver is being registered.

Patch subject: "mm/page_reporting:"

Might want to add "We'll make explicit use of this define in relevant
drivers next."

Acked-by: David Hildenbrand (Arm) <david@kernel.org>


-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v1 2/4] virtio_balloon: set default page reporting order
From: David Hildenbrand (Arm) @ 2026-02-27 20:46 UTC (permalink / raw)
  To: Yuvraj Sakshith, akpm
  Cc: mst, kys, haiyangz, wei.liu, decui, longli, jasowang, xuanzhuo,
	eperezma, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
	mhocko, jackmanb, hannes, ziy, linux-hyperv, virtualization,
	linux-mm, linux-kernel
In-Reply-To: <20260227140655.360696-3-yuvraj.sakshith@oss.qualcomm.com>

On 2/27/26 15:06, Yuvraj Sakshith wrote:
> virtio_balloon page reporting order is set to MAX_PAGE_ORDER implicitly
> as vb->prdev.order is never initialised and is auto-set to zero.
> 
> Explicitly mention usage of default page order by making use of
> PAGE_REPORTING_DEFAULT ORDER fallback value.
> 
> Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v1 4/4] page_reporting: change PAGE_REPORTING_DEFAULT_ORDER to -1
From: David Hildenbrand (Arm) @ 2026-02-27 20:50 UTC (permalink / raw)
  To: Yuvraj Sakshith, akpm
  Cc: mst, kys, haiyangz, wei.liu, decui, longli, jasowang, xuanzhuo,
	eperezma, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
	mhocko, jackmanb, hannes, ziy, linux-hyperv, virtualization,
	linux-mm, linux-kernel
In-Reply-To: <20260227140655.360696-5-yuvraj.sakshith@oss.qualcomm.com>

On 2/27/26 15:06, Yuvraj Sakshith wrote:
> PAGE_REPORTING_DEFAULT_ORDER is now set to zero. This means,
> pages of order zero cannot be reported to a client/driver -- as zero
> is used to signal a fallback to MAX_PAGE_ORDER.
> 
> Change PAGE_REPORTING_DEFAULT_ORDER to (-1),
> so that zero can be used as a valid order with which pages can
> be reported.
> 
> Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>
> ---
>  include/linux/page_reporting.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
> index a7e3e30f2..3eb3e26d8 100644
> --- a/include/linux/page_reporting.h
> +++ b/include/linux/page_reporting.h
> @@ -7,7 +7,7 @@
>  
>  /* This value should always be a power of 2, see page_reporting_cycle() */
>  #define PAGE_REPORTING_CAPACITY		32
> -#define PAGE_REPORTING_DEFAULT_ORDER	0
> +#define PAGE_REPORTING_DEFAULT_ORDER	(-1)

No need for the ().

Wondering whether we now also want to do in this patch:


diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index f0042d5743af..d432aadf9d07 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -11,8 +11,7 @@
 #include "page_reporting.h"
 #include "internal.h"

-/* Initialize to an unsupported value */
-unsigned int page_reporting_order = -1;
+unsigned int page_reporting_order = PAGE_REPORTING_DEFAULT_ORDER;

 static int page_order_update_notify(const char *val, const struct
kernel_param *kp)
 {
@@ -369,7 +368,7 @@ int page_reporting_register(struct
page_reporting_dev_info *prdev)
         * pageblock_order.
         */

-       if (page_reporting_order == -1) {
+       if (page_reporting_order == PAGE_REPORTING_DEFAULT_ORDER) {



(and wondering whether we should have called it
PAGE_REPORTING_USE_DEFAULT_ORDER to make it clearer that it is not an
actual order. Leaving that up to you :) )

-- 
Cheers,

David

^ permalink raw reply related

* Re: [PATCH v1 5/6] x86/hyperv: Implement hypervisor ram collection into vmcore
From: Wei Liu @ 2026-02-27 21:37 UTC (permalink / raw)
  To: Mukesh R, ardb
  Cc: Ard Biesheuvel, linux-hyperv, linux-kernel, linux-arch, kys,
	haiyangz, wei.liu, decui, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, dave.hansen, x86, H . Peter Anvin, Arnd Bergmann
In-Reply-To: <6a601546-a26f-79f6-a3b0-be145dfa7781@linux.microsoft.com>

On Fri, Feb 27, 2026 at 12:05:13PM -0800, Mukesh R wrote:
[...]
> > 
> > So? But what does it mean to 'be in asmlinkage' in your interpretation? Did you check what 'asmlinkage' actually evaluates to?
> > 
> > I am not asking you to justify why this broken code works in practice, I am asking you to fix it.
> 
> 
> STOP bossing me! I am not your servant nor your slave. And you are not the
> only genius around here.
> 
> Now, many people looked at this code before it was merged and no one really
> thought any self respecting compiler in modern times would create an issue
> here. Still, I see the remote possibility of that happening. All you had
> to do was to show your concern and suggest using __naked here (which looks
> like we all missed, or maybe it came after the code was written), and it
> would have been addressed. This is x64 specific code for very special case
> of hyperv or kernel-on-hyperv crashing.
> 
> In future if you choose to correspond, watch your tone!

Mukesh, there is no need to be so emotional and defensive.

I don't think anyone, no matter how good he or she is, knows all the
intricacies in the kernel. We're lucky to have other people look at our
code and point out potential issues. Regardless of your opinion on the
discussion, we should be thankful for the time and effort people put
into even sending an email, let alone a patch.

Let's keep the discussion civil and constructive, and focus on the
technical aspects of the code.

Ard, I want to let you know that I appreciate you raising this issue
with us.

Thanks,
Wei

^ permalink raw reply

* Re: [RFT PATCH] x86/hyperv: Use __naked attribute to fix stackless C function
From: Wei Liu @ 2026-02-27 21:50 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: linux-kernel, Ard Biesheuvel, Mukesh Rathor, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H. Peter Anvin,
	Uros Bizjak, linux-hyperv
In-Reply-To: <20260226095056.46410-2-ardb+git@google.com>

On Thu, Feb 26, 2026 at 10:50:57AM +0100, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> hv_crash_c_entry() is a C function that is entered without a stack,
> and this is only allowed for functions that have the __naked attribute,
> which informs the compiler that it must not emit the usual prologue and
> epilogue or emit any other kind of instrumentation that relies on a
> stack frame.
> 
> So split up the function, and set the __naked attribute on the initial
> part that sets up the stack, GDT, IDT and other pieces that are needed
> for ordinary C execution. Given that function calls are not permitted
> either, use the existing long return coded in an asm() block to call the
> second part of the function, which is an ordinary function that is
> permitted to call other functions as usual.
> 
> Fixes: 94212d34618c ("x86/hyperv: Implement hypervisor RAM collection into vmcore")
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

Ard, thank you for the patch.

For avoidance of doubt, I expect another version to be sent. We will
review and test the new version on our side.

Wei

^ permalink raw reply

* Re: [PATCH v1 5/6] x86/hyperv: Implement hypervisor ram collection into vmcore
From: Ard Biesheuvel @ 2026-02-27 22:10 UTC (permalink / raw)
  To: Wei Liu, Mukesh Rathor
  Cc: linux-hyperv, linux-kernel, linux-arch, kys, haiyangz, decui,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, dave.hansen, x86,
	H . Peter Anvin, Arnd Bergmann
In-Reply-To: <20260227213733.GA976651@liuwe-devbox-debian-v2.local>



On Fri, 27 Feb 2026, at 22:37, Wei Liu wrote:
> On Fri, Feb 27, 2026 at 12:05:13PM -0800, Mukesh R wrote:
> [...]
>> > 
>> > So? But what does it mean to 'be in asmlinkage' in your interpretation? Did you check what 'asmlinkage' actually evaluates to?
>> > 
>> > I am not asking you to justify why this broken code works in practice, I am asking you to fix it.
>> 
>> 
>> STOP bossing me! I am not your servant nor your slave. And you are not the
>> only genius around here.
>> 
>> Now, many people looked at this code before it was merged and no one really
>> thought any self respecting compiler in modern times would create an issue
>> here. Still, I see the remote possibility of that happening. All you had
>> to do was to show your concern and suggest using __naked here (which looks
>> like we all missed, or maybe it came after the code was written), and it
>> would have been addressed. This is x64 specific code for very special case
>> of hyperv or kernel-on-hyperv crashing.
>> 
>> In future if you choose to correspond, watch your tone!
>
> Mukesh, there is no need to be so emotional and defensive.
>
> I don't think anyone, no matter how good he or she is, knows all the
> intricacies in the kernel. We're lucky to have other people look at our
> code and point out potential issues. Regardless of your opinion on the
> discussion, we should be thankful for the time and effort people put
> into even sending an email, let alone a patch.
>
> Let's keep the discussion civil and constructive, and focus on the
> technical aspects of the code.
>
> Ard, I want to let you know that I appreciate you raising this issue
> with us.
>

Much appreciated. And apologies to Mukesh for my harsh tone - I should have been more diplomatic in my response.


^ permalink raw reply

* [PATCH v2] x86/hyperv: Use __naked attribute to fix stackless C function
From: Ard Biesheuvel @ 2026-02-27 22:40 UTC (permalink / raw)
  To: linux-kernel
  Cc: x86, Ard Biesheuvel, Mukesh Rathor, Wei Liu, Uros Bizjak,
	Andrew Cooper, linux-hyperv

hv_crash_c_entry() is a C function that is entered without a stack,
and this is only allowed for functions that have the __naked attribute,
which informs the compiler that it must not emit the usual prologue and
epilogue or emit any other kind of instrumentation that relies on a
stack frame.

So split up the function, and set the __naked attribute on the initial
part that sets up the stack, GDT, IDT and other pieces that are needed
for ordinary C execution. Given that function calls are not permitted
either, use the existing long return coded in an asm() block to call the
second part of the function, which is an ordinary function that is
permitted to call other functions as usual.

Cc: Mukesh Rathor <mrathor@linux.microsoft.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Uros Bizjak <ubizjak@gmail.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: linux-hyperv@vger.kernel.org
Fixes: 94212d34618c ("x86/hyperv: Implement hypervisor RAM collection into vmcore")
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
v2: apply some asm tweaks suggested by Uros and Andrew

 arch/x86/hyperv/hv_crash.c | 79 ++++++++++----------
 1 file changed, 41 insertions(+), 38 deletions(-)

diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
index 92da1b4f2e73..1c0965eb346e 100644
--- a/arch/x86/hyperv/hv_crash.c
+++ b/arch/x86/hyperv/hv_crash.c
@@ -107,14 +107,12 @@ static void __noreturn hv_panic_timeout_reboot(void)
 		cpu_relax();
 }
 
-/* This cannot be inlined as it needs stack */
-static noinline __noclone void hv_crash_restore_tss(void)
+static void hv_crash_restore_tss(void)
 {
 	load_TR_desc();
 }
 
-/* This cannot be inlined as it needs stack */
-static noinline void hv_crash_clear_kernpt(void)
+static void hv_crash_clear_kernpt(void)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -125,6 +123,25 @@ static noinline void hv_crash_clear_kernpt(void)
 	native_p4d_clear(p4d);
 }
 
+
+static void __noreturn hv_crash_handle(void)
+{
+	hv_crash_restore_tss();
+	hv_crash_clear_kernpt();
+
+	/* we are now fully in devirtualized normal kernel mode */
+	__crash_kexec(NULL);
+
+	hv_panic_timeout_reboot();
+}
+
+/*
+ * __naked functions do not permit function calls, not even to __always_inline
+ * functions that only contain asm() blocks themselves. So use a macro instead.
+ */
+#define hv_wrmsr(msr, val) \
+	asm("wrmsr" :: "c"(msr), "a"((u32)val), "d"((u32)(val >> 32)) : "memory")
+
 /*
  * This is the C entry point from the asm glue code after the disable hypercall.
  * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
@@ -133,49 +150,35 @@ static noinline void hv_crash_clear_kernpt(void)
  * available. We restore kernel GDT, and rest of the context, and continue
  * to kexec.
  */
-static asmlinkage void __noreturn hv_crash_c_entry(void)
+static void __naked hv_crash_c_entry(void)
 {
-	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
-
 	/* first thing, restore kernel gdt */
-	native_load_gdt(&ctxt->gdtr);
+	asm volatile("lgdt %0" : : "m" (hv_crash_ctxt.gdtr));
 
-	asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
-	asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
+	asm volatile("movw %0, %%ss" : : "m"(hv_crash_ctxt.ss));
+	asm volatile("movq %0, %%rsp" : : "m"(hv_crash_ctxt.rsp));
 
-	asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
-	asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
-	asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
-	asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
+	asm volatile("movw %0, %%ds" : : "m"(hv_crash_ctxt.ds));
+	asm volatile("movw %0, %%es" : : "m"(hv_crash_ctxt.es));
+	asm volatile("movw %0, %%fs" : : "m"(hv_crash_ctxt.fs));
+	asm volatile("movw %0, %%gs" : : "m"(hv_crash_ctxt.gs));
 
-	native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
-	asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
+	hv_wrmsr(MSR_IA32_CR_PAT, hv_crash_ctxt.pat);
+	asm volatile("movq %0, %%cr0" : : "r"(hv_crash_ctxt.cr0));
 
-	asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
-	asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
-	asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
+	asm volatile("movq %0, %%cr8" : : "r"(hv_crash_ctxt.cr8));
+	asm volatile("movq %0, %%cr4" : : "r"(hv_crash_ctxt.cr4));
+	asm volatile("movq %0, %%cr2" : : "r"(hv_crash_ctxt.cr4));
 
-	native_load_idt(&ctxt->idtr);
-	native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
-	native_wrmsrq(MSR_EFER, ctxt->efer);
+	asm volatile("lidt %0" : : "m" (hv_crash_ctxt.idtr));
+	hv_wrmsr(MSR_GS_BASE, hv_crash_ctxt.gsbase);
+	hv_wrmsr(MSR_EFER, hv_crash_ctxt.efer);
 
 	/* restore the original kernel CS now via far return */
-	asm volatile("movzwq %0, %%rax\n\t"
-		     "pushq %%rax\n\t"
-		     "pushq $1f\n\t"
-		     "lretq\n\t"
-		     "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
-
-	/* We are in asmlinkage without stack frame, hence make C function
-	 * calls which will buy stack frames.
-	 */
-	hv_crash_restore_tss();
-	hv_crash_clear_kernpt();
-
-	/* we are now fully in devirtualized normal kernel mode */
-	__crash_kexec(NULL);
-
-	hv_panic_timeout_reboot();
+	asm volatile("pushq %q0\n\t"
+		     "pushq %q1\n\t"
+		     "lretq"
+		     :: "r"(hv_crash_ctxt.cs), "r"(hv_crash_handle));
 }
 /* Tell gcc we are using lretq long jump in the above function intentionally */
 STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH v2] x86/hyperv: Use __naked attribute to fix stackless C function
From: Mukesh R @ 2026-02-27 23:03 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-kernel
  Cc: x86, Wei Liu, Uros Bizjak, Andrew Cooper, linux-hyperv
In-Reply-To: <20260227224030.299993-2-ardb@kernel.org>

On 2/27/26 14:40, Ard Biesheuvel wrote:
> hv_crash_c_entry() is a C function that is entered without a stack,
> and this is only allowed for functions that have the __naked attribute,
> which informs the compiler that it must not emit the usual prologue and
> epilogue or emit any other kind of instrumentation that relies on a
> stack frame.
> 
> So split up the function, and set the __naked attribute on the initial
> part that sets up the stack, GDT, IDT and other pieces that are needed
> for ordinary C execution. Given that function calls are not permitted
> either, use the existing long return coded in an asm() block to call the
> second part of the function, which is an ordinary function that is
> permitted to call other functions as usual.

Thank you for the patch. I'll start a build on the side and test it
out and let you know.

Thanks,
-Mukesh



> Cc: Mukesh Rathor <mrathor@linux.microsoft.com>
> Cc: Wei Liu <wei.liu@kernel.org>
> Cc: Uros Bizjak <ubizjak@gmail.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: linux-hyperv@vger.kernel.org
> Fixes: 94212d34618c ("x86/hyperv: Implement hypervisor RAM collection into vmcore")
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> v2: apply some asm tweaks suggested by Uros and Andrew
> 
>   arch/x86/hyperv/hv_crash.c | 79 ++++++++++----------
>   1 file changed, 41 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
> index 92da1b4f2e73..1c0965eb346e 100644
> --- a/arch/x86/hyperv/hv_crash.c
> +++ b/arch/x86/hyperv/hv_crash.c
> @@ -107,14 +107,12 @@ static void __noreturn hv_panic_timeout_reboot(void)
>   		cpu_relax();
>   }
>   
> -/* This cannot be inlined as it needs stack */
> -static noinline __noclone void hv_crash_restore_tss(void)
> +static void hv_crash_restore_tss(void)
>   {
>   	load_TR_desc();
>   }
>   
> -/* This cannot be inlined as it needs stack */
> -static noinline void hv_crash_clear_kernpt(void)
> +static void hv_crash_clear_kernpt(void)
>   {
>   	pgd_t *pgd;
>   	p4d_t *p4d;
> @@ -125,6 +123,25 @@ static noinline void hv_crash_clear_kernpt(void)
>   	native_p4d_clear(p4d);
>   }
>   
> +
> +static void __noreturn hv_crash_handle(void)
> +{
> +	hv_crash_restore_tss();
> +	hv_crash_clear_kernpt();
> +
> +	/* we are now fully in devirtualized normal kernel mode */
> +	__crash_kexec(NULL);
> +
> +	hv_panic_timeout_reboot();
> +}
> +
> +/*
> + * __naked functions do not permit function calls, not even to __always_inline
> + * functions that only contain asm() blocks themselves. So use a macro instead.
> + */
> +#define hv_wrmsr(msr, val) \
> +	asm("wrmsr" :: "c"(msr), "a"((u32)val), "d"((u32)(val >> 32)) : "memory")
> +
>   /*
>    * This is the C entry point from the asm glue code after the disable hypercall.
>    * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
> @@ -133,49 +150,35 @@ static noinline void hv_crash_clear_kernpt(void)
>    * available. We restore kernel GDT, and rest of the context, and continue
>    * to kexec.
>    */
> -static asmlinkage void __noreturn hv_crash_c_entry(void)
> +static void __naked hv_crash_c_entry(void)
>   {
> -	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
> -
>   	/* first thing, restore kernel gdt */
> -	native_load_gdt(&ctxt->gdtr);
> +	asm volatile("lgdt %0" : : "m" (hv_crash_ctxt.gdtr));
>   
> -	asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
> -	asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
> +	asm volatile("movw %0, %%ss" : : "m"(hv_crash_ctxt.ss));
> +	asm volatile("movq %0, %%rsp" : : "m"(hv_crash_ctxt.rsp));
>   
> -	asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
> -	asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
> -	asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
> -	asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
> +	asm volatile("movw %0, %%ds" : : "m"(hv_crash_ctxt.ds));
> +	asm volatile("movw %0, %%es" : : "m"(hv_crash_ctxt.es));
> +	asm volatile("movw %0, %%fs" : : "m"(hv_crash_ctxt.fs));
> +	asm volatile("movw %0, %%gs" : : "m"(hv_crash_ctxt.gs));
>   
> -	native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
> -	asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
> +	hv_wrmsr(MSR_IA32_CR_PAT, hv_crash_ctxt.pat);
> +	asm volatile("movq %0, %%cr0" : : "r"(hv_crash_ctxt.cr0));
>   
> -	asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
> -	asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
> -	asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
> +	asm volatile("movq %0, %%cr8" : : "r"(hv_crash_ctxt.cr8));
> +	asm volatile("movq %0, %%cr4" : : "r"(hv_crash_ctxt.cr4));
> +	asm volatile("movq %0, %%cr2" : : "r"(hv_crash_ctxt.cr4));
>   
> -	native_load_idt(&ctxt->idtr);
> -	native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
> -	native_wrmsrq(MSR_EFER, ctxt->efer);
> +	asm volatile("lidt %0" : : "m" (hv_crash_ctxt.idtr));
> +	hv_wrmsr(MSR_GS_BASE, hv_crash_ctxt.gsbase);
> +	hv_wrmsr(MSR_EFER, hv_crash_ctxt.efer);
>   
>   	/* restore the original kernel CS now via far return */
> -	asm volatile("movzwq %0, %%rax\n\t"
> -		     "pushq %%rax\n\t"
> -		     "pushq $1f\n\t"
> -		     "lretq\n\t"
> -		     "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
> -
> -	/* We are in asmlinkage without stack frame, hence make C function
> -	 * calls which will buy stack frames.
> -	 */
> -	hv_crash_restore_tss();
> -	hv_crash_clear_kernpt();
> -
> -	/* we are now fully in devirtualized normal kernel mode */
> -	__crash_kexec(NULL);
> -
> -	hv_panic_timeout_reboot();
> +	asm volatile("pushq %q0\n\t"
> +		     "pushq %q1\n\t"
> +		     "lretq"
> +		     :: "r"(hv_crash_ctxt.cs), "r"(hv_crash_handle));
>   }
>   /* Tell gcc we are using lretq long jump in the above function intentionally */
>   STACK_FRAME_NON_STANDARD(hv_crash_c_entry);


^ permalink raw reply

* Re: [PATCH v2] x86/hyperv: Use __naked attribute to fix stackless C function
From: Andrew Cooper @ 2026-02-27 23:07 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-kernel
  Cc: Andrew Cooper, x86, Mukesh Rathor, Wei Liu, Uros Bizjak,
	linux-hyperv
In-Reply-To: <20260227224030.299993-2-ardb@kernel.org>

On 27/02/2026 10:40 pm, Ard Biesheuvel wrote:
> hv_crash_c_entry() is a C function that is entered without a stack,
> and this is only allowed for functions that have the __naked attribute,
> which informs the compiler that it must not emit the usual prologue and
> epilogue or emit any other kind of instrumentation that relies on a
> stack frame.
>
> So split up the function, and set the __naked attribute on the initial
> part that sets up the stack, GDT, IDT and other pieces that are needed
> for ordinary C execution. Given that function calls are not permitted
> either, use the existing long return coded in an asm() block to call the
> second part of the function, which is an ordinary function that is
> permitted to call other functions as usual.
>
> Cc: Mukesh Rathor <mrathor@linux.microsoft.com>
> Cc: Wei Liu <wei.liu@kernel.org>
> Cc: Uros Bizjak <ubizjak@gmail.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: linux-hyperv@vger.kernel.org
> Fixes: 94212d34618c ("x86/hyperv: Implement hypervisor RAM collection into vmcore")
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> v2: apply some asm tweaks suggested by Uros and Andrew

Looking better.  FWIW, Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com> (asm parts, not hv parts).

Two minor suggestions, probably left to the maintainers' discretion.

> diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
> index 92da1b4f2e73..1c0965eb346e 100644
> --- a/arch/x86/hyperv/hv_crash.c
> +++ b/arch/x86/hyperv/hv_crash.c
> @@ -125,6 +123,25 @@ static noinline void hv_crash_clear_kernpt(void)
>  	native_p4d_clear(p4d);
>  }
>  
> +
> +static void __noreturn hv_crash_handle(void)
> +{
> +	hv_crash_restore_tss();
> +	hv_crash_clear_kernpt();
> +
> +	/* we are now fully in devirtualized normal kernel mode */
> +	__crash_kexec(NULL);
> +
> +	hv_panic_timeout_reboot();
> +}
> +
> +/*
> + * __naked functions do not permit function calls, not even to __always_inline
> + * functions that only contain asm() blocks themselves. So use a macro instead.
> + */
> +#define hv_wrmsr(msr, val) \
> +	asm("wrmsr" :: "c"(msr), "a"((u32)val), "d"((u32)(val >> 32)) : "memory")

How about naming it hv_crash_wrmsr()?

It's important that this wrapper is not reused elsewhere.  Elsewhere
should use the regular MSR accessors.

> +
>  /*
>   * This is the C entry point from the asm glue code after the disable hypercall.
>   * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
> @@ -133,49 +150,35 @@ static noinline void hv_crash_clear_kernpt(void)
>   * available. We restore kernel GDT, and rest of the context, and continue
>   * to kexec.
>   */
> -static asmlinkage void __noreturn hv_crash_c_entry(void)
> +static void __naked hv_crash_c_entry(void)
>  {
> -	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
> -
>  	/* first thing, restore kernel gdt */
> -	native_load_gdt(&ctxt->gdtr);
> +	asm volatile("lgdt %0" : : "m" (hv_crash_ctxt.gdtr));
>  
> -	asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
> -	asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
> +	asm volatile("movw %0, %%ss" : : "m"(hv_crash_ctxt.ss));
> +	asm volatile("movq %0, %%rsp" : : "m"(hv_crash_ctxt.rsp));
>  
> -	asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
> -	asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
> -	asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
> -	asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
> +	asm volatile("movw %0, %%ds" : : "m"(hv_crash_ctxt.ds));
> +	asm volatile("movw %0, %%es" : : "m"(hv_crash_ctxt.es));
> +	asm volatile("movw %0, %%fs" : : "m"(hv_crash_ctxt.fs));
> +	asm volatile("movw %0, %%gs" : : "m"(hv_crash_ctxt.gs));
>  
> -	native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
> -	asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
> +	hv_wrmsr(MSR_IA32_CR_PAT, hv_crash_ctxt.pat);
> +	asm volatile("movq %0, %%cr0" : : "r"(hv_crash_ctxt.cr0));
>  
> -	asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
> -	asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
> -	asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
> +	asm volatile("movq %0, %%cr8" : : "r"(hv_crash_ctxt.cr8));
> +	asm volatile("movq %0, %%cr4" : : "r"(hv_crash_ctxt.cr4));
> +	asm volatile("movq %0, %%cr2" : : "r"(hv_crash_ctxt.cr4));
>  
> -	native_load_idt(&ctxt->idtr);
> -	native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
> -	native_wrmsrq(MSR_EFER, ctxt->efer);
> +	asm volatile("lidt %0" : : "m" (hv_crash_ctxt.idtr));
> +	hv_wrmsr(MSR_GS_BASE, hv_crash_ctxt.gsbase);
> +	hv_wrmsr(MSR_EFER, hv_crash_ctxt.efer);
>  
>  	/* restore the original kernel CS now via far return */
> -	asm volatile("movzwq %0, %%rax\n\t"
> -		     "pushq %%rax\n\t"
> -		     "pushq $1f\n\t"
> -		     "lretq\n\t"
> -		     "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
> -
> -	/* We are in asmlinkage without stack frame, hence make C function
> -	 * calls which will buy stack frames.
> -	 */
> -	hv_crash_restore_tss();
> -	hv_crash_clear_kernpt();
> -
> -	/* we are now fully in devirtualized normal kernel mode */
> -	__crash_kexec(NULL);
> -
> -	hv_panic_timeout_reboot();
> +	asm volatile("pushq %q0\n\t"
> +		     "pushq %q1\n\t"
> +		     "lretq"
> +		     :: "r"(hv_crash_ctxt.cs), "r"(hv_crash_handle));
>  }
>  /* Tell gcc we are using lretq long jump in the above function intentionally */
>  STACK_FRAME_NON_STANDARD(hv_crash_c_entry);

How about fixing the comment to say objtool?  It's not GCC which cares.

~Andrew

^ permalink raw reply

* Re: [PATCH net-next v4] net: mana: Add MAC address to vPort logs and clarify error messages
From: Jakub Kicinski @ 2026-02-28  0:52 UTC (permalink / raw)
  To: Erni Sri Satya Vennela
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, pabeni, dipayanroy, shirazsaleem, ssengar, shradhagupta,
	gargaditya, linux-hyperv, netdev, linux-kernel
In-Reply-To: <aaHrN+spIIaswoX6@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>

On Fri, 27 Feb 2026 11:06:31 -0800 Erni Sri Satya Vennela wrote:
> On Wed, Feb 25, 2026 at 11:22:41AM -0800, Erni Sri Satya Vennela wrote:
> > Add MAC address to vPort configuration success message and update error
> > message to be more specific about HWC message errors in
> > mana_send_request.
> > 
> > Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>  
> 
> Gentle ping — I sent this patch on 25/02/2026 and would appreciate any
> feedback when you have time.  
> Happy to rebase or add more details if needed, thanks for your review.

What are you trying to achieve with this ping? Just look at patchwork,
there are 61 patches ahead of you in the queue.

These are Microsoft review contribution scores:
  Author score negative (-42)
  Company score negative (-1118)
so you expecting that someone in the community will jump onto reviewing
your patches is... odd. How about you review something?

Read the process documentation, and please have some basic
understanding of what is considered good manners when communicating
upstream.

^ permalink raw reply

* [PATCH net-next 0/6] net: mana: Per-vPort EQ and MSI-X interrupt management
From: Long Li @ 2026-02-28  2:11 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel

This series adds per-vPort Event Queue (EQ) allocation and MSI-X interrupt
management for the MANA driver. Previously, all vPorts shared a single set
of EQs. This change enables dedicated EQs per vPort with support for both
dedicated and shared MSI-X vector allocation modes.

Patch 1 moves EQ ownership from mana_context to per-vPort mana_port_context
and exports create/destroy functions for the RDMA driver.

Patch 2 adds device capability queries to determine whether MSI-X vectors
should be dedicated per-vPort or shared. When the number of available MSI-X
vectors is insufficient for dedicated allocation, the driver enables sharing
mode with bitmap-based vector assignment.

Patch 3 introduces the GIC (GDMA IRQ Context) abstraction with reference
counting, allowing multiple EQs to safely share a single MSI-X vector.

Patch 4 converts the global EQ allocation in probe/resume to use the new
GIC functions.

Patch 5 adds per-vPort GIC lifecycle management, calling get/put on each
EQ creation and destruction during vPort open/close.

Patch 6 extends the same GIC lifecycle management to the RDMA driver's EQ
allocation path.

Long Li (6):
  net: mana: Create separate EQs for each vPort
  net: mana: Query device capabilities and configure MSI-X sharing for
    EQs
  net: mana: Introduce GIC context with refcounting for interrupt
    management
  net: mana: Use GIC functions to allocate global EQs
  net: mana: Allocate interrupt context for each EQ when creating vPort
  RDMA/mana_ib: Allocate interrupt contexts on EQs

 drivers/infiniband/hw/mana/main.c             |  47 ++-
 drivers/infiniband/hw/mana/qp.c               |   4 +-
 .../net/ethernet/microsoft/mana/gdma_main.c   | 309 +++++++++++++-----
 drivers/net/ethernet/microsoft/mana/mana_en.c | 164 ++++++----
 include/net/mana/gdma.h                       |  29 +-
 include/net/mana/mana.h                       |   7 +-
 6 files changed, 402 insertions(+), 158 deletions(-)

-- 
2.43.0

^ permalink raw reply

* [PATCH net-next 1/6] net: mana: Create separate EQs for each vPort
From: Long Li @ 2026-02-28  2:11 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260228021144.85054-1-longli@microsoft.com>

To prepare for assigning vPorts to dedicated MSI-X vectors, remove EQ
sharing among the vPorts and create dedicated EQs for each vPort.

Move the EQ definition from struct mana_context to struct mana_port_context
and update related support functions. Export mana_create_eq() and
mana_destroy_eq() for use by the MANA RDMA driver.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/infiniband/hw/mana/main.c             |  14 ++-
 drivers/infiniband/hw/mana/qp.c               |   4 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 111 ++++++++++--------
 include/net/mana/mana.h                       |   7 +-
 4 files changed, 83 insertions(+), 53 deletions(-)

diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index fac159f7128d..cfa954460585 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -20,8 +20,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
 	pd->vport_use_count--;
 	WARN_ON(pd->vport_use_count < 0);
 
-	if (!pd->vport_use_count)
+	if (!pd->vport_use_count) {
+		mana_destroy_eq(mpc);
 		mana_uncfg_vport(mpc);
+	}
 
 	mutex_unlock(&pd->vport_mutex);
 }
@@ -55,15 +57,21 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
 		return err;
 	}
 
-	mutex_unlock(&pd->vport_mutex);
 
 	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
 	pd->tx_vp_offset = mpc->tx_vp_offset;
+	err = mana_create_eq(mpc);
+	if (err) {
+		mana_uncfg_vport(mpc);
+		pd->vport_use_count--;
+	}
+
+	mutex_unlock(&pd->vport_mutex);
 
 	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
 		  mpc->port_handle, pd->pdn, doorbell_id);
 
-	return 0;
+	return err;
 }
 
 int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 48c1f4977f21..d71c301b29c2 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -189,7 +189,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 		cq_spec.gdma_region = cq->queue.gdma_region;
 		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
 		cq_spec.modr_ctx_id = 0;
-		eq = &mpc->ac->eqs[cq->comp_vector];
+		eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];
 		cq_spec.attached_eq = eq->eq->id;
 
 		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
@@ -341,7 +341,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 	cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE;
 	cq_spec.modr_ctx_id = 0;
 	eq_vec = send_cq->comp_vector;
-	eq = &mpc->ac->eqs[eq_vec];
+	eq = &mpc->eqs[eq_vec % mpc->num_queues];
 	cq_spec.attached_eq = eq->eq->id;
 
 	err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec,
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 9b5a72ada5c4..566e45a66adf 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1590,79 +1590,83 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 }
 EXPORT_SYMBOL_NS(mana_destroy_wq_obj, "NET_MANA");
 
-static void mana_destroy_eq(struct mana_context *ac)
+void mana_destroy_eq(struct mana_port_context *apc)
 {
+	struct mana_context *ac = apc->ac;
 	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_queue *eq;
 	int i;
 
-	if (!ac->eqs)
+	if (!apc->eqs)
 		return;
 
-	debugfs_remove_recursive(ac->mana_eqs_debugfs);
-	ac->mana_eqs_debugfs = NULL;
+	debugfs_remove_recursive(apc->mana_eqs_debugfs);
+	apc->mana_eqs_debugfs = NULL;
 
-	for (i = 0; i < gc->max_num_queues; i++) {
-		eq = ac->eqs[i].eq;
+	for (i = 0; i < apc->num_queues; i++) {
+		eq = apc->eqs[i].eq;
 		if (!eq)
 			continue;
 
 		mana_gd_destroy_queue(gc, eq);
 	}
 
-	kfree(ac->eqs);
-	ac->eqs = NULL;
+	kfree(apc->eqs);
+	apc->eqs = NULL;
 }
+EXPORT_SYMBOL_NS(mana_destroy_eq, "NET_MANA");
 
-static void mana_create_eq_debugfs(struct mana_context *ac, int i)
+static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
 {
-	struct mana_eq eq = ac->eqs[i];
+	struct mana_eq eq = apc->eqs[i];
 	char eqnum[32];
 
 	sprintf(eqnum, "eq%d", i);
-	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs);
+	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
 	debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head);
 	debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail);
 	debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops);
 }
 
-static int mana_create_eq(struct mana_context *ac)
+int mana_create_eq(struct mana_port_context *apc)
 {
-	struct gdma_dev *gd = ac->gdma_dev;
+	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_queue_spec spec = {};
 	int err;
 	int i;
 
-	ac->eqs = kcalloc(gc->max_num_queues, sizeof(struct mana_eq),
-			  GFP_KERNEL);
-	if (!ac->eqs)
+	WARN_ON(apc->eqs);
+	apc->eqs = kcalloc(apc->num_queues, sizeof(struct mana_eq),
+			   GFP_KERNEL);
+	if (!apc->eqs)
 		return -ENOMEM;
 
 	spec.type = GDMA_EQ;
 	spec.monitor_avl_buf = false;
 	spec.queue_size = EQ_SIZE;
 	spec.eq.callback = NULL;
-	spec.eq.context = ac->eqs;
+	spec.eq.context = apc->eqs;
 	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
 
-	ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs);
+	apc->mana_eqs_debugfs = debugfs_create_dir("EQs", apc->mana_port_debugfs);
 
-	for (i = 0; i < gc->max_num_queues; i++) {
+	for (i = 0; i < apc->num_queues; i++) {
 		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
-		err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq);
+		err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
 		if (err) {
 			dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
 			goto out;
 		}
-		mana_create_eq_debugfs(ac, i);
+		mana_create_eq_debugfs(apc, i);
 	}
 
 	return 0;
 out:
-	mana_destroy_eq(ac);
+	mana_destroy_eq(apc);
 	return err;
 }
+EXPORT_SYMBOL_NS(mana_create_eq, "NET_MANA");
 
 static int mana_fence_rq(struct mana_port_context *apc, struct mana_rxq *rxq)
 {
@@ -2381,7 +2385,7 @@ static int mana_create_txq(struct mana_port_context *apc,
 		spec.monitor_avl_buf = false;
 		spec.queue_size = cq_size;
 		spec.cq.callback = mana_schedule_napi;
-		spec.cq.parent_eq = ac->eqs[i].eq;
+		spec.cq.parent_eq = apc->eqs[i].eq;
 		spec.cq.context = cq;
 		err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq);
 		if (err)
@@ -2775,13 +2779,12 @@ static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx)
 static int mana_add_rx_queues(struct mana_port_context *apc,
 			      struct net_device *ndev)
 {
-	struct mana_context *ac = apc->ac;
 	struct mana_rxq *rxq;
 	int err = 0;
 	int i;
 
 	for (i = 0; i < apc->num_queues; i++) {
-		rxq = mana_create_rxq(apc, i, &ac->eqs[i], ndev);
+		rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev);
 		if (!rxq) {
 			err = -ENOMEM;
 			netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err);
@@ -2800,9 +2803,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc,
 	return err;
 }
 
-static void mana_destroy_vport(struct mana_port_context *apc)
+static void mana_destroy_rxqs(struct mana_port_context *apc)
 {
-	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct mana_rxq *rxq;
 	u32 rxq_idx;
 
@@ -2814,8 +2816,12 @@ static void mana_destroy_vport(struct mana_port_context *apc)
 		mana_destroy_rxq(apc, rxq, true);
 		apc->rxqs[rxq_idx] = NULL;
 	}
+}
+
+static void mana_destroy_vport(struct mana_port_context *apc)
+{
+	struct gdma_dev *gd = apc->ac->gdma_dev;
 
-	mana_destroy_txq(apc);
 	mana_uncfg_vport(apc);
 
 	if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode)
@@ -2836,11 +2842,7 @@ static int mana_create_vport(struct mana_port_context *apc,
 			return err;
 	}
 
-	err = mana_cfg_vport(apc, gd->pdid, gd->doorbell);
-	if (err)
-		return err;
-
-	return mana_create_txq(apc, net);
+	return mana_cfg_vport(apc, gd->pdid, gd->doorbell);
 }
 
 static int mana_rss_table_alloc(struct mana_port_context *apc)
@@ -3117,21 +3119,36 @@ int mana_alloc_queues(struct net_device *ndev)
 
 	err = mana_create_vport(apc, ndev);
 	if (err) {
-		netdev_err(ndev, "Failed to create vPort %u : %d\n", apc->port_idx, err);
+		netdev_err(ndev, "Failed to create vPort %u : %d\n",
+			   apc->port_idx, err);
 		return err;
 	}
 
+	err = mana_create_eq(apc);
+	if (err) {
+		netdev_err(ndev, "Failed to create EQ on vPort %u: %d\n",
+			   apc->port_idx, err);
+		goto destroy_vport;
+	}
+
+	err = mana_create_txq(apc, ndev);
+	if (err) {
+		netdev_err(ndev, "Failed to create TXQ on vPort %u: %d\n",
+			   apc->port_idx, err);
+		goto destroy_eq;
+	}
+
 	err = netif_set_real_num_tx_queues(ndev, apc->num_queues);
 	if (err) {
 		netdev_err(ndev,
 			   "netif_set_real_num_tx_queues () failed for ndev with num_queues %u : %d\n",
 			   apc->num_queues, err);
-		goto destroy_vport;
+		goto destroy_txq;
 	}
 
 	err = mana_add_rx_queues(apc, ndev);
 	if (err)
-		goto destroy_vport;
+		goto destroy_rxq;
 
 	apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE;
 
@@ -3140,7 +3157,7 @@ int mana_alloc_queues(struct net_device *ndev)
 		netdev_err(ndev,
 			   "netif_set_real_num_rx_queues () failed for ndev with num_queues %u : %d\n",
 			   apc->num_queues, err);
-		goto destroy_vport;
+		goto destroy_rxq;
 	}
 
 	mana_rss_table_init(apc);
@@ -3148,19 +3165,25 @@ int mana_alloc_queues(struct net_device *ndev)
 	err = mana_config_rss(apc, TRI_STATE_TRUE, true, true);
 	if (err) {
 		netdev_err(ndev, "Failed to configure RSS table: %d\n", err);
-		goto destroy_vport;
+		goto destroy_rxq;
 	}
 
 	if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) {
 		err = mana_pf_register_filter(apc);
 		if (err)
-			goto destroy_vport;
+			goto destroy_rxq;
 	}
 
 	mana_chn_setxdp(apc, mana_xdp_get(apc));
 
 	return 0;
 
+destroy_rxq:
+	mana_destroy_rxqs(apc);
+destroy_txq:
+	mana_destroy_txq(apc);
+destroy_eq:
+	mana_destroy_eq(apc);
 destroy_vport:
 	mana_destroy_vport(apc);
 	return err;
@@ -3263,6 +3286,9 @@ static int mana_dealloc_queues(struct net_device *ndev)
 		netdev_err(ndev, "Failed to disable vPort: %d\n", err);
 
 	/* Even in err case, still need to cleanup the vPort */
+	mana_destroy_rxqs(apc);
+	mana_destroy_txq(apc);
+	mana_destroy_eq(apc);
 	mana_destroy_vport(apc);
 
 	return 0;
@@ -3570,12 +3596,6 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 		gd->driver_data = ac;
 	}
 
-	err = mana_create_eq(ac);
-	if (err) {
-		dev_err(dev, "Failed to create EQs: %d\n", err);
-		goto out;
-	}
-
 	err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
 				    MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
 	if (err)
@@ -3714,7 +3734,6 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 		free_netdev(ndev);
 	}
 
-	mana_destroy_eq(ac);
 out:
 	if (ac->per_port_queue_reset_wq) {
 		destroy_workqueue(ac->per_port_queue_reset_wq);
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index a078af283bdd..787e637059df 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -478,8 +478,6 @@ struct mana_context {
 	u8 bm_hostmode;
 
 	struct mana_ethtool_hc_stats hc_stats;
-	struct mana_eq *eqs;
-	struct dentry *mana_eqs_debugfs;
 	struct workqueue_struct *per_port_queue_reset_wq;
 	/* Workqueue for querying hardware stats */
 	struct delayed_work gf_stats_work;
@@ -499,6 +497,9 @@ struct mana_port_context {
 
 	u8 mac_addr[ETH_ALEN];
 
+	struct mana_eq *eqs;
+	struct dentry *mana_eqs_debugfs;
+
 	enum TRI_STATE rss_state;
 
 	mana_handle_t default_rxobj;
@@ -1023,6 +1024,8 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
 		   u32 doorbell_pg_id);
 void mana_uncfg_vport(struct mana_port_context *apc);
+int mana_create_eq(struct mana_port_context *apc);
+void mana_destroy_eq(struct mana_port_context *apc);
 
 struct net_device *mana_get_primary_netdev(struct mana_context *ac,
 					   u32 port_index,
-- 
2.43.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.