public inbox for linux-hyperv@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
@ 2026-02-16 16:24 Jan Kiszka
  2026-02-17  6:42 ` Michael Kelley
                   ` (3 more replies)
  0 siblings, 4 replies; 20+ messages in thread
From: Jan Kiszka @ 2026-02-16 16:24 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86
  Cc: linux-hyperv, linux-kernel, Florian Bezdeka, RT, Mitchell Levy,
	Michael Kelley, Saurabh Singh Sengar, Naman Jain

From: Jan Kiszka <jan.kiszka@siemens.com>

Resolves the following lockdep report when booting PREEMPT_RT on Hyper-V
with related guest support enabled:

[    1.127941] hv_vmbus: registering driver hyperv_drm

[    1.132518] =============================
[    1.132519] [ BUG: Invalid wait context ]
[    1.132521] 6.19.0-rc8+ #9 Not tainted
[    1.132524] -----------------------------
[    1.132525] swapper/0/0 is trying to lock:
[    1.132526] ffff8b9381bb3c90 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0xc4/0x2b0
[    1.132543] other info that might help us debug this:
[    1.132544] context-{2:2}
[    1.132545] 1 lock held by swapper/0/0:
[    1.132547]  #0: ffffffffa010c4c0 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0x31/0x2b0
[    1.132557] stack backtrace:
[    1.132560] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.19.0-rc8+ #9 PREEMPT_{RT,(lazy)}
[    1.132565] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/25/2025
[    1.132567] Call Trace:
[    1.132570]  <IRQ>
[    1.132573]  dump_stack_lvl+0x6e/0xa0
[    1.132581]  __lock_acquire+0xee0/0x21b0
[    1.132592]  lock_acquire+0xd5/0x2d0
[    1.132598]  ? vmbus_chan_sched+0xc4/0x2b0
[    1.132606]  ? lock_acquire+0xd5/0x2d0
[    1.132613]  ? vmbus_chan_sched+0x31/0x2b0
[    1.132619]  rt_spin_lock+0x3f/0x1f0
[    1.132623]  ? vmbus_chan_sched+0xc4/0x2b0
[    1.132629]  ? vmbus_chan_sched+0x31/0x2b0
[    1.132634]  vmbus_chan_sched+0xc4/0x2b0
[    1.132641]  vmbus_isr+0x2c/0x150
[    1.132648]  __sysvec_hyperv_callback+0x5f/0xa0
[    1.132654]  sysvec_hyperv_callback+0x88/0xb0
[    1.132658]  </IRQ>
[    1.132659]  <TASK>
[    1.132660]  asm_sysvec_hyperv_callback+0x1a/0x20

As code paths that handle vmbus IRQs use sleeping locks under PREEMPT_RT,
the vmbus_isr execution needs to be moved into thread context. Open-
coding this allows us to skip the IPI that irq_work would additionally
incur and which we do not need, as vmbus_isr runs in IRQ context, never
in NMI context.

This affects both x86 and arm64, therefore hook into the common driver
logic.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---

Changes in v3:
 - move logic to generic vmbus driver, targeting arm64 as well
 - annotate non-RT path with lockdep_hardirq_threaded
 - only teardown if setup ran

Changes in v2:
 - reorder vmbus_irq_pending clearing to fix a race condition

 drivers/hv/vmbus_drv.c | 66 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 6785ad63a9cb..749a2e68af05 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -25,6 +25,7 @@
 #include <linux/cpu.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/task_stack.h>
+#include <linux/smpboot.h>
 
 #include <linux/delay.h>
 #include <linux/panic_notifier.h>
@@ -1350,7 +1351,7 @@ static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message
 	}
 }
 
-void vmbus_isr(void)
+static void __vmbus_isr(void)
 {
 	struct hv_per_cpu_context *hv_cpu
 		= this_cpu_ptr(hv_context.cpu_context);
@@ -1363,6 +1364,53 @@ void vmbus_isr(void)
 
 	add_interrupt_randomness(vmbus_interrupt);
 }
+
+static DEFINE_PER_CPU(bool, vmbus_irq_pending);
+static DEFINE_PER_CPU(struct task_struct *, vmbus_irqd);
+
+static void vmbus_irqd_wake(void)
+{
+	struct task_struct *tsk = __this_cpu_read(vmbus_irqd);
+
+	__this_cpu_write(vmbus_irq_pending, true);
+	wake_up_process(tsk);
+}
+
+static void vmbus_irqd_setup(unsigned int cpu)
+{
+	sched_set_fifo(current);
+}
+
+static int vmbus_irqd_should_run(unsigned int cpu)
+{
+	return __this_cpu_read(vmbus_irq_pending);
+}
+
+static void run_vmbus_irqd(unsigned int cpu)
+{
+	__this_cpu_write(vmbus_irq_pending, false);
+	__vmbus_isr();
+}
+
+static bool vmbus_irq_initialized;
+
+static struct smp_hotplug_thread vmbus_irq_threads = {
+	.store                  = &vmbus_irqd,
+	.setup			= vmbus_irqd_setup,
+	.thread_should_run      = vmbus_irqd_should_run,
+	.thread_fn              = run_vmbus_irqd,
+	.thread_comm            = "vmbus_irq/%u",
+};
+
+void vmbus_isr(void)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		vmbus_irqd_wake();
+	} else {
+		lockdep_hardirq_threaded();
+		__vmbus_isr();
+	}
+}
 EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
 
 static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
@@ -1462,6 +1510,13 @@ static int vmbus_bus_init(void)
 	 * the VMbus interrupt handler.
 	 */
 
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !vmbus_irq_initialized) {
+		ret = smpboot_register_percpu_thread(&vmbus_irq_threads);
+		if (ret)
+			goto err_kthread;
+		vmbus_irq_initialized = true;
+	}
+
 	if (vmbus_irq == -1) {
 		hv_setup_vmbus_handler(vmbus_isr);
 	} else {
@@ -1507,6 +1562,11 @@ static int vmbus_bus_init(void)
 		free_percpu(vmbus_evt);
 	}
 err_setup:
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) {
+		smpboot_unregister_percpu_thread(&vmbus_irq_threads);
+		vmbus_irq_initialized = false;
+	}
+err_kthread:
 	bus_unregister(&hv_bus);
 	return ret;
 }
@@ -2976,6 +3036,10 @@ static void __exit vmbus_exit(void)
 		free_percpu_irq(vmbus_irq, vmbus_evt);
 		free_percpu(vmbus_evt);
 	}
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) {
+		smpboot_unregister_percpu_thread(&vmbus_irq_threads);
+		vmbus_irq_initialized = false;
+	}
 	for_each_online_cpu(cpu) {
 		struct hv_per_cpu_context *hv_cpu
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
-- 
2.47.3

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-02-16 16:24 [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT Jan Kiszka
@ 2026-02-17  6:42 ` Michael Kelley
  2026-02-17 23:03 ` Bezdeka, Florian
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 20+ messages in thread
From: Michael Kelley @ 2026-02-17  6:42 UTC (permalink / raw)
  To: Jan Kiszka, K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Long Li, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86@kernel.org
  Cc: linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	Florian Bezdeka, RT, Mitchell Levy, Michael Kelley,
	Saurabh Singh Sengar, Naman Jain

From: Jan Kiszka <jan.kiszka@siemens.com> Sent: Monday, February 16, 2026 8:25 AM
> 
> Resolves the following lockdep report when booting PREEMPT_RT on Hyper-V
> with related guest support enabled:
> 
> [    1.127941] hv_vmbus: registering driver hyperv_drm
> 
> [    1.132518] =============================
> [    1.132519] [ BUG: Invalid wait context ]
> [    1.132521] 6.19.0-rc8+ #9 Not tainted
> [    1.132524] -----------------------------
> [    1.132525] swapper/0/0 is trying to lock:
> [    1.132526] ffff8b9381bb3c90 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0xc4/0x2b0
> [    1.132543] other info that might help us debug this:
> [    1.132544] context-{2:2}
> [    1.132545] 1 lock held by swapper/0/0:
> [    1.132547]  #0: ffffffffa010c4c0 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0x31/0x2b0
> [    1.132557] stack backtrace:
> [    1.132560] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.19.0-rc8+ #9 PREEMPT_{RT,(lazy)}
> [    1.132565] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/25/2025
> [    1.132567] Call Trace:
> [    1.132570]  <IRQ>
> [    1.132573]  dump_stack_lvl+0x6e/0xa0
> [    1.132581]  __lock_acquire+0xee0/0x21b0
> [    1.132592]  lock_acquire+0xd5/0x2d0
> [    1.132598]  ? vmbus_chan_sched+0xc4/0x2b0
> [    1.132606]  ? lock_acquire+0xd5/0x2d0
> [    1.132613]  ? vmbus_chan_sched+0x31/0x2b0
> [    1.132619]  rt_spin_lock+0x3f/0x1f0
> [    1.132623]  ? vmbus_chan_sched+0xc4/0x2b0
> [    1.132629]  ? vmbus_chan_sched+0x31/0x2b0
> [    1.132634]  vmbus_chan_sched+0xc4/0x2b0
> [    1.132641]  vmbus_isr+0x2c/0x150
> [    1.132648]  __sysvec_hyperv_callback+0x5f/0xa0
> [    1.132654]  sysvec_hyperv_callback+0x88/0xb0
> [    1.132658]  </IRQ>
> [    1.132659]  <TASK>
> [    1.132660]  asm_sysvec_hyperv_callback+0x1a/0x20
> 
> As code paths that handle vmbus IRQs use sleepy locks under PREEMPT_RT,
> the vmbus_isr execution needs to be moved into thread context. Open-
> coding this allows to skip the IPI that irq_work would additionally
> bring and which we do not need, being an IRQ, never an NMI.
> 
> This affects both x86 and arm64, therefore hook into the common driver
> logic.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>

Tested this patch in combination with the related SCSI driver patch.
Tested three configurations with a recent linux-next kernel, either
20260128 or 20260205.

1) Normal Linux kernel
2) Normal Linux kernel plus CONFIG_PROVE_LOCKING
3) PREEMPT_RT kernel plus CONFIG_PROVE_LOCKING

Tested these three configurations in an x86/x64 VM on a local Hyper-V
and again in an ARM64 VM in the Azure public cloud. With all
combinations, ran the "stress-ng" command provided by Florian
Bezdeka for several minutes. Saw no issues related to these patches.
Presumably the normal kernel with CONFIG_PROVE_LOCKING produced
the lockdep report that Saurabh Sengar saw, and that also appears to be
fixed in this version of the patch due to adding lockdep_hardirq_threaded().

However, I noted one additional locking problem in the ARM64 Azure
VM, which has multiple PCI pass-thru devices -- one Mellanox NIC VF and
two NVMe controllers. The first PCI device to be brought online gets
this lockdep report, though Linux continues to run without problems:

[    8.128629] hv_vmbus: registering driver hv_pci
[    8.132276] hv_pci ad26ad39-fa5e-4d12-9825-fa62e9c88483: PCI VMBus probing: Using version 0x10004
[    8.142956] hv_pci ad26ad39-fa5e-4d12-9825-fa62e9c88483: PCI host bridge to bus fa5e:00
[    8.143231] pci_bus fa5e:00: root bus resource [mem 0xfc0000000-0xfc00fffff window]
[    8.143272] pci_bus fa5e:00: No busn resource found for root bus, will use [bus 00-ff]
[    8.154069] =============================
[    8.156609] [ BUG: Invalid wait context ]
[    8.159209] 6.19.0-rc7rt-next-20260128+ #9 Tainted: G            E
[    8.163582] -----------------------------
[    8.166323] systemd-udevd/575 is trying to lock:
[    8.169163] ffff00011fb62260 (&hbus->device_list_lock){+.+.}-{3:3}, at: get_pcichild_wslot+0x30/0xe0 [pci_hyperv]
[    8.175792] other info that might help us debug this:
[    8.179187] context-{5:5}
[    8.180954] 3 locks held by systemd-udevd/575:
[    8.183048]  #0: ffff000116e50100 (&dev->mutex){....}-{4:4}, at: __device_driver_lock+0x4c/0xb0
[    8.193285]  #1: ffff00011fb62118 (&hbus->state_lock){+.+.}-{4:4}, at: hv_pci_probe+0x32c/0x590 [pci_hyperv]
[    8.199565]  #2: ffffa40f7caa61e0 (pci_lock){....}-{2:2}, at: pci_bus_read_config_dword+0x64/0xf8
[    8.205112] stack backtrace:
[    8.207037] CPU: 0 UID: 0 PID: 575 Comm: systemd-udevd Tainted: G            E       6.19.0-rc7rt-next-20260128+ #9 PREEMPT_RT
[    8.209134] Tainted: [E]=UNSIGNED_MODULE
[    8.219505] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 06/10/2025
[    8.226029] Call trace:
[    8.227433]  show_stack+0x20/0x38 (C)
[    8.229541]  dump_stack_lvl+0x9c/0x158
[    8.231698]  dump_stack+0x18/0x28
[    8.233799]  __lock_acquire+0x488/0x1e20
[    8.236373]  lock_acquire+0x11c/0x388
[    8.238783]  rt_spin_lock+0x54/0x230
[    8.241138]  get_pcichild_wslot+0x30/0xe0 [pci_hyperv]
[    8.244550]  hv_pcifront_read_config+0x3c/0x98 [pci_hyperv]
[    8.248323]  pci_bus_read_config_dword+0x88/0xf8
[    8.250419]  pci_bus_generic_read_dev_vendor_id+0x3c/0x1c0
[    8.252517]  pci_bus_read_dev_vendor_id+0x54/0x80
[    8.263922]  pci_scan_single_device+0x88/0x100
[    8.266903]  pci_scan_slot+0x74/0x1e0
[    8.269208]  pci_scan_child_bus_extend+0x50/0x328
[    8.271978]  pci_scan_root_bus_bridge+0xc4/0xf8
[    8.274705]  hv_pci_probe+0x390/0x590 [pci_hyperv]
[    8.277584]  vmbus_probe+0x4c/0xb0 [hv_vmbus]
[    8.279688]  really_probe+0xd4/0x3d8
[    8.285954]  __driver_probe_device+0x90/0x1a0
[    8.288645]  driver_probe_device+0x44/0x148
[    8.291011]  __driver_attach+0x154/0x290
[    8.293201]  bus_for_each_dev+0x80/0xf0
[    8.295407]  driver_attach+0x2c/0x40
[    8.297478]  bus_add_driver+0x128/0x270
[    8.299607]  driver_register+0x68/0x138
[    8.302179]  __vmbus_driver_register+0x98/0xc0 [hv_vmbus]
[    8.305535]  init_hv_pci_drv+0x198/0xff8 [pci_hyperv]
[    8.308566]  do_one_initcall+0x70/0x400
[    8.310957]  do_init_module+0x60/0x280
[    8.313393]  load_module+0x2308/0x2680
[    8.315535]  init_module_from_file+0xe0/0x110
[    8.318432]  idempotent_init_module+0x194/0x280
[    8.321141]  __arm64_sys_finit_module+0x74/0xf8
[    8.323874]  invoke_syscall+0x6c/0xf8
[    8.326213]  el0_svc_common.constprop.0+0xe0/0xf0
[    8.329068]  do_el0_svc+0x24/0x38
[    8.331070]  el0_svc+0x164/0x3c8
[    8.333137]  el0t_64_sync_handler+0xd0/0xe8
[    8.335599]  el0t_64_sync+0x1b0/0x1b8
[    8.338598] pci fa5e:00:00.0: [1414:b111] type 00 class 0x010802 PCIe Endpoint
[    8.340646] pci fa5e:00:00.0: BAR 0 [mem 0xfc0000000-0xfc00fffff 64bit]
[    8.357759] pci_bus fa5e:00: busn_res: [bus 00-ff] end is updated to 00

The lockdep report would also be seen in an x86/x64 VM in Azure, though I
did not explicitly test that combination. I have not looked at what it would
take to fix this for PREEMPT_RT. But the fix would be a separate patch that
does not affect the validity of this patch.

So for this patch,
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>

> ---
> 
> Changes in v3:
>  - move logic to generic vmbus driver, targeting arm64 as well
>  - annotate non-RT path with lockdep_hardirq_threaded
>  - only teardown if setup ran
> 
> Changes in v2:
>  - reorder vmbus_irq_pending clearing to fix a race condition
> 
>  drivers/hv/vmbus_drv.c | 66 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 65 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 6785ad63a9cb..749a2e68af05 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -25,6 +25,7 @@
>  #include <linux/cpu.h>
>  #include <linux/sched/isolation.h>
>  #include <linux/sched/task_stack.h>
> +#include <linux/smpboot.h>
> 
>  #include <linux/delay.h>
>  #include <linux/panic_notifier.h>
> @@ -1350,7 +1351,7 @@ static void vmbus_message_sched(struct
> hv_per_cpu_context *hv_cpu, void *message
>  	}
>  }
> 
> -void vmbus_isr(void)
> +static void __vmbus_isr(void)
>  {
>  	struct hv_per_cpu_context *hv_cpu
>  		= this_cpu_ptr(hv_context.cpu_context);
> @@ -1363,6 +1364,53 @@ void vmbus_isr(void)
> 
>  	add_interrupt_randomness(vmbus_interrupt);
>  }
> +
> +static DEFINE_PER_CPU(bool, vmbus_irq_pending);
> +static DEFINE_PER_CPU(struct task_struct *, vmbus_irqd);
> +
> +static void vmbus_irqd_wake(void)
> +{
> +	struct task_struct *tsk = __this_cpu_read(vmbus_irqd);
> +
> +	__this_cpu_write(vmbus_irq_pending, true);
> +	wake_up_process(tsk);
> +}
> +
> +static void vmbus_irqd_setup(unsigned int cpu)
> +{
> +	sched_set_fifo(current);
> +}
> +
> +static int vmbus_irqd_should_run(unsigned int cpu)
> +{
> +	return __this_cpu_read(vmbus_irq_pending);
> +}
> +
> +static void run_vmbus_irqd(unsigned int cpu)
> +{
> +	__this_cpu_write(vmbus_irq_pending, false);
> +	__vmbus_isr();
> +}
> +
> +static bool vmbus_irq_initialized;
> +
> +static struct smp_hotplug_thread vmbus_irq_threads = {
> +	.store                  = &vmbus_irqd,
> +	.setup			= vmbus_irqd_setup,
> +	.thread_should_run      = vmbus_irqd_should_run,
> +	.thread_fn              = run_vmbus_irqd,
> +	.thread_comm            = "vmbus_irq/%u",
> +};
> +
> +void vmbus_isr(void)
> +{
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
> +		vmbus_irqd_wake();
> +	} else {
> +		lockdep_hardirq_threaded();
> +		__vmbus_isr();
> +	}
> +}
>  EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
> 
>  static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
> @@ -1462,6 +1510,13 @@ static int vmbus_bus_init(void)
>  	 * the VMbus interrupt handler.
>  	 */
> 
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !vmbus_irq_initialized) {
> +		ret = smpboot_register_percpu_thread(&vmbus_irq_threads);
> +		if (ret)
> +			goto err_kthread;
> +		vmbus_irq_initialized = true;
> +	}
> +
>  	if (vmbus_irq == -1) {
>  		hv_setup_vmbus_handler(vmbus_isr);
>  	} else {
> @@ -1507,6 +1562,11 @@ static int vmbus_bus_init(void)
>  		free_percpu(vmbus_evt);
>  	}
>  err_setup:
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) {
> +		smpboot_unregister_percpu_thread(&vmbus_irq_threads);
> +		vmbus_irq_initialized = false;
> +	}
> +err_kthread:
>  	bus_unregister(&hv_bus);
>  	return ret;
>  }
> @@ -2976,6 +3036,10 @@ static void __exit vmbus_exit(void)
>  		free_percpu_irq(vmbus_irq, vmbus_evt);
>  		free_percpu(vmbus_evt);
>  	}
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) {
> +		smpboot_unregister_percpu_thread(&vmbus_irq_threads);
> +		vmbus_irq_initialized = false;
> +	}
>  	for_each_online_cpu(cpu) {
>  		struct hv_per_cpu_context *hv_cpu
>  			= per_cpu_ptr(hv_context.cpu_context, cpu);
> --
> 2.47.3


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-02-16 16:24 [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT Jan Kiszka
  2026-02-17  6:42 ` Michael Kelley
@ 2026-02-17 23:03 ` Bezdeka, Florian
  2026-02-18  6:48   ` Jan Kiszka
  2026-02-18  7:05 ` Wei Liu
  2026-03-12 17:07 ` Sebastian Andrzej Siewior
  3 siblings, 1 reply; 20+ messages in thread
From: Bezdeka, Florian @ 2026-02-17 23:03 UTC (permalink / raw)
  To: kys@microsoft.com, decui@microsoft.com, bp@alien8.de,
	longli@microsoft.com, dave.hansen@linux.intel.com,
	mingo@redhat.com, wei.liu@kernel.org, tglx@kernel.org,
	Kiszka, Jan, haiyangz@microsoft.com, x86@kernel.org
  Cc: linux-rt-users@vger.kernel.org, namjain@linux.microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	levymitchell0@gmail.com, mhklinux@outlook.com,
	ssengar@linux.microsoft.com

On Mon, 2026-02-16 at 17:24 +0100, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Resolves the following lockdep report when booting PREEMPT_RT on Hyper-V
> with related guest support enabled:
> 
> [    1.127941] hv_vmbus: registering driver hyperv_drm
> 
> [    1.132518] =============================
> [    1.132519] [ BUG: Invalid wait context ]
> [    1.132521] 6.19.0-rc8+ #9 Not tainted
> [    1.132524] -----------------------------
> [    1.132525] swapper/0/0 is trying to lock:
> [    1.132526] ffff8b9381bb3c90 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0xc4/0x2b0
> [    1.132543] other info that might help us debug this:
> [    1.132544] context-{2:2}
> [    1.132545] 1 lock held by swapper/0/0:
> [    1.132547]  #0: ffffffffa010c4c0 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0x31/0x2b0
> [    1.132557] stack backtrace:
> [    1.132560] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.19.0-rc8+ #9 PREEMPT_{RT,(lazy)}
> [    1.132565] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/25/2025
> [    1.132567] Call Trace:
> [    1.132570]  <IRQ>
> [    1.132573]  dump_stack_lvl+0x6e/0xa0
> [    1.132581]  __lock_acquire+0xee0/0x21b0
> [    1.132592]  lock_acquire+0xd5/0x2d0
> [    1.132598]  ? vmbus_chan_sched+0xc4/0x2b0
> [    1.132606]  ? lock_acquire+0xd5/0x2d0
> [    1.132613]  ? vmbus_chan_sched+0x31/0x2b0
> [    1.132619]  rt_spin_lock+0x3f/0x1f0
> [    1.132623]  ? vmbus_chan_sched+0xc4/0x2b0
> [    1.132629]  ? vmbus_chan_sched+0x31/0x2b0
> [    1.132634]  vmbus_chan_sched+0xc4/0x2b0
> [    1.132641]  vmbus_isr+0x2c/0x150
> [    1.132648]  __sysvec_hyperv_callback+0x5f/0xa0
> [    1.132654]  sysvec_hyperv_callback+0x88/0xb0
> [    1.132658]  </IRQ>
> [    1.132659]  <TASK>
> [    1.132660]  asm_sysvec_hyperv_callback+0x1a/0x20
> 
> As code paths that handle vmbus IRQs use sleepy locks under PREEMPT_RT,
> the vmbus_isr execution needs to be moved into thread context. Open-
> coding this allows to skip the IPI that irq_work would additionally
> bring and which we do not need, being an IRQ, never an NMI.
> 
> This affects both x86 and arm64, therefore hook into the common driver
> logic.

I tested this patch in combination with the related SCSI driver patch.
The tests were done on x86 with both VM generations provided by Hyper-V.

Lockdep was enabled and there were no splat reports within 24 hours of
massive load produced by stress-ng.

With that:

Reviewed-by: Florian Bezdeka <florian.bezdeka@siemens.com>
Tested-by: Florian Bezdeka <florian.bezdeka@siemens.com>


Side note: We did some backports down to 6.1 already, just in case
someone is interested. We recognized a massive network performance drop
in 6.1. The root cause has been identified and is not related to this
patch. It's simply another RT regression caused by a missing stable-rt
backport. Upstreaming in progress...

Best regards,
Florian

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-02-17 23:03 ` Bezdeka, Florian
@ 2026-02-18  6:48   ` Jan Kiszka
  0 siblings, 0 replies; 20+ messages in thread
From: Jan Kiszka @ 2026-02-18  6:48 UTC (permalink / raw)
  To: Bezdeka, Florian (FT RPD CED OES-DE), kys@microsoft.com,
	decui@microsoft.com, bp@alien8.de, longli@microsoft.com,
	dave.hansen@linux.intel.com, mingo@redhat.com, wei.liu@kernel.org,
	tglx@kernel.org, haiyangz@microsoft.com, x86@kernel.org
  Cc: linux-rt-users@vger.kernel.org, namjain@linux.microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	levymitchell0@gmail.com, mhklinux@outlook.com,
	ssengar@linux.microsoft.com

On 18.02.26 00:03, Bezdeka, Florian (FT RPD CED OES-DE) wrote:
> On Mon, 2026-02-16 at 17:24 +0100, Jan Kiszka wrote:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> Resolves the following lockdep report when booting PREEMPT_RT on Hyper-V
>> with related guest support enabled:
>>
>> [    1.127941] hv_vmbus: registering driver hyperv_drm
>>
>> [    1.132518] =============================
>> [    1.132519] [ BUG: Invalid wait context ]
>> [    1.132521] 6.19.0-rc8+ #9 Not tainted
>> [    1.132524] -----------------------------
>> [    1.132525] swapper/0/0 is trying to lock:
>> [    1.132526] ffff8b9381bb3c90 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0xc4/0x2b0
>> [    1.132543] other info that might help us debug this:
>> [    1.132544] context-{2:2}
>> [    1.132545] 1 lock held by swapper/0/0:
>> [    1.132547]  #0: ffffffffa010c4c0 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0x31/0x2b0
>> [    1.132557] stack backtrace:
>> [    1.132560] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.19.0-rc8+ #9 PREEMPT_{RT,(lazy)}
>> [    1.132565] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/25/2025
>> [    1.132567] Call Trace:
>> [    1.132570]  <IRQ>
>> [    1.132573]  dump_stack_lvl+0x6e/0xa0
>> [    1.132581]  __lock_acquire+0xee0/0x21b0
>> [    1.132592]  lock_acquire+0xd5/0x2d0
>> [    1.132598]  ? vmbus_chan_sched+0xc4/0x2b0
>> [    1.132606]  ? lock_acquire+0xd5/0x2d0
>> [    1.132613]  ? vmbus_chan_sched+0x31/0x2b0
>> [    1.132619]  rt_spin_lock+0x3f/0x1f0
>> [    1.132623]  ? vmbus_chan_sched+0xc4/0x2b0
>> [    1.132629]  ? vmbus_chan_sched+0x31/0x2b0
>> [    1.132634]  vmbus_chan_sched+0xc4/0x2b0
>> [    1.132641]  vmbus_isr+0x2c/0x150
>> [    1.132648]  __sysvec_hyperv_callback+0x5f/0xa0
>> [    1.132654]  sysvec_hyperv_callback+0x88/0xb0
>> [    1.132658]  </IRQ>
>> [    1.132659]  <TASK>
>> [    1.132660]  asm_sysvec_hyperv_callback+0x1a/0x20
>>
>> As code paths that handle vmbus IRQs use sleepy locks under PREEMPT_RT,
>> the vmbus_isr execution needs to be moved into thread context. Open-
>> coding this allows to skip the IPI that irq_work would additionally
>> bring and which we do not need, being an IRQ, never an NMI.
>>
>> This affects both x86 and arm64, therefore hook into the common driver
>> logic.
> 
> I tested this patch in combination with the related SCSI driver patch.
> The tests were done on x86 with both VM generations provided by Hyper-v.
> 
> Lockdep was enabled and there were no splat reports within 24 hours of
> massive load produced by stress-ng.
> 
> With that:
> 
> Reviewed-by: Florian Bezdeka <florian.bezdeka@siemens.com>
> Tested-by: Florian Bezdeka <florian.bezdeka@siemens.com>
> 
> 
> Side note: We did some backports down to 6.1 already, just in case
> someone is interested. We recognized a massive network performance drop
> in 6.1. The root cause has been identified and is not related to this
> patch. It's simply another RT regression caused by a missing stable-rt
> backport. Upstreaming in progress...
> 

Submitted:
https://lore.kernel.org/stable/05ae6b87-0b53-4948-a1ed-2a3235a5f82b@siemens.com/T/#u

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-02-16 16:24 [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT Jan Kiszka
  2026-02-17  6:42 ` Michael Kelley
  2026-02-17 23:03 ` Bezdeka, Florian
@ 2026-02-18  7:05 ` Wei Liu
  2026-02-18  7:19   ` Saurabh Singh Sengar
  2026-03-12 17:07 ` Sebastian Andrzej Siewior
  3 siblings, 1 reply; 20+ messages in thread
From: Wei Liu @ 2026-02-18  7:05 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	linux-hyperv, linux-kernel, Florian Bezdeka, RT, Mitchell Levy,
	Michael Kelley, Saurabh Singh Sengar, Naman Jain

On Mon, Feb 16, 2026 at 05:24:56PM +0100, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Resolves the following lockdep report when booting PREEMPT_RT on Hyper-V
> with related guest support enabled:
> 
> [    1.127941] hv_vmbus: registering driver hyperv_drm
> 
> [    1.132518] =============================
> [    1.132519] [ BUG: Invalid wait context ]
> [    1.132521] 6.19.0-rc8+ #9 Not tainted
> [    1.132524] -----------------------------
> [    1.132525] swapper/0/0 is trying to lock:
> [    1.132526] ffff8b9381bb3c90 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0xc4/0x2b0
> [    1.132543] other info that might help us debug this:
> [    1.132544] context-{2:2}
> [    1.132545] 1 lock held by swapper/0/0:
> [    1.132547]  #0: ffffffffa010c4c0 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0x31/0x2b0
> [    1.132557] stack backtrace:
> [    1.132560] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.19.0-rc8+ #9 PREEMPT_{RT,(lazy)}
> [    1.132565] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/25/2025
> [    1.132567] Call Trace:
> [    1.132570]  <IRQ>
> [    1.132573]  dump_stack_lvl+0x6e/0xa0
> [    1.132581]  __lock_acquire+0xee0/0x21b0
> [    1.132592]  lock_acquire+0xd5/0x2d0
> [    1.132598]  ? vmbus_chan_sched+0xc4/0x2b0
> [    1.132606]  ? lock_acquire+0xd5/0x2d0
> [    1.132613]  ? vmbus_chan_sched+0x31/0x2b0
> [    1.132619]  rt_spin_lock+0x3f/0x1f0
> [    1.132623]  ? vmbus_chan_sched+0xc4/0x2b0
> [    1.132629]  ? vmbus_chan_sched+0x31/0x2b0
> [    1.132634]  vmbus_chan_sched+0xc4/0x2b0
> [    1.132641]  vmbus_isr+0x2c/0x150
> [    1.132648]  __sysvec_hyperv_callback+0x5f/0xa0
> [    1.132654]  sysvec_hyperv_callback+0x88/0xb0
> [    1.132658]  </IRQ>
> [    1.132659]  <TASK>
> [    1.132660]  asm_sysvec_hyperv_callback+0x1a/0x20
> 
> As code paths that handle vmbus IRQs use sleepy locks under PREEMPT_RT,
> the vmbus_isr execution needs to be moved into thread context. Open-
> coding this allows to skip the IPI that irq_work would additionally
> bring and which we do not need, being an IRQ, never an NMI.
> 
> This affects both x86 and arm64, therefore hook into the common driver
> logic.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>

Applied to hyperv-next. Thanks.

Saurabh and Naman, I want to get this submitted in this merge window. If
you find any more issues with this patch, we can address them in the RC
phase. In the worst case, we can revert this patch later.

Wei

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-02-18  7:05 ` Wei Liu
@ 2026-02-18  7:19   ` Saurabh Singh Sengar
  0 siblings, 0 replies; 20+ messages in thread
From: Saurabh Singh Sengar @ 2026-02-18  7:19 UTC (permalink / raw)
  To: Wei Liu
  Cc: Jan Kiszka, K. Y. Srinivasan, Haiyang Zhang, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	linux-hyperv, linux-kernel, Florian Bezdeka, RT, Mitchell Levy,
	Michael Kelley, Naman Jain

On Wed, Feb 18, 2026 at 07:05:57AM +0000, Wei Liu wrote:
> On Mon, Feb 16, 2026 at 05:24:56PM +0100, Jan Kiszka wrote:
> > From: Jan Kiszka <jan.kiszka@siemens.com>
> > 
> > Resolves the following lockdep report when booting PREEMPT_RT on Hyper-V
> > with related guest support enabled:
> > 
> > [    1.127941] hv_vmbus: registering driver hyperv_drm
> > 
> > [    1.132518] =============================
> > [    1.132519] [ BUG: Invalid wait context ]
> > [    1.132521] 6.19.0-rc8+ #9 Not tainted
> > [    1.132524] -----------------------------
> > [    1.132525] swapper/0/0 is trying to lock:
> > [    1.132526] ffff8b9381bb3c90 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0xc4/0x2b0
> > [    1.132543] other info that might help us debug this:
> > [    1.132544] context-{2:2}
> > [    1.132545] 1 lock held by swapper/0/0:
> > [    1.132547]  #0: ffffffffa010c4c0 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0x31/0x2b0
> > [    1.132557] stack backtrace:
> > [    1.132560] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.19.0-rc8+ #9 PREEMPT_{RT,(lazy)}
> > [    1.132565] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/25/2025
> > [    1.132567] Call Trace:
> > [    1.132570]  <IRQ>
> > [    1.132573]  dump_stack_lvl+0x6e/0xa0
> > [    1.132581]  __lock_acquire+0xee0/0x21b0
> > [    1.132592]  lock_acquire+0xd5/0x2d0
> > [    1.132598]  ? vmbus_chan_sched+0xc4/0x2b0
> > [    1.132606]  ? lock_acquire+0xd5/0x2d0
> > [    1.132613]  ? vmbus_chan_sched+0x31/0x2b0
> > [    1.132619]  rt_spin_lock+0x3f/0x1f0
> > [    1.132623]  ? vmbus_chan_sched+0xc4/0x2b0
> > [    1.132629]  ? vmbus_chan_sched+0x31/0x2b0
> > [    1.132634]  vmbus_chan_sched+0xc4/0x2b0
> > [    1.132641]  vmbus_isr+0x2c/0x150
> > [    1.132648]  __sysvec_hyperv_callback+0x5f/0xa0
> > [    1.132654]  sysvec_hyperv_callback+0x88/0xb0
> > [    1.132658]  </IRQ>
> > [    1.132659]  <TASK>
> > [    1.132660]  asm_sysvec_hyperv_callback+0x1a/0x20
> > 
> > As code paths that handle vmbus IRQs use sleepy locks under PREEMPT_RT,
> > the vmbus_isr execution needs to be moved into thread context. Open-
> > coding this allows to skip the IPI that irq_work would additionally
> > bring and which we do not need, being an IRQ, never an NMI.
> > 
> > This affects both x86 and arm64, therefore hook into the common driver
> > logic.
> > 
> > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Applied to hyperv-next. Thanks.
> 
> Saurabh and Naman, I want to get this submitted in this merge window. If
> you find any more issues with this patch, we can address them in the RC
> phase. In the worst case, we can revert this patch later.
> 
> Wei

I was in the process of completing the final round of testing; however, since
the change has now been merged, it will receive broader coverage, and I will
rely on that.

Overall, the patch looks good to me.

- Saurabh

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-02-16 16:24 [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT Jan Kiszka
                   ` (2 preceding siblings ...)
  2026-02-18  7:05 ` Wei Liu
@ 2026-03-12 17:07 ` Sebastian Andrzej Siewior
  2026-03-17  7:49   ` Jan Kiszka
  2026-03-17 17:25   ` Michael Kelley
  3 siblings, 2 replies; 20+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-12 17:07 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	linux-hyperv, linux-kernel, Florian Bezdeka, RT, Mitchell Levy,
	Michael Kelley, Saurabh Singh Sengar, Naman Jain

On 2026-02-16 17:24:56 [+0100], Jan Kiszka wrote:
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -25,6 +25,7 @@
>  #include <linux/cpu.h>
>  #include <linux/sched/isolation.h>
>  #include <linux/sched/task_stack.h>
> +#include <linux/smpboot.h>
>  
>  #include <linux/delay.h>
>  #include <linux/panic_notifier.h>
> @@ -1350,7 +1351,7 @@ static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message
>  	}
>  }
>  
> -void vmbus_isr(void)
> +static void __vmbus_isr(void)
>  {
>  	struct hv_per_cpu_context *hv_cpu
>  		= this_cpu_ptr(hv_context.cpu_context);
> @@ -1363,6 +1364,53 @@ void vmbus_isr(void)
>  
>  	add_interrupt_randomness(vmbus_interrupt);

This is feeding entropy and would like to see interrupt registers. But
since this is invoked from a thread it won't.

>  }
> +
> +void vmbus_isr(void)
> +{
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
> +		vmbus_irqd_wake();
> +	} else {
> +		lockdep_hardirq_threaded();

What clears this? This is wrongly placed. This should go to
sysvec_hyperv_callback() instead with its matching canceling part. The
add_interrupt_randomness() should also be there and not here.
sysvec_hyperv_stimer0() managed to do so.

Different question: What guarantees that there won't be another
interrupt before this one is done? The handshake appears to be
deprecated. The interrupt itself returns ACKing (or not) but the actual
handler is delayed to this thread. Depending on the userland it could
take some time and I don't know how impatient the host is.

> +		__vmbus_isr();
Moving on. This (trying very hard here) even schedules tasklets. Why?
You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
You don't want that.

Couldn't the whole logic be integrated into the IRQ code? Then we could
have mask/ unmask if supported/ provided and threaded interrupts. Then
sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
instead of apic_eoi() + schedule_delayed_work(). 

> +	}
> +}
>  EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
>  
>  static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)

Sebastian

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-12 17:07 ` Sebastian Andrzej Siewior
@ 2026-03-17  7:49   ` Jan Kiszka
  2026-03-17 11:01     ` Sebastian Andrzej Siewior
  2026-03-17 17:25   ` Michael Kelley
  1 sibling, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2026-03-17  7:49 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	linux-hyperv, linux-kernel, Florian Bezdeka, RT, Mitchell Levy,
	Michael Kelley, Saurabh Singh Sengar, Naman Jain

On 12.03.26 18:07, Sebastian Andrzej Siewior wrote:
> On 2026-02-16 17:24:56 [+0100], Jan Kiszka wrote:
>> --- a/drivers/hv/vmbus_drv.c
>> +++ b/drivers/hv/vmbus_drv.c
>> @@ -25,6 +25,7 @@
>>  #include <linux/cpu.h>
>>  #include <linux/sched/isolation.h>
>>  #include <linux/sched/task_stack.h>
>> +#include <linux/smpboot.h>
>>  
>>  #include <linux/delay.h>
>>  #include <linux/panic_notifier.h>
>> @@ -1350,7 +1351,7 @@ static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message
>>  	}
>>  }
>>  
>> -void vmbus_isr(void)
>> +static void __vmbus_isr(void)
>>  {
>>  	struct hv_per_cpu_context *hv_cpu
>>  		= this_cpu_ptr(hv_context.cpu_context);
>> @@ -1363,6 +1364,53 @@ void vmbus_isr(void)
>>  
>>  	add_interrupt_randomness(vmbus_interrupt);
> 
> This is feeding entropy and would like to see interrupt registers. But
> since this is invoked from a thread it won't.
> 

Good point, will move this to vmbus_isr.

>>  }
>> +
> …
>> +void vmbus_isr(void)
>> +{
>> +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
>> +		vmbus_irqd_wake();
>> +	} else {
>> +		lockdep_hardirq_threaded();
> 
> What clears this? This is wrongly placed. This should go to
> sysvec_hyperv_callback() instead with its matching canceling part. The
> add_interrupt_randomness() should also be there and not here.
> sysvec_hyperv_stimer0() managed to do so.

First of all, we need to keep all this in generic code to avoid missing
arm64.

But the question about lockdep_hardirq_threaded() is valid - and that
not only for this new code: I tried hard to understand from the code how
hardirq_threaded is managed, but I simply couldn't find the spot where
it is reset after lockdep_hardirq_threaded() but before returning from
the interrupt to the task that now has hardirq_threaded=1. I failed, and
so I started a debugger. That confirms for the existing code path
(__handle_irq_event_percpu) that we are indeed returning to the
interrupted task with hardirq_threaded set. I'm not sure if that is
intended that only the next irq_enter_rcu->lockdep_hardirq_enter of the
next IRQ over this same task will reset the flag again.

With that in mind, the new logic here is no different from the one the
kernel used before. If both are not doing what they should, we likely
want to add a generic reset of hardirq_threaded to the IRQ exit path(s).

> 
> Different question: What guarantees that there won't be another
> interrupt before this one is done? The handshake appears to be
> deprecated. The interrupt itself returns ACKing (or not) but the actual
> handler is delayed to this thread. Depending on the userland it could
> take some time and I don't know how impatient the host is.
> 

Good question. I guess people familiar with the hv interface need to
comment on that.

>> +		__vmbus_isr();
> Moving on. This (trying very hard here) even schedules tasklets. Why?
> You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
> You don't want that.
> 

You are referring to the pre-existing logic now, aren't you?

> Couldn't the whole logic be integrated into the IRQ code? Then we could
> have mask/ unmask if supported/ provided and threaded interrupts. Then
> sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
> instead apic_eoi() + schedule_delayed_work(). 
> 

Again, you are thinking x86-only. We need a portable solution.

>> +	}
>> +}
>>  EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
>>  
>>  static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
> 
> Sebastian

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-17  7:49   ` Jan Kiszka
@ 2026-03-17 11:01     ` Sebastian Andrzej Siewior
  2026-03-17 11:55       ` Jan Kiszka
  0 siblings, 1 reply; 20+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-17 11:01 UTC (permalink / raw)
  To: Jan Kiszka, Peter Zijlstra
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	linux-hyperv, linux-kernel, Florian Bezdeka, RT, Mitchell Levy,
	Michael Kelley, Saurabh Singh Sengar, Naman Jain

On 2026-03-17 08:49:38 [+0100], Jan Kiszka wrote:
> >> +void vmbus_isr(void)
> >> +{
> >> +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
> >> +		vmbus_irqd_wake();
> >> +	} else {
> >> +		lockdep_hardirq_threaded();
> > 
> > What clears this? This is wrongly placed. This should go to
> > sysvec_hyperv_callback() instead with its matching canceling part. The
> > add_interrupt_randomness() should also be there and not here.
> > sysvec_hyperv_stimer0() managed to do so.
> 
> First of all, we need to keep all this in generic code to avoid missing
> arm64.

This kind of belongs to the IRQ core code so I would prefer to see it on
IRQ entry, not in a random driver.

> But the question about lockdep_hardirq_threaded() is valid - and that
> not only for this new code: I tried hard to understand from the code how
> hardirq_threaded is managed, but I simply couldn't find the spot where
> it is reset after lockdep_hardirq_threaded() but before returning from
> the interrupt to the task that now has hardirq_threaded=1. I failed, and
> so I started a debugger. That confirms for the existing code path
> (__handle_irq_event_percpu) that we are indeed returning to the
> interrupted task with hardirq_threaded set. I'm not sure if that is
> intended that only the next irq_enter_rcu->lockdep_hardirq_enter of the
> next IRQ over this same task will reset the flag again.

While looking into it again, it assumes that you enter an IRQ and due to
the implementation if one is threaded, all of them are. So if you switch
from IRQ handling to TIMER then this does not happen "as-is" but exit
from one and then enter another, at which point it is set to zero again.

> With that in mind, the new logic here is no different from the one the
> kernel used before. If both are not doing what they should, we likely
> want to add a generic reset of hardirq_threaded to the IRQ exit path(s).

The difference is that you expect that _everyone_ calling this driver
has everything else threaded. This might not be the case. That is why
this should either be in the core, which knows whether what is called is
threaded, be used in the driver with explicit clearing of that flag afterwards
since you don't know what can follow, or add a generic threaded infrastructure here. 

A different option, which I would prefer in the driver, would be an
explicit lockdep override for the locking class without using
lockdep_hardirq_threaded()

> > Different question: What guarantees that there won't be another
> > interrupt before this one is done? The handshake appears to be
> > deprecated. The interrupt itself returns ACKing (or not) but the actual
> > handler is delayed to this thread. Depending on the userland it could
> > take some time and I don't know how impatient the host is.
> > 
> 
> Good question. I guess people familiar with the hv interface need to
> comment on that.
> 
> >> +		__vmbus_isr();
> > Moving on. This (trying very hard here) even schedules tasklets. Why?
> > You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
> > You don't want that.
> > 
> 
> You are referring to the re-existing logic now, aren't you?

Yes.

> > Couldn't the whole logic be integrated into the IRQ code? Then we could
> > have mask/ unmask if supported/ provided and threaded interrupts. Then
> > sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
> > instead apic_eoi() + schedule_delayed_work(). 
> > 
> 
> Again, you are thinking x86-only. We need a portable solution.

well, ARM could use a threaded interrupt, too.

> >> +	}
> >> +}
> >>  EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
> >>  
> >>  static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
> > 
> > Sebastian
> 
> Jan

Sebastian

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-17 11:01     ` Sebastian Andrzej Siewior
@ 2026-03-17 11:55       ` Jan Kiszka
  2026-03-18  9:08         ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2026-03-17 11:55 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Peter Zijlstra
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	linux-hyperv, linux-kernel, Florian Bezdeka, RT, Mitchell Levy,
	Michael Kelley, Saurabh Singh Sengar, Naman Jain

On 17.03.26 12:01, Sebastian Andrzej Siewior wrote:
> On 2026-03-17 08:49:38 [+0100], Jan Kiszka wrote:
>>>> +void vmbus_isr(void)
>>>> +{
>>>> +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
>>>> +		vmbus_irqd_wake();
>>>> +	} else {
>>>> +		lockdep_hardirq_threaded();
>>>
>>> What clears this? This is wrongly placed. This should go to
>>> sysvec_hyperv_callback() instead with its matching canceling part. The
>>> add_interrupt_randomness() should also be there and not here.
>>> sysvec_hyperv_stimer0() managed to do so.
>>
>> First of all, we need to keep all this in generic code to avoid missing
>> arm64.
> 
> This kind of belongs to the IRQ core code so I would prefer to see it on
> IRQ entry, not in a random driver.

I have no idea why hv is so special, starting with having its own
vectors. But if you have an idea how to address those needs via core
APIs or to create new ones for it, I guess that is welcome.

> 
>> But the question about lockdep_hardirq_threaded() is valid - and that
>> not only for this new code: I tried hard to understand from the code how
>> hardirq_threaded is managed, but I simply couldn't find the spot where
>> it is reset after lockdep_hardirq_threaded() but before returning from
>> the interrupt to the task that now has hardirq_threaded=1. I failed, and
>> so I started a debugger. That confirms for the existing code path
>> (__handle_irq_event_percpu) that we are indeed returning to the
>> interrupted task with hardirq_threaded set. I'm not sure if that is
>> intended that only the next irq_enter_rcu->lockdep_hardirq_enter of the
>> next IRQ over this same task will reset the flag again.
> 
> While looking into it again, it assumes that you enter an IRQ and due to
> the implementation if one is threaded, all of them are. So if you switch
> from IRQ handling to TIMER then this does not happen "as-is" but exit
> from one and then entry another at which point it is set to zero again.

Point is that a task that was interrupted by a potentially threaded
interrupt keeps this flag longer than it needs it. And that is
apparently harmless, but fairly confusing.

> 
>> With that in mind, the new logic here is no different from the one the
>> kernel used before. If both are not doing what they should, we likely
>> want to add a generic reset of hardirq_threaded to the IRQ exit path(s).
> 
> The difference is that you expect that _everyone_ calling this driver
> has everything else threaded. This might not be the case. That is why
> this should be in core knowing what is called if threaded, use in driver
> after explicit killing that flag afterwards since you don't know what
> can follow or add a generic threaded infrastructure here. 

This driver is different, unfortunately. I'm not sure if we can / want
to thread everything that the platform interrupt does on x86. So far,
only the last part of it - vmbus handling - is threaded. On arm64, the
irq is exclusive (see vmbus_percpu_isr), thus everything can be and is
threaded.

> 
> A different option which I would prefer in the drivere, would be an
> explicit lockdep override for the locking class without using
> lockdep_hardirq_threaded()

Happy to learn how to do that.

> 
>>> Different question: What guarantees that there won't be another
>>> interrupt before this one is done? The handshake appears to be
>>> deprecated. The interrupt itself returns ACKing (or not) but the actual
>>> handler is delayed to this thread. Depending on the userland it could
>>> take some time and I don't know how impatient the host is.
>>>
>>
>> Good question. I guess people familiar with the hv interface need to
>> comment on that.
>>
>>>> +		__vmbus_isr();
>>> Moving on. This (trying very hard here) even schedules tasklets. Why?
>>> You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
>>> You don't want that.
>>>
>>
>> You are referring to the re-existing logic now, aren't you?
> 
> Yes.
> 

Then someone else needs to answer this.

>>> Couldn't the whole logic be integrated into the IRQ code? Then we could
>>> have mask/ unmask if supported/ provided and threaded interrupts. Then
>>> sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
>>> instead apic_eoi() + schedule_delayed_work(). 
>>>
>>
>> Again, you are thinking x86-only. We need a portable solution.
> 
> well, ARM could use a threaded interrupt, too.

For a reason we didn't explore in details, per-CPU interrupts aren't
threaded. See older version of this patch
(https://lore.kernel.org/lkml/005a01dc9d30$a40515e0$ec0f41a0$@zohomail.com/)
where I thought I only had to fix x86, but arm64 was needing care as well.

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-12 17:07 ` Sebastian Andrzej Siewior
  2026-03-17  7:49   ` Jan Kiszka
@ 2026-03-17 17:25   ` Michael Kelley
  2026-03-18  5:52     ` Jan Kiszka
  2026-03-18 10:01     ` Sebastian Andrzej Siewior
  1 sibling, 2 replies; 20+ messages in thread
From: Michael Kelley @ 2026-03-17 17:25 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Jan Kiszka
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	x86@kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, Florian Bezdeka, RT, Mitchell Levy,
	Michael Kelley, Saurabh Singh Sengar, Naman Jain

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Sent: Thursday, March 12, 2026 10:07 AM
>

Let me try to address the range of questions here and in the follow-up
discussion. As background, an overview of VMBus interrupt handling is in:

Documentation/virt/hyperv/vmbus.rst

in the section entitled "Synthetic Interrupt Controller (synic)". The
relevant text is:

   The SINT is mapped to a single per-CPU architectural interrupt (i.e,
   an 8-bit x86/x64 interrupt vector, or an arm64 PPI INTID). Because
   each CPU in the guest has a synic and may receive VMBus interrupts,
   they are best modeled in Linux as per-CPU interrupts. This model works
   well on arm64 where a single per-CPU Linux IRQ is allocated for
   VMBUS_MESSAGE_SINT. This IRQ appears in /proc/interrupts as an IRQ labelled
   "Hyper-V VMbus". Since x86/x64 lacks support for per-CPU IRQs, an x86
   interrupt vector is statically allocated (HYPERVISOR_CALLBACK_VECTOR)
   across all CPUs and explicitly coded to call vmbus_isr(). In this case,
   there's no Linux IRQ, and the interrupts are visible in aggregate in
   /proc/interrupts on the "HYP" line.

The use of a statically allocated sysvec pre-dates my involvement in this
code starting in 2017, but I believe it was modelled after what Xen does,
and for the same reason -- to effectively create a per-CPU interrupt on
x86/x64. ACRN is also using HYPERVISOR_CALLBACK_VECTOR, but I
don't know if that is also to create a per-CPU interrupt.

More below ....

> On 2026-02-16 17:24:56 [+0100], Jan Kiszka wrote:
> > --- a/drivers/hv/vmbus_drv.c
> > +++ b/drivers/hv/vmbus_drv.c
> > @@ -25,6 +25,7 @@
> >  #include <linux/cpu.h>
> >  #include <linux/sched/isolation.h>
> >  #include <linux/sched/task_stack.h>
> > +#include <linux/smpboot.h>
> >
> >  #include <linux/delay.h>
> >  #include <linux/panic_notifier.h>
> > @@ -1350,7 +1351,7 @@ static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message
> >  	}
> >  }
> >
> > -void vmbus_isr(void)
> > +static void __vmbus_isr(void)
> >  {
> >  	struct hv_per_cpu_context *hv_cpu
> >  		= this_cpu_ptr(hv_context.cpu_context);
> > @@ -1363,6 +1364,53 @@ void vmbus_isr(void)
> >
> >  	add_interrupt_randomness(vmbus_interrupt);
> 
> This is feeding entropy and would like to see interrupt registers. But
> since this is invoked from a thread it won't.

I'll respond to this topic on the new thread for the new patch
where Jan has moved the call to add_interrupt_randomness().

> 
> >  }
> > +
> …
> > +void vmbus_isr(void)
> > +{
> > +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
> > +		vmbus_irqd_wake();
> > +	} else {
> > +		lockdep_hardirq_threaded();
> 
> What clears this? This is wrongly placed. This should go to
> sysvec_hyperv_callback() instead with its matching canceling part. The
> add_interrupt_randomness() should also be there and not here.
> sysvec_hyperv_stimer0() managed to do so.

I don't have any knowledge to bring regarding the use of
lockdep_hardirq_threaded().

> 
> Different question: What guarantees that there won't be another
> interrupt before this one is done? The handshake appears to be
> deprecated. The interrupt itself returns ACKing (or not) but the actual
> handler is delayed to this thread. Depending on the userland it could
> take some time and I don't know how impatient the host is.

In more recent versions of Hyper-V, what's deprecated is Hyper-V implicitly
and automatically doing the EOI. So in sysvec_hyperv_callback(), apic_eoi()
is usually explicitly called to ack the interrupt.

There's no guarantee, in either the existing case or the new PREEMPT_RT
case, that another VMBus interrupt won't come in on the same CPU
before the tasklets scheduled by vmbus_message_sched() or
vmbus_chan_sched() have run. From a functional standpoint, the Linux
code and interaction with Hyper-V handles another interrupt correctly.

From a delay standpoint, there's not a problem for the normal (i.e., not
PREEMPT_RT) case because the tasklets run as the interrupt exits -- they
don't end up in ksoftirqd. For the PREEMPT_RT case, I can see your point
about delays since the tasklets are scheduled from the new per-CPU thread.
But my understanding is that Jan's motivation for these changes is not to
achieve true RT behavior, since Hyper-V doesn't provide that anyway.
The goal is simply to make PREEMPT_RT builds functional, though Jan may
have further comments on the goal.

> 
> > +		__vmbus_isr();
> Moving on. This (trying very hard here) even schedules tasklets. Why?
> You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
> You don't want that.

Again, Jan can comment on the impact of delays due to ending up
in ksoftirqd.

> 
> Couldn't the whole logic be integrated into the IRQ code? Then we could
> have mask/ unmask if supported/ provided and threaded interrupts. Then
> sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
> instead apic_eoi() + schedule_delayed_work().

As I described above, Hyper-V needs a per-CPU interrupt. It's faked up
on x86/x64 with the hardcoded HYPERVISOR_CALLBACK_VECTOR sysvec
entry, but on arm64 a normal Linux per-CPU IRQ is used. Once the execution
path gets to vmbus_isr(), the two architectures share the same code. Same
thing is done with the Hyper-V STIMER0 interrupt as a per-CPU interrupt.
If there's a better way to fake up a per-CPU interrupt on x86/x64, I'm open
to looking at it.

As I recently discovered in discussion with Jan, standard Linux IRQ handling
will *not* thread per-CPU interrupts. So even on arm64, where a standard
Linux per-CPU IRQ is used for VMBus and STIMER0 interrupts, we can't
request threading.

I need to refresh my memory on sysvec_hyperv_reenlightenment(). If
I recall correctly, it's not a per-CPU interrupt, so it probably doesn't
need to have a hardcoded vector. Overall, the Hyper-V reenlightenment
functionality is a bit of a fossil that isn't needed on modern x86/x64
processors that support TSC scaling. And it doesn't exist for arm64.
It might be worth seeing if it could be dropped entirely ...

Michael

> 
> > +	}
> > +}
> >  EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
> >
> >  static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
> 
> Sebastian


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-17 17:25   ` Michael Kelley
@ 2026-03-18  5:52     ` Jan Kiszka
  2026-03-18 10:01     ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 20+ messages in thread
From: Jan Kiszka @ 2026-03-18  5:52 UTC (permalink / raw)
  To: Michael Kelley, Sebastian Andrzej Siewior
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	x86@kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, Florian Bezdeka, RT, Mitchell Levy,
	Saurabh Singh Sengar, Naman Jain

On 17.03.26 18:25, Michael Kelley wrote:
> From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Sent: Thursday, March 12, 2026 10:07 AM
>>
> 
> Let me try to address the range of questions here and in the follow-up
> discussion. As background, an overview of VMBus interrupt handling is in:
> 
> Documentation/virt/hyperv/vmbus.rst
> 
> in the section entitled "Synthetic Interrupt Controller (synic)". The
> relevant text is:
> 
>    The SINT is mapped to a single per-CPU architectural interrupt (i.e,
>    an 8-bit x86/x64 interrupt vector, or an arm64 PPI INTID). Because
>    each CPU in the guest has a synic and may receive VMBus interrupts,
>    they are best modeled in Linux as per-CPU interrupts. This model works
>    well on arm64 where a single per-CPU Linux IRQ is allocated for
>    VMBUS_MESSAGE_SINT. This IRQ appears in /proc/interrupts as an IRQ labelled
>    "Hyper-V VMbus". Since x86/x64 lacks support for per-CPU IRQs, an x86
>    interrupt vector is statically allocated (HYPERVISOR_CALLBACK_VECTOR)
>    across all CPUs and explicitly coded to call vmbus_isr(). In this case,
>    there's no Linux IRQ, and the interrupts are visible in aggregate in
>    /proc/interrupts on the "HYP" line.
> 
> The use of a statically allocated sysvec pre-dates my involvement in this
> code starting in 2017, but I believe it was modelled after what Xen does,
> and for the same reason -- to effectively create a per-CPU interrupt on
> x86/x64. Acorn is also using HYPERVISOR_CALLBACK_VECTOR, but I
> don't know if that is also to create a per-CPU interrupt.

Long ago, we demonstrated via Jailhouse that you do not necessarily gain
complexity on the hypervisor side by providing a minimal PCI host and
attaching all your virtual devices to that instead. Even longer ago in
the absence of proper IRQ controller virtualization on the various
archs, there was a bit of performance to gain doing "special"
interrupts. All these design decisions made sense at a certain time but
you would likely no longer repeat them today.

> 
> More below ....
> 
>> On 2026-02-16 17:24:56 [+0100], Jan Kiszka wrote:
>>> --- a/drivers/hv/vmbus_drv.c
>>> +++ b/drivers/hv/vmbus_drv.c
>>> @@ -25,6 +25,7 @@
>>>  #include <linux/cpu.h>
>>>  #include <linux/sched/isolation.h>
>>>  #include <linux/sched/task_stack.h>
>>> +#include <linux/smpboot.h>
>>>
>>>  #include <linux/delay.h>
>>>  #include <linux/panic_notifier.h>
>>> @@ -1350,7 +1351,7 @@ static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message
>>>  	}
>>>  }
>>>
>>> -void vmbus_isr(void)
>>> +static void __vmbus_isr(void)
>>>  {
>>>  	struct hv_per_cpu_context *hv_cpu
>>>  		= this_cpu_ptr(hv_context.cpu_context);
>>> @@ -1363,6 +1364,53 @@ void vmbus_isr(void)
>>>
>>>  	add_interrupt_randomness(vmbus_interrupt);
>>
>> This is feeding entropy and would like to see interrupt registers. But
>> since this is invoked from a thread it won't.
> 
> I'll respond to this topic on the new thread for the new patch
> where Jan has moved the call to add_interrupt_randomness().
> 
>>
>>>  }
>>> +
>> …
>>> +void vmbus_isr(void)
>>> +{
>>> +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
>>> +		vmbus_irqd_wake();
>>> +	} else {
>>> +		lockdep_hardirq_threaded();
>>
>> What clears this? This is wrongly placed. This should go to
>> sysvec_hyperv_callback() instead with its matching canceling part. The
>> add_interrupt_randomness() should also be there and not here.
>> sysvec_hyperv_stimer0() managed to do so.
> 
> I don't have any knowledge to bring regarding the use of
> lockdep_hardirq_threaded().
> 
>>
>> Different question: What guarantees that there won't be another
>> interrupt before this one is done? The handshake appears to be
>> deprecated. The interrupt itself returns ACKing (or not) but the actual
>> handler is delayed to this thread. Depending on the userland it could
>> take some time and I don't know how impatient the host is.
> 
> In more recent versions of Hyper-V, what's deprecated is Hyper-V implicitly
> and automatically doing the EOI. So in sysvec_hyperv_callback(), apic_eoi()
> is usually explicitly called to ack the interrupt.
> 
> There's no guarantee, in either the existing case or the new PREEMPT_RT
> case, that another VMBus interrupt won't come in on the same CPU
> before the tasklets scheduled by vmbus_message_sched() or
> vmbus_chan_sched() have run. From a functional standpoint, the Linux
> code and interaction with Hyper-V handles another interrupt correctly.
> 
> From a delay standpoint, there's not a problem for the normal (i.e., not
> PREEMPT_RT) case because the tasklets run as the interrupt exits -- they
> don't end up in ksoftirqd. For the PREEMPT_RT case, I can see your point
> about delays since the tasklets are scheduled from the new per-CPU thread.
> But my understanding is that Jan's motivation for these changes is not to
> achieve true RT behavior, since Hyper-V doesn't provide that anyway.
> The goal is simply to make PREEMPT_RT builds functional, though Jan may
> have further comments on the goal.
> 

That is exactly the goal: A Linux guest happening to use a PREEMPT_RT
kernel should correctly run on Hyper-V, and that without losing relevant
performance. However, we do not expect any deterministic timing behavior
from such a setup.

>>
>>> +		__vmbus_isr();
>> Moving on. This (trying very hard here) even schedules tasklets. Why?
>> You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
>> You don't want that.
> 
> Again, Jan can comment on the impact of delays due to ending up
> in ksoftirqd.
> 
>>
>> Couldn't the whole logic be integrated into the IRQ code? Then we could
>> have mask/ unmask if supported/ provided and threaded interrupts. Then
>> sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
>> instead apic_eoi() + schedule_delayed_work().
> 
> As I described above, Hyper-V needs a per-CPU interrupt. It's faked up
> on x86/x64 with the hardcoded HYPERVISOR_CALLBACK_VECTOR sysvec
> entry, but on arm64 a normal Linux per-CPU IRQ is used. Once the execution
> path gets to vmbus_isr(), the two architectures share the same code. Same
> thing is done with the Hyper-V STIMER0 interrupt as a per-CPU interrupt.
> If there's a better way to fake up a per-CPU interrupt on x86/x64, I'm open
> to looking at it.
> 
> As I recently discovered in discussion with Jan, standard Linux IRQ handling
> will *not* thread per-CPU interrupts. So even on arm64 with a standard
> Linux per-CPU IRQ is used for VMBus and STIMER0 interrupts, we can't
> request threading.
> 
> I need to refresh my memory on sysvec_hyperv_reenlightenment(). If
> I recall correctly, it's not a per-CPU interrupt, so it probably doesn't
> need to have a hardcoded vector. Overall, the Hyper-V reenlightenment
> functionality is a bit of a fossil that isn't needed on modern x86/x64
> processors that support TSC scaling. And it doesn't exist for arm64.
> It might be worth seeing if it could be dropped entirely ...
> 

I suppose that all depends on how long Linux needs to support the
underlying hypervisor versions and interfaces, no? It's a bit like
supporting old hardware...

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-17 11:55       ` Jan Kiszka
@ 2026-03-18  9:08         ` Sebastian Andrzej Siewior
  2026-03-18 11:02           ` Jan Kiszka
  0 siblings, 1 reply; 20+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-18  9:08 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Peter Zijlstra, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, linux-hyperv, linux-kernel,
	Florian Bezdeka, RT, Mitchell Levy, Michael Kelley,
	Saurabh Singh Sengar, Naman Jain

On 2026-03-17 12:55:15 [+0100], Jan Kiszka wrote:
> Point is that a task that was interrupted by a potentially threaded
> interrupt keeps this flag longer than it needs it. And that is
> apparently harmless, but fairly confusing.

correct. My only concern would be a shared handler where the second is
not threaded.

> >> With that in mind, the new logic here is no different from the one the
> >> kernel used before. If both are not doing what they should, we likely
> >> want to add a generic reset of hardirq_threaded to the IRQ exit path(s).
> > 
> > The difference is that you expect that _everyone_ calling this driver
> > has everything else threaded. This might not be the case. That is why
> > this should be in core knowing what is called if threaded, use in driver
> > after explicit killing that flag afterwards since you don't know what
> > can follow or add a generic threaded infrastructure here. 
> 
> This driver is different, unfortunately. I'm not sure if we can / want
> to thread everything that the platform interrupt does on x86. So far,
> only the last part of it - vmbus handling - is threaded. On arm64, the
> irq is exclusive (see vmbus_percpu_isr), thus everything can be and is
> threaded.

No, it is a percpu interrupt which are not forced-threaded.

> >>> Couldn't the whole logic be integrated into the IRQ code? Then we could
> >>> have mask/ unmask if supported/ provided and threaded interrupts. Then
> >>> sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
> >>> instead apic_eoi() + schedule_delayed_work(). 
> >>>
> >>
> >> Again, you are thinking x86-only. We need a portable solution.
> > 
> > well, ARM could use a threaded interrupt, too.
> 
> For a reason we didn't explore in details, per-CPU interrupts aren't
> threaded. See older version of this patch
> (https://lore.kernel.org/lkml/005a01dc9d30$a40515e0$ec0f41a0$@zohomail.com/)
>> where I thought I only had to fix x86, but arm64 needed care as well.

Per-CPU are usually timers or other things which are not threaded and
have their own thing for the "second" port and I only remember MCE using
a workqueue for notification.

> Jan

Sebastian

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: RE: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-17 17:25   ` Michael Kelley
  2026-03-18  5:52     ` Jan Kiszka
@ 2026-03-18 10:01     ` Sebastian Andrzej Siewior
  2026-03-18 11:03       ` Jan Kiszka
  2026-03-19  3:43       ` Michael Kelley
  1 sibling, 2 replies; 20+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-18 10:01 UTC (permalink / raw)
  To: Michael Kelley
  Cc: Jan Kiszka, K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Long Li, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86@kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, Florian Bezdeka, RT, Mitchell Levy,
	Saurabh Singh Sengar, Naman Jain

On 2026-03-17 17:25:20 [+0000], Michael Kelley wrote:
> From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Sent: Thursday, March 12, 2026 10:07 AM
> >
> 
> Let me try to address the range of questions here and in the follow-up
> discussion. As background, an overview of VMBus interrupt handling is in:
> 
> Documentation/virt/hyperv/vmbus.rst
> 
> in the section entitled "Synthetic Interrupt Controller (synic)". The
> relevant text is:
> 
>    The SINT is mapped to a single per-CPU architectural interrupt (i.e,
>    an 8-bit x86/x64 interrupt vector, or an arm64 PPI INTID). Because
>    each CPU in the guest has a synic and may receive VMBus interrupts,
>    they are best modeled in Linux as per-CPU interrupts. This model works
>    well on arm64 where a single per-CPU Linux IRQ is allocated for
>    VMBUS_MESSAGE_SINT. This IRQ appears in /proc/interrupts as an IRQ labelled
>    "Hyper-V VMbus". Since x86/x64 lacks support for per-CPU IRQs, an x86
>    interrupt vector is statically allocated (HYPERVISOR_CALLBACK_VECTOR)
>    across all CPUs and explicitly coded to call vmbus_isr(). In this case,
>    there's no Linux IRQ, and the interrupts are visible in aggregate in
>    /proc/interrupts on the "HYP" line.
> 
> The use of a statically allocated sysvec pre-dates my involvement in this
> code starting in 2017, but I believe it was modelled after what Xen does,
> and for the same reason -- to effectively create a per-CPU interrupt on
> x86/x64. Acorn is also using HYPERVISOR_CALLBACK_VECTOR, but I
> don't know if that is also to create a per-CPU interrupt.

If you create a vector, it becomes per-CPU. There is simply no mapping
from HYPERVISOR_CALLBACK_VECTOR to request_percpu_irq(). But if we had
this…

…
> > What clears this? This is wrongly placed. This should go to
> > sysvec_hyperv_callback() instead with its matching canceling part. The
> > add_interrupt_randomness() should also be there and not here.
> > sysvec_hyperv_stimer0() managed to do so.
> 
> I don't have any knowledge to bring regarding the use of
> lockdep_hardirq_threaded().

It is used in IRQ core to mark the execution of an interrupt handler
which becomes threaded in a forced-threaded scenario. The goal is to let
lockdep know that this piece of code on !RT will be threaded on RT and
therefore there is no need to report a possible locking problem that
will not exist on RT.

> > Different question: What guarantees that there won't be another
> > interrupt before this one is done? The handshake appears to be
> > deprecated. The interrupt itself returns ACKing (or not) but the actual
> > handler is delayed to this thread. Depending on the userland it could
> > take some time and I don't know how impatient the host is.
> 
> In more recent versions of Hyper-V, what's deprecated is Hyper-V implicitly
> and automatically doing the EOI. So in sysvec_hyperv_callback(), apic_eoi()
> is usually explicitly called to ack the interrupt.
> 
> There's no guarantee, in either the existing case or the new PREEMPT_RT
> case, that another VMBus interrupt won't come in on the same CPU
> before the tasklets scheduled by vmbus_message_sched() or
> vmbus_chan_sched() have run. From a functional standpoint, the Linux
> code and interaction with Hyper-V handles another interrupt correctly.

So there is no scenario that the host will trigger interrupts because
the guest is leaving the ISR without doing anything/ making progress?

> From a delay standpoint, there's not a problem for the normal (i.e., not
> PREEMPT_RT) case because the tasklets run as the interrupt exits -- they
> don't end up in ksoftirqd. For the PREEMPT_RT case, I can see your point
> about delays since the tasklets are scheduled from the new per-CPU thread.
> But my understanding is that Jan's motivation for these changes is not to
> achieve true RT behavior, since Hyper-V doesn't provide that anyway.
> The goal is simply to make PREEMPT_RT builds functional, though Jan may
> have further comments on the goal.

I would be worried if the host would storm interrupts to the guest
because it makes no progress.

> > > +		__vmbus_isr();
> > Moving on. This (trying very hard here) even schedules tasklets. Why?
> > You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
> > You don't want that.
> 
> Again, Jan can comment on the impact of delays due to ending up
> in ksoftirqd.

My point is that having this with threaded interrupt support would
eliminate the usage of tasklets.

> > Couldn't the whole logic be integrated into the IRQ code? Then we could
> > have mask/ unmask if supported/ provided and threaded interrupts. Then
> > sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
> > instead apic_eoi() + schedule_delayed_work().
> 
> As I described above, Hyper-V needs a per-CPU interrupt. It's faked up
> on x86/x64 with the hardcoded HYPERVISOR_CALLBACK_VECTOR sysvec
> entry, but on arm64 a normal Linux per-CPU IRQ is used. Once the execution
> path gets to vmbus_isr(), the two architectures share the same code. Same
> thing is done with the Hyper-V STIMER0 interrupt as a per-CPU interrupt.

This one has the "random" collecting on the right spot.

> If there's a better way to fake up a per-CPU interrupt on x86/x64, I'm open
> to looking at it.
> 
> As I recently discovered in discussion with Jan, standard Linux IRQ handling
> will *not* thread per-CPU interrupts. So even on arm64 with a standard
> Linux per-CPU IRQ is used for VMBus and STIMER0 interrupts, we can't
> request threading.

It would require a statement from the x86 & IRQ maintainers if it is
worth on x86 to allow passing HYPERVISOR_CALLBACK_VECTOR to
request_percpu_irq() and have an IRQF_ that this one needs to be forced
threaded. Otherwise we would need to remain with the workarounds.

If you say that an interrupt storm can not occur, I would prefer
|static DEFINE_WAIT_OVERRIDE_MAP(vmbus_map, LD_WAIT_CONFIG);
|…
|	lock_map_acquire_try(&vmbus_map);
|	__vmbus_isr();
|	lock_map_release(&vmbus_map);

while it has mostly the same effect.

Either way, that add_interrupt_randomness() should be moved to
sysvec_hyperv_callback() like it has been done for
sysvec_hyperv_stimer0(). It should be invoked twice now if it gets there
via vmbus_percpu_isr().

> I need to refresh my memory on sysvec_hyperv_reenlightenment(). If
> I recall correctly, it's not a per-CPU interrupt, so it probably doesn't
> need to have a hardcoded vector. Overall, the Hyper-V reenlightenment
> functionality is a bit of a fossil that isn't needed on modern x86/x64
> processors that support TSC scaling. And it doesn't exist for arm64.
> It might be worth seeing if it could be dropped entirely ...
> 
> Michael

Sebastian

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-18  9:08         ` Sebastian Andrzej Siewior
@ 2026-03-18 11:02           ` Jan Kiszka
  0 siblings, 0 replies; 20+ messages in thread
From: Jan Kiszka @ 2026-03-18 11:02 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, linux-hyperv, linux-kernel,
	Florian Bezdeka, RT, Mitchell Levy, Michael Kelley,
	Saurabh Singh Sengar, Naman Jain

On 18.03.26 10:08, Sebastian Andrzej Siewior wrote:
> On 2026-03-17 12:55:15 [+0100], Jan Kiszka wrote:
>> Point is that a task that was interrupted by a potentially threaded
>> interrupt keeps this flag longer that it needs it. And that is
>> apparently harmless, but fairly confusing.
> 
> correct. My only concern would be a shared handler where the second is
> not threaded.

The vmbus irqs are not shared (beyond what sysvec_hyperv_callback does).

> 
>>>> With that in mind, the new logic here is no different from the one the
>>>> kernel used before. If both are not doing what they should, we likely
>>>> want to add a generic reset of hardirq_threaded to the IRQ exit path(s).
>>>
>>> The difference is that you expect that _everyone_ calling this driver
>>> has everything else threaded. This might not be the case. That is why
>>> this should be in core knowing what is called if threaded, use in driver
>>> after explicit killing that flag afterwards since you don't know what
>>> can follow or add a generic threaded infrastructure here. 
>>
>> This driver is different, unfortunately. I'm not sure if we can / want
>> to thread everything that the platform interrupt does on x86. So far,
>> only the last part of it - vmbus handling - is threaded. On arm64, the
>> irq is exclusive (see vmbus_percpu_isr), thus everything can be and is
>> threaded.
> 
> No, it is a percpu interrupt which are not forced-threaded.

It is threaded now due to my patch.

> 
>>>>> Couldn't the whole logic be integrated into the IRQ code? Then we could
>>>>> have mask/ unmask if supported/ provided and threaded interrupts. Then
>>>>> sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
>>>>> instead apic_eoi() + schedule_delayed_work(). 
>>>>>
>>>>
>>>> Again, you are thinking x86-only. We need a portable solution.
>>>
>>> well, ARM could use a threaded interrupt, too.
>>
>> For a reason we didn't explore in details, per-CPU interrupts aren't
>> threaded. See older version of this patch
>> (https://lore.kernel.org/lkml/005a01dc9d30$a40515e0$ec0f41a0$@zohomail.com/)
>> where I thought I only had to fix x86, but arm64 was needing care as well.
> 
> Per-CPU are usually timers or other things which are not threaded and
> have their own thing for the "second" port and I only remember MCE using
> a workqueue for notification.

And the hv vmbus now provides a case where threading could be useful, at
least for arm64. For x86, we would have to check if the first half of
sysvec_hyperv_callback (mshv_handler) wants threading as well / would
support that.

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-18 10:01     ` Sebastian Andrzej Siewior
@ 2026-03-18 11:03       ` Jan Kiszka
  2026-03-18 11:21         ` Sebastian Andrzej Siewior
  2026-03-19  3:43       ` Michael Kelley
  1 sibling, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2026-03-18 11:03 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Michael Kelley
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	x86@kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, Florian Bezdeka, RT, Mitchell Levy,
	Saurabh Singh Sengar, Naman Jain

On 18.03.26 11:01, Sebastian Andrzej Siewior wrote:
> On 2026-03-17 17:25:20 [+0000], Michael Kelley wrote:
>> From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Sent: Thursday, March 12, 2026 10:07 AM
>>>
>>
>> Let me try to address the range of questions here and in the follow-up
>> discussion. As background, an overview of VMBus interrupt handling is in:
>>
>> Documentation/virt/hyperv/vmbus.rst
>>
>> in the section entitled "Synthetic Interrupt Controller (synic)". The
>> relevant text is:
>>
>>    The SINT is mapped to a single per-CPU architectural interrupt (i.e,
>>    an 8-bit x86/x64 interrupt vector, or an arm64 PPI INTID). Because
>>    each CPU in the guest has a synic and may receive VMBus interrupts,
>>    they are best modeled in Linux as per-CPU interrupts. This model works
>>    well on arm64 where a single per-CPU Linux IRQ is allocated for
>>    VMBUS_MESSAGE_SINT. This IRQ appears in /proc/interrupts as an IRQ labelled
>>    "Hyper-V VMbus". Since x86/x64 lacks support for per-CPU IRQs, an x86
>>    interrupt vector is statically allocated (HYPERVISOR_CALLBACK_VECTOR)
>>    across all CPUs and explicitly coded to call vmbus_isr(). In this case,
>>    there's no Linux IRQ, and the interrupts are visible in aggregate in
>>    /proc/interrupts on the "HYP" line.
>>
>> The use of a statically allocated sysvec pre-dates my involvement in this
>> code starting in 2017, but I believe it was modelled after what Xen does,
>> and for the same reason -- to effectively create a per-CPU interrupt on
>> x86/x64. Acorn is also using HYPERVISOR_CALLBACK_VECTOR, but I
>> don't know if that is also to create a per-CPU interrupt.
> 
> If you create a vector, it becomes per-CPU. There is simply no mapping
> from HYPERVISOR_CALLBACK_VECTOR to request_percpu_irq(). But if we had
> this…
> 
> …
>>> What clears this? This is wrongly placed. This should go to
>>> sysvec_hyperv_callback() instead with its matching canceling part. The
>>> add_interrupt_randomness() should also be there and not here.
>>> sysvec_hyperv_stimer0() managed to do so.
>>
>> I don't have any knowledge to bring regarding the use of
>> lockdep_hardirq_threaded().
> 
> It is used in IRQ core to mark the execution of an interrupt handler
> which becomes threaded in a forced-threaded scenario. The goal is to let
> lockdep know that this piece of code on !RT will be threaded on RT and
> therefore there is no need to report a possible locking problem that
> will not exist on RT.
> 
>>> Different question: What guarantees that there won't be another
>>> interrupt before this one is done? The handshake appears to be
>>> deprecated. The interrupt itself returns ACKing (or not) but the actual
>>> handler is delayed to this thread. Depending on the userland it could
>>> take some time and I don't know how impatient the host is.
>>
>> In more recent versions of Hyper-V, what's deprecated is Hyper-V implicitly
>> and automatically doing the EOI. So in sysvec_hyperv_callback(), apic_eoi()
>> is usually explicitly called to ack the interrupt.
>>
>> There's no guarantee, in either the existing case or the new PREEMPT_RT
>> case, that another VMBus interrupt won't come in on the same CPU
>> before the tasklets scheduled by vmbus_message_sched() or
>> vmbus_chan_sched() have run. From a functional standpoint, the Linux
>> code and interaction with Hyper-V handles another interrupt correctly.
> 
> So there is no scenario that the host will trigger interrupts because
> the guest is leaving the ISR without doing anything/ making progress?
> 
>> From a delay standpoint, there's not a problem for the normal (i.e., not
>> PREEMPT_RT) case because the tasklets run as the interrupt exits -- they
>> don't end up in ksoftirqd. For the PREEMPT_RT case, I can see your point
>> about delays since the tasklets are scheduled from the new per-CPU thread.
>> But my understanding is that Jan's motivation for these changes is not to
>> achieve true RT behavior, since Hyper-V doesn't provide that anyway.
>> The goal is simply to make PREEMPT_RT builds functional, though Jan may
>> have further comments on the goal.
> 
> I would be worried if the host would storming interrupts to the guest
> because it makes no progress.
> 
>>>> +		__vmbus_isr();
>>> Moving on. This (trying very hard here) even schedules tasklets. Why?
>>> You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
>>> You don't want that.
>>
>> Again, Jan can comment on the impact of delays due to ending up
>> in ksoftirqd.
> 
> My point is that having this with threaded interrupt support would
> eliminate the usage of tasklets.
> 
>>> Couldn't the whole logic be integrated into the IRQ code? Then we could
>>> have mask/ unmask if supported/ provided and threaded interrupts. Then
>>> sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
>>> instead apic_eoi() + schedule_delayed_work().
>>
>> As I described above, Hyper-V needs a per-CPU interrupt. It's faked up
>> on x86/x64 with the hardcoded HYPERVISOR_CALLBACK_VECTOR sysvec
>> entry, but on arm64 a normal Linux per-CPU IRQ is used. Once the execution
>> path gets to vmbus_isr(), the two architectures share the same code. Same
>> thing is done with the Hyper-V STIMER0 interrupt as a per-CPU interrupt.
> 
> This one has the "random" collecting on the right spot.
> 
>> If there's a better way to fake up a per-CPU interrupt on x86/x64, I'm open
>> to looking at it.
>>
>> As I recently discovered in discussion with Jan, standard Linux IRQ handling
>> will *not* thread per-CPU interrupts. So even on arm64 with a standard
>> Linux per-CPU IRQ is used for VMBus and STIMER0 interrupts, we can't
>> request threading.
> 
> It would require a statement from the x86 & IRQ maintainers if it is
> worth on x86 to make allow pass HYPERVISOR_CALLBACK_VECTOR to
> request_percpu_irq() and have an IRQF_ that this one needs to be forced
> threaded. Otherwise we would need to remain with the workarounds.
> 
> If you say that an interrupt storm can not occur, I would prefer
> |static DEFINE_WAIT_OVERRIDE_MAP(vmbus_map, LD_WAIT_CONFIG);
> |…
> |	lock_map_acquire_try(&vmbus_map);
> |	__vmbus_isr();
> |	lock_map_release(&vmbus_map);
> 
> while it has mostly the same effect.
> 
> Either way, that add_interrupt_randomness() should be moved to
> sysvec_hyperv_callback() like it has been done for
> sysvec_hyperv_stimer0(). It should be invoked twice now if gets there
> via vmbus_percpu_isr().

No, this would degrade arm64.

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-18 11:03       ` Jan Kiszka
@ 2026-03-18 11:21         ` Sebastian Andrzej Siewior
  2026-03-18 12:12           ` Jan Kiszka
  0 siblings, 1 reply; 20+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-18 11:21 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Michael Kelley, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86@kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	Florian Bezdeka, RT, Mitchell Levy, Saurabh Singh Sengar,
	Naman Jain

On 2026-03-18 12:03:03 [+0100], Jan Kiszka wrote:
> > Either way, that add_interrupt_randomness() should be moved to
> > sysvec_hyperv_callback() like it has been done for
> > sysvec_hyperv_stimer0(). It should be invoked twice now if gets there
> > via vmbus_percpu_isr().
> 
> No, this would degrade arm64.

Okay. So why does this needs to be done for _this_ per-CPU IRQ on ARM64
but not for the others? What makes it so special?

> Jan

Sebastian

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-18 11:21         ` Sebastian Andrzej Siewior
@ 2026-03-18 12:12           ` Jan Kiszka
  0 siblings, 0 replies; 20+ messages in thread
From: Jan Kiszka @ 2026-03-18 12:12 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Michael Kelley, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86@kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	Florian Bezdeka, RT, Mitchell Levy, Saurabh Singh Sengar,
	Naman Jain

On 18.03.26 12:21, Sebastian Andrzej Siewior wrote:
> On 2026-03-18 12:03:03 [+0100], Jan Kiszka wrote:
>>> Either way, that add_interrupt_randomness() should be moved to
>>> sysvec_hyperv_callback() like it has been done for
>>> sysvec_hyperv_stimer0(). It should be invoked twice now if gets there
>>> via vmbus_percpu_isr().
>>
>> No, this would degrade arm64.
> 
> Okay. So why does this needs to be done for _this_ per-CPU IRQ on ARM64
> but not for the others? What does it make so special.
> 

See the other thread:
https://lore.kernel.org/lkml/SN6PR02MB41573332BF202DAE0AF79ED1D441A@SN6PR02MB4157.namprd02.prod.outlook.com/

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: RE: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-18 10:01     ` Sebastian Andrzej Siewior
  2026-03-18 11:03       ` Jan Kiszka
@ 2026-03-19  3:43       ` Michael Kelley
  2026-03-19 10:14         ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 20+ messages in thread
From: Michael Kelley @ 2026-03-19  3:43 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Michael Kelley
  Cc: Jan Kiszka, K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Long Li, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86@kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, Florian Bezdeka, RT, Mitchell Levy,
	Saurabh Singh Sengar, Naman Jain

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Sent: Wednesday, March 18, 2026 3:02 AM
> 
> On 2026-03-17 17:25:20 [+0000], Michael Kelley wrote:
> > From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Sent: Thursday, March 12, 2026 10:07 AM
> > >
> > Let me try to address the range of questions here and in the follow-up
> > discussion. As background, an overview of VMBus interrupt handling is in:
> >
> > Documentation/virt/hyperv/vmbus.rst
> >
> > in the section entitled "Synthetic Interrupt Controller (synic)". The
> > relevant text is:
> >
> >    The SINT is mapped to a single per-CPU architectural interrupt (i.e,
> >    an 8-bit x86/x64 interrupt vector, or an arm64 PPI INTID). Because
> >    each CPU in the guest has a synic and may receive VMBus interrupts,
> >    they are best modeled in Linux as per-CPU interrupts. This model works
> >    well on arm64 where a single per-CPU Linux IRQ is allocated for
> >    VMBUS_MESSAGE_SINT. This IRQ appears in /proc/interrupts as an IRQ labelled
> >    "Hyper-V VMbus". Since x86/x64 lacks support for per-CPU IRQs, an x86
> >    interrupt vector is statically allocated (HYPERVISOR_CALLBACK_VECTOR)
> >    across all CPUs and explicitly coded to call vmbus_isr(). In this case,
> >    there's no Linux IRQ, and the interrupts are visible in aggregate in
> >    /proc/interrupts on the "HYP" line.
> >
> > The use of a statically allocated sysvec pre-dates my involvement in this
> > code starting in 2017, but I believe it was modelled after what Xen does,
> > and for the same reason -- to effectively create a per-CPU interrupt on
> > x86/x64. Acorn is also using HYPERVISOR_CALLBACK_VECTOR, but I
> > don't know if that is also to create a per-CPU interrupt.
> 
> If you create a vector, it becomes per-CPU. There is simply no mapping
> from HYPERVISOR_CALLBACK_VECTOR to request_percpu_irq(). But if we had
> this…

Indeed, yes, that would remove the need for all the per-CPU interrupt hackery
on x86/x64. I don't have any objection to someone pursuing that path, but it's
not something I can do. Full disclosure:  You'll see my name on Hyper-V and
VMBus stuff in the Linux kernel, with Microsoft as my employer. But I retired
from Microsoft 2.5 years ago, and my current involvement in Linux kernel work
is purely as a very part-time volunteer. I also lack access to hardware and the
test machinery needed to make more significant changes, particularly if multiple
versions of Hyper-V must be tested.

> 
> …
> > > What clears this? This is wrongly placed. This should go to
> > > sysvec_hyperv_callback() instead with its matching canceling part. The
> > > add_interrupt_randomness() should also be there and not here.
> > > sysvec_hyperv_stimer0() managed to do so.
> >
> > I don't have any knowledge to bring regarding the use of
> > lockdep_hardirq_threaded().
> 
> It is used in IRQ core to mark the execution of an interrupt handler
> which becomes threaded in a forced-threaded scenario. The goal is to let
> lockdep know that this piece of code on !RT will be threaded on RT and
> therefore there is no need to report a possible locking problem that
> will not exist on RT.
> 
> > > Different question: What guarantees that there won't be another
> > > interrupt before this one is done? The handshake appears to be
> > > deprecated. The interrupt itself returns ACKing (or not) but the actual
> > > handler is delayed to this thread. Depending on the userland it could
> > > take some time and I don't know how impatient the host is.
> >
> > In more recent versions of Hyper-V, what's deprecated is Hyper-V implicitly
> > and automatically doing the EOI. So in sysvec_hyperv_callback(), apic_eoi()
> > is usually explicitly called to ack the interrupt.
> >
> > There's no guarantee, in either the existing case or the new PREEMPT_RT
> > case, that another VMBus interrupt won't come in on the same CPU
> > before the tasklets scheduled by vmbus_message_sched() or
> > vmbus_chan_sched() have run. From a functional standpoint, the Linux
> > code and interaction with Hyper-V handles another interrupt correctly.
> 
> So there is no scenario that the host will trigger interrupts because
> the guest is leaving the ISR without doing anything/ making progress?
> 
> > From a delay standpoint, there's not a problem for the normal (i.e., not
> > PREEMPT_RT) case because the tasklets run as the interrupt exits -- they
> > don't end up in ksoftirqd. For the PREEMPT_RT case, I can see your point
> > about delays since the tasklets are scheduled from the new per-CPU thread.
> > But my understanding is that Jan's motivation for these changes is not to
> > achieve true RT behavior, since Hyper-V doesn't provide that anyway.
> > The goal is simply to make PREEMPT_RT builds functional, though Jan may
> > have further comments on the goal.
> 
> I would be worried if the host would storming interrupts to the guest
> because it makes no progress.

No, that kind of storming won't happen. The Hyper-V host<->guest
interface is based on message queues. The host interrupts the guest
if it puts a message in the queue that transitions the queue from
"empty" to "not empty". Eventually the tasklet enabled in vmbus_isr()
and its subsidiaries gets around to emptying the queue, which effectively
re-arms the interrupt. The host may add more messages to the queue,
but it doesn't interrupt again for that queue until the queue is empty.
If the guest is delayed in doing that emptying, nothing bad happens.

There could be multiple queues that interrupt the same vCPU in the
guest, so there might be another interrupt to the same vCPU due to
a different queue, but that could happen regardless of the latency in
emptying a queue. And the number of queues assigned to a vCPU
is at most a small integer.

> 
> > > > +		__vmbus_isr();
> > > Moving on. This (trying very hard here) even schedules tasklets. Why?
> > > You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
> > > You don't want that.
> >
> > Again, Jan can comment on the impact of delays due to ending up
> > in ksoftirqd.
> 
> My point is that having this with threaded interrupt support would
> eliminate the usage of tasklets.

Agreed, probably. For the non-RT case, the latency in getting to the
tasklet code *does* matter. I'm not familiar with how tasklets compare
to threaded interrupts on latency.

> 
> > > Couldn't the whole logic be integrated into the IRQ code? Then we could
> > > have mask/ unmask if supported/ provided and threaded interrupts. Then
> > > sysvec_hyperv_reenlightenment() could use a proper threaded interrupt
> > > instead apic_eoi() + schedule_delayed_work().
> >
> > As I described above, Hyper-V needs a per-CPU interrupt. It's faked up
> > on x86/x64 with the hardcoded HYPERVISOR_CALLBACK_VECTOR sysvec
> > entry, but on arm64 a normal Linux per-CPU IRQ is used. Once the execution
> > path gets to vmbus_isr(), the two architectures share the same code. Same
> > thing is done with the Hyper-V STIMER0 interrupt as a per-CPU interrupt.
> 
> This one has the "random" collecting on the right spot.

Regarding the timer path, see my comment in the other email thread.

> 
> > If there's a better way to fake up a per-CPU interrupt on x86/x64, I'm open
> > to looking at it.
> >
> > As I recently discovered in discussion with Jan, standard Linux IRQ handling
> > will *not* thread per-CPU interrupts. So even on arm64 with a standard
> > Linux per-CPU IRQ is used for VMBus and STIMER0 interrupts, we can't
> > request threading.
> 
> It would require a statement from the x86 & IRQ maintainers if it is
> worth on x86 to make allow pass HYPERVISOR_CALLBACK_VECTOR to
> request_percpu_irq() and have an IRQF_ that this one needs to be forced
> threaded. Otherwise we would need to remain with the workarounds.

Again, you or someone else is welcome to explore this topic.

> 
> If you say that an interrupt storm can not occur, I would prefer
> |static DEFINE_WAIT_OVERRIDE_MAP(vmbus_map, LD_WAIT_CONFIG);
> |…
> |	lock_map_acquire_try(&vmbus_map);
> |	__vmbus_isr();
> |	lock_map_release(&vmbus_map);
> 
> while it has mostly the same effect.
> 
> Either way, that add_interrupt_randomness() should be moved to
> sysvec_hyperv_callback() like it has been done for
> sysvec_hyperv_stimer0(). It should be invoked twice now if gets there
> via vmbus_percpu_isr().
> 
> > I need to refresh my memory on sysvec_hyperv_reenlightenment(). If
> > I recall correctly, it's not a per-CPU interrupt, so it probably doesn't
> > need to have a hardcoded vector. Overall, the Hyper-V reenlightenment
> > functionality is a bit of a fossil that isn't needed on modern x86/x64
> > processors that support TSC scaling. And it doesn't exist for arm64.
> > It might be worth seeing if it could be dropped entirely ...

I've refreshed my memory on the reenlightenment functionality, and
I think it has to stay. The functionality is used by KVM when it is running
in an L1 VM on an L0 Hyper-V host, and supporting its own L2 guest VMs.
I will check with Vitaly Kuznetsov, who originally added the reenlightenment
support for KVM, but I suspect it needs to stay for a few more years.

Old Hyper-V version support has been dropped in the past [1], but the
situation with reenlightenment is more than just the Hyper-V version.

Michael

[1] https://lore.kernel.org/all/1651509391-2058-2-git-send-email-mikelley@microsoft.com/

> >
> > Michael
> 
> Sebastian


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: RE: RE: [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT
  2026-03-19  3:43       ` Michael Kelley
@ 2026-03-19 10:14         ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 20+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-19 10:14 UTC (permalink / raw)
  To: Michael Kelley
  Cc: Jan Kiszka, K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Long Li, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86@kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, Florian Bezdeka, RT, Mitchell Levy,
	Saurabh Singh Sengar, Naman Jain

On 2026-03-19 03:43:12 [+0000], Michael Kelley wrote:
> Indeed, yes, that would remove the need for all the per-CPU interrupt hackery
> on x86/x64. I don't have any objection to someone pursuing that path, but it's
> not something I can do. Full disclosure:  You'll see my name on Hyper-V and
> VMBus stuff in the Linux kernel, with Microsoft as my employer. But I retired
> from Microsoft 2.5 years ago, and my current involvement in Linux kernel work
> is purely as a very part-time volunteer. I also lack access to hardware and the
> test machinery needed to make more significant changes, particularly if multiple
> versions of Hyper-V must be tested.

right. Then I would only ask for better annotation instead of this current
thingy.

> > I would be worried if the host would storming interrupts to the guest
> > because it makes no progress.
> 
> No, that kind of storming won't happen. The Hyper-V host<->guest
> interface is based on message queues. The host interrupts the guest
> if it puts a message in the queue that transitions the queue from
> "empty" to "not empty". Eventually the tasklet enabled in vmbus_isr()
> and its subsidiaries gets around to emptying the queue, which effectively
> re-arms the interrupt. The host may add more messages to the queue,
> but it doesn't interrupt again for that queue until the queue is empty.
> If the guest is delayed in doing that emptying, nothing bad happens.

Okay.

> > > > Moving on. This (trying very hard here) even schedules tasklets. Why?
> > > > You need to disable BH before doing so. Otherwise it ends in ksoftirqd.
> > > > You don't want that.
> > >
> > > Again, Jan can comment on the impact of delays due to ending up
> > > in ksoftirqd.
> > 
> > My point is that having this with threaded interrupt support would
> > eliminate the usage of tasklets.
> 
> Agreed, probably. For the non-RT case, the latency in getting to the
> tasklet code *does* matter. I'm not familiar with how tasklets compare
> to threaded interrupts on latency.

There shouldn't be much difference on level where it actually matters.

Sebastian

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2026-03-19 10:14 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-16 16:24 [PATCH v3] drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT Jan Kiszka
2026-02-17  6:42 ` Michael Kelley
2026-02-17 23:03 ` Bezdeka, Florian
2026-02-18  6:48   ` Jan Kiszka
2026-02-18  7:05 ` Wei Liu
2026-02-18  7:19   ` Saurabh Singh Sengar
2026-03-12 17:07 ` Sebastian Andrzej Siewior
2026-03-17  7:49   ` Jan Kiszka
2026-03-17 11:01     ` Sebastian Andrzej Siewior
2026-03-17 11:55       ` Jan Kiszka
2026-03-18  9:08         ` Sebastian Andrzej Siewior
2026-03-18 11:02           ` Jan Kiszka
2026-03-17 17:25   ` Michael Kelley
2026-03-18  5:52     ` Jan Kiszka
2026-03-18 10:01     ` Sebastian Andrzej Siewior
2026-03-18 11:03       ` Jan Kiszka
2026-03-18 11:21         ` Sebastian Andrzej Siewior
2026-03-18 12:12           ` Jan Kiszka
2026-03-19  3:43       ` Michael Kelley
2026-03-19 10:14         ` Sebastian Andrzej Siewior

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox