* Re: [PATCH v5 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Anirudh Rayabharam @ 2026-02-25 12:12 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aZys_5A657AYq5DQ@skinsburskii.localdomain>
On Mon, Feb 23, 2026 at 11:39:43AM -0800, Stanislav Kinsburskii wrote:
> On Mon, Feb 23, 2026 at 02:01:59PM +0000, Anirudh Rayabharam wrote:
> > From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> >
> > On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
> > interrupts (SINTs) from the hypervisor for doorbells and intercepts.
> > There is no such vector reserved for arm64.
> >
> > On arm64, the hypervisor exposes a synthetic register that can be read
> > to find the INTID that should be used for SINTs. This INTID is in the
> > PPI range.
> >
> > To better unify the code paths, introduce mshv_sint_vector_init() that
> > either reads the synthetic register and obtains the INTID (arm64) or
> > just uses HYPERVISOR_CALLBACK_VECTOR as the interrupt vector (x86).
> >
> > Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> > ---
> > drivers/hv/mshv_synic.c | 120 +++++++++++++++++++++++++++++++++---
> > include/hyperv/hvgdk_mini.h | 2 +
> > 2 files changed, 112 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
> > index 074e37c48876..75ef2160b3e0 100644
> > --- a/drivers/hv/mshv_synic.c
> > +++ b/drivers/hv/mshv_synic.c
> > @@ -10,17 +10,22 @@
> > #include <linux/kernel.h>
> > #include <linux/slab.h>
> > #include <linux/mm.h>
> > +#include <linux/interrupt.h>
> > #include <linux/io.h>
> > #include <linux/random.h>
> > #include <linux/cpuhotplug.h>
> > #include <linux/reboot.h>
> > #include <asm/mshyperv.h>
> > +#include <linux/platform_device.h>
> > +#include <linux/acpi.h>
> >
> > #include "mshv_eventfd.h"
> > #include "mshv.h"
> >
> > static int synic_cpuhp_online;
> > static struct hv_synic_pages __percpu *synic_pages;
> > +static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */
> > +static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */
> >
> > static u32 synic_event_ring_get_queued_port(u32 sint_index)
> > {
> > @@ -442,9 +447,7 @@ void mshv_isr(void)
> > if (msg->header.message_flags.msg_pending)
> > hv_set_non_nested_msr(HV_MSR_EOM, 0);
> >
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR);
> > -#endif
> > + add_interrupt_randomness(mshv_sint_vector);
> > } else {
> > pr_warn_once("%s: unknown message type 0x%x\n", __func__,
> > msg->header.message_type);
> > @@ -456,9 +459,7 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> > union hv_synic_simp simp;
> > union hv_synic_siefp siefp;
> > union hv_synic_sirbp sirbp;
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > union hv_synic_sint sint;
> > -#endif
> > union hv_synic_scontrol sctrl;
> > struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> > struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> > @@ -501,10 +502,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> >
> > hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
> >
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > + if (mshv_sint_irq != -1)
> > + enable_percpu_irq(mshv_sint_irq, 0);
> > +
> > /* Enable intercepts */
> > sint.as_uint64 = 0;
> > - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> > + sint.vector = mshv_sint_vector;
> > sint.masked = false;
> > sint.auto_eoi = hv_recommend_using_aeoi();
> > hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
> > @@ -512,13 +515,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> >
> > /* Doorbell SINT */
> > sint.as_uint64 = 0;
> > - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> > + sint.vector = mshv_sint_vector;
> > sint.masked = false;
> > sint.as_intercept = 1;
> > sint.auto_eoi = hv_recommend_using_aeoi();
> > hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> > sint.as_uint64);
> > -#endif
> >
> > /* Enable global synic bit */
> > sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
> > @@ -573,6 +575,9 @@ static int mshv_synic_cpu_exit(unsigned int cpu)
> > hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> > sint.as_uint64);
> >
> > + if (mshv_sint_irq != -1)
> > + disable_percpu_irq(mshv_sint_irq);
> > +
> > /* Disable Synic's event ring page */
> > sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
> > sirbp.sirbp_enabled = false;
> > @@ -683,14 +688,106 @@ static struct notifier_block mshv_synic_reboot_nb = {
> > .notifier_call = mshv_synic_reboot_notify,
> > };
> >
> > +#ifndef HYPERVISOR_CALLBACK_VECTOR
> > +static DEFINE_PER_CPU(long, mshv_evt);
> > +
> > +static irqreturn_t mshv_percpu_isr(int irq, void *dev_id)
> > +{
> > + mshv_isr();
> > + return IRQ_HANDLED;
> > +}
> > +
> > +#ifdef CONFIG_ACPI
> > +static int __init mshv_acpi_setup_sint_irq(void)
> > +{
> > + return acpi_register_gsi(NULL, mshv_sint_vector, ACPI_EDGE_SENSITIVE,
> > + ACPI_ACTIVE_HIGH);
> > +}
> > +
> > +static void mshv_acpi_cleanup_sint_irq(void)
> > +{
> > + acpi_unregister_gsi(mshv_sint_vector);
> > +}
> > +#else
> > +static int __init mshv_acpi_setup_sint_irq(void)
> > +{
> > + return -ENODEV;
> > +}
> > +
> > +static void mshv_acpi_cleanup_sint_irq(void)
> > +{
> > +}
> > +#endif
> > +
> > +static int __init mshv_sint_vector_init(void)
> > +{
> > + int ret;
> > + struct hv_register_assoc reg = {
> > + .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
> > + };
> > + union hv_input_vtl input_vtl = { 0 };
> > +
> > + if (acpi_disabled)
> > + return -ENODEV;
> > +
> > + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> > + 1, input_vtl, &reg);
> > + if (ret || !reg.value.reg64)
> > + return -ENODEV;
> > +
> > + mshv_sint_vector = reg.value.reg64;
> > + ret = mshv_acpi_setup_sint_irq();
> > + if (ret <= 0) {
> > + pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
> > + mshv_sint_vector, ret);
> > + goto out_fail;
> > + }
> > +
> > + mshv_sint_irq = ret;
>
> nit: given that mshv_sint_irq can't be zero, the logic can be simplified by
> using 0 instead of -1.
>
>
>
> > +
> > + ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
> > + &mshv_evt);
> > + if (ret)
> > + goto out_unregister;
> > +
> > + return 0;
> > +
> > +out_unregister:
> > + mshv_acpi_cleanup_sint_irq();
> > +out_fail:
> > + return ret;
> > +}
> > +
> > +static void mshv_sint_vector_cleanup(void)
> > +{
> > + free_percpu_irq(mshv_sint_irq, &mshv_evt);
> > + mshv_acpi_cleanup_sint_irq();
> > +}
> > +#else /* !HYPERVISOR_CALLBACK_VECTOR */
> > +static int __init mshv_sint_vector_init(void)
>
> nit: `init` is usually paired with `exit` or `fini`, so maybe `cleanup` can be
> renamed to `exit` as well for better consistency?
>
> Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Thanks! I'll fix the naming inconsistencies and also pick up your
Reviewed-by in v6.
Anirudh.
>
> > +{
> > + mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
> > + return 0;
> > +}
> > +
> > +static void mshv_sint_vector_cleanup(void)
> > +{
> > +}
> > +#endif /* HYPERVISOR_CALLBACK_VECTOR */
> > +
> > int __init mshv_synic_init(struct device *dev)
> > {
> > int ret = 0;
> >
> > + ret = mshv_sint_vector_init();
> > + if (ret)
> > + return ret;
> > +
> > synic_pages = alloc_percpu(struct hv_synic_pages);
> > if (!synic_pages) {
> > dev_err(dev, "Failed to allocate percpu synic page\n");
> > - return -ENOMEM;
> > + ret = -ENOMEM;
> > + goto sint_vector_cleanup;
> > }
> >
> > ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> > @@ -713,6 +810,8 @@ int __init mshv_synic_init(struct device *dev)
> > cpuhp_remove_state(synic_cpuhp_online);
> > free_synic_pages:
> > free_percpu(synic_pages);
> > +sint_vector_cleanup:
> > + mshv_sint_vector_cleanup();
> > return ret;
> > }
> >
> > @@ -721,4 +820,5 @@ void mshv_synic_cleanup(void)
> > unregister_reboot_notifier(&mshv_synic_reboot_nb);
> > cpuhp_remove_state(synic_cpuhp_online);
> > free_percpu(synic_pages);
> > + mshv_sint_vector_cleanup();
> > }
> > diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> > index 30fbbde81c5c..7676f78e0766 100644
> > --- a/include/hyperv/hvgdk_mini.h
> > +++ b/include/hyperv/hvgdk_mini.h
> > @@ -1117,6 +1117,8 @@ enum hv_register_name {
> > HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
> >
> > HV_X64_REGISTER_REG_PAGE = 0x0009001C,
> > +#elif defined(CONFIG_ARM64)
> > + HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID = 0x00070001,
> > #endif
> > };
> >
> > --
> > 2.34.1
> >
^ permalink raw reply
* Re: [PATCH v5 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Anirudh Rayabharam @ 2026-02-25 12:11 UTC (permalink / raw)
To: Michael Kelley
Cc: Stanislav Kinsburskii, kys@microsoft.com, haiyangz@microsoft.com,
wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <SN6PR02MB41575AC771D08F64AC00DC17D477A@SN6PR02MB4157.namprd02.prod.outlook.com>
On Mon, Feb 23, 2026 at 08:49:37PM +0000, Michael Kelley wrote:
> From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Monday, February 23, 2026 11:40 AM
> >
>
> [snip]
>
> > > +
> > > +static int __init mshv_sint_vector_init(void)
> > > +{
> > > + int ret;
> > > + struct hv_register_assoc reg = {
> > > + .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
> > > + };
> > > + union hv_input_vtl input_vtl = { 0 };
> > > +
> > > + if (acpi_disabled)
> > > + return -ENODEV;
> > > +
> > > + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> > > > + 1, input_vtl, &reg);
> > > + if (ret || !reg.value.reg64)
> > > + return -ENODEV;
> > > +
> > > + mshv_sint_vector = reg.value.reg64;
> > > + ret = mshv_acpi_setup_sint_irq();
> > > + if (ret <= 0) {
> > > + pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
> > > + mshv_sint_vector, ret);
> > > + goto out_fail;
> > > + }
> > > +
> > > + mshv_sint_irq = ret;
> >
> > nit: given that mshv_sint_irq can't be zero, the logic can be simplified by
> > using 0 instead of -1.
>
> The test for <= 0 is actually wrong -- it should be just < 0. Zero is a valid
> Linux IRQ number. For example, here's the output of /proc/interrupts on
> a Gen1 VM on Hyper-V, where IRQ 0 is used by the legacy timer:
>
> root@gen1ubun:~# cat /proc/interrupts
> CPU0 CPU1 CPU2 CPU3
> 0: 18 0 0 0 IR-IO-APIC 2-edge timer
> 1: 0 9 0 0 IR-IO-APIC 1-edge i8042
> 4: 0 0 0 792 IR-IO-APIC 4-edge ttyS0
> 6: 6 0 0 0 IR-IO-APIC 6-edge floppy
> 8: 0 0 0 0 IR-IO-APIC 8-edge rtc0
> 9: 0 0 0 0 IR-IO-APIC 9-fasteoi acpi
>
> But I see other places throughout Linux kernel code that treat IRQ 0 as
> invalid. So I dunno .... But it's probably better to treat 0 as a valid IRQ
> number.
Agreed. I will fix this check in v6.
Thanks,
Anirudh.
>
> Michael
>
> >
> >
> >
> > > +
> > > + ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
> > > + &mshv_evt);
> > > + if (ret)
> > > + goto out_unregister;
> > > +
> > > + return 0;
> > > +
> > > +out_unregister:
> > > + mshv_acpi_cleanup_sint_irq();
> > > +out_fail:
> > > + return ret;
> > > +}
> > > +
> > > +static void mshv_sint_vector_cleanup(void)
> > > +{
> > > + free_percpu_irq(mshv_sint_irq, &mshv_evt);
> > > + mshv_acpi_cleanup_sint_irq();
> > > +}
> > > +#else /* !HYPERVISOR_CALLBACK_VECTOR */
> > > +static int __init mshv_sint_vector_init(void)
> >
> > nit: `init` is usually paired with `exit` or `fini`, so maybe `cleanup` can be
> > renamed to `exit` as well for better consistency?
> >
> > Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> >
> > > +{
> > > + mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
> > > + return 0;
> > > +}
> > > +
> > > +static void mshv_sint_vector_cleanup(void)
> > > +{
> > > +}
> > > +#endif /* HYPERVISOR_CALLBACK_VECTOR */
> > > +
> > > int __init mshv_synic_init(struct device *dev)
> > > {
> > > int ret = 0;
> > >
> > > + ret = mshv_sint_vector_init();
> > > + if (ret)
> > > + return ret;
> > > +
> > > synic_pages = alloc_percpu(struct hv_synic_pages);
> > > if (!synic_pages) {
> > > dev_err(dev, "Failed to allocate percpu synic page\n");
> > > - return -ENOMEM;
> > > + ret = -ENOMEM;
> > > + goto sint_vector_cleanup;
> > > }
> > >
> > > ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> > > @@ -713,6 +810,8 @@ int __init mshv_synic_init(struct device *dev)
> > > cpuhp_remove_state(synic_cpuhp_online);
> > > free_synic_pages:
> > > free_percpu(synic_pages);
> > > +sint_vector_cleanup:
> > > + mshv_sint_vector_cleanup();
> > > return ret;
> > > }
> > >
> > > @@ -721,4 +820,5 @@ void mshv_synic_cleanup(void)
> > > unregister_reboot_notifier(&mshv_synic_reboot_nb);
> > > cpuhp_remove_state(synic_cpuhp_online);
> > > free_percpu(synic_pages);
> > > + mshv_sint_vector_cleanup();
> > > }
> > > diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> > > index 30fbbde81c5c..7676f78e0766 100644
> > > --- a/include/hyperv/hvgdk_mini.h
> > > +++ b/include/hyperv/hvgdk_mini.h
> > > @@ -1117,6 +1117,8 @@ enum hv_register_name {
> > > HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
> > >
> > > HV_X64_REGISTER_REG_PAGE = 0x0009001C,
> > > +#elif defined(CONFIG_ARM64)
> > > + HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID = 0x00070001,
> > > #endif
> > > };
> > >
> > > --
> > > 2.34.1
> > >
>
^ permalink raw reply
* Re: [PATCH v5 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Anirudh Rayabharam @ 2026-02-25 12:10 UTC (permalink / raw)
To: Michael Kelley
Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
decui@microsoft.com, longli@microsoft.com,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <SN6PR02MB4157FCA3268094CFAE5BA9D4D477A@SN6PR02MB4157.namprd02.prod.outlook.com>
On Mon, Feb 23, 2026 at 05:53:03PM +0000, Michael Kelley wrote:
> From: Anirudh Rayabharam <anirudh@anirudhrb.com> Sent: Monday, February 23, 2026 6:02 AM
> >
> > On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
> > interrupts (SINTs) from the hypervisor for doorbells and intercepts.
> > There is no such vector reserved for arm64.
> >
> > On arm64, the hypervisor exposes a synthetic register that can be read
> > to find the INTID that should be used for SINTs. This INTID is in the
> > PPI range.
> >
> > To better unify the code paths, introduce mshv_sint_vector_init() that
> > either reads the synthetic register and obtains the INTID (arm64) or
> > just uses HYPERVISOR_CALLBACK_VECTOR as the interrupt vector (x86).
> >
> > Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> > ---
> > drivers/hv/mshv_synic.c | 120 +++++++++++++++++++++++++++++++++---
> > include/hyperv/hvgdk_mini.h | 2 +
> > 2 files changed, 112 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
> > index 074e37c48876..75ef2160b3e0 100644
> > --- a/drivers/hv/mshv_synic.c
> > +++ b/drivers/hv/mshv_synic.c
> > @@ -10,17 +10,22 @@
> > #include <linux/kernel.h>
> > #include <linux/slab.h>
> > #include <linux/mm.h>
> > +#include <linux/interrupt.h>
> > #include <linux/io.h>
> > #include <linux/random.h>
> > #include <linux/cpuhotplug.h>
> > #include <linux/reboot.h>
> > #include <asm/mshyperv.h>
> > +#include <linux/platform_device.h>
>
> I don't think this #include is needed now that you've switched to getting
> the INTID via a hypercall instead of via an ACPI device.
>
> The rest of the changes look good to me. You have a place carved out
> to put the DT setup of the mshv_sint_irq, and the scope of all the
> variables and mshv_percpu_isr() is correct so that there won't be any
> "unused" warnings generated. Nice!
>
> Modulo the unnecessary #include,
> Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Thanks! I'll get rid of the unnecessary include in v6 and also pick up
your Reviewed-by.
Anirudh.
>
> > +#include <linux/acpi.h>
> >
> > #include "mshv_eventfd.h"
> > #include "mshv.h"
> >
> > static int synic_cpuhp_online;
> > static struct hv_synic_pages __percpu *synic_pages;
> > +static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */
> > +static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */
> >
> > static u32 synic_event_ring_get_queued_port(u32 sint_index)
> > {
> > @@ -442,9 +447,7 @@ void mshv_isr(void)
> > if (msg->header.message_flags.msg_pending)
> > hv_set_non_nested_msr(HV_MSR_EOM, 0);
> >
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR);
> > -#endif
> > + add_interrupt_randomness(mshv_sint_vector);
> > } else {
> > pr_warn_once("%s: unknown message type 0x%x\n", __func__,
> > msg->header.message_type);
> > @@ -456,9 +459,7 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> > union hv_synic_simp simp;
> > union hv_synic_siefp siefp;
> > union hv_synic_sirbp sirbp;
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > union hv_synic_sint sint;
> > -#endif
> > union hv_synic_scontrol sctrl;
> > struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> > struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> > @@ -501,10 +502,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> >
> > hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
> >
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > + if (mshv_sint_irq != -1)
> > + enable_percpu_irq(mshv_sint_irq, 0);
> > +
> > /* Enable intercepts */
> > sint.as_uint64 = 0;
> > - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> > + sint.vector = mshv_sint_vector;
> > sint.masked = false;
> > sint.auto_eoi = hv_recommend_using_aeoi();
> > hv_set_non_nested_msr(HV_MSR_SINT0 +
> > HV_SYNIC_INTERCEPTION_SINT_INDEX,
> > @@ -512,13 +515,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> >
> > /* Doorbell SINT */
> > sint.as_uint64 = 0;
> > - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> > + sint.vector = mshv_sint_vector;
> > sint.masked = false;
> > sint.as_intercept = 1;
> > sint.auto_eoi = hv_recommend_using_aeoi();
> > hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> > sint.as_uint64);
> > -#endif
> >
> > /* Enable global synic bit */
> > sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
> > @@ -573,6 +575,9 @@ static int mshv_synic_cpu_exit(unsigned int cpu)
> > hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> > sint.as_uint64);
> >
> > + if (mshv_sint_irq != -1)
> > + disable_percpu_irq(mshv_sint_irq);
> > +
> > /* Disable Synic's event ring page */
> > sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
> > sirbp.sirbp_enabled = false;
> > @@ -683,14 +688,106 @@ static struct notifier_block mshv_synic_reboot_nb = {
> > .notifier_call = mshv_synic_reboot_notify,
> > };
> >
> > +#ifndef HYPERVISOR_CALLBACK_VECTOR
> > +static DEFINE_PER_CPU(long, mshv_evt);
> > +
> > +static irqreturn_t mshv_percpu_isr(int irq, void *dev_id)
> > +{
> > + mshv_isr();
> > + return IRQ_HANDLED;
> > +}
> > +
> > +#ifdef CONFIG_ACPI
> > +static int __init mshv_acpi_setup_sint_irq(void)
> > +{
> > + return acpi_register_gsi(NULL, mshv_sint_vector, ACPI_EDGE_SENSITIVE,
> > + ACPI_ACTIVE_HIGH);
> > +}
> > +
> > +static void mshv_acpi_cleanup_sint_irq(void)
> > +{
> > + acpi_unregister_gsi(mshv_sint_vector);
> > +}
> > +#else
> > +static int __init mshv_acpi_setup_sint_irq(void)
> > +{
> > + return -ENODEV;
> > +}
> > +
> > +static void mshv_acpi_cleanup_sint_irq(void)
> > +{
> > +}
> > +#endif
> > +
> > +static int __init mshv_sint_vector_init(void)
> > +{
> > + int ret;
> > + struct hv_register_assoc reg = {
> > + .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
> > + };
> > + union hv_input_vtl input_vtl = { 0 };
> > +
> > + if (acpi_disabled)
> > + return -ENODEV;
> > +
> > + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> > + 1, input_vtl, &reg);
> > + if (ret || !reg.value.reg64)
> > + return -ENODEV;
> > +
> > + mshv_sint_vector = reg.value.reg64;
> > + ret = mshv_acpi_setup_sint_irq();
> > + if (ret <= 0) {
> > + pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
> > + mshv_sint_vector, ret);
> > + goto out_fail;
> > + }
> > +
> > + mshv_sint_irq = ret;
> > +
> > + ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
> > + &mshv_evt);
> > + if (ret)
> > + goto out_unregister;
> > +
> > + return 0;
> > +
> > +out_unregister:
> > + mshv_acpi_cleanup_sint_irq();
> > +out_fail:
> > + return ret;
> > +}
> > +
> > +static void mshv_sint_vector_cleanup(void)
> > +{
> > + free_percpu_irq(mshv_sint_irq, &mshv_evt);
> > + mshv_acpi_cleanup_sint_irq();
> > +}
> > +#else /* !HYPERVISOR_CALLBACK_VECTOR */
> > +static int __init mshv_sint_vector_init(void)
> > +{
> > + mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
> > + return 0;
> > +}
> > +
> > +static void mshv_sint_vector_cleanup(void)
> > +{
> > +}
> > +#endif /* HYPERVISOR_CALLBACK_VECTOR */
> > +
> > int __init mshv_synic_init(struct device *dev)
> > {
> > int ret = 0;
> >
> > + ret = mshv_sint_vector_init();
> > + if (ret)
> > + return ret;
> > +
> > synic_pages = alloc_percpu(struct hv_synic_pages);
> > if (!synic_pages) {
> > dev_err(dev, "Failed to allocate percpu synic page\n");
> > - return -ENOMEM;
> > + ret = -ENOMEM;
> > + goto sint_vector_cleanup;
> > }
> >
> > ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> > @@ -713,6 +810,8 @@ int __init mshv_synic_init(struct device *dev)
> > cpuhp_remove_state(synic_cpuhp_online);
> > free_synic_pages:
> > free_percpu(synic_pages);
> > +sint_vector_cleanup:
> > + mshv_sint_vector_cleanup();
> > return ret;
> > }
> >
> > @@ -721,4 +820,5 @@ void mshv_synic_cleanup(void)
> > unregister_reboot_notifier(&mshv_synic_reboot_nb);
> > cpuhp_remove_state(synic_cpuhp_online);
> > free_percpu(synic_pages);
> > + mshv_sint_vector_cleanup();
> > }
> > diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> > index 30fbbde81c5c..7676f78e0766 100644
> > --- a/include/hyperv/hvgdk_mini.h
> > +++ b/include/hyperv/hvgdk_mini.h
> > @@ -1117,6 +1117,8 @@ enum hv_register_name {
> > HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
> >
> > HV_X64_REGISTER_REG_PAGE = 0x0009001C,
> > +#elif defined(CONFIG_ARM64)
> > + HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID = 0x00070001,
> > #endif
> > };
> >
> > --
> > 2.34.1
> >
>
^ permalink raw reply
* Re: [EXTERNAL] [PATCH rdma-next 25/50] RDMA/mana: Provide a modern CQ creation interface
From: Leon Romanovsky @ 2026-02-25 8:24 UTC (permalink / raw)
To: Long Li
Cc: Jason Gunthorpe, Selvin Xavier, Kalesh AP, Potnuri Bharat Teja,
Michael Margolin, Gal Pressman, Yossi Leybovich, Cheng Xu,
Kai Shen, Chengchang Tang, Junxian Huang, Abhijit Gangurde,
Allen Hubbe, Krzysztof Czurylo, Tatyana Nikolova,
Konstantin Taranov, Yishai Hadas, Michal Kalderon, Bryan Tan,
Vishnu Dasa, Broadcom internal kernel review list,
Christian Benvenuti, Nelson Escobar, Dennis Dalessandro,
Bernard Metzler, Zhu Yanjun, Shiraz Saleem,
linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-hyperv@vger.kernel.org
In-Reply-To: <DS3PR21MB5735C22704C2AA25C5037EA5CE74A@DS3PR21MB5735.namprd21.prod.outlook.com>
On Tue, Feb 24, 2026 at 10:30:37PM +0000, Long Li wrote:
> > diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
> > index 2dce1b677115..605122ecf9f9 100644
> > --- a/drivers/infiniband/hw/mana/cq.c
> > +++ b/drivers/infiniband/hw/mana/cq.c
> > @@ -5,8 +5,8 @@
> >
> > #include "mana_ib.h"
> >
> > -int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
> > - struct uverbs_attr_bundle *attrs)
> > +int mana_ib_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr
> > *attr,
> > + struct uverbs_attr_bundle *attrs)
> > {
> > struct ib_udata *udata = &attrs->driver_udata;
> > struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
> > @@ -17,7 +17,6 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
> > ib_cq_init_attr *attr,
> > struct mana_ib_dev *mdev;
> > bool is_rnic_cq;
> > u32 doorbell;
> > - u32 buf_size;
> > int err;
> >
> > mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); @@ -26,44
> > +25,100 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
> > ib_cq_init_attr *attr,
> > cq->cq_handle = INVALID_MANA_HANDLE;
> > is_rnic_cq = mana_ib_is_rnic(mdev);
> >
> > - if (udata) {
> > - if (udata->inlen < offsetof(struct mana_ib_create_cq, flags))
> > - return -EINVAL;
> > + if (udata->inlen < offsetof(struct mana_ib_create_cq, flags))
> > + return -EINVAL;
> >
> > - err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd),
> > udata->inlen));
> > - if (err) {
> > - ibdev_dbg(ibdev, "Failed to copy from udata for create
> > cq, %d\n", err);
> > - return err;
> > - }
> > + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata-
> > >inlen));
> > + if (err) {
> > + ibdev_dbg(ibdev, "Failed to copy from udata for create
> > cq, %d\n", err);
> > + return err;
> > + }
> >
> > - if ((!is_rnic_cq && attr->cqe > mdev-
> > >adapter_caps.max_qp_wr) ||
> > - attr->cqe > U32_MAX / COMP_ENTRY_SIZE) {
> > - ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr-
> > >cqe);
> > - return -EINVAL;
> > - }
> > + if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) ||
> > + attr->cqe > U32_MAX / COMP_ENTRY_SIZE) {
> > + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
> > + return -EINVAL;
> > + }
> > +
> > + cq->cqe = attr->cqe;
> > + if (!ibcq->umem)
> > + ibcq->umem = ib_umem_get(ibdev, ucmd.buf_addr,
> > + cq->cqe * COMP_ENTRY_SIZE,
> > + IB_ACCESS_LOCAL_WRITE);
> > + if (IS_ERR(ibcq->umem))
> > + return PTR_ERR(ibcq->umem);
> > + cq->queue.umem = ibcq->umem;
> > +
> > + err = mana_ib_create_queue(mdev, &cq->queue);
> > + if (err)
> > + return err;
>
> Should we call ib_umem_release() on this err?
<...>
> > err_destroy_queue:
> > mana_ib_destroy_queue(mdev, &qp->raw_sq);
> > + return err;
>
> Should remove this "return err", the error handling code should fall through.
The main idea of this series is to allocate/release umem in the core logic.
See patch #5 https://lore.kernel.org/linux-rdma/20260213-refactor-umem-v1-5-f3be85847922@nvidia.com/
>
> > +
> > +err_release_umem:
> > + ib_umem_release(qp->raw_sq.umem);
> >
> > err_free_vport:
> > mana_ib_uncfg_vport(mdev, pd, port);
> > @@ -553,13 +566,25 @@ static int mana_ib_create_rc_qp(struct ib_qp *ibqp,
> > struct ib_pd *ibpd,
> > if (i == MANA_RC_SEND_QUEUE_FMR) {
> > qp->rc_qp.queues[i].id = INVALID_QUEUE_ID;
> > qp->rc_qp.queues[i].gdma_region =
> > GDMA_INVALID_DMA_REGION;
> > + qp->rc_qp.queues[i].umem = NULL;
> > continue;
> > }
> > - err = mana_ib_create_queue(mdev, ucmd.queue_buf[j],
> > ucmd.queue_size[j],
> > - &qp->rc_qp.queues[i]);
> > + qp->rc_qp.queues[i].umem = ib_umem_get(&mdev->ib_dev,
> > + ucmd.queue_buf[j],
> > + ucmd.queue_size[j],
> > +
> > IB_ACCESS_LOCAL_WRITE);
> > + if (IS_ERR(qp->rc_qp.queues[i].umem)) {
> > + err = PTR_ERR(qp->rc_qp.queues[i].umem);
> > + ibdev_err(&mdev->ib_dev, "Failed to get umem for
> > queue %d, err %d\n",
> > + i, err);
> > + goto release_umems;
>
> mana_ib_create_queue() may already have created some queues, need to clean them up or we have a leak.
>
> Maybe use destroy_queues: to call ib_umem_release()?
We should remove mana_ib_create_rc_qp() hunk, it came from my future
work where I removed umem from QPs as well.
Thanks
^ permalink raw reply
* Re: [PATCH] mshv: Replace fixed memory deposit with status driven helper
From: Anirudh Rayabharam @ 2026-02-25 5:20 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <177153896491.48883.14285093878498416061.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On Thu, Feb 19, 2026 at 10:09:32PM +0000, Stanislav Kinsburskii wrote:
> Replace hardcoded HV_MAP_GPA_DEPOSIT_PAGES usage with
> hv_deposit_memory() which derives the deposit size from
> the hypercall status, and remove the now-unused constant.
>
> The previous code always deposited a fixed 256 pages on
> insufficient memory, ignoring the actual demand reported
> by the hypervisor. hv_deposit_memory() handles different
> deposit statuses, aligning map-GPA retries with the rest
> of the codebase.
>
> This approach may require more allocation and deposit
> hypercall iterations, but avoids over-depositing large
> fixed chunks when fewer pages would suffice. Until any
> performance impact is measured, the more frugal and
> consistent behavior is preferred.
>
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
> drivers/hv/mshv_root_hv_call.c | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> index 7f91096f95a8..317191462b63 100644
> --- a/drivers/hv/mshv_root_hv_call.c
> +++ b/drivers/hv/mshv_root_hv_call.c
> @@ -16,7 +16,6 @@
>
> /* Determined empirically */
> #define HV_INIT_PARTITION_DEPOSIT_PAGES 208
> -#define HV_MAP_GPA_DEPOSIT_PAGES 256
> #define HV_UMAP_GPA_PAGES 512
>
> #define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1)))
> @@ -239,8 +238,7 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> completed = hv_repcomp(status);
>
> if (hv_result_needs_memory(status)) {
> - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> - HV_MAP_GPA_DEPOSIT_PAGES);
> + ret = hv_deposit_memory(partition_id, status);
> if (ret)
> break;
>
>
>
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
^ permalink raw reply
* RE: [EXTERNAL] [PATCH rdma-next 25/50] RDMA/mana: Provide a modern CQ creation interface
From: Long Li @ 2026-02-24 22:30 UTC (permalink / raw)
To: Leon Romanovsky, Jason Gunthorpe, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun,
Shiraz Saleem
Cc: linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-hyperv@vger.kernel.org
In-Reply-To: <20260213-refactor-umem-v1-25-f3be85847922@nvidia.com>
> diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
> index 2dce1b677115..605122ecf9f9 100644
> --- a/drivers/infiniband/hw/mana/cq.c
> +++ b/drivers/infiniband/hw/mana/cq.c
> @@ -5,8 +5,8 @@
>
> #include "mana_ib.h"
>
> -int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
> - struct uverbs_attr_bundle *attrs)
> +int mana_ib_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr
> *attr,
> + struct uverbs_attr_bundle *attrs)
> {
> struct ib_udata *udata = &attrs->driver_udata;
> struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
> @@ -17,7 +17,6 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
> ib_cq_init_attr *attr,
> struct mana_ib_dev *mdev;
> bool is_rnic_cq;
> u32 doorbell;
> - u32 buf_size;
> int err;
>
> mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); @@ -26,44
> +25,100 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
> ib_cq_init_attr *attr,
> cq->cq_handle = INVALID_MANA_HANDLE;
> is_rnic_cq = mana_ib_is_rnic(mdev);
>
> - if (udata) {
> - if (udata->inlen < offsetof(struct mana_ib_create_cq, flags))
> - return -EINVAL;
> + if (udata->inlen < offsetof(struct mana_ib_create_cq, flags))
> + return -EINVAL;
>
> - err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd),
> udata->inlen));
> - if (err) {
> - ibdev_dbg(ibdev, "Failed to copy from udata for create
> cq, %d\n", err);
> - return err;
> - }
> + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata-
> >inlen));
> + if (err) {
> + ibdev_dbg(ibdev, "Failed to copy from udata for create
> cq, %d\n", err);
> + return err;
> + }
>
> - if ((!is_rnic_cq && attr->cqe > mdev-
> >adapter_caps.max_qp_wr) ||
> - attr->cqe > U32_MAX / COMP_ENTRY_SIZE) {
> - ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr-
> >cqe);
> - return -EINVAL;
> - }
> + if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) ||
> + attr->cqe > U32_MAX / COMP_ENTRY_SIZE) {
> + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
> + return -EINVAL;
> + }
> +
> + cq->cqe = attr->cqe;
> + if (!ibcq->umem)
> + ibcq->umem = ib_umem_get(ibdev, ucmd.buf_addr,
> + cq->cqe * COMP_ENTRY_SIZE,
> + IB_ACCESS_LOCAL_WRITE);
> + if (IS_ERR(ibcq->umem))
> + return PTR_ERR(ibcq->umem);
> + cq->queue.umem = ibcq->umem;
> +
> + err = mana_ib_create_queue(mdev, &cq->queue);
> + if (err)
> + return err;
Should we call ib_umem_release() on this err?
>
> diff --git a/drivers/infiniband/hw/mana/qp.c
> b/drivers/infiniband/hw/mana/qp.c index 48c1f4977f21..b08dbc675741
> 100644
> --- a/drivers/infiniband/hw/mana/qp.c
> +++ b/drivers/infiniband/hw/mana/qp.c
> @@ -326,11 +326,20 @@ static int mana_ib_create_qp_raw(struct ib_qp
> *ibqp, struct ib_pd *ibpd,
> ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n",
> ucmd.sq_buf_addr, ucmd.port);
>
> - err = mana_ib_create_queue(mdev, ucmd.sq_buf_addr,
> ucmd.sq_buf_size, &qp->raw_sq);
> + qp->raw_sq.umem = ib_umem_get(&mdev->ib_dev, ucmd.sq_buf_addr,
> + ucmd.sq_buf_size,
> IB_ACCESS_LOCAL_WRITE);
> + if (IS_ERR(qp->raw_sq.umem)) {
> + err = PTR_ERR(qp->raw_sq.umem);
> + ibdev_dbg(&mdev->ib_dev,
> + "Failed to get umem for qp-raw, err %d\n", err);
> + goto err_free_vport;
> + }
> +
> + err = mana_ib_create_queue(mdev, &qp->raw_sq);
> if (err) {
> ibdev_dbg(&mdev->ib_dev,
> "Failed to create queue for create qp-raw, err %d\n",
> err);
> - goto err_free_vport;
> + goto err_release_umem;
> }
>
> /* Create a WQ on the same port handle used by the Ethernet */ @@ -
> 391,6 +400,10 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp,
> struct ib_pd *ibpd,
>
> err_destroy_queue:
> mana_ib_destroy_queue(mdev, &qp->raw_sq);
> + return err;
This "return err" should be removed; the error handling code should fall through to err_release_umem and err_free_vport.
> +
> +err_release_umem:
> + ib_umem_release(qp->raw_sq.umem);
>
> err_free_vport:
> mana_ib_uncfg_vport(mdev, pd, port);
> @@ -553,13 +566,25 @@ static int mana_ib_create_rc_qp(struct ib_qp *ibqp,
> struct ib_pd *ibpd,
> if (i == MANA_RC_SEND_QUEUE_FMR) {
> qp->rc_qp.queues[i].id = INVALID_QUEUE_ID;
> qp->rc_qp.queues[i].gdma_region =
> GDMA_INVALID_DMA_REGION;
> + qp->rc_qp.queues[i].umem = NULL;
> continue;
> }
> - err = mana_ib_create_queue(mdev, ucmd.queue_buf[j],
> ucmd.queue_size[j],
> - &qp->rc_qp.queues[i]);
> + qp->rc_qp.queues[i].umem = ib_umem_get(&mdev->ib_dev,
> + ucmd.queue_buf[j],
> + ucmd.queue_size[j],
> +
> IB_ACCESS_LOCAL_WRITE);
> + if (IS_ERR(qp->rc_qp.queues[i].umem)) {
> + err = PTR_ERR(qp->rc_qp.queues[i].umem);
> + ibdev_err(&mdev->ib_dev, "Failed to get umem for
> queue %d, err %d\n",
> + i, err);
> + goto release_umems;
mana_ib_create_queue() may already have created some queues; they need to be cleaned up or we have a leak.
Maybe use destroy_queues: to call ib_umem_release()?
Another issue: there is a call to ib_umem_release(queue->umem) in mana_ib_destroy_queue(), should we remove that as well?
Thanks,
Long
^ permalink raw reply
* RE: [EXTERNAL] Re: [PATCH net-next] net: ethtool: add COALESCE_RX_CQE_FRAMES/NSECS parameters
From: Haiyang Zhang @ 2026-02-24 21:38 UTC (permalink / raw)
To: Tariq Toukan, Haiyang Zhang, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, Andrew Lunn, Jakub Kicinski,
Donald Hunter, David S. Miller, Eric Dumazet, Paolo Abeni,
Simon Horman, Jonathan Corbet, Shuah Khan,
Kory Maincent (Dent Project), Gal Pressman, Oleksij Rempel,
Vadim Fedorenko, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org
Cc: Paul Rosswurm
In-Reply-To: <9ed3ade5-717d-4f03-ac13-40614a0f093c@gmail.com>
> -----Original Message-----
> From: Tariq Toukan <ttoukan.linux@gmail.com>
> Sent: Tuesday, February 24, 2026 5:22 AM
> To: Haiyang Zhang <haiyangz@linux.microsoft.com>; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; Andrew Lunn
> <andrew@lunn.ch>; Jakub Kicinski <kuba@kernel.org>; Donald Hunter
> <donald.hunter@gmail.com>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Paolo Abeni <pabeni@redhat.com>; Simon
> Horman <horms@kernel.org>; Jonathan Corbet <corbet@lwn.net>; Shuah Khan
> <skhan@linuxfoundation.org>; Kory Maincent (Dent Project)
> <kory.maincent@bootlin.com>; Gal Pressman <gal@nvidia.com>; Oleksij Rempel
> <o.rempel@pengutronix.de>; Vadim Fedorenko <vadim.fedorenko@linux.dev>;
> linux-kernel@vger.kernel.org; linux-doc@vger.kernel.org
> Cc: Haiyang Zhang <haiyangz@microsoft.com>; Paul Rosswurm
> <paulros@microsoft.com>
> Subject: [EXTERNAL] Re: [PATCH net-next] net: ethtool: add
> COALESCE_RX_CQE_FRAMES/NSECS parameters
>
> [You don't often get email from ttoukan.linux@gmail.com. Learn why this is
> important at https://aka.ms/LearnAboutSenderIdentification ]
> >
> > +Rx CQE coalescing allows multiple received packets to be coalesced into
> a single
> > +Completion Queue Entry (CQE). ``ETHTOOL_A_COALESCE_RX_CQE_FRAMES``
> describes the
> > +maximum number of frames that can be coalesced into a CQE.
> > +``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` describes max time in nanoseconds
> after the
> > +first packet arrival in a coalesced CQE to be sent.
> > +
>
> I am trying to understand how generic this feature/API is.
> Can you please elaborate on the feature you want to configure here?
It's a similar feature to MLX's "RX CQE compression", which merges
"multiple near-identical completions that share/match several fields."
I'm adding this kAPI for any drivers that support this feature.
You may find driver details in my previous submission:
[V2,net-next,1/2] net: mana: Add support for coalesced RX packets on CQE
https://patchwork.kernel.org/project/netdevbpf/patch/1767732407-12389-2-git-send-email-haiyangz@linux.microsoft.com/
> A single CQE to describe several packets?
Yes, up to 4 for our MANA driver.
> What is the price?
The price is that the latency can increase a bit.
> What per-packet information/hw offloads do you lose
> in the process?
For example, the vlan_id is shared among up to 4 pkts.
But, the pkt len & hash are per-pkt.
struct mana_rxcomp_perpkt_info {
u32 pkt_len : 16;
u32 reserved1 : 16;
u32 reserved2;
u32 pkt_hash;
}; /* HW DATA */
/* Receive completion OOB */
struct mana_rxcomp_oob {
struct mana_cqe_header cqe_hdr;
u32 rx_vlan_id : 12;
u32 rx_vlantag_present : 1;
u32 rx_outer_iphdr_csum_succeed : 1;
u32 rx_outer_iphdr_csum_fail : 1;
u32 reserved1 : 1;
u32 rx_hashtype : 9;
u32 rx_iphdr_csum_succeed : 1;
u32 rx_iphdr_csum_fail : 1;
u32 rx_tcp_csum_succeed : 1;
u32 rx_tcp_csum_fail : 1;
u32 rx_udp_csum_succeed : 1;
u32 rx_udp_csum_fail : 1;
u32 reserved2 : 1;
struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI]; // MANA_RXCOMP_OOB_NUM_PPI=4
u32 rx_wqe_offset;
}; /* HW DATA */
> For comparison, in mlx5 we have RX CQE compression, which can be applied
> on multiple near-identical completions that share/match several fields.
> Still, there is a per-packet mini-cqe with distinctive per-packet fields
> like csum.
As said above, we have similar "per-packet mini-cqe":
struct mana_rxcomp_perpkt_info, which has pkt len & hash.
Thanks,
- Haiyang
^ permalink raw reply
* Re: [PATCH] scsi: storvsc: Fix scheduling while atomic on PREEMPT_RT
From: Martin K. Petersen @ 2026-02-24 16:47 UTC (permalink / raw)
To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
James E.J. Bottomley, linux-hyperv, Jan Kiszka
Cc: Martin K . Petersen, linux-scsi, Linux Kernel Mailing List,
Florian Bezdeka, RT, Mitchell Levy
In-Reply-To: <0c7fb5cd-fb21-4760-8593-e04bade84744@siemens.com>
On Thu, 29 Jan 2026 15:30:39 +0100, Jan Kiszka wrote:
> This resolves the following splat and lock-up when running with PREEMPT_RT
> enabled on Hyper-V:
>
> [ 415.140818] BUG: scheduling while atomic: stress-ng-iomix/1048/0x00000002
> [ 415.140822] INFO: lockdep is turned off.
> [ 415.140823] Modules linked in: intel_rapl_msr intel_rapl_common intel_uncore_frequency_common intel_pmc_core pmt_telemetry pmt_discovery pmt_class intel_pmc_ssram_telemetry intel_vsec ghash_clmulni_intel aesni_intel rapl binfmt_misc nls_ascii nls_cp437 vfat fat snd_pcm hyperv_drm snd_timer drm_client_lib drm_shmem_helper snd sg soundcore drm_kms_helper pcspkr hv_balloon hv_utils evdev joydev drm configfs efi_pstore nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vsock vmw_vmci efivarfs autofs4 ext4 crc16 mbcache jbd2 sr_mod sd_mod cdrom hv_storvsc serio_raw hid_generic scsi_transport_fc hid_hyperv scsi_mod hid hv_netvsc hyperv_keyboard scsi_common
> [ 415.140846] Preemption disabled at:
> [ 415.140847] [<ffffffffc0656171>] storvsc_queuecommand+0x2e1/0xbe0 [hv_storvsc]
> [ 415.140854] CPU: 8 UID: 0 PID: 1048 Comm: stress-ng-iomix Not tainted 6.19.0-rc7 #30 PREEMPT_{RT,(full)}
> [ 415.140856] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/04/2024
> [ 415.140857] Call Trace:
> [ 415.140861] <TASK>
> [ 415.140861] ? storvsc_queuecommand+0x2e1/0xbe0 [hv_storvsc]
> [ 415.140863] dump_stack_lvl+0x91/0xb0
> [ 415.140870] __schedule_bug+0x9c/0xc0
> [ 415.140875] __schedule+0xdf6/0x1300
> [ 415.140877] ? rtlock_slowlock_locked+0x56c/0x1980
> [ 415.140879] ? rcu_is_watching+0x12/0x60
> [ 415.140883] schedule_rtlock+0x21/0x40
> [ 415.140885] rtlock_slowlock_locked+0x502/0x1980
> [ 415.140891] rt_spin_lock+0x89/0x1e0
> [ 415.140893] hv_ringbuffer_write+0x87/0x2a0
> [ 415.140899] vmbus_sendpacket_mpb_desc+0xb6/0xe0
> [ 415.140900] ? rcu_is_watching+0x12/0x60
> [ 415.140902] storvsc_queuecommand+0x669/0xbe0 [hv_storvsc]
> [ 415.140904] ? HARDIRQ_verbose+0x10/0x10
> [ 415.140908] ? __rq_qos_issue+0x28/0x40
> [ 415.140911] scsi_queue_rq+0x760/0xd80 [scsi_mod]
> [ 415.140926] __blk_mq_issue_directly+0x4a/0xc0
> [ 415.140928] blk_mq_issue_direct+0x87/0x2b0
> [ 415.140931] blk_mq_dispatch_queue_requests+0x120/0x440
> [ 415.140933] blk_mq_flush_plug_list+0x7a/0x1a0
> [ 415.140935] __blk_flush_plug+0xf4/0x150
> [ 415.140940] __submit_bio+0x2b2/0x5c0
> [ 415.140944] ? submit_bio_noacct_nocheck+0x272/0x360
> [ 415.140946] submit_bio_noacct_nocheck+0x272/0x360
> [ 415.140951] ext4_read_bh_lock+0x3e/0x60 [ext4]
> [ 415.140995] ext4_block_write_begin+0x396/0x650 [ext4]
> [ 415.141018] ? __pfx_ext4_da_get_block_prep+0x10/0x10 [ext4]
> [ 415.141038] ext4_da_write_begin+0x1c4/0x350 [ext4]
> [ 415.141060] generic_perform_write+0x14e/0x2c0
> [ 415.141065] ext4_buffered_write_iter+0x6b/0x120 [ext4]
> [ 415.141083] vfs_write+0x2ca/0x570
> [ 415.141087] ksys_write+0x76/0xf0
> [ 415.141089] do_syscall_64+0x99/0x1490
> [ 415.141093] ? rcu_is_watching+0x12/0x60
> [ 415.141095] ? finish_task_switch.isra.0+0xdf/0x3d0
> [ 415.141097] ? rcu_is_watching+0x12/0x60
> [ 415.141098] ? lock_release+0x1f0/0x2a0
> [ 415.141100] ? rcu_is_watching+0x12/0x60
> [ 415.141101] ? finish_task_switch.isra.0+0xe4/0x3d0
> [ 415.141103] ? rcu_is_watching+0x12/0x60
> [ 415.141104] ? __schedule+0xb34/0x1300
> [ 415.141106] ? hrtimer_try_to_cancel+0x1d/0x170
> [ 415.141109] ? do_nanosleep+0x8b/0x160
> [ 415.141111] ? hrtimer_nanosleep+0x89/0x100
> [ 415.141114] ? __pfx_hrtimer_wakeup+0x10/0x10
> [ 415.141116] ? xfd_validate_state+0x26/0x90
> [ 415.141118] ? rcu_is_watching+0x12/0x60
> [ 415.141120] ? do_syscall_64+0x1e0/0x1490
> [ 415.141121] ? do_syscall_64+0x1e0/0x1490
> [ 415.141123] ? rcu_is_watching+0x12/0x60
> [ 415.141124] ? do_syscall_64+0x1e0/0x1490
> [ 415.141125] ? do_syscall_64+0x1e0/0x1490
> [ 415.141127] ? irqentry_exit+0x140/0x7e0
> [ 415.141129] entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> [...]
Applied to 7.0/scsi-fixes, thanks!
[1/1] scsi: storvsc: Fix scheduling while atomic on PREEMPT_RT
https://git.kernel.org/mkp/scsi/c/57297736c082
--
Martin K. Petersen
^ permalink raw reply
* Re: [PATCH v2] x86/hyperv: Reserve 3 interrupt vectors used exclusively by mshv
From: Wei Liu @ 2026-02-24 16:29 UTC (permalink / raw)
To: Mukesh R
Cc: Wei Liu, Michael Kelley, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org, kys@microsoft.com,
haiyangz@microsoft.com, decui@microsoft.com, longli@microsoft.com,
tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com
In-Reply-To: <e344676b-2893-b264-68f1-b92a3e0c40c6@linux.microsoft.com>
On Fri, Feb 20, 2026 at 10:56:07AM -0800, Mukesh R wrote:
> On 2/20/26 10:45, Wei Liu wrote:
> > On Fri, Feb 20, 2026 at 05:14:26PM +0000, Michael Kelley wrote:
> > > From: Mukesh R <mrathor@linux.microsoft.com> Sent: Tuesday, February 17, 2026 3:12 PM
> > > >
> > > > MSVC compiler, used to compile the Microsoft Hyper-V hypervisor currently,
> > > > has an assert intrinsic that uses interrupt vector 0x29 to create an
> > > > exception. This will cause hypervisor to then crash and collect core. As
> > > > such, if this interrupt number is assigned to a device by Linux and the
> > > > device generates it, hypervisor will crash. There are two other such
> > > > vectors hard coded in the hypervisor, 0x2C and 0x2D for debug purposes.
> > > > Fortunately, the three vectors are part of the kernel driver space and
> > > > that makes it feasible to reserve them early so they are not assigned
> > > > later.
> > > >
> > > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > ---
> > > >
> > > > v1: Add ifndef CONFIG_X86_FRED (thanks hpa)
> > > > v2: replace ifndef with cpu_feature_enabled() (thanks hpa and tglx)
> > > >
> > > > arch/x86/kernel/cpu/mshyperv.c | 27 +++++++++++++++++++++++++++
> > > > 1 file changed, 27 insertions(+)
> > > >
> > > > diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> > > > index 579fb2c64cfd..88ca127dc6d4 100644
> > > > --- a/arch/x86/kernel/cpu/mshyperv.c
> > > > +++ b/arch/x86/kernel/cpu/mshyperv.c
> > > > @@ -478,6 +478,28 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
> > > > }
> > > > EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
> > > >
> > > > +/*
> > > > + * Reserve vectors hard coded in the hypervisor. If used outside, the hypervisor
> > > > + * will either crash or hang or attempt to break into debugger.
> > > > + */
> > > > +static void hv_reserve_irq_vectors(void)
> > > > +{
> > > > + #define HYPERV_DBG_FASTFAIL_VECTOR 0x29
> > > > + #define HYPERV_DBG_ASSERT_VECTOR 0x2C
> > > > + #define HYPERV_DBG_SERVICE_VECTOR 0x2D
> > > > +
> > > > + if (cpu_feature_enabled(X86_FEATURE_FRED))
> > > > + return;
> > > > +
> > > > + if (test_and_set_bit(HYPERV_DBG_ASSERT_VECTOR, system_vectors) ||
> > > > + test_and_set_bit(HYPERV_DBG_SERVICE_VECTOR, system_vectors) ||
> > > > + test_and_set_bit(HYPERV_DBG_FASTFAIL_VECTOR, system_vectors))
> > > > + BUG();
> > > > +
> > > > + pr_info("Hyper-V:reserve vectors: %d %d %d\n", HYPERV_DBG_ASSERT_VECTOR,
> > > > + HYPERV_DBG_SERVICE_VECTOR, HYPERV_DBG_FASTFAIL_VECTOR);
> > >
> > > I'm a little late to the party here, but I've always seen Intel interrupt vectors
> > > displayed as 2-digit hex numbers. This info message is displaying decimal,
> > > which is atypical and will probably be confusing.
> >
> > Noted. The pull request to Linus has been sent. We will change the
> > format in a follow up patch.
>
> Well, there is no 0x prefix, so should not be confusing, but no big
> deal, whatever.....
>
When I change these I will add the 0x prefix as well.
Wei
> Thanks,
> -Mukesh
>
>
>
^ permalink raw reply
* [PATCH net] net: mana: Fix double destroy_workqueue on service rescan PCI path
From: Dipayaan Roy @ 2026-02-24 12:38 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, longli, kotaranov, horms, shradhagupta, ssengar,
ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, dipayanroy
While testing corner cases in the driver, a use-after-free crash
was found on the service rescan PCI path.
When mana_serv_reset() calls mana_gd_suspend(), mana_gd_cleanup()
destroys gc->service_wq. If the subsequent mana_gd_resume() fails
with -ETIMEDOUT or -EPROTO, the code falls through to
mana_serv_rescan() which triggers pci_stop_and_remove_bus_device().
This invokes the PCI .remove callback (mana_gd_remove), which calls
mana_gd_cleanup() a second time, attempting to destroy the already-
freed workqueue. Fix this by NULL-checking gc->service_wq in
mana_gd_cleanup() and setting it to NULL after destruction.
Call stack of issue for reference:
[Sat Feb 21 18:53:48 2026] Call Trace:
[Sat Feb 21 18:53:48 2026] <TASK>
[Sat Feb 21 18:53:48 2026] mana_gd_cleanup+0x33/0x70 [mana]
[Sat Feb 21 18:53:48 2026] mana_gd_remove+0x3a/0xc0 [mana]
[Sat Feb 21 18:53:48 2026] pci_device_remove+0x41/0xb0
[Sat Feb 21 18:53:48 2026] device_remove+0x46/0x70
[Sat Feb 21 18:53:48 2026] device_release_driver_internal+0x1e3/0x250
[Sat Feb 21 18:53:48 2026] device_release_driver+0x12/0x20
[Sat Feb 21 18:53:48 2026] pci_stop_bus_device+0x6a/0x90
[Sat Feb 21 18:53:48 2026] pci_stop_and_remove_bus_device+0x13/0x30
[Sat Feb 21 18:53:48 2026] mana_do_service+0x180/0x290 [mana]
[Sat Feb 21 18:53:48 2026] mana_serv_func+0x24/0x50 [mana]
[Sat Feb 21 18:53:48 2026] process_one_work+0x190/0x3d0
[Sat Feb 21 18:53:48 2026] worker_thread+0x16e/0x2e0
[Sat Feb 21 18:53:48 2026] kthread+0xf7/0x130
[Sat Feb 21 18:53:48 2026] ? __pfx_worker_thread+0x10/0x10
[Sat Feb 21 18:53:48 2026] ? __pfx_kthread+0x10/0x10
[Sat Feb 21 18:53:48 2026] ret_from_fork+0x269/0x350
[Sat Feb 21 18:53:48 2026] ? __pfx_kthread+0x10/0x10
[Sat Feb 21 18:53:48 2026] ret_from_fork_asm+0x1a/0x30
[Sat Feb 21 18:53:48 2026] </TASK>
Fixes: 505cc26bcae0 ("net: mana: Add support for auxiliary device servicing events")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
drivers/net/ethernet/microsoft/mana/gdma_main.c | 5 ++++-
drivers/net/ethernet/microsoft/mana/mana_en.c | 4 +++-
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 0055c231acf6..3926d18f1840 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1946,7 +1946,10 @@ static void mana_gd_cleanup(struct pci_dev *pdev)
mana_gd_remove_irqs(pdev);
- destroy_workqueue(gc->service_wq);
+ if (gc->service_wq) {
+ destroy_workqueue(gc->service_wq);
+ gc->service_wq = NULL;
+ }
dev_dbg(&pdev->dev, "mana gdma cleanup successful\n");
}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 9b5a72ada5c4..f69e42651359 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3762,7 +3762,9 @@ void mana_rdma_remove(struct gdma_dev *gd)
}
WRITE_ONCE(gd->rdma_teardown, true);
- flush_workqueue(gc->service_wq);
+
+ if (gc->service_wq)
+ flush_workqueue(gc->service_wq);
if (gd->adev)
remove_adev(gd);
--
2.43.0
^ permalink raw reply related
* Re: [PATCH net-next v3] net: mana: Add MAC address to vPort logs and clarify error messages
From: Paolo Abeni @ 2026-02-24 12:22 UTC (permalink / raw)
To: Erni Sri Satya Vennela, Erni Sri Satya Vennela, kys, haiyangz,
wei.liu, decui, longli, andrew+netdev, davem, edumazet, kuba,
dipayanroy, ssengar, shradhagupta, shirazsaleem, gargaditya,
linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260223040826.750864-1-ernis@linux.microsoft.com>
On 2/23/26 5:08 AM, Erni Sri Satya Vennela wrote:
> @@ -861,8 +862,8 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
> tx_wr = &txq->msg_buf->reqs[msg_id];
>
> if (req_len > tx_wr->buf_len) {
> - dev_err(hwc->dev, "HWC: req msg size: %d > %d\n", req_len,
> - tx_wr->buf_len);
> + dev_err(hwc->dev, "%s:%d: req msg size: %d > %d\n",
> + __func__, __LINE__, req_len, tx_wr->buf_len);
I fail to see any relevant information added here ...
> err = -EINVAL;
> goto out;
> }
> @@ -878,6 +879,7 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
> req_msg->req.hwc_msg_id = msg_id;
>
> tx_wr->msg_size = req_len;
> + command = req_msg->req.msg_type;
>
> if (gc->is_pf) {
> dest_vrq = hwc->pf_dest_vrq_id;
> @@ -886,15 +888,16 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
>
> err = mana_hwc_post_tx_wqe(txq, tx_wr, dest_vrq, dest_vrcq, false);
> if (err) {
> - dev_err(hwc->dev, "HWC: Failed to post send WQE: %d\n", err);
> + dev_err(hwc->dev, "%s:%d: Failed to post send WQE: %d\n",
> + __func__, __LINE__, err);
... and here. The string message should be (and apparently is) enough to
locate the relevant code inside the tree. Please don't include
unneeded/irrelevant changes.
Thanks,
Paolo
^ permalink raw reply
* Re: [PATCH net-next] net: ethtool: add COALESCE_RX_CQE_FRAMES/NSECS parameters
From: Tariq Toukan @ 2026-02-24 10:21 UTC (permalink / raw)
To: Haiyang Zhang, linux-hyperv, netdev, Andrew Lunn, Jakub Kicinski,
Donald Hunter, David S. Miller, Eric Dumazet, Paolo Abeni,
Simon Horman, Jonathan Corbet, Shuah Khan,
Kory Maincent (Dent Project), Gal Pressman, Oleksij Rempel,
Vadim Fedorenko, linux-kernel, linux-doc
Cc: haiyangz, paulros
In-Reply-To: <20260222212328.736628-1-haiyangz@linux.microsoft.com>
On 22/02/2026 23:23, Haiyang Zhang wrote:
> From: Haiyang Zhang <haiyangz@microsoft.com>
>
> Add two parameters for drivers supporting Rx CQE Coalescing.
>
> ETHTOOL_A_COALESCE_RX_CQE_FRAMES:
> Maximum number of frames that can be coalesced into a CQE.
>
> ETHTOOL_A_COALESCE_RX_CQE_NSECS:
> Time out value in nanoseconds after the first packet arrival in a
> coalesced CQE to be sent.
>
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> ---
> Documentation/netlink/specs/ethtool.yaml | 8 ++++++++
> Documentation/networking/ethtool-netlink.rst | 10 ++++++++++
> include/linux/ethtool.h | 6 +++++-
> include/uapi/linux/ethtool_netlink_generated.h | 2 ++
> net/ethtool/coalesce.c | 14 +++++++++++++-
> 5 files changed, 38 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
> index 0a2d2343f79a..951d98f6bb12 100644
> --- a/Documentation/netlink/specs/ethtool.yaml
> +++ b/Documentation/netlink/specs/ethtool.yaml
> @@ -861,6 +861,12 @@ attribute-sets:
> name: tx-profile
> type: nest
> nested-attributes: profile
> + -
> + name: rx-cqe-frames
> + type: u32
> + -
> + name: rx-cqe-nsecs
> + type: u32
>
> -
> name: pause-stat
> @@ -2244,6 +2250,8 @@ operations:
> - tx-aggr-time-usecs
> - rx-profile
> - tx-profile
> + - rx-cqe-frames
> + - rx-cqe-nsecs
> dump: *coalesce-get-op
> -
> name: coalesce-set
> diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
> index af56c304cef4..a3e78b69fd07 100644
> --- a/Documentation/networking/ethtool-netlink.rst
> +++ b/Documentation/networking/ethtool-netlink.rst
> @@ -1072,6 +1072,8 @@ Kernel response contents:
> ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS`` u32 time (us), aggr, Tx
> ``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx
> ``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx
> + ``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` u32 max packets, Rx CQE
> + ``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` u32 delay (ns), Rx CQE
> =========================================== ====== =======================
>
> Attributes are only included in reply if their value is not zero or the
> @@ -1105,6 +1107,12 @@ well with frequent small-sized URBs transmissions.
> to DIM parameters, see `Generic Network Dynamic Interrupt Moderation (Net DIM)
> <https://www.kernel.org/doc/Documentation/networking/net_dim.rst>`_.
>
> +Rx CQE coalescing allows multiple received packets to be coalesced into a single
> +Completion Queue Entry (CQE). ``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` describes the
> +maximum number of frames that can be coalesced into a CQE.
> +``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` describes max time in nanoseconds after the
> +first packet arrival in a coalesced CQE to be sent.
> +
I am trying to understand how generic this feature/API is.
Can you please elaborate on the feature you want to configure here?
A single CQE to describe several packets?
What is the price? What per-packet information/hw offloads do you lose
in the process?
For comparison, in mlx5 we have RX CQE compression, which can be applied
on multiple near-identical completions that share/match several fields.
Still, there is a per-packet mini-cqe with distinctive per-packet fields
like csum.
> COALESCE_SET
> ============
>
> @@ -1143,6 +1151,8 @@ Request contents:
> ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS`` u32 time (us), aggr, Tx
> ``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx
> ``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx
> + ``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` u32 max packets, Rx CQE
> + ``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` u32 delay (ns), Rx CQE
> =========================================== ====== =======================
>
> Request is rejected if it attributes declared as unsupported by driver (i.e.
> diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
> index 798abec67a1b..25ccd2d5d4dc 100644
> --- a/include/linux/ethtool.h
> +++ b/include/linux/ethtool.h
> @@ -332,6 +332,8 @@ struct kernel_ethtool_coalesce {
> u32 tx_aggr_max_bytes;
> u32 tx_aggr_max_frames;
> u32 tx_aggr_time_usecs;
> + u32 rx_cqe_frames;
> + u32 rx_cqe_nsecs;
> };
>
> /**
> @@ -380,7 +382,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
> #define ETHTOOL_COALESCE_TX_AGGR_TIME_USECS BIT(26)
> #define ETHTOOL_COALESCE_RX_PROFILE BIT(27)
> #define ETHTOOL_COALESCE_TX_PROFILE BIT(28)
> -#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(28, 0)
> +#define ETHTOOL_COALESCE_RX_CQE_FRAMES BIT(29)
> +#define ETHTOOL_COALESCE_RX_CQE_NSECS BIT(30)
> +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(30, 0)
>
> #define ETHTOOL_COALESCE_USECS \
> (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS)
> diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
> index 556a0c834df5..efc6e4ade77b 100644
> --- a/include/uapi/linux/ethtool_netlink_generated.h
> +++ b/include/uapi/linux/ethtool_netlink_generated.h
> @@ -371,6 +371,8 @@ enum {
> ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS,
> ETHTOOL_A_COALESCE_RX_PROFILE,
> ETHTOOL_A_COALESCE_TX_PROFILE,
> + ETHTOOL_A_COALESCE_RX_CQE_FRAMES,
> + ETHTOOL_A_COALESCE_RX_CQE_NSECS,
>
> __ETHTOOL_A_COALESCE_CNT,
> ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1)
> diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
> index 3e18ca1ccc5e..349bb02c517a 100644
> --- a/net/ethtool/coalesce.c
> +++ b/net/ethtool/coalesce.c
> @@ -118,6 +118,8 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base,
> nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */
> nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */
> nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */
> + nla_total_size(sizeof(u32)) + /* _RX_CQE_FRAMES */
> + nla_total_size(sizeof(u32)) + /* _RX_CQE_NSECS */
> total_modersz * 2; /* _{R,T}X_PROFILE */
> }
>
> @@ -269,7 +271,11 @@ static int coalesce_fill_reply(struct sk_buff *skb,
> coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES,
> kcoal->tx_aggr_max_frames, supported) ||
> coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS,
> - kcoal->tx_aggr_time_usecs, supported))
> + kcoal->tx_aggr_time_usecs, supported) ||
> + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_CQE_FRAMES,
> + kcoal->rx_cqe_frames, supported) ||
> + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_CQE_NSECS,
> + kcoal->rx_cqe_nsecs, supported))
> return -EMSGSIZE;
>
> if (!req_base->dev || !req_base->dev->irq_moder)
> @@ -338,6 +344,8 @@ const struct nla_policy ethnl_coalesce_set_policy[] = {
> [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 },
> [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 },
> [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 },
> + [ETHTOOL_A_COALESCE_RX_CQE_FRAMES] = { .type = NLA_U32 },
> + [ETHTOOL_A_COALESCE_RX_CQE_NSECS] = { .type = NLA_U32 },
> [ETHTOOL_A_COALESCE_RX_PROFILE] =
> NLA_POLICY_NESTED(coalesce_profile_policy),
> [ETHTOOL_A_COALESCE_TX_PROFILE] =
> @@ -570,6 +578,10 @@ __ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info,
> tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod);
> ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs,
> tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod);
> + ethnl_update_u32(&kernel_coalesce.rx_cqe_frames,
> + tb[ETHTOOL_A_COALESCE_RX_CQE_FRAMES], &mod);
> + ethnl_update_u32(&kernel_coalesce.rx_cqe_nsecs,
> + tb[ETHTOOL_A_COALESCE_RX_CQE_NSECS], &mod);
>
> if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) {
> ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile,
^ permalink raw reply
* Re: [PATCH rdma-next 42/50] RDMA/bnxt_re: Complete CQ resize in a single step
From: Leon Romanovsky @ 2026-02-24 10:59 UTC (permalink / raw)
To: Selvin Xavier
Cc: Jason Gunthorpe, Kalesh AP, Potnuri Bharat Teja, Michael Margolin,
Gal Pressman, Yossi Leybovich, Cheng Xu, Kai Shen,
Chengchang Tang, Junxian Huang, Abhijit Gangurde, Allen Hubbe,
Krzysztof Czurylo, Tatyana Nikolova, Long Li, Konstantin Taranov,
Yishai Hadas, Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun,
linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <CA+sbYW2QKSbKpoHWMCL_6QnXYVuhx9Los9EMFasWeKCfcqUXsg@mail.gmail.com>
On Tue, Feb 24, 2026 at 01:45:42PM +0530, Selvin Xavier wrote:
> On Fri, Feb 13, 2026 at 4:31 PM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > There is no need to defer the CQ resize operation, as it is intended to
> > be completed in one pass. The current bnxt_re_resize_cq() implementation
> > does not handle concurrent CQ resize requests, and this will be addressed
> > in the following patches.
> >
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > drivers/infiniband/hw/bnxt_re/ib_verbs.c | 33 +++++++++-----------------------
> > 1 file changed, 9 insertions(+), 24 deletions(-)
> >
> > diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> > index d652018c19b3..2aecfbbb7eaf 100644
> > --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> > +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> > @@ -3309,20 +3309,6 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
> > return rc;
> > }
> >
> > -static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq)
> > -{
> > - struct bnxt_re_dev *rdev = cq->rdev;
> > -
> > - bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq);
> > -
> > - cq->qplib_cq.max_wqe = cq->resize_cqe;
> > - if (cq->resize_umem) {
> > - ib_umem_release(cq->ib_cq.umem);
> > - cq->ib_cq.umem = cq->resize_umem;
> > - cq->resize_umem = NULL;
> > - cq->resize_cqe = 0;
> > - }
> > -}
> >
> > int bnxt_re_resize_cq(struct ib_cq *ibcq, unsigned int cqe,
> > struct ib_udata *udata)
> > @@ -3387,7 +3373,15 @@ int bnxt_re_resize_cq(struct ib_cq *ibcq, unsigned int cqe,
> > goto fail;
> > }
> >
> > - cq->ib_cq.cqe = cq->resize_cqe;
> > + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq);
> > +
> > + cq->qplib_cq.max_wqe = cq->resize_cqe;
> > + ib_umem_release(cq->ib_cq.umem);
> > + cq->ib_cq.umem = cq->resize_umem;
> > + cq->resize_umem = NULL;
> > + cq->resize_cqe = 0;
> > +
> > + cq->ib_cq.cqe = entries;
> > atomic_inc(&rdev->stats.res.resize_count);
> >
> > return 0;
> > @@ -3907,15 +3901,6 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
> > struct bnxt_re_sqp_entries *sqp_entry = NULL;
> > unsigned long flags;
> >
> > - /* User CQ; the only processing we do is to
> > - * complete any pending CQ resize operation.
> > - */
> > - if (cq->ib_cq.umem) {
> > - if (cq->resize_umem)
> > - bnxt_re_resize_cq_complete(cq);
> > - return 0;
> > - }
> > -
> Since this code is removed, we need to remove ibv_cmd_poll_cq call
> from the user library.
> For older libraries which still call ibv_cmd_poll_cq, I think we
> should keep a check. Otherwise it will print "POLL CQ : no CQL
> to use". Either we should add the following code or remove this print.
> if (cq->ib_cq.umem)
> return 0;
I'll add the check with extra comment.
> Otherwise, it looks good to me.
Thanks
>
> Thanks,
> Selvin
>
>
>
>
> > spin_lock_irqsave(&cq->cq_lock, flags);
> > budget = min_t(u32, num_entries, cq->max_cql);
> > num_entries = budget;
> >
> > --
> > 2.52.0
> >
^ permalink raw reply
* Re: [PATCH rdma-next 18/50] RDMA/erdma: Separate user and kernel CQ creation paths
From: Leon Romanovsky @ 2026-02-24 10:57 UTC (permalink / raw)
To: Cheng Xu; +Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <d21833fa-a737-3b46-dda3-92837f78f8e4@linux.alibaba.com>
On Tue, Feb 24, 2026 at 01:51:41PM +0800, Cheng Xu wrote:
>
>
> On 2/13/26 6:57 PM, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Split CQ creation into distinct kernel and user flows. The erdma driver,
> > inherited from mlx4, uses a problematic pattern that shares and caches
> > umem in erdma_map_user_dbrecords(). This design blocks the driver from
> > supporting generic umem sources (VMA, dmabuf, memfd, and others).
> >
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > drivers/infiniband/hw/erdma/erdma_main.c | 1 +
> > drivers/infiniband/hw/erdma/erdma_verbs.c | 97 ++++++++++++++++++++-----------
> > drivers/infiniband/hw/erdma/erdma_verbs.h | 2 +
> > 3 files changed, 67 insertions(+), 33 deletions(-)
> >
> > diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c
> > index f35b30235018..1b6426e89d80 100644
> > --- a/drivers/infiniband/hw/erdma/erdma_main.c
> > +++ b/drivers/infiniband/hw/erdma/erdma_main.c
> > @@ -505,6 +505,7 @@ static const struct ib_device_ops erdma_device_ops = {
> > .alloc_pd = erdma_alloc_pd,
> > .alloc_ucontext = erdma_alloc_ucontext,
> > .create_cq = erdma_create_cq,
> > + .create_user_cq = erdma_create_user_cq,
> > .create_qp = erdma_create_qp,
> > .dealloc_pd = erdma_dealloc_pd,
> > .dealloc_ucontext = erdma_dealloc_ucontext,
>
> <...>
>
> > +
> > +int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
> > + struct uverbs_attr_bundle *attrs)
>
> create_cq will be used for kernel CQ creation, and the third input parameter
> 'struct uverbs_attr_bundle *attrs' will be useless, so it can be removed? Same to
> all drivers.
Yes, but only after conversion of all drivers. I have that removal patch
in my v2.
>
>
> > +{
>
> <...>
>
> > + ret = create_cq_cmd(NULL, cq);
> > + if (ret)
> > + goto err_free_res;
>
>
> In create_cq_cmd, should add the following change:
I took a slightly different approach and inlined create_cq_cmd() into erdma_create_*_cq().
Thanks
>
> diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c
> index 8c30df61ae3d..eca28524e04b 100644
> --- a/drivers/infiniband/hw/erdma/erdma_verbs.c
> +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c
> @@ -240,7 +240,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
> req.first_page_offset = mem->page_offset;
> req.cq_dbrec_dma = cq->user_cq.dbrec_dma;
>
> - if (uctx->ext_db.enable) {
> + if (uctx && uctx->ext_db.enable) {
> req.cfg1 |= FIELD_PREP(
> ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK, 1);
> req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_DB_CFG_MASK,
>
>
> Thanks,
> Cheng Xu
>
^ permalink raw reply
* Re: [PATCH rdma-next 26/50] RDMA/erdma: Separate user and kernel CQ creation paths
From: Leon Romanovsky @ 2026-02-24 10:46 UTC (permalink / raw)
To: Cheng Xu
Cc: Jason Gunthorpe, Selvin Xavier, Kalesh AP, Potnuri Bharat Teja,
Michael Margolin, Gal Pressman, Yossi Leybovich, Kai Shen,
Chengchang Tang, Junxian Huang, Abhijit Gangurde, Allen Hubbe,
Krzysztof Czurylo, Tatyana Nikolova, Long Li, Konstantin Taranov,
Yishai Hadas, Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun,
linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <b1070bb3-5963-2e2b-288c-ac5912b6c22e@linux.alibaba.com>
On Tue, Feb 24, 2026 at 10:20:39AM +0800, Cheng Xu wrote:
>
>
> On 2/13/26 6:58 PM, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Split CQ creation into distinct kernel and user flows. The hns driver,
> > inherited from mlx4, uses a problematic pattern that shares and caches
> > umem in hns_roce_db_map_user(). This design blocks the driver from
> > supporting generic umem sources (VMA, dmabuf, memfd, and others).
> >
> > In addition, let's delete counter that counts CQ creation errors. There
> > are multiple ways to debug kernel in modern kernel without need to rely
> > on that debugfs counter.
> >
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > drivers/infiniband/hw/hns/hns_roce_cq.c | 103 ++++++++++++++++++++-------
> > drivers/infiniband/hw/hns/hns_roce_debugfs.c | 1 -
> > drivers/infiniband/hw/hns/hns_roce_device.h | 3 +-
> > drivers/infiniband/hw/hns/hns_roce_main.c | 1 +
> > 4 files changed, 82 insertions(+), 26 deletions(-)
> >
>
> Hi Leon,
>
> The driver name in this patch's title should be "RDMA/hns".
Right, thanks
^ permalink raw reply
* Re: [net-next] net: mana: Trigger VF reset/recovery on health check failure due to HWC timeout
From: Simon Horman @ 2026-02-24 8:26 UTC (permalink / raw)
To: dipayanroy
Cc: Simon Horman, kuba, decui, pabeni, dipayanroy, kotaranov,
linux-kernel, edumazet, davem, linux-rdma, shradhagupta, kys,
ssengar, haiyangz, linux-hyperv, wei.liu, netdev, longli, ernis,
shirazsaleem, andrew+netdev
In-Reply-To: <aZwUDlTkb5xunIkH@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
---
net: mana: Trigger VF reset/recovery on health check failure due to HWC timeout
The GF stats periodic query is used as a mechanism to monitor HWC
health. If this HWC command times out, it is a strong indication that
the device/SoC is in a faulty state and requires recovery.
Today, when a timeout is detected, the driver marks
hwc_timeout_occurred, clears cached stats, and stops rescheduling the
periodic work. However, the device itself is left in the same failing
state.
Extend the timeout handling path to trigger the existing MANA VF
recovery service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item.
This is expected to initiate the appropriate recovery flow by attempting
a suspend/resume first and, if that fails, triggering a bus rescan.
This change is intentionally limited to HWC command timeouts and does
not trigger recovery for errors reported by the SoC as a normal command
response.
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 0055c231acf6..16c438d2aaa3 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
[ ... ]
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 9b5a72ada5c4..e6a4034b40f9 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
[ ... ]
> @@ -3530,6 +3530,8 @@ static void mana_gf_stats_work_handler(struct work_struct *work)
> {
> struct mana_context *ac =
> container_of(to_delayed_work(work), struct mana_context, gf_stats_work);
> + struct gdma_context *gc = ac->gdma_dev->gdma_context;
> + struct mana_serv_work *mns_wk;
> int err;
>
> err = mana_query_gf_stats(ac);
> @@ -3537,6 +3539,30 @@ static void mana_gf_stats_work_handler(struct work_struct *work)
> /* HWC timeout detected - reset stats and stop rescheduling */
> ac->hwc_timeout_occurred = true;
> memset(&ac->hc_stats, 0, sizeof(ac->hc_stats));
> + dev_warn(gc->dev,
> + "Gf stats wk handler: gf stats query timed out.\n");
> +
> + /* As HWC timed out, indicating a faulty HW state and needs a
> + * reset.
> + */
> + if (!test_and_set_bit(GC_IN_SERVICE, &gc->flags)) {
> + if (!try_module_get(THIS_MODULE)) {
> + dev_info(gc->dev, "Module is unloading\n");
> + return;
> + }
> +
> + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
> + if (!mns_wk) {
> + module_put(THIS_MODULE);
> + return;
> + }
Does this leave the GC_IN_SERVICE flag permanently set? When
test_and_set_bit() atomically sets the bit and then try_module_get()
fails or kzalloc() fails, the function returns without clearing
GC_IN_SERVICE.
This would permanently block all future recovery attempts. Looking at
mana_gd_process_eqe() in gdma_main.c:
if (test_bit(GC_IN_SERVICE, &gc->flags)) {
dev_info(gc->dev, "Already in service\n");
break;
}
It would detect the set bit and skip the reset. Similarly, mana_tx_timeout()
in mana_en.c would also short-circuit:
if (test_bit(GC_IN_SERVICE, &gc->flags))
return;
The device cannot recover without manual intervention since the flag is only
cleared by mana_serv_reset() at the end of the service work:
drivers/net/ethernet/microsoft/mana/gdma_main.c:mana_serv_reset()
{
...
out:
clear_bit(GC_IN_SERVICE, &gc->flags);
}
But since the work is never scheduled in these error paths, clear_bit() is
never called.
Note that the existing mana_gd_process_eqe() path avoids this problem by
calling set_bit(GC_IN_SERVICE) only AFTER all allocations succeed:
drivers/net/ethernet/microsoft/mana/gdma_main.c:mana_gd_process_eqe()
{
...
if (test_bit(GC_IN_SERVICE, &gc->flags)) {
dev_info(gc->dev, "Already in service\n");
break;
}
if (!try_module_get(THIS_MODULE)) {
...
break;
}
mns_wk = kzalloc(sizeof(*mns_wk), GFP_KERNEL);
if (!mns_wk) {
module_put(THIS_MODULE);
break;
}
dev_info(gc->dev, "Start MANA service type:%d\n", type);
set_bit(GC_IN_SERVICE, &gc->flags); // Set only after success
...
}
Should clear_bit(GC_IN_SERVICE, &gc->flags) be added before each error
return?
> +
> + mns_wk->pdev = to_pci_dev(gc->dev);
> + mns_wk->type = GDMA_EQE_HWC_RESET_REQUEST;
> + pci_dev_get(mns_wk->pdev);
> + INIT_WORK(&mns_wk->serv_work, mana_serv_func);
> + schedule_work(&mns_wk->serv_work);
> + }
This isn't a bug, but the recovery initiation sequence here is a
near-duplicate of the pattern in mana_gd_process_eqe() (gdma_main.c).
Both perform the identical 7-step sequence: try_module_get, kzalloc
mana_serv_work, set pdev, set type, pci_dev_get, INIT_WORK, schedule_work.
The duplication already contributed to the GC_IN_SERVICE error path issue
above, where this new copy diverged from the original in its error handling.
Consider extracting a common helper to avoid divergence.
> return;
> }
> schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
[ ... ]
^ permalink raw reply
* Re: [PATCH rdma-next 42/50] RDMA/bnxt_re: Complete CQ resize in a single step
From: Selvin Xavier @ 2026-02-24 8:15 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Jason Gunthorpe, Kalesh AP, Potnuri Bharat Teja, Michael Margolin,
Gal Pressman, Yossi Leybovich, Cheng Xu, Kai Shen,
Chengchang Tang, Junxian Huang, Abhijit Gangurde, Allen Hubbe,
Krzysztof Czurylo, Tatyana Nikolova, Long Li, Konstantin Taranov,
Yishai Hadas, Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun,
linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-42-f3be85847922@nvidia.com>
[-- Attachment #1: Type: text/plain, Size: 3239 bytes --]
On Fri, Feb 13, 2026 at 4:31 PM Leon Romanovsky <leon@kernel.org> wrote:
>
> From: Leon Romanovsky <leonro@nvidia.com>
>
> There is no need to defer the CQ resize operation, as it is intended to
> be completed in one pass. The current bnxt_re_resize_cq() implementation
> does not handle concurrent CQ resize requests, and this will be addressed
> in the following patches.
>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> drivers/infiniband/hw/bnxt_re/ib_verbs.c | 33 +++++++++-----------------------
> 1 file changed, 9 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> index d652018c19b3..2aecfbbb7eaf 100644
> --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
> @@ -3309,20 +3309,6 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
> return rc;
> }
>
> -static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq)
> -{
> - struct bnxt_re_dev *rdev = cq->rdev;
> -
> - bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq);
> -
> - cq->qplib_cq.max_wqe = cq->resize_cqe;
> - if (cq->resize_umem) {
> - ib_umem_release(cq->ib_cq.umem);
> - cq->ib_cq.umem = cq->resize_umem;
> - cq->resize_umem = NULL;
> - cq->resize_cqe = 0;
> - }
> -}
>
> int bnxt_re_resize_cq(struct ib_cq *ibcq, unsigned int cqe,
> struct ib_udata *udata)
> @@ -3387,7 +3373,15 @@ int bnxt_re_resize_cq(struct ib_cq *ibcq, unsigned int cqe,
> goto fail;
> }
>
> - cq->ib_cq.cqe = cq->resize_cqe;
> + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq);
> +
> + cq->qplib_cq.max_wqe = cq->resize_cqe;
> + ib_umem_release(cq->ib_cq.umem);
> + cq->ib_cq.umem = cq->resize_umem;
> + cq->resize_umem = NULL;
> + cq->resize_cqe = 0;
> +
> + cq->ib_cq.cqe = entries;
> atomic_inc(&rdev->stats.res.resize_count);
>
> return 0;
> @@ -3907,15 +3901,6 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
> struct bnxt_re_sqp_entries *sqp_entry = NULL;
> unsigned long flags;
>
> - /* User CQ; the only processing we do is to
> - * complete any pending CQ resize operation.
> - */
> - if (cq->ib_cq.umem) {
> - if (cq->resize_umem)
> - bnxt_re_resize_cq_complete(cq);
> - return 0;
> - }
> -
Since this code is removed, we need to remove ibv_cmd_poll_cq call
from the user library.
For older libraries which still call ibv_cmd_poll_cq, I think we
should keep a check. Otherwise it will print "POLL CQ : no CQL
to use". Either we should add the following code or remove this print.
if (cq->ib_cq.umem)
return 0;
Otherwise, it looks good to me.
Thanks,
Selvin
> spin_lock_irqsave(&cq->cq_lock, flags);
> budget = min_t(u32, num_entries, cq->max_cql);
> num_entries = budget;
>
> --
> 2.52.0
>
[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5473 bytes --]
^ permalink raw reply
* Re: [PATCH rdma-next 18/50] RDMA/erdma: Separate user and kernel CQ creation paths
From: Cheng Xu @ 2026-02-24 5:51 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-18-f3be85847922@nvidia.com>
On 2/13/26 6:57 PM, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Split CQ creation into distinct kernel and user flows. The erdma driver,
> inherited from mlx4, uses a problematic pattern that shares and caches
> umem in erdma_map_user_dbrecords(). This design blocks the driver from
> supporting generic umem sources (VMA, dmabuf, memfd, and others).
>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> drivers/infiniband/hw/erdma/erdma_main.c | 1 +
> drivers/infiniband/hw/erdma/erdma_verbs.c | 97 ++++++++++++++++++++-----------
> drivers/infiniband/hw/erdma/erdma_verbs.h | 2 +
> 3 files changed, 67 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c
> index f35b30235018..1b6426e89d80 100644
> --- a/drivers/infiniband/hw/erdma/erdma_main.c
> +++ b/drivers/infiniband/hw/erdma/erdma_main.c
> @@ -505,6 +505,7 @@ static const struct ib_device_ops erdma_device_ops = {
> .alloc_pd = erdma_alloc_pd,
> .alloc_ucontext = erdma_alloc_ucontext,
> .create_cq = erdma_create_cq,
> + .create_user_cq = erdma_create_user_cq,
> .create_qp = erdma_create_qp,
> .dealloc_pd = erdma_dealloc_pd,
> .dealloc_ucontext = erdma_dealloc_ucontext,
<...>
> +
> +int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
> + struct uverbs_attr_bundle *attrs)
create_cq will be used for kernel CQ creation, and the third input parameter
'struct uverbs_attr_bundle *attrs' will be useless, so it can be removed? Same to
all drivers.
> +{
<...>
> + ret = create_cq_cmd(NULL, cq);
> + if (ret)
> + goto err_free_res;
In create_cq_cmd, should add the following change:
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c
index 8c30df61ae3d..eca28524e04b 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.c
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.c
@@ -240,7 +240,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
req.first_page_offset = mem->page_offset;
req.cq_dbrec_dma = cq->user_cq.dbrec_dma;
- if (uctx->ext_db.enable) {
+ if (uctx && uctx->ext_db.enable) {
req.cfg1 |= FIELD_PREP(
ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK, 1);
req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_DB_CFG_MASK,
Thanks,
Cheng Xu
^ permalink raw reply related
* Re: [PATCH rdma-next 26/50] RDMA/erdma: Separate user and kernel CQ creation paths
From: Cheng Xu @ 2026-02-24 2:20 UTC (permalink / raw)
To: Leon Romanovsky, Jason Gunthorpe, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Kai Shen, Chengchang Tang, Junxian Huang,
Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-26-f3be85847922@nvidia.com>
On 2/13/26 6:58 PM, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Split CQ creation into distinct kernel and user flows. The hns driver,
> inherited from mlx4, uses a problematic pattern that shares and caches
> umem in hns_roce_db_map_user(). This design blocks the driver from
> supporting generic umem sources (VMA, dmabuf, memfd, and others).
>
> In addition, let's delete counter that counts CQ creation errors. There
> are multiple ways to debug kernel in modern kernel without need to rely
> on that debugfs counter.
>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> drivers/infiniband/hw/hns/hns_roce_cq.c | 103 ++++++++++++++++++++-------
> drivers/infiniband/hw/hns/hns_roce_debugfs.c | 1 -
> drivers/infiniband/hw/hns/hns_roce_device.h | 3 +-
> drivers/infiniband/hw/hns/hns_roce_main.c | 1 +
> 4 files changed, 82 insertions(+), 26 deletions(-)
>
Hi Leon,
The driver name in this patch's title should be "RDMA/hns".
Thanks,
Cheng Xu
> diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
> index 857a913326cd..0f24a916466b 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_cq.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
> @@ -335,7 +335,10 @@ static int verify_cq_create_attr(struct hns_roce_dev *hr_dev,
> {
> struct ib_device *ibdev = &hr_dev->ib_dev;
>
> - if (!attr->cqe || attr->cqe > hr_dev->caps.max_cqes) {
> + if (attr->flags)
> + return -EOPNOTSUPP;
> +
> + if (attr->cqe > hr_dev->caps.max_cqes) {
> ibdev_err(ibdev, "failed to check CQ count %u, max = %u.\n",
> attr->cqe, hr_dev->caps.max_cqes);
> return -EINVAL;
> @@ -407,8 +410,8 @@ static int set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata,
> return 0;
> }
>
> -int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> - struct uverbs_attr_bundle *attrs)
> +int hns_roce_create_user_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> + struct uverbs_attr_bundle *attrs)
> {
> struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
> struct ib_udata *udata = &attrs->driver_udata;
> @@ -418,31 +421,27 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> struct hns_roce_ib_create_cq ucmd = {};
> int ret;
>
> - if (attr->flags) {
> - ret = -EOPNOTSUPP;
> - goto err_out;
> - }
> + if (ib_cq->umem)
> + return -EOPNOTSUPP;
>
> ret = verify_cq_create_attr(hr_dev, attr);
> if (ret)
> - goto err_out;
> + return ret;
>
> - if (udata) {
> - ret = get_cq_ucmd(hr_cq, udata, &ucmd);
> - if (ret)
> - goto err_out;
> - }
> + ret = get_cq_ucmd(hr_cq, udata, &ucmd);
> + if (ret)
> + return ret;
>
> set_cq_param(hr_cq, attr->cqe, attr->comp_vector, &ucmd);
>
> ret = set_cqe_size(hr_cq, udata, &ucmd);
> if (ret)
> - goto err_out;
> + return ret;
>
> ret = alloc_cq_buf(hr_dev, hr_cq, udata, ucmd.buf_addr);
> if (ret) {
> ibdev_err(ibdev, "failed to alloc CQ buf, ret = %d.\n", ret);
> - goto err_out;
> + return ret;
> }
>
> ret = alloc_cq_db(hr_dev, hr_cq, udata, ucmd.db_addr, &resp);
> @@ -464,13 +463,11 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> goto err_cqn;
> }
>
> - if (udata) {
> - resp.cqn = hr_cq->cqn;
> - ret = ib_copy_to_udata(udata, &resp,
> - min(udata->outlen, sizeof(resp)));
> - if (ret)
> - goto err_cqc;
> - }
> + resp.cqn = hr_cq->cqn;
> + ret = ib_copy_to_udata(udata, &resp,
> + min(udata->outlen, sizeof(resp)));
> + if (ret)
> + goto err_cqc;
>
> hr_cq->cons_index = 0;
> hr_cq->arm_sn = 1;
> @@ -487,9 +484,67 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> free_cq_db(hr_dev, hr_cq, udata);
> err_cq_buf:
> free_cq_buf(hr_dev, hr_cq);
> -err_out:
> - atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CQ_CREATE_ERR_CNT]);
> + return ret;
> +}
> +
> +int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> + struct uverbs_attr_bundle *attrs)
> +{
> + struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
> + struct hns_roce_ib_create_cq_resp resp = {};
> + struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
> + struct ib_device *ibdev = &hr_dev->ib_dev;
> + struct hns_roce_ib_create_cq ucmd = {};
> + int ret;
> +
> + ret = verify_cq_create_attr(hr_dev, attr);
> + if (ret)
> + return ret;
> +
> + set_cq_param(hr_cq, attr->cqe, attr->comp_vector, &ucmd);
> +
> + ret = set_cqe_size(hr_cq, NULL, &ucmd);
> + if (ret)
> + return ret;
>
> + ret = alloc_cq_buf(hr_dev, hr_cq, NULL, 0);
> + if (ret) {
> + ibdev_err(ibdev, "failed to alloc CQ buf, ret = %d.\n", ret);
> + return ret;
> + }
> +
> + ret = alloc_cq_db(hr_dev, hr_cq, NULL, 0, &resp);
> + if (ret) {
> + ibdev_err(ibdev, "failed to alloc CQ db, ret = %d.\n", ret);
> + goto err_cq_buf;
> + }
> +
> + ret = alloc_cqn(hr_dev, hr_cq, NULL);
> + if (ret) {
> + ibdev_err(ibdev, "failed to alloc CQN, ret = %d.\n", ret);
> + goto err_cq_db;
> + }
> +
> + ret = alloc_cqc(hr_dev, hr_cq);
> + if (ret) {
> + ibdev_err(ibdev,
> + "failed to alloc CQ context, ret = %d.\n", ret);
> + goto err_cqn;
> + }
> +
> + hr_cq->cons_index = 0;
> + hr_cq->arm_sn = 1;
> + refcount_set(&hr_cq->refcount, 1);
> + init_completion(&hr_cq->free);
> +
> + return 0;
> +
> +err_cqn:
> + free_cqn(hr_dev, hr_cq->cqn);
> +err_cq_db:
> + free_cq_db(hr_dev, hr_cq, NULL);
> +err_cq_buf:
> + free_cq_buf(hr_dev, hr_cq);
> return ret;
> }
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> index b869cdc54118..481b30f2f5b5 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> @@ -47,7 +47,6 @@ static const char * const sw_stat_info[] = {
> [HNS_ROCE_DFX_MBX_EVENT_CNT] = "mbx_event",
> [HNS_ROCE_DFX_QP_CREATE_ERR_CNT] = "qp_create_err",
> [HNS_ROCE_DFX_QP_MODIFY_ERR_CNT] = "qp_modify_err",
> - [HNS_ROCE_DFX_CQ_CREATE_ERR_CNT] = "cq_create_err",
> [HNS_ROCE_DFX_CQ_MODIFY_ERR_CNT] = "cq_modify_err",
> [HNS_ROCE_DFX_SRQ_CREATE_ERR_CNT] = "srq_create_err",
> [HNS_ROCE_DFX_SRQ_MODIFY_ERR_CNT] = "srq_modify_err",
> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
> index 3f032b8038af..fdc5f487d7a3 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
> @@ -902,7 +902,6 @@ enum hns_roce_sw_dfx_stat_index {
> HNS_ROCE_DFX_MBX_EVENT_CNT,
> HNS_ROCE_DFX_QP_CREATE_ERR_CNT,
> HNS_ROCE_DFX_QP_MODIFY_ERR_CNT,
> - HNS_ROCE_DFX_CQ_CREATE_ERR_CNT,
> HNS_ROCE_DFX_CQ_MODIFY_ERR_CNT,
> HNS_ROCE_DFX_SRQ_CREATE_ERR_CNT,
> HNS_ROCE_DFX_SRQ_MODIFY_ERR_CNT,
> @@ -1295,6 +1294,8 @@ int to_hr_qp_type(int qp_type);
>
> int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> struct uverbs_attr_bundle *attrs);
> +int hns_roce_create_user_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> + struct uverbs_attr_bundle *attrs);
>
> int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
> int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
> diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
> index a3490bab297a..64de49bf8df7 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_main.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_main.c
> @@ -727,6 +727,7 @@ static const struct ib_device_ops hns_roce_dev_ops = {
> .create_ah = hns_roce_create_ah,
> .create_user_ah = hns_roce_create_ah,
> .create_cq = hns_roce_create_cq,
> + .create_user_cq = hns_roce_create_user_cq,
> .create_qp = hns_roce_create_qp,
> .dealloc_pd = hns_roce_dealloc_pd,
> .dealloc_ucontext = hns_roce_dealloc_ucontext,
>
^ permalink raw reply
* RE: [PATCH v5 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Michael Kelley @ 2026-02-23 20:49 UTC (permalink / raw)
To: Stanislav Kinsburskii, Anirudh Rayabharam
Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
decui@microsoft.com, longli@microsoft.com,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <aZys_5A657AYq5DQ@skinsburskii.localdomain>
From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Monday, February 23, 2026 11:40 AM
>
[snip]
> > +
> > +static int __init mshv_sint_vector_init(void)
> > +{
> > + int ret;
> > + struct hv_register_assoc reg = {
> > + .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
> > + };
> > + union hv_input_vtl input_vtl = { 0 };
> > +
> > + if (acpi_disabled)
> > + return -ENODEV;
> > +
> > + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> > > + 1, input_vtl, &reg);
> > + if (ret || !reg.value.reg64)
> > + return -ENODEV;
> > +
> > + mshv_sint_vector = reg.value.reg64;
> > + ret = mshv_acpi_setup_sint_irq();
> > + if (ret <= 0) {
> > + pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
> > + mshv_sint_vector, ret);
> > + goto out_fail;
> > + }
> > +
> > + mshv_sint_irq = ret;
>
> nit: given that mshv_sint_irq can't be zero, the logic can be simplified by
> using 0 instead of -1.
The test for <= 0 is actually wrong -- it should be just < 0. Zero is a valid
Linux IRQ number. For example, here's the output of /proc/interrupts on
a Gen1 VM on Hyper-V, where IRQ 0 is used by the legacy timer:
root@gen1ubun:~# cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3
0: 18 0 0 0 IR-IO-APIC 2-edge timer
1: 0 9 0 0 IR-IO-APIC 1-edge i8042
4: 0 0 0 792 IR-IO-APIC 4-edge ttyS0
6: 6 0 0 0 IR-IO-APIC 6-edge floppy
8: 0 0 0 0 IR-IO-APIC 8-edge rtc0
9: 0 0 0 0 IR-IO-APIC 9-fasteoi acpi
But I see other places throughout Linux kernel code that treat IRQ 0 as
invalid. So I dunno .... But it's probably better to treat 0 as a valid IRQ
number.
Michael
>
>
>
> > +
> > + ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
> > + &mshv_evt);
> > + if (ret)
> > + goto out_unregister;
> > +
> > + return 0;
> > +
> > +out_unregister:
> > + mshv_acpi_cleanup_sint_irq();
> > +out_fail:
> > + return ret;
> > +}
> > +
> > +static void mshv_sint_vector_cleanup(void)
> > +{
> > + free_percpu_irq(mshv_sint_irq, &mshv_evt);
> > + mshv_acpi_cleanup_sint_irq();
> > +}
> > +#else /* !HYPERVISOR_CALLBACK_VECTOR */
> > +static int __init mshv_sint_vector_init(void)
>
> nit: `init` is usually paired with `exit` or `fini`, so maybe `cleanup` can be
> renamed to `exit` as well for better consistency?
>
> Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
>
> > +{
> > + mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
> > + return 0;
> > +}
> > +
> > +static void mshv_sint_vector_cleanup(void)
> > +{
> > +}
> > +#endif /* HYPERVISOR_CALLBACK_VECTOR */
> > +
> > int __init mshv_synic_init(struct device *dev)
> > {
> > int ret = 0;
> >
> > + ret = mshv_sint_vector_init();
> > + if (ret)
> > + return ret;
> > +
> > synic_pages = alloc_percpu(struct hv_synic_pages);
> > if (!synic_pages) {
> > dev_err(dev, "Failed to allocate percpu synic page\n");
> > - return -ENOMEM;
> > + ret = -ENOMEM;
> > + goto sint_vector_cleanup;
> > }
> >
> > ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> > @@ -713,6 +810,8 @@ int __init mshv_synic_init(struct device *dev)
> > cpuhp_remove_state(synic_cpuhp_online);
> > free_synic_pages:
> > free_percpu(synic_pages);
> > +sint_vector_cleanup:
> > + mshv_sint_vector_cleanup();
> > return ret;
> > }
> >
> > @@ -721,4 +820,5 @@ void mshv_synic_cleanup(void)
> > unregister_reboot_notifier(&mshv_synic_reboot_nb);
> > cpuhp_remove_state(synic_cpuhp_online);
> > free_percpu(synic_pages);
> > + mshv_sint_vector_cleanup();
> > }
> > diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> > index 30fbbde81c5c..7676f78e0766 100644
> > --- a/include/hyperv/hvgdk_mini.h
> > +++ b/include/hyperv/hvgdk_mini.h
> > @@ -1117,6 +1117,8 @@ enum hv_register_name {
> > HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
> >
> > HV_X64_REGISTER_REG_PAGE = 0x0009001C,
> > +#elif defined(CONFIG_ARM64)
> > + HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID = 0x00070001,
> > #endif
> > };
> >
> > --
> > 2.34.1
> >
^ permalink raw reply
* Re: [PATCH v5 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Stanislav Kinsburskii @ 2026-02-23 19:39 UTC (permalink / raw)
To: Anirudh Rayabharam
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <20260223140159.1627229-3-anirudh@anirudhrb.com>
On Mon, Feb 23, 2026 at 02:01:59PM +0000, Anirudh Rayabharam wrote:
> From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
>
> On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
> interrupts (SINTs) from the hypervisor for doorbells and intercepts.
> There is no such vector reserved for arm64.
>
> On arm64, the hypervisor exposes a synthetic register that can be read
> to find the INTID that should be used for SINTs. This INTID is in the
> PPI range.
>
> To better unify the code paths, introduce mshv_sint_vector_init() that
> either reads the synthetic register and obtains the INTID (arm64) or
> just uses HYPERVISOR_CALLBACK_VECTOR as the interrupt vector (x86).
>
> Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> ---
> drivers/hv/mshv_synic.c | 120 +++++++++++++++++++++++++++++++++---
> include/hyperv/hvgdk_mini.h | 2 +
> 2 files changed, 112 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
> index 074e37c48876..75ef2160b3e0 100644
> --- a/drivers/hv/mshv_synic.c
> +++ b/drivers/hv/mshv_synic.c
> @@ -10,17 +10,22 @@
> #include <linux/kernel.h>
> #include <linux/slab.h>
> #include <linux/mm.h>
> +#include <linux/interrupt.h>
> #include <linux/io.h>
> #include <linux/random.h>
> #include <linux/cpuhotplug.h>
> #include <linux/reboot.h>
> #include <asm/mshyperv.h>
> +#include <linux/platform_device.h>
> +#include <linux/acpi.h>
>
> #include "mshv_eventfd.h"
> #include "mshv.h"
>
> static int synic_cpuhp_online;
> static struct hv_synic_pages __percpu *synic_pages;
> +static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */
> +static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */
>
> static u32 synic_event_ring_get_queued_port(u32 sint_index)
> {
> @@ -442,9 +447,7 @@ void mshv_isr(void)
> if (msg->header.message_flags.msg_pending)
> hv_set_non_nested_msr(HV_MSR_EOM, 0);
>
> -#ifdef HYPERVISOR_CALLBACK_VECTOR
> - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR);
> -#endif
> + add_interrupt_randomness(mshv_sint_vector);
> } else {
> pr_warn_once("%s: unknown message type 0x%x\n", __func__,
> msg->header.message_type);
> @@ -456,9 +459,7 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> union hv_synic_simp simp;
> union hv_synic_siefp siefp;
> union hv_synic_sirbp sirbp;
> -#ifdef HYPERVISOR_CALLBACK_VECTOR
> union hv_synic_sint sint;
> -#endif
> union hv_synic_scontrol sctrl;
> struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> @@ -501,10 +502,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
>
> hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
>
> -#ifdef HYPERVISOR_CALLBACK_VECTOR
> + if (mshv_sint_irq != -1)
> + enable_percpu_irq(mshv_sint_irq, 0);
> +
> /* Enable intercepts */
> sint.as_uint64 = 0;
> - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> + sint.vector = mshv_sint_vector;
> sint.masked = false;
> sint.auto_eoi = hv_recommend_using_aeoi();
> hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
> @@ -512,13 +515,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
>
> /* Doorbell SINT */
> sint.as_uint64 = 0;
> - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> + sint.vector = mshv_sint_vector;
> sint.masked = false;
> sint.as_intercept = 1;
> sint.auto_eoi = hv_recommend_using_aeoi();
> hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> sint.as_uint64);
> -#endif
>
> /* Enable global synic bit */
> sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
> @@ -573,6 +575,9 @@ static int mshv_synic_cpu_exit(unsigned int cpu)
> hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> sint.as_uint64);
>
> + if (mshv_sint_irq != -1)
> + disable_percpu_irq(mshv_sint_irq);
> +
> /* Disable Synic's event ring page */
> sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
> sirbp.sirbp_enabled = false;
> @@ -683,14 +688,106 @@ static struct notifier_block mshv_synic_reboot_nb = {
> .notifier_call = mshv_synic_reboot_notify,
> };
>
> +#ifndef HYPERVISOR_CALLBACK_VECTOR
> +static DEFINE_PER_CPU(long, mshv_evt);
> +
> +static irqreturn_t mshv_percpu_isr(int irq, void *dev_id)
> +{
> + mshv_isr();
> + return IRQ_HANDLED;
> +}
> +
> +#ifdef CONFIG_ACPI
> +static int __init mshv_acpi_setup_sint_irq(void)
> +{
> + return acpi_register_gsi(NULL, mshv_sint_vector, ACPI_EDGE_SENSITIVE,
> + ACPI_ACTIVE_HIGH);
> +}
> +
> +static void mshv_acpi_cleanup_sint_irq(void)
> +{
> + acpi_unregister_gsi(mshv_sint_vector);
> +}
> +#else
> +static int __init mshv_acpi_setup_sint_irq(void)
> +{
> + return -ENODEV;
> +}
> +
> +static void mshv_acpi_cleanup_sint_irq(void)
> +{
> +}
> +#endif
> +
> +static int __init mshv_sint_vector_init(void)
> +{
> + int ret;
> + struct hv_register_assoc reg = {
> + .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
> + };
> + union hv_input_vtl input_vtl = { 0 };
> +
> + if (acpi_disabled)
> + return -ENODEV;
> +
> + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> + 1, input_vtl, &reg);
> + if (ret || !reg.value.reg64)
> + return -ENODEV;
> +
> + mshv_sint_vector = reg.value.reg64;
> + ret = mshv_acpi_setup_sint_irq();
> + if (ret <= 0) {
> + pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
> + mshv_sint_vector, ret);
> + goto out_fail;
> + }
> +
> + mshv_sint_irq = ret;
nit: given that mshv_sint_irq can't be zero, the logic can be simplified by
using 0 instead of -1.
> +
> + ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
> + &mshv_evt);
> + if (ret)
> + goto out_unregister;
> +
> + return 0;
> +
> +out_unregister:
> + mshv_acpi_cleanup_sint_irq();
> +out_fail:
> + return ret;
> +}
> +
> +static void mshv_sint_vector_cleanup(void)
> +{
> + free_percpu_irq(mshv_sint_irq, &mshv_evt);
> + mshv_acpi_cleanup_sint_irq();
> +}
> +#else /* !HYPERVISOR_CALLBACK_VECTOR */
> +static int __init mshv_sint_vector_init(void)
nit: `init` is usually paired with `exit` or `fini`, so maybe `cleanup` can be
renamed to `exit` as well for better consistency?
Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> +{
> + mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
> + return 0;
> +}
> +
> +static void mshv_sint_vector_cleanup(void)
> +{
> +}
> +#endif /* HYPERVISOR_CALLBACK_VECTOR */
> +
> int __init mshv_synic_init(struct device *dev)
> {
> int ret = 0;
>
> + ret = mshv_sint_vector_init();
> + if (ret)
> + return ret;
> +
> synic_pages = alloc_percpu(struct hv_synic_pages);
> if (!synic_pages) {
> dev_err(dev, "Failed to allocate percpu synic page\n");
> - return -ENOMEM;
> + ret = -ENOMEM;
> + goto sint_vector_cleanup;
> }
>
> ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> @@ -713,6 +810,8 @@ int __init mshv_synic_init(struct device *dev)
> cpuhp_remove_state(synic_cpuhp_online);
> free_synic_pages:
> free_percpu(synic_pages);
> +sint_vector_cleanup:
> + mshv_sint_vector_cleanup();
> return ret;
> }
>
> @@ -721,4 +820,5 @@ void mshv_synic_cleanup(void)
> unregister_reboot_notifier(&mshv_synic_reboot_nb);
> cpuhp_remove_state(synic_cpuhp_online);
> free_percpu(synic_pages);
> + mshv_sint_vector_cleanup();
> }
> diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> index 30fbbde81c5c..7676f78e0766 100644
> --- a/include/hyperv/hvgdk_mini.h
> +++ b/include/hyperv/hvgdk_mini.h
> @@ -1117,6 +1117,8 @@ enum hv_register_name {
> HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
>
> HV_X64_REGISTER_REG_PAGE = 0x0009001C,
> +#elif defined(CONFIG_ARM64)
> + HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID = 0x00070001,
> #endif
> };
>
> --
> 2.34.1
>
^ permalink raw reply
* Re: [PATCH] mshv: Replace fixed memory deposit with status driven helper
From: Stanislav Kinsburskii @ 2026-02-23 18:17 UTC (permalink / raw)
To: Michael Kelley
Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
decui@microsoft.com, longli@microsoft.com,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <SN6PR02MB415705AA10C44D52CFFC0D31D468A@SN6PR02MB4157.namprd02.prod.outlook.com>
On Fri, Feb 20, 2026 at 05:05:09PM +0000, Michael Kelley wrote:
> From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Thursday, February 19, 2026 2:10 PM
> >
> > Replace hardcoded HV_MAP_GPA_DEPOSIT_PAGES usage with
> > hv_deposit_memory() which derives the deposit size from
> > the hypercall status, and remove the now-unused constant.
> >
> > The previous code always deposited a fixed 256 pages on
> > insufficient memory, ignoring the actual demand reported
> > by the hypervisor.
>
> Does the hypervisor report a specific page count demand? I haven't
> seen that anywhere. It seems like the deposit memory operation is
> always something of a guess.
>
Correct, it does not, except for the *CONTIGUOUS_MEMORY* status. That
status indicates a need for a large contiguous block (at least 8 pages).
> > hv_deposit_memory() handles different
> > deposit statuses, aligning map-GPA retries with the rest
> > of the codebase.
> >
> > This approach may require more allocation and deposit
> > hypercall iterations, but avoids over-depositing large
> > fixed chunks when fewer pages would suffice. Until any
> > performance impact is measured, the more frugal and
> > consistent behavior is preferred.
> >
> > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
>
> From a purely functional standpoint, this change addresses the
> concern that I raised. But I don’t have any intuition on the performance
> impact of having to iterate. hv_deposit_memory() adds only a single
> page for some of the statuses, so if there really is a large memory need,
> the new code would iterate 256 times to achieve what the existing code
> does.
>
> Any idea where the 256 came from in the first place? Was that
> empirically determined like some of the other memory deposit counts?
>
Unfortunately, the history of this change has been lost. My guess is
that it was a straightforward optimization to reduce the number of
iterations. But without a clear understanding of the real memory needs
or the performance impact, it was only a guess.
> In addition to a potential performance impact, I know the hypervisor tries
> to detect denial-of-service attempts that make "too many" calls to the
> hypervisor in a short period of time. In such a case, the hypervisor
> suspends scheduling the VM for a few seconds before allowing it to resume.
> Just need to make sure the hypervisor doesn't think the iterating is a
> denial-of-service attack. Or maybe that denial-of-service detection
> doesn't apply to the root partition VM.
>
This deposit hypercall shouldn’t run into this issue. If it did, it
would mean that starting 256 VMs at the same time would trigger the same
problem, with one deposit per VM.
Since there's no sign of that happening so far, I'd prefer to keep
things simple and revisit it later if needed.
Thanks,
Stanislav
> But from a functional standpoint,
> Reviewed-by: Michael Kelley <mhklinux@outlook.com>
>
> > ---
> > drivers/hv/mshv_root_hv_call.c | 4 +---
> > 1 file changed, 1 insertion(+), 3 deletions(-)
> >
> > diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> > index 7f91096f95a8..317191462b63 100644
> > --- a/drivers/hv/mshv_root_hv_call.c
> > +++ b/drivers/hv/mshv_root_hv_call.c
> > @@ -16,7 +16,6 @@
> >
> > /* Determined empirically */
> > #define HV_INIT_PARTITION_DEPOSIT_PAGES 208
> > -#define HV_MAP_GPA_DEPOSIT_PAGES 256
> > #define HV_UMAP_GPA_PAGES 512
> >
> > #define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1)))
> > @@ -239,8 +238,7 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> > page_struct_count,
> > completed = hv_repcomp(status);
> >
> > if (hv_result_needs_memory(status)) {
> > - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> > - HV_MAP_GPA_DEPOSIT_PAGES);
> > + ret = hv_deposit_memory(partition_id, status);
> > if (ret)
> > break;
> >
> >
> >
>
^ permalink raw reply
* RE: [PATCH v5 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Michael Kelley @ 2026-02-23 17:53 UTC (permalink / raw)
To: Anirudh Rayabharam, kys@microsoft.com, haiyangz@microsoft.com,
wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260223140159.1627229-3-anirudh@anirudhrb.com>
From: Anirudh Rayabharam <anirudh@anirudhrb.com> Sent: Monday, February 23, 2026 6:02 AM
>
> On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
> interrupts (SINTs) from the hypervisor for doorbells and intercepts.
> There is no such vector reserved for arm64.
>
> On arm64, the hypervisor exposes a synthetic register that can be read
> to find the INTID that should be used for SINTs. This INTID is in the
> PPI range.
>
> To better unify the code paths, introduce mshv_sint_vector_init() that
> either reads the synthetic register and obtains the INTID (arm64) or
> just uses HYPERVISOR_CALLBACK_VECTOR as the interrupt vector (x86).
>
> Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> ---
> drivers/hv/mshv_synic.c | 120 +++++++++++++++++++++++++++++++++---
> include/hyperv/hvgdk_mini.h | 2 +
> 2 files changed, 112 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
> index 074e37c48876..75ef2160b3e0 100644
> --- a/drivers/hv/mshv_synic.c
> +++ b/drivers/hv/mshv_synic.c
> @@ -10,17 +10,22 @@
> #include <linux/kernel.h>
> #include <linux/slab.h>
> #include <linux/mm.h>
> +#include <linux/interrupt.h>
> #include <linux/io.h>
> #include <linux/random.h>
> #include <linux/cpuhotplug.h>
> #include <linux/reboot.h>
> #include <asm/mshyperv.h>
> +#include <linux/platform_device.h>
I don't think this #include is needed now that you've switched to getting
the INTID via a hypercall instead of via an ACPI device.
The rest of the changes look good to me. You have a place carved out
to put the DT setup of the mshv_sint_irq, and the scope of all the
variables and mshv_percpu_isr() is correct so that there won't be any
"unused" warnings generated. Nice!
Modulo the unnecessary #include,
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> +#include <linux/acpi.h>
>
> #include "mshv_eventfd.h"
> #include "mshv.h"
>
> static int synic_cpuhp_online;
> static struct hv_synic_pages __percpu *synic_pages;
> +static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */
> +static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */
>
> static u32 synic_event_ring_get_queued_port(u32 sint_index)
> {
> @@ -442,9 +447,7 @@ void mshv_isr(void)
> if (msg->header.message_flags.msg_pending)
> hv_set_non_nested_msr(HV_MSR_EOM, 0);
>
> -#ifdef HYPERVISOR_CALLBACK_VECTOR
> - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR);
> -#endif
> + add_interrupt_randomness(mshv_sint_vector);
> } else {
> pr_warn_once("%s: unknown message type 0x%x\n", __func__,
> msg->header.message_type);
> @@ -456,9 +459,7 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> union hv_synic_simp simp;
> union hv_synic_siefp siefp;
> union hv_synic_sirbp sirbp;
> -#ifdef HYPERVISOR_CALLBACK_VECTOR
> union hv_synic_sint sint;
> -#endif
> union hv_synic_scontrol sctrl;
> struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> @@ -501,10 +502,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
>
> hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
>
> -#ifdef HYPERVISOR_CALLBACK_VECTOR
> + if (mshv_sint_irq != -1)
> + enable_percpu_irq(mshv_sint_irq, 0);
> +
> /* Enable intercepts */
> sint.as_uint64 = 0;
> - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> + sint.vector = mshv_sint_vector;
> sint.masked = false;
> sint.auto_eoi = hv_recommend_using_aeoi();
> hv_set_non_nested_msr(HV_MSR_SINT0 +
> HV_SYNIC_INTERCEPTION_SINT_INDEX,
> @@ -512,13 +515,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
>
> /* Doorbell SINT */
> sint.as_uint64 = 0;
> - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> + sint.vector = mshv_sint_vector;
> sint.masked = false;
> sint.as_intercept = 1;
> sint.auto_eoi = hv_recommend_using_aeoi();
> hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> sint.as_uint64);
> -#endif
>
> /* Enable global synic bit */
> sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
> @@ -573,6 +575,9 @@ static int mshv_synic_cpu_exit(unsigned int cpu)
> hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> sint.as_uint64);
>
> + if (mshv_sint_irq != -1)
> + disable_percpu_irq(mshv_sint_irq);
> +
> /* Disable Synic's event ring page */
> sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
> sirbp.sirbp_enabled = false;
> @@ -683,14 +688,106 @@ static struct notifier_block mshv_synic_reboot_nb = {
> .notifier_call = mshv_synic_reboot_notify,
> };
>
> +#ifndef HYPERVISOR_CALLBACK_VECTOR
> +static DEFINE_PER_CPU(long, mshv_evt);
> +
> +static irqreturn_t mshv_percpu_isr(int irq, void *dev_id)
> +{
> + mshv_isr();
> + return IRQ_HANDLED;
> +}
> +
> +#ifdef CONFIG_ACPI
> +static int __init mshv_acpi_setup_sint_irq(void)
> +{
> + return acpi_register_gsi(NULL, mshv_sint_vector, ACPI_EDGE_SENSITIVE,
> + ACPI_ACTIVE_HIGH);
> +}
> +
> +static void mshv_acpi_cleanup_sint_irq(void)
> +{
> + acpi_unregister_gsi(mshv_sint_vector);
> +}
> +#else
> +static int __init mshv_acpi_setup_sint_irq(void)
> +{
> + return -ENODEV;
> +}
> +
> +static void mshv_acpi_cleanup_sint_irq(void)
> +{
> +}
> +#endif
> +
> +static int __init mshv_sint_vector_init(void)
> +{
> + int ret;
> + struct hv_register_assoc reg = {
> + .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
> + };
> + union hv_input_vtl input_vtl = { 0 };
> +
> + if (acpi_disabled)
> + return -ENODEV;
> +
> + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> + 1, input_vtl, &reg);
> + if (ret || !reg.value.reg64)
> + return -ENODEV;
> +
> + mshv_sint_vector = reg.value.reg64;
> + ret = mshv_acpi_setup_sint_irq();
> + if (ret <= 0) {
> + pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
> + mshv_sint_vector, ret);
> + goto out_fail;
> + }
> +
> + mshv_sint_irq = ret;
> +
> + ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
> + &mshv_evt);
> + if (ret)
> + goto out_unregister;
> +
> + return 0;
> +
> +out_unregister:
> + mshv_acpi_cleanup_sint_irq();
> +out_fail:
> + return ret;
> +}
> +
> +static void mshv_sint_vector_cleanup(void)
> +{
> + free_percpu_irq(mshv_sint_irq, &mshv_evt);
> + mshv_acpi_cleanup_sint_irq();
> +}
> +#else /* !HYPERVISOR_CALLBACK_VECTOR */
> +static int __init mshv_sint_vector_init(void)
> +{
> + mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
> + return 0;
> +}
> +
> +static void mshv_sint_vector_cleanup(void)
> +{
> +}
> +#endif /* HYPERVISOR_CALLBACK_VECTOR */
> +
> int __init mshv_synic_init(struct device *dev)
> {
> int ret = 0;
>
> + ret = mshv_sint_vector_init();
> + if (ret)
> + return ret;
> +
> synic_pages = alloc_percpu(struct hv_synic_pages);
> if (!synic_pages) {
> dev_err(dev, "Failed to allocate percpu synic page\n");
> - return -ENOMEM;
> + ret = -ENOMEM;
> + goto sint_vector_cleanup;
> }
>
> ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> @@ -713,6 +810,8 @@ int __init mshv_synic_init(struct device *dev)
> cpuhp_remove_state(synic_cpuhp_online);
> free_synic_pages:
> free_percpu(synic_pages);
> +sint_vector_cleanup:
> + mshv_sint_vector_cleanup();
> return ret;
> }
>
> @@ -721,4 +820,5 @@ void mshv_synic_cleanup(void)
> unregister_reboot_notifier(&mshv_synic_reboot_nb);
> cpuhp_remove_state(synic_cpuhp_online);
> free_percpu(synic_pages);
> + mshv_sint_vector_cleanup();
> }
> diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> index 30fbbde81c5c..7676f78e0766 100644
> --- a/include/hyperv/hvgdk_mini.h
> +++ b/include/hyperv/hvgdk_mini.h
> @@ -1117,6 +1117,8 @@ enum hv_register_name {
> HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
>
> HV_X64_REGISTER_REG_PAGE = 0x0009001C,
> +#elif defined(CONFIG_ARM64)
> + HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID = 0x00070001,
> #endif
> };
>
> --
> 2.34.1
>
^ permalink raw reply
* RE: [PATCH v5 1/2] mshv: refactor synic init and cleanup
From: Michael Kelley @ 2026-02-23 17:52 UTC (permalink / raw)
To: Anirudh Rayabharam, kys@microsoft.com, haiyangz@microsoft.com,
wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260223140159.1627229-2-anirudh@anirudhrb.com>
From: Anirudh Rayabharam <anirudh@anirudhrb.com> Sent: Monday, February 23, 2026 6:02 AM
>
> Rename mshv_synic_init() to mshv_synic_cpu_init() and
> mshv_synic_cleanup() to mshv_synic_cpu_exit() to better reflect that
> these functions handle per-cpu synic setup and teardown.
>
> Use mshv_synic_init/cleanup() to perform init/cleanup that is not per-cpu.
> Move all the synic related setup from mshv_parent_partition_init.
>
> Move the reboot notifier to mshv_synic.c because it currently only
> operates on the synic cpuhp state.
>
> Move out synic_pages from the global mshv_root since its use is now
> completely local to mshv_synic.c.
>
> This is in preparation for the next patch which will add more stuff to
> mshv_synic_init().
>
> No functional change.
>
> Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
This patch needs to be rebased on the latest linux-next. It doesn't
apply cleanly on linux-next20260219 and resolving the conflicts is
a bit messy. But other than that,
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> ---
> drivers/hv/mshv_root.h | 5 ++-
> drivers/hv/mshv_root_main.c | 59 +++++-------------------------
> drivers/hv/mshv_synic.c | 71 +++++++++++++++++++++++++++++++++----
> 3 files changed, 75 insertions(+), 60 deletions(-)
>
> diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
> index 3c1d88b36741..26e0320c8097 100644
> --- a/drivers/hv/mshv_root.h
> +++ b/drivers/hv/mshv_root.h
> @@ -183,7 +183,6 @@ struct hv_synic_pages {
> };
>
> struct mshv_root {
> - struct hv_synic_pages __percpu *synic_pages;
> spinlock_t pt_ht_lock;
> DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
> struct hv_partition_property_vmm_capabilities vmm_caps;
> @@ -242,8 +241,8 @@ int mshv_register_doorbell(u64 partition_id, doorbell_cb_t
> doorbell_cb,
> void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);
>
> void mshv_isr(void);
> -int mshv_synic_init(unsigned int cpu);
> -int mshv_synic_cleanup(unsigned int cpu);
> +int mshv_synic_init(struct device *dev);
> +void mshv_synic_cleanup(void);
>
> static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
> {
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index 681b58154d5e..7c1666456e78 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -2035,7 +2035,6 @@ mshv_dev_release(struct inode *inode, struct file *filp)
> return 0;
> }
>
> -static int mshv_cpuhp_online;
> static int mshv_root_sched_online;
>
> static const char *scheduler_type_to_string(enum hv_scheduler_type type)
> @@ -2198,40 +2197,14 @@ root_scheduler_deinit(void)
> free_percpu(root_scheduler_output);
> }
>
> -static int mshv_reboot_notify(struct notifier_block *nb,
> - unsigned long code, void *unused)
> -{
> - cpuhp_remove_state(mshv_cpuhp_online);
> - return 0;
> -}
> -
> -struct notifier_block mshv_reboot_nb = {
> - .notifier_call = mshv_reboot_notify,
> -};
> -
> static void mshv_root_partition_exit(void)
> {
> - unregister_reboot_notifier(&mshv_reboot_nb);
> root_scheduler_deinit();
> }
>
> static int __init mshv_root_partition_init(struct device *dev)
> {
> - int err;
> -
> - err = root_scheduler_init(dev);
> - if (err)
> - return err;
> -
> - err = register_reboot_notifier(&mshv_reboot_nb);
> - if (err)
> - goto root_sched_deinit;
> -
> - return 0;
> -
> -root_sched_deinit:
> - root_scheduler_deinit();
> - return err;
> + return root_scheduler_init(dev);
> }
>
> static void mshv_init_vmm_caps(struct device *dev)
> @@ -2276,31 +2249,18 @@ static int __init mshv_parent_partition_init(void)
> MSHV_HV_MAX_VERSION);
> }
>
> - mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
> - if (!mshv_root.synic_pages) {
> - dev_err(dev, "Failed to allocate percpu synic page\n");
> - ret = -ENOMEM;
> + ret = mshv_synic_init(dev);
> + if (ret)
> goto device_deregister;
> - }
> -
> - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> - mshv_synic_init,
> - mshv_synic_cleanup);
> - if (ret < 0) {
> - dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
> - goto free_synic_pages;
> - }
> -
> - mshv_cpuhp_online = ret;
>
> ret = mshv_retrieve_scheduler_type(dev);
> if (ret)
> - goto remove_cpu_state;
> + goto synic_cleanup;
>
> if (hv_root_partition())
> ret = mshv_root_partition_init(dev);
> if (ret)
> - goto remove_cpu_state;
> + goto synic_cleanup;
>
> mshv_init_vmm_caps(dev);
>
> @@ -2318,10 +2278,8 @@ static int __init mshv_parent_partition_init(void)
> exit_partition:
> if (hv_root_partition())
> mshv_root_partition_exit();
> -remove_cpu_state:
> - cpuhp_remove_state(mshv_cpuhp_online);
> -free_synic_pages:
> - free_percpu(mshv_root.synic_pages);
> +synic_cleanup:
> + mshv_synic_cleanup();
> device_deregister:
> misc_deregister(&mshv_dev);
> return ret;
> @@ -2335,8 +2293,7 @@ static void __exit mshv_parent_partition_exit(void)
> mshv_irqfd_wq_cleanup();
> if (hv_root_partition())
> mshv_root_partition_exit();
> - cpuhp_remove_state(mshv_cpuhp_online);
> - free_percpu(mshv_root.synic_pages);
> + mshv_synic_cleanup();
> }
>
> module_init(mshv_parent_partition_init);
> diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
> index f8b0337cdc82..074e37c48876 100644
> --- a/drivers/hv/mshv_synic.c
> +++ b/drivers/hv/mshv_synic.c
> @@ -12,11 +12,16 @@
> #include <linux/mm.h>
> #include <linux/io.h>
> #include <linux/random.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/reboot.h>
> #include <asm/mshyperv.h>
>
> #include "mshv_eventfd.h"
> #include "mshv.h"
>
> +static int synic_cpuhp_online;
> +static struct hv_synic_pages __percpu *synic_pages;
> +
> static u32 synic_event_ring_get_queued_port(u32 sint_index)
> {
> struct hv_synic_event_ring_page **event_ring_page;
> @@ -26,7 +31,7 @@ static u32 synic_event_ring_get_queued_port(u32 sint_index)
> u32 message;
> u8 tail;
>
> - spages = this_cpu_ptr(mshv_root.synic_pages);
> + spages = this_cpu_ptr(synic_pages);
> event_ring_page = &spages->synic_event_ring_page;
> synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
>
> @@ -393,7 +398,7 @@ mshv_intercept_isr(struct hv_message *msg)
>
> void mshv_isr(void)
> {
> - struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
> + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> struct hv_message *msg;
> bool handled;
> @@ -446,7 +451,7 @@ void mshv_isr(void)
> }
> }
>
> -int mshv_synic_init(unsigned int cpu)
> +static int mshv_synic_cpu_init(unsigned int cpu)
> {
> union hv_synic_simp simp;
> union hv_synic_siefp siefp;
> @@ -455,7 +460,7 @@ int mshv_synic_init(unsigned int cpu)
> union hv_synic_sint sint;
> #endif
> union hv_synic_scontrol sctrl;
> - struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
> + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> struct hv_synic_event_flags_page **event_flags_page =
> &spages->synic_event_flags_page;
> @@ -542,14 +547,14 @@ int mshv_synic_init(unsigned int cpu)
> return -EFAULT;
> }
>
> -int mshv_synic_cleanup(unsigned int cpu)
> +static int mshv_synic_cpu_exit(unsigned int cpu)
> {
> union hv_synic_sint sint;
> union hv_synic_simp simp;
> union hv_synic_siefp siefp;
> union hv_synic_sirbp sirbp;
> union hv_synic_scontrol sctrl;
> - struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
> + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> struct hv_synic_event_flags_page **event_flags_page =
> &spages->synic_event_flags_page;
> @@ -663,3 +668,57 @@ mshv_unregister_doorbell(u64 partition_id, int
> doorbell_portid)
>
> mshv_portid_free(doorbell_portid);
> }
> +
> +static int mshv_synic_reboot_notify(struct notifier_block *nb,
> + unsigned long code, void *unused)
> +{
> + if (!hv_root_partition())
> + return 0;
> +
> + cpuhp_remove_state(synic_cpuhp_online);
> + return 0;
> +}
> +
> +static struct notifier_block mshv_synic_reboot_nb = {
> + .notifier_call = mshv_synic_reboot_notify,
> +};
> +
> +int __init mshv_synic_init(struct device *dev)
> +{
> + int ret = 0;
> +
> + synic_pages = alloc_percpu(struct hv_synic_pages);
> + if (!synic_pages) {
> + dev_err(dev, "Failed to allocate percpu synic page\n");
> + return -ENOMEM;
> + }
> +
> + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> + mshv_synic_cpu_init,
> + mshv_synic_cpu_exit);
> + if (ret < 0) {
> + dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
> + goto free_synic_pages;
> + }
> +
> + synic_cpuhp_online = ret;
> +
> + ret = register_reboot_notifier(&mshv_synic_reboot_nb);
> + if (ret)
> + goto remove_cpuhp_state;
> +
> + return 0;
> +
> +remove_cpuhp_state:
> + cpuhp_remove_state(synic_cpuhp_online);
> +free_synic_pages:
> + free_percpu(synic_pages);
> + return ret;
> +}
> +
> +void mshv_synic_cleanup(void)
> +{
> + unregister_reboot_notifier(&mshv_synic_reboot_nb);
> + cpuhp_remove_state(synic_cpuhp_online);
> + free_percpu(synic_pages);
> +}
> --
> 2.34.1
>
^ permalink raw reply
* RE: [EXTERNAL] Re: [PATCH net-next] net: ethtool: add COALESCE_RX_CQE_FRAMES/NSECS parameters
From: Haiyang Zhang @ 2026-02-23 16:11 UTC (permalink / raw)
To: Andrew Lunn, Haiyang Zhang
Cc: linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
Jakub Kicinski, Donald Hunter, David S. Miller, Eric Dumazet,
Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
Kory Maincent (Dent Project), Gal Pressman, Oleksij Rempel,
Vadim Fedorenko, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org, Paul Rosswurm
In-Reply-To: <6bf21536-569b-49b4-9541-c22a152570fd@lunn.ch>
> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Monday, February 23, 2026 9:01 AM
> To: Haiyang Zhang <haiyangz@linux.microsoft.com>
> Cc: linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; Jakub Kicinski
> <kuba@kernel.org>; Donald Hunter <donald.hunter@gmail.com>; David S.
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Paolo
> Abeni <pabeni@redhat.com>; Simon Horman <horms@kernel.org>; Jonathan
> Corbet <corbet@lwn.net>; Shuah Khan <skhan@linuxfoundation.org>; Kory
> Maincent (Dent Project) <kory.maincent@bootlin.com>; Gal Pressman
> <gal@nvidia.com>; Oleksij Rempel <o.rempel@pengutronix.de>; Vadim
> Fedorenko <vadim.fedorenko@linux.dev>; linux-kernel@vger.kernel.org;
> linux-doc@vger.kernel.org; Haiyang Zhang <haiyangz@microsoft.com>; Paul
> Rosswurm <paulros@microsoft.com>
> Subject: [EXTERNAL] Re: [PATCH net-next] net: ethtool: add
> COALESCE_RX_CQE_FRAMES/NSECS parameters
>
> On Sun, Feb 22, 2026 at 01:23:17PM -0800, Haiyang Zhang wrote:
> > From: Haiyang Zhang <haiyangz@microsoft.com>
> >
> > Add two parameters for drivers supporting Rx CQE Coalescing.
> >
> > ETHTOOL_A_COALESCE_RX_CQE_FRAMES:
> > Maximum number of frames that can be coalesced into a CQE.
> >
> > ETHTOOL_A_COALESCE_RX_CQE_NSECS:
> > Time out value in nanoseconds after the first packet arrival in a
> > coalesced CQE to be sent.
>
> A new API needs a user. A kAPI especially needs a user. Please add
> support to at least one driver.
Sure, next time I will include MANA driver patches using this kAPI
in the same series. The MANA HW/FW API is still being worked on by
other teams.
Thanks,
- Haiyang
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox