* Re: [PATCH v5 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Anirudh Rayabharam @ 2026-02-25 12:12 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aZys_5A657AYq5DQ@skinsburskii.localdomain>
On Mon, Feb 23, 2026 at 11:39:43AM -0800, Stanislav Kinsburskii wrote:
> On Mon, Feb 23, 2026 at 02:01:59PM +0000, Anirudh Rayabharam wrote:
> > From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> >
> > On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
> > interrupts (SINTs) from the hypervisor for doorbells and intercepts.
> > There is no such vector reserved for arm64.
> >
> > On arm64, the hypervisor exposes a synthetic register that can be read
> > to find the INTID that should be used for SINTs. This INTID is in the
> > PPI range.
> >
> > To better unify the code paths, introduce mshv_sint_vector_init() that
> > either reads the synthetic register and obtains the INTID (arm64) or
> > just uses HYPERVISOR_CALLBACK_VECTOR as the interrupt vector (x86).
> >
> > Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> > ---
> > drivers/hv/mshv_synic.c | 120 +++++++++++++++++++++++++++++++++---
> > include/hyperv/hvgdk_mini.h | 2 +
> > 2 files changed, 112 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
> > index 074e37c48876..75ef2160b3e0 100644
> > --- a/drivers/hv/mshv_synic.c
> > +++ b/drivers/hv/mshv_synic.c
> > @@ -10,17 +10,22 @@
> > #include <linux/kernel.h>
> > #include <linux/slab.h>
> > #include <linux/mm.h>
> > +#include <linux/interrupt.h>
> > #include <linux/io.h>
> > #include <linux/random.h>
> > #include <linux/cpuhotplug.h>
> > #include <linux/reboot.h>
> > #include <asm/mshyperv.h>
> > +#include <linux/platform_device.h>
> > +#include <linux/acpi.h>
> >
> > #include "mshv_eventfd.h"
> > #include "mshv.h"
> >
> > static int synic_cpuhp_online;
> > static struct hv_synic_pages __percpu *synic_pages;
> > +static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */
> > +static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */
> >
> > static u32 synic_event_ring_get_queued_port(u32 sint_index)
> > {
> > @@ -442,9 +447,7 @@ void mshv_isr(void)
> > if (msg->header.message_flags.msg_pending)
> > hv_set_non_nested_msr(HV_MSR_EOM, 0);
> >
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR);
> > -#endif
> > + add_interrupt_randomness(mshv_sint_vector);
> > } else {
> > pr_warn_once("%s: unknown message type 0x%x\n", __func__,
> > msg->header.message_type);
> > @@ -456,9 +459,7 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> > union hv_synic_simp simp;
> > union hv_synic_siefp siefp;
> > union hv_synic_sirbp sirbp;
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > union hv_synic_sint sint;
> > -#endif
> > union hv_synic_scontrol sctrl;
> > struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> > struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> > @@ -501,10 +502,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> >
> > hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
> >
> > -#ifdef HYPERVISOR_CALLBACK_VECTOR
> > + if (mshv_sint_irq != -1)
> > + enable_percpu_irq(mshv_sint_irq, 0);
> > +
> > /* Enable intercepts */
> > sint.as_uint64 = 0;
> > - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> > + sint.vector = mshv_sint_vector;
> > sint.masked = false;
> > sint.auto_eoi = hv_recommend_using_aeoi();
> > hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
> > @@ -512,13 +515,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
> >
> > /* Doorbell SINT */
> > sint.as_uint64 = 0;
> > - sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> > + sint.vector = mshv_sint_vector;
> > sint.masked = false;
> > sint.as_intercept = 1;
> > sint.auto_eoi = hv_recommend_using_aeoi();
> > hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> > sint.as_uint64);
> > -#endif
> >
> > /* Enable global synic bit */
> > sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
> > @@ -573,6 +575,9 @@ static int mshv_synic_cpu_exit(unsigned int cpu)
> > hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
> > sint.as_uint64);
> >
> > + if (mshv_sint_irq != -1)
> > + disable_percpu_irq(mshv_sint_irq);
> > +
> > /* Disable Synic's event ring page */
> > sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
> > sirbp.sirbp_enabled = false;
> > @@ -683,14 +688,106 @@ static struct notifier_block mshv_synic_reboot_nb = {
> > .notifier_call = mshv_synic_reboot_notify,
> > };
> >
> > +#ifndef HYPERVISOR_CALLBACK_VECTOR
> > +static DEFINE_PER_CPU(long, mshv_evt);
> > +
> > +static irqreturn_t mshv_percpu_isr(int irq, void *dev_id)
> > +{
> > + mshv_isr();
> > + return IRQ_HANDLED;
> > +}
> > +
> > +#ifdef CONFIG_ACPI
> > +static int __init mshv_acpi_setup_sint_irq(void)
> > +{
> > + return acpi_register_gsi(NULL, mshv_sint_vector, ACPI_EDGE_SENSITIVE,
> > + ACPI_ACTIVE_HIGH);
> > +}
> > +
> > +static void mshv_acpi_cleanup_sint_irq(void)
> > +{
> > + acpi_unregister_gsi(mshv_sint_vector);
> > +}
> > +#else
> > +static int __init mshv_acpi_setup_sint_irq(void)
> > +{
> > + return -ENODEV;
> > +}
> > +
> > +static void mshv_acpi_cleanup_sint_irq(void)
> > +{
> > +}
> > +#endif
> > +
> > +static int __init mshv_sint_vector_init(void)
> > +{
> > + int ret;
> > + struct hv_register_assoc reg = {
> > + .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
> > + };
> > + union hv_input_vtl input_vtl = { 0 };
> > +
> > + if (acpi_disabled)
> > + return -ENODEV;
> > +
> > + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> > + 1, input_vtl, &reg);
> > + if (ret || !reg.value.reg64)
> > + return -ENODEV;
> > +
> > + mshv_sint_vector = reg.value.reg64;
> > + ret = mshv_acpi_setup_sint_irq();
> > + if (ret <= 0) {
> > + pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
> > + mshv_sint_vector, ret);
> > + goto out_fail;
> > + }
> > +
> > + mshv_sint_irq = ret;
>
> nit: given that mshv_sint_irq can't be zero, the logic can be simplified by
> using 0 instead of -1.
>
>
>
> > +
> > + ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
> > + &mshv_evt);
> > + if (ret)
> > + goto out_unregister;
> > +
> > + return 0;
> > +
> > +out_unregister:
> > + mshv_acpi_cleanup_sint_irq();
> > +out_fail:
> > + return ret;
> > +}
> > +
> > +static void mshv_sint_vector_cleanup(void)
> > +{
> > + free_percpu_irq(mshv_sint_irq, &mshv_evt);
> > + mshv_acpi_cleanup_sint_irq();
> > +}
> > +#else /* !HYPERVISOR_CALLBACK_VECTOR */
> > +static int __init mshv_sint_vector_init(void)
>
> nit: `init` is usually paired with `exit` or `fini`, so maybe `cleanup` can be
> renamed to `exit` as well for better consistency?
>
> Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Thanks! I'll fix the naming inconsistencies and also pick up your
Reviewed-by in v6.
Anirudh.
>
> > +{
> > + mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
> > + return 0;
> > +}
> > +
> > +static void mshv_sint_vector_cleanup(void)
> > +{
> > +}
> > +#endif /* HYPERVISOR_CALLBACK_VECTOR */
> > +
> > int __init mshv_synic_init(struct device *dev)
> > {
> > int ret = 0;
> >
> > + ret = mshv_sint_vector_init();
> > + if (ret)
> > + return ret;
> > +
> > synic_pages = alloc_percpu(struct hv_synic_pages);
> > if (!synic_pages) {
> > dev_err(dev, "Failed to allocate percpu synic page\n");
> > - return -ENOMEM;
> > + ret = -ENOMEM;
> > + goto sint_vector_cleanup;
> > }
> >
> > ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> > @@ -713,6 +810,8 @@ int __init mshv_synic_init(struct device *dev)
> > cpuhp_remove_state(synic_cpuhp_online);
> > free_synic_pages:
> > free_percpu(synic_pages);
> > +sint_vector_cleanup:
> > + mshv_sint_vector_cleanup();
> > return ret;
> > }
> >
> > @@ -721,4 +820,5 @@ void mshv_synic_cleanup(void)
> > unregister_reboot_notifier(&mshv_synic_reboot_nb);
> > cpuhp_remove_state(synic_cpuhp_online);
> > free_percpu(synic_pages);
> > + mshv_sint_vector_cleanup();
> > }
> > diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> > index 30fbbde81c5c..7676f78e0766 100644
> > --- a/include/hyperv/hvgdk_mini.h
> > +++ b/include/hyperv/hvgdk_mini.h
> > @@ -1117,6 +1117,8 @@ enum hv_register_name {
> > HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
> >
> > HV_X64_REGISTER_REG_PAGE = 0x0009001C,
> > +#elif defined(CONFIG_ARM64)
> > + HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID = 0x00070001,
> > #endif
> > };
> >
> > --
> > 2.34.1
> >
^ permalink raw reply
* [PATCH v6 0/2] ARM64 support for doorbell and intercept SINTs
From: Anirudh Rayabharam @ 2026-02-25 12:44 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel; +Cc: anirudh
From: "Anirudh Rayabharam (Microsoft)" <anirudh@anirudhrb.com>
On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
interrupts (SINTs) from the hypervisor for doorbells and intercepts.
There is no such vector reserved for arm64.
On arm64, the hypervisor exposes a synthetic register that can be read
to find the INTID that should be used for SINTs. This INTID is in the
PPI range.
Changes in v6:
- Rebase on latest hyperv-next.
- Consistent init/exit & setup/cleanup function naming.
- Pick up Reviewed-by tags.
v5: https://lore.kernel.org/linux-hyperv/20260223140159.1627229-1-anirudh@anirudhrb.com/
Changes in v5:
- Better align with coding-style.rst guidelines.
v4: https://lore.kernel.org/linux-hyperv/?t=20260211170747
Changes in v4:
- Hypervisor now exposes a synthetic register to read the SINT vector
instead of using an ACPI platform device. So make changes to accommodate that.
Changes in v3:
- Moved the hv_root_partition() check into the reboot notifier
to avoid doing it multiple times.
v2: https://lore.kernel.org/linux-hyperv/20260202182706.648192-1-anirudh@anirudhrb.com/
Changes in v2:
Addressed review comments:
- Moved more stuff into mshv_synic.c
- Code simplifications
- Removed unnecessary debug prints
v1: https://lore.kernel.org/linux-hyperv/20260128160437.3342167-1-anirudh@anirudhrb.com/
Anirudh Rayabharam (Microsoft) (2):
mshv: refactor synic init and cleanup
mshv: add arm64 support for doorbell & intercept SINTs
drivers/hv/mshv_root.h | 5 +-
drivers/hv/mshv_root_main.c | 64 ++----------
drivers/hv/mshv_synic.c | 188 +++++++++++++++++++++++++++++++++---
include/hyperv/hvgdk_mini.h | 2 +
4 files changed, 185 insertions(+), 74 deletions(-)
--
2.34.1
^ permalink raw reply
* [PATCH v6 1/2] mshv: refactor synic init and cleanup
From: Anirudh Rayabharam @ 2026-02-25 12:44 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
Cc: anirudh, Michael Kelley
In-Reply-To: <20260225124403.2187880-1-anirudh@anirudhrb.com>
From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Rename mshv_synic_init() to mshv_synic_cpu_init() and
mshv_synic_cleanup() to mshv_synic_cpu_exit() to better reflect that
these functions handle per-cpu synic setup and teardown.
Use mshv_synic_init() and mshv_synic_exit() to perform init/cleanup that is not per-cpu.
Move all the synic related setup from mshv_parent_partition_init.
Move the reboot notifier to mshv_synic.c because it currently only
operates on the synic cpuhp state.
Move out synic_pages from the global mshv_root since its use is now
completely local to mshv_synic.c.
This is in preparation for the next patch which will add more stuff to
mshv_synic_init().
No functional change.
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
---
drivers/hv/mshv_root.h | 5 ++-
drivers/hv/mshv_root_main.c | 64 +++++----------------------------
drivers/hv/mshv_synic.c | 71 +++++++++++++++++++++++++++++++++----
3 files changed, 75 insertions(+), 65 deletions(-)
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 04c2a1910a8a..826798f1a8ec 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -190,7 +190,6 @@ struct hv_synic_pages {
};
struct mshv_root {
- struct hv_synic_pages __percpu *synic_pages;
spinlock_t pt_ht_lock;
DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
struct hv_partition_property_vmm_capabilities vmm_caps;
@@ -249,8 +248,8 @@ int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);
void mshv_isr(void);
-int mshv_synic_init(unsigned int cpu);
-int mshv_synic_cleanup(unsigned int cpu);
+int mshv_synic_init(struct device *dev);
+void mshv_synic_exit(void);
static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
{
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index e6509c980763..7fcde33d3e75 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -2064,7 +2064,6 @@ mshv_dev_release(struct inode *inode, struct file *filp)
return 0;
}
-static int mshv_cpuhp_online;
static int mshv_root_sched_online;
static const char *scheduler_type_to_string(enum hv_scheduler_type type)
@@ -2249,27 +2248,6 @@ root_scheduler_deinit(void)
free_percpu(root_scheduler_output);
}
-static int mshv_reboot_notify(struct notifier_block *nb,
- unsigned long code, void *unused)
-{
- cpuhp_remove_state(mshv_cpuhp_online);
- return 0;
-}
-
-struct notifier_block mshv_reboot_nb = {
- .notifier_call = mshv_reboot_notify,
-};
-
-static void mshv_root_partition_exit(void)
-{
- unregister_reboot_notifier(&mshv_reboot_nb);
-}
-
-static int __init mshv_root_partition_init(struct device *dev)
-{
- return register_reboot_notifier(&mshv_reboot_nb);
-}
-
static int __init mshv_init_vmm_caps(struct device *dev)
{
int ret;
@@ -2314,39 +2292,21 @@ static int __init mshv_parent_partition_init(void)
MSHV_HV_MAX_VERSION);
}
- mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
- if (!mshv_root.synic_pages) {
- dev_err(dev, "Failed to allocate percpu synic page\n");
- ret = -ENOMEM;
+ ret = mshv_synic_init(dev);
+ if (ret)
goto device_deregister;
- }
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
- mshv_synic_init,
- mshv_synic_cleanup);
- if (ret < 0) {
- dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
- goto free_synic_pages;
- }
-
- mshv_cpuhp_online = ret;
ret = mshv_init_vmm_caps(dev);
if (ret)
- goto remove_cpu_state;
+ goto synic_cleanup;
ret = mshv_retrieve_scheduler_type(dev);
if (ret)
- goto remove_cpu_state;
-
- if (hv_root_partition())
- ret = mshv_root_partition_init(dev);
- if (ret)
- goto remove_cpu_state;
+ goto synic_cleanup;
ret = root_scheduler_init(dev);
if (ret)
- goto exit_partition;
+ goto synic_cleanup;
ret = mshv_debugfs_init();
if (ret)
@@ -2367,13 +2327,8 @@ static int __init mshv_parent_partition_init(void)
mshv_debugfs_exit();
deinit_root_scheduler:
root_scheduler_deinit();
-exit_partition:
- if (hv_root_partition())
- mshv_root_partition_exit();
-remove_cpu_state:
- cpuhp_remove_state(mshv_cpuhp_online);
-free_synic_pages:
- free_percpu(mshv_root.synic_pages);
+synic_cleanup:
+ mshv_synic_exit();
device_deregister:
misc_deregister(&mshv_dev);
return ret;
@@ -2387,10 +2342,7 @@ static void __exit mshv_parent_partition_exit(void)
misc_deregister(&mshv_dev);
mshv_irqfd_wq_cleanup();
root_scheduler_deinit();
- if (hv_root_partition())
- mshv_root_partition_exit();
- cpuhp_remove_state(mshv_cpuhp_online);
- free_percpu(mshv_root.synic_pages);
+ mshv_synic_exit();
}
module_init(mshv_parent_partition_init);
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index f8b0337cdc82..f716c2a4952f 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -12,11 +12,16 @@
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/random.h>
+#include <linux/cpuhotplug.h>
+#include <linux/reboot.h>
#include <asm/mshyperv.h>
#include "mshv_eventfd.h"
#include "mshv.h"
+static int synic_cpuhp_online;
+static struct hv_synic_pages __percpu *synic_pages;
+
static u32 synic_event_ring_get_queued_port(u32 sint_index)
{
struct hv_synic_event_ring_page **event_ring_page;
@@ -26,7 +31,7 @@ static u32 synic_event_ring_get_queued_port(u32 sint_index)
u32 message;
u8 tail;
- spages = this_cpu_ptr(mshv_root.synic_pages);
+ spages = this_cpu_ptr(synic_pages);
event_ring_page = &spages->synic_event_ring_page;
synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
@@ -393,7 +398,7 @@ mshv_intercept_isr(struct hv_message *msg)
void mshv_isr(void)
{
- struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_message *msg;
bool handled;
@@ -446,7 +451,7 @@ void mshv_isr(void)
}
}
-int mshv_synic_init(unsigned int cpu)
+static int mshv_synic_cpu_init(unsigned int cpu)
{
union hv_synic_simp simp;
union hv_synic_siefp siefp;
@@ -455,7 +460,7 @@ int mshv_synic_init(unsigned int cpu)
union hv_synic_sint sint;
#endif
union hv_synic_scontrol sctrl;
- struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
@@ -542,14 +547,14 @@ int mshv_synic_init(unsigned int cpu)
return -EFAULT;
}
-int mshv_synic_cleanup(unsigned int cpu)
+static int mshv_synic_cpu_exit(unsigned int cpu)
{
union hv_synic_sint sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sirbp sirbp;
union hv_synic_scontrol sctrl;
- struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
@@ -663,3 +668,57 @@ mshv_unregister_doorbell(u64 partition_id, int doorbell_portid)
mshv_portid_free(doorbell_portid);
}
+
+static int mshv_synic_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (!hv_root_partition())
+ return 0;
+
+ cpuhp_remove_state(synic_cpuhp_online);
+ return 0;
+}
+
+static struct notifier_block mshv_synic_reboot_nb = {
+ .notifier_call = mshv_synic_reboot_notify,
+};
+
+int __init mshv_synic_init(struct device *dev)
+{
+ int ret = 0;
+
+ synic_pages = alloc_percpu(struct hv_synic_pages);
+ if (!synic_pages) {
+ dev_err(dev, "Failed to allocate percpu synic page\n");
+ return -ENOMEM;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
+ mshv_synic_cpu_init,
+ mshv_synic_cpu_exit);
+ if (ret < 0) {
+ dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
+ goto free_synic_pages;
+ }
+
+ synic_cpuhp_online = ret;
+
+ ret = register_reboot_notifier(&mshv_synic_reboot_nb);
+ if (ret)
+ goto remove_cpuhp_state;
+
+ return 0;
+
+remove_cpuhp_state:
+ cpuhp_remove_state(synic_cpuhp_online);
+free_synic_pages:
+ free_percpu(synic_pages);
+ return ret;
+}
+
+void mshv_synic_exit(void)
+{
+ unregister_reboot_notifier(&mshv_synic_reboot_nb);
+ cpuhp_remove_state(synic_cpuhp_online);
+ free_percpu(synic_pages);
+}
--
2.34.1
^ permalink raw reply related
* [PATCH v6 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Anirudh Rayabharam @ 2026-02-25 12:44 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
Cc: anirudh, Michael Kelley, Stanislav Kinsburskii
In-Reply-To: <20260225124403.2187880-1-anirudh@anirudhrb.com>
From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
interrupts (SINTs) from the hypervisor for doorbells and intercepts.
There is no such vector reserved for arm64.
On arm64, the hypervisor exposes a synthetic register that can be read
to find the INTID that should be used for SINTs. This INTID is in the
PPI range.
To better unify the code paths, introduce mshv_sint_vector_init() that
either reads the synthetic register and obtains the INTID (arm64) or
just uses HYPERVISOR_CALLBACK_VECTOR as the interrupt vector (x86).
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
---
drivers/hv/mshv_synic.c | 119 +++++++++++++++++++++++++++++++++---
include/hyperv/hvgdk_mini.h | 2 +
2 files changed, 111 insertions(+), 10 deletions(-)
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index f716c2a4952f..1e48fce5816b 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -10,17 +10,21 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/random.h>
#include <linux/cpuhotplug.h>
#include <linux/reboot.h>
#include <asm/mshyperv.h>
+#include <linux/acpi.h>
#include "mshv_eventfd.h"
#include "mshv.h"
static int synic_cpuhp_online;
static struct hv_synic_pages __percpu *synic_pages;
+static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */
+static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */
static u32 synic_event_ring_get_queued_port(u32 sint_index)
{
@@ -442,9 +446,7 @@ void mshv_isr(void)
if (msg->header.message_flags.msg_pending)
hv_set_non_nested_msr(HV_MSR_EOM, 0);
-#ifdef HYPERVISOR_CALLBACK_VECTOR
- add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR);
-#endif
+ add_interrupt_randomness(mshv_sint_vector);
} else {
pr_warn_once("%s: unknown message type 0x%x\n", __func__,
msg->header.message_type);
@@ -456,9 +458,7 @@ static int mshv_synic_cpu_init(unsigned int cpu)
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sirbp sirbp;
-#ifdef HYPERVISOR_CALLBACK_VECTOR
union hv_synic_sint sint;
-#endif
union hv_synic_scontrol sctrl;
struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
@@ -501,10 +501,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
-#ifdef HYPERVISOR_CALLBACK_VECTOR
+ if (mshv_sint_irq != -1)
+ enable_percpu_irq(mshv_sint_irq, 0);
+
/* Enable intercepts */
sint.as_uint64 = 0;
- sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.vector = mshv_sint_vector;
sint.masked = false;
sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
@@ -512,13 +514,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
/* Doorbell SINT */
sint.as_uint64 = 0;
- sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.vector = mshv_sint_vector;
sint.masked = false;
sint.as_intercept = 1;
sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
sint.as_uint64);
-#endif
/* Enable global synic bit */
sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
@@ -573,6 +574,9 @@ static int mshv_synic_cpu_exit(unsigned int cpu)
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
sint.as_uint64);
+ if (mshv_sint_irq != -1)
+ disable_percpu_irq(mshv_sint_irq);
+
/* Disable Synic's event ring page */
sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
sirbp.sirbp_enabled = false;
@@ -683,14 +687,106 @@ static struct notifier_block mshv_synic_reboot_nb = {
.notifier_call = mshv_synic_reboot_notify,
};
+#ifndef HYPERVISOR_CALLBACK_VECTOR
+static DEFINE_PER_CPU(long, mshv_evt);
+
+static irqreturn_t mshv_percpu_isr(int irq, void *dev_id)
+{
+ mshv_isr();
+ return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_ACPI
+static int __init mshv_acpi_setup_sint_irq(void)
+{
+ return acpi_register_gsi(NULL, mshv_sint_vector, ACPI_EDGE_SENSITIVE,
+ ACPI_ACTIVE_HIGH);
+}
+
+static void mshv_acpi_cleanup_sint_irq(void)
+{
+ acpi_unregister_gsi(mshv_sint_vector);
+}
+#else
+static int __init mshv_acpi_setup_sint_irq(void)
+{
+ return -ENODEV;
+}
+
+static void mshv_acpi_cleanup_sint_irq(void)
+{
+}
+#endif
+
+static int __init mshv_sint_vector_setup(void)
+{
+ int ret;
+ struct hv_register_assoc reg = {
+ .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
+ };
+ union hv_input_vtl input_vtl = { 0 };
+
+ if (acpi_disabled)
+ return -ENODEV;
+
+ ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl, &reg);
+ if (ret || !reg.value.reg64)
+ return -ENODEV;
+
+ mshv_sint_vector = reg.value.reg64;
+ ret = mshv_acpi_setup_sint_irq();
+ if (ret < 0) {
+ pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
+ mshv_sint_vector, ret);
+ goto out_fail;
+ }
+
+ mshv_sint_irq = ret;
+
+ ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
+ &mshv_evt);
+ if (ret)
+ goto out_unregister;
+
+ return 0;
+
+out_unregister:
+ mshv_acpi_cleanup_sint_irq();
+out_fail:
+ return ret;
+}
+
+static void mshv_sint_vector_cleanup(void)
+{
+ free_percpu_irq(mshv_sint_irq, &mshv_evt);
+ mshv_acpi_cleanup_sint_irq();
+}
+#else /* !HYPERVISOR_CALLBACK_VECTOR */
+static int __init mshv_sint_vector_setup(void)
+{
+ mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
+ return 0;
+}
+
+static void mshv_sint_vector_cleanup(void)
+{
+}
+#endif /* HYPERVISOR_CALLBACK_VECTOR */
+
int __init mshv_synic_init(struct device *dev)
{
int ret = 0;
+ ret = mshv_sint_vector_setup();
+ if (ret)
+ return ret;
+
synic_pages = alloc_percpu(struct hv_synic_pages);
if (!synic_pages) {
dev_err(dev, "Failed to allocate percpu synic page\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto sint_vector_cleanup;
}
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
@@ -713,6 +809,8 @@ int __init mshv_synic_init(struct device *dev)
cpuhp_remove_state(synic_cpuhp_online);
free_synic_pages:
free_percpu(synic_pages);
+sint_vector_cleanup:
+ mshv_sint_vector_cleanup();
return ret;
}
@@ -721,4 +819,5 @@ void mshv_synic_exit(void)
unregister_reboot_notifier(&mshv_synic_reboot_nb);
cpuhp_remove_state(synic_cpuhp_online);
free_percpu(synic_pages);
+ mshv_sint_vector_cleanup();
}
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index 056ef7b6b360..8bb3dd71c5b4 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -1121,6 +1121,8 @@ enum hv_register_name {
HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
HV_X64_REGISTER_REG_PAGE = 0x0009001C,
+#elif defined(CONFIG_ARM64)
+ HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID = 0x00070001,
#endif
};
--
2.34.1
^ permalink raw reply related
* Re: [PATCH net] net: mana: Fix double destroy_workqueue on service rescan PCI path
From: Simon Horman @ 2026-02-25 13:19 UTC (permalink / raw)
To: Dipayaan Roy
Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, longli, kotaranov, shradhagupta, ssengar, ernis,
shirazsaleem, linux-hyperv, netdev, linux-kernel, linux-rdma,
dipayanroy, Leon Romanovsky
In-Reply-To: <aZ2bzL64NagfyHpg@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>
+ Leon
On Tue, Feb 24, 2026 at 04:38:36AM -0800, Dipayaan Roy wrote:
> While testing corner cases in the driver, a use-after-free crash
> was found on the service rescan PCI path.
>
> When mana_serv_reset() calls mana_gd_suspend(), mana_gd_cleanup()
> destroys gc->service_wq. If the subsequent mana_gd_resume() fails
> with -ETIMEDOUT or -EPROTO, the code falls through to
> mana_serv_rescan() which triggers pci_stop_and_remove_bus_device().
> This invokes the PCI .remove callback (mana_gd_remove), which calls
> mana_gd_cleanup() a second time, attempting to destroy the already-
> freed workqueue. Fix this by NULL-checking gc->service_wq in
> mana_gd_cleanup() and setting it to NULL after destruction.
>
> Call stack of issue for reference:
> [Sat Feb 21 18:53:48 2026] Call Trace:
> [Sat Feb 21 18:53:48 2026] <TASK>
> [Sat Feb 21 18:53:48 2026] mana_gd_cleanup+0x33/0x70 [mana]
> [Sat Feb 21 18:53:48 2026] mana_gd_remove+0x3a/0xc0 [mana]
> [Sat Feb 21 18:53:48 2026] pci_device_remove+0x41/0xb0
> [Sat Feb 21 18:53:48 2026] device_remove+0x46/0x70
> [Sat Feb 21 18:53:48 2026] device_release_driver_internal+0x1e3/0x250
> [Sat Feb 21 18:53:48 2026] device_release_driver+0x12/0x20
> [Sat Feb 21 18:53:48 2026] pci_stop_bus_device+0x6a/0x90
> [Sat Feb 21 18:53:48 2026] pci_stop_and_remove_bus_device+0x13/0x30
> [Sat Feb 21 18:53:48 2026] mana_do_service+0x180/0x290 [mana]
> [Sat Feb 21 18:53:48 2026] mana_serv_func+0x24/0x50 [mana]
> [Sat Feb 21 18:53:48 2026] process_one_work+0x190/0x3d0
> [Sat Feb 21 18:53:48 2026] worker_thread+0x16e/0x2e0
> [Sat Feb 21 18:53:48 2026] kthread+0xf7/0x130
> [Sat Feb 21 18:53:48 2026] ? __pfx_worker_thread+0x10/0x10
> [Sat Feb 21 18:53:48 2026] ? __pfx_kthread+0x10/0x10
> [Sat Feb 21 18:53:48 2026] ret_from_fork+0x269/0x350
> [Sat Feb 21 18:53:48 2026] ? __pfx_kthread+0x10/0x10
> [Sat Feb 21 18:53:48 2026] ret_from_fork_asm+0x1a/0x30
> [Sat Feb 21 18:53:48 2026] </TASK>
>
> Fixes: 505cc26bcae0 ("net: mana: Add support for auxiliary device servicing events")
> Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Reviewed-by: Simon Horman <horms@kernel.org>
> ---
> drivers/net/ethernet/microsoft/mana/gdma_main.c | 5 ++++-
> drivers/net/ethernet/microsoft/mana/mana_en.c | 4 +++-
> 2 files changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 0055c231acf6..3926d18f1840 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -1946,7 +1946,10 @@ static void mana_gd_cleanup(struct pci_dev *pdev)
>
> mana_gd_remove_irqs(pdev);
>
> - destroy_workqueue(gc->service_wq);
> + if (gc->service_wq) {
> + destroy_workqueue(gc->service_wq);
> + gc->service_wq = NULL;
> + }
> dev_dbg(&pdev->dev, "mana gdma cleanup successful\n");
> }
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 9b5a72ada5c4..f69e42651359 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -3762,7 +3762,9 @@ void mana_rdma_remove(struct gdma_dev *gd)
> }
>
> WRITE_ONCE(gd->rdma_teardown, true);
> - flush_workqueue(gc->service_wq);
> +
> + if (gc->service_wq)
> + flush_workqueue(gc->service_wq);
>
> if (gd->adev)
> remove_adev(gd);
> --
> 2.43.0
>
^ permalink raw reply
* Re: (subset) [PATCH rdma-next 00/50] RDMA: Ensure CQ UMEMs are managed by ib_core
From: Leon Romanovsky @ 2026-02-25 13:51 UTC (permalink / raw)
To: Jason Gunthorpe, Selvin Xavier, Kalesh AP, Potnuri Bharat Teja,
Michael Margolin, Gal Pressman, Yossi Leybovich, Cheng Xu,
Kai Shen, Chengchang Tang, Junxian Huang, Abhijit Gangurde,
Allen Hubbe, Krzysztof Czurylo, Tatyana Nikolova, Long Li,
Konstantin Taranov, Yishai Hadas, Michal Kalderon, Bryan Tan,
Vishnu Dasa, Broadcom internal kernel review list,
Christian Benvenuti, Nelson Escobar, Dennis Dalessandro,
Bernard Metzler, Zhu Yanjun, Leon Romanovsky
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
On Fri, 13 Feb 2026 12:57:36 +0200, Leon Romanovsky wrote:
> Unify CQ UMEM creation, resize and release in ib_core to avoid the need
> for complex driver-side handling. This lets us rely on the internal
> reference counters of the relevant ib_XXX objects to manage UMEM
> lifetime safely and consistently.
>
> The resize cleanup made it clear that most drivers never handled this
> path correctly, and there's a good chance the functionality was never
> actually used. The most common issue was relying on the cq->resize_umem
> pointer to detect races with other CQ commands, without clearing it on
> errors and while ignoring proper locking for other CQ operations.
>
> [...]
Applied, thanks!
[01/50] RDMA: Move DMA block iterator logic into dedicated files
(no commit info)
[02/50] RDMA/umem: Allow including ib_umem header from any location
(no commit info)
[03/50] RDMA/umem: Remove unnecessary includes and defines from ib_umem header
(no commit info)
[04/50] RDMA/core: Promote UMEM to a core component
(no commit info)
[05/50] RDMA/core: Manage CQ umem in core code
(no commit info)
[06/50] RDMA/efa: Rely on CPU address in create‑QP
(no commit info)
[07/50] RDMA/core: Prepare create CQ path for API unification
(no commit info)
[08/50] RDMA/core: Reject zero CQE count
(no commit info)
[09/50] RDMA/efa: Remove check for zero CQE count
(no commit info)
[10/50] RDMA/mlx5: Save 4 bytes in CQ structure
(no commit info)
[11/50] RDMA/mlx5: Provide a modern CQ creation interface
(no commit info)
[12/50] RDMA/mlx4: Inline mlx4_ib_get_cq_umem into callers
(no commit info)
[13/50] RDMA/mlx4: Introduce a modern CQ creation interface
(no commit info)
[14/50] RDMA/mlx4: Remove unused create_flags field from CQ structure
(no commit info)
Best regards,
--
Leon Romanovsky <leon@kernel.org>
^ permalink raw reply
* Re: [PATCH rdma-next 00/50] RDMA: Ensure CQ UMEMs are managed by ib_core
From: Leon Romanovsky @ 2026-02-25 13:53 UTC (permalink / raw)
To: Jason Gunthorpe, Selvin Xavier, Kalesh AP, Potnuri Bharat Teja,
Michael Margolin, Gal Pressman, Yossi Leybovich, Cheng Xu,
Kai Shen, Chengchang Tang, Junxian Huang, Abhijit Gangurde,
Allen Hubbe, Krzysztof Czurylo, Tatyana Nikolova, Long Li,
Konstantin Taranov, Yishai Hadas, Michal Kalderon, Bryan Tan,
Vishnu Dasa, Broadcom internal kernel review list,
Christian Benvenuti, Nelson Escobar, Dennis Dalessandro,
Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
On Fri, Feb 13, 2026 at 12:57:36PM +0200, Leon Romanovsky wrote:
> Unify CQ UMEM creation, resize and release in ib_core to avoid the need
> for complex driver-side handling. This lets us rely on the internal
> reference counters of the relevant ib_XXX objects to manage UMEM
> lifetime safely and consistently.
>
> The resize cleanup made it clear that most drivers never handled this
> path correctly, and there's a good chance the functionality was never
> actually used. The most common issue was relying on the cq->resize_umem
> pointer to detect races with other CQ commands, without clearing it on
> errors and while ignoring proper locking for other CQ operations.
>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> Leon Romanovsky (50):
> RDMA: Move DMA block iterator logic into dedicated files
> RDMA/umem: Allow including ib_umem header from any location
> RDMA/umem: Remove unnecessary includes and defines from ib_umem header
> RDMA/core: Promote UMEM to a core component
> RDMA/core: Manage CQ umem in core code
> RDMA/efa: Rely on CPU address in create‑QP
> RDMA/core: Prepare create CQ path for API unification
> RDMA/core: Reject zero CQE count
> RDMA/efa: Remove check for zero CQE count
> RDMA/mlx5: Save 4 bytes in CQ structure
> RDMA/mlx5: Provide a modern CQ creation interface
> RDMA/mlx4: Inline mlx4_ib_get_cq_umem into callers
> RDMA/mlx4: Introduce a modern CQ creation interface
> RDMA/mlx4: Remove unused create_flags field from CQ structure
I took 14 patches above, rest will need to be resubmitted.
Thanks
> RDMA/bnxt_re: Convert to modern CQ interface
> RDMA/cxgb4: Separate kernel and user CQ creation paths
> RDMA/mthca: Split user and kernel CQ creation paths
> RDMA/erdma: Separate user and kernel CQ creation paths
> RDMA/ionic: Split user and kernel CQ creation paths
> RDMA/qedr: Convert to modern CQ interface
> RDMA/vmw_pvrdma: Provide a modern CQ creation interface
> RDMA/ocrdma: Split user and kernel CQ creation paths
> RDMA/irdma: Split user and kernel CQ creation paths
> RDMA/usnic: Provide a modern CQ creation interface
> RDMA/mana: Provide a modern CQ creation interface
> RDMA/erdma: Separate user and kernel CQ creation paths
> RDMA/rdmavt: Split user and kernel CQ creation paths
> RDMA/siw: Split user and kernel CQ creation paths
> RDMA/rxe: Split user and kernel CQ creation paths
> RDMA/core: Remove legacy CQ creation fallback path
> RDMA/core: Remove unused ib_resize_cq() implementation
> RDMA: Clarify that CQ resize is a user‑space verb
> RDMA/bnxt_re: Drop support for resizing kernel CQs
> RDMA/irdma: Remove resize support for kernel CQs
> RDMA/mlx4: Remove support for kernel CQ resize
> RDMA/mlx5: Remove support for resizing kernel CQs
> RDMA/mthca: Remove resize support for kernel CQs
> RDMA/rdmavt: Remove resize support for kernel CQs
> RDMA/rxe: Remove unused kernel‑side CQ resize support
> RDMA: Properly propagate the number of CQEs as unsigned int
> RDMA/core: Generalize CQ resize locking
> RDMA/bnxt_re: Complete CQ resize in a single step
> RDMA/bnxt_re: Rely on common resize‑CQ locking
> RDMA/bnxt_re: Reduce CQ memory footprint
> RDMA/mlx4: Use generic resize-CQ lock
> RDMA/mlx4: Use on‑stack variables instead of storing them in the CQ object
> RDMA/mlx5: Use generic resize-CQ lock
> RDMA/mlx5: Select resize‑CQ callback based on device capabilities
> RDMA/mlx5: Reduce CQ memory footprint
> RDMA/mthca: Use generic resize-CQ lock
>
> drivers/infiniband/core/Makefile | 6 +-
> drivers/infiniband/core/cq.c | 3 +
> drivers/infiniband/core/device.c | 4 +-
> drivers/infiniband/core/iter.c | 43 +++
> drivers/infiniband/core/umem.c | 2 +-
> drivers/infiniband/core/uverbs_cmd.c | 18 +-
> drivers/infiniband/core/uverbs_std_types_cq.c | 35 ++-
> drivers/infiniband/core/verbs.c | 61 +---
> drivers/infiniband/hw/bnxt_re/ib_verbs.c | 246 ++++++++-------
> drivers/infiniband/hw/bnxt_re/ib_verbs.h | 9 +-
> drivers/infiniband/hw/bnxt_re/main.c | 3 +-
> drivers/infiniband/hw/bnxt_re/qplib_res.c | 2 +-
> drivers/infiniband/hw/cxgb4/cq.c | 218 +++++++++----
> drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 +
> drivers/infiniband/hw/cxgb4/mem.c | 2 +-
> drivers/infiniband/hw/cxgb4/provider.c | 1 +
> drivers/infiniband/hw/efa/efa.h | 6 +-
> drivers/infiniband/hw/efa/efa_main.c | 3 +-
> drivers/infiniband/hw/efa/efa_verbs.c | 44 ++-
> drivers/infiniband/hw/erdma/erdma_main.c | 1 +
> drivers/infiniband/hw/erdma/erdma_verbs.c | 99 ++++--
> drivers/infiniband/hw/erdma/erdma_verbs.h | 2 +
> drivers/infiniband/hw/hns/hns_roce_alloc.c | 2 +-
> drivers/infiniband/hw/hns/hns_roce_cq.c | 103 ++++--
> drivers/infiniband/hw/hns/hns_roce_debugfs.c | 1 -
> drivers/infiniband/hw/hns/hns_roce_device.h | 3 +-
> drivers/infiniband/hw/hns/hns_roce_main.c | 1 +
> drivers/infiniband/hw/ionic/ionic_controlpath.c | 88 ++++--
> drivers/infiniband/hw/ionic/ionic_ibdev.c | 1 +
> drivers/infiniband/hw/ionic/ionic_ibdev.h | 4 +-
> drivers/infiniband/hw/irdma/main.h | 2 +-
> drivers/infiniband/hw/irdma/verbs.c | 402 +++++++++++++-----------
> drivers/infiniband/hw/mana/cq.c | 128 +++++---
> drivers/infiniband/hw/mana/device.c | 1 +
> drivers/infiniband/hw/mana/main.c | 25 +-
> drivers/infiniband/hw/mana/mana_ib.h | 6 +-
> drivers/infiniband/hw/mana/qp.c | 42 ++-
> drivers/infiniband/hw/mana/wq.c | 14 +-
> drivers/infiniband/hw/mlx4/cq.c | 401 ++++++++---------------
> drivers/infiniband/hw/mlx4/main.c | 3 +-
> drivers/infiniband/hw/mlx4/mlx4_ib.h | 10 +-
> drivers/infiniband/hw/mlx4/mr.c | 1 +
> drivers/infiniband/hw/mlx5/cq.c | 383 ++++++++--------------
> drivers/infiniband/hw/mlx5/main.c | 9 +-
> drivers/infiniband/hw/mlx5/mem.c | 1 +
> drivers/infiniband/hw/mlx5/mlx5_ib.h | 12 +-
> drivers/infiniband/hw/mlx5/qp.c | 2 +-
> drivers/infiniband/hw/mlx5/umr.c | 1 +
> drivers/infiniband/hw/mthca/mthca_cq.c | 1 -
> drivers/infiniband/hw/mthca/mthca_provider.c | 193 ++++--------
> drivers/infiniband/hw/mthca/mthca_provider.h | 1 -
> drivers/infiniband/hw/ocrdma/ocrdma_main.c | 3 +-
> drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 70 +++--
> drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 6 +-
> drivers/infiniband/hw/qedr/main.c | 1 +
> drivers/infiniband/hw/qedr/verbs.c | 325 +++++++++++--------
> drivers/infiniband/hw/qedr/verbs.h | 2 +
> drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +-
> drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 6 +-
> drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 4 +-
> drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 2 +-
> drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 171 ++++++----
> drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1 +
> drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 3 +
> drivers/infiniband/sw/rdmavt/cq.c | 224 +++++++------
> drivers/infiniband/sw/rdmavt/cq.h | 4 +-
> drivers/infiniband/sw/rdmavt/vt.c | 3 +-
> drivers/infiniband/sw/rxe/rxe_cq.c | 31 --
> drivers/infiniband/sw/rxe/rxe_loc.h | 3 -
> drivers/infiniband/sw/rxe/rxe_verbs.c | 115 +++----
> drivers/infiniband/sw/siw/siw_main.c | 1 +
> drivers/infiniband/sw/siw/siw_verbs.c | 111 +++++--
> drivers/infiniband/sw/siw/siw_verbs.h | 2 +
> include/rdma/ib_umem.h | 36 +--
> include/rdma/ib_verbs.h | 67 +---
> include/rdma/iter.h | 88 ++++++
> 76 files changed, 2085 insertions(+), 1847 deletions(-)
> ---
> base-commit: 42e3aac65c1c9eb36cdee0d8312a326196e0822f
> change-id: 20260203-refactor-umem-e5b4277e41b4
>
> Best regards,
> --
> Leon Romanovsky <leonro@nvidia.com>
>
^ permalink raw reply
* Re: [PATCH net-next v3] net: mana: Add MAC address to vPort logs and clarify error messages
From: Erni Sri Satya Vennela @ 2026-02-25 17:36 UTC (permalink / raw)
To: Paolo Abeni
Cc: Erni Sri Satya Vennela, kys, haiyangz, wei.liu, decui, longli,
andrew+netdev, davem, edumazet, kuba, dipayanroy, ssengar,
shradhagupta, shirazsaleem, gargaditya, linux-hyperv, netdev,
linux-kernel
In-Reply-To: <2f11ece1-cb85-4491-89d5-c8818666ff41@redhat.com>
On Tue, Feb 24, 2026 at 01:22:58PM +0100, Paolo Abeni wrote:
> On 2/23/26 5:08 AM, Erni Sri Satya Vennela wrote:
> > @@ -861,8 +862,8 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
> > tx_wr = &txq->msg_buf->reqs[msg_id];
> >
> > if (req_len > tx_wr->buf_len) {
> > - dev_err(hwc->dev, "HWC: req msg size: %d > %d\n", req_len,
> > - tx_wr->buf_len);
> > + dev_err(hwc->dev, "%s:%d: req msg size: %d > %d\n",
> > + __func__, __LINE__, req_len, tx_wr->buf_len);
>
> I fail to see any relevant information added here ...
>
> > err = -EINVAL;
> > goto out;
> > }
> > @@ -878,6 +879,7 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
> > req_msg->req.hwc_msg_id = msg_id;
> >
> > tx_wr->msg_size = req_len;
> > + command = req_msg->req.msg_type;
> >
> > if (gc->is_pf) {
> > dest_vrq = hwc->pf_dest_vrq_id;
> > @@ -886,15 +888,16 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
> >
> > err = mana_hwc_post_tx_wqe(txq, tx_wr, dest_vrq, dest_vrcq, false);
> > if (err) {
> > - dev_err(hwc->dev, "HWC: Failed to post send WQE: %d\n", err);
> > + dev_err(hwc->dev, "%s:%d: Failed to post send WQE: %d\n",
> > + __func__, __LINE__, err);
>
> ... and here. The string message should be (and apparently is) enough to
> locate the relevant code inside the tree. Please don't include
> unneeded/irrelevant changes.
>
> Thanks,
>
> Paolo
Thank you for the clarification, Paolo. I’ll drop these changes in the
next revision.
^ permalink raw reply
* [PATCH net] net: mana: Ring doorbell at 4 CQ wraparounds
From: Long Li @ 2026-02-25 18:49 UTC (permalink / raw)
To: K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Shradha Gupta, Erni Sri Satya Vennela, linux-hyperv, netdev,
linux-kernel, stable
MANA hardware requires at least one doorbell ring every 8 wraparounds
of the CQ. The driver rings the doorbell as a form of flow control to
inform hardware that CQEs have been consumed.
The NAPI poll functions mana_poll_tx_cq() and mana_poll_rx_cq() can
poll up to CQE_POLLING_BUFFER (512) completions per call. If the CQ
has fewer than 512 entries, a single poll call can process more than
4 wraparounds without ringing the doorbell. The doorbell threshold
check also uses ">" instead of ">=", delaying the ring by one extra
CQE beyond 4 wraparounds. Combined, these issues can cause the driver
to exceed the 8-wraparound hardware limit, leading to missed
completions and stalled queues.
Fix this by capping the number of CQEs polled per call to 4 wraparounds
of the CQ in both TX and RX paths. Also change the doorbell threshold
from ">" to ">=" so the doorbell is rung as soon as 4 wraparounds are
reached.
Cc: stable@vger.kernel.org
Fixes: 58a63729c957 ("net: mana: Fix doorbell out of order violation and avoid unnecessary doorbell rings")
Signed-off-by: Long Li <longli@microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 23 +++++++++++++++----
1 file changed, 18 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 9919183ad39e..fe667e0d930d 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1770,8 +1770,14 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
ndev = txq->ndev;
apc = netdev_priv(ndev);
+ /* Limit CQEs polled to 4 wraparounds of the CQ to ensure the
+ * doorbell can be rung in time for the hardware's requirement
+ * of at least one doorbell ring every 8 wraparounds.
+ */
comp_read = mana_gd_poll_cq(cq->gdma_cq, completions,
- CQE_POLLING_BUFFER);
+ min_t(u32, (cq->gdma_cq->queue_size /
+ COMP_ENTRY_SIZE) * 4,
+ CQE_POLLING_BUFFER));
if (comp_read < 1)
return;
@@ -2156,7 +2162,14 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
struct mana_rxq *rxq = cq->rxq;
int comp_read, i;
- comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER);
+ /* Limit CQEs polled to 4 wraparounds of the CQ to ensure the
+ * doorbell can be rung in time for the hardware's requirement
+ * of at least one doorbell ring every 8 wraparounds.
+ */
+ comp_read = mana_gd_poll_cq(cq->gdma_cq, comp,
+ min_t(u32, (cq->gdma_cq->queue_size /
+ COMP_ENTRY_SIZE) * 4,
+ CQE_POLLING_BUFFER));
WARN_ON_ONCE(comp_read > CQE_POLLING_BUFFER);
rxq->xdp_flush = false;
@@ -2201,11 +2214,11 @@ static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
mana_gd_ring_cq(gdma_queue, SET_ARM_BIT);
cq->work_done_since_doorbell = 0;
napi_complete_done(&cq->napi, w);
- } else if (cq->work_done_since_doorbell >
- cq->gdma_cq->queue_size / COMP_ENTRY_SIZE * 4) {
+ } else if (cq->work_done_since_doorbell >=
+ (cq->gdma_cq->queue_size / COMP_ENTRY_SIZE) * 4) {
/* MANA hardware requires at least one doorbell ring every 8
* wraparounds of CQ even if there is no need to arm the CQ.
- * This driver rings the doorbell as soon as we have exceeded
+ * This driver rings the doorbell as soon as it has processed
* 4 wraparounds.
*/
mana_gd_ring_cq(gdma_queue, 0);
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v6 1/2] mshv: refactor synic init and cleanup
From: Wei Liu @ 2026-02-25 19:05 UTC (permalink / raw)
To: Anirudh Rayabharam
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel,
Michael Kelley
In-Reply-To: <20260225124403.2187880-2-anirudh@anirudhrb.com>
On Wed, Feb 25, 2026 at 12:44:02PM +0000, Anirudh Rayabharam wrote:
> From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
>
> Rename mshv_synic_init() to mshv_synic_cpu_init() and
> mshv_synic_cleanup() to mshv_synic_cpu_exit() to better reflect that
> these functions handle per-cpu synic setup and teardown.
>
> Use mshv_synic_init/cleanup() to perform init/cleanup that is not per-cpu.
> Move all the synic related setup from mshv_parent_partition_init.
>
> Move the reboot notifier to mshv_synic.c because it currently only
> operates on the synic cpuhp state.
>
> Move out synic_pages from the global mshv_root since its use is now
> completely local to mshv_synic.c.
>
> This is in preparation for the next patch which will add more stuff to
> mshv_synic_init().
There is no need to say "next patch". No need to resend. I will fix it
when I commit this patch.
Wei
>
> No functional change.
>
> Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> ---
> drivers/hv/mshv_root.h | 5 ++-
> drivers/hv/mshv_root_main.c | 64 +++++----------------------------
> drivers/hv/mshv_synic.c | 71 +++++++++++++++++++++++++++++++++----
> 3 files changed, 75 insertions(+), 65 deletions(-)
>
> diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
> index 04c2a1910a8a..826798f1a8ec 100644
> --- a/drivers/hv/mshv_root.h
> +++ b/drivers/hv/mshv_root.h
> @@ -190,7 +190,6 @@ struct hv_synic_pages {
> };
>
> struct mshv_root {
> - struct hv_synic_pages __percpu *synic_pages;
> spinlock_t pt_ht_lock;
> DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
> struct hv_partition_property_vmm_capabilities vmm_caps;
> @@ -249,8 +248,8 @@ int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
> void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);
>
> void mshv_isr(void);
> -int mshv_synic_init(unsigned int cpu);
> -int mshv_synic_cleanup(unsigned int cpu);
> +int mshv_synic_init(struct device *dev);
> +void mshv_synic_exit(void);
>
> static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
> {
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index e6509c980763..7fcde33d3e75 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -2064,7 +2064,6 @@ mshv_dev_release(struct inode *inode, struct file *filp)
> return 0;
> }
>
> -static int mshv_cpuhp_online;
> static int mshv_root_sched_online;
>
> static const char *scheduler_type_to_string(enum hv_scheduler_type type)
> @@ -2249,27 +2248,6 @@ root_scheduler_deinit(void)
> free_percpu(root_scheduler_output);
> }
>
> -static int mshv_reboot_notify(struct notifier_block *nb,
> - unsigned long code, void *unused)
> -{
> - cpuhp_remove_state(mshv_cpuhp_online);
> - return 0;
> -}
> -
> -struct notifier_block mshv_reboot_nb = {
> - .notifier_call = mshv_reboot_notify,
> -};
> -
> -static void mshv_root_partition_exit(void)
> -{
> - unregister_reboot_notifier(&mshv_reboot_nb);
> -}
> -
> -static int __init mshv_root_partition_init(struct device *dev)
> -{
> - return register_reboot_notifier(&mshv_reboot_nb);
> -}
> -
> static int __init mshv_init_vmm_caps(struct device *dev)
> {
> int ret;
> @@ -2314,39 +2292,21 @@ static int __init mshv_parent_partition_init(void)
> MSHV_HV_MAX_VERSION);
> }
>
> - mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
> - if (!mshv_root.synic_pages) {
> - dev_err(dev, "Failed to allocate percpu synic page\n");
> - ret = -ENOMEM;
> + ret = mshv_synic_init(dev);
> + if (ret)
> goto device_deregister;
> - }
> -
> - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> - mshv_synic_init,
> - mshv_synic_cleanup);
> - if (ret < 0) {
> - dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
> - goto free_synic_pages;
> - }
> -
> - mshv_cpuhp_online = ret;
>
> ret = mshv_init_vmm_caps(dev);
> if (ret)
> - goto remove_cpu_state;
> + goto synic_cleanup;
>
> ret = mshv_retrieve_scheduler_type(dev);
> if (ret)
> - goto remove_cpu_state;
> -
> - if (hv_root_partition())
> - ret = mshv_root_partition_init(dev);
> - if (ret)
> - goto remove_cpu_state;
> + goto synic_cleanup;
>
> ret = root_scheduler_init(dev);
> if (ret)
> - goto exit_partition;
> + goto synic_cleanup;
>
> ret = mshv_debugfs_init();
> if (ret)
> @@ -2367,13 +2327,8 @@ static int __init mshv_parent_partition_init(void)
> mshv_debugfs_exit();
> deinit_root_scheduler:
> root_scheduler_deinit();
> -exit_partition:
> - if (hv_root_partition())
> - mshv_root_partition_exit();
> -remove_cpu_state:
> - cpuhp_remove_state(mshv_cpuhp_online);
> -free_synic_pages:
> - free_percpu(mshv_root.synic_pages);
> +synic_cleanup:
> + mshv_synic_exit();
> device_deregister:
> misc_deregister(&mshv_dev);
> return ret;
> @@ -2387,10 +2342,7 @@ static void __exit mshv_parent_partition_exit(void)
> misc_deregister(&mshv_dev);
> mshv_irqfd_wq_cleanup();
> root_scheduler_deinit();
> - if (hv_root_partition())
> - mshv_root_partition_exit();
> - cpuhp_remove_state(mshv_cpuhp_online);
> - free_percpu(mshv_root.synic_pages);
> + mshv_synic_exit();
> }
>
> module_init(mshv_parent_partition_init);
> diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
> index f8b0337cdc82..f716c2a4952f 100644
> --- a/drivers/hv/mshv_synic.c
> +++ b/drivers/hv/mshv_synic.c
> @@ -12,11 +12,16 @@
> #include <linux/mm.h>
> #include <linux/io.h>
> #include <linux/random.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/reboot.h>
> #include <asm/mshyperv.h>
>
> #include "mshv_eventfd.h"
> #include "mshv.h"
>
> +static int synic_cpuhp_online;
> +static struct hv_synic_pages __percpu *synic_pages;
> +
> static u32 synic_event_ring_get_queued_port(u32 sint_index)
> {
> struct hv_synic_event_ring_page **event_ring_page;
> @@ -26,7 +31,7 @@ static u32 synic_event_ring_get_queued_port(u32 sint_index)
> u32 message;
> u8 tail;
>
> - spages = this_cpu_ptr(mshv_root.synic_pages);
> + spages = this_cpu_ptr(synic_pages);
> event_ring_page = &spages->synic_event_ring_page;
> synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
>
> @@ -393,7 +398,7 @@ mshv_intercept_isr(struct hv_message *msg)
>
> void mshv_isr(void)
> {
> - struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
> + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> struct hv_message *msg;
> bool handled;
> @@ -446,7 +451,7 @@ void mshv_isr(void)
> }
> }
>
> -int mshv_synic_init(unsigned int cpu)
> +static int mshv_synic_cpu_init(unsigned int cpu)
> {
> union hv_synic_simp simp;
> union hv_synic_siefp siefp;
> @@ -455,7 +460,7 @@ int mshv_synic_init(unsigned int cpu)
> union hv_synic_sint sint;
> #endif
> union hv_synic_scontrol sctrl;
> - struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
> + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> struct hv_synic_event_flags_page **event_flags_page =
> &spages->synic_event_flags_page;
> @@ -542,14 +547,14 @@ int mshv_synic_init(unsigned int cpu)
> return -EFAULT;
> }
>
> -int mshv_synic_cleanup(unsigned int cpu)
> +static int mshv_synic_cpu_exit(unsigned int cpu)
> {
> union hv_synic_sint sint;
> union hv_synic_simp simp;
> union hv_synic_siefp siefp;
> union hv_synic_sirbp sirbp;
> union hv_synic_scontrol sctrl;
> - struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
> + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
> struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
> struct hv_synic_event_flags_page **event_flags_page =
> &spages->synic_event_flags_page;
> @@ -663,3 +668,57 @@ mshv_unregister_doorbell(u64 partition_id, int doorbell_portid)
>
> mshv_portid_free(doorbell_portid);
> }
> +
> +static int mshv_synic_reboot_notify(struct notifier_block *nb,
> + unsigned long code, void *unused)
> +{
> + if (!hv_root_partition())
> + return 0;
> +
> + cpuhp_remove_state(synic_cpuhp_online);
> + return 0;
> +}
> +
> +static struct notifier_block mshv_synic_reboot_nb = {
> + .notifier_call = mshv_synic_reboot_notify,
> +};
> +
> +int __init mshv_synic_init(struct device *dev)
> +{
> + int ret = 0;
> +
> + synic_pages = alloc_percpu(struct hv_synic_pages);
> + if (!synic_pages) {
> + dev_err(dev, "Failed to allocate percpu synic page\n");
> + return -ENOMEM;
> + }
> +
> + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
> + mshv_synic_cpu_init,
> + mshv_synic_cpu_exit);
> + if (ret < 0) {
> + dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
> + goto free_synic_pages;
> + }
> +
> + synic_cpuhp_online = ret;
> +
> + ret = register_reboot_notifier(&mshv_synic_reboot_nb);
> + if (ret)
> + goto remove_cpuhp_state;
> +
> + return 0;
> +
> +remove_cpuhp_state:
> + cpuhp_remove_state(synic_cpuhp_online);
> +free_synic_pages:
> + free_percpu(synic_pages);
> + return ret;
> +}
> +
> +void mshv_synic_exit(void)
> +{
> + unregister_reboot_notifier(&mshv_synic_reboot_nb);
> + cpuhp_remove_state(synic_cpuhp_online);
> + free_percpu(synic_pages);
> +}
> --
> 2.34.1
>
^ permalink raw reply
* [PATCH net-next v4] net: mana: Add MAC address to vPort logs and clarify error messages
From: Erni Sri Satya Vennela @ 2026-02-25 19:22 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, dipayanroy, ernis, shirazsaleem, ssengar,
shradhagupta, gargaditya, linux-hyperv, netdev, linux-kernel
Add MAC address to vPort configuration success message and update error
message to be more specific about HWC message errors in
mana_send_request.
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v4:
* Remove logs that do not add value in hw_channel.c.
Changes in v3:
* Remove the changes from v2 and update commit message.
* Use "Enabled vPort ..." instead of "Configured vPort" in
mana_cfg_vport.
* Update error logs in mana_hwc_send_request.
Changes in v2:
* Update commit message.
* Use "Enabled vPort ..." instead of "Configured vPort" in
mana_cfg_vport.
* Add info log in mana_uncfg_vport, mana_gd_verify_vf_version,
mana_gd_query_max_resources, mana_query_device_cfg and
mana_query_vport_cfg.
---
drivers/net/ethernet/microsoft/mana/hw_channel.c | 12 +++++++-----
drivers/net/ethernet/microsoft/mana/mana_en.c | 8 ++++----
2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index aa4e2731e2ba..e89b7ed8dd69 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -853,6 +853,7 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
struct hwc_caller_ctx *ctx;
u32 dest_vrcq = 0;
u32 dest_vrq = 0;
+ u32 command;
u16 msg_id;
int err;
@@ -878,6 +879,7 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
req_msg->req.hwc_msg_id = msg_id;
tx_wr->msg_size = req_len;
+ command = req_msg->req.msg_type;
if (gc->is_pf) {
dest_vrq = hwc->pf_dest_vrq_id;
@@ -893,8 +895,8 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
if (!wait_for_completion_timeout(&ctx->comp_event,
(msecs_to_jiffies(hwc->hwc_timeout)))) {
if (hwc->hwc_timeout != 0)
- dev_err(hwc->dev, "HWC: Request timed out: %u ms\n",
- hwc->hwc_timeout);
+ dev_err(hwc->dev, "%s:%d: Command 0x%x timed out: %u ms\n",
+ __func__, __LINE__, command, hwc->hwc_timeout);
/* Reduce further waiting if HWC no response */
if (hwc->hwc_timeout > 1)
@@ -914,9 +916,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
err = -EOPNOTSUPP;
goto out;
}
- if (req_msg->req.msg_type != MANA_QUERY_PHY_STAT)
- dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n",
- ctx->status_code);
+ if (command != MANA_QUERY_PHY_STAT)
+ dev_err(hwc->dev, "%s:%d: Command 0x%x failed with status: 0x%x\n",
+ __func__, __LINE__, command, ctx->status_code);
err = -EPROTO;
goto out;
}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 9b5a72ada5c4..53f24244de75 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1023,8 +1023,8 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
if (req->req.msg_type != MANA_QUERY_PHY_STAT &&
mana_need_log(gc, err))
- dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
- err, resp->status);
+ dev_err(dev, "Command 0x%x failed with status: 0x%x, err: %d\n",
+ req->req.msg_type, resp->status, err);
return err ? err : -EPROTO;
}
@@ -1337,8 +1337,8 @@ int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
apc->tx_shortform_allowed = resp.short_form_allowed;
apc->tx_vp_offset = resp.tx_vport_offset;
- netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
- apc->port_handle, protection_dom_id, doorbell_pg_id);
+ netdev_info(apc->ndev, "Enabled vPort %llu PD %u DB %u MAC %pM\n",
+ apc->port_handle, protection_dom_id, doorbell_pg_id, apc->mac_addr);
out:
if (err)
mana_uncfg_vport(apc);
--
2.34.1
^ permalink raw reply related
* Re: [PATCH v6 0/2] ARM64 support for doorbell and intercept SINTs
From: Wei Liu @ 2026-02-25 19:49 UTC (permalink / raw)
To: Anirudh Rayabharam
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <20260225124403.2187880-1-anirudh@anirudhrb.com>
On Wed, Feb 25, 2026 at 12:44:01PM +0000, Anirudh Rayabharam wrote:
> From: "Anirudh Rayabharam (Microsoft)" <anirudh@anirudhrb.com>
>
> On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
> interrupts (SINTs) from the hypervisor for doorbells and intercepts.
> There is no such vector reserved for arm64.
>
> On arm64, the hypervisor exposes a synthetic register that can be read
> to find the INTID that should be used for SINTs. This INTID is in the
> PPI range.
>
[...]
> Anirudh Rayabharam (Microsoft) (2):
> mshv: refactor synic init and cleanup
> mshv: add arm64 support for doorbell & intercept SINTs
>
> drivers/hv/mshv_root.h | 5 +-
> drivers/hv/mshv_root_main.c | 64 ++----------
> drivers/hv/mshv_synic.c | 188 +++++++++++++++++++++++++++++++++---
> include/hyperv/hvgdk_mini.h | 2 +
> 4 files changed, 185 insertions(+), 74 deletions(-)
Applied to hyperv-fixes.
I debated a bit whether this is a new feature or a fix and decided it
fixes an important gap in arm64 support.
Wei
>
> --
> 2.34.1
>
>
^ permalink raw reply
* VFIO support on hyperv (vfio_pci_core_ioctl())
From: Mukesh R @ 2026-02-25 22:04 UTC (permalink / raw)
To: alex; +Cc: kvm, wei.liu@kernel.org, linux-hyperv@vger.kernel.org
Hi Alex et al:
I've been looking at making pci passthru irq setup/remap work on hyperv
for the latest (6.19) version using vfio core. Unfortunately, it's just
not fitting well because in case of hyperv the irq remap is done by
the hypervisor. Specifically, for a robust and proper solution, we need
to override vfio_pci_set_msi_trigger(). As such, for the best way forward
I am trying to figure out how much flexibility there is to modify
vfio_pci_intrs.c with "if (running_on_hyperv())" branches (putting hyperv
code in separate file).
If none, then the alternative would be to create vfio-hyperv.c with
vfio_device_ops.ioctl = hyperv_vfio_pci_core_ioctl(). But, then I'd
be replicating code for other sub ioctls like vfio_pci_ioctl_get_info(),
vfio_pci_ioctl_get_irq_info(), etc. Would it be acceptable to make them
non static in this case?
Please let me know your thoughts or if you have other suggestions.
Thanks,
-Mukesh
^ permalink raw reply
* Re: [PATCH v1 5/6] x86/hyperv: Implement hypervisor ram collection into vmcore
From: Mukesh R @ 2026-02-25 22:27 UTC (permalink / raw)
To: Ard Biesheuvel, linux-hyperv, linux-kernel, linux-arch
Cc: kys, haiyangz, wei.liu, decui, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, dave.hansen, x86, H . Peter Anvin, Arnd Bergmann
In-Reply-To: <38cdec03-889e-43dd-9dad-e621aba9dc8d@app.fastmail.com>
On 2/21/26 08:43, Ard Biesheuvel wrote:
> Just spotted this code in v7.0-rc
>
> On Wed, 10 Sep 2025, at 02:10, Mukesh Rathor wrote:
> ...
>
>> +static asmlinkage void __noreturn hv_crash_c_entry(void)
>
> 'asmlinkage' means that the function may be called from another compilation unit written in assembler, but it doesn't actually evaluate to anything in most cases. Combining it with 'static' makes no sense whatsoever.
'static' means scope is limited to the file. Common in cases where function
pointers are used, like here in this file way below.
Like the comment says:
"This is the C entry point from the asm glue code after...."
IOW, called from assembly function (asm == assembly).
>
>> +{
>> + struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
>> +
>> + /* first thing, restore kernel gdt */
>> + native_load_gdt(&ctxt->gdtr);
>> +
>> + asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
>> + asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
>> +
>
> This code is truly very broken. You cannot enter a C function without a stack, and assign RSP half way down the function. Especially after allocating local variables and/or calling other functions - it may happen to work in most cases, but it is very fragile. (Other architectures have the concept of 'naked' functions for this purpose but x86 does not)
Local variable refers to static bss struct. IOW,
asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
same as:
asm volatile("movq %0, %%rsp" : : "m"(&hv_crash_ctxt.rsp));
> IOW, this whole function should be written in asm.
>> + asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
>> + asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
>> + asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
>> + asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
>> +
>> + native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
>> + asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
>> +
>> + asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
>> + asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
>> + asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
>> +
>> + native_load_idt(&ctxt->idtr);
>> + native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
>> + native_wrmsrq(MSR_EFER, ctxt->efer);
>> +
>> + /* restore the original kernel CS now via far return */
>> + asm volatile("movzwq %0, %%rax\n\t"
>> + "pushq %%rax\n\t"
>> + "pushq $1f\n\t"
>> + "lretq\n\t"
>> + "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
>> +
>> + /* We are in asmlinkage without stack frame,
>
> You just switched to __KERNEL_CS via the stack.
compiler doesn't know that.
>> hence make a C function
>> + * call which will buy stack frame to restore the tss or clear PT
>> entry.
>> + */
>
> Where does one buy a stack frame?
A stack market :). Callee will create stack frame now that rsp is
setup.
>> + hv_crash_restore_tss();
>> + hv_crash_clear_kernpt();
>> +
>> + /* we are now fully in devirtualized normal kernel mode */
>> + __crash_kexec(NULL);
>> +
>> + for (;;)
>> + cpu_relax();
>> +}
>> +/* Tell gcc we are using lretq long jump in the above function
>> intentionally */
>> +STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
>> +
^ permalink raw reply
* Re: [PATCH net] net: mana: Fix double destroy_workqueue on service rescan PCI path
From: patchwork-bot+netdevbpf @ 2026-02-26 3:20 UTC (permalink / raw)
To: Dipayaan Roy
Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, longli, kotaranov, horms, shradhagupta, ssengar,
ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, dipayanroy
In-Reply-To: <aZ2bzL64NagfyHpg@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>
Hello:
This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Tue, 24 Feb 2026 04:38:36 -0800 you wrote:
> While testing corner cases in the driver, a use-after-free crash
> was found on the service rescan PCI path.
>
> When mana_serv_reset() calls mana_gd_suspend(), mana_gd_cleanup()
> destroys gc->service_wq. If the subsequent mana_gd_resume() fails
> with -ETIMEDOUT or -EPROTO, the code falls through to
> mana_serv_rescan() which triggers pci_stop_and_remove_bus_device().
> This invokes the PCI .remove callback (mana_gd_remove), which calls
> mana_gd_cleanup() a second time, attempting to destroy the already-
> freed workqueue. Fix this by NULL-checking gc->service_wq in
> mana_gd_cleanup() and setting it to NULL after destruction.
>
> [...]
Here is the summary with links:
- [net] net: mana: Fix double destroy_workqueue on service rescan PCI path
https://git.kernel.org/netdev/net/c/f975a0955276
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH rdma-next 26/50] RDMA/erdma: Separate user and kernel CQ creation paths
From: Junxian Huang @ 2026-02-26 6:17 UTC (permalink / raw)
To: Leon Romanovsky, Jason Gunthorpe, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-26-f3be85847922@nvidia.com>
On 2026/2/13 18:58, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Split CQ creation into distinct kernel and user flows. The hns driver,
> inherited from mlx4, uses a problematic pattern that shares and caches
> umem in hns_roce_db_map_user(). This design blocks the driver from
> supporting generic umem sources (VMA, dmabuf, memfd, and others).
>
> In addition, let's delete counter that counts CQ creation errors. There
> are multiple ways to debug kernel in modern kernel without need to rely
> on that debugfs counter.
>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> drivers/infiniband/hw/hns/hns_roce_cq.c | 103 ++++++++++++++++++++-------
> drivers/infiniband/hw/hns/hns_roce_debugfs.c | 1 -
> drivers/infiniband/hw/hns/hns_roce_device.h | 3 +-
> drivers/infiniband/hw/hns/hns_roce_main.c | 1 +
> 4 files changed, 82 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
> index 857a913326cd..0f24a916466b 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_cq.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
> @@ -335,7 +335,10 @@ static int verify_cq_create_attr(struct hns_roce_dev *hr_dev,
> {
> struct ib_device *ibdev = &hr_dev->ib_dev;
>
> - if (!attr->cqe || attr->cqe > hr_dev->caps.max_cqes) {
> + if (attr->flags)
> + return -EOPNOTSUPP;
> +
> + if (attr->cqe > hr_dev->caps.max_cqes) {
> ibdev_err(ibdev, "failed to check CQ count %u, max = %u.\n",
> attr->cqe, hr_dev->caps.max_cqes);
> return -EINVAL;
> @@ -407,8 +410,8 @@ static int set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata,
> return 0;
> }
>
> -int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> - struct uverbs_attr_bundle *attrs)
> +int hns_roce_create_user_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> + struct uverbs_attr_bundle *attrs)
> {
> struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
> struct ib_udata *udata = &attrs->driver_udata;
> @@ -418,31 +421,27 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> struct hns_roce_ib_create_cq ucmd = {};
> int ret;
>
> - if (attr->flags) {
> - ret = -EOPNOTSUPP;
> - goto err_out;
> - }
> + if (ib_cq->umem)
> + return -EOPNOTSUPP;
>
> ret = verify_cq_create_attr(hr_dev, attr);
> if (ret)
> - goto err_out;
> + return ret;
>
> - if (udata) {
> - ret = get_cq_ucmd(hr_cq, udata, &ucmd);
> - if (ret)
> - goto err_out;
> - }
> + ret = get_cq_ucmd(hr_cq, udata, &ucmd);
> + if (ret)
> + return ret;
>
> set_cq_param(hr_cq, attr->cqe, attr->comp_vector, &ucmd);
>
> ret = set_cqe_size(hr_cq, udata, &ucmd);
> if (ret)
> - goto err_out;
> + return ret;
>
> ret = alloc_cq_buf(hr_dev, hr_cq, udata, ucmd.buf_addr);
> if (ret) {
> ibdev_err(ibdev, "failed to alloc CQ buf, ret = %d.\n", ret);
> - goto err_out;
> + return ret;
> }
>
> ret = alloc_cq_db(hr_dev, hr_cq, udata, ucmd.db_addr, &resp);
> @@ -464,13 +463,11 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> goto err_cqn;
> }
>
> - if (udata) {
> - resp.cqn = hr_cq->cqn;
> - ret = ib_copy_to_udata(udata, &resp,
> - min(udata->outlen, sizeof(resp)));
> - if (ret)
> - goto err_cqc;
> - }
> + resp.cqn = hr_cq->cqn;
> + ret = ib_copy_to_udata(udata, &resp,
> + min(udata->outlen, sizeof(resp)));
> + if (ret)
> + goto err_cqc;
>
> hr_cq->cons_index = 0;
> hr_cq->arm_sn = 1;
> @@ -487,9 +484,67 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> free_cq_db(hr_dev, hr_cq, udata);
> err_cq_buf:
> free_cq_buf(hr_dev, hr_cq);
> -err_out:
> - atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CQ_CREATE_ERR_CNT]);
> + return ret;
> +}
> +
> +int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> + struct uverbs_attr_bundle *attrs)
> +{
> + struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
> + struct hns_roce_ib_create_cq_resp resp = {};
> + struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
> + struct ib_device *ibdev = &hr_dev->ib_dev;
> + struct hns_roce_ib_create_cq ucmd = {};
ucmd and resp are not needed since we don't have udata here.
Junxian
> + int ret;
> +
> + ret = verify_cq_create_attr(hr_dev, attr);
> + if (ret)
> + return ret;
> +
> + set_cq_param(hr_cq, attr->cqe, attr->comp_vector, &ucmd);
> +
> + ret = set_cqe_size(hr_cq, NULL, &ucmd);
> + if (ret)
> + return ret;
>
> + ret = alloc_cq_buf(hr_dev, hr_cq, NULL, 0);
> + if (ret) {
> + ibdev_err(ibdev, "failed to alloc CQ buf, ret = %d.\n", ret);
> + return ret;
> + }
> +
> + ret = alloc_cq_db(hr_dev, hr_cq, NULL, 0, &resp);
> + if (ret) {
> + ibdev_err(ibdev, "failed to alloc CQ db, ret = %d.\n", ret);
> + goto err_cq_buf;
> + }
> +
> + ret = alloc_cqn(hr_dev, hr_cq, NULL);
> + if (ret) {
> + ibdev_err(ibdev, "failed to alloc CQN, ret = %d.\n", ret);
> + goto err_cq_db;
> + }
> +
> + ret = alloc_cqc(hr_dev, hr_cq);
> + if (ret) {
> + ibdev_err(ibdev,
> + "failed to alloc CQ context, ret = %d.\n", ret);
> + goto err_cqn;
> + }
> +
> + hr_cq->cons_index = 0;
> + hr_cq->arm_sn = 1;
> + refcount_set(&hr_cq->refcount, 1);
> + init_completion(&hr_cq->free);
> +
> + return 0;
> +
> +err_cqn:
> + free_cqn(hr_dev, hr_cq->cqn);
> +err_cq_db:
> + free_cq_db(hr_dev, hr_cq, NULL);
> +err_cq_buf:
> + free_cq_buf(hr_dev, hr_cq);
> return ret;
> }
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> index b869cdc54118..481b30f2f5b5 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> @@ -47,7 +47,6 @@ static const char * const sw_stat_info[] = {
> [HNS_ROCE_DFX_MBX_EVENT_CNT] = "mbx_event",
> [HNS_ROCE_DFX_QP_CREATE_ERR_CNT] = "qp_create_err",
> [HNS_ROCE_DFX_QP_MODIFY_ERR_CNT] = "qp_modify_err",
> - [HNS_ROCE_DFX_CQ_CREATE_ERR_CNT] = "cq_create_err",
> [HNS_ROCE_DFX_CQ_MODIFY_ERR_CNT] = "cq_modify_err",
> [HNS_ROCE_DFX_SRQ_CREATE_ERR_CNT] = "srq_create_err",
> [HNS_ROCE_DFX_SRQ_MODIFY_ERR_CNT] = "srq_modify_err",
> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
> index 3f032b8038af..fdc5f487d7a3 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
> @@ -902,7 +902,6 @@ enum hns_roce_sw_dfx_stat_index {
> HNS_ROCE_DFX_MBX_EVENT_CNT,
> HNS_ROCE_DFX_QP_CREATE_ERR_CNT,
> HNS_ROCE_DFX_QP_MODIFY_ERR_CNT,
> - HNS_ROCE_DFX_CQ_CREATE_ERR_CNT,
> HNS_ROCE_DFX_CQ_MODIFY_ERR_CNT,
> HNS_ROCE_DFX_SRQ_CREATE_ERR_CNT,
> HNS_ROCE_DFX_SRQ_MODIFY_ERR_CNT,
> @@ -1295,6 +1294,8 @@ int to_hr_qp_type(int qp_type);
>
> int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> struct uverbs_attr_bundle *attrs);
> +int hns_roce_create_user_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> + struct uverbs_attr_bundle *attrs);
>
> int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
> int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
> diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
> index a3490bab297a..64de49bf8df7 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_main.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_main.c
> @@ -727,6 +727,7 @@ static const struct ib_device_ops hns_roce_dev_ops = {
> .create_ah = hns_roce_create_ah,
> .create_user_ah = hns_roce_create_ah,
> .create_cq = hns_roce_create_cq,
> + .create_user_cq = hns_roce_create_user_cq,
> .create_qp = hns_roce_create_qp,
> .dealloc_pd = hns_roce_dealloc_pd,
> .dealloc_ucontext = hns_roce_dealloc_ucontext,
>
^ permalink raw reply
* Re: [PATCH rdma-next 26/50] RDMA/erdma: Separate user and kernel CQ creation paths
From: Leon Romanovsky @ 2026-02-26 6:54 UTC (permalink / raw)
To: Junxian Huang
Cc: Jason Gunthorpe, Selvin Xavier, Kalesh AP, Potnuri Bharat Teja,
Michael Margolin, Gal Pressman, Yossi Leybovich, Cheng Xu,
Kai Shen, Chengchang Tang, Abhijit Gangurde, Allen Hubbe,
Krzysztof Czurylo, Tatyana Nikolova, Long Li, Konstantin Taranov,
Yishai Hadas, Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun,
linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <ce205a5a-0b10-449e-0a84-39d3f43aeb53@hisilicon.com>
On Thu, Feb 26, 2026 at 02:17:38PM +0800, Junxian Huang wrote:
>
>
> On 2026/2/13 18:58, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Split CQ creation into distinct kernel and user flows. The hns driver,
> > inherited from mlx4, uses a problematic pattern that shares and caches
> > umem in hns_roce_db_map_user(). This design blocks the driver from
> > supporting generic umem sources (VMA, dmabuf, memfd, and others).
> >
> > In addition, let's delete counter that counts CQ creation errors. There
> > are multiple ways to debug kernel in modern kernel without need to rely
> > on that debugfs counter.
> >
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > drivers/infiniband/hw/hns/hns_roce_cq.c | 103 ++++++++++++++++++++-------
> > drivers/infiniband/hw/hns/hns_roce_debugfs.c | 1 -
> > drivers/infiniband/hw/hns/hns_roce_device.h | 3 +-
> > drivers/infiniband/hw/hns/hns_roce_main.c | 1 +
> > 4 files changed, 82 insertions(+), 26 deletions(-)
<...>
> > +int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
> > + struct uverbs_attr_bundle *attrs)
> > +{
> > + struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
> > + struct hns_roce_ib_create_cq_resp resp = {};
> > + struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
> > + struct ib_device *ibdev = &hr_dev->ib_dev;
> > + struct hns_roce_ib_create_cq ucmd = {};
>
> ucmd and resp are not needed since we don't have udata here.
Thanks, will fix.
>
> Junxian
^ permalink raw reply
* [PATCH 0/3] Allow order zero pages in page reporting
From: Yuvraj Sakshith @ 2026-02-26 7:01 UTC (permalink / raw)
To: akpm, mst, david
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm, jasowang,
xuanzhuo, eperezma, virtualization, kys, haiyangz, wei.liu, decui,
longli, linux-hyperv, linux-kernel
Today, page reporting sets page_reporting_order in two ways:
(1) page_reporting.page_reporting_order cmdline parameter
(2) Driver can pass order while registering itself.
In both cases, order zero is ignored by free page reporting
because it is used to set page_reporting_order to a default
value, like MAX_PAGE_ORDER.
In some cases we might want page_reporting_order to be zero.
For instance, when virtio-balloon runs inside a guest with
tiny memory (say, 16MB), it might not be able to find an order 1 page
(or in the worst case order MAX_PAGE_ORDER page) after some uptime.
Page reporting should be able to return order zero pages back for
optimal memory relinquishment.
This patch changes the default fallback value from '0' to '-1' in
all possible clients of free page reporting (hv_balloon and
virtio-balloon) together with allowing '0' as a valid order in
page_reporting_register().
Yuvraj Sakshith (3):
mm/page_reporting: Allow zero page_reporting_order
hv_balloon: Change default page reporting order
virtio_balloon: Set pr_dev.order to new default
drivers/hv/hv_balloon.c | 2 +-
drivers/virtio/virtio_balloon.c | 14 ++++++++++++++
mm/page_reporting.c | 2 +-
3 files changed, 16 insertions(+), 2 deletions(-)
--
2.34.1
^ permalink raw reply
* [PATCH 1/3] mm/page_reporting: Allow zero page_reporting_order
From: Yuvraj Sakshith @ 2026-02-26 7:01 UTC (permalink / raw)
To: akpm, mst, david
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm, jasowang,
xuanzhuo, eperezma, virtualization, kys, haiyangz, wei.liu, decui,
longli, linux-hyperv, linux-kernel
In-Reply-To: <20260226070125.3732265-1-yuvraj.sakshith@oss.qualcomm.com>
Some drivers might require page sized chunks to be
reported. This patch allows registering a driver with
order as zero.
Example use case: virtio-balloon driver running on a
guest with very small memory. After some time has passed,
the guest might not be able to find a chunk of 8KB.
Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>
---
mm/page_reporting.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index e4c428e61..fd7c5f0de 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -370,7 +370,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
*/
if (page_reporting_order == -1) {
- if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER)
+ if (prdev->order >= 0 && prdev->order <= MAX_PAGE_ORDER)
page_reporting_order = prdev->order;
else
page_reporting_order = pageblock_order;
--
2.34.1
^ permalink raw reply related
* [PATCH 3/3] virtio_balloon: Set pr_dev.order to new default
From: Yuvraj Sakshith @ 2026-02-26 7:01 UTC (permalink / raw)
To: akpm, mst, david
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm, jasowang,
xuanzhuo, eperezma, virtualization, kys, haiyangz, wei.liu, decui,
longli, linux-hyperv, linux-kernel
In-Reply-To: <20260226070125.3732265-1-yuvraj.sakshith@oss.qualcomm.com>
Drivers registering with page reporting used zero
as a way to signal page_reporting_order to be set
as a default value (either passed as a param or
MAX_PAGE_ORDER).
Since page_reporting_order can now have zero as
valid order, default fallback value sent by drivers
to page reporting is now -1.
Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>
---
drivers/virtio/virtio_balloon.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 74fe59f5a..3cc3dc28a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -1044,6 +1044,20 @@ static int virtballoon_probe(struct virtio_device *vdev)
goto out_unregister_oom;
}
+ /*
+ * page_reporting_register() takes the order either
+ * from the driver or the commandline. If neither
+ * are provided, it falls back to MAX_PAGE_ORDER.
+ *
+ * Order given by the driver is required to be in the
+ * range [0, MAX_PAGE_ORDER].
+ *
+ * One way for the driver to not provide any order
+ * is by setting it to -1.
+ */
+
+ vb->pr_dev_info.order = -1;
+
/*
* The default page reporting order is @pageblock_order, which
* corresponds to 512MB in size on ARM64 when 64KB base page
--
2.34.1
^ permalink raw reply related
* [PATCH 2/3] hv_balloon: Change default page reporting order
From: Yuvraj Sakshith @ 2026-02-26 7:01 UTC (permalink / raw)
To: akpm, mst, david
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm, jasowang,
xuanzhuo, eperezma, virtualization, kys, haiyangz, wei.liu, decui,
longli, linux-hyperv, linux-kernel
In-Reply-To: <20260226070125.3732265-1-yuvraj.sakshith@oss.qualcomm.com>
page_reporting_order used to fall back to default
value (passed as parameter or MAX_PAGE_ORDER) if
the driver wishes to not provide it.
The way the driver used to do this was by passing
the order as zero.
Now that zero is a valid order that can be passed by
a driver to page reporting, we use -1 to signal
default value to be used.
Signed-off-by: Yuvraj Sakshith <yuvraj.sakshith@oss.qualcomm.com>
---
drivers/hv/hv_balloon.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 2b4080e51..e33d6e3b2 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -1663,7 +1663,7 @@ static void enable_page_reporting(void)
* We let the page_reporting_order parameter decide the order
* in the page_reporting code
*/
- dm_device.pr_dev_info.order = 0;
+ dm_device.pr_dev_info.order = -1;
ret = page_reporting_register(&dm_device.pr_dev_info);
if (ret < 0) {
dm_device.pr_dev_info.report = NULL;
--
2.34.1
^ permalink raw reply related
* Re: [PATCH v1 5/6] x86/hyperv: Implement hypervisor ram collection into vmcore
From: Ard Biesheuvel @ 2026-02-26 7:44 UTC (permalink / raw)
To: Mukesh Rathor, linux-hyperv, linux-kernel, linux-arch
Cc: kys, haiyangz, wei.liu, decui, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, dave.hansen, x86, H . Peter Anvin, Arnd Bergmann
In-Reply-To: <f8199494-0c42-5eb0-f99e-cc6f6e304d40@linux.microsoft.com>
On Wed, 25 Feb 2026, at 23:27, Mukesh R wrote:
> On 2/21/26 08:43, Ard Biesheuvel wrote:
>> Just spotted this code in v7.0-rc
>>
>> On Wed, 10 Sep 2025, at 02:10, Mukesh Rathor wrote:
>> ...
>>
>>> +static asmlinkage void __noreturn hv_crash_c_entry(void)
>>
>> 'asmlinkage' means that the function may be called from another compilation unit written in assembler, but it doesn't actually evaluate to anything in most cases. Combining it with 'static' makes no sense whatsoever.
>
> 'static' means scope is limited to the file. Common in cases where function
> pointers are used, like here in this file way below.
>
> Like the comment says:
> "This is the C entry point from the asm glue code after...."
>
> IOW, called from assembly function (asm == assembly).
>
I wasn't asking you to explain what 'static' means. I was explaining to you that asmlinkage means 'external linkage' whereas 'static' means the opposite, and so combining them makes no sense.
>>
>>> +{
>>> + struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
>>> +
>>> + /* first thing, restore kernel gdt */
>>> + native_load_gdt(&ctxt->gdtr);
>>> +
>>> + asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
>>> + asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
>>> +
>>
>> This code is truly very broken. You cannot enter a C function without a stack, and assign RSP half way down the function. Especially after allocating local variables and/or calling other functions - it may happen to work in most cases, but it is very fragile. (Other architectures have the concept of 'naked' functions for this purpose but x86 does not)
>
> Local variable refers to static bss struct. IOW,
>
> asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
>
> same as:
> asm volatile("movq %0, %%rsp" : : "m"(&hv_crash_ctxt.rsp));
>
>
No, it is *not* the same. In practice, the compiler might perform this substitution, but there is no guarantee that this happens.
>> IOW, this whole function should be written in asm.
>>> + asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
>>> + asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
>>> + asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
>>> + asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
>>> +
>>> + native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
>>> + asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
>>> +
>>> + asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
>>> + asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
>>> + asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
>>> +
>>> + native_load_idt(&ctxt->idtr);
>>> + native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
>>> + native_wrmsrq(MSR_EFER, ctxt->efer);
>>> +
>>> + /* restore the original kernel CS now via far return */
>>> + asm volatile("movzwq %0, %%rax\n\t"
>>> + "pushq %%rax\n\t"
>>> + "pushq $1f\n\t"
>>> + "lretq\n\t"
>>> + "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
>>> +
>>> + /* We are in asmlinkage without stack frame,
>>
>> You just switched to __KERNEL_CS via the stack.
>
> compiler doesn't know that.
>
So? But what does it mean to 'be in asmlinkage' in your interpretation? Did you check what 'asmlinkage' actually evaluates to?
I am not asking you to justify why this broken code works in practice, I am asking you to fix it.
>>> hence make a C function
>>> + * call which will buy stack frame to restore the tss or clear PT
>>> entry.
>>> + */
>>
>> Where does one buy a stack frame?
>
> A stack market :). Callee will create stack frame now that rsp is
> setup.
>
This code is beyond broken. Please propose fixes rather than try to argue why carrying broken code like this is acceptable.
^ permalink raw reply
* [RFT PATCH] x86/hyperv: Use __naked attribute to fix stackless C function
From: Ard Biesheuvel @ 2026-02-26 9:50 UTC (permalink / raw)
To: linux-kernel
Cc: Ard Biesheuvel, Mukesh Rathor, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Long Li, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin, Uros Bizjak,
linux-hyperv
From: Ard Biesheuvel <ardb@kernel.org>
hv_crash_c_entry() is a C function that is entered without a stack,
and this is only allowed for functions that have the __naked attribute,
which informs the compiler that it must not emit the usual prologue and
epilogue or emit any other kind of instrumentation that relies on a
stack frame.
So split up the function, and set the __naked attribute on the initial
part that sets up the stack, GDT, IDT and other pieces that are needed
for ordinary C execution. Given that function calls are not permitted
either, use the existing long return coded in an asm() block to call the
second part of the function, which is an ordinary function that is
permitted to call other functions as usual.
Fixes: 94212d34618c ("x86/hyperv: Implement hypervisor RAM collection into vmcore")
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
Build tested only.
Cc: Mukesh Rathor <mrathor@linux.microsoft.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Long Li <longli@microsoft.com>
Cc: Thomas Gleixner <tglx@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Uros Bizjak <ubizjak@gmail.com>
Cc: linux-hyperv@vger.kernel.org
arch/x86/hyperv/hv_crash.c | 80 ++++++++++----------
1 file changed, 42 insertions(+), 38 deletions(-)
diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
index a78e4fed5720..d77766e8d37e 100644
--- a/arch/x86/hyperv/hv_crash.c
+++ b/arch/x86/hyperv/hv_crash.c
@@ -107,14 +107,12 @@ static void __noreturn hv_panic_timeout_reboot(void)
cpu_relax();
}
-/* This cannot be inlined as it needs stack */
-static noinline __noclone void hv_crash_restore_tss(void)
+static void hv_crash_restore_tss(void)
{
load_TR_desc();
}
-/* This cannot be inlined as it needs stack */
-static noinline void hv_crash_clear_kernpt(void)
+static void hv_crash_clear_kernpt(void)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -125,6 +123,25 @@ static noinline void hv_crash_clear_kernpt(void)
native_p4d_clear(p4d);
}
+
+static void __noreturn hv_crash_handle(void)
+{
+ hv_crash_restore_tss();
+ hv_crash_clear_kernpt();
+
+ /* we are now fully in devirtualized normal kernel mode */
+ __crash_kexec(NULL);
+
+ hv_panic_timeout_reboot();
+}
+
+/*
+ * __naked functions do not permit function calls, not even to __always_inline
+ * functions that only contain asm() blocks themselves. So use a macro instead.
+ */
+#define hv_wrmsr(msr, val) \
+ asm("wrmsr" :: "c"(msr), "a"((u32)val), "d"((u32)(val >> 32)) : "memory")
+
/*
* This is the C entry point from the asm glue code after the disable hypercall.
* We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
@@ -133,49 +150,36 @@ static noinline void hv_crash_clear_kernpt(void)
* available. We restore kernel GDT, and rest of the context, and continue
* to kexec.
*/
-static asmlinkage void __noreturn hv_crash_c_entry(void)
+static void __naked hv_crash_c_entry(void)
{
- struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
-
/* first thing, restore kernel gdt */
- native_load_gdt(&ctxt->gdtr);
+ asm volatile("lgdt %0" : : "m" (hv_crash_ctxt.gdtr));
- asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
- asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
+ asm volatile("movw %%ax, %%ss" : : "a"(hv_crash_ctxt.ss));
+ asm volatile("movq %0, %%rsp" : : "m"(hv_crash_ctxt.rsp));
- asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
- asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
- asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
- asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
+ asm volatile("movw %%ax, %%ds" : : "a"(hv_crash_ctxt.ds));
+ asm volatile("movw %%ax, %%es" : : "a"(hv_crash_ctxt.es));
+ asm volatile("movw %%ax, %%fs" : : "a"(hv_crash_ctxt.fs));
+ asm volatile("movw %%ax, %%gs" : : "a"(hv_crash_ctxt.gs));
- native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
- asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
+ hv_wrmsr(MSR_IA32_CR_PAT, hv_crash_ctxt.pat);
+ asm volatile("movq %0, %%cr0" : : "r"(hv_crash_ctxt.cr0));
- asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
- asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
- asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
+ asm volatile("movq %0, %%cr8" : : "r"(hv_crash_ctxt.cr8));
+ asm volatile("movq %0, %%cr4" : : "r"(hv_crash_ctxt.cr4));
+ asm volatile("movq %0, %%cr2" : : "r"(hv_crash_ctxt.cr4));
- native_load_idt(&ctxt->idtr);
- native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
- native_wrmsrq(MSR_EFER, ctxt->efer);
+ asm volatile("lidt %0" : : "m" (hv_crash_ctxt.idtr));
+ hv_wrmsr(MSR_GS_BASE, hv_crash_ctxt.gsbase);
+ hv_wrmsr(MSR_EFER, hv_crash_ctxt.efer);
/* restore the original kernel CS now via far return */
- asm volatile("movzwq %0, %%rax\n\t"
- "pushq %%rax\n\t"
- "pushq $1f\n\t"
- "lretq\n\t"
- "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
-
- /* We are in asmlinkage without stack frame, hence make C function
- * calls which will buy stack frames.
- */
- hv_crash_restore_tss();
- hv_crash_clear_kernpt();
-
- /* we are now fully in devirtualized normal kernel mode */
- __crash_kexec(NULL);
-
- hv_panic_timeout_reboot();
+ asm volatile("pushq %q0 \n\t"
+ "leaq %c1(%%rip), %q0 \n\t"
+ "pushq %q0 \n\t"
+ "lretq \n\t"
+ :: "a"(hv_crash_ctxt.cs), "i"(hv_crash_handle));
}
/* Tell gcc we are using lretq long jump in the above function intentionally */
STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
--
2.53.0.414.gf7e9f6c205-goog
^ permalink raw reply related
* Re: [RFT PATCH] x86/hyperv: Use __naked attribute to fix stackless C function
From: Uros Bizjak @ 2026-02-26 10:35 UTC (permalink / raw)
To: Ard Biesheuvel
Cc: linux-kernel, Ard Biesheuvel, Mukesh Rathor, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, H. Peter Anvin,
linux-hyperv
In-Reply-To: <20260226095056.46410-2-ardb+git@google.com>
On Thu, Feb 26, 2026 at 10:51 AM Ard Biesheuvel <ardb+git@google.com> wrote:
>
> From: Ard Biesheuvel <ardb@kernel.org>
>
> hv_crash_c_entry() is a C function that is entered without a stack,
> and this is only allowed for functions that have the __naked attribute,
> which informs the compiler that it must not emit the usual prologue and
> epilogue or emit any other kind of instrumentation that relies on a
> stack frame.
>
> So split up the function, and set the __naked attribute on the initial
> part that sets up the stack, GDT, IDT and other pieces that are needed
> for ordinary C execution. Given that function calls are not permitted
> either, use the existing long return coded in an asm() block to call the
> second part of the function, which is an ordinary function that is
> permitted to call other functions as usual.
>
> Fixes: 94212d34618c ("x86/hyperv: Implement hypervisor RAM collection into vmcore")
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> Build tested only.
>
> Cc: Mukesh Rathor <mrathor@linux.microsoft.com>
> Cc: "K. Y. Srinivasan" <kys@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Wei Liu <wei.liu@kernel.org>
> Cc: Dexuan Cui <decui@microsoft.com>
> Cc: Long Li <longli@microsoft.com>
> Cc: Thomas Gleixner <tglx@kernel.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Uros Bizjak <ubizjak@gmail.com>
> Cc: linux-hyperv@vger.kernel.org
>
> arch/x86/hyperv/hv_crash.c | 80 ++++++++++----------
> 1 file changed, 42 insertions(+), 38 deletions(-)
>
> diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
> index a78e4fed5720..d77766e8d37e 100644
> --- a/arch/x86/hyperv/hv_crash.c
> +++ b/arch/x86/hyperv/hv_crash.c
> @@ -107,14 +107,12 @@ static void __noreturn hv_panic_timeout_reboot(void)
> cpu_relax();
> }
>
> -/* This cannot be inlined as it needs stack */
> -static noinline __noclone void hv_crash_restore_tss(void)
> +static void hv_crash_restore_tss(void)
> {
> load_TR_desc();
> }
>
> -/* This cannot be inlined as it needs stack */
> -static noinline void hv_crash_clear_kernpt(void)
> +static void hv_crash_clear_kernpt(void)
> {
> pgd_t *pgd;
> p4d_t *p4d;
> @@ -125,6 +123,25 @@ static noinline void hv_crash_clear_kernpt(void)
> native_p4d_clear(p4d);
> }
>
> +
> +static void __noreturn hv_crash_handle(void)
> +{
> + hv_crash_restore_tss();
> + hv_crash_clear_kernpt();
> +
> + /* we are now fully in devirtualized normal kernel mode */
> + __crash_kexec(NULL);
> +
> + hv_panic_timeout_reboot();
> +}
> +
> +/*
> + * __naked functions do not permit function calls, not even to __always_inline
> + * functions that only contain asm() blocks themselves. So use a macro instead.
> + */
> +#define hv_wrmsr(msr, val) \
> + asm("wrmsr" :: "c"(msr), "a"((u32)val), "d"((u32)(val >> 32)) : "memory")
> +
> /*
> * This is the C entry point from the asm glue code after the disable hypercall.
> * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
> @@ -133,49 +150,36 @@ static noinline void hv_crash_clear_kernpt(void)
> * available. We restore kernel GDT, and rest of the context, and continue
> * to kexec.
> */
> -static asmlinkage void __noreturn hv_crash_c_entry(void)
> +static void __naked hv_crash_c_entry(void)
> {
> - struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
> -
> /* first thing, restore kernel gdt */
> - native_load_gdt(&ctxt->gdtr);
> + asm volatile("lgdt %0" : : "m" (hv_crash_ctxt.gdtr));
>
> - asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
> - asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
> + asm volatile("movw %%ax, %%ss" : : "a"(hv_crash_ctxt.ss));
> + asm volatile("movq %0, %%rsp" : : "m"(hv_crash_ctxt.rsp));
>
> - asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
> - asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
> - asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
> - asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
> + asm volatile("movw %%ax, %%ds" : : "a"(hv_crash_ctxt.ds));
> + asm volatile("movw %%ax, %%es" : : "a"(hv_crash_ctxt.es));
> + asm volatile("movw %%ax, %%fs" : : "a"(hv_crash_ctxt.fs));
> + asm volatile("movw %%ax, %%gs" : : "a"(hv_crash_ctxt.gs));
>
> - native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
> - asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
> + hv_wrmsr(MSR_IA32_CR_PAT, hv_crash_ctxt.pat);
> + asm volatile("movq %0, %%cr0" : : "r"(hv_crash_ctxt.cr0));
>
> - asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
> - asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
> - asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
> + asm volatile("movq %0, %%cr8" : : "r"(hv_crash_ctxt.cr8));
> + asm volatile("movq %0, %%cr4" : : "r"(hv_crash_ctxt.cr4));
> + asm volatile("movq %0, %%cr2" : : "r"(hv_crash_ctxt.cr4));
>
> - native_load_idt(&ctxt->idtr);
> - native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
> - native_wrmsrq(MSR_EFER, ctxt->efer);
> + asm volatile("lidt %0" : : "m" (hv_crash_ctxt.idtr));
> + hv_wrmsr(MSR_GS_BASE, hv_crash_ctxt.gsbase);
> + hv_wrmsr(MSR_EFER, hv_crash_ctxt.efer);
>
> /* restore the original kernel CS now via far return */
> - asm volatile("movzwq %0, %%rax\n\t"
> - "pushq %%rax\n\t"
> - "pushq $1f\n\t"
> - "lretq\n\t"
> - "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
> -
> - /* We are in asmlinkage without stack frame, hence make C function
> - * calls which will buy stack frames.
> - */
> - hv_crash_restore_tss();
> - hv_crash_clear_kernpt();
> -
> - /* we are now fully in devirtualized normal kernel mode */
> - __crash_kexec(NULL);
> -
> - hv_panic_timeout_reboot();
> + asm volatile("pushq %q0 \n\t"
> + "leaq %c1(%%rip), %q0 \n\t"
You can use %a1 instead of %c1(%%rip).
> + "pushq %q0 \n\t"
> + "lretq \n\t"
No need for terminating \n\t after the last insn in the asm template.
> + :: "a"(hv_crash_ctxt.cs), "i"(hv_crash_handle));
Pedantically, you need ': "+a"(...) : "i"(...)' here.
Uros.
^ permalink raw reply
* Re: [RFT PATCH] x86/hyperv: Use __naked attribute to fix stackless C function
From: Ard Biesheuvel @ 2026-02-26 10:48 UTC (permalink / raw)
To: Uros Bizjak, Ard Biesheuvel
Cc: linux-kernel, Mukesh Rathor, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Long Li, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H . Peter Anvin, linux-hyperv
In-Reply-To: <CAFULd4aSAdKV7XtASr_uQz5hA4qBbWeO-nfgKb979HkwZDbQ_w@mail.gmail.com>
Hi Uros,
On Thu, 26 Feb 2026, at 11:35, Uros Bizjak wrote:
> On Thu, Feb 26, 2026 at 10:51 AM Ard Biesheuvel <ardb+git@google.com> wrote:
>>
>> From: Ard Biesheuvel <ardb@kernel.org>
>>
>> hv_crash_c_entry() is a C function that is entered without a stack,
>> and this is only allowed for functions that have the __naked attribute,
>> which informs the compiler that it must not emit the usual prologue and
>> epilogue or emit any other kind of instrumentation that relies on a
>> stack frame.
>>
>> So split up the function, and set the __naked attribute on the initial
>> part that sets up the stack, GDT, IDT and other pieces that are needed
>> for ordinary C execution. Given that function calls are not permitted
>> either, use the existing long return coded in an asm() block to call the
>> second part of the function, which is an ordinary function that is
>> permitted to call other functions as usual.
>>
>> Fixes: 94212d34618c ("x86/hyperv: Implement hypervisor RAM collection into vmcore")
>> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
>> ---
>> Build tested only.
>>
>> Cc: Mukesh Rathor <mrathor@linux.microsoft.com>
>> Cc: "K. Y. Srinivasan" <kys@microsoft.com>
>> Cc: Haiyang Zhang <haiyangz@microsoft.com>
>> Cc: Wei Liu <wei.liu@kernel.org>
>> Cc: Dexuan Cui <decui@microsoft.com>
>> Cc: Long Li <longli@microsoft.com>
>> Cc: Thomas Gleixner <tglx@kernel.org>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Borislav Petkov <bp@alien8.de>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: Uros Bizjak <ubizjak@gmail.com>
>> Cc: linux-hyperv@vger.kernel.org
>>
>> arch/x86/hyperv/hv_crash.c | 80 ++++++++++----------
>> 1 file changed, 42 insertions(+), 38 deletions(-)
>>
>> diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
>> index a78e4fed5720..d77766e8d37e 100644
>> --- a/arch/x86/hyperv/hv_crash.c
>> +++ b/arch/x86/hyperv/hv_crash.c
>> @@ -107,14 +107,12 @@ static void __noreturn hv_panic_timeout_reboot(void)
>> cpu_relax();
>> }
>>
>> -/* This cannot be inlined as it needs stack */
>> -static noinline __noclone void hv_crash_restore_tss(void)
>> +static void hv_crash_restore_tss(void)
>> {
>> load_TR_desc();
>> }
>>
>> -/* This cannot be inlined as it needs stack */
>> -static noinline void hv_crash_clear_kernpt(void)
>> +static void hv_crash_clear_kernpt(void)
>> {
>> pgd_t *pgd;
>> p4d_t *p4d;
>> @@ -125,6 +123,25 @@ static noinline void hv_crash_clear_kernpt(void)
>> native_p4d_clear(p4d);
>> }
>>
>> +
>> +static void __noreturn hv_crash_handle(void)
>> +{
>> + hv_crash_restore_tss();
>> + hv_crash_clear_kernpt();
>> +
>> + /* we are now fully in devirtualized normal kernel mode */
>> + __crash_kexec(NULL);
>> +
>> + hv_panic_timeout_reboot();
>> +}
>> +
>> +/*
>> + * __naked functions do not permit function calls, not even to __always_inline
>> + * functions that only contain asm() blocks themselves. So use a macro instead.
>> + */
>> +#define hv_wrmsr(msr, val) \
>> + asm("wrmsr" :: "c"(msr), "a"((u32)val), "d"((u32)(val >> 32)) : "memory")
>> +
>> /*
>> * This is the C entry point from the asm glue code after the disable hypercall.
>> * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
>> @@ -133,49 +150,36 @@ static noinline void hv_crash_clear_kernpt(void)
>> * available. We restore kernel GDT, and rest of the context, and continue
>> * to kexec.
>> */
>> -static asmlinkage void __noreturn hv_crash_c_entry(void)
>> +static void __naked hv_crash_c_entry(void)
>> {
>> - struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
>> -
>> /* first thing, restore kernel gdt */
>> - native_load_gdt(&ctxt->gdtr);
>> + asm volatile("lgdt %0" : : "m" (hv_crash_ctxt.gdtr));
>>
>> - asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
>> - asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
>> + asm volatile("movw %%ax, %%ss" : : "a"(hv_crash_ctxt.ss));
>> + asm volatile("movq %0, %%rsp" : : "m"(hv_crash_ctxt.rsp));
>>
>> - asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
>> - asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
>> - asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
>> - asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
>> + asm volatile("movw %%ax, %%ds" : : "a"(hv_crash_ctxt.ds));
>> + asm volatile("movw %%ax, %%es" : : "a"(hv_crash_ctxt.es));
>> + asm volatile("movw %%ax, %%fs" : : "a"(hv_crash_ctxt.fs));
>> + asm volatile("movw %%ax, %%gs" : : "a"(hv_crash_ctxt.gs));
>>
>> - native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
>> - asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
>> + hv_wrmsr(MSR_IA32_CR_PAT, hv_crash_ctxt.pat);
>> + asm volatile("movq %0, %%cr0" : : "r"(hv_crash_ctxt.cr0));
>>
>> - asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
>> - asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
>> - asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
>> + asm volatile("movq %0, %%cr8" : : "r"(hv_crash_ctxt.cr8));
>> + asm volatile("movq %0, %%cr4" : : "r"(hv_crash_ctxt.cr4));
>> + asm volatile("movq %0, %%cr2" : : "r"(hv_crash_ctxt.cr4));
>>
>> - native_load_idt(&ctxt->idtr);
>> - native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
>> - native_wrmsrq(MSR_EFER, ctxt->efer);
>> + asm volatile("lidt %0" : : "m" (hv_crash_ctxt.idtr));
>> + hv_wrmsr(MSR_GS_BASE, hv_crash_ctxt.gsbase);
>> + hv_wrmsr(MSR_EFER, hv_crash_ctxt.efer);
>>
>> /* restore the original kernel CS now via far return */
>> - asm volatile("movzwq %0, %%rax\n\t"
>> - "pushq %%rax\n\t"
>> - "pushq $1f\n\t"
>> - "lretq\n\t"
>> - "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
>> -
>> - /* We are in asmlinkage without stack frame, hence make C function
>> - * calls which will buy stack frames.
>> - */
>> - hv_crash_restore_tss();
>> - hv_crash_clear_kernpt();
>> -
>> - /* we are now fully in devirtualized normal kernel mode */
>> - __crash_kexec(NULL);
>> -
>> - hv_panic_timeout_reboot();
>> + asm volatile("pushq %q0 \n\t"
>> + "leaq %c1(%%rip), %q0 \n\t"
>
> You can use %a1 instead of %c1(%%rip).
>
Nice.
>> + "pushq %q0 \n\t"
>> + "lretq \n\t"
>
> No need for terminating \n\t after the last insn in the asm template.
>
>> + :: "a"(hv_crash_ctxt.cs), "i"(hv_crash_handle));
>
> Pedantically, you need ': "+a"(...) : "i"(...)' here.
>
Right, so the compiler knows that the register will be updated by the asm() block. But what prevents it from writing this value back to hv_crash_ctxt.cs? The generated code doesn't seem to do so, but, as I understand it, the semantics of a read-write operand ("+a" here, or "+r" in general) suggest that it could.
The code following the asm() block is unreachable anyway, so it doesn't really matter either way in practice. Just curious ...
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox