* [PATCH v2 09/18] mshv: Consolidate irqfd interrupt injection paths
From: Stanislav Kinsburskii @ 2026-05-02 4:28 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
The irqfd interrupt injection had duplicated seqcount reads and
inconsistent validity checks between the fast and slow paths:
1. The wakeup handler snapshots irqfd_lapic_irq under seqcount, then on
fast-path failure calls mshv_assert_irq_slow() which re-reads both
girq_ent and lapic_irq under seqcount again — wasteful and confusing.
2. The validity check (girq_entry_valid) only existed in the slow path.
The fast path would blindly accept a zero-initialized structure when
routing was not configured, potentially injecting vector 0 to VP 0.
3. The condition 'girq_ent.guest_irq_num && !girq_ent.girq_entry_valid'
short-circuits for GSI 0 (guest_irq_num == 0), bypassing the
validity check entirely.
4. mshv_irqfd_resampler_ack() reads irqfd_lapic_irq.lapic_control
without seqcount protection, allowing torn reads when racing with
mshv_irqfd_update().
Consolidate by:
- Moving the seqcount snapshot and validity check into the wakeup
handler, performed once before either injection path.
- Changing mshv_assert_irq_slow() to accept a pre-snapshotted
const struct mshv_lapic_irq pointer, eliminating its internal
seqcount read and SRCU lock/unlock.
- Using !girq_entry_valid alone as the validity condition, fixing the
GSI 0 bypass.
- Adding seqcount protection in mshv_irqfd_resampler_ack() to prevent
torn reads of interrupt_type.
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_eventfd.c | 62 +++++++++++++++++++++++++--------------------
1 file changed, 34 insertions(+), 28 deletions(-)
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 7275b9eaa7541..b92e7f05aa9cd 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -90,7 +90,15 @@ static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list,
irqfd_resampler_hnode,
srcu_read_lock_held(&partition->pt_irq_srcu)) {
- if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
+ struct mshv_lapic_irq irq;
+ unsigned int seq;
+
+ do {
+ seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ irq = irqfd->irqfd_lapic_irq;
+ } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+
+ if (hv_should_clear_interrupt(irq.lapic_control.interrupt_type))
hv_call_clear_virtual_interrupt(partition->pt_id);
eventfd_signal(irqfd->irqfd_resamplefd);
@@ -198,36 +206,19 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd,
}
#endif
-static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
+static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd,
+ const struct mshv_lapic_irq *irq)
{
struct mshv_partition *partition = irqfd->irqfd_partn;
- struct mshv_guest_irq_ent girq_ent;
- struct mshv_lapic_irq irq;
- unsigned int seq;
- int idx;
-
- idx = srcu_read_lock(&partition->pt_irq_srcu);
-
- do {
- seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
- girq_ent = irqfd->irqfd_girq_ent;
- irq = irqfd->irqfd_lapic_irq;
- } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
-
- if (girq_ent.guest_irq_num && !girq_ent.girq_entry_valid) {
- srcu_read_unlock(&partition->pt_irq_srcu, idx);
- return;
- }
#if IS_ENABLED(CONFIG_X86)
WARN_ON(irqfd->irqfd_resampler &&
- !irq.lapic_control.level_triggered);
+ !irq->lapic_control.level_triggered);
#endif
hv_call_assert_virtual_interrupt(partition->pt_id,
- irq.lapic_vector, irq.lapic_apic_id,
- irq.lapic_control);
- srcu_read_unlock(&partition->pt_irq_srcu, idx);
+ irq->lapic_vector, irq->lapic_apic_id,
+ irq->lapic_control);
}
static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
@@ -316,6 +307,7 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
int ret = 0;
if (flags & EPOLLIN) {
+ struct mshv_guest_irq_ent girq_ent;
struct mshv_lapic_irq irq;
u64 cnt;
@@ -323,14 +315,18 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
idx = srcu_read_lock(&pt->pt_irq_srcu);
do {
seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ girq_ent = irqfd->irqfd_girq_ent;
irq = irqfd->irqfd_lapic_irq;
} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+ if (!girq_ent.girq_entry_valid)
+ goto out_unlock;
+
/* An event has been signaled, raise an interrupt */
- ret = mshv_try_assert_irq_fast(irqfd, &irq);
- if (ret)
- mshv_assert_irq_slow(irqfd);
+ if (mshv_try_assert_irq_fast(irqfd, &irq))
+ mshv_assert_irq_slow(irqfd, &irq);
+out_unlock:
srcu_read_unlock(&pt->pt_irq_srcu, idx);
ret = 1;
@@ -520,8 +516,18 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
*/
events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);
- if (events & EPOLLIN)
- mshv_assert_irq_slow(irqfd);
+ if (events & EPOLLIN) {
+ struct mshv_lapic_irq irq;
+ unsigned int seq;
+
+ do {
+ seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ irq = irqfd->irqfd_lapic_irq;
+ } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+
+ if (irqfd->irqfd_girq_ent.girq_entry_valid)
+ mshv_assert_irq_slow(irqfd, &irq);
+ }
srcu_read_unlock(&pt->pt_irq_srcu, idx);
return 0;
^ permalink raw reply related
* [PATCH v2 08/18] mshv: Fix broken seqcount read protection
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
mshv_irqfd_update() writes both irqfd_girq_ent and irqfd_lapic_irq as a
logical unit under seqcount write protection. Readers must snapshot these
fields inside the seqcount begin/retry loop to obtain a consistent
point-in-time view — otherwise a concurrent update can produce a torn
read where one field comes from the old state and the other from the new.
Both mshv_assert_irq_slow() and mshv_irqfd_wakeup() get this wrong: the
seqcount loop bodies are empty (just spinning until a stable sequence is
observed), and all reads of the protected fields happen after the loop
with no protection from concurrent writes. If mshv_irqfd_update() races
with interrupt assertion, the caller may use a stale or mixed
vector/apic_id/control combination — delivering an interrupt to the
wrong vCPU, with the wrong vector, or with the wrong trigger mode. This
can cause spurious or lost interrupts in the guest, or a stuck interrupt
line in the level-triggered case.
Fix mshv_assert_irq_slow() by snapshotting both irqfd_girq_ent and
irqfd_lapic_irq into local variables inside the seqcount loop, then
using those locals for the validity check and the hypercall.
Fix mshv_irqfd_wakeup() by snapshotting irqfd_lapic_irq inside its
seqcount loop and passing the snapshot to mshv_try_assert_irq_fast(),
so the fast path operates on the consistent copy rather than reading
the field directly outside seqcount protection.
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_eventfd.c | 47 +++++++++++++++++++++++++--------------------
1 file changed, 26 insertions(+), 21 deletions(-)
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 509911ffcbeee..7275b9eaa7541 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -151,10 +151,10 @@ static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
* Try to raise irq for guest via shared vector array. hyp does the actual
* inject of the interrupt.
*/
-static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
+static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd,
+ const struct mshv_lapic_irq *irq)
{
struct mshv_partition *partition = irqfd->irqfd_partn;
- struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
struct mshv_vp *vp;
if (!(ms_hyperv.ext_features &
@@ -191,7 +191,8 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
return 0;
}
#else /* CONFIG_X86_64 */
-static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
+static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd,
+ const struct mshv_lapic_irq *irq)
{
return -EOPNOTSUPP;
}
@@ -200,30 +201,32 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
{
struct mshv_partition *partition = irqfd->irqfd_partn;
- struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
+ struct mshv_guest_irq_ent girq_ent;
+ struct mshv_lapic_irq irq;
unsigned int seq;
int idx;
-#if IS_ENABLED(CONFIG_X86)
- WARN_ON(irqfd->irqfd_resampler &&
- !irq->lapic_control.level_triggered);
-#endif
-
idx = srcu_read_lock(&partition->pt_irq_srcu);
- if (irqfd->irqfd_girq_ent.guest_irq_num) {
- if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
- srcu_read_unlock(&partition->pt_irq_srcu, idx);
- return;
- }
- do {
- seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
- } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+ do {
+ seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ girq_ent = irqfd->irqfd_girq_ent;
+ irq = irqfd->irqfd_lapic_irq;
+ } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+
+ if (girq_ent.guest_irq_num && !girq_ent.girq_entry_valid) {
+ srcu_read_unlock(&partition->pt_irq_srcu, idx);
+ return;
}
- hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
- irq->lapic_vector, irq->lapic_apic_id,
- irq->lapic_control);
+#if IS_ENABLED(CONFIG_X86)
+ WARN_ON(irqfd->irqfd_resampler &&
+ !irq.lapic_control.level_triggered);
+#endif
+
+ hv_call_assert_virtual_interrupt(partition->pt_id,
+ irq.lapic_vector, irq.lapic_apic_id,
+ irq.lapic_control);
srcu_read_unlock(&partition->pt_irq_srcu, idx);
}
@@ -313,16 +316,18 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
int ret = 0;
if (flags & EPOLLIN) {
+ struct mshv_lapic_irq irq;
u64 cnt;
eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
idx = srcu_read_lock(&pt->pt_irq_srcu);
do {
seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ irq = irqfd->irqfd_lapic_irq;
} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
/* An event has been signaled, raise an interrupt */
- ret = mshv_try_assert_irq_fast(irqfd);
+ ret = mshv_try_assert_irq_fast(irqfd, &irq);
if (ret)
mshv_assert_irq_slow(irqfd);
^ permalink raw reply related
* [PATCH v2 07/18] mshv: Add NULL check for vp in mshv_try_assert_irq_fast
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
mshv_try_assert_irq_fast() dereferences the vp pointer obtained from
pt_vp_array[lapic_apic_id] without checking for NULL or validating that
lapic_apic_id is within bounds. A spurious interrupt from the hypervisor
targeting a non-existent VP (or one not yet created) causes a NULL
pointer dereference and crashes the host.
Add a bounds check on lapic_apic_id against MSHV_MAX_VPS and a NULL
check on the vp pointer before dereferencing.
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_eventfd.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 3ab6338064237..509911ffcbeee 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -169,7 +169,12 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
return -EOPNOTSUPP;
#endif
+ if (irq->lapic_apic_id >= MSHV_MAX_VPS)
+ return -EINVAL;
+
vp = partition->pt_vp_array[irq->lapic_apic_id];
+ if (!vp)
+ return -EINVAL;
if (!vp->vp_register_page)
return -EOPNOTSUPP;
^ permalink raw reply related
* [PATCH v2 06/18] mshv: Add defensive synchronize_srcu in irqfd shutdown
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
mshv_irqfd_assign() adds the irqfd to the partition's hlist and then
registers the wait entry on the eventfd waitqueue via vfs_poll(). A
narrow window exists between these two operations where the irqfd is
visible to deactivation paths but the wait entry is not yet initialized
on the waitqueue.
Currently this is not reachable because mshv_irqfd_assign() and
mshv_irqfd_deassign() are serialized by the partition mutex, and the
EPOLLHUP wakeup path can only fire after vfs_poll() has registered the
wait entry. However, if future refactoring removes or relaxes that
serialization, mshv_irqfd_shutdown() could call
eventfd_ctx_remove_wait_queue() before the wait entry is on the queue,
causing a NULL pointer dereference (the list_head is zeroed by kzalloc
and not initialized by init_waitqueue_func_entry()).
Add synchronize_srcu_expedited() at the start of mshv_irqfd_shutdown()
as a defensive measure, ensuring the assignment path's SRCU read-side
section (which covers vfs_poll() registration) has completed. This
follows the pattern established by KVM in irqfd_shutdown().
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_eventfd.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 5995a62aff8d8..3ab6338064237 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -248,8 +248,12 @@ static void mshv_irqfd_shutdown(struct work_struct *work)
{
struct mshv_irqfd *irqfd =
container_of(work, struct mshv_irqfd, irqfd_shutdown);
+ struct mshv_partition *pt = irqfd->irqfd_partn;
u64 cnt;
+ /* Make sure irqfd has been initialized in assign path. */
+ synchronize_srcu_expedited(&pt->pt_irq_srcu);
+
/*
* Synchronize with the wait-queue and unhook ourselves to prevent
* further events.
^ permalink raw reply related
* [PATCH v2 05/18] mshv: Fix race in mshv_irqfd_deassign
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
mshv_irqfd_deactivate() and the hlist traversal of pt_irqfds_list
require pt->pt_irqfds_lock to be held, but mshv_irqfd_deassign()
omits it. This races with the EPOLLHUP path in mshv_irqfd_wakeup(),
which does take the lock before calling mshv_irqfd_deactivate().
Additionally, mshv_irqfd_deactivate() uses hlist_del() which poisons
the node pointers rather than resetting them. Since
mshv_irqfd_is_active() relies on hlist_unhashed() (checks pprev ==
NULL), a poisoned node still appears active. If a concurrent path calls
mshv_irqfd_deactivate() again on the same irqfd, the guard fails to
prevent a double hlist_del() on poisoned pointers.
Fix both issues:
- Add the missing spin_lock_irq/spin_unlock_irq around the list
traversal in mshv_irqfd_deassign(), matching mshv_irqfd_release().
- Use hlist_del_init() instead of hlist_del() so the node is properly
marked as unhashed after removal, making the is_active guard reliable.
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_eventfd.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 90959f639dc32..5995a62aff8d8 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -284,7 +284,7 @@ static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
if (!mshv_irqfd_is_active(irqfd))
return;
- hlist_del(&irqfd->irqfd_hnode);
+ hlist_del_init(&irqfd->irqfd_hnode);
queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
}
@@ -541,13 +541,14 @@ static int mshv_irqfd_deassign(struct mshv_partition *pt,
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
+ spin_lock_irq(&pt->pt_irqfds_lock);
hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
irqfd_hnode) {
if (irqfd->irqfd_eventfd_ctx == eventfd &&
irqfd->irqfd_irqnum == args->gsi)
-
mshv_irqfd_deactivate(irqfd);
}
+ spin_unlock_irq(&pt->pt_irqfds_lock);
eventfd_ctx_put(eventfd);
^ permalink raw reply related
* [PATCH v2 04/18] mshv: Fix potential u64 overflow in region overlap check
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
mshv_partition_create_region() checks for overlapping guest memory
regions using arithmetic that can overflow u64:
mem->guest_pfn + nr_pages <= rg->start_gfn
If guest_pfn is near U64_MAX, the addition wraps around to a small
value, causing the overlap check to incorrectly pass. This could allow
creation of overlapping regions.
Fix by validating the sum with check_add_overflow() before the loop and
using the pre-computed end_gfn in the comparison.
Fixes: f91bc8f61abf ("mshv: Allow mappings that overlap in uaddr")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_root_main.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index aa0f452aa17c1..2b7d56e108bad 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1290,11 +1290,15 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
{
struct mshv_mem_region *rg;
u64 nr_pages = HVPFN_DOWN(mem->size);
+ u64 end_gfn;
+
+ if (check_add_overflow(mem->guest_pfn, nr_pages, &end_gfn))
+ return -EINVAL;
/* Reject overlapping regions */
spin_lock(&partition->pt_mem_regions_lock);
hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
- if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
+ if (end_gfn <= rg->start_gfn ||
rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
continue;
spin_unlock(&partition->pt_mem_regions_lock);
^ permalink raw reply related
* [PATCH v2 03/18] mshv: Fix mshv_prepare_pinned_region error path for unencrypted partitions
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
mshv_prepare_pinned_region() returns 0 (success) when mshv_region_map()
fails on an unencrypted partition. The condition on the error path:
if (ret && mshv_partition_encrypted(partition))
only handles map failures for encrypted partitions — if the partition is
not encrypted and the map fails, execution falls through to 'return 0',
silently ignoring the error.
Fix by returning immediately on success and falling through to the
cleanup path on failure. For encrypted partitions, attempt to re-share
the region before invalidating. For unencrypted partitions, proceed
directly to invalidation and error return.
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_root_main.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 665d565899c15..aa0f452aa17c1 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1365,7 +1365,13 @@ static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
}
ret = mshv_region_map(region);
- if (ret && mshv_partition_encrypted(partition)) {
+ if (ret)
+ goto share_region;
+
+ return 0;
+
+share_region:
+ if (mshv_partition_encrypted(partition)) {
int shrc;
shrc = mshv_region_share(region);
@@ -1381,9 +1387,6 @@ static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
*/
goto err_out;
}
-
- return 0;
-
invalidate_region:
mshv_region_invalidate(region);
err_out:
^ permalink raw reply related
* [PATCH v2 02/18] mshv: Fix potential integer overflow in mshv_region_create
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
The allocation size is computed as:
sizeof(*region) + sizeof(struct page *) * nr_pages
where nr_pages is a u64 originating from userspace. A sufficiently
large nr_pages can overflow the multiplication, resulting in a small
allocation followed by out-of-bounds writes when populating mreg_pages.
Use struct_size() which returns SIZE_MAX on overflow, causing vzalloc
to safely return NULL — caught by the existing error check.
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_regions.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index fdffd4f002f6f..1d04a97980b8b 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -177,7 +177,7 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
{
struct mshv_mem_region *region;
- region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
+ region = vzalloc(struct_size(region, mreg_pages, nr_pages));
if (!region)
return ERR_PTR(-ENOMEM);
^ permalink raw reply related
* [PATCH v2 01/18] mshv: Fix IRQ leak and type hazards in hv_call_modify_spa_host_access
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177769588777.222166.3414280094142944420.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
The bounds check inside the PFN-filling loop can return -EINVAL while
interrupts are disabled via local_irq_save(), skipping the matching
local_irq_restore() and leaving interrupts disabled on that CPU.
Remove the check — it is redundant: the loop only runs while
done + i < page_count, and page_count == page_struct_count >> large_shift,
so (done + i) << large_shift < page_struct_count always holds.
While here, fix type mismatches: change 'int done' to 'u64 done' and
use u64 for loop and batch-size variables so they match the u64
page_count they are compared against.
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_root_hv_call.c | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index 129456bd72aba..cc580225e9e45 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -1042,7 +1042,7 @@ int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
{
struct hv_input_modify_sparse_spa_page_host_access *input_page;
u64 status;
- int done = 0;
+ u64 done = 0;
unsigned long irq_flags, large_shift = 0;
u64 page_count = page_struct_count;
u16 code = acquire ? HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS :
@@ -1059,9 +1059,9 @@ int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
}
while (done < page_count) {
- ulong i, completed, remain = page_count - done;
- int rep_count = min(remain,
- HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT);
+ u64 i, completed, remain = page_count - done;
+ u64 rep_count = min_t(u64, remain,
+ HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT);
local_irq_save(irq_flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -1075,15 +1075,9 @@ int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
input_page->flags = flags;
input_page->host_access = host_access;
- for (i = 0; i < rep_count; i++) {
- u64 index = (done + i) << large_shift;
-
- if (index >= page_struct_count)
- return -EINVAL;
-
+ for (i = 0; i < rep_count; i++)
input_page->spa_page_list[i] =
- page_to_pfn(pages[index]);
- }
+ page_to_pfn(pages[(done + i) << large_shift]);
status = hv_do_rep_hypercall(code, rep_count, 0, input_page,
NULL);
^ permalink raw reply related
* [PATCH v2 00/18] mshv: Bug fixes across the mshv_root module
From: Stanislav Kinsburskii @ 2026-05-02 4:27 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
This series addresses bugs found during a continued review of the
mshv_root module introduced by commit 621191d709b14 ("Drivers: hv:
Introduce mshv_root module to expose /dev/mshv to VMMs").
Changes since v1:
- Added 8 new patches addressing issues found by Sashiko (automated
review) covering the irqfd, portid, scheduler message, and VP
lifecycle paths.
- Consolidated the irqfd fast/slow injection paths to eliminate
duplicated seqcount reads and fix the GSI 0 validity bypass.
- Added memory ordering for the lockless VP array.
The fixes range from data corruption and use-after-free to silent
functional failures and sleeping-while-atomic:
Memory region management:
- Integer overflow on userspace-controlled allocation size
(mshv_region_create)
- Silent success on map failure for unencrypted partitions
(mshv_prepare_pinned_region)
- u64 overflow in region overlap check allowing overlapping mappings
IRQ/eventfd path:
- IRQ state leak and type truncation in hypercall helpers
- Missing locking and hlist_del vs hlist_del_init race in irqfd
deassign
- Defensive synchronize_srcu in irqfd shutdown (follows KVM pattern)
- NULL pointer dereference on spurious interrupt to non-existent VP
(mshv_try_assert_irq_fast)
- Broken seqcount read protection — torn reads of interrupt routing
- Duplicated and inconsistent validity checks between fast/slow
injection paths; fast path could inject vector 0 spuriously
- Level-triggered check on uninitialized data making interrupt
resampling completely non-functional
- Duplicate GSI 0 detection using the wrong predicate
Port ID table:
- Use-after-RCU in mshv_portid_lookup (dereference outside read-side
critical section)
- Sleeping under spinlock in mshv_portid_alloc (GFP_KERNEL inside
idr_lock)
- Use kfree_rcu for deferred free without blocking
SynIC / ISR paths:
- Missing VP index bounds check in intercept ISR (OOB in interrupt
context from untrusted hypervisor data)
- Missing store/load ordering for VP array publish — lockless ISR
readers could observe partially-initialized VP
- Missing bounds validation in scheduler messages
(handle_pair_message vp_count, handle_bitset_message bank_mask)
Miscellaneous:
- Missing error code on VP allocation failure (silent success to
userspace)
Kudos to Claude and Sashiko for assisting with analysis and
implementation.
---
Stanislav Kinsburskii (18):
mshv: Fix IRQ leak and type hazards in hv_call_modify_spa_host_access
mshv: Fix potential integer overflow in mshv_region_create
mshv: Fix mshv_prepare_pinned_region error path for unencrypted partitions
mshv: Fix potential u64 overflow in region overlap check
mshv: Fix race in mshv_irqfd_deassign
mshv: Add defensive synchronize_srcu in irqfd shutdown
mshv: Add NULL check for vp in mshv_try_assert_irq_fast
mshv: Fix broken seqcount read protection
mshv: Consolidate irqfd interrupt injection paths
mshv: Fix level-triggered check on uninitialized data
mshv: Fix duplicate GSI detection for GSI 0
mshv: Fix use-after-RCU in mshv_portid_lookup
mshv: Fix sleeping under spinlock in mshv_portid_alloc
mshv: Use kfree_rcu in mshv_portid_free
mshv: Add missing vp_index bounds check in intercept ISR
mshv: Add store/load ordering for VP array publish
mshv: Validate scheduler message bounds from hypervisor
mshv: Fix missing error code on VP allocation failure
drivers/hv/mshv_eventfd.c | 104 +++++++++++++++++++++++++---------------
drivers/hv/mshv_irq.c | 2 -
drivers/hv/mshv_portid_table.c | 12 ++---
drivers/hv/mshv_regions.c | 2 -
drivers/hv/mshv_root_hv_call.c | 18 ++-----
drivers/hv/mshv_root_main.c | 24 +++++++--
drivers/hv/mshv_synic.c | 34 ++++++++++---
7 files changed, 122 insertions(+), 74 deletions(-)
^ permalink raw reply
* Re: [PATCH net, v2] net: mana: Fix crash from unvalidated SHM offset read from BAR0 during FLR
From: Jakub Kicinski @ 2026-05-02 1:53 UTC (permalink / raw)
To: Dipayaan Roy
Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <afJUszROT+yKjth0@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>
On Wed, 29 Apr 2026 11:57:55 -0700 Dipayaan Roy wrote:
> During Function Level Reset recovery, the MANA driver reads
> hardware BAR0 registers that may temporarily contain garbage values.
> The SHM (Shared Memory) offset read from GDMA_REG_SHM_OFFSET is used
> to compute gc->shm_base, which is later dereferenced via readl() in
> mana_smc_poll_register(). If the hardware returns an unaligned or
> out-of-range value, the driver must not blindly use it, as this would
> propagate the hardware error into a kernel crash.
>
> The following crash was observed on an arm64 Hyper-V guest running
> kernel 6.17.0-3013-azure during VF reset recovery triggered by HWC
> timeout.
>
> [13291.785274] Unable to handle kernel paging request at virtual address ffff8000a200001b
> [13291.785311] Mem abort info:
> [13291.785332] ESR = 0x0000000096000021
> [13291.785343] EC = 0x25: DABT (current EL), IL = 32 bits
> [13291.785355] SET = 0, FnV = 0
> [13291.785363] EA = 0, S1PTW = 0
> [13291.785372] FSC = 0x21: alignment fault
> [13291.785382] Data abort info:
> [13291.785391] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000
> [13291.785404] CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> [13291.785412] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> [13291.785421] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000014df3a1000
> [13291.785432] [ffff8000a200001b] pgd=1000000100438403, p4d=1000000100438403, pud=1000000100439403, pmd=0068000fc2000711
> [13291.785703] Internal error: Oops: 0000000096000021 [#1] SMP
> [13291.830975] Modules linked in: tls qrtr mana_ib ib_uverbs ib_core xt_owner xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables cfg80211 8021q garp mrp stp llc binfmt_misc joydev serio_raw nls_iso8859_1 hid_generic aes_ce_blk aes_ce_cipher polyval_ce ghash_ce sm4_ce_gcm sm4_ce_ccm sm4_ce sm4_ce_cipher hid_hyperv sm4 sm3_ce sha3_ce hv_netvsc hid vmgenid hyperv_keyboard hyperv_drm sch_fq_codel nvme_fabrics efi_pstore dm_multipath nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vmw_vmci vsock dmi_sysfs ip_tables x_tables autofs4
> [13291.862630] CPU: 122 UID: 0 PID: 61796 Comm: kworker/122:2 Tainted: G W 6.17.0-3013-azure #13-Ubuntu VOLUNTARY
> [13291.869902] Tainted: [W]=WARN
> [13291.871901] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 01/08/2026
> [13291.878086] Workqueue: events mana_serv_func
> [13291.880718] pstate: 62400005 (nZCv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--)
> [13291.884835] pc : mana_smc_poll_register+0x48/0xb0
> [13291.887902] lr : mana_smc_setup_hwc+0x70/0x1c0
> [13291.890493] sp : ffff8000ab79bbb0
> [13291.892364] x29: ffff8000ab79bbb0 x28: ffff00410c8b5900 x27: ffff00410d630680
> [13291.896252] x26: ffff004171f9fd80 x25: 000000016ed55000 x24: 000000017f37e000
> [13291.899990] x23: 0000000000000000 x22: 000000016ed55000 x21: 0000000000000000
> [13291.904497] x20: ffff8000a200001b x19: 0000000000004e20 x18: ffff8000a6183050
> [13291.908308] x17: 0000000000000000 x16: 0000000000000000 x15: 000000000000000a
> [13291.912542] x14: 0000000000000004 x13: 0000000000000000 x12: 0000000000000000
> [13291.916298] x11: 0000000000000000 x10: 0000000000000001 x9 : ffffc45006af1bd8
> [13291.920945] x8 : ffff000151129000 x7 : 0000000000000000 x6 : 0000000000000000
> [13291.925293] x5 : 000000015f214000 x4 : 000000017217a000 x3 : 000000016ed50000
> [13291.930436] x2 : 000000016ed55000 x1 : 0000000000000000 x0 : ffff8000a1ffffff
> [13291.934342] Call trace:
> [13291.935736] mana_smc_poll_register+0x48/0xb0 (P)
> [13291.938611] mana_smc_setup_hwc+0x70/0x1c0
> [13291.941113] mana_hwc_create_channel+0x1a0/0x3a0
> [13291.944283] mana_gd_setup+0x16c/0x398
> [13291.946584] mana_gd_resume+0x24/0x70
> [13291.948917] mana_do_service+0x13c/0x1d0
> [13291.951583] mana_serv_func+0x34/0x68
> [13291.953732] process_one_work+0x168/0x3d0
> [13291.956745] worker_thread+0x2ac/0x480
> [13291.959104] kthread+0xf8/0x110
> [13291.961026] ret_from_fork+0x10/0x20
> [13291.963560] Code: d2807d00 9417c551 71000673 54000220 (b9400281)
> [13291.967299] ---[ end trace 0000000000000000 ]---
>
> Disassembly of mana_smc_poll_register() around the crash site:
>
> Disassembly of section .text:
>
> 00000000000047c8 <mana_smc_poll_register>:
> 47c8: d503201f nop
> 47cc: d503201f nop
> 47d0: d503233f paciasp
> 47d4: f800865e str x30, [x18], #8
> 47d8: a9bd7bfd stp x29, x30, [sp, #-48]!
> 47dc: 910003fd mov x29, sp
> 47e0: a90153f3 stp x19, x20, [sp, #16]
> 47e4: 91007014 add x20, x0, #0x1c
> 47e8: 5289c413 mov w19, #0x4e20
> 47ec: f90013f5 str x21, [sp, #32]
> 47f0: 12001c35 and w21, w1, #0xff
> 47f4: 14000008 b 4814 <mana_smc_poll_register+0x4c>
> 47f8: 36f801e1 tbz w1, #31, 4834 <mana_smc_poll_register+0x6c>
> 47fc: 52800042 mov w2, #0x2
> 4800: d280fa01 mov x1, #0x7d0
> 4804: d2807d00 mov x0, #0x3e8
> 4808: 94000000 bl 0 <usleep_range_state>
> 480c: 71000673 subs w19, w19, #0x1
> 4810: 54000200 b.eq 4850 <mana_smc_poll_register+0x88>
> 4814: b9400281 ldr w1, [x20] <-- **** CRASHED HERE *****
> 4818: d50331bf dmb oshld
> 481c: 2a0103e2 mov w2, w1
> ...
>
> From the crash signature x20 = ffff8000a200001b, this address
> ends in 0x1b which is not 4-byte aligned, so the 'ldr w1, [x20]'
> instruction (readl) triggers the arm64 alignment fault (FSC = 0x21).
>
> The root cause is in mana_gd_init_vf_regs(), which computes:
>
> gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
>
> The offset is used without any validation. The same problem exists
> in mana_gd_init_pf_regs() for sriov_base_off and sriov_shm_off.
>
> Fix this by validating all offsets before use:
>
> - VF: check shm_off is within BAR0, properly aligned to 4 bytes
> (readl requirement), and leaves room for the full 256-bit
> (32-byte) SMC aperture.
>
> - PF: check sriov_base_off is within BAR0, aligned to 8 bytes
> (readq requirement), and leaves room to safely read the
> sriov_shm_off register at sriov_base_off + GDMA_PF_REG_SHM_OFF.
> Then check sriov_shm_off leaves room for the full SMC aperture.
> All arithmetic uses subtraction rather than addition to avoid
> integer overflow on garbage firmware values.
>
> Previously these offsets were used without validating the values read
> from hardware. If a register returns a garbage value that is not within
> the BAR0 bounds or not aligned to the 4-byte granularity, the subsequent
> readl() triggers the alignment fault.
>
> Define SMC_APERTURE_SIZE (32 bytes, derived from the 256-bit aperture
> width) in shm_channel.h for use in these checks.
>
> Return -EPROTO on invalid values. The existing recovery path in
> mana_serv_reset() already handles -EPROTO by falling through to PCI
> device rescan, giving the hardware another chance to present valid
> register values after reset.
>
> Fixes: 9bf66036d686 ("net: mana: Handle hardware recovery events when probing the device")
> Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
>
> ---
> Changes in v2:
> - Fix sriov_base_off alignment check: sizeof(u32) to sizeof(u64), since
> mana_gd_r64() (readq) requires 8-byte alignment on arm64.
> - Fix sriov_base_off bounds: also verify enough space remains in BAR0
> to safely read sriov_shm_off at offset GDMA_PF_REG_SHM_OFF + 8 bytes.
> - Fix integer overflow: rewrite bounds checks using subtraction
> (remaining = bar0_size - base) instead of addition.
> - Fix SMC aperture size: add gc->bar0_size - shm_off < SMC_APERTURE_SIZE
> checks in both VF and PF paths; previously only the start address was
> validated, but mana_smc_poll_register() accesses up to shm_base + 0x1c
> (28 bytes from base, 32 bytes total).
> - Export SMC_APERTURE_SIZE to shm_channel.h.
> ---
> .../net/ethernet/microsoft/mana/gdma_main.c | 40 ++++++++++++++++---
> include/net/mana/shm_channel.h | 6 +++
> 2 files changed, 41 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 098fbda0d128..d8e816882f02 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -43,8 +43,9 @@ static u64 mana_gd_r64(struct gdma_context *g, u64 offset)
> static int mana_gd_init_pf_regs(struct pci_dev *pdev)
> {
> struct gdma_context *gc = pci_get_drvdata(pdev);
> - void __iomem *sriov_base_va;
> + u64 remaining_barsize;
> u64 sriov_base_off;
> + u64 sriov_shm_off;
>
> gc->db_page_size = mana_gd_r32(gc, GDMA_PF_REG_DB_PAGE_SIZE) & 0xFFFF;
>
> @@ -73,10 +74,28 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev)
> gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off;
>
> sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF);
> + if (sriov_base_off >= gc->bar0_size ||
> + gc->bar0_size - sriov_base_off <
> + GDMA_PF_REG_SHM_OFF + sizeof(u64) ||
nit: fits on a single line, I think?
> + !IS_ALIGNED(sriov_base_off, sizeof(u64))) {
> + dev_err(gc->dev,
> + "SRIOV base offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
> + sriov_base_off, (u64)gc->bar0_size);
> + return -EPROTO;
> + }
>
> - sriov_base_va = gc->bar0_va + sriov_base_off;
> - gc->shm_base = sriov_base_va +
> - mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF);
> + remaining_barsize = gc->bar0_size - sriov_base_off;
> + sriov_shm_off = mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF);
> + if (sriov_shm_off >= remaining_barsize ||
> + remaining_barsize - sriov_shm_off < SMC_APERTURE_SIZE ||
> + !IS_ALIGNED(sriov_shm_off, sizeof(u32))) {
> + dev_err(gc->dev,
> + "SRIOV SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
> + sriov_shm_off, (u64)gc->bar0_size);
> + return -EPROTO;
> + }
> +
> + gc->shm_base = gc->bar0_va + sriov_base_off + sriov_shm_off;
>
> return 0;
> }
> @@ -84,6 +103,7 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev)
> static int mana_gd_init_vf_regs(struct pci_dev *pdev)
> {
> struct gdma_context *gc = pci_get_drvdata(pdev);
> + u64 shm_off;
>
> gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF;
>
> @@ -111,7 +131,17 @@ static int mana_gd_init_vf_regs(struct pci_dev *pdev)
> gc->db_page_base = gc->bar0_va + gc->db_page_off;
> gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off;
>
> - gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
> + shm_off = mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
> + if (shm_off >= gc->bar0_size ||
> + gc->bar0_size - shm_off < SMC_APERTURE_SIZE ||
> + !IS_ALIGNED(shm_off, sizeof(u32))) {
> + dev_err(gc->dev,
> + "SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
> + shm_off, (u64)gc->bar0_size);
> + return -EPROTO;
> + }
> +
> + gc->shm_base = gc->bar0_va + shm_off;
>
> return 0;
> }
> diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h
> index 5199b41497ff..dbabcfb95daf 100644
> --- a/include/net/mana/shm_channel.h
> +++ b/include/net/mana/shm_channel.h
> @@ -4,6 +4,12 @@
> #ifndef _SHM_CHANNEL_H
> #define _SHM_CHANNEL_H
>
> +#define SMC_APERTURE_BITS 256
> +#define SMC_BASIC_UNIT (sizeof(u32))
> +#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8))
> +#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1)
> +#define SMC_APERTURE_SIZE (SMC_APERTURE_BITS / 8)
AI bots complain that we're redefining this.
Since it's a fix I think it's better to remove the existing definition
even if it lives in a driver that goes via a different tree.
> struct shm_channel {
> struct device *dev;
> void __iomem *base;
--
pw-bot: cr
^ permalink raw reply
* Re: [PATCH net-next v7] net: mana: Expose hardware diagnostic info via debugfs
From: Jakub Kicinski @ 2026-05-02 0:54 UTC (permalink / raw)
To: ernis
Cc: Jakub Kicinski, kys, haiyangz, wei.liu, decui, longli,
andrew+netdev, davem, edumazet, pabeni, kotaranov, horms,
shradhagupta, dipayanroy, yury.norov, shirazsaleem, kees,
linux-hyperv, netdev, linux-kernel, linux-rdma
In-Reply-To: <20260430075342.1871554-1-ernis@linux.microsoft.com>
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
net: mana: Expose hardware diagnostic info via debugfs
This patch adds debugfs entries under /sys/kernel/debug/mana/<BDF>/ to
expose device-level and per-vPort hardware configuration and diagnostic
information. Debugfs creation and removal are moved into mana_gd_setup()
and a renamed mana_gd_cleanup_device() so probe, remove, suspend, resume
and shutdown share a single path.
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index a654b3699c4c..077d3a1ff6bf 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
[ ... ]
> @@ -3659,6 +3684,11 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
>
> ac->bm_hostmode = bm_hostmode;
>
> + debugfs_create_u16("num_vports", 0400, gc->mana_pci_debugfs,
> + &ac->num_ports);
> + debugfs_create_u8("bm_hostmode", 0400, gc->mana_pci_debugfs,
> + &ac->bm_hostmode);
> +
> if (!resuming) {
> ac->num_ports = num_ports;
> } else {
Is the num_vports debugfs file pointed at ac->num_ports before that
field has been assigned from the queried num_ports value?
On a fresh probe (!resuming), the sequence in mana_probe() is:
err = mana_query_device_cfg(ac, ..., &num_ports, &bm_hostmode);
...
ac->bm_hostmode = bm_hostmode;
debugfs_create_u16("num_vports", 0400, gc->mana_pci_debugfs,
&ac->num_ports);
...
if (!resuming) {
ac->num_ports = num_ports;
} else {
...
}
...
if (ac->num_ports > MAX_PORTS_IN_MANA_DEV)
ac->num_ports = MAX_PORTS_IN_MANA_DEV;
During the window between debugfs_create_u16() and the assignment (and
subsequent clamp to MAX_PORTS_IN_MANA_DEV), would a reader of the
num_vports debugfs file observe 0 rather than the hardware-reported
value?
Would it be clearer to create this file after ac->num_ports has been
assigned and clamped, so the exposed value matches what the commit
message describes as "Device configuration"?
--
pw-bot: cr
^ permalink raw reply
* Re: [PATCH V2 08/11] PCI: hv: Build device id for a VMBus device, export PCI devid function
From: Easwar Hariharan @ 2026-05-01 18:38 UTC (permalink / raw)
To: Mukesh R
Cc: hpa, robin.murphy, robh, wei.liu, mhklinux, muislam, namjain,
magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch, easwar.hariharan, kys, haiyangz, decui,
longli, tglx, mingo, bp, dave.hansen, x86, joro, will, lpieralisi,
kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-9-mrathor@linux.microsoft.com>
On 4/30/2026 5:41 PM, Mukesh R wrote:
> On Hyper-V, most hypercalls related to PCI passthru to map/unmap regions,
> interrupts, etc need a device ID as a parameter. This device ID refers
> to that specific device during the lifetime of passthru.
>
> An L1VH VM only contains VMBus based devices. A device ID for a VMBus
> device is slightly different in that it uses the hv_pcibus_device info
> for building it to make sure it matches exactly what the hypervisor
> expects. This VMBus based device ID is needed when attaching devices in
> an L1VH based guest VM. Before building it, a check is done to make sure
> the device is a valid VMBus device.
>
> In remaining cases, PCI device ID is used. So, also make PCI device ID
> build function hv_build_devid_type_pci() public.
>
> Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
> ---
> arch/x86/hyperv/irqdomain.c | 9 +++++----
> arch/x86/include/asm/mshyperv.h | 6 ++++++
> drivers/pci/controller/pci-hyperv.c | 24 ++++++++++++++++++++++++
> include/asm-generic/mshyperv.h | 8 ++++++++
> 4 files changed, 43 insertions(+), 4 deletions(-)
>
Please see the discussion on a similar patch for the guest IOMMU driver here:
https://lore.kernel.org/all/2dabc1b8-0cf0-4fc8-9cd4-cce60adfc05e@linux.microsoft.com/
My implementation of that approach is below:
--------------------x8-------------------------------------------------
commit 233e90466cb79b3a952806206d89164ca2a1428a
Author: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
Date: Thu Apr 30 20:46:05 2026 +0000
PCI: hv: Inform IOMMU when new devices are offered
Hyper-V uses a logical device ID to identify a PCI endpoint device for
child partitions. This ID is built from parts of the virtual PCI bus
GUID and the function number of the PCI BDF. This ID is required to
identify devices to the hypervisor for IOMMU management hypercalls used
by the Hyper-V IOMMU driver.
Inform the IOMMU driver of the Hyper-V vPCI bus GUID so it can build the
logical device ID for vPCI devices, and factor the logic for building
this ID into a standalone helper function for clarity and easier
maintenance in tandem with the IOMMU driver's version.
Signed-off-by: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 20d947c2c758..d1070a4a24eb 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -315,6 +315,9 @@ static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
#ifdef CONFIG_HYPERV_PVIOMMU
int __init hv_iommu_init(void);
+int hv_iommu_inform(int dom, guid_t bus_instance_guid);
+#else
+int hv_iommu_inform(int dom, guid_t bus_instance_guid) {}
#endif
#include <asm-generic/mshyperv.h>
diff --git a/drivers/iommu/hyperv/iommu.c b/drivers/iommu/hyperv/iommu.c
index ad7000f4566e..34a39c3b89b4 100644
--- a/drivers/iommu/hyperv/iommu.c
+++ b/drivers/iommu/hyperv/iommu.c
@@ -8,6 +8,7 @@
#include <linux/iommu.h>
#include <linux/pci.h>
+#include <linux/uuid.h>
#include <linux/dma-map-ops.h>
#include <linux/generic_pt/iommu.h>
#include <linux/syscore_ops.h>
@@ -26,12 +27,86 @@ static struct hv_iommu_domain hv_blocking_domain;
static const struct iommu_domain_ops hv_iommu_identity_domain_ops;
static const struct iommu_domain_ops hv_iommu_blocking_domain_ops;
static struct iommu_ops hv_iommu_ops;
+static struct list_head hv_pci_bus_list;
#define hv_iommu_present(iommu_cap) (iommu_cap & HV_IOMMU_CAP_PRESENT)
#define hv_iommu_s1_domain_supported(iommu_cap) (iommu_cap & HV_IOMMU_CAP_S1)
#define hv_iommu_5lvl_supported(iommu_cap) (iommu_cap & HV_IOMMU_CAP_S1_5LVL)
#define hv_iommu_ats_supported(iommu_cap) (iommu_cap & HV_IOMMU_CAP_ATS)
+/*
+ * Build a "Device Logical ID" out of this PCI bus's instance GUID and the
+ * function number of the device.
+ * This is identical and should be maintained in sync with
+ * hv_pci_build_logical_dev_id() in pci-hyperv.c. Only repeated here to avoid
+ * dependency of a built-in driver (iommu) on a module (pci-hyperv) and to
+ * maintain the correct direction of dependency of the PCI driver on the IOMMU
+ * instead of vice versa
+ */
+static u64 hv_iommu_build_logical_dev_id(struct pci_dev *pdev,
+ guid_t bus_instance_guid)
+{
+ return (u64)((bus_instance_guid.b[5] << 24) |
+ (bus_instance_guid.b[4] << 16) |
+ (bus_instance_guid.b[7] << 8) |
+ (bus_instance_guid.b[6] & 0xf8) |
+ PCI_FUNC(pdev->devfn));
+}
+
+static struct hv_pci_busdata *find_hv_pci_bus(int dom)
+{
+ struct hv_pci_busdata *curr = NULL;
+
+ list_for_each_entry(curr, &hv_pci_bus_list, list) {
+ if (curr->dom == dom) {
+ return curr;
+ }
+ }
+ return NULL;
+}
+
+#define INVALID_LOGICAL_DEV_ID 0
+static u64 hv_iommu_get_logical_dev_id(struct pci_dev *pdev)
+{
+ struct hv_pci_busdata *bus = find_hv_pci_bus(pci_domain_nr(pdev->bus));
+
+ if (!bus) {
+ dev_WARN(&pdev->dev,
+ "received device ID request for missing hv_pci bus\n");
+ return INVALID_LOGICAL_DEV_ID;
+ }
+
+ return hv_iommu_build_logical_dev_id(pdev, bus->bus_instance_guid);
+}
+
+int hv_iommu_inform(int dom, guid_t bus_instance_guid)
+{
+ struct hv_pci_busdata *found, *new;
+
+ /* Don't spend memory if there is no consumer */
+ if (no_iommu || !iommu_detected)
+ return 0;
+
+ found = find_hv_pci_bus(dom);
+ if (found && !guid_equal(&found->bus_instance_guid, &bus_instance_guid)) {
+ found->bus_instance_guid = bus_instance_guid;
+ return 0;
+ }
+
+ new = kzalloc_obj(*new);
+ if (!new) {
+ pr_info("No memory to allocate hv_pci bus data\n");
+ return -ENOMEM;
+ }
+
+ new->dom = dom;
+ new->bus_instance_guid = bus_instance_guid;
+ list_add(&new->list, &hv_pci_bus_list);
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_MODULES(hv_iommu_inform, "pci-hyperv");
+
static int hv_create_device_domain(struct hv_iommu_domain *hv_domain, u32 domain_stage)
{
int ret;
@@ -143,7 +218,7 @@ static void hv_iommu_detach_dev(struct iommu_domain *domain, struct device *dev)
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->partition_id = HV_PARTITION_ID_SELF;
- input->device_id.as_uint64 = hv_build_logical_dev_id(pdev);
+ input->device_id.as_uint64 = hv_iommu_get_logical_dev_id(pdev);
status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
local_irq_restore(flags);
@@ -185,7 +260,7 @@ static int hv_iommu_attach_dev(struct iommu_domain *domain, struct device *dev,
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->device_domain = hv_domain->device_domain;
- input->device_id.as_uint64 = hv_build_logical_dev_id(pdev);
+ input->device_id.as_uint64 = hv_iommu_get_logical_dev_id(pdev);
status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
local_irq_restore(flags);
@@ -213,7 +288,7 @@ static int hv_iommu_get_logical_device_property(struct device *dev,
output = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*input);
memset(input, 0, sizeof(*input));
input->partition_id = HV_PARTITION_ID_SELF;
- input->logical_device_id = hv_build_logical_dev_id(to_pci_dev(dev));
+ input->logical_device_id = hv_iommu_get_logical_dev_id(to_pci_dev(dev));
input->code = code;
status = hv_do_hypercall(HVCALL_GET_LOGICAL_DEVICE_PROPERTY, input, output);
*property = *output;
@@ -665,6 +740,7 @@ int __init hv_iommu_init(void)
iommu_detected = 1;
pci_request_acs();
+ INIT_LIST_HEAD(&hv_pci_bus_list);
hv_iommu = kzalloc(sizeof(*hv_iommu), GFP_KERNEL);
if (!hv_iommu)
diff --git a/drivers/iommu/hyperv/iommu.h b/drivers/iommu/hyperv/iommu.h
index 8829176ddb51..5b2d1c41c101 100644
--- a/drivers/iommu/hyperv/iommu.h
+++ b/drivers/iommu/hyperv/iommu.h
@@ -36,6 +36,12 @@ struct hv_iommu_domain {
u64 pgsize_bitmap;
};
+struct hv_pci_busdata {
+ int dom;
+ guid_t bus_instance_guid;
+ struct list_head list;
+};
+
struct hv_iommu_endpoint {
struct device *dev;
struct hv_iommu_dev *hv_iommu;
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 3b7adf28ee09..5d4fb2c2f60a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -597,23 +597,19 @@ static unsigned int hv_msi_get_int_vector(struct irq_data *data)
#define hv_msi_prepare pci_msi_prepare
-/**
+#define INVALID_LOGICAL_DEV_ID 0
+/*
* Build a "Device Logical ID" out of this PCI bus's instance GUID and the
* function number of the device.
*/
-u64 hv_build_logical_dev_id(struct pci_dev *pdev)
+static u64 hv_pci_build_logical_dev_id(struct pci_dev *pdev, guid_t bus_instance_guid)
{
- struct pci_bus *pbus = pdev->bus;
- struct hv_pcibus_device *hbus = container_of(pbus->sysdata,
- struct hv_pcibus_device, sysdata);
-
- return (u64)((hbus->hdev->dev_instance.b[5] << 24) |
- (hbus->hdev->dev_instance.b[4] << 16) |
- (hbus->hdev->dev_instance.b[7] << 8) |
- (hbus->hdev->dev_instance.b[6] & 0xf8) |
+ return (u64)((bus_instance_guid.b[5] << 24) |
+ (bus_instance_guid.b[4] << 16) |
+ (bus_instance_guid.b[7] << 8) |
+ (bus_instance_guid.b[6] & 0xf8) |
PCI_FUNC(pdev->devfn));
}
-EXPORT_SYMBOL_GPL(hv_build_logical_dev_id);
/**
* hv_irq_retarget_interrupt() - "Unmask" the IRQ by setting its current
@@ -657,7 +653,7 @@ static void hv_irq_retarget_interrupt(struct irq_data *data)
params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
- params->device_id = hv_build_logical_dev_id(pdev);
+ params->device_id = hv_pci_build_logical_dev_id(pdev, hbus->hdev->dev_instance);
params->int_target.vector = hv_msi_get_int_vector(data);
if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
@@ -3869,6 +3865,9 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->state = hv_pcibus_probed;
+ /* Inform the IOMMU of the bus GUID of devices headed their way */
+ hv_iommu_inform(dom, hdev->dev_instance);
+
ret = create_root_hv_pci_bus(hbus);
if (ret)
goto free_windows;
--------------------8x-------------------------------------------------
Thanks,
Easwar (he/him)
^ permalink raw reply related
* Re: [PATCH V2 08/11] PCI: hv: Build device id for a VMBus device, export PCI devid function
From: Bjorn Helgaas @ 2026-05-01 16:33 UTC (permalink / raw)
To: Mukesh R
Cc: hpa, robin.murphy, robh, wei.liu, mhklinux, muislam, namjain,
magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch, kys, haiyangz, decui, longli, tglx, mingo,
bp, dave.hansen, x86, joro, will, lpieralisi, kwilczynski,
bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-9-mrathor@linux.microsoft.com>
s/id/ID/ in subject.
I don't know if the "export PCI devid function" part is essential in
the subject. If it is, I don't know whether that refers to
hv_build_devid_type_pci() or hv_pci_vmbus_device_id(). Both are
exported by this patch. Could just mention the actual name instead of
"PCI devid function" or could make the exports a separate patch.
On Thu, Apr 30, 2026 at 05:41:54PM -0700, Mukesh R wrote:
> On Hyper-V, most hypercalls related to PCI passthru to map/unmap regions,
> interrupts, etc need a device ID as a parameter. This device ID refers
> to that specific device during the lifetime of passthru.
> +++ b/include/asm-generic/mshyperv.h
> @@ -23,6 +23,7 @@
> #include <acpi/acpi_numa.h>
> #include <linux/cpumask.h>
> #include <linux/nmi.h>
> +#include <linux/pci.h>
It doesn't look like mshyperv.h actually needs the definition, so you
probably don't need to include pci.h. A "struct pci_dev;" declaration
should be sufficient.
> #include <asm/ptrace.h>
> #include <hyperv/hvhdk.h>
>
> @@ -329,6 +330,13 @@ static inline enum hv_isolation_type hv_get_isolation_type(void)
> }
> #endif /* CONFIG_HYPERV */
>
> +#if IS_ENABLED(CONFIG_PCI_HYPERV)
> +u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
> +#else
> +static inline u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
> +{ return 0; }
> +#endif /* IS_ENABLED(CONFIG_PCI_HYPERV) */
> +
> #if IS_ENABLED(CONFIG_MSHV_ROOT)
> static inline bool hv_root_partition(void)
> {
> --
> 2.51.2.vfs.0.1
>
^ permalink raw reply
* Re: [PATCH net v2] net: mana: Optimize irq affinity for low vcpu configs
From: Yury Norov @ 2026-05-01 16:22 UTC (permalink / raw)
To: Shradha Gupta
Cc: Dexuan Cui, Wei Liu, Haiyang Zhang, K. Y. Srinivasan, Andrew Lunn,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
Dipayaan Roy, Shiraz Saleem, Michael Kelley, Long Li, Yury Norov,
linux-hyperv, linux-kernel, netdev, Paul Rosswurm, Shradha Gupta,
Saurabh Singh Sengar, stable
In-Reply-To: <20260429090640.1790104-1-shradhagupta@linux.microsoft.com>
On Wed, Apr 29, 2026 at 02:06:37AM -0700, Shradha Gupta wrote:
> In mana driver, the number of IRQs allocated is capped by the
> min(num_cpu + 1, queue count). In cases, where the IRQ count is greater
> than the vcpu count, we want to utilize all the vCPUs, irrespective of
> their NUMA/core bindings.
>
> This is important, especially in the envs where number of vCPUs are so
> few that the softIRQ handling overhead on two IRQs on the same vCPU is
> much more than their overheads if they were spread across sibling vCPUs.
>
> This behaviour is more evident with dynamic IRQ allocation. Since MANA
> IRQs are assigned at a later stage compared to static allocation, other
> device IRQs may already be affinitized to the vCPUs. As a result, IRQ
> weights become imbalanced, causing multiple MANA IRQs to land on the
> same vCPU, while some vCPUs have none.
>
> In such cases when many parallel TCP connections are tested, the
> throughput drops significantly.
>
> Test envs:
> =======================================================
> Case 1: without this patch
> =======================================================
> 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
>
> TYPE effective vCPU aff
> =======================================================
> IRQ0: HWC 0
> IRQ1: mana_q1 0
> IRQ2: mana_q2 2
> IRQ3: mana_q3 0
> IRQ4: mana_q4 3
>
> %soft on each vCPU(mpstat -P ALL 1) on receiver
> vCPU 0 1 2 3
> =======================================================
> pass 1: 38.85 0.03 24.89 24.65
> pass 2: 39.15 0.03 24.57 25.28
> pass 3: 40.36 0.03 23.20 23.17
>
> =======================================================
> Case 2: with this patch
> =======================================================
> 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
>
> TYPE effective vCPU aff
> =======================================================
> IRQ0: HWC 0
> IRQ1: mana_q1 0
> IRQ2: mana_q2 1
> IRQ3: mana_q3 2
> IRQ4: mana_q4 3
>
> %soft on each vCPU(mpstat -P ALL 1) on receiver
> vCPU 0 1 2 3
> =======================================================
> pass 1: 15.42 15.85 14.99 14.51
> pass 2: 15.53 15.94 15.81 15.93
> pass 3: 16.41 16.35 16.40 16.36
>
> =======================================================
> Throughput Impact(in Gbps, same env)
> =======================================================
> TCP conn with patch w/o patch
> 20480 15.65 7.73
> 10240 15.63 8.93
> 8192 15.64 9.69
> 6144 15.64 13.16
> 4096 15.69 15.75
> 2048 15.69 15.83
> 1024 15.71 15.28
>
> Fixes: 755391121038 ("net: mana: Allocate MSI-X vectors dynamically")
> Cc: stable@vger.kernel.org
> Co-developed-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> ---
> Changes in v2
> * Removed the unused skip_first_cpu variable
> * fixed exit condition in irq_setup_linear() with len == 0
> * changed return type of irq_setup_linear() as it will always be 0
> * removed the unnecessary rcu_read_lock() in irq_setup_linear()
> * added appropriate comments to indicate expected behaviour when
> IRQs are more than or equal to num_online_cpus()
> ---
> .../net/ethernet/microsoft/mana/gdma_main.c | 47 ++++++++++++++++---
> 1 file changed, 40 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 098fbda0d128..d740d1dc43da 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -167,6 +167,8 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
> } else {
> /* If dynamic allocation is enabled we have already allocated
> * hwc msi
> + * Also, we make sure in this case the following is always true
> + * (num_msix_usable - 1 HWC) <= num_online_cpus()
> */
> gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1);
> }
> @@ -1672,11 +1674,24 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node,
> return 0;
> }
>
> +/* should be called with cpus_read_lock() held */
> +static void irq_setup_linear(unsigned int *irqs, unsigned int len)
> +{
> + int cpu;
> +
> + for_each_online_cpu(cpu) {
> + if (len == 0)
> + break;
> +
> + irq_set_affinity_and_hint(*irqs++, cpumask_of(cpu));
> + len--;
> + }
> +}
> +
> static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> {
> struct gdma_context *gc = pci_get_drvdata(pdev);
> struct gdma_irq_context *gic;
> - bool skip_first_cpu = false;
> int *irqs, irq, err, i;
>
> irqs = kmalloc_objs(int, nvec);
So what about WARN_ON() and nvec adjustment before kmalloc?
> @@ -1722,13 +1737,31 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> * first CPU sibling group since they are already affinitized to HWC IRQ
> */
> cpus_read_lock();
> - if (gc->num_msix_usable <= num_online_cpus())
> - skip_first_cpu = true;
> + if (gc->num_msix_usable <= num_online_cpus()) {
> + err = irq_setup(irqs, nvec, gc->numa_node, true);
> + if (err) {
> + cpus_read_unlock();
> + goto free_irq;
One thing puzzles me: if you skip first CPU with this 'true', and the
gc->num_msix_usable == num_online_cpus(), it's one more than you can
distribute. What do I miss?
> + }
> + } else {
> + /*
> + * When num_msix_usable are more than num_online_cpus, we try to
> + * make sure we are using all vcpus. In such a case NUMA or
> + * CPU core affinity does not matter.
If it doesn't matter, why don't you assign each IRQ to all CPUs then?
In theory, the system would have most of flexibility to balance them.
> + * Note: in this case the total mana IRQ should always be
> + * num_online_cpus + 1. The first HWC IRQ is already handled
> + * in HWC setup calls
> + * However, if CPUs went offline since num_msix_usable was
> + * computed, nvec count will be more than num_online_cpus().
> + * In such cases remaining extra IRQs will retain their default
> + * affinity.
> + */
> + if (nvec > num_online_cpus())
> + dev_dbg(&pdev->dev,
> + "IRQ count %d exceeds online CPU count %d. Some IRQs will share CPU\n",
I'd better say 'some IRQs will share the default CPU', and in the
perfect world, I'd like to see:
'The IRQs #4-12 will share the default CPU #0'
type of message.
> + nvec, num_online_cpus());
It's not as straightforward as it should be. In one case
nvec > num_online_cpus()
is a problem, while in another - not. It looks already suspicious. So
when you throw a warning, you should mention it, I believe.
In the
gc->num_msix_usable <= num_online_cpus()
case, when nvec is too big, wouldn't 'some IRQs share some CPU' just
as well? If so, you again should throw a message.
> - err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu);
> - if (err) {
> - cpus_read_unlock();
> - goto free_irq;
> + irq_setup_linear(irqs, nvec);
> }
>
> cpus_read_unlock();
>
> base-commit: e728258debd553c95d2e70f9cd97c9fde27c7130
> --
> 2.34.1
^ permalink raw reply
* Re: [PATCH net v2] net: mana: Optimize irq affinity for low vcpu configs
From: Simon Horman @ 2026-05-01 9:12 UTC (permalink / raw)
To: Shradha Gupta
Cc: Dexuan Cui, Wei Liu, Haiyang Zhang, K. Y. Srinivasan, Andrew Lunn,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Konstantin Taranov, Erni Sri Satya Vennela, Dipayaan Roy,
Shiraz Saleem, Michael Kelley, Long Li, Yury Norov, linux-hyperv,
linux-kernel, netdev, Paul Rosswurm, Shradha Gupta,
Saurabh Singh Sengar, stable
In-Reply-To: <20260429090640.1790104-1-shradhagupta@linux.microsoft.com>
On Wed, Apr 29, 2026 at 02:06:37AM -0700, Shradha Gupta wrote:
> In mana driver, the number of IRQs allocated is capped by the
> min(num_cpu + 1, queue count). In cases, where the IRQ count is greater
> than the vcpu count, we want to utilize all the vCPUs, irrespective of
> their NUMA/core bindings.
>
> This is important, especially in the envs where number of vCPUs are so
> few that the softIRQ handling overhead on two IRQs on the same vCPU is
> much more than their overheads if they were spread across sibling vCPUs.
>
> This behaviour is more evident with dynamic IRQ allocation. Since MANA
> IRQs are assigned at a later stage compared to static allocation, other
> device IRQs may already be affinitized to the vCPUs. As a result, IRQ
> weights become imbalanced, causing multiple MANA IRQs to land on the
> same vCPU, while some vCPUs have none.
>
> In such cases when many parallel TCP connections are tested, the
> throughput drops significantly.
...
> Fixes: 755391121038 ("net: mana: Allocate MSI-X vectors dynamically")
> Cc: stable@vger.kernel.org
> Co-developed-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> ---
> Changes in v2
> * Removed the unused skip_first_cpu variable
> * fixed exit condition in irq_setup_linear() with len == 0
> * changed return type of irq_setup_linear() as it will always be 0
> * removed the unnecessary rcu_read_lock() in irq_setup_linear()
> * added appropriate comments to indicate expected behaviour when
> IRQs are more than or equal to num_online_cpus()
Reviewed-by: Simon Horman <horms@kernel.org>
^ permalink raw reply
* [PATCH net, v3] net: mana: Fix crash from unvalidated SHM offset read from BAR0 during FLR
From: Dipayaan Roy @ 2026-05-01 2:47 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
During Function Level Reset recovery, the MANA driver reads
hardware BAR0 registers that may temporarily contain garbage values.
The SHM (Shared Memory) offset read from GDMA_REG_SHM_OFFSET is used
to compute gc->shm_base, which is later dereferenced via readl() in
mana_smc_poll_register(). If the hardware returns an unaligned or
out-of-range value, the driver must not blindly use it, as this would
propagate the hardware error into a kernel crash.
The following crash was observed on an arm64 Hyper-V guest running
kernel 6.17.0-3013-azure during VF reset recovery triggered by HWC
timeout.
[13291.785274] Unable to handle kernel paging request at virtual address ffff8000a200001b
[13291.785311] Mem abort info:
[13291.785332] ESR = 0x0000000096000021
[13291.785343] EC = 0x25: DABT (current EL), IL = 32 bits
[13291.785355] SET = 0, FnV = 0
[13291.785363] EA = 0, S1PTW = 0
[13291.785372] FSC = 0x21: alignment fault
[13291.785382] Data abort info:
[13291.785391] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000
[13291.785404] CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[13291.785412] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[13291.785421] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000014df3a1000
[13291.785432] [ffff8000a200001b] pgd=1000000100438403, p4d=1000000100438403, pud=1000000100439403, pmd=0068000fc2000711
[13291.785703] Internal error: Oops: 0000000096000021 [#1] SMP
[13291.830975] Modules linked in: tls qrtr mana_ib ib_uverbs ib_core xt_owner xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables cfg80211 8021q garp mrp stp llc binfmt_misc joydev serio_raw nls_iso8859_1 hid_generic aes_ce_blk aes_ce_cipher polyval_ce ghash_ce sm4_ce_gcm sm4_ce_ccm sm4_ce sm4_ce_cipher hid_hyperv sm4 sm3_ce sha3_ce hv_netvsc hid vmgenid hyperv_keyboard hyperv_drm sch_fq_codel nvme_fabrics efi_pstore dm_multipath nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vmw_vmci vsock dmi_sysfs ip_tables x_tables autofs4
[13291.862630] CPU: 122 UID: 0 PID: 61796 Comm: kworker/122:2 Tainted: G W 6.17.0-3013-azure #13-Ubuntu VOLUNTARY
[13291.869902] Tainted: [W]=WARN
[13291.871901] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 01/08/2026
[13291.878086] Workqueue: events mana_serv_func
[13291.880718] pstate: 62400005 (nZCv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--)
[13291.884835] pc : mana_smc_poll_register+0x48/0xb0
[13291.887902] lr : mana_smc_setup_hwc+0x70/0x1c0
[13291.890493] sp : ffff8000ab79bbb0
[13291.892364] x29: ffff8000ab79bbb0 x28: ffff00410c8b5900 x27: ffff00410d630680
[13291.896252] x26: ffff004171f9fd80 x25: 000000016ed55000 x24: 000000017f37e000
[13291.899990] x23: 0000000000000000 x22: 000000016ed55000 x21: 0000000000000000
[13291.904497] x20: ffff8000a200001b x19: 0000000000004e20 x18: ffff8000a6183050
[13291.908308] x17: 0000000000000000 x16: 0000000000000000 x15: 000000000000000a
[13291.912542] x14: 0000000000000004 x13: 0000000000000000 x12: 0000000000000000
[13291.916298] x11: 0000000000000000 x10: 0000000000000001 x9 : ffffc45006af1bd8
[13291.920945] x8 : ffff000151129000 x7 : 0000000000000000 x6 : 0000000000000000
[13291.925293] x5 : 000000015f214000 x4 : 000000017217a000 x3 : 000000016ed50000
[13291.930436] x2 : 000000016ed55000 x1 : 0000000000000000 x0 : ffff8000a1ffffff
[13291.934342] Call trace:
[13291.935736] mana_smc_poll_register+0x48/0xb0 (P)
[13291.938611] mana_smc_setup_hwc+0x70/0x1c0
[13291.941113] mana_hwc_create_channel+0x1a0/0x3a0
[13291.944283] mana_gd_setup+0x16c/0x398
[13291.946584] mana_gd_resume+0x24/0x70
[13291.948917] mana_do_service+0x13c/0x1d0
[13291.951583] mana_serv_func+0x34/0x68
[13291.953732] process_one_work+0x168/0x3d0
[13291.956745] worker_thread+0x2ac/0x480
[13291.959104] kthread+0xf8/0x110
[13291.961026] ret_from_fork+0x10/0x20
[13291.963560] Code: d2807d00 9417c551 71000673 54000220 (b9400281)
[13291.967299] ---[ end trace 0000000000000000 ]---
Disassembly of mana_smc_poll_register() around the crash site:
Disassembly of section .text:
00000000000047c8 <mana_smc_poll_register>:
47c8: d503201f nop
47cc: d503201f nop
47d0: d503233f paciasp
47d4: f800865e str x30, [x18], #8
47d8: a9bd7bfd stp x29, x30, [sp, #-48]!
47dc: 910003fd mov x29, sp
47e0: a90153f3 stp x19, x20, [sp, #16]
47e4: 91007014 add x20, x0, #0x1c
47e8: 5289c413 mov w19, #0x4e20
47ec: f90013f5 str x21, [sp, #32]
47f0: 12001c35 and w21, w1, #0xff
47f4: 14000008 b 4814 <mana_smc_poll_register+0x4c>
47f8: 36f801e1 tbz w1, #31, 4834 <mana_smc_poll_register+0x6c>
47fc: 52800042 mov w2, #0x2
4800: d280fa01 mov x1, #0x7d0
4804: d2807d00 mov x0, #0x3e8
4808: 94000000 bl 0 <usleep_range_state>
480c: 71000673 subs w19, w19, #0x1
4810: 54000200 b.eq 4850 <mana_smc_poll_register+0x88>
4814: b9400281 ldr w1, [x20] <-- **** CRASHED HERE *****
4818: d50331bf dmb oshld
481c: 2a0103e2 mov w2, w1
...
From the crash signature x20 = ffff8000a200001b, this address
ends in 0x1b which is not 4-byte aligned, so the 'ldr w1, [x20]'
instruction (readl) triggers the arm64 alignment fault (FSC = 0x21).
The root cause is in mana_gd_init_vf_regs(), which computes:
gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
The offset is used without any validation. The same problem exists
in mana_gd_init_pf_regs() for sriov_base_off and sriov_shm_off.
Fix this by validating all offsets before use:
- VF: check shm_off is within BAR0, properly aligned to 4 bytes
(readl requirement), and leaves room for the full 256-bit
(32-byte) SMC aperture.
- PF: check sriov_base_off is within BAR0, aligned to 8 bytes
(readq requirement), and leaves room to safely read the
sriov_shm_off register at sriov_base_off + GDMA_PF_REG_SHM_OFF.
Then check sriov_shm_off leaves room for the full SMC aperture.
All arithmetic uses subtraction rather than addition to avoid
integer overflow on garbage values.
Define SMC_APERTURE_SIZE (32 bytes, derived from the 256-bit aperture
width) in shm_channel.h.
Return -EPROTO on invalid values. The existing recovery path in
mana_serv_reset() already handles -EPROTO by falling through to PCI
device rescan, giving the hardware another chance to present valid
register values after reset.
Fixes: 9bf66036d686 ("net: mana: Handle hardware recovery events when probing the device")
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
Changes in v3:
- Fixed commit message.
- Removed macro duplicates.
Changes in v2:
- Fix sriov_base_off alignment check: sizeof(u32) to sizeof(u64), since
mana_gd_r64() (readq) requires 8-byte alignment on arm64.
- Fix sriov_base_off bounds: also verify enough space remains in BAR0
to safely read sriov_shm_off at offset GDMA_PF_REG_SHM_OFF + 8 bytes.
- Fix integer overflow: rewrite bounds checks using subtraction
(remaining = bar0_size - base) instead of addition.
- Fix SMC aperture size: add gc->bar0_size - shm_off < SMC_APERTURE_SIZE
checks in both VF and PF paths; previously only the start address was
validated, but mana_smc_poll_register() accesses up to shm_base + 0x1c
(28 bytes from base, 32 bytes total).
- Export SMC_APERTURE_SIZE to shm_channel.h.
---
.../net/ethernet/microsoft/mana/gdma_main.c | 40 ++++++++++++++++---
.../net/ethernet/microsoft/mana/shm_channel.c | 5 ---
include/net/mana/shm_channel.h | 6 +++
3 files changed, 41 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 098fbda0d128..d8e816882f02 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -43,8 +43,9 @@ static u64 mana_gd_r64(struct gdma_context *g, u64 offset)
static int mana_gd_init_pf_regs(struct pci_dev *pdev)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
- void __iomem *sriov_base_va;
+ u64 remaining_barsize;
u64 sriov_base_off;
+ u64 sriov_shm_off;
gc->db_page_size = mana_gd_r32(gc, GDMA_PF_REG_DB_PAGE_SIZE) & 0xFFFF;
@@ -73,10 +74,28 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev)
gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off;
sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF);
+ if (sriov_base_off >= gc->bar0_size ||
+ gc->bar0_size - sriov_base_off <
+ GDMA_PF_REG_SHM_OFF + sizeof(u64) ||
+ !IS_ALIGNED(sriov_base_off, sizeof(u64))) {
+ dev_err(gc->dev,
+ "SRIOV base offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
+ sriov_base_off, (u64)gc->bar0_size);
+ return -EPROTO;
+ }
- sriov_base_va = gc->bar0_va + sriov_base_off;
- gc->shm_base = sriov_base_va +
- mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF);
+ remaining_barsize = gc->bar0_size - sriov_base_off;
+ sriov_shm_off = mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF);
+ if (sriov_shm_off >= remaining_barsize ||
+ remaining_barsize - sriov_shm_off < SMC_APERTURE_SIZE ||
+ !IS_ALIGNED(sriov_shm_off, sizeof(u32))) {
+ dev_err(gc->dev,
+ "SRIOV SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
+ sriov_shm_off, (u64)gc->bar0_size);
+ return -EPROTO;
+ }
+
+ gc->shm_base = gc->bar0_va + sriov_base_off + sriov_shm_off;
return 0;
}
@@ -84,6 +103,7 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev)
static int mana_gd_init_vf_regs(struct pci_dev *pdev)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
+ u64 shm_off;
gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF;
@@ -111,7 +131,17 @@ static int mana_gd_init_vf_regs(struct pci_dev *pdev)
gc->db_page_base = gc->bar0_va + gc->db_page_off;
gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off;
- gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
+ shm_off = mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
+ if (shm_off >= gc->bar0_size ||
+ gc->bar0_size - shm_off < SMC_APERTURE_SIZE ||
+ !IS_ALIGNED(shm_off, sizeof(u32))) {
+ dev_err(gc->dev,
+ "SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
+ shm_off, (u64)gc->bar0_size);
+ return -EPROTO;
+ }
+
+ gc->shm_base = gc->bar0_va + shm_off;
return 0;
}
diff --git a/drivers/net/ethernet/microsoft/mana/shm_channel.c b/drivers/net/ethernet/microsoft/mana/shm_channel.c
index 0f1679ebad96..d21b5db06e50 100644
--- a/drivers/net/ethernet/microsoft/mana/shm_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/shm_channel.c
@@ -61,11 +61,6 @@ union smc_proto_hdr {
};
}; /* HW DATA */
-#define SMC_APERTURE_BITS 256
-#define SMC_BASIC_UNIT (sizeof(u32))
-#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8))
-#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1)
-
static int mana_smc_poll_register(void __iomem *base, bool reset)
{
void __iomem *ptr = base + SMC_LAST_DWORD * SMC_BASIC_UNIT;
diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h
index 5199b41497ff..dbabcfb95daf 100644
--- a/include/net/mana/shm_channel.h
+++ b/include/net/mana/shm_channel.h
@@ -4,6 +4,12 @@
#ifndef _SHM_CHANNEL_H
#define _SHM_CHANNEL_H
+#define SMC_APERTURE_BITS 256
+#define SMC_BASIC_UNIT (sizeof(u32))
+#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8))
+#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1)
+#define SMC_APERTURE_SIZE (SMC_APERTURE_BITS / 8)
+
struct shm_channel {
struct device *dev;
void __iomem *base;
--
2.43.0
^ permalink raw reply related
* [PATCH 3/3] selftests/mm: Add userfaultfd test for HMM unlockable path
From: Stanislav Kinsburskii @ 2026-05-01 1:20 UTC (permalink / raw)
To: kys, Liam.Howlett, akpm, akpm, david, decui, haiyangz, jgg,
corbet, leon, longli, ljs, mhocko, rppt, shuah, skhan, surenb,
vbabka, wei.liu
Cc: linux-doc, linux-hyperv, linux-kernel, linux-kernel,
linux-kselftest, linux-mm
In-Reply-To: <177759835313.221039.2807391868456411507.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
Add a selftest that exercises hmm_range_fault_unlockable() with a
userfaultfd-backed mapping. The test:
1. Creates an anonymous mmap region
2. Registers it with userfaultfd (UFFDIO_REGISTER_MODE_MISSING)
3. Spawns a handler thread that responds to page faults by filling
pages with a known pattern (0xAB) via UFFDIO_COPY
4. Issues HMM_DMIRROR_READ_UNLOCKABLE to the test_hmm driver, which
calls hmm_range_fault_unlockable() internally
5. Verifies the device read back the data provided by the userfaultfd
handler
This requires changes to the test_hmm kernel module:
- New dmirror_range_fault_unlockable() that uses the new HMM API
- New dmirror_fault_unlockable() and dmirror_read_unlockable() wrappers
- New HMM_DMIRROR_READ_UNLOCKABLE ioctl (0x09)
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
lib/test_hmm.c | 122 +++++++++++++++++++++++++++++
lib/test_hmm_uapi.h | 1
tools/testing/selftests/mm/hmm-tests.c | 133 ++++++++++++++++++++++++++++++++
3 files changed, 256 insertions(+)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 0964d53365e61..20b14e279a8bd 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -327,6 +327,84 @@ static int dmirror_range_fault(struct dmirror *dmirror,
return ret;
}
+static int dmirror_range_fault_unlockable(struct dmirror *dmirror,
+ struct hmm_range *range)
+{
+ struct mm_struct *mm = dmirror->notifier.mm;
+ unsigned long timeout =
+ jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+ int locked;
+ int ret;
+
+ while (true) {
+ if (time_after(jiffies, timeout)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ range->notifier_seq = mmu_interval_read_begin(range->notifier);
+ locked = 1;
+ mmap_read_lock(mm);
+ ret = hmm_range_fault_unlockable(range, &locked);
+ if (locked)
+ mmap_read_unlock(mm);
+ if (ret) {
+ if (ret == -EBUSY)
+ continue;
+ goto out;
+ }
+ if (!locked)
+ continue;
+
+ mutex_lock(&dmirror->mutex);
+ if (mmu_interval_read_retry(range->notifier,
+ range->notifier_seq)) {
+ mutex_unlock(&dmirror->mutex);
+ continue;
+ }
+ break;
+ }
+
+ ret = dmirror_do_fault(dmirror, range);
+
+ mutex_unlock(&dmirror->mutex);
+out:
+ return ret;
+}
+
+static int dmirror_fault_unlockable(struct dmirror *dmirror,
+ unsigned long start,
+ unsigned long end, bool write)
+{
+ struct mm_struct *mm = dmirror->notifier.mm;
+ unsigned long addr;
+ unsigned long pfns[32];
+ struct hmm_range range = {
+ .notifier = &dmirror->notifier,
+ .hmm_pfns = pfns,
+ .pfn_flags_mask = 0,
+ .default_flags =
+ HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
+ .dev_private_owner = dmirror->mdevice,
+ };
+ int ret = 0;
+
+ if (!mmget_not_zero(mm))
+ return 0;
+
+ for (addr = start; addr < end; addr = range.end) {
+ range.start = addr;
+ range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
+
+ ret = dmirror_range_fault_unlockable(dmirror, &range);
+ if (ret)
+ break;
+ }
+
+ mmput(mm);
+ return ret;
+}
+
static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
unsigned long end, bool write)
{
@@ -426,6 +504,47 @@ static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
return ret;
}
+static int dmirror_read_unlockable(struct dmirror *dmirror,
+ struct hmm_dmirror_cmd *cmd)
+{
+ struct dmirror_bounce bounce;
+ unsigned long start, end;
+ unsigned long size = cmd->npages << PAGE_SHIFT;
+ int ret;
+
+ start = cmd->addr;
+ end = start + size;
+ if (end < start)
+ return -EINVAL;
+
+ ret = dmirror_bounce_init(&bounce, start, size);
+ if (ret)
+ return ret;
+
+ while (1) {
+ mutex_lock(&dmirror->mutex);
+ ret = dmirror_do_read(dmirror, start, end, &bounce);
+ mutex_unlock(&dmirror->mutex);
+ if (ret != -ENOENT)
+ break;
+
+ start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
+ ret = dmirror_fault_unlockable(dmirror, start, end, false);
+ if (ret)
+ break;
+ cmd->faults++;
+ }
+
+ if (ret == 0) {
+ if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
+ bounce.size))
+ ret = -EFAULT;
+ }
+ cmd->cpages = bounce.cpages;
+ dmirror_bounce_fini(&bounce);
+ return ret;
+}
+
static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
unsigned long end, struct dmirror_bounce *bounce)
{
@@ -1537,6 +1656,9 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
dmirror->flags = cmd.npages;
ret = 0;
break;
+ case HMM_DMIRROR_READ_UNLOCKABLE:
+ ret = dmirror_read_unlockable(dmirror, &cmd);
+ break;
default:
return -EINVAL;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f94c6d4573382..076df6df92275 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -38,6 +38,7 @@ struct hmm_dmirror_cmd {
#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_RELEASE _IOWR('H', 0x07, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_FLAGS _IOWR('H', 0x08, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_READ_UNLOCKABLE _IOWR('H', 0x09, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_FLAG_FAIL_ALLOC (1ULL << 0)
diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
index e8328c89d855e..e7bf061747edd 100644
--- a/tools/testing/selftests/mm/hmm-tests.c
+++ b/tools/testing/selftests/mm/hmm-tests.c
@@ -26,6 +26,9 @@
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/time.h>
+#include <sys/syscall.h>
+#include <linux/userfaultfd.h>
+#include <poll.h>
/*
@@ -2852,4 +2855,134 @@ TEST_F_TIMEOUT(hmm, benchmark_thp_migration, 120)
&thp_results, &regular_results);
}
}
+
+/*
+ * Test that HMM can fault in pages backed by userfaultfd using the
+ * hmm_range_fault_unlockable() path. This exercises the lock-drop retry
+ * logic in the HMM framework.
+ */
+struct uffd_thread_args {
+ int uffd;
+ void *page_buffer;
+ unsigned long page_size;
+};
+
+static void *uffd_handler_thread(void *arg)
+{
+ struct uffd_thread_args *args = arg;
+ struct uffd_msg msg;
+ struct uffdio_copy copy;
+ struct pollfd pollfd;
+ int ret;
+
+ pollfd.fd = args->uffd;
+ pollfd.events = POLLIN;
+
+ while (1) {
+ ret = poll(&pollfd, 1, 5000);
+ if (ret <= 0)
+ break;
+
+ ret = read(args->uffd, &msg, sizeof(msg));
+ if (ret != sizeof(msg))
+ break;
+
+ if (msg.event != UFFD_EVENT_PAGEFAULT)
+ break;
+
+ /* Fill the page with a known pattern */
+ memset(args->page_buffer, 0xAB, args->page_size);
+
+ copy.dst = msg.arg.pagefault.address & ~(args->page_size - 1);
+ copy.src = (unsigned long)args->page_buffer;
+ copy.len = args->page_size;
+ copy.mode = 0;
+ copy.copy = 0;
+
+ ret = ioctl(args->uffd, UFFDIO_COPY, &copy);
+ if (ret < 0)
+ break;
+ }
+
+ return NULL;
+}
+
+TEST_F(hmm, userfaultfd_read)
+{
+ struct hmm_buffer *buffer;
+ struct uffd_thread_args uffd_args;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ unsigned char *ptr;
+ pthread_t thread;
+ int uffd;
+ int ret;
+ struct uffdio_api api;
+ struct uffdio_register reg;
+
+ npages = 4;
+ size = npages << self->page_shift;
+
+ /* Create userfaultfd */
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0)
+ SKIP(return, "userfaultfd not available");
+
+ api.api = UFFD_API;
+ api.features = 0;
+ ret = ioctl(uffd, UFFDIO_API, &api);
+ ASSERT_EQ(ret, 0);
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Create anonymous mapping */
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ -1, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Register the region with userfaultfd */
+ reg.range.start = (unsigned long)buffer->ptr;
+ reg.range.len = size;
+ reg.mode = UFFDIO_REGISTER_MODE_MISSING;
+ ret = ioctl(uffd, UFFDIO_REGISTER, &reg);
+ ASSERT_EQ(ret, 0);
+
+ /* Set up the handler thread */
+ uffd_args.uffd = uffd;
+ uffd_args.page_buffer = malloc(self->page_size);
+ ASSERT_NE(uffd_args.page_buffer, NULL);
+ uffd_args.page_size = self->page_size;
+
+ ret = pthread_create(&thread, NULL, uffd_handler_thread, &uffd_args);
+ ASSERT_EQ(ret, 0);
+
+ /*
+ * Use the unlockable read path which allows the mmap lock to be
+ * dropped during the fault, enabling userfaultfd resolution.
+ */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ_UNLOCKABLE,
+ buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Verify the device read the data filled by the uffd handler */
+ ptr = buffer->mirror;
+ for (i = 0; i < size; ++i)
+ ASSERT_EQ(ptr[i], (unsigned char)0xAB);
+
+ pthread_join(thread, NULL);
+ free(uffd_args.page_buffer);
+ close(uffd);
+ hmm_buffer_free(buffer);
+}
+
TEST_HARNESS_MAIN
^ permalink raw reply related
* [PATCH 2/3] mshv: Use hmm_range_fault_unlockable() for userfaultfd support
From: Stanislav Kinsburskii @ 2026-05-01 1:20 UTC (permalink / raw)
To: kys, Liam.Howlett, akpm, akpm, david, decui, haiyangz, jgg,
corbet, leon, longli, ljs, mhocko, rppt, shuah, skhan, surenb,
vbabka, wei.liu
Cc: linux-doc, linux-hyperv, linux-kernel, linux-kernel,
linux-kselftest, linux-mm
In-Reply-To: <177759835313.221039.2807391868456411507.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
Convert the mshv driver's HMM fault path to use
hmm_range_fault_unlockable() instead of hmm_range_fault(). This enables
userfaultfd-backed guest memory regions by allowing the mmap lock to be
dropped during page fault handling.
Extract the per-VMA walk into a dedicated mshv_region_hmm_fault_walk()
helper. The outer mshv_region_hmm_fault_and_lock() handles the do/while
restart loop: if the lock is dropped during a fault (userfaultfd resolution
or similar) or an invalidation occurs (-EBUSY), the function restarts the
entire walk from the beginning with a fresh notifier_seq, since the VMA
layout may have changed.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_regions.c | 127 +++++++++++++++++++++++++++++++--------------
1 file changed, 87 insertions(+), 40 deletions(-)
diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index d09940e88298e..05665446ca6d9 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -565,6 +565,75 @@ int mshv_region_get(struct mshv_region *region)
return kref_get_unless_zero(&region->mreg_refcount);
}
+/**
+ * mshv_region_hmm_fault_walk - Walk VMAs and fault in pages for a range
+ * @region : Pointer to the memory region structure
+ * @range : HMM range structure (caller sets notifier and notifier_seq)
+ * @start : Starting virtual address of the range to fault (inclusive)
+ * @end : Ending virtual address of the range to fault (exclusive)
+ * @pfns : Output array for page frame numbers with HMM flags
+ * @locked : Pointer to lock state; set to 0 if mmap lock was dropped
+ * @do_fault: If true, fault in missing pages; if false, snapshot only
+ *
+ * Iterates through VMAs covering [start, end), collecting page frame
+ * numbers via hmm_range_fault_unlockable() for each VMA segment.
+ * When @do_fault is true, missing pages are faulted in and write faults
+ * are requested only when both the VMA and the hypervisor mapping permit
+ * writes, to avoid breaking copy-on-write semantics on read-only mappings.
+ *
+ * Return: 0 once the whole range has been walked; -EFAULT if no VMA covers
+ * an address in the range; otherwise the hmm_range_fault_unlockable() result,
+ * which is returned as-is when it is non-zero or when *@locked was cleared.
+ */
+static int mshv_region_hmm_fault_walk(struct mshv_region *region,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *pfns,
+ int *locked,
+ bool do_fault)
+{
+ unsigned long cur_start = start;
+ unsigned long *cur_pfns = pfns;
+
+ while (cur_start < end) {
+ struct vm_area_struct *vma;
+
+ vma = vma_lookup(range->notifier->mm, cur_start);
+ if (!vma)
+ return -EFAULT;
+
+ range->hmm_pfns = cur_pfns;
+ range->start = cur_start;
+ range->end = min(vma->vm_end, end); /* clamp the segment to this VMA */
+ range->default_flags = 0;
+ if (do_fault) {
+ range->default_flags = HMM_PFN_REQ_FAULT;
+ /*
+ * Only request writable pages from HMM when
+ * both the VMA and the hypervisor mapping allow
+ * writes. Without this, hmm_range_fault() would
+ * trigger COW on read-only mappings (e.g. shared
+ * zero pages, file-backed pages), breaking
+ * copy-on-write semantics and potentially
+ * granting the guest write access to shared host
+ * pages.
+ */
+ if ((vma->vm_flags & VM_WRITE) &&
+ (region->hv_map_flags & HV_MAP_GPA_WRITABLE))
+ range->default_flags |= HMM_PFN_REQ_WRITE;
+ }
+
+ int ret = hmm_range_fault_unlockable(range, locked);
+
+ if (ret || !*locked) /* error, or lock no longer held: stop walking */
+ return ret;
+
+ cur_start = range->end;
+ cur_pfns += (range->end - range->start) >> PAGE_SHIFT; /* one pfn slot per page */
+ }
+
+ return 0;
+}
+
/**
* mshv_region_hmm_fault_and_lock - Fault in pages across VMAs and lock
* the memory region
@@ -575,11 +644,9 @@ int mshv_region_get(struct mshv_region *region)
* @do_fault: If true, fault in missing pages; if false, snapshot only
* pages already present in page tables
*
- * Iterates through VMAs covering [start, end), collecting page frame
- * numbers via hmm_range_fault() for each VMA segment. When @do_fault
- * is true, missing pages are faulted in and write faults are requested
- * only when both the VMA and the hypervisor mapping permit writes, to
- * avoid breaking copy-on-write semantics on read-only mappings.
+ * Faults in pages covering [start, end) and acquires region->mreg_mutex.
+ * If the mmap lock is dropped during the fault (e.g. by userfaultfd) or
+ * the mmu notifier sequence is invalidated, the entire walk is restarted.
*
* On success, returns with region->mreg_mutex held; the caller is
* responsible for releasing it. Returns -EBUSY if the mmu notifier
@@ -597,47 +664,27 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_region *region,
.notifier = &region->mreg_mni,
};
struct mm_struct *mm = region->mreg_mni.mm;
+ int locked;
int ret;
- range.notifier_seq = mmu_interval_read_begin(range.notifier);
- mmap_read_lock(mm);
- while (start < end) {
- struct vm_area_struct *vma;
+ do {
+ range.notifier_seq = mmu_interval_read_begin(range.notifier);
+ locked = 1;
+ mmap_read_lock(mm);
- vma = vma_lookup(mm, start);
- if (!vma) {
- ret = -EFAULT;
- break;
- }
+ ret = mshv_region_hmm_fault_walk(region, &range, start, end,
+ pfns, &locked, do_fault);
- range.hmm_pfns = pfns;
- range.start = start;
- range.end = min(vma->vm_end, end);
- range.default_flags = 0;
- if (do_fault) {
- range.default_flags = HMM_PFN_REQ_FAULT;
- /*
- * Only request writable pages from HMM when both
- * the VMA and the hypervisor mapping allow writes.
- * Without this, hmm_range_fault() would trigger
- * COW on read-only mappings (e.g. shared zero
- * pages, file-backed pages), breaking
- * copy-on-write semantics and potentially granting
- * the guest write access to shared host pages.
- */
- if ((vma->vm_flags & VM_WRITE) &&
- (region->hv_map_flags & HV_MAP_GPA_WRITABLE))
- range.default_flags |= HMM_PFN_REQ_WRITE;
- }
+ if (locked)
+ mmap_read_unlock(mm);
- ret = hmm_range_fault(&range);
- if (ret)
- break;
+ /*
+ * If the lock was dropped (by userfaultfd or similar), restart
+ * the entire walk with a fresh notifier_seq since the VMA layout
+ * may have changed. Also restart on -EBUSY (invalidation).
+ */
+ } while (!locked || ret == -EBUSY);
- start = range.end;
- pfns += (range.end - range.start) >> PAGE_SHIFT;
- }
- mmap_read_unlock(mm);
if (ret)
return ret;
^ permalink raw reply related
* [PATCH 1/3] mm/hmm: Add hmm_range_fault_unlockable() for mmap lock-drop support
From: Stanislav Kinsburskii @ 2026-05-01 1:20 UTC (permalink / raw)
To: kys, Liam.Howlett, akpm, akpm, david, decui, haiyangz, jgg,
corbet, leon, longli, ljs, mhocko, rppt, shuah, skhan, surenb,
vbabka, wei.liu
Cc: linux-doc, linux-hyperv, linux-kernel, linux-kernel,
linux-kselftest, linux-mm
In-Reply-To: <177759835313.221039.2807391868456411507.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
Add hmm_range_fault_unlockable(), a new HMM entry point that allows the
mmap read lock to be dropped during page faults. This follows the
int *locked pattern from get_user_pages_remote() in mm/gup.c: callers
pass an int *locked variable indicating they can handle the lock being
dropped.
When locked is non-NULL, hmm_vma_fault() adds FAULT_FLAG_ALLOW_RETRY
and FAULT_FLAG_KILLABLE to the fault flags passed to handle_mm_fault().
If the fault handler drops the mmap lock (returning VM_FAULT_RETRY or
VM_FAULT_COMPLETED), the function sets *locked = 0 and returns 0,
signalling the caller to restart its walk with a fresh notifier
sequence. Fatal signals are checked before returning, matching GUP
behavior. The caller is responsible for re-acquiring the lock and
restarting from the beginning, since previously collected PFNs may be
stale after the lock was dropped.
The existing hmm_range_fault() is refactored into a thin wrapper that
calls hmm_range_fault_unlockable(range, NULL). Passing NULL means
FAULT_FLAG_ALLOW_RETRY is never set, preserving existing behavior for
all current callers with no functional change.
Faulting hugetlb pages is not supported on the unlockable path: if a
hugetlb page requires faulting, -EFAULT is returned. This is because
walk_hugetlb_range() holds hugetlb_vma_lock_read across the callback
and unconditionally unlocks on return; if the mmap lock is dropped
inside the callback the VMA may be freed, making the walk framework's
unlock a use-after-free. Hugetlb pages already present in page tables
are handled normally.
Documentation/mm/hmm.rst is updated with a new section describing the
unlockable API, its usage pattern, and the hugetlb limitation.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
Documentation/mm/hmm.rst | 89 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/hmm.h | 1 +
mm/hmm.c | 91 +++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 172 insertions(+), 9 deletions(-)
diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst
index 7d61b7a8b65b7..13874b4dfd5f4 100644
--- a/Documentation/mm/hmm.rst
+++ b/Documentation/mm/hmm.rst
@@ -208,6 +208,95 @@ invalidate() callback. That lock must be held before calling
mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
update.
+Scalable lock-drop support (hmm_range_fault_unlockable)
+=======================================================
+
+Some page fault handlers (e.g., userfaultfd) require the mmap lock to be
+dropped during fault resolution. Drivers that need to support such mappings
+can use::
+
+ int hmm_range_fault_unlockable(struct hmm_range *range, int *locked);
+
+This follows the same ``int *locked`` pattern used by ``get_user_pages_remote()``
+in ``mm/gup.c``. The caller sets ``*locked = 1`` and holds the mmap read lock
+before calling. If the lock is dropped during the fault (VM_FAULT_RETRY or
+VM_FAULT_COMPLETED), the function returns 0 with ``*locked = 0``, signalling
+the caller to restart its walk with a fresh notifier sequence. The caller is
+responsible for re-acquiring the lock and restarting from the beginning, since
+previously collected PFNs may be stale.
+
+The usage pattern is::
+
+ int driver_populate_range_unlockable(...)
+ {
+ struct hmm_range range;
+ int locked;
+ ...
+
+ range.notifier = &interval_sub;
+ range.start = ...;
+ range.end = ...;
+ range.hmm_pfns = ...;
+
+ if (!mmget_not_zero(interval_sub->notifier.mm))
+ return -EFAULT;
+
+ again:
+ range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+ locked = 1;
+ mmap_read_lock(mm);
+ ret = hmm_range_fault_unlockable(&range, &locked);
+ if (locked)
+ mmap_read_unlock(mm);
+ if (ret) {
+ if (ret == -EBUSY)
+ goto again;
+ return ret;
+ }
+ if (!locked)
+ goto again;
+
+ take_lock(driver->update);
+ if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
+ release_lock(driver->update);
+ goto again;
+ }
+
+ /* Use pfns array content to update device page table,
+ * under the update lock */
+
+ release_lock(driver->update);
+ return 0;
+ }
+
+Passing ``locked = NULL`` to ``hmm_range_fault_unlockable()`` is equivalent to
+calling ``hmm_range_fault()`` — the lock will never be dropped.
+
+Note: hugetlb pages are not supported with the unlockable path. If a hugetlb
+page requires faulting during an ``hmm_range_fault_unlockable()`` call,
+``-EFAULT`` is returned. Hugetlb pages that are already present in page tables
+are handled normally.
+
+This limitation exists because ``walk_hugetlb_range()`` in the page walk
+framework holds ``hugetlb_vma_lock_read`` across the callback and unconditionally
+unlocks on return. If the mmap lock is dropped inside the callback (via
+VM_FAULT_RETRY), the VMA may be freed before the walk framework's unlock,
+resulting in a use-after-free. Possible approaches to lift this limitation in
+the future:
+
+1. Extend the walk framework to allow callbacks to signal that the hugetlb vma
+ lock was dropped (e.g., a flag in ``struct mm_walk`` that tells
+ ``walk_hugetlb_range()`` to skip the unlock).
+
+2. Bypass ``walk_page_range()`` for hugetlb pages in the unlockable path and
+ walk hugetlb page tables directly with custom lock management (similar to
+ how GUP handles hugetlb without the walk framework).
+
+3. Re-acquire the mmap lock before returning from the hugetlb callback (like
+ ``fixup_user_fault()``), ensuring the VMA remains valid for the walk
+ framework's unlock. This changes the "never re-take" contract and would
+ require callers to handle hugetlb differently.
+
Leverage default_flags and pfn_flags_mask
=========================================
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index db75ffc949a7a..46e581865c48a 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -123,6 +123,7 @@ struct hmm_range {
* Please see Documentation/mm/hmm.rst for how to use the range API.
*/
int hmm_range_fault(struct hmm_range *range);
+int hmm_range_fault_unlockable(struct hmm_range *range, int *locked);
/*
* HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index 5955f2f0c83db..9bf2fa37f2efd 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -33,6 +33,7 @@
struct hmm_vma_walk {
struct hmm_range *range;
unsigned long last;
+ int *locked;
};
enum {
@@ -86,10 +87,28 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
fault_flags |= FAULT_FLAG_WRITE;
}
- for (; addr < end; addr += PAGE_SIZE)
- if (handle_mm_fault(vma, addr, fault_flags, NULL) &
- VM_FAULT_ERROR)
+ if (hmm_vma_walk->locked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ for (; addr < end; addr += PAGE_SIZE) {
+ vm_fault_t ret;
+
+ ret = handle_mm_fault(vma, addr, fault_flags, NULL);
+
+ if (ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) {
+ /*
+ * The mmap lock has been dropped by the fault handler.
+ * Record the failing address and signal lock-drop to
+ * the caller.
+ */
+ *hmm_vma_walk->locked = 0;
+ hmm_vma_walk->last = addr;
+ return -EAGAIN;
+ }
+
+ if (ret & VM_FAULT_ERROR)
return -EFAULT;
+ }
return -EBUSY;
}
@@ -566,6 +585,17 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
if (required_fault) {
int ret;
+ /*
+ * Faulting hugetlb pages on the unlockable path is not
+ * supported. The walk framework holds hugetlb_vma_lock_read
+ * which must be dropped before handle_mm_fault, but if the
+ * mmap lock is also dropped (VM_FAULT_RETRY), the vma may
+ * be freed and the walk framework's unconditional unlock
+ * becomes a use-after-free.
+ */
+ if (hmm_vma_walk->locked)
+ return -EFAULT;
+
spin_unlock(ptl);
hugetlb_vma_unlock_read(vma);
/*
@@ -655,14 +685,49 @@ static const struct mm_walk_ops hmm_walk_ops = {
*
* This is similar to get_user_pages(), except that it can read the page tables
* without mutating them (ie causing faults).
+ *
+ * The mmap lock must be held by the caller and will remain held on return.
+ * For a variant that allows the mmap lock to be dropped during faults (e.g.,
+ * for userfaultfd support), see hmm_range_fault_unlockable().
*/
int hmm_range_fault(struct hmm_range *range)
{
+ return hmm_range_fault_unlockable(range, NULL);
+}
+EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_range_fault_unlockable - fault a range with mmap lock-drop support
+ * @range: argument structure
+ * @locked: pointer to lock state variable (input: 1; output: 0 if lock
+ * was dropped)
+ *
+ * Similar to hmm_range_fault() but allows the mmap lock to be dropped during
+ * page faults. This enables support for userfaultfd-backed mappings and other
+ * cases where handle_mm_fault() may need to release the mmap lock.
+ *
+ * The caller must hold the mmap read lock and set *locked = 1 before calling.
+ * On return:
+ * - *locked == 1: mmap lock is still held, return value has normal semantics
+ * - *locked == 0: mmap lock was dropped. The caller must re-acquire the lock
+ * and restart the operation. Return value is 0, or -EINTR on a fatal signal.
+ *
+ * This function never re-acquires the mmap lock or retries the fault itself.
+ * As soon as a fault drops the lock it returns with *locked == 0, and the
+ * caller must restart from the beginning with a fresh notifier sequence,
+ * because previously collected PFNs may be stale.
+ *
+ * Returns 0 on success or a negative error code. See hmm_range_fault() for
+ * the full list of possible errors.
+ */
+int hmm_range_fault_unlockable(struct hmm_range *range, int *locked)
+{
+ struct mm_struct *mm = range->notifier->mm;
struct hmm_vma_walk hmm_vma_walk = {
.range = range,
.last = range->start,
+ .locked = locked,
};
- struct mm_struct *mm = range->notifier->mm;
int ret;
mmap_assert_locked(mm);
@@ -674,16 +739,24 @@ int hmm_range_fault(struct hmm_range *range)
return -EBUSY;
ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
&hmm_walk_ops, &hmm_vma_walk);
+ if (ret == -EAGAIN) {
+ /*
+ * The mmap lock was dropped during the fault
+ * (e.g. userfaultfd). Signal the caller to restart
+ * by returning with *locked = 0.
+ */
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ return 0;
+ }
/*
- * When -EBUSY is returned the loop restarts with
- * hmm_vma_walk.last set to an address that has not been stored
- * in pfns. All entries < last in the pfn array are set to their
- * output, and all >= are still at their input values.
+ * -EBUSY: page table changed during the walk.
+ * Restart from hmm_vma_walk.last.
*/
} while (ret == -EBUSY);
return ret;
}
-EXPORT_SYMBOL(hmm_range_fault);
+EXPORT_SYMBOL(hmm_range_fault_unlockable);
/**
* hmm_dma_map_alloc - Allocate HMM map structure
^ permalink raw reply related
* [PATCH 0/3] mm/hmm: Add mmap lock-drop support for userfaultfd-backed mappings
From: Stanislav Kinsburskii @ 2026-05-01 1:20 UTC (permalink / raw)
To: kys, Liam.Howlett, akpm, akpm, david, decui, haiyangz, jgg,
corbet, leon, longli, ljs, mhocko, rppt, shuah, skhan, surenb,
vbabka, wei.liu
Cc: linux-doc, linux-hyperv, linux-kernel, linux-kernel,
linux-kselftest, linux-mm
This series extends the HMM framework to support userfaultfd-backed memory
by allowing the mmap read lock to be dropped during hmm_range_fault().
Some page fault handlers — most notably userfaultfd — require the mmap lock
to be released so that userspace can resolve the fault. The current HMM
interface never sets FAULT_FLAG_ALLOW_RETRY, making it impossible to fault
in pages from userfaultfd-registered regions.
This series follows the established int *locked pattern from
get_user_pages_remote() in mm/gup.c. A new entry point,
hmm_range_fault_unlockable(), accepts an int *locked parameter. When the
mmap lock is dropped during fault resolution (VM_FAULT_RETRY or
VM_FAULT_COMPLETED), the function returns 0 with *locked = 0, signalling
the caller to restart its walk. The existing hmm_range_fault() is
refactored into a thin wrapper that passes NULL, preserving current
behavior for all existing callers.
Faulting hugetlb pages on the unlockable path is not supported because
walk_hugetlb_range() unconditionally holds and releases
hugetlb_vma_lock_read across the callback; if the mmap lock is dropped
inside the callback, the VMA may be freed before the walk framework's
unlock. Hugetlb pages already present in page tables are handled normally.
Possible approaches to lift this limitation are documented in
Documentation/mm/hmm.rst.
Patch 1 adds hmm_range_fault_unlockable() to the HMM core, refactors
hmm_range_fault() into a wrapper, and updates Documentation/mm/hmm.rst.
Patch 2 converts the mshv driver to use the new API, enabling
userfaultfd-backed guest memory regions. Patch 3 adds a selftest exercising
the unlockable path with a userfaultfd handler that fills pages via
UFFDIO_COPY.
---
Stanislav Kinsburskii (3):
mm/hmm: Add hmm_range_fault_unlockable() for mmap lock-drop support
mshv: Use hmm_range_fault_unlockable() for userfaultfd support
selftests/mm: Add userfaultfd test for HMM unlockable path
Documentation/mm/hmm.rst | 89 +++++++++++++++++++++
drivers/hv/mshv_regions.c | 127 +++++++++++++++++++++----------
include/linux/hmm.h | 1
lib/test_hmm.c | 122 +++++++++++++++++++++++++++++
lib/test_hmm_uapi.h | 1
mm/hmm.c | 91 ++++++++++++++++++++--
tools/testing/selftests/mm/hmm-tests.c | 133 ++++++++++++++++++++++++++++++++
7 files changed, 515 insertions(+), 49 deletions(-)
^ permalink raw reply
* [PATCH V2 11/11] mshv: Mark mem regions as non-movable upfront if device passthru
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
If a VM is started with a device attached, the mem regions must be marked
non-movable, because the device attach hypercall immediately allows the use
of the SLAT for the IOMMU. Marking them non-movable forces the entire guest
RAM to be mapped in the SLAT, and the regions to be pinned, at region
creation time. Also, because a device could be dynamically attached much
later in a VM's life, add a module parameter that disables movable pages,
which users can set if they anticipate such an action.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
drivers/hv/mshv_root.h | 1 +
drivers/hv/mshv_root_main.c | 15 ++++++++++++++-
2 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index b9880d0bdc4d..d57c26950203 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -141,6 +141,7 @@ struct mshv_partition {
pid_t pt_vmm_tgid;
bool import_completed;
bool pt_initialized;
+ bool pt_regions_pinned;
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct dentry *pt_stats_dentry;
struct dentry *pt_vp_dentry;
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index a7864463961b..ac71534733bd 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -49,6 +49,10 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
static bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
module_param(hv_nofull_mmio, bool, 0644);
+static bool hv_no_movbl_pgs; /* disable movable pages completely */
+module_param(hv_no_movbl_pgs, bool, 0644);
+MODULE_PARM_DESC(hv_no_movbl_pgs, "If set, don't do movable pages for VMs");
+
struct mshv_root mshv_root;
enum hv_scheduler_type hv_scheduler_type;
@@ -1303,6 +1307,12 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
*status = partition->async_hypercall_status;
}
+/*
+ * Tell whether memory regions of @pt have to be pinned: true when the
+ * partition's pt_regions_pinned flag is set, when the partition is
+ * encrypted, or when the hv_no_movbl_pgs module parameter is set.
+ */
+static bool mshv_do_pt_regions_pinned(struct mshv_partition *pt)
+{
+ if (pt->pt_regions_pinned)
+ return true;
+
+ if (mshv_partition_encrypted(pt))
+ return true;
+
+ return hv_no_movbl_pgs;
+}
+
/*
* NB: caller checks and makes sure mem->size is page aligned
* Returns: 0 with regionpp updated on success, or -errno
@@ -1333,7 +1343,7 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
if (is_mmio)
rg->mreg_type = MSHV_REGION_TYPE_MMIO;
- else if (mshv_partition_encrypted(partition) ||
+ else if (mshv_do_pt_regions_pinned(partition) ||
!mshv_region_movable_init(rg))
rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
else
@@ -1808,6 +1818,9 @@ static long mshv_partition_ioctl_create_device(struct mshv_partition *partition,
if (copy_to_user(uarg, &devargk, sizeof(devargk)))
return -EFAULT; /* cleanup in mshv_device_fop_release() */
+ /* For now, all regions must be pinned if there is device passthru. */
+ partition->pt_regions_pinned = true;
+
return 0;
undo_out:
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 10/11] mshv: Populate mmio mappings for PCI passthru
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
Upon guest access, in case of missing mmio mapping, the hypervisor
generates an unmapped gpa intercept. In this path, lookup the PCI
resource pfn for the guest gpa, and ask the hypervisor to map it
via hypercall. The PCI resource pfn is maintained by the VFIO driver,
and obtained via fixup_user_fault call (similar to KVM).
Also, VFIO no longer puts the mmio pfn in vma->vm_pgoff, so remove the
code that uses it to map mmio space: it is broken and will cause a
panic.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
drivers/hv/mshv_root_main.c | 113 ++++++++++++++++++++++++++++++------
1 file changed, 96 insertions(+), 17 deletions(-)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 6ceb5f608589..a7864463961b 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -46,6 +46,9 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
#endif
+static bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
+module_param(hv_nofull_mmio, bool, 0644);
+
struct mshv_root mshv_root;
enum hv_scheduler_type hv_scheduler_type;
@@ -641,6 +644,94 @@ mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
return region;
}
+/*
+ * Check if uaddr is for mmio range, i.e. lies in a VM_IO or VM_PFNMAP vma of
+ * current->mm. If yes, return 0 with *mmio_pfnp filled in with the pfn found
+ * for uaddr, else just return -errno (-EINVAL if there is no such vma).
+ *
+ * Takes and releases the mmap read lock of current->mm internally.
+ */
+static int mshv_chk_get_mmio_start_pfn(u64 uaddr, u64 *mmio_pfnp)
+{
+ struct vm_area_struct *vma;
+ bool is_mmio;
+ struct follow_pfnmap_args pfnmap_args;
+ int rc = -EINVAL;
+
+ mmap_read_lock(current->mm);
+ vma = vma_lookup(current->mm, uaddr);
+ is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
+ if (!is_mmio)
+ goto unlock_mmap_out; /* rc is still -EINVAL */
+
+ pfnmap_args.vma = vma;
+ pfnmap_args.address = uaddr;
+
+ rc = follow_pfnmap_start(&pfnmap_args);
+ if (rc) { /* first lookup failed: fault the address in, then retry once */
+ rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
+ NULL);
+ if (rc)
+ goto unlock_mmap_out;
+
+ rc = follow_pfnmap_start(&pfnmap_args);
+ if (rc)
+ goto unlock_mmap_out;
+ }
+
+ *mmio_pfnp = pfnmap_args.pfn;
+ follow_pfnmap_end(&pfnmap_args); /* only reached after a successful follow_pfnmap_start() */
+
+unlock_mmap_out:
+ mmap_read_unlock(current->mm);
+ return rc;
+}
+
+/*
+ * Check if the unmapped gpa belongs to mmio space. If yes, resolve it.
+ *
+ * By default the whole mmio region is mapped with one hypercall, starting at
+ * the region's first gfn; if the hv_nofull_mmio module parameter is set, only
+ * the faulting page is mapped.
+ *
+ * Returns: True if valid mmio intercept and handled, else false.
+ */
+static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
+{
+ struct hv_message *hvmsg = vp->vp_intercept_msg_page;
+ u64 gfn, uaddr, mmio_spa, numpgs;
+ struct mshv_mem_region *rg;
+ int rc = -EINVAL;
+ struct mshv_partition *pt = vp->vp_partition;
+#if defined(CONFIG_X86_64)
+ struct hv_x64_memory_intercept_message *msg =
+ (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
+#elif defined(CONFIG_ARM64)
+ struct hv_arm64_memory_intercept_message *msg =
+ (struct hv_arm64_memory_intercept_message *)hvmsg->u.payload;
+#endif
+
+ gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
+
+ rg = mshv_partition_region_by_gfn_get(pt, gfn);
+ if (rg == NULL)
+ return false;
+ if (rg->mreg_type != MSHV_REGION_TYPE_MMIO)
+ goto put_rg_out; /* rc is still -EINVAL, so false is returned */
+
+ uaddr = rg->start_uaddr + ((gfn - rg->start_gfn) << HV_HYP_PAGE_SHIFT);
+
+ rc = mshv_chk_get_mmio_start_pfn(uaddr, &mmio_spa);
+ if (rc)
+ goto put_rg_out;
+
+ if (!hv_nofull_mmio) { /* default case */
+ mmio_spa = mmio_spa - (gfn - rg->start_gfn); /* pfn for the region's first gfn; assumes the mmio range is physically contiguous - TODO confirm */
+ gfn = rg->start_gfn;
+ numpgs = rg->nr_pages;
+ } else
+ numpgs = 1;
+
+ rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
+
+put_rg_out:
+ mshv_region_put(rg); /* presumably drops the reference taken by mshv_partition_region_by_gfn_get() */
+ return rc == 0;
+}
+
/**
* mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
* @vp: Pointer to the virtual processor structure.
@@ -699,6 +790,8 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
{
switch (vp->vp_intercept_msg_page->header.message_type) {
+ case HVMSG_UNMAPPED_GPA:
+ return mshv_handle_unmapped_gpa(vp);
case HVMSG_GPA_INTERCEPT:
return mshv_handle_gpa_intercept(vp);
}
@@ -1322,16 +1415,8 @@ static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
}
/*
- * This maps two things: guest RAM and for pci passthru mmio space.
- *
- * mmio:
- * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
- * - Two things need to happen for mapping mmio range:
- * 1. mapped in the uaddr so VMM can access it.
- * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
- *
- * This function takes care of the second. The first one is managed by vfio,
- * and hence is taken care of via vfio_pci_mmap_fault().
+ * This is called for both user ram and mmio space. The mmio space is not
+ * mapped here, but later during intercept on demand.
*/
static long
mshv_map_user_memory(struct mshv_partition *partition,
@@ -1340,7 +1425,6 @@ mshv_map_user_memory(struct mshv_partition *partition,
struct mshv_mem_region *region;
struct vm_area_struct *vma;
bool is_mmio;
- ulong mmio_pfn;
long ret;
if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
@@ -1350,7 +1434,6 @@ mshv_map_user_memory(struct mshv_partition *partition,
mmap_read_lock(current->mm);
vma = vma_lookup(current->mm, mem->userspace_addr);
is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
- mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
mmap_read_unlock(current->mm);
if (!vma)
@@ -1376,11 +1459,7 @@ mshv_map_user_memory(struct mshv_partition *partition,
region->nr_pages,
HV_MAP_GPA_NO_ACCESS, NULL);
break;
- case MSHV_REGION_TYPE_MMIO:
- ret = hv_call_map_mmio_pages(partition->pt_id,
- region->start_gfn,
- mmio_pfn,
- region->nr_pages);
+ default:
break;
}
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 09/11] x86/hyperv: Implement hyperv virtual IOMMU
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
Add a new file to implement management of device domains, mapping and
unmapping of IOMMU memory, and other iommu_ops to fit within the VFIO
framework for PCI passthru on Hyper-V running Linux as baremetal root
or L1VH root. This also implements direct attach mechanism (see below),
a special feature of Hyper-V for PCI passthru, and it is also made to
work within the VFIO framework.
At a high level, during boot the hypervisor creates a default identity
domain and attaches all devices to it. This nicely maps to Linux IOMMU
subsystem IOMMU_DOMAIN_IDENTITY domain. As a result, Linux does not
need to explicitly ask Hyper-V to attach devices and do maps/unmaps
during boot. As mentioned previously, Hyper-V supports two ways to do
PCI passthru:
1. Device Domain (aka Domain Attach): root must create a device domain
in the hypervisor, and do map/unmap hypercalls for mapping and
unmapping guest RAM for DMA. All hypervisor communications use
device ID of type PCI for identifying and referencing the device.
2. Direct Attach: the hypervisor will simply use the guest's HW
page table for mappings, thus the root need not map/unmap guest
memory for DMA. As such, direct attach passthru setup during guest
boot is extremely fast. A direct attached device must always be
referenced via logical device ID and not via the PCI device ID.
At present, L1VH root only supports direct attaches. Also direct attach is
default in non-L1VH cases because there are some significant performance
issues with domain attach implementations currently for guests with higher
RAM (say more than 8GB), and that unfortunately cannot be addressed in
the short term.
Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
MAINTAINERS | 1 +
arch/x86/kernel/pci-dma.c | 2 +
drivers/iommu/Kconfig | 5 +-
drivers/iommu/Makefile | 1 +
drivers/iommu/hyperv-iommu-root.c | 908 ++++++++++++++++++++++++++++++
include/asm-generic/mshyperv.h | 17 +
include/linux/hyperv.h | 6 +
7 files changed, 937 insertions(+), 3 deletions(-)
create mode 100644 drivers/iommu/hyperv-iommu-root.c
diff --git a/MAINTAINERS b/MAINTAINERS
index f803a6a38fee..8ae040b89a56 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11914,6 +11914,7 @@ F: drivers/clocksource/hyperv_timer.c
F: drivers/hid/hid-hyperv.c
F: drivers/hv/
F: drivers/input/serio/hyperv-keyboard.c
+F: drivers/iommu/hyperv-iommu-root.c
F: drivers/iommu/hyperv-irq.c
F: drivers/net/ethernet/microsoft/
F: drivers/net/hyperv/
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 6267363e0189..cfeee6505e17 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -8,6 +8,7 @@
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/amd-iommu.h>
+#include <linux/hyperv.h>
#include <asm/proto.h>
#include <asm/dma.h>
@@ -105,6 +106,7 @@ void __init pci_iommu_alloc(void)
gart_iommu_hole_init();
amd_iommu_detect();
detect_intel_iommu();
+ hv_iommu_detect();
swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
}
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f86262b11416..7909cf4373a6 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -352,13 +352,12 @@ config MTK_IOMMU_V1
if unsure, say N here.
config HYPERV_IOMMU
- bool "Hyper-V IRQ Handling"
+ bool "Hyper-V IOMMU Unit"
depends on HYPERV && X86
select IOMMU_API
default HYPERV
help
- Stub IOMMU driver to handle IRQs to support Hyper-V Linux
- guest and root partitions.
+ Hyper-V pseudo IOMMU unit.
config VIRTIO_IOMMU
tristate "Virtio IOMMU driver"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 335ea77cced6..296fbc6ca829 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
obj-$(CONFIG_HYPERV) += hyperv-irq.o
+obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu-root.o
obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
diff --git a/drivers/iommu/hyperv-iommu-root.c b/drivers/iommu/hyperv-iommu-root.c
new file mode 100644
index 000000000000..739bbf39dea2
--- /dev/null
+++ b/drivers/iommu/hyperv-iommu-root.c
@@ -0,0 +1,908 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V root vIOMMU driver.
+ * Copyright (C) 2026, Microsoft, Inc.
+ */
+
+#include <linux/pci.h>
+#include <linux/dma-map-ops.h>
+#include <linux/interval_tree.h>
+#include <linux/hyperv.h>
+#include "dma-iommu.h"
+#include <asm/iommu.h>
+#include <asm/mshyperv.h>
+
+/* We will not claim these PCI devices, eg hypervisor needs it for debugger */
+static char *pci_devs_to_skip;
+/*
+ * Boot-parameter handler for "hv_iommu_skip=": remember the raw device list.
+ * The string is only stored here; it is parsed in hv_iommu_probe_device().
+ *
+ * Returns 1 to tell the boot code the option was consumed. A __setup()
+ * handler returning 0 marks the option as unhandled, so it would be
+ * reported as unknown and handed on to init.
+ */
+static int __init hv_iommu_setup_skip(char *str)
+{
+	pci_devs_to_skip = str;
+
+	return 1;
+}
+/* hv_iommu_skip=(SSSS:BB:DD.F)(SSSS:BB:DD.F) */
+__setup("hv_iommu_skip=", hv_iommu_setup_skip);
+
+bool hv_no_attdev; /* disable direct device attach for passthru */
+EXPORT_SYMBOL_GPL(hv_no_attdev);
+/*
+ * Boot-parameter handler for "hv_no_attdev": turn off direct device attach.
+ *
+ * Returns 1 to tell the boot code the option was consumed. A __setup()
+ * handler returning 0 marks the option as unhandled, so it would be
+ * reported as unknown and handed on to init.
+ */
+static int __init setup_hv_no_attdev(char *str)
+{
+	hv_no_attdev = true;
+
+	return 1;
+}
+__setup("hv_no_attdev", setup_hv_no_attdev);
+
+/* Iommu device that we export to the world. HyperV supports max of one */
+static struct iommu_device hv_virt_iommu;
+
+/*
+ * Driver-private domain state. The generic iommu_domain is embedded so the
+ * containing hv_domain can be recovered with container_of().
+ */
+struct hv_domain {
+ struct iommu_domain iommu_dom;
+ u32 domid_num; /* as opposed to domain_id.type */
+ bool attached_dom; /* is this direct attached dom? */
+ u64 partid; /* partition id */
+ spinlock_t mappings_lock; /* protects mappings_tree */
+ struct rb_root_cached mappings_tree; /* iova to pa lookup tree */
+};
+
+#define to_hv_domain(d) container_of(d, struct hv_domain, iommu_dom)
+
+/*
+ * One node of a domain's mappings_tree: an IOVA interval, the physical
+ * address that the start of the interval maps to, and the map flags it was
+ * created with.
+ */
+struct hv_iommu_mapping {
+ phys_addr_t paddr;
+ struct interval_tree_node iova;
+ u32 flags;
+};
+
+/*
+ * By default, during boot the hypervisor creates one Stage 2 (S2) default
+ * domain. Stage 2 means that the page table is controlled by the hypervisor.
+ * S2 default: access to entire root partition memory. This for us easily
+ * maps to IOMMU_DOMAIN_IDENTITY in the iommu subsystem, and
+ * is called HV_DEVICE_DOMAIN_ID_S2_DEFAULT in the hypervisor.
+ *
+ * Device Management:
+ * There are two ways to manage device attaches to domains:
+ * 1. Domain Attach: A device domain is created in the hypervisor, the
+ * device is attached to this domain, and then memory
+ * ranges are mapped in the map callbacks.
+ * 2. Direct Attach: No need to create a domain in the hypervisor for direct
+ * attached devices. A hypercall is made to tell the
+ * hypervisor to attach the device to a guest. There is
+ * no need for explicit memory mappings because the
+ * hypervisor will just use the guest HW page table.
+ *
+ * Since a direct attach is much faster, it is the default. This can be
+ * changed via hv_no_attdev.
+ *
+ * L1VH: hypervisor only supports direct attach.
+ */
+
+/*
+ * Create dummy domains to correspond to hypervisor prebuilt default identity
+ * and null domains (dummy because we do not make hypercalls to create them).
+ */
+static struct hv_domain hv_def_identity_dom;
+static struct hv_domain hv_null_dom;
+
+/* True for either of the two statically allocated placeholder domains. */
+static bool hv_special_domain(struct hv_domain *hvdom)
+{
+	if (hvdom == &hv_def_identity_dom)
+		return true;
+
+	return hvdom == &hv_null_dom;
+}
+
+/*
+ * Aperture covering the whole address space; copied into every domain this
+ * driver sets up. File-local and read-only: it is only ever used as the
+ * source of a struct assignment.
+ */
+static const struct iommu_domain_geometry default_geometry = {
+	.aperture_start = 0,
+	.aperture_end = -1UL,
+	.force_aperture = true,
+};
+
+/*
+ * Since the relevant hypercalls can only fit less than 512 PFNs in the pfn
+ * array, report 1M max.
+ */
+#define HV_IOMMU_PGSIZES (SZ_4K | SZ_1M)
+
+static u32 unique_id; /* unique numeric id of a new domain */
+
+static void hv_iommu_detach_dev(struct iommu_domain *immdom,
+ struct device *dev);
+static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom, ulong iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *gather);
+
+/*
+ * If the current thread is a VMM thread, return the partition id of the VM it
+ * is managing, else return HV_PARTITION_ID_INVALID.
+ */
+u64 hv_get_current_partid(void)
+{
+	/* Looked up at run time; NULL when the symbol cannot be resolved. */
+	u64 (*partid_fn)(void) = symbol_get(mshv_current_partid);
+	u64 partid = HV_PARTITION_ID_INVALID;
+
+	if (partid_fn) {
+		partid = partid_fn();
+		symbol_put(mshv_current_partid);
+	}
+
+	return partid;
+}
+EXPORT_SYMBOL_GPL(hv_get_current_partid);
+
+/* The caller is a VMM thread iff a valid guest partition id is reported */
+static bool hv_curr_thread_is_vmm(void)
+{
+	u64 partid = hv_get_current_partid();
+
+	return partid != HV_PARTITION_ID_INVALID;
+}
+
+/*
+ * True when the domain records a valid guest partition id, i.e. it was set
+ * up by a VMM rather than by some host app like SPDK. NULL yields false.
+ */
+static bool hv_dom_owner_is_vmm(struct hv_domain *hvdom)
+{
+	if (!hvdom)
+		return false;
+
+	return hvdom->partid != HV_PARTITION_ID_INVALID;
+}
+
+/* .capable callback: cache coherency is the only capability reported. */
+static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap)
+{
+	return cap == IOMMU_CAP_CACHE_COHERENCY;
+}
+
+/*
+ * Check if given pci device is a direct attached device. Caller must have
+ * verified pdev is a valid pci device.
+ */
+bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
+{
+	struct iommu_domain *immdom = iommu_get_domain_for_dev(&pdev->dev);
+
+	/* No domain at all means the device cannot be direct attached */
+	if (!immdom)
+		return false;
+
+	return to_hv_domain(immdom)->attached_dom;
+}
+EXPORT_SYMBOL_GPL(hv_pcidev_is_attached_dev);
+
+/*
+ * A device counts as passthru when its iommu private data points at a
+ * domain other than the two special (identity/null) domains.
+ */
+bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev)
+{
+	struct hv_domain *hvdom = dev_iommu_priv_get(&pdev->dev);
+
+	return hvdom && !hv_special_domain(hvdom);
+}
+EXPORT_SYMBOL_GPL(hv_pcidev_is_pthru_dev);
+
+/*
+ * Build the logical device id used for direct attached devices: the PCI
+ * segment in the bits above 16, the bus/device/function below.
+ */
+static u64 hv_build_devid_type_logical(struct pci_dev *pdev)
+{
+	u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn);
+	hv_pci_segment segment = pci_domain_nr(pdev->bus);
+	union hv_pci_bdf bdf = {.as_uint16 = 0};
+	union hv_device_id devid;
+
+	bdf.bus = PCI_BUS_NUM(rid);
+	bdf.device = PCI_SLOT(rid);
+	bdf.function = PCI_FUNC(rid);
+
+	devid.as_uint64 = 0;
+	devid.device_type = HV_DEVICE_TYPE_LOGICAL;
+	devid.logical.id = (u64)segment << 16 | bdf.as_uint16;
+
+	return devid.as_uint64;
+}
+
+/*
+ * Build the hypervisor device id of the requested @type for @pdev.
+ * Logical ids come from the VMBus helper on L1VH and from the segment/BDF
+ * builder otherwise; PCI ids are only built on x86. Any other type, and
+ * the PCI type on non-x86, yields 0.
+ */
+u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type)
+{
+	switch (type) {
+	case HV_DEVICE_TYPE_LOGICAL:
+		if (hv_l1vh_partition())
+			return hv_pci_vmbus_device_id(pdev);
+		return hv_build_devid_type_logical(pdev);
+	case HV_DEVICE_TYPE_PCI:
+#ifdef CONFIG_X86
+		return hv_build_devid_type_pci(pdev);
+#else
+		return 0;
+#endif
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(hv_build_devid_oftype);
+
+/* Create a new device domain in the hypervisor */
+/*
+ * Issue HVCALL_CREATE_DEVICE_DOMAIN for an S2 domain in the current
+ * partition, using hvdom->domid_num as the domain id.
+ *
+ * The per-CPU hypercall input buffer is filled and consumed with local
+ * interrupts disabled.
+ *
+ * Return: the hypercall status translated by hv_result_to_errno().
+ */
+static int hv_iommu_create_hyp_devdom(struct hv_domain *hvdom)
+{
+ u64 status;
+ struct hv_input_device_domain *ddp;
+ struct hv_input_create_device_domain *input;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ ddp = &input->device_domain;
+ ddp->partition_id = HV_PARTITION_ID_SELF;
+ ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+ ddp->domain_id.id = hvdom->domid_num;
+
+ input->create_device_domain_flags.forward_progress_required = 1;
+ input->create_device_domain_flags.inherit_owning_vtl = 0;
+
+ status = hv_do_hypercall(HVCALL_CREATE_DEVICE_DOMAIN, input, NULL);
+
+ local_irq_restore(flags);
+
+ /* Log failures here; the caller only sees the errno */
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+
+ return hv_result_to_errno(status);
+}
+
+/*
+ * .domain_alloc_paging callback: allocate a driver domain and give it the
+ * next numeric id. For a VMM-owned domain with direct attach enabled,
+ * nothing is created in the hypervisor; otherwise a hypervisor device
+ * domain is created. On L1VH, requests from non-VMM threads are rejected.
+ *
+ * Return: the embedded iommu_domain, or NULL on any failure.
+ */
+static struct iommu_domain *hv_iommu_domain_alloc_paging(struct device *dev)
+{
+	struct hv_domain *hvdom;
+
+	if (hv_l1vh_partition() && !hv_curr_thread_is_vmm()) {
+		pr_err("Hyper-V: l1vh iommu does not support host devices\n");
+		return NULL;
+	}
+
+	hvdom = kzalloc(sizeof(*hvdom), GFP_KERNEL);
+	if (!hvdom)
+		return NULL;
+
+	spin_lock_init(&hvdom->mappings_lock);
+	hvdom->mappings_tree = RB_ROOT_CACHED;
+
+	/*
+	 * Original note: called under iommu group mutex, so single threaded.
+	 * An id of HV_DEVICE_DOMAIN_ID_S2_DEFAULT (ie, 0) means the counter
+	 * wrapped; that id is never handed out.
+	 */
+	unique_id++;
+	if (unique_id == HV_DEVICE_DOMAIN_ID_S2_DEFAULT)
+		goto undo_alloc;
+
+	hvdom->domid_num = unique_id;
+	hvdom->partid = hv_get_current_partid();
+	hvdom->iommu_dom.geometry = default_geometry;
+	hvdom->iommu_dom.pgsize_bitmap = HV_IOMMU_PGSIZES;
+
+	/* For guests, by default we do direct attaches, so no domain in hyp */
+	if (hv_dom_owner_is_vmm(hvdom) && !hv_no_attdev) {
+		hvdom->attached_dom = true;
+	} else if (hv_iommu_create_hyp_devdom(hvdom)) {
+		goto undo_alloc;
+	}
+
+	return &hvdom->iommu_dom;
+
+undo_alloc:
+	unique_id--;
+	kfree(hvdom);
+	return NULL;
+}
+
+/*
+ * .free callback: delete the hypervisor device domain (when one was
+ * created) and free the driver's hv_domain. The two static special domains
+ * are left untouched.
+ */
+static void hv_iommu_domain_free(struct iommu_domain *immdom)
+{
+ struct hv_domain *hvdom = to_hv_domain(immdom);
+ unsigned long flags;
+ u64 status;
+ struct hv_input_delete_device_domain *input;
+
+ if (hv_special_domain(hvdom))
+ return;
+
+ /*
+ * Same condition under which hv_iommu_domain_alloc_paging() creates a
+ * hypervisor domain instead of marking the domain direct attached.
+ */
+ if (!hv_dom_owner_is_vmm(hvdom) || hv_no_attdev) {
+ struct hv_input_device_domain *ddp;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ ddp = &input->device_domain;
+ memset(input, 0, sizeof(*input));
+
+ ddp->partition_id = HV_PARTITION_ID_SELF;
+ ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+ ddp->domain_id.id = hvdom->domid_num;
+
+ status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input,
+ NULL);
+ local_irq_restore(flags);
+
+ /* A failed delete is only logged; the memory is freed regardless */
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+ }
+
+ kfree(hvdom);
+}
+
+/* Attach a device to a domain previously created in the hypervisor */
+/*
+ * Issue HVCALL_ATTACH_DEVICE_DOMAIN to attach @pdev to the S2 domain
+ * identified by hvdom->domid_num in the current partition.
+ *
+ * Return: the hypercall status translated by hv_result_to_errno().
+ */
+static int hv_iommu_att_dev2dom(struct hv_domain *hvdom, struct pci_dev *pdev)
+{
+ unsigned long flags;
+ u64 status;
+ enum hv_device_type dev_type;
+ struct hv_input_attach_device_domain *input;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->device_domain.partition_id = HV_PARTITION_ID_SELF;
+ input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+ input->device_domain.domain_id.id = hvdom->domid_num;
+
+ /* NB: Upon guest shutdown, device is re-attached to the default domain
+ * without explicit detach.
+ */
+ /* L1VH identifies the device by logical id, everything else by PCI id */
+ if (hv_l1vh_partition())
+ dev_type = HV_DEVICE_TYPE_LOGICAL;
+ else
+ dev_type = HV_DEVICE_TYPE_PCI;
+
+ input->device_id.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
+
+ status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+
+ return hv_result_to_errno(status);
+}
+
+/* Caller must have validated that dev is a valid pci dev */
+/*
+ * Direct attach @pdev to the guest partition @ptid via
+ * HVCALL_ATTACH_DEVICE. The hypercall is retried for as long as it reports
+ * HV_STATUS_INSUFFICIENT_MEMORY and hv_call_deposit_pages() succeeds.
+ *
+ * Return: -EINVAL for an invalid @ptid, otherwise the last hypercall
+ * status translated by hv_result_to_errno().
+ */
+static int hv_iommu_direct_attach_device(struct pci_dev *pdev, u64 ptid)
+{
+ struct hv_input_attach_device *input;
+ u64 status;
+ int rc;
+ unsigned long flags;
+ union hv_device_id host_devid;
+ enum hv_device_type dev_type;
+
+ if (ptid == HV_PARTITION_ID_INVALID) {
+ pr_err("Hyper-V: Invalid partition id in direct attach\n");
+ return -EINVAL;
+ }
+
+ if (hv_l1vh_partition())
+ dev_type = HV_DEVICE_TYPE_LOGICAL;
+ else
+ dev_type = HV_DEVICE_TYPE_PCI;
+
+ host_devid.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
+
+ do {
+ /* The input buffer is rebuilt on every attempt */
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = ptid;
+ input->device_id = host_devid;
+
+ /* Hypervisor associates logical_id with this device, and in
+ * some hypercalls like retarget interrupts, logical_id must be
+ * used instead of the BDF. It is a required parameter.
+ */
+ input->attdev_flags.logical_id = 1;
+ input->logical_devid =
+ hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_LOGICAL);
+
+ status = hv_do_hypercall(HVCALL_ATTACH_DEVICE, input, NULL);
+ local_irq_restore(flags);
+
+ /* Deposit memory for the partition, then retry; give up if that fails */
+ if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
+ rc = hv_call_deposit_pages(NUMA_NO_NODE, ptid, 1);
+ if (rc)
+ break;
+ }
+ } while (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY);
+
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+
+ return hv_result_to_errno(status);
+}
+
+/* Attach a device for passthru to guest VMs, host apps like SPDK, etc */
+/*
+ * .attach_dev callback, shared by paging and special domains.
+ *
+ * @immdom: domain to attach to
+ * @dev:    device being attached; must be a PCI device
+ * @old:    not used by this function
+ *
+ * The device's previous domain is taken from its iommu private data, which
+ * is updated to the new domain only when the attach succeeds.
+ *
+ * Return: 0 on success, -EINVAL for non-PCI devices or for a non-direct
+ * paging domain on L1VH, otherwise the errno from the attach helper.
+ */
+static int hv_iommu_attach_dev(struct iommu_domain *immdom, struct device *dev,
+ struct iommu_domain *old)
+{
+ struct pci_dev *pdev;
+ int rc;
+ struct hv_domain *hvdom_new = to_hv_domain(immdom);
+ struct hv_domain *hvdom_prev = dev_iommu_priv_get(dev);
+
+ /* Only allow PCI devices for now */
+ if (!dev_is_pci(dev))
+ return -EINVAL;
+
+ pdev = to_pci_dev(dev);
+
+ /* On L1VH a non-special domain must be a direct attached one */
+ if (hv_l1vh_partition() && !hv_special_domain(hvdom_new) &&
+ !hvdom_new->attached_dom)
+ return -EINVAL;
+
+ /* VFIO does not do explicit detach calls, hence check first if we need
+ * to detach first. Also, in case of guest shutdown, it's the VMM
+ * thread that attaches it back to the hv_def_identity_dom, and
+ * hvdom_prev will not be null then. It is null during boot.
+ */
+ if (hvdom_prev)
+ if (!hv_l1vh_partition() || !hv_special_domain(hvdom_prev))
+ hv_iommu_detach_dev(&hvdom_prev->iommu_dom, dev);
+
+ /* On L1VH, attaching to a special domain only records the domain */
+ if (hv_l1vh_partition() && hv_special_domain(hvdom_new)) {
+ dev_iommu_priv_set(dev, hvdom_new); /* sets "private" field */
+ return 0;
+ }
+
+ if (hvdom_new->attached_dom)
+ rc = hv_iommu_direct_attach_device(pdev, hvdom_new->partid);
+ else
+ rc = hv_iommu_att_dev2dom(hvdom_new, pdev);
+
+ if (rc == 0)
+ dev_iommu_priv_set(dev, hvdom_new); /* sets "private" field */
+
+ return rc;
+}
+
+/*
+ * Issue HVCALL_DETACH_DEVICE to detach @pdev, identified by its logical
+ * device id, from the guest partition @ptid. Failures are only logged.
+ */
+static void hv_iommu_det_dev_from_guest(struct pci_dev *pdev, u64 ptid)
+{
+ struct hv_input_detach_device *input;
+ u64 status, log_devid;
+ unsigned long flags;
+
+ /* Built before interrupts are disabled for the hypercall */
+ log_devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_LOGICAL);
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->partition_id = ptid;
+ input->logical_devid = log_devid;
+ status = hv_do_hypercall(HVCALL_DETACH_DEVICE, input, NULL);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+}
+
+/*
+ * Issue HVCALL_DETACH_DEVICE_DOMAIN for @pdev, identified by its PCI-type
+ * device id, in the current partition. Failures are only logged.
+ */
+static void hv_iommu_det_dev_from_dom(struct pci_dev *pdev)
+{
+ u64 status, devid;
+ unsigned long flags;
+ struct hv_input_detach_device_domain *input;
+
+ /* Built before interrupts are disabled for the hypercall */
+ devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_PCI);
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->device_id.as_uint64 = devid;
+ status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+}
+
+/*
+ * Detach @dev from @immdom: direct attached domains detach the device from
+ * the guest partition, all others detach it from its hypervisor device
+ * domain. Non-PCI devices are ignored.
+ */
+static void hv_iommu_detach_dev(struct iommu_domain *immdom, struct device *dev)
+{
+	struct hv_domain *hvdom = to_hv_domain(immdom);
+
+	/* See the attach function, only PCI devices for now */
+	if (!dev_is_pci(dev))
+		return;
+
+	if (!hvdom->attached_dom) {
+		hv_iommu_det_dev_from_dom(to_pci_dev(dev));
+		return;
+	}
+
+	/*
+	 * Do not reset attached_dom, hv_iommu_unmap_pages happens
+	 * next.
+	 */
+	hv_iommu_det_dev_from_guest(to_pci_dev(dev), hvdom->partid);
+}
+
+/*
+ * Record the mapping [iova, iova + size - 1] -> paddr in the domain's
+ * interval tree.
+ *
+ * Return: 0 on success, -ENOMEM if the node cannot be allocated.
+ */
+static int hv_iommu_add_tree_mapping(struct hv_domain *hvdom,
+				     unsigned long iova, phys_addr_t paddr,
+				     size_t size, u32 flags)
+{
+	struct hv_iommu_mapping *new_map;
+	unsigned long lock_flags;
+
+	new_map = kzalloc(sizeof(*new_map), GFP_ATOMIC);
+	if (new_map == NULL)
+		return -ENOMEM;
+
+	new_map->iova.start = iova;
+	new_map->iova.last = iova + size - 1;
+	new_map->paddr = paddr;
+	new_map->flags = flags;
+
+	spin_lock_irqsave(&hvdom->mappings_lock, lock_flags);
+	interval_tree_insert(&new_map->iova, &hvdom->mappings_tree);
+	spin_unlock_irqrestore(&hvdom->mappings_lock, lock_flags);
+
+	return 0;
+}
+
+/*
+ * Remove and free the tree nodes overlapping [iova, iova + size - 1].
+ * Removal stops at the first node that starts below @iova, because
+ * splitting a mapping is not supported.
+ *
+ * Return: total number of bytes covered by the removed nodes.
+ */
+static size_t hv_iommu_del_tree_mappings(struct hv_domain *hvdom,
+					 unsigned long iova, size_t size)
+{
+	struct interval_tree_node *cur, *following;
+	struct hv_iommu_mapping *map;
+	unsigned long end = iova + size - 1;
+	unsigned long lock_flags;
+	size_t freed = 0;
+
+	spin_lock_irqsave(&hvdom->mappings_lock, lock_flags);
+
+	for (cur = interval_tree_iter_first(&hvdom->mappings_tree, iova, end);
+	     cur; cur = following) {
+		map = container_of(cur, struct hv_iommu_mapping, iova);
+		/* Fetch the successor first; cur may be freed below */
+		following = interval_tree_iter_next(cur, iova, end);
+
+		/* Trying to split a mapping? Not supported for now. */
+		if (map->iova.start < iova)
+			break;
+
+		freed += map->iova.last - map->iova.start + 1;
+
+		interval_tree_remove(cur, &hvdom->mappings_tree);
+		kfree(map);
+	}
+
+	spin_unlock_irqrestore(&hvdom->mappings_lock, lock_flags);
+
+	return freed;
+}
+
+/*
+ * Issue one HVCALL_MAP_DEVICE_GPA_PAGES rep hypercall mapping physically
+ * contiguous pages starting at @paddr to @iova in @hvdom's S2 domain.
+ *
+ * The PFN list is written into the per-CPU hypercall input buffer
+ * (assumed to be HV_HYP_PAGE_SIZE bytes), so only as many PFNs as fit
+ * after the fixed header are submitted per call; a larger @npages is
+ * clamped rather than written past the buffer. Callers must use
+ * hv_repcomp() on the returned status to learn how many pages were
+ * processed and call again for the remainder.
+ *
+ * Return: must return exact status from the hypercall without changes
+ */
+static u64 hv_iommu_map_pgs(struct hv_domain *hvdom,
+			    unsigned long iova, phys_addr_t paddr,
+			    unsigned long npages, u32 map_flags)
+{
+	u64 status;
+	unsigned long i;
+	struct hv_input_map_device_gpa_pages *input;
+	unsigned long flags, pfn;
+	const unsigned long max_pfns =
+		(HV_HYP_PAGE_SIZE -
+		 offsetof(struct hv_input_map_device_gpa_pages, gpa_page_list)) /
+		sizeof(input->gpa_page_list[0]);
+
+	/* Never write more PFNs than the input buffer can hold */
+	if (npages > max_pfns)
+		npages = max_pfns;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
+	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	input->device_domain.domain_id.id = hvdom->domid_num;
+	input->map_flags = map_flags;
+	input->target_device_va_base = iova;
+
+	pfn = paddr >> HV_HYP_PAGE_SHIFT;
+	for (i = 0; i < npages; i++, pfn++)
+		input->gpa_page_list[i] = pfn;
+
+	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_GPA_PAGES, npages, 0,
+				     input, NULL);
+
+	local_irq_restore(flags);
+	return status;
+}
+
+/*
+ * The core VFIO code loops over memory ranges calling this function with
+ * the largest size from HV_IOMMU_PGSIZES. cond_resched() is in vfio_iommu_map.
+ */
+/*
+ * .map_pages callback: map @pgcount pages of @pgsize bytes at @iova to
+ * @paddr.
+ *
+ * The range is always recorded in the domain's lookup tree. For direct
+ * attached domains that is all that happens. Otherwise the range is mapped
+ * in the hypervisor in 4K units, looping on the rep-completed count and
+ * depositing memory whenever the hypervisor reports it has run out.
+ *
+ * On failure everything mapped so far is unmapped again and the tree entry
+ * is removed. @gfp is not used.
+ *
+ * @mapped, when non-NULL, receives the number of bytes mapped (the whole
+ * range or 0).
+ *
+ * Return: 0 on success, -ENOMEM if the tree node cannot be allocated,
+ * otherwise the last hypercall status translated by hv_result_to_errno().
+ */
+static int hv_iommu_map_pages(struct iommu_domain *immdom, ulong iova,
+			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
+			      int prot, gfp_t gfp, size_t *mapped)
+{
+	u32 map_flags;
+	int ret;
+	/* Stays "success" if the loop below has nothing to map */
+	u64 status = HV_STATUS_SUCCESS;
+	unsigned long npages, done = 0;
+	struct hv_domain *hvdom = to_hv_domain(immdom);
+	size_t size = pgsize * pgcount;
+
+	map_flags = HV_MAP_GPA_READABLE;	/* required */
+	map_flags |= prot & IOMMU_WRITE ? HV_MAP_GPA_WRITABLE : 0;
+
+	ret = hv_iommu_add_tree_mapping(hvdom, iova, paddr, size, map_flags);
+	if (ret)
+		return ret;
+
+	if (hvdom->attached_dom) {
+		/* Guard like every other store through @mapped below */
+		if (mapped)
+			*mapped = size;
+		return 0;
+	}
+
+	npages = size >> HV_HYP_PAGE_SHIFT;
+	while (done < npages) {
+		ulong completed, remain = npages - done;
+
+		status = hv_iommu_map_pgs(hvdom, iova, paddr, remain,
+					  map_flags);
+
+		/* Advance past whatever the hypervisor did get through */
+		completed = hv_repcomp(status);
+		done = done + completed;
+		iova = iova + (completed << HV_HYP_PAGE_SHIFT);
+		paddr = paddr + (completed << HV_HYP_PAGE_SHIFT);
+
+		if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
+			ret = hv_call_deposit_pages(NUMA_NO_NODE,
+						    hv_current_partition_id,
+						    256);
+			if (ret)
+				break;
+			continue;
+		}
+		if (!hv_result_success(status))
+			break;
+	}
+
+	if (!hv_result_success(status)) {
+		size_t done_size = done << HV_HYP_PAGE_SHIFT;
+
+		hv_status_err(status, "pgs:%lx/%lx iova:%lx\n",
+			      done, npages, iova);
+		/*
+		 * lookup tree has all mappings [0 - size-1]. Below unmap will
+		 * only remove from [0 - done], we need to remove second chunk
+		 * [done+1 - size-1].
+		 */
+		hv_iommu_del_tree_mappings(hvdom, iova, size - done_size);
+		hv_iommu_unmap_pages(immdom, iova - done_size, HV_HYP_PAGE_SIZE,
+				     done, NULL);
+		if (mapped)
+			*mapped = 0;
+	} else if (mapped) {
+		*mapped = size;
+	}
+
+	return hv_result_to_errno(status);
+}
+
+/*
+ * .unmap_pages callback: drop @pgcount pages of @pgsize bytes starting at
+ * @iova from the lookup tree and, unless the domain is direct attached,
+ * unmap them in the hypervisor in 4K units. @gather is not used.
+ *
+ * Return: the full requested size for direct attached domains, otherwise
+ * the number of bytes removed from the lookup tree. A failed hypercall is
+ * only logged.
+ */
+static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom, ulong iova,
+				   size_t pgsize, size_t pgcount,
+				   struct iommu_iotlb_gather *gather)
+{
+	unsigned long flags, npages;
+	struct hv_input_unmap_device_gpa_pages *input;
+	u64 status;
+	struct hv_domain *hvdom = to_hv_domain(immdom);
+	size_t unmapped, size = pgsize * pgcount;
+
+	unmapped = hv_iommu_del_tree_mappings(hvdom, iova, size);
+	if (unmapped < size)
+		/* size_t takes the 'z' length modifier, not 'l' */
+		pr_err("%s: could not delete all mappings (%lx:%zx/%zx)\n",
+		       __func__, iova, unmapped, size);
+
+	if (hvdom->attached_dom)
+		return size;
+
+	npages = size >> HV_HYP_PAGE_SHIFT;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
+	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	input->device_domain.domain_id.id = hvdom->domid_num;
+	input->target_device_va_base = iova;
+
+	status = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_GPA_PAGES, npages,
+				     0, input, NULL);
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+
+	return unmapped;
+}
+
+/*
+ * .iova_to_phys callback: translate @iova using the domain's lookup tree.
+ *
+ * Return: the physical address, or 0 when no recorded mapping covers @iova.
+ */
+static phys_addr_t hv_iommu_iova_to_phys(struct iommu_domain *immdom,
+					 dma_addr_t iova)
+{
+	struct hv_domain *hvdom = to_hv_domain(immdom);
+	struct interval_tree_node *hit;
+	unsigned long lock_flags;
+	u64 phys = 0;
+
+	spin_lock_irqsave(&hvdom->mappings_lock, lock_flags);
+
+	hit = interval_tree_iter_first(&hvdom->mappings_tree, iova, iova);
+	if (hit) {
+		struct hv_iommu_mapping *map;
+
+		map = container_of(hit, struct hv_iommu_mapping, iova);
+		/* Same offset into the mapping on both sides */
+		phys = map->paddr + (iova - map->iova.start);
+	}
+
+	spin_unlock_irqrestore(&hvdom->mappings_lock, lock_flags);
+
+	return phys;
+}
+
+/*
+ * Currently, hypervisor does not provide list of devices it is using
+ * dynamically. So use this to allow users to manually specify devices that
+ * should be skipped. (eg. hypervisor debugger using some network device).
+ */
+/*
+ * .probe_device callback: claim every PCI device except those listed in
+ * the "hv_iommu_skip=" boot parameter, a sequence of "(SSSS:BB:DD.F)"
+ * entries in hex. Parsing stops at the first malformed entry.
+ *
+ * Return: the driver's iommu_device, or ERR_PTR(-ENODEV) for non-PCI and
+ * skipped devices.
+ */
+static struct iommu_device *hv_iommu_probe_device(struct device *dev)
+{
+	if (!dev_is_pci(dev))
+		return ERR_PTR(-ENODEV);
+
+	if (pci_devs_to_skip && *pci_devs_to_skip) {
+		int rc, pos = 0;
+		int parsed;
+		unsigned int segment, bus, slot, func;	/* %x targets */
+		struct pci_dev *pdev = to_pci_dev(dev);
+
+		do {
+			parsed = 0;
+
+			rc = sscanf(pci_devs_to_skip + pos, " (%x:%x:%x.%x) %n",
+				    &segment, &bus, &slot, &func, &parsed);
+			/*
+			 * sscanf() returns the number of converted fields
+			 * (%n is not counted), so a well-formed entry yields
+			 * exactly 4. Anything else ends the scan.
+			 */
+			if (rc != 4)
+				break;
+			/* %n untouched: the closing ')' did not match */
+			if (parsed <= 0)
+				break;
+
+			if (pci_domain_nr(pdev->bus) == segment &&
+			    pdev->bus->number == bus &&
+			    PCI_SLOT(pdev->devfn) == slot &&
+			    PCI_FUNC(pdev->devfn) == func) {
+
+				dev_info(dev, "skipped by Hyper-V IOMMU\n");
+				return ERR_PTR(-ENODEV);
+			}
+			pos += parsed;
+
+		} while (pci_devs_to_skip[pos]);
+	}
+
+	/* Device will be explicitly attached to the default domain, so no need
+	 * to do dev_iommu_priv_set() here.
+	 */
+
+	return &hv_virt_iommu;
+}
+
+/*
+ * .probe_finalize callback: install the iommu DMA ops when the device
+ * ended up in a DMA domain, otherwise clear the device's DMA ops.
+ */
+static void hv_iommu_probe_finalize(struct device *dev)
+{
+	struct iommu_domain *dom = iommu_get_domain_for_dev(dev);
+
+	if (!dom || dom->type != IOMMU_DOMAIN_DMA) {
+		set_dma_ops(dev, NULL);
+		return;
+	}
+
+	iommu_setup_dma_ops(dev, dom);
+}
+
+/*
+ * .release_device callback: detach the device from the domain recorded in
+ * its iommu private data (if any), then clear that data and the DMA ops.
+ */
+static void hv_iommu_release_device(struct device *dev)
+{
+	struct hv_domain *cur_dom = dev_iommu_priv_get(dev);
+
+	/* Need to detach device from device domain if necessary. */
+	if (cur_dom != NULL)
+		hv_iommu_detach_dev(&cur_dom->iommu_dom, dev);
+
+	dev_iommu_priv_set(dev, NULL);
+	set_dma_ops(dev, NULL);
+}
+
+/* .device_group callback: PCI grouping for PCI devices, generic otherwise. */
+static struct iommu_group *hv_iommu_device_group(struct device *dev)
+{
+	return dev_is_pci(dev) ? pci_device_group(dev) :
+				 generic_device_group(dev);
+}
+
+/* .def_domain_type callback: every device defaults to the identity domain. */
+static int hv_iommu_def_domain_type(struct device *dev)
+{
+ /* The hypervisor always creates this by default during boot */
+ return IOMMU_DOMAIN_IDENTITY;
+}
+
+/*
+ * Callback table handed to iommu_device_register() in hv_iommu_init().
+ * Paging domains use the embedded default_domain_ops. The identity and
+ * blocked domains are the two static special domains, which get their own
+ * ops in hv_initialize_special_domains().
+ */
+static struct iommu_ops hv_iommu_ops = {
+ .capable = hv_iommu_capable,
+ .domain_alloc_paging = hv_iommu_domain_alloc_paging,
+ .probe_device = hv_iommu_probe_device,
+ .probe_finalize = hv_iommu_probe_finalize,
+ .release_device = hv_iommu_release_device,
+ .def_domain_type = hv_iommu_def_domain_type,
+ .device_group = hv_iommu_device_group,
+ .default_domain_ops = &(const struct iommu_domain_ops) {
+ .attach_dev = hv_iommu_attach_dev,
+ .map_pages = hv_iommu_map_pages,
+ .unmap_pages = hv_iommu_unmap_pages,
+ .iova_to_phys = hv_iommu_iova_to_phys,
+ .free = hv_iommu_domain_free,
+ },
+ .owner = THIS_MODULE,
+ .identity_domain = &hv_def_identity_dom.iommu_dom,
+ .blocked_domain = &hv_null_dom.iommu_dom,
+};
+
+/* Ops for the static identity and null domains: only attach is provided. */
+static const struct iommu_domain_ops hv_special_domain_ops = {
+ .attach_dev = hv_iommu_attach_dev,
+};
+
+/*
+ * Fill in the two static placeholder domains (identity and blocked). No
+ * hypercall is made here.
+ */
+static void __init hv_initialize_special_domains(void)
+{
+	struct iommu_domain *identity = &hv_def_identity_dom.iommu_dom;
+	struct iommu_domain *blocked = &hv_null_dom.iommu_dom;
+
+	identity->type = IOMMU_DOMAIN_IDENTITY;
+	identity->ops = &hv_special_domain_ops;
+	identity->owner = &hv_iommu_ops;
+	identity->geometry = default_geometry;
+	hv_def_identity_dom.domid_num = HV_DEVICE_DOMAIN_ID_S2_DEFAULT; /* 0 */
+
+	blocked->type = IOMMU_DOMAIN_BLOCKED;
+	blocked->ops = &hv_special_domain_ops;
+	blocked->owner = &hv_iommu_ops;
+	blocked->geometry = default_geometry;
+	hv_null_dom.domid_num = HV_DEVICE_DOMAIN_ID_S2_NULL; /* INTMAX */
+}
+
+/*
+ * Installed as x86_init.iommu.iommu_init by hv_iommu_detect(): add the
+ * sysfs entry, prepare the special domains and register the iommu device.
+ *
+ * Return: 0 on success, -ENODEV when Hyper-V is not initialized, otherwise
+ * the error from the failing registration step.
+ */
+static int __init hv_iommu_init(void)
+{
+	int err;
+
+	if (!hv_is_hyperv_initialized())
+		return -ENODEV;
+
+	err = iommu_device_sysfs_add(&hv_virt_iommu, NULL, NULL, "%s",
+				     "hyperv-iommu");
+	if (err) {
+		pr_err("Hyper-V: iommu_device_sysfs_add failed: %d\n", err);
+		return err;
+	}
+
+	/* This must come before iommu_device_register because the latter calls
+	 * into the hooks.
+	 */
+	hv_initialize_special_domains();
+
+	err = iommu_device_register(&hv_virt_iommu, &hv_iommu_ops, NULL);
+	if (err) {
+		pr_err("Hyper-V: iommu_device_register failed: %d\n", err);
+		/* Undo the sysfs entry added above */
+		iommu_device_sysfs_remove(&hv_virt_iommu);
+		return err;
+	}
+
+	pr_info("Hyper-V IOMMU initialized\n");
+
+	return 0;
+}
+
+/*
+ * Called from pci_iommu_alloc(): if no IOMMU was found or disabled
+ * earlier, and the hypervisor offers device domains (or this is an L1VH
+ * partition), mark an IOMMU as detected and install hv_iommu_init().
+ */
+void __init hv_iommu_detect(void)
+{
+	if (no_iommu || iommu_detected)
+		return;
+
+	/* For l1vh, always expose an iommu unit */
+	if (!hv_l1vh_partition() &&
+	    !(ms_hyperv.misc_features & HV_DEVICE_DOMAIN_AVAILABLE))
+		return;
+
+	iommu_detected = 1;
+	x86_init.iommu.iommu_init = hv_iommu_init;
+
+	pci_request_acs();
+}
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index a6878ab685e7..fca5ed68b5c2 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -337,6 +337,23 @@ static inline u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
{ return 0; }
#endif /* IS_ENABLED(CONFIG_PCI_HYPERV) */
+#if IS_ENABLED(CONFIG_HYPERV_IOMMU)
+u64 hv_get_current_partid(void);
+bool hv_pcidev_is_attached_dev(struct pci_dev *pdev);
+bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev);
+u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type);
+#else
+static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
+{ return false; }
+static inline bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev)
+{ return false; }
+static inline u64 hv_build_devid_oftype(struct pci_dev *pdev,
+ enum hv_device_type type)
+{ return 0; }
+static inline u64 hv_get_current_partid(void)
+{ return HV_PARTITION_ID_INVALID; }
+#endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
+
#if IS_ENABLED(CONFIG_MSHV_ROOT)
static inline bool hv_root_partition(void)
{
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 5459e776ec17..6eee1cbf6f23 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1769,4 +1769,10 @@ static inline unsigned long virt_to_hvpfn(void *addr)
#define HVPFN_DOWN(x) ((x) >> HV_HYP_PAGE_SHIFT)
#define page_to_hvpfn(page) (page_to_pfn(page) * NR_HV_HYP_PAGES_IN_PAGE)
+#ifdef CONFIG_HYPERV_IOMMU
+void __init hv_iommu_detect(void);
+#else
+static inline void hv_iommu_detect(void) { }
+#endif /* CONFIG_HYPERV_IOMMU */
+
#endif /* _HYPERV_H */
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 08/11] PCI: hv: Build device id for a VMBus device, export PCI devid function
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
On Hyper-V, most hypercalls related to PCI passthru to map/unmap regions,
interrupts, etc need a device ID as a parameter. This device ID refers
to that specific device during the lifetime of passthru.
An L1VH VM only contains VMBus based devices. A device ID for a VMBus
device is slightly different in that it uses the hv_pcibus_device info
for building it to make sure it matches exactly what the hypervisor
expects. This VMBus based device ID is needed when attaching devices in
an L1VH based guest VM. Before building it, a check is done to make sure
the device is a valid VMBus device.
In the remaining cases, the PCI device ID is used, so also make the PCI
device ID build function hv_build_devid_type_pci() public.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
arch/x86/hyperv/irqdomain.c | 9 +++++----
arch/x86/include/asm/mshyperv.h | 6 ++++++
drivers/pci/controller/pci-hyperv.c | 24 ++++++++++++++++++++++++
include/asm-generic/mshyperv.h | 8 ++++++++
4 files changed, 43 insertions(+), 4 deletions(-)
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index b3ad50a874dc..8780573a4332 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -112,7 +112,7 @@ static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
return 0;
}
-static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
+u64 hv_build_devid_type_pci(struct pci_dev *pdev)
{
int pos;
union hv_device_id hv_devid;
@@ -172,8 +172,9 @@ static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
}
out:
- return hv_devid;
+ return hv_devid.as_uint64;
}
+EXPORT_SYMBOL_GPL(hv_build_devid_type_pci);
/*
* hv_map_msi_interrupt() - Map the MSI IRQ in the hypervisor.
@@ -196,7 +197,7 @@ int hv_map_msi_interrupt(struct irq_data *data,
msidesc = irq_data_get_msi_desc(data);
pdev = msi_desc_to_pci_dev(msidesc);
- hv_devid = hv_build_devid_type_pci(pdev);
+ hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
return hv_map_interrupt(hv_devid, false, cpu, cfg->vector,
@@ -271,7 +272,7 @@ static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
{
union hv_device_id hv_devid;
- hv_devid = hv_build_devid_type_pci(pdev);
+ hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
}
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index f64393e853ee..9d24cafed657 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -271,6 +271,12 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
#endif /* CONFIG_HYPERV */
+#if IS_ENABLED(CONFIG_HYPERV_IOMMU)
+u64 hv_build_devid_type_pci(struct pci_dev *pdev);
+#else
+/*
+ * Stub for builds without the Hyper-V IOMMU. It must be static inline: a
+ * plain function definition in a header is emitted by every file that
+ * includes it and fails to link with multiple definitions.
+ */
+static inline u64 hv_build_devid_type_pci(struct pci_dev *pdev)
+{ return 0; }
+#endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
+
struct mshv_vtl_cpu_context {
union {
struct {
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index cfc8fa403dad..50d793ca8f31 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -573,6 +573,7 @@ struct hv_pci_compl {
};
static void hv_pci_onchannelcallback(void *context);
+static bool hv_vmbus_pci_device(struct pci_bus *pbus);
#ifdef CONFIG_X86
#define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED
@@ -1005,6 +1006,24 @@ static struct irq_domain *hv_pci_get_root_domain(void)
static void hv_arch_irq_unmask(struct irq_data *data) { }
#endif /* CONFIG_ARM64 */
+/*
+ * Build the device id for a PCI device on a VMBus PCI bus from bytes 4-7
+ * of the VMBus device instance GUID and the PCI function number.
+ *
+ * Return: the device id, or 0 when @pdev is not on a bus owned by this
+ * driver.
+ */
+u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
+{
+	struct hv_pcibus_device *hbus;
+	struct pci_bus *pbus = pdev->bus;
+	const u8 *inst;
+
+	if (!hv_vmbus_pci_device(pbus))
+		return 0;
+
+	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
+	inst = hbus->hdev->dev_instance.b;
+
+	/*
+	 * Widen before shifting: a u8 operand is promoted to signed int, so
+	 * a byte >= 0x80 shifted left by 24 overflows int and would be
+	 * sign-extended into the upper 32 bits of the u64 result.
+	 */
+	return ((u64)inst[5] << 24) |
+	       ((u64)inst[4] << 16) |
+	       ((u64)inst[7] << 8) |
+	       (u64)(inst[6] & 0xf8) |
+	       PCI_FUNC(pdev->devfn);
+}
+EXPORT_SYMBOL_GPL(hv_pci_vmbus_device_id);
+
/**
* hv_pci_generic_compl() - Invoked for a completion packet
* @context: Set up by the sender of the packet.
@@ -1403,6 +1422,11 @@ static struct pci_ops hv_pcifront_ops = {
.write = hv_pcifront_write_config,
};
+/* A bus belongs to this driver when its config ops are hv_pcifront_ops. */
+static bool hv_vmbus_pci_device(struct pci_bus *pbus)
+{
+ return pbus->ops == &hv_pcifront_ops;
+}
+
/*
* Paravirtual backchannel
*
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index e8cbc4e3f7ad..a6878ab685e7 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -23,6 +23,7 @@
#include <acpi/acpi_numa.h>
#include <linux/cpumask.h>
#include <linux/nmi.h>
+#include <linux/pci.h>
#include <asm/ptrace.h>
#include <hyperv/hvhdk.h>
@@ -329,6 +330,13 @@ static inline enum hv_isolation_type hv_get_isolation_type(void)
}
#endif /* CONFIG_HYPERV */
+#if IS_ENABLED(CONFIG_PCI_HYPERV)
+u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
+#else
+static inline u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
+{ return 0; }
+#endif /* IS_ENABLED(CONFIG_PCI_HYPERV) */
+
#if IS_ENABLED(CONFIG_MSHV_ROOT)
static inline bool hv_root_partition(void)
{
--
2.51.2.vfs.0.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox