* [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
@ 2025-07-04 13:30 Lu Baolu
2025-07-04 13:38 ` Jason Gunthorpe
` (3 more replies)
0 siblings, 4 replies; 23+ messages in thread
From: Lu Baolu @ 2025-07-04 13:30 UTC (permalink / raw)
To: Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jason Gunthorpe, Jann Horn, Vasant Hegde, Dave Hansen,
Alistair Popple, Peter Zijlstra, Uladzislau Rezki,
Jean-Philippe Brucker, Andy Lutomirski
Cc: iommu, security, linux-kernel, Lu Baolu, stable
The vmalloc() and vfree() functions manage virtually contiguous, but not
necessarily physically contiguous, kernel memory regions. When vfree()
unmaps such a region, it tears down the associated kernel page table
entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware
shares and walks the CPU's page tables. Architectures like x86 share
static kernel address mappings across all user page tables, allowing the
IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance,
even for intermediate page table levels. If kernel page table mappings are
changed (e.g., by vfree()), but the IOMMU's internal caches retain stale
entries, a use-after-free (UAF) condition arises. If these
freed page table pages are reallocated for a different purpose, potentially
by an attacker, the IOMMU could misinterpret the new data as valid page
table entries. This allows the IOMMU to walk into attacker-controlled
memory, leading to arbitrary physical memory DMA access or privilege
escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches
and fence pending page table walks when kernel page mappings are updated.
This interface should be invoked from architecture-specific code that
manages combined user and kernel page tables.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Cc: stable@vger.kernel.org
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
arch/x86/mm/tlb.c | 2 ++
drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
include/linux/iommu.h | 4 ++++
3 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 39f80111e6f1..a41499dfdc3f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
+#include <linux/iommu.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
kernel_tlb_flush_range(info);
put_flush_tlb_info();
+ iommu_sva_invalidate_kva_range(start, end);
}
/*
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 1a51cfd82808..154384eab8a3 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -10,6 +10,8 @@
#include "iommu-priv.h"
static DEFINE_MUTEX(iommu_sva_lock);
+static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
+static LIST_HEAD(iommu_sva_mms);
static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
struct mm_struct *mm);
@@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
return ERR_PTR(-ENOSPC);
}
iommu_mm->pasid = pasid;
+ iommu_mm->mm = mm;
INIT_LIST_HEAD(&iommu_mm->sva_domains);
/*
* Make sure the write to mm->iommu_mm is not reordered in front of
@@ -132,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
if (ret)
goto out_free_domain;
domain->users = 1;
- list_add(&domain->next, &mm->iommu_mm->sva_domains);
+ if (list_empty(&iommu_mm->sva_domains)) {
+ if (list_empty(&iommu_sva_mms))
+ static_branch_enable(&iommu_sva_present);
+ list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
+ }
+ list_add(&domain->next, &iommu_mm->sva_domains);
out:
refcount_set(&handle->users, 1);
mutex_unlock(&iommu_sva_lock);
@@ -175,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
list_del(&domain->next);
iommu_domain_free(domain);
}
+
+ if (list_empty(&iommu_mm->sva_domains)) {
+ list_del(&iommu_mm->mm_list_elm);
+ if (list_empty(&iommu_sva_mms))
+ static_branch_disable(&iommu_sva_present);
+ }
+
mutex_unlock(&iommu_sva_lock);
kfree(handle);
}
@@ -312,3 +327,18 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
return domain;
}
+
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+ struct iommu_mm_data *iommu_mm;
+
+ might_sleep();
+
+ if (!static_branch_unlikely(&iommu_sva_present))
+ return;
+
+ guard(mutex)(&iommu_sva_lock);
+ list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+ mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 156732807994..31330c12b8ee 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1090,7 +1090,9 @@ struct iommu_sva {
struct iommu_mm_data {
u32 pasid;
+ struct mm_struct *mm;
struct list_head sva_domains;
+ struct list_head mm_list_elm;
};
int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
@@ -1571,6 +1573,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
struct mm_struct *mm);
void iommu_sva_unbind_device(struct iommu_sva *handle);
u32 iommu_sva_get_pasid(struct iommu_sva *handle);
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
#else
static inline struct iommu_sva *
iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
@@ -1595,6 +1598,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
}
static inline void mm_pasid_drop(struct mm_struct *mm) {}
+static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
#endif /* CONFIG_IOMMU_SVA */
#ifdef CONFIG_IOMMU_IOPF
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-04 13:30 [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush Lu Baolu
@ 2025-07-04 13:38 ` Jason Gunthorpe
2025-07-05 3:50 ` Baolu Lu
2025-07-05 9:06 ` Vasant Hegde
` (2 subsequent siblings)
3 siblings, 1 reply; 23+ messages in thread
From: Jason Gunthorpe @ 2025-07-04 13:38 UTC (permalink / raw)
To: Lu Baolu
Cc: Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian, Jann Horn,
Vasant Hegde, Dave Hansen, Alistair Popple, Peter Zijlstra,
Uladzislau Rezki, Jean-Philippe Brucker, Andy Lutomirski, iommu,
security, linux-kernel, stable
On Fri, Jul 04, 2025 at 09:30:56PM +0800, Lu Baolu wrote:
> The vmalloc() and vfree() functions manage virtually contiguous, but not
> necessarily physically contiguous, kernel memory regions. When vfree()
> unmaps such a region, it tears down the associated kernel page table
> entries and frees the physical pages.
>
> In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware
> shares and walks the CPU's page tables. Architectures like x86 share
> static kernel address mappings across all user page tables, allowing the
> IOMMU to access the kernel portion of these tables.
>
> Modern IOMMUs often cache page table entries to optimize walk performance,
> even for intermediate page table levels. If kernel page table mappings are
> changed (e.g., by vfree()), but the IOMMU's internal caches retain stale
> entries, Use-After-Free (UAF) vulnerability condition arises. If these
> freed page table pages are reallocated for a different purpose, potentially
> by an attacker, the IOMMU could misinterpret the new data as valid page
> table entries. This allows the IOMMU to walk into attacker-controlled
> memory, leading to arbitrary physical memory DMA access or privilege
> escalation.
>
> To mitigate this, introduce a new iommu interface to flush IOMMU caches
> and fence pending page table walks when kernel page mappings are updated.
> This interface should be invoked from architecture-specific code that
> manages combined user and kernel page tables.
>
> Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
> Cc: stable@vger.kernel.org
> Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
> arch/x86/mm/tlb.c | 2 ++
> drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
> include/linux/iommu.h | 4 ++++
> 3 files changed, 37 insertions(+), 1 deletion(-)
Reported-by: Jann Horn <jannh@google.com>
> @@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> kernel_tlb_flush_range(info);
>
> put_flush_tlb_info();
> + iommu_sva_invalidate_kva_range(start, end);
> }
This is far fewer call sites than I guessed!
> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
> +{
> + struct iommu_mm_data *iommu_mm;
> +
> + might_sleep();
> +
> + if (!static_branch_unlikely(&iommu_sva_present))
> + return;
> +
> + guard(mutex)(&iommu_sva_lock);
> + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
> + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
I don't think it needs to be exported if only arch code is calling it?
Looks Ok to me:
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-04 13:38 ` Jason Gunthorpe
@ 2025-07-05 3:50 ` Baolu Lu
0 siblings, 0 replies; 23+ messages in thread
From: Baolu Lu @ 2025-07-05 3:50 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: baolu.lu, Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jann Horn, Vasant Hegde, Dave Hansen, Alistair Popple,
Peter Zijlstra, Uladzislau Rezki, Jean-Philippe Brucker,
Andy Lutomirski, iommu, security, linux-kernel, stable
On 7/4/2025 9:38 PM, Jason Gunthorpe wrote:
> On Fri, Jul 04, 2025 at 09:30:56PM +0800, Lu Baolu wrote:
>> The vmalloc() and vfree() functions manage virtually contiguous, but not
>> necessarily physically contiguous, kernel memory regions. When vfree()
>> unmaps such a region, it tears down the associated kernel page table
>> entries and frees the physical pages.
>>
>> In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware
>> shares and walks the CPU's page tables. Architectures like x86 share
>> static kernel address mappings across all user page tables, allowing the
>> IOMMU to access the kernel portion of these tables.
>>
>> Modern IOMMUs often cache page table entries to optimize walk performance,
>> even for intermediate page table levels. If kernel page table mappings are
>> changed (e.g., by vfree()), but the IOMMU's internal caches retain stale
>> entries, Use-After-Free (UAF) vulnerability condition arises. If these
>> freed page table pages are reallocated for a different purpose, potentially
>> by an attacker, the IOMMU could misinterpret the new data as valid page
>> table entries. This allows the IOMMU to walk into attacker-controlled
>> memory, leading to arbitrary physical memory DMA access or privilege
>> escalation.
>>
>> To mitigate this, introduce a new iommu interface to flush IOMMU caches
>> and fence pending page table walks when kernel page mappings are updated.
>> This interface should be invoked from architecture-specific code that
>> manages combined user and kernel page tables.
>>
>> Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
>> Cc:stable@vger.kernel.org
>> Co-developed-by: Jason Gunthorpe<jgg@nvidia.com>
>> Signed-off-by: Jason Gunthorpe<jgg@nvidia.com>
>> Signed-off-by: Lu Baolu<baolu.lu@linux.intel.com>
>> ---
>> arch/x86/mm/tlb.c | 2 ++
>> drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
>> include/linux/iommu.h | 4 ++++
>> 3 files changed, 37 insertions(+), 1 deletion(-)
> Reported-by: Jann Horn<jannh@google.com>
>
>> @@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>> kernel_tlb_flush_range(info);
>>
>> put_flush_tlb_info();
>> + iommu_sva_invalidate_kva_range(start, end);
>> }
> This is much less call sites than I guessed!
>
>> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
>> +{
>> + struct iommu_mm_data *iommu_mm;
>> +
>> + might_sleep();
>> +
>> + if (!static_branch_unlikely(&iommu_sva_present))
>> + return;
>> +
>> + guard(mutex)(&iommu_sva_lock);
>> + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
>> + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
> I don't think it needs to be exported it only arch code is calling it?
Yes. Done.
>
> Looks Ok to me:
>
> Reviewed-by: Jason Gunthorpe<jgg@nvidia.com>
Thanks,
baolu
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-04 13:30 [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush Lu Baolu
2025-07-04 13:38 ` Jason Gunthorpe
@ 2025-07-05 9:06 ` Vasant Hegde
2025-07-08 5:42 ` Baolu Lu
2025-07-09 15:51 ` Jacob Pan
3 siblings, 0 replies; 23+ messages in thread
From: Vasant Hegde @ 2025-07-05 9:06 UTC (permalink / raw)
To: Lu Baolu, Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jason Gunthorpe, Jann Horn, Dave Hansen, Alistair Popple,
Peter Zijlstra, Uladzislau Rezki, Jean-Philippe Brucker,
Andy Lutomirski
Cc: iommu, security, linux-kernel, stable
On 7/4/2025 7:00 PM, Lu Baolu wrote:
> The vmalloc() and vfree() functions manage virtually contiguous, but not
> necessarily physically contiguous, kernel memory regions. When vfree()
> unmaps such a region, it tears down the associated kernel page table
> entries and frees the physical pages.
>
> In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware
> shares and walks the CPU's page tables. Architectures like x86 share
> static kernel address mappings across all user page tables, allowing the
> IOMMU to access the kernel portion of these tables.
>
> Modern IOMMUs often cache page table entries to optimize walk performance,
> even for intermediate page table levels. If kernel page table mappings are
> changed (e.g., by vfree()), but the IOMMU's internal caches retain stale
> entries, Use-After-Free (UAF) vulnerability condition arises. If these
> freed page table pages are reallocated for a different purpose, potentially
> by an attacker, the IOMMU could misinterpret the new data as valid page
> table entries. This allows the IOMMU to walk into attacker-controlled
> memory, leading to arbitrary physical memory DMA access or privilege
> escalation.
>
> To mitigate this, introduce a new iommu interface to flush IOMMU caches
> and fence pending page table walks when kernel page mappings are updated.
> This interface should be invoked from architecture-specific code that
> manages combined user and kernel page tables.
>
> Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
> Cc: stable@vger.kernel.org
> Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Thanks for getting this patch. Looks good to me.
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
-Vasant
> ---
> arch/x86/mm/tlb.c | 2 ++
> drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
> include/linux/iommu.h | 4 ++++
> 3 files changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 39f80111e6f1..a41499dfdc3f 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -12,6 +12,7 @@
> #include <linux/task_work.h>
> #include <linux/mmu_notifier.h>
> #include <linux/mmu_context.h>
> +#include <linux/iommu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/mmu_context.h>
> @@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> kernel_tlb_flush_range(info);
>
> put_flush_tlb_info();
> + iommu_sva_invalidate_kva_range(start, end);
> }
>
> /*
> diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
> index 1a51cfd82808..154384eab8a3 100644
> --- a/drivers/iommu/iommu-sva.c
> +++ b/drivers/iommu/iommu-sva.c
> @@ -10,6 +10,8 @@
> #include "iommu-priv.h"
>
> static DEFINE_MUTEX(iommu_sva_lock);
> +static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
> +static LIST_HEAD(iommu_sva_mms);
> static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
> struct mm_struct *mm);
>
> @@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
> return ERR_PTR(-ENOSPC);
> }
> iommu_mm->pasid = pasid;
> + iommu_mm->mm = mm;
> INIT_LIST_HEAD(&iommu_mm->sva_domains);
> /*
> * Make sure the write to mm->iommu_mm is not reordered in front of
> @@ -132,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
> if (ret)
> goto out_free_domain;
> domain->users = 1;
> - list_add(&domain->next, &mm->iommu_mm->sva_domains);
>
> + if (list_empty(&iommu_mm->sva_domains)) {
> + if (list_empty(&iommu_sva_mms))
> + static_branch_enable(&iommu_sva_present);
> + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
> + }
> + list_add(&domain->next, &iommu_mm->sva_domains);
> out:
> refcount_set(&handle->users, 1);
> mutex_unlock(&iommu_sva_lock);
> @@ -175,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
> list_del(&domain->next);
> iommu_domain_free(domain);
> }
> +
> + if (list_empty(&iommu_mm->sva_domains)) {
> + list_del(&iommu_mm->mm_list_elm);
> + if (list_empty(&iommu_sva_mms))
> + static_branch_disable(&iommu_sva_present);
> + }
> +
> mutex_unlock(&iommu_sva_lock);
> kfree(handle);
> }
> @@ -312,3 +327,18 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
>
> return domain;
> }
> +
> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
> +{
> + struct iommu_mm_data *iommu_mm;
> +
> + might_sleep();
> +
> + if (!static_branch_unlikely(&iommu_sva_present))
> + return;
> +
> + guard(mutex)(&iommu_sva_lock);
> + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
> + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 156732807994..31330c12b8ee 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -1090,7 +1090,9 @@ struct iommu_sva {
>
> struct iommu_mm_data {
> u32 pasid;
> + struct mm_struct *mm;
> struct list_head sva_domains;
> + struct list_head mm_list_elm;
> };
>
> int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
> @@ -1571,6 +1573,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
> struct mm_struct *mm);
> void iommu_sva_unbind_device(struct iommu_sva *handle);
> u32 iommu_sva_get_pasid(struct iommu_sva *handle);
> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
> #else
> static inline struct iommu_sva *
> iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
> @@ -1595,6 +1598,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
> }
>
> static inline void mm_pasid_drop(struct mm_struct *mm) {}
> +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
> #endif /* CONFIG_IOMMU_SVA */
>
> #ifdef CONFIG_IOMMU_IOPF
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-04 13:30 [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush Lu Baolu
2025-07-04 13:38 ` Jason Gunthorpe
2025-07-05 9:06 ` Vasant Hegde
@ 2025-07-08 5:42 ` Baolu Lu
2025-07-08 12:27 ` Jason Gunthorpe
2025-07-09 15:51 ` Jacob Pan
3 siblings, 1 reply; 23+ messages in thread
From: Baolu Lu @ 2025-07-08 5:42 UTC (permalink / raw)
To: Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jason Gunthorpe, Jann Horn, Vasant Hegde, Dave Hansen,
Alistair Popple, Peter Zijlstra, Uladzislau Rezki,
Jean-Philippe Brucker, Andy Lutomirski, Yi Lai
Cc: iommu, security, linux-kernel, stable
On 7/4/25 21:30, Lu Baolu wrote:
> The vmalloc() and vfree() functions manage virtually contiguous, but not
> necessarily physically contiguous, kernel memory regions. When vfree()
> unmaps such a region, it tears down the associated kernel page table
> entries and frees the physical pages.
>
> In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware
> shares and walks the CPU's page tables. Architectures like x86 share
> static kernel address mappings across all user page tables, allowing the
> IOMMU to access the kernel portion of these tables.
>
> Modern IOMMUs often cache page table entries to optimize walk performance,
> even for intermediate page table levels. If kernel page table mappings are
> changed (e.g., by vfree()), but the IOMMU's internal caches retain stale
> entries, Use-After-Free (UAF) vulnerability condition arises. If these
> freed page table pages are reallocated for a different purpose, potentially
> by an attacker, the IOMMU could misinterpret the new data as valid page
> table entries. This allows the IOMMU to walk into attacker-controlled
> memory, leading to arbitrary physical memory DMA access or privilege
> escalation.
>
> To mitigate this, introduce a new iommu interface to flush IOMMU caches
> and fence pending page table walks when kernel page mappings are updated.
> This interface should be invoked from architecture-specific code that
> manages combined user and kernel page tables.
>
> Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
> Cc:stable@vger.kernel.org
> Co-developed-by: Jason Gunthorpe<jgg@nvidia.com>
> Signed-off-by: Jason Gunthorpe<jgg@nvidia.com>
> Signed-off-by: Lu Baolu<baolu.lu@linux.intel.com>
> ---
> arch/x86/mm/tlb.c | 2 ++
> drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
> include/linux/iommu.h | 4 ++++
> 3 files changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 39f80111e6f1..a41499dfdc3f 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -12,6 +12,7 @@
> #include <linux/task_work.h>
> #include <linux/mmu_notifier.h>
> #include <linux/mmu_context.h>
> +#include <linux/iommu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/mmu_context.h>
> @@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> kernel_tlb_flush_range(info);
>
> put_flush_tlb_info();
> + iommu_sva_invalidate_kva_range(start, end);
> }
>
> /*
> diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
> index 1a51cfd82808..154384eab8a3 100644
> --- a/drivers/iommu/iommu-sva.c
> +++ b/drivers/iommu/iommu-sva.c
> @@ -10,6 +10,8 @@
> #include "iommu-priv.h"
>
> static DEFINE_MUTEX(iommu_sva_lock);
> +static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
> +static LIST_HEAD(iommu_sva_mms);
> static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
> struct mm_struct *mm);
>
> @@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
> return ERR_PTR(-ENOSPC);
> }
> iommu_mm->pasid = pasid;
> + iommu_mm->mm = mm;
> INIT_LIST_HEAD(&iommu_mm->sva_domains);
> /*
> * Make sure the write to mm->iommu_mm is not reordered in front of
> @@ -132,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
> if (ret)
> goto out_free_domain;
> domain->users = 1;
> - list_add(&domain->next, &mm->iommu_mm->sva_domains);
>
> + if (list_empty(&iommu_mm->sva_domains)) {
> + if (list_empty(&iommu_sva_mms))
> + static_branch_enable(&iommu_sva_present);
> + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
> + }
> + list_add(&domain->next, &iommu_mm->sva_domains);
> out:
> refcount_set(&handle->users, 1);
> mutex_unlock(&iommu_sva_lock);
> @@ -175,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
> list_del(&domain->next);
> iommu_domain_free(domain);
> }
> +
> + if (list_empty(&iommu_mm->sva_domains)) {
> + list_del(&iommu_mm->mm_list_elm);
> + if (list_empty(&iommu_sva_mms))
> + static_branch_disable(&iommu_sva_present);
> + }
> +
> mutex_unlock(&iommu_sva_lock);
> kfree(handle);
> }
> @@ -312,3 +327,18 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
>
> return domain;
> }
> +
> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
> +{
> + struct iommu_mm_data *iommu_mm;
> +
> + might_sleep();
Yi Lai <yi1.lai@intel.com> reported an issue here. This interface could
potentially be called in a non-sleepable context.
[ 4.605633] BUG: sleeping function called from invalid context at drivers/iommu/iommu-sva.c:335
[ 4.606433] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
[ 4.606975] preempt_count: 1, expected: 0
[ 4.607210] RCU nest depth: 0, expected: 0
[ 4.607467] 1 lock held by swapper/0/1:
[ 4.607773] #0: ffffffff8743b5c8 (vmap_purge_lock){+.+.}-{4:4}, at: _vm_unmap_aliases+0xcd/0x800
[ 4.608304] Preemption disabled at:
[ 4.608308] [<ffffffff81413f2a>] flush_tlb_kernel_range+0x2a/0x420
[ 4.608841] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.16.0-rc5-e864c1d7585d+ #1 PREEMPT(voluntary)
[ 4.608851] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 4.608856] Call Trace:
[ 4.608862] <TASK>
[ 4.608867] dump_stack_lvl+0x121/0x150
[ 4.608887] dump_stack+0x19/0x20
[ 4.608894] __might_resched+0x37b/0x5a0
[ 4.608910] __might_sleep+0xa3/0x170
[ 4.608919] iommu_sva_invalidate_kva_range+0x32/0x140
[ 4.608939] flush_tlb_kernel_range+0x2d1/0x420
[ 4.608951] __purge_vmap_area_lazy+0x5ae/0xc60
[ 4.608964] _vm_unmap_aliases+0x653/0x800
[ 4.608973] ? kmemdup_noprof+0x37/0x70
[ 4.608985] ? __pfx__vm_unmap_aliases+0x10/0x10
[ 4.608992] ? ret_from_fork_asm+0x1a/0x30
[ 4.609004] ? __free_frozen_pages+0x493/0x1000
[ 4.609014] ? __free_frozen_pages+0x493/0x1000
[ 4.609025] vm_unmap_aliases+0x22/0x30
[ 4.609032] change_page_attr_set_clr+0x272/0x4c0
[ 4.609046] ? __pfx_change_page_attr_set_clr+0x10/0x10
[ 4.609059] ? __this_cpu_preempt_check+0x21/0x30
[ 4.609078] ? kasan_save_track+0x18/0x40
[ 4.609099] set_memory_nx+0xbd/0x110
[ 4.609115] ? __pfx_set_memory_nx+0x10/0x10
[ 4.609128] free_init_pages+0x82/0xd0
[ 4.609137] ? __pfx_kernel_init+0x10/0x10
[ 4.609148] mem_encrypt_free_decrypted_mem+0x4e/0x70
[ 4.609173] free_initmem+0x1c/0x40
[ 4.609179] kernel_init+0x4a/0x2f0
[ 4.609190] ret_from_fork+0x38e/0x490
[ 4.609201] ? __pfx_kernel_init+0x10/0x10
[ 4.609212] ret_from_fork_asm+0x1a/0x30
[ 4.609227] </TASK>
So we might need a spinlock to protect the sva mm_struct list? An
additional change like this:
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index f6fe250d12e5..d503dd95e4e5 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -12,6 +12,7 @@
static DEFINE_MUTEX(iommu_sva_lock);
static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
static LIST_HEAD(iommu_sva_mms);
+static DEFINE_SPINLOCK(iommu_mms_lock);
static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
struct mm_struct *mm);
@@ -137,9 +138,11 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
domain->users = 1;
if (list_empty(&iommu_mm->sva_domains)) {
+ spin_lock(&iommu_mms_lock);
if (list_empty(&iommu_sva_mms))
static_branch_enable(&iommu_sva_present);
list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
+ spin_unlock(&iommu_mms_lock);
}
list_add(&domain->next, &iommu_mm->sva_domains);
out:
@@ -185,9 +188,11 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
}
if (list_empty(&iommu_mm->sva_domains)) {
+ spin_lock(&iommu_mms_lock);
list_del(&iommu_mm->mm_list_elm);
if (list_empty(&iommu_sva_mms))
static_branch_disable(&iommu_sva_present);
+ spin_unlock(&iommu_mms_lock);
}
mutex_unlock(&iommu_sva_lock);
@@ -332,12 +337,10 @@ void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
{
struct iommu_mm_data *iommu_mm;
- might_sleep();
-
if (!static_branch_unlikely(&iommu_sva_present))
return;
- guard(mutex)(&iommu_sva_lock);
+ guard(spinlock)(&iommu_mms_lock);
list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
}
--
2.43.0
> +
> + if (!static_branch_unlikely(&iommu_sva_present))
> + return;
> +
> + guard(mutex)(&iommu_sva_lock);
> + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
> + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
> +}
Thanks,
baolu
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-08 5:42 ` Baolu Lu
@ 2025-07-08 12:27 ` Jason Gunthorpe
2025-07-08 14:06 ` Jason Gunthorpe
0 siblings, 1 reply; 23+ messages in thread
From: Jason Gunthorpe @ 2025-07-08 12:27 UTC (permalink / raw)
To: Baolu Lu
Cc: Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian, Jann Horn,
Vasant Hegde, Dave Hansen, Alistair Popple, Peter Zijlstra,
Uladzislau Rezki, Jean-Philippe Brucker, Andy Lutomirski, Yi Lai,
iommu, security, linux-kernel, stable
On Tue, Jul 08, 2025 at 01:42:53PM +0800, Baolu Lu wrote:
> > +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
> > +{
> > + struct iommu_mm_data *iommu_mm;
> > +
> > + might_sleep();
>
> Yi Lai <yi1.lai@intel.com> reported an issue here. This interface could
> potentially be called in a non-sleepable context.
Oh that's really bad, the notifiers inside the iommu driver are not
required to be called in a sleepable context either and I don't really
want to change that requirement.
Can you do something about how the notifier is called to not be inside
an atomic context?
Maybe we can push the kernel page table pages onto a list and free
them from a work queue kind of like what the normal mm does?
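For illustration only, a rough, untested sketch of that deferred-free idea
(every name below is invented for the example; none of this is part of the
patch or something the thread settled on):

#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/gfp.h>
#include <linux/iommu.h>

/* Hypothetical carrier for a kernel page-table page whose free is deferred. */
struct kpt_deferred_page {
	struct llist_node node;
	struct page *page;
	unsigned long start;	/* KVA range the freed table covered */
	unsigned long end;
};

static LLIST_HEAD(kpt_free_list);

static void kpt_free_work_fn(struct work_struct *work)
{
	struct llist_node *first = llist_del_all(&kpt_free_list);
	struct kpt_deferred_page *p, *tmp;

	/* Work-queue context is sleepable, so a mutex-based IOMMU flush is fine. */
	llist_for_each_entry_safe(p, tmp, first, node) {
		iommu_sva_invalidate_kva_range(p->start, p->end);
		__free_page(p->page);
		kfree(p);
	}
}
static DECLARE_WORK(kpt_free_work, kpt_free_work_fn);

/* Called from the (possibly atomic) unmap path instead of freeing directly. */
static void kpt_defer_free(struct page *page, unsigned long start, unsigned long end)
{
	struct kpt_deferred_page *p = kmalloc(sizeof(*p), GFP_ATOMIC);

	if (!p)
		return;		/* real code would need a fallback here */

	p->page = page;
	p->start = start;
	p->end = end;
	llist_add(&p->node, &kpt_free_list);
	schedule_work(&kpt_free_work);
}

The only point of the sketch is that the actual page release, and therefore
the IOMMU invalidation, moves out of the atomic unmap path.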
Back to the shadowing idea?
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-08 12:27 ` Jason Gunthorpe
@ 2025-07-08 14:06 ` Jason Gunthorpe
2025-07-09 1:25 ` Baolu Lu
0 siblings, 1 reply; 23+ messages in thread
From: Jason Gunthorpe @ 2025-07-08 14:06 UTC (permalink / raw)
To: Baolu Lu
Cc: Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian, Jann Horn,
Vasant Hegde, Dave Hansen, Alistair Popple, Peter Zijlstra,
Uladzislau Rezki, Jean-Philippe Brucker, Andy Lutomirski, Yi Lai,
iommu, security, linux-kernel, stable
On Tue, Jul 08, 2025 at 09:27:55AM -0300, Jason Gunthorpe wrote:
> On Tue, Jul 08, 2025 at 01:42:53PM +0800, Baolu Lu wrote:
> > > +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
> > > +{
> > > + struct iommu_mm_data *iommu_mm;
> > > +
> > > + might_sleep();
> >
> > Yi Lai <yi1.lai@intel.com> reported an issue here. This interface could
> > potentially be called in a non-sleepable context.
>
> Oh thats really bad, the notifiers inside the iommu driver are not
> required to be called in a sleepable context either and I don't really
> want to change that requirement.
Actually, I have got confused here with the hmm use of notifiers.
The iommu drivers use arch_invalidate_secondary_tlbs so they are
already in atomic contexts.
So your idea to use a spinlock seems correct.
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-08 14:06 ` Jason Gunthorpe
@ 2025-07-09 1:25 ` Baolu Lu
0 siblings, 0 replies; 23+ messages in thread
From: Baolu Lu @ 2025-07-09 1:25 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian, Jann Horn,
Vasant Hegde, Dave Hansen, Alistair Popple, Peter Zijlstra,
Uladzislau Rezki, Jean-Philippe Brucker, Andy Lutomirski, Yi Lai,
iommu, security, linux-kernel, stable
On 7/8/25 22:06, Jason Gunthorpe wrote:
> On Tue, Jul 08, 2025 at 09:27:55AM -0300, Jason Gunthorpe wrote:
>> On Tue, Jul 08, 2025 at 01:42:53PM +0800, Baolu Lu wrote:
>>>> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
>>>> +{
>>>> + struct iommu_mm_data *iommu_mm;
>>>> +
>>>> + might_sleep();
>>>
>>> Yi Lai <yi1.lai@intel.com> reported an issue here. This interface could
>>> potentially be called in a non-sleepable context.
>>
>> Oh thats really bad, the notifiers inside the iommu driver are not
>> required to be called in a sleepable context either and I don't really
>> want to change that requirement.
>
> Actually, I have got confused here with the hmm use of notifiers.
>
> The iommu drivers use arch_invalidate_secondary_tlbs so they are
> already in atomic contexts.
>
> So your idea to use a spinlock seems correct.
Okay, then let me post an updated version.
Thanks,
baolu
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-04 13:30 [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush Lu Baolu
` (2 preceding siblings ...)
2025-07-08 5:42 ` Baolu Lu
@ 2025-07-09 15:51 ` Jacob Pan
2025-07-09 16:27 ` Jason Gunthorpe
3 siblings, 1 reply; 23+ messages in thread
From: Jacob Pan @ 2025-07-09 15:51 UTC (permalink / raw)
To: Lu Baolu
Cc: Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jason Gunthorpe, Jann Horn, Vasant Hegde, Dave Hansen,
Alistair Popple, Peter Zijlstra, Uladzislau Rezki,
Jean-Philippe Brucker, Andy Lutomirski, iommu, security,
linux-kernel, stable, jacob.pan
Hi BaoLu,
On Fri, 4 Jul 2025 21:30:56 +0800
Lu Baolu <baolu.lu@linux.intel.com> wrote:
> The vmalloc() and vfree() functions manage virtually contiguous, but
> not necessarily physically contiguous, kernel memory regions. When
> vfree() unmaps such a region, it tears down the associated kernel
> page table entries and frees the physical pages.
>
> In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU
> hardware shares and walks the CPU's page tables. Architectures like
> x86 share static kernel address mappings across all user page tables,
> allowing the IOMMU to access the kernel portion of these tables.
Is there a use case where an SVA user can access kernel memory in the
first place? It seems the VT-d code does not set supervisor request (SRE)
for the user PASID, and I don't see an SRE equivalent in the AMD IOMMU GCR3
table. So the PTE U/S bit will prevent kernel memory access, no?
> Modern IOMMUs often cache page table entries to optimize walk
> performance, even for intermediate page table levels. If kernel page
Just wondering whether this patch has anything specific to the "intermediate
page table" caches, since the invalidation hint is always 0 and the
intermediate TLBs are therefore always flushed.
> table mappings are changed (e.g., by vfree()), but the IOMMU's
> internal caches retain stale entries, Use-After-Free (UAF)
> vulnerability condition arises. If these freed page table pages are
> reallocated for a different purpose, potentially by an attacker, the
> IOMMU could misinterpret the new data as valid page table entries.
> This allows the IOMMU to walk into attacker-controlled memory,
> leading to arbitrary physical memory DMA access or privilege
> escalation.
>
> To mitigate this, introduce a new iommu interface to flush IOMMU
> caches and fence pending page table walks when kernel page mappings
> are updated. This interface should be invoked from
> architecture-specific code that manages combined user and kernel page
> tables.
>
> Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
> Cc: stable@vger.kernel.org
> Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
> arch/x86/mm/tlb.c | 2 ++
> drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
> include/linux/iommu.h | 4 ++++
> 3 files changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 39f80111e6f1..a41499dfdc3f 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -12,6 +12,7 @@
> #include <linux/task_work.h>
> #include <linux/mmu_notifier.h>
> #include <linux/mmu_context.h>
> +#include <linux/iommu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/mmu_context.h>
> @@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> kernel_tlb_flush_range(info);
>
> put_flush_tlb_info();
> + iommu_sva_invalidate_kva_range(start, end);
> }
>
> /*
> diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
> index 1a51cfd82808..154384eab8a3 100644
> --- a/drivers/iommu/iommu-sva.c
> +++ b/drivers/iommu/iommu-sva.c
> @@ -10,6 +10,8 @@
> #include "iommu-priv.h"
>
> static DEFINE_MUTEX(iommu_sva_lock);
> +static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
> +static LIST_HEAD(iommu_sva_mms);
> static struct iommu_domain *iommu_sva_domain_alloc(struct device
> *dev, struct mm_struct *mm);
>
> @@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
> return ERR_PTR(-ENOSPC);
> }
> iommu_mm->pasid = pasid;
> + iommu_mm->mm = mm;
> INIT_LIST_HEAD(&iommu_mm->sva_domains);
> /*
> * Make sure the write to mm->iommu_mm is not reordered in front of
> @@ -132,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
> if (ret)
> goto out_free_domain;
> domain->users = 1;
> - list_add(&domain->next, &mm->iommu_mm->sva_domains);
>
> + if (list_empty(&iommu_mm->sva_domains)) {
> + if (list_empty(&iommu_sva_mms))
> + static_branch_enable(&iommu_sva_present);
> + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
> + }
> + list_add(&domain->next, &iommu_mm->sva_domains);
> out:
> refcount_set(&handle->users, 1);
> mutex_unlock(&iommu_sva_lock);
> @@ -175,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
> list_del(&domain->next);
> iommu_domain_free(domain);
> }
> +
> + if (list_empty(&iommu_mm->sva_domains)) {
> + list_del(&iommu_mm->mm_list_elm);
> + if (list_empty(&iommu_sva_mms))
> + static_branch_disable(&iommu_sva_present);
> + }
> +
> mutex_unlock(&iommu_sva_lock);
> kfree(handle);
> }
> @@ -312,3 +327,18 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
> return domain;
> }
> +
> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
> +{
> + struct iommu_mm_data *iommu_mm;
> +
> + might_sleep();
> +
> + if (!static_branch_unlikely(&iommu_sva_present))
> + return;
> +
> + guard(mutex)(&iommu_sva_lock);
> + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
> + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 156732807994..31330c12b8ee 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -1090,7 +1090,9 @@ struct iommu_sva {
>
> struct iommu_mm_data {
> u32 pasid;
> + struct mm_struct *mm;
> struct list_head sva_domains;
> + struct list_head mm_list_elm;
> };
>
> int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
> @@ -1571,6 +1573,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
> struct mm_struct *mm);
> void iommu_sva_unbind_device(struct iommu_sva *handle);
> u32 iommu_sva_get_pasid(struct iommu_sva *handle);
> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
> #else
> static inline struct iommu_sva *
> iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
> @@ -1595,6 +1598,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
> }
>
> static inline void mm_pasid_drop(struct mm_struct *mm) {}
> +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
> #endif /* CONFIG_IOMMU_SVA */
>
> #ifdef CONFIG_IOMMU_IOPF
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-09 15:51 ` Jacob Pan
@ 2025-07-09 16:27 ` Jason Gunthorpe
2025-07-09 18:15 ` Jacob Pan
0 siblings, 1 reply; 23+ messages in thread
From: Jason Gunthorpe @ 2025-07-09 16:27 UTC (permalink / raw)
To: Jacob Pan
Cc: Lu Baolu, Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jann Horn, Vasant Hegde, Dave Hansen, Alistair Popple,
Peter Zijlstra, Uladzislau Rezki, Jean-Philippe Brucker,
Andy Lutomirski, iommu, security, linux-kernel, stable
On Wed, Jul 09, 2025 at 08:51:58AM -0700, Jacob Pan wrote:
> > In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU
> > hardware shares and walks the CPU's page tables. Architectures like
> > x86 share static kernel address mappings across all user page tables,
> > allowing the IOMMU to access the kernel portion of these tables.
> Is there a use case where a SVA user can access kernel memory in the
> first place?
No. It should be fully blocked.
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-09 16:27 ` Jason Gunthorpe
@ 2025-07-09 18:15 ` Jacob Pan
2025-07-09 18:22 ` Dave Hansen
2025-07-10 2:57 ` Baolu Lu
0 siblings, 2 replies; 23+ messages in thread
From: Jacob Pan @ 2025-07-09 18:15 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Lu Baolu, Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jann Horn, Vasant Hegde, Dave Hansen, Alistair Popple,
Peter Zijlstra, Uladzislau Rezki, Jean-Philippe Brucker,
Andy Lutomirski, iommu, security, linux-kernel, stable, jacob.pan
Hi Jason,
On Wed, 9 Jul 2025 13:27:24 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:
> On Wed, Jul 09, 2025 at 08:51:58AM -0700, Jacob Pan wrote:
> > > In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU
> > > hardware shares and walks the CPU's page tables. Architectures
> > > like x86 share static kernel address mappings across all user
> > > page tables, allowing the IOMMU to access the kernel portion of
> > > these tables.
>
> > Is there a use case where a SVA user can access kernel memory in the
> > first place?
>
> No. It should be fully blocked.
>
Then I don't understand what the "vulnerability condition" being
addressed here is. We are talking about the KVA range here.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-09 18:15 ` Jacob Pan
@ 2025-07-09 18:22 ` Dave Hansen
2025-07-09 18:44 ` Jacob Pan
2025-07-14 12:39 ` David Laight
2025-07-10 2:57 ` Baolu Lu
1 sibling, 2 replies; 23+ messages in thread
From: Dave Hansen @ 2025-07-09 18:22 UTC (permalink / raw)
To: jacob.pan, Jason Gunthorpe
Cc: Lu Baolu, Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jann Horn, Vasant Hegde, Alistair Popple, Peter Zijlstra,
Uladzislau Rezki, Jean-Philippe Brucker, Andy Lutomirski, iommu,
security, linux-kernel, stable
On 7/9/25 11:15, Jacob Pan wrote:
>>> Is there a use case where a SVA user can access kernel memory in the
>>> first place?
>> No. It should be fully blocked.
>>
> Then I don't understand what is the "vulnerability condition" being
> addressed here. We are talking about KVA range here.
SVA users can't access kernel memory, but they can compel walks of
kernel page tables, which the IOMMU caches. The trouble starts if the
kernel happens to free that page table page and the IOMMU is using the
cache after the page is freed.
That was covered in the changelog, but I guess it could be made a bit
more succinct.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-09 18:22 ` Dave Hansen
@ 2025-07-09 18:44 ` Jacob Pan
2025-07-09 18:54 ` Jason Gunthorpe
2025-07-14 12:39 ` David Laight
1 sibling, 1 reply; 23+ messages in thread
From: Jacob Pan @ 2025-07-09 18:44 UTC (permalink / raw)
To: Dave Hansen
Cc: Jason Gunthorpe, Lu Baolu, Joerg Roedel, Will Deacon,
Robin Murphy, Kevin Tian, Jann Horn, Vasant Hegde,
Alistair Popple, Peter Zijlstra, Uladzislau Rezki,
Jean-Philippe Brucker, Andy Lutomirski, iommu, security,
linux-kernel, stable, jacob.pan
Hi Dave,
On Wed, 9 Jul 2025 11:22:34 -0700
Dave Hansen <dave.hansen@intel.com> wrote:
> On 7/9/25 11:15, Jacob Pan wrote:
> >>> Is there a use case where a SVA user can access kernel memory in
> >>> the first place?
> >> No. It should be fully blocked.
> >>
> > Then I don't understand what is the "vulnerability condition" being
> > addressed here. We are talking about KVA range here.
>
> SVA users can't access kernel memory, but they can compel walks of
> kernel page tables, which the IOMMU caches. The trouble starts if the
> kernel happens to free that page table page and the IOMMU is using the
> cache after the page is freed.
>
According to the VT-d spec, section 6.2.4, first-stage (S1) IOTLB caching
includes access privilege:
"First-stage mappings:
— Each of these is a mapping from a input page number in a request to the physical page frame
to which it translates (derived from first-stage translation), along with information about
access privileges and memory typing (if applicable)."
So you are saying the IOMMU can cache user-DMA-initiated walks and cache
them with supervisor privilege? Since the SVA PASID is a user PASID, even if
the IOMMU uses the cache later on, how could it get supervisor privilege?
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-09 18:44 ` Jacob Pan
@ 2025-07-09 18:54 ` Jason Gunthorpe
0 siblings, 0 replies; 23+ messages in thread
From: Jason Gunthorpe @ 2025-07-09 18:54 UTC (permalink / raw)
To: Jacob Pan
Cc: Dave Hansen, Lu Baolu, Joerg Roedel, Will Deacon, Robin Murphy,
Kevin Tian, Jann Horn, Vasant Hegde, Alistair Popple,
Peter Zijlstra, Uladzislau Rezki, Jean-Philippe Brucker,
Andy Lutomirski, iommu, security, linux-kernel, stable
On Wed, Jul 09, 2025 at 11:44:32AM -0700, Jacob Pan wrote:
> So you are saying IOMMU can cache user DMA initiated walks and cache
> with supervisor privilige? Since the SVA PASID is a user PASID, even if
> IOMMU uses the cache later on, how could it get supervior privilege?
The walk cache (aka paging structure cache) and IOTLB cache are
different things.
The walk cache has no concept of privilege. All memory holding page
tables can be loaded into the walk cache. Meaning any table in the
radix tree is eligible to reside in the walk cache.
So we point the IOMMU at the CR3 of an mm_struct with KVAs mapped into
it and the walk cache is permitted to somehow cache every single 4k
page that comprises that radix tree.
Supervisor does not come into it at all. I had hoped the U/S bits
within the table structure itself would affect the walk cache, but it
was confirmed that they do not.
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-09 18:15 ` Jacob Pan
2025-07-09 18:22 ` Dave Hansen
@ 2025-07-10 2:57 ` Baolu Lu
2025-07-10 15:28 ` Jacob Pan
1 sibling, 1 reply; 23+ messages in thread
From: Baolu Lu @ 2025-07-10 2:57 UTC (permalink / raw)
To: jacob.pan, Jason Gunthorpe
Cc: Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian, Jann Horn,
Vasant Hegde, Dave Hansen, Alistair Popple, Peter Zijlstra,
Uladzislau Rezki, Jean-Philippe Brucker, Andy Lutomirski, iommu,
security, linux-kernel, stable
Hi Jacob,
On 7/10/25 02:15, Jacob Pan wrote:
> Hi Jason,
>
> On Wed, 9 Jul 2025 13:27:24 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
>
>> On Wed, Jul 09, 2025 at 08:51:58AM -0700, Jacob Pan wrote:
>>>> In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU
>>>> hardware shares and walks the CPU's page tables. Architectures
>>>> like x86 share static kernel address mappings across all user
>>>> page tables, allowing the IOMMU to access the kernel portion of
>>>> these tables.
>>
>>> Is there a use case where a SVA user can access kernel memory in the
>>> first place?
>>
>> No. It should be fully blocked.
>>
> Then I don't understand what is the "vulnerability condition" being
> addressed here. We are talking about KVA range here.
Let me take a real example:
A device might be mistakenly configured to access memory at IOVA
0xffffa866001d5000 (a vmalloc'd memory region) with user-mode access
permission. The corresponding page table entries for this IOVA
translation, assuming a five-level page table, would appear as follows:
PGD: Entry present with U/S bit set (1)
P4D: Entry present with U/S bit set (1)
PUD: Entry present with U/S bit set (1)
PMD: Entry present with U/S bit set (1)
PTE: Entry present with U/S bit clear (0)
When the IOMMU walks this page table, it may potentially cache all
present entries, regardless of the U/S bit's state. Upon reaching the
leaf PTE, the IOMMU performs a permission check. This involves comparing
the device's DMA access mode (in this case, user mode) against the
cumulative U/S permission derived from an AND operation across all U/S
bits in the traversed page table entries (which here results in U/S ==
0).
The IOMMU correctly blocks this DMA access because the device's
requested access (user mode) exceeds the permissions granted by the page
table (supervisor-only at the PTE level). However, the PGD, P4D, PUD,
and PMD entries that were traversed might remain cached within the
IOMMU's paging structure cache.
Now, consider a scenario where the page table leaf page is freed and
subsequently repurposed, and the U/S bit at its previous location is
modified to 1. From the IOMMU's perspective, the page table for the
aforementioned IOVA would now appear as follows:
PGD: Entry present with U/S bit set (1) [retrieved from paging cache]
P4D: Entry present with U/S bit set (1) [retrieved from paging cache]
PUD: Entry present with U/S bit set (1) [retrieved from paging cache]
PMD: Entry present with U/S bit set (1) [retrieved from paging cache]
PTE: Entry present with U/S bit set (1) [read from physical memory]
As a result, the device could then potentially access the memory at IOVA
0xffffa866001d5000 with user-mode permission, which was explicitly
disallowed.
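To restate that check in code form, here is a purely conceptual sketch
(plain C with invented structures; this is not how any particular IOMMU
implements its walker):

#include <stdbool.h>

/* One entry per level walked: PGD, P4D, PUD, PMD, PTE. */
struct walk_entry {
	bool present;
	bool user;	/* U/S bit of this level's entry */
};

/* Effective user privilege is the AND of the U/S bits of every level. */
static bool user_access_allowed(const struct walk_entry lvl[5])
{
	bool user_ok = true;

	for (int i = 0; i < 5; i++) {
		if (!lvl[i].present)
			return false;	/* translation fault */
		user_ok &= lvl[i].user;
	}
	return user_ok;
}

With the four intermediate entries coming out of the paging-structure cache
(all with the U/S bit set) and only the repurposed leaf being re-read from
memory, the AND above flips from 0 to 1, which is exactly the disallowed
access described here.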
Thanks,
baolu
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-10 2:57 ` Baolu Lu
@ 2025-07-10 15:28 ` Jacob Pan
2025-07-10 15:35 ` Jason Gunthorpe
2025-07-11 14:36 ` Dave Hansen
0 siblings, 2 replies; 23+ messages in thread
From: Jacob Pan @ 2025-07-10 15:28 UTC (permalink / raw)
To: Baolu Lu
Cc: Jason Gunthorpe, Joerg Roedel, Will Deacon, Robin Murphy,
Kevin Tian, Jann Horn, Vasant Hegde, Dave Hansen, Alistair Popple,
Peter Zijlstra, Uladzislau Rezki, Jean-Philippe Brucker,
Andy Lutomirski, iommu, security, linux-kernel, stable, jacob.pan
Hi Baolu,
On Thu, 10 Jul 2025 10:57:19 +0800
Baolu Lu <baolu.lu@linux.intel.com> wrote:
> Hi Jacob,
>
> On 7/10/25 02:15, Jacob Pan wrote:
> > Hi Jason,
> >
> > On Wed, 9 Jul 2025 13:27:24 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> >> On Wed, Jul 09, 2025 at 08:51:58AM -0700, Jacob Pan wrote:
> >>>> In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU
> >>>> hardware shares and walks the CPU's page tables. Architectures
> >>>> like x86 share static kernel address mappings across all user
> >>>> page tables, allowing the IOMMU to access the kernel portion of
> >>>> these tables.
> >>
> >>> Is there a use case where a SVA user can access kernel memory in
> >>> the first place?
> >>
> >> No. It should be fully blocked.
> >>
> > Then I don't understand what is the "vulnerability condition" being
> > addressed here. We are talking about KVA range here.
>
> Let me take a real example:
>
> A device might be mistakenly configured to access memory at IOVA
> 0xffffa866001d5000 (a vmalloc'd memory region) with user-mode access
> permission. The corresponding page table entries for this IOVA
> translation, assuming a five-level page table, would appear as
> follows:
>
> PGD: Entry present with U/S bit set (1)
> P4D: Entry present with U/S bit set (1)
> PUD: Entry present with U/S bit set (1)
> PMD: Entry present with U/S bit set (1)
> PTE: Entry present with U/S bit clear (0)
>
> When the IOMMU walks this page table, it may potentially cache all
> present entries, regardless of the U/S bit's state. Upon reaching the
> leaf PTE, the IOMMU performs a permission check. This involves
> comparing the device's DMA access mode (in this case, user mode)
> against the cumulative U/S permission derived from an AND operation
> across all U/S bits in the traversed page table entries (which here
> results in U/S == 0).
Why would the IOMMU cache all the entries if the walk is not successful?
Also, per the x86 virtual memory map, how could this example (UUUUS) happen
for SVA, i.e. with shared intermediate levels?
ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap
0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space
> The IOMMU correctly blocks this DMA access because the device's
> requested access (user mode) exceeds the permissions granted by the
> page table (supervisor-only at the PTE level). However, the PGD, P4D,
> PUD, and PMD entries that were traversed might remain cached within
> the IOMMU's paging structure cache.
>
> Now, consider a scenario where the page table leaf page is freed and
> subsequently repurposed, and the U/S bit at its previous location is
> modified to 1. From the IOMMU's perspective, the page table for the
> aforementioned IOVA would now appear as follows:
>
> PGD: Entry present with U/S bit set (1) [retrieved from paging cache]
> P4D: Entry present with U/S bit set (1) [retrieved from paging cache]
> PUD: Entry present with U/S bit set (1) [retrieved from paging cache]
> PMD: Entry present with U/S bit set (1) [retrieved from paging cache]
> PTE: Entry present with U/S bit set (1) {read from physical memory}
>
> As a result, the device could then potentially access the memory at
> IOVA 0xffffa866001d5000 with user-mode permission, which was
> explicitly disallowed.
>
> Thanks,
> baolu
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-10 15:28 ` Jacob Pan
@ 2025-07-10 15:35 ` Jason Gunthorpe
2025-07-11 14:36 ` Dave Hansen
1 sibling, 0 replies; 23+ messages in thread
From: Jason Gunthorpe @ 2025-07-10 15:35 UTC (permalink / raw)
To: Jacob Pan
Cc: Baolu Lu, Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian,
Jann Horn, Vasant Hegde, Dave Hansen, Alistair Popple,
Peter Zijlstra, Uladzislau Rezki, Jean-Philippe Brucker,
Andy Lutomirski, iommu, security, linux-kernel, stable
On Thu, Jul 10, 2025 at 08:28:08AM -0700, Jacob Pan wrote:
> why would IOMMU cache all the entries if the walk is not successful?
Sadly, because nothing in the architecture said not to...
> Also, per x86 vm map how could this example (UUUUS) happen to SVA? i.e.
> sharing intermediate levels.
>
> ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap
> 0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space
Because Linux only uses the leaf U/S bit, the interior bits are set so
that they do not override the leaf.
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-10 15:28 ` Jacob Pan
2025-07-10 15:35 ` Jason Gunthorpe
@ 2025-07-11 14:36 ` Dave Hansen
1 sibling, 0 replies; 23+ messages in thread
From: Dave Hansen @ 2025-07-11 14:36 UTC (permalink / raw)
To: jacob.pan, Baolu Lu
Cc: Jason Gunthorpe, Joerg Roedel, Will Deacon, Robin Murphy,
Kevin Tian, Jann Horn, Vasant Hegde, Alistair Popple,
Peter Zijlstra, Uladzislau Rezki, Jean-Philippe Brucker,
Andy Lutomirski, iommu, security, linux-kernel, stable
On 7/10/25 08:28, Jacob Pan wrote:
> why would IOMMU cache all the entries if the walk is not successful?
This was one of those things which the IOMMU folks could have gone
either direction on. But they generally choose to mirror the CPU
behavior when they can.
The CPU does page walks the same way. It probably requires less logic
because the caches can be filled while walking down the tree and don't
have to be evicted if the walk is ultimately unsuccessful.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-09 18:22 ` Dave Hansen
2025-07-09 18:44 ` Jacob Pan
@ 2025-07-14 12:39 ` David Laight
2025-07-14 13:19 ` Uladzislau Rezki
1 sibling, 1 reply; 23+ messages in thread
From: David Laight @ 2025-07-14 12:39 UTC (permalink / raw)
To: Dave Hansen
Cc: jacob.pan, Jason Gunthorpe, Lu Baolu, Joerg Roedel, Will Deacon,
Robin Murphy, Kevin Tian, Jann Horn, Vasant Hegde,
Alistair Popple, Peter Zijlstra, Uladzislau Rezki,
Jean-Philippe Brucker, Andy Lutomirski, iommu, security,
linux-kernel, stable
On Wed, 9 Jul 2025 11:22:34 -0700
Dave Hansen <dave.hansen@intel.com> wrote:
> On 7/9/25 11:15, Jacob Pan wrote:
> >>> Is there a use case where a SVA user can access kernel memory in the
> >>> first place?
> >> No. It should be fully blocked.
> >>
> > Then I don't understand what is the "vulnerability condition" being
> > addressed here. We are talking about KVA range here.
>
> SVA users can't access kernel memory, but they can compel walks of
> kernel page tables, which the IOMMU caches. The trouble starts if the
> kernel happens to free that page table page and the IOMMU is using the
> cache after the page is freed.
>
> That was covered in the changelog, but I guess it could be made a bit
> more succinct.
>
Is it worth just never freeing the page tables used for vmalloc() memory?
After all they are likely to be reallocated again.
That (should) only require an IOMMU invalidate for pages that are actually
used for I/O.
David
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-14 12:39 ` David Laight
@ 2025-07-14 13:19 ` Uladzislau Rezki
2025-07-14 14:50 ` Mike Rapoport
0 siblings, 1 reply; 23+ messages in thread
From: Uladzislau Rezki @ 2025-07-14 13:19 UTC (permalink / raw)
To: David Laight
Cc: Dave Hansen, jacob.pan, Jason Gunthorpe, Lu Baolu, Joerg Roedel,
Will Deacon, Robin Murphy, Kevin Tian, Jann Horn, Vasant Hegde,
Alistair Popple, Peter Zijlstra, Uladzislau Rezki,
Jean-Philippe Brucker, Andy Lutomirski, iommu, security,
linux-kernel, stable
On Mon, Jul 14, 2025 at 01:39:20PM +0100, David Laight wrote:
> On Wed, 9 Jul 2025 11:22:34 -0700
> Dave Hansen <dave.hansen@intel.com> wrote:
>
> > On 7/9/25 11:15, Jacob Pan wrote:
> > >>> Is there a use case where a SVA user can access kernel memory in the
> > >>> first place?
> > >> No. It should be fully blocked.
> > >>
> > > Then I don't understand what is the "vulnerability condition" being
> > > addressed here. We are talking about KVA range here.
> >
> > SVA users can't access kernel memory, but they can compel walks of
> > kernel page tables, which the IOMMU caches. The trouble starts if the
> > kernel happens to free that page table page and the IOMMU is using the
> > cache after the page is freed.
> >
> > That was covered in the changelog, but I guess it could be made a bit
> > more succinct.
> >
>
> Is it worth just never freeing the page tables used for vmalloc() memory?
> After all they are likely to be reallocated again.
>
>
Do we free? Maybe on some arches? According to tests I did once upon a
time (AMD x86-64), the PTE entries were not freed after vfree(). It could
be expensive if we did it, due to a global "page_table_lock" lock.

I see one place though; it is in vmap_try_huge_pud():

	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
		return 0;

It is when a pud is replaced by a huge page.
--
Uladzislau Rezki
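For reference, the path being pointed at has roughly the following shape (a paraphrased sketch, not a verbatim copy of mm/vmalloc.c; suitability checks are elided and the wrapper name is made up): when a huge PUD mapping is about to replace an existing table, the PMD page underneath is freed first, which is one of the few places where vmalloc drops a page-table page.

#include <linux/pgtable.h>

/* Sketch only; see vmap_try_huge_pud() in mm/vmalloc.c for the real code. */
static int vmap_try_huge_pud_sketch(pud_t *pud, unsigned long addr,
                                    phys_addr_t phys_addr, pgprot_t prot)
{
        /* ... suitability checks (size, alignment, arch support) elided ... */

        if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
                return 0;       /* PMD page under this PUD could not be freed */

        return pud_set_huge(pud, phys_addr, prot);      /* install huge mapping */
}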
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-14 13:19 ` Uladzislau Rezki
@ 2025-07-14 14:50 ` Mike Rapoport
2025-07-15 0:05 ` Tian, Kevin
2025-07-15 1:19 ` Baolu Lu
0 siblings, 2 replies; 23+ messages in thread
From: Mike Rapoport @ 2025-07-14 14:50 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: David Laight, Dave Hansen, jacob.pan, Jason Gunthorpe, Lu Baolu,
Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian, Jann Horn,
Vasant Hegde, Alistair Popple, Peter Zijlstra,
Jean-Philippe Brucker, Andy Lutomirski, iommu, security,
linux-kernel, stable
On Mon, Jul 14, 2025 at 03:19:17PM +0200, Uladzislau Rezki wrote:
> On Mon, Jul 14, 2025 at 01:39:20PM +0100, David Laight wrote:
> > On Wed, 9 Jul 2025 11:22:34 -0700
> > Dave Hansen <dave.hansen@intel.com> wrote:
> >
> > > On 7/9/25 11:15, Jacob Pan wrote:
> > > >>> Is there a use case where a SVA user can access kernel memory in the
> > > >>> first place?
> > > >> No. It should be fully blocked.
> > > >>
> > > > Then I don't understand what is the "vulnerability condition" being
> > > > addressed here. We are talking about KVA range here.
> > >
> > > SVA users can't access kernel memory, but they can compel walks of
> > > kernel page tables, which the IOMMU caches. The trouble starts if the
> > > kernel happens to free that page table page and the IOMMU is using the
> > > cache after the page is freed.
> > >
> > > That was covered in the changelog, but I guess it could be made a bit
> > > more succinct.
But does this really mean that every flush_tlb_kernel_range() should flush
the IOMMU page tables as well? AFAIU, set_memory flushes the TLB even when
bits in a PTE change, and it seems like overkill...
> > Is it worth just never freeing the page tables used for vmalloc() memory?
> > After all they are likely to be reallocated again.
> >
> >
> Do we free? Maybe on some arches? According to tests I did once upon a
> time (AMD x86-64), the PTE entries were not freed after vfree(). It could
> be expensive if we did it, due to a global "page_table_lock" lock.
>
> I see one place though; it is in vmap_try_huge_pud():
>
> 	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
> 		return 0;
>
> It is when a pud is replaced by a huge page.
There's also a place that replaces a pmd by a smaller huge page, but other
than that vmalloc does not free page tables.
> --
> Uladzislau Rezki
--
Sincerely yours,
Mike.
* RE: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-14 14:50 ` Mike Rapoport
@ 2025-07-15 0:05 ` Tian, Kevin
2025-07-15 1:19 ` Baolu Lu
1 sibling, 0 replies; 23+ messages in thread
From: Tian, Kevin @ 2025-07-15 0:05 UTC (permalink / raw)
To: Mike Rapoport, Uladzislau Rezki
Cc: David Laight, Hansen, Dave, jacob.pan@linux.microsoft.com,
Jason Gunthorpe, Lu Baolu, Joerg Roedel, Will Deacon,
Robin Murphy, Jann Horn, Vasant Hegde, Alistair Popple,
Peter Zijlstra, Jean-Philippe Brucker, Andy Lutomirski,
iommu@lists.linux.dev, security@kernel.org,
linux-kernel@vger.kernel.org, stable@vger.kernel.org
> From: Mike Rapoport <rppt@kernel.org>
> Sent: Monday, July 14, 2025 10:50 PM
>
> On Mon, Jul 14, 2025 at 03:19:17PM +0200, Uladzislau Rezki wrote:
> > On Mon, Jul 14, 2025 at 01:39:20PM +0100, David Laight wrote:
> > > On Wed, 9 Jul 2025 11:22:34 -0700
> > > Dave Hansen <dave.hansen@intel.com> wrote:
> > >
> > > > On 7/9/25 11:15, Jacob Pan wrote:
> > > > >>> Is there a use case where a SVA user can access kernel memory in the
> > > > >>> first place?
> > > > >> No. It should be fully blocked.
> > > > >>
> > > > > Then I don't understand what is the "vulnerability condition" being
> > > > > addressed here. We are talking about KVA range here.
> > > >
> > > > SVA users can't access kernel memory, but they can compel walks of
> > > > kernel page tables, which the IOMMU caches. The trouble starts if the
> > > > kernel happens to free that page table page and the IOMMU is using the
> > > > cache after the page is freed.
> > > >
> > > > That was covered in the changelog, but I guess it could be made a bit
> > > > more succinct.
>
> But does this really mean that every flush_tlb_kernel_range() should flush
> the IOMMU page tables as well? AFAIU, set_memory flushes the TLB even when
> bits in a PTE change, and it seems like overkill...
>
> > > Is it worth just never freeing the page tables used for vmalloc() memory?
> > > After all they are likely to be reallocated again.
> > >
> > >
> > Do we free? Maybe on some arches? According to tests I did once upon a
> > time (AMD x86-64), the PTE entries were not freed after vfree(). It could
> > be expensive if we did it, due to a global "page_table_lock" lock.
> >
> > I see one place though; it is in vmap_try_huge_pud():
> >
> > 	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
> > 		return 0;
> >
> > It is when a pud is replaced by a huge page.
>
> There's also a place that replaces a pmd by a smaller huge page, but other
> than that vmalloc does not free page tables.
>
Dave spotted two other places where page tables might be freed:
https://lore.kernel.org/all/62580eab-3e68-4132-981a-84167d130d9f@intel.com/
* Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
2025-07-14 14:50 ` Mike Rapoport
2025-07-15 0:05 ` Tian, Kevin
@ 2025-07-15 1:19 ` Baolu Lu
1 sibling, 0 replies; 23+ messages in thread
From: Baolu Lu @ 2025-07-15 1:19 UTC (permalink / raw)
To: Mike Rapoport, Uladzislau Rezki
Cc: David Laight, Dave Hansen, jacob.pan, Jason Gunthorpe,
Joerg Roedel, Will Deacon, Robin Murphy, Kevin Tian, Jann Horn,
Vasant Hegde, Alistair Popple, Peter Zijlstra,
Jean-Philippe Brucker, Andy Lutomirski, iommu, security,
linux-kernel, stable
On 7/14/25 22:50, Mike Rapoport wrote:
> On Mon, Jul 14, 2025 at 03:19:17PM +0200, Uladzislau Rezki wrote:
>> On Mon, Jul 14, 2025 at 01:39:20PM +0100, David Laight wrote:
>>> On Wed, 9 Jul 2025 11:22:34 -0700
>>> Dave Hansen<dave.hansen@intel.com> wrote:
>>>
>>>> On 7/9/25 11:15, Jacob Pan wrote:
>>>>>>> Is there a use case where a SVA user can access kernel memory in the
>>>>>>> first place?
>>>>>> No. It should be fully blocked.
>>>>>>
>>>>> Then I don't understand what is the "vulnerability condition" being
>>>>> addressed here. We are talking about KVA range here.
>>>> SVA users can't access kernel memory, but they can compel walks of
>>>> kernel page tables, which the IOMMU caches. The trouble starts if the
>>>> kernel happens to free that page table page and the IOMMU is using the
>>>> cache after the page is freed.
>>>>
>>>> That was covered in the changelog, but I guess it could be made a bit
>>>> more succinct.
> But does this really mean that every flush_tlb_kernel_range() should flush
> the IOMMU page tables as well? AFAIU, set_memory flushes the TLB even when
> bits in a PTE change, and it seems like overkill...
As far as I can see, only the next-level page table pointer in the
middle-level entry matters. SVA is not allowed to access kernel
addresses, which has been ensured by the U/S bit in the leaf PTEs, so
other bit changes don't matter here.
Thanks,
baolu
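A way to express that distinction in code (a hypothetical helper, not something in the kernel; bit positions follow the x86-64 page-table format): only updates that tear down a middle-level entry or repoint it at a different table create the stale-walk hazard, while flips of other bits would not need the extra IOMMU invalidation.

#include <stdbool.h>
#include <stdint.h>

#define ENTRY_PRESENT   (1ULL << 0)
#define ENTRY_ADDR_MASK 0x000ffffffffff000ULL   /* next-level table address */

static bool kva_update_needs_iommu_flush(uint64_t old_entry, uint64_t new_entry)
{
        /* Clearing the entry frees the table page the IOMMU may still walk. */
        if ((old_entry & ENTRY_PRESENT) && !(new_entry & ENTRY_PRESENT))
                return true;

        /* Repointing it replaces that table page. */
        return (old_entry & ENTRY_ADDR_MASK) != (new_entry & ENTRY_ADDR_MASK);
}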
Thread overview: 23+ messages
2025-07-04 13:30 [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush Lu Baolu
2025-07-04 13:38 ` Jason Gunthorpe
2025-07-05 3:50 ` Baolu Lu
2025-07-05 9:06 ` Vasant Hegde
2025-07-08 5:42 ` Baolu Lu
2025-07-08 12:27 ` Jason Gunthorpe
2025-07-08 14:06 ` Jason Gunthorpe
2025-07-09 1:25 ` Baolu Lu
2025-07-09 15:51 ` Jacob Pan
2025-07-09 16:27 ` Jason Gunthorpe
2025-07-09 18:15 ` Jacob Pan
2025-07-09 18:22 ` Dave Hansen
2025-07-09 18:44 ` Jacob Pan
2025-07-09 18:54 ` Jason Gunthorpe
2025-07-14 12:39 ` David Laight
2025-07-14 13:19 ` Uladzislau Rezki
2025-07-14 14:50 ` Mike Rapoport
2025-07-15 0:05 ` Tian, Kevin
2025-07-15 1:19 ` Baolu Lu
2025-07-10 2:57 ` Baolu Lu
2025-07-10 15:28 ` Jacob Pan
2025-07-10 15:35 ` Jason Gunthorpe
2025-07-11 14:36 ` Dave Hansen