Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Baolu Lu <baolu.lu@linux.intel.com>
To: Dave Hansen <dave.hansen@intel.com>, Jason Gunthorpe <jgg@nvidia.com>
Cc: baolu.lu@linux.intel.com, Joerg Roedel <joro@8bytes.org>,
	Will Deacon <will@kernel.org>,
	Robin Murphy <robin.murphy@arm.com>,
	Kevin Tian <kevin.tian@intel.com>, Jann Horn <jannh@google.com>,
	Vasant Hegde <vasant.hegde@amd.com>,
	Alistair Popple <apopple@nvidia.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Uladzislau Rezki <urezki@gmail.com>,
	Jean-Philippe Brucker <jean-philippe@linaro.org>,
	Andy Lutomirski <luto@kernel.org>, Yi Lai <yi1.lai@intel.com>,
	iommu@lists.linux.dev, security@kernel.org,
	linux-kernel@vger.kernel.org, stable@vger.kernel.org
Subject: Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
Date: Thu, 7 Aug 2025 22:40:39 +0800	[thread overview]
Message-ID: <4a8df0e8-bd5a-44e4-acce-46ba75594846@linux.intel.com> (raw)
In-Reply-To: <62d21545-9e75-41e3-89a3-f21dda15bf16@intel.com>

On 8/7/2025 12:34 AM, Dave Hansen wrote:
> On 8/6/25 09:09, Jason Gunthorpe wrote:
>>>>
>>>> You can't do this approach without also pushing the pages to freed on
>>>> a list and defering the free till the work. This is broadly what the
>>>> normal mm user flow is doing..
>>> FWIW, I think the simplest way to do this is to plop an unconditional
>>> schedule_work() in pte_free_kernel(). The work function will invalidate
>>> the IOTLBs and then free the page.
>>>
>>> Keep the schedule_work() unconditional to keep it simple. The
>>> schedule_work() is way cheaper than all the system-wide TLB invalidation
>>> IPIs that have to get sent as well. No need to add complexity to
>>> optimize out something that's in the noise already.
>> That works also, but now you have to allocate memory or you are
>> dead.. Is it OK these days, and safe in this code which seems a little
>> bit linked to memory management?
>>
>> The MM side avoided this by putting the list and the rcu_head in the
>> struct page.
> 
> I don't think you need to allocate memory. A little static structure
> that uses the page->list and has a lock should do. Logically something
> like this:
> 
> struct kernel_pgtable_work
> {
> 	struct list_head list;
> 	spinlock_t lock;
> 	struct work_struct work;
> } kernel_pte_work;
> 
> pte_free_kernel()
> {
> 	struct page *page = ptdesc_magic();
> 
> 	guard(spinlock)(&kernel_pte_work.lock);
> 	
> 	list_add(&page->list, &kernel_pte_work.list);
> 	schedule_work(&kernel_pte_work.work);
> }
> 
> work_func()
> {
> 	iommu_sva_invalidate_kva();
> 
> 	guard(spinlock)(&kernel_pte_work.lock);
> 
> 	list_for_each_safe() {
> 		page = container_of(...);
> 		free_whatever(page);
> 	}
> }
> 
> The only wrinkle is that pte_free_kernel() itself still has a pte and
> 'ptdesc', not a 'struct page'. But there is ptdesc->pt_list, which
> should be unused at this point, especially for non-pgd pages on x86.
> 
> So, either go over to the 'struct page' earlier (maybe by open-coding
> pagetable_dtor_free()?), or just use the ptdesc.

I refactored the code above as follows. It compiles but hasn't been
tested yet. Does it look good to you?

diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c88691b15f3c..d9307dd09f67 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -10,9 +10,11 @@

  #define __HAVE_ARCH_PTE_ALLOC_ONE
  #define __HAVE_ARCH_PGD_FREE
+#define __HAVE_ARCH_PTE_FREE_KERNEL
  #include <asm-generic/pgalloc.h>

  static inline int  __paravirt_pgd_alloc(struct mm_struct *mm) { return 
0; }
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte);

  #ifdef CONFIG_PARAVIRT_XXL
  #include <asm/paravirt.h>
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ddf248c3ee7d..f9f6738dd3cc 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
  #include <linux/mm.h>
  #include <linux/gfp.h>
  #include <linux/hugetlb.h>
+#include <linux/iommu.h>
  #include <asm/pgalloc.h>
  #include <asm/tlb.h>
  #include <asm/fixmap.h>
@@ -844,3 +845,42 @@ void arch_check_zapped_pud(struct vm_area_struct 
*vma, pud_t pud)
  	/* See note in arch_check_zapped_pte() */
  	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
  }
+
+static void kernel_pte_work_func(struct work_struct *work);
+
+static struct {
+	struct list_head list;
+	spinlock_t lock;
+	struct work_struct work;
+} kernel_pte_work = {
+	.list = LIST_HEAD_INIT(kernel_pte_work.list),
+	.lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
+	.work = __WORK_INITIALIZER(kernel_pte_work.work, kernel_pte_work_func),
+};
+
+static void kernel_pte_work_func(struct work_struct *work)
+{
+	struct page *page, *next;
+
+	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_for_each_entry_safe(page, next, &kernel_pte_work.list, lru) {
+		list_del_init(&page->lru);
+		pagetable_dtor_free(page_ptdesc(page));
+	}
+}
+
+/**
+ * pte_free_kernel - free PTE-level kernel page table memory
+ * @mm: the mm_struct of the current context
+ * @pte: pointer to the memory containing the page table
+ */
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+	struct page *page = virt_to_page(pte);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_add(&page->lru, &kernel_pte_work.list);
+	schedule_work(&kernel_pte_work.work);
+}
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..716ebab67636 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -46,6 +46,7 @@ static inline pte_t 
*pte_alloc_one_kernel_noprof(struct mm_struct *mm)
  #define pte_alloc_one_kernel(...) 
alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
  #endif

+#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
  /**
   * pte_free_kernel - free PTE-level kernel page table memory
   * @mm: the mm_struct of the current context
@@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct 
*mm, pte_t *pte)
  {
  	pagetable_dtor_free(virt_to_ptdesc(pte));
  }
+#endif

  /**
   * __pte_alloc_one - allocate memory for a PTE-level user page table
-- 
2.43.0

Thanks,
baolu

next prev parent reply	other threads:[~2025-08-07 14:40 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-06  5:25 [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush Lu Baolu
2025-08-06 15:03 ` Dave Hansen
2025-08-06 15:52   ` Jason Gunthorpe
2025-08-06 16:04     ` Dave Hansen
2025-08-06 16:09       ` Jason Gunthorpe
2025-08-06 16:34         ` Dave Hansen
2025-08-06 16:42           ` Jason Gunthorpe
2025-08-07 14:40           ` Baolu Lu [this message]
2025-08-07 15:31             ` Dave Hansen
2025-08-08  5:15               ` Baolu Lu
2025-08-10  7:19                 ` Ethan Zhao
2025-08-11  9:15                   ` Uladzislau Rezki
2025-08-11 12:55                     ` Jason Gunthorpe
2025-08-15  9:23                       ` Baolu Lu
2025-08-11 13:55                     ` Dave Hansen
2025-08-11 14:56                       ` Uladzislau Rezki
2025-08-12  1:17                       ` Ethan Zhao
2025-08-15 14:35                         ` Dave Hansen
2025-08-11 12:57                 ` Jason Gunthorpe
2025-08-13  3:17                   ` Ethan Zhao
2025-08-18  1:34                   ` Baolu Lu
2025-08-07 19:51             ` Jason Gunthorpe
2025-08-08  2:57               ` Tian, Kevin
2025-08-15  9:16                 ` Baolu Lu
2025-08-15  9:46                   ` Tian, Kevin
2025-08-18  5:58                     ` Baolu Lu
2025-08-15 14:31                   ` Dave Hansen
2025-08-18  6:08                     ` Baolu Lu
2025-08-18  6:21                 ` Baolu Lu
2025-08-21  7:05                   ` Tian, Kevin
2025-08-23  3:26                     ` Baolu Lu
2025-08-25 22:36                       ` Dave Hansen
2025-08-26  1:25                         ` Baolu Lu
2025-08-26  2:49                           ` Baolu Lu
2025-08-26 14:22                             ` Dave Hansen
2025-08-26 14:33                               ` Matthew Wilcox
2025-08-26 14:57                                 ` Dave Hansen
2025-08-27 10:58                               ` Baolu Lu
2025-08-27 23:31                                 ` Dave Hansen
2025-08-28  5:31                                   ` Baolu Lu
2025-08-28  7:08                                     ` Tian, Kevin
2025-08-28 18:56                                       ` Dave Hansen
2025-08-28 19:10                                         ` Jason Gunthorpe
2025-08-28 19:31                                           ` Dave Hansen
2025-08-28 19:39                                             ` Matthew Wilcox
2025-08-26 16:21                             ` Dave Hansen
2025-08-27  6:34                               ` Baolu Lu
2025-08-08  5:08               ` Baolu Lu
2025-08-07  6:53   ` Baolu Lu
2025-08-14  4:48 ` Ethan Zhao
2025-08-15  7:48   ` Baolu Lu

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c88691b15f3 dfblob:d9307dd09f6 dfblob:ddf248c3ee7
dfblob:f9f6738dd3c dfblob:3c8ec3bfea4 dfblob:716ebab6763 )
 OR (
bs:"Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4a8df0e8-bd5a-44e4-acce-46ba75594846@linux.intel.com \
    --to=baolu.lu@linux.intel.com \
    --cc=apopple@nvidia.com \
    --cc=dave.hansen@intel.com \
    --cc=iommu@lists.linux.dev \
    --cc=jannh@google.com \
    --cc=jean-philippe@linaro.org \
    --cc=jgg@nvidia.com \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=peterz@infradead.org \
    --cc=robin.murphy@arm.com \
    --cc=security@kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=urezki@gmail.com \
    --cc=vasant.hegde@amd.com \
    --cc=will@kernel.org \
    --cc=yi1.lai@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).