Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Uladzislau Rezki <urezki@gmail.com>
To: Ethan Zhao <etzhao1900@gmail.com>, Baolu Lu <baolu.lu@linux.intel.com>
Cc: Baolu Lu <baolu.lu@linux.intel.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Jason Gunthorpe <jgg@nvidia.com>, Joerg Roedel <joro@8bytes.org>,
	Will Deacon <will@kernel.org>,
	Robin Murphy <robin.murphy@arm.com>,
	Kevin Tian <kevin.tian@intel.com>, Jann Horn <jannh@google.com>,
	Vasant Hegde <vasant.hegde@amd.com>,
	Alistair Popple <apopple@nvidia.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Uladzislau Rezki <urezki@gmail.com>,
	Jean-Philippe Brucker <jean-philippe@linaro.org>,
	Andy Lutomirski <luto@kernel.org>, Yi Lai <yi1.lai@intel.com>,
	iommu@lists.linux.dev, security@kernel.org,
	linux-kernel@vger.kernel.org, stable@vger.kernel.org
Subject: Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush
Date: Mon, 11 Aug 2025 11:15:58 +0200	[thread overview]
Message-ID: <aJm0znaAqBRWqOCT@pc636> (raw)
In-Reply-To: <2611981e-3678-4619-b2ab-d9daace5a68a@gmail.com>

On Sun, Aug 10, 2025 at 03:19:58PM +0800, Ethan Zhao wrote:
> 
> 
> On 8/8/2025 1:15 PM, Baolu Lu wrote:
> > On 8/7/25 23:31, Dave Hansen wrote:
> > > > +void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> > > > +{
> > > > +    struct page *page = virt_to_page(pte);
> > > > +
> > > > +    guard(spinlock)(&kernel_pte_work.lock);
> > > > +    list_add(&page->lru, &kernel_pte_work.list);
> > > > +    schedule_work(&kernel_pte_work.work);
> > > > +}
> > > > diff --git a/include/asm-generic/pgalloc.h
> > > > b/include/asm-generic/ pgalloc.h
> > > > index 3c8ec3bfea44..716ebab67636 100644
> > > > --- a/include/asm-generic/pgalloc.h
> > > > +++ b/include/asm-generic/pgalloc.h
> > > > @@ -46,6 +46,7 @@ static inline pte_t
> > > > *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> > > >   #define pte_alloc_one_kernel(...)
> > > > alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
> > > >   #endif
> > > > 
> > > > +#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
> > > >   /**
> > > >    * pte_free_kernel - free PTE-level kernel page table memory
> > > >    * @mm: the mm_struct of the current context
> > > > @@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct
> > > > *mm, pte_t *pte)
> > > >   {
> > > >       pagetable_dtor_free(virt_to_ptdesc(pte));
> > > >   }
> > > > +#endif
> > > > 
> > > >   /**
> > > >    * __pte_alloc_one - allocate memory for a PTE-level user page table
> > > I'd much rather the arch-generic code looked like this:
> > > 
> > > #ifdef CONFIG_ASYNC_PGTABLE_FREE
> > > // code and struct here, or dump them over in some
> > > // other file and do this in a header
> > > #else
> > > static void pte_free_kernel_async(struct page *page) {}
> > > #endif
> > > 
> > > void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> > > {
> > >      struct page *page = virt_to_page(pte);
> > > 
> > >      if (IS_DEFINED(CONFIG_ASYNC_PGTABLE_FREE)) {
> > >     pte_free_kernel_async(page);
> > >      else
> > >     pagetable_dtor_free(page_ptdesc(page));
> > > }
> > > 
> > > Then in Kconfig, you end up with something like:
> > > 
> > > config ASYNC_PGTABLE_FREE
> > >     def_bool y
> > >     depends on INTEL_IOMMU_WHATEVER
> > > 
> > > That very much tells much more of the whole story in code. It also gives
> > > the x86 folks that compile out the IOMMU the exact same code as the
> > > arch-generic folks. It_also_ makes it dirt simple and obvious for the
> > > x86 folks to optimize out the async behavior if they don't like it in
> > > the future by replacing the compile-time IOMMU check with a runtime one.
> > > 
> > > Also, if another crazy IOMMU implementation comes along that happens to
> > > do what the x86 IOMMUs do, then they have a single Kconfig switch to
> > > flip. If they follow what this patch tries to do, they'll start by
> > > copying and pasting the x86 implementation.
> > 
> > I'll do it like this.  Does that look good to you?
> > 
> > diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> > index 70d29b14d851..6f1113e024fa 100644
> > --- a/drivers/iommu/Kconfig
> > +++ b/drivers/iommu/Kconfig
> > @@ -160,6 +160,7 @@ config IOMMU_DMA
> >   # Shared Virtual Addressing
> >   config IOMMU_SVA
> >       select IOMMU_MM_DATA
> > +    select ASYNC_PGTABLE_FREE if X86
> >       bool
> > 
> >   config IOMMU_IOPF
> > diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
> > index 3c8ec3bfea44..dbddacdca2ce 100644
> > --- a/include/asm-generic/pgalloc.h
> > +++ b/include/asm-generic/pgalloc.h
> > @@ -46,6 +46,19 @@ static inline pte_t
> > *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> >   #define pte_alloc_one_kernel(...)
> > alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
> >   #endif
> > 
> > +#ifdef CONFIG_ASYNC_PGTABLE_FREE
> > +struct pgtable_free_work {
> > +    struct list_head list;
> > +    spinlock_t lock;
> > +    struct work_struct work;
> > +};
> > +extern struct pgtable_free_work kernel_pte_work;
> > +
> > +void pte_free_kernel_async(struct ptdesc *ptdesc);
> > +#else
> > +static inline void pte_free_kernel_async(struct ptdesc *ptdesc) {}
> > +#endif
> > +
> >   /**
> >    * pte_free_kernel - free PTE-level kernel page table memory
> >    * @mm: the mm_struct of the current context
> > @@ -53,7 +66,12 @@ static inline pte_t
> > *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> >    */
> >   static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> >   {
> > -    pagetable_dtor_free(virt_to_ptdesc(pte));
> > +    struct ptdesc *ptdesc = virt_to_ptdesc(pte);
> > +
> > +    if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
> > +        pte_free_kernel_async(ptdesc);
> > +    else
> > +        pagetable_dtor_free(ptdesc);
> >   }
> > 
> >   /**
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index e443fe8cd6cf..528550cfa7fe 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -1346,6 +1346,13 @@ config LOCK_MM_AND_FIND_VMA
> >   config IOMMU_MM_DATA
> >       bool
> > 
> > +config ASYNC_PGTABLE_FREE
> > +    bool "Asynchronous kernel page table freeing"
> > +    help
> > +      Perform kernel page table freeing asynchronously. This is required
> > +      for systems with IOMMU Shared Virtual Address (SVA) to flush IOTLB
> > +      paging structure caches.
> > +
> >   config EXECMEM
> >       bool
> > 
> > diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> > index 567e2d084071..6639ee6641d4 100644
> > --- a/mm/pgtable-generic.c
> > +++ b/mm/pgtable-generic.c
> > @@ -13,6 +13,7 @@
> >   #include <linux/swap.h>
> >   #include <linux/swapops.h>
> >   #include <linux/mm_inline.h>
> > +#include <linux/iommu.h>
> >   #include <asm/pgalloc.h>
> >   #include <asm/tlb.h>
> > 
> > @@ -406,3 +407,32 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm,
> > pmd_t *pmd,
> >       pte_unmap_unlock(pte, ptl);
> >       goto again;
> >   }
> > +
> > +#ifdef CONFIG_ASYNC_PGTABLE_FREE
> > +static void kernel_pte_work_func(struct work_struct *work);
> > +struct pgtable_free_work kernel_pte_work = {
> > +    .list = LIST_HEAD_INIT(kernel_pte_work.list),
> > +    .lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
> > +    .work = __WORK_INITIALIZER(kernel_pte_work.work,
> > kernel_pte_work_func),
> > +};
> > +
> > +static void kernel_pte_work_func(struct work_struct *work)
> > +{
> > +    struct ptdesc *ptdesc, *next;
> > +
> > +    iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
> > +
> > +    guard(spinlock)(&kernel_pte_work.lock);
> > +    list_for_each_entry_safe(ptdesc, next, &kernel_pte_work.list,
> > pt_list) {
> > +        list_del_init(&ptdesc->pt_list);
> > +        pagetable_dtor_free(ptdesc);
> > +    }
> > +}
> > +
> > +void pte_free_kernel_async(struct ptdesc *ptdesc)
> > +{
> > +    guard(spinlock)(&kernel_pte_work.lock);
> > +    list_add(&ptdesc->pt_list, &kernel_pte_work.list);
> > +    schedule_work(&kernel_pte_work.work);
> > +}
> kernel_pte_work.list is global shared var, it would make the producer
> pte_free_kernel() and the consumer kernel_pte_work_func() to operate in
> serialized timing. In a large system, I don't think you design this
> deliberately :)
>
Sorry for jumping in.

Agreed, unless this is never considered a hot path or something that can
be really contended. It looks like you could just use a per-CPU llist to drain
things.

For reference, you can have a look at how vfree_atomic() handles deferred
freeing.

Thanks!

--
Uladzislau Rezki

next prev parent reply	other threads:[~2025-08-11  9:16 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-06  5:25 [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush Lu Baolu
2025-08-06 15:03 ` Dave Hansen
2025-08-06 15:52   ` Jason Gunthorpe
2025-08-06 16:04     ` Dave Hansen
2025-08-06 16:09       ` Jason Gunthorpe
2025-08-06 16:34         ` Dave Hansen
2025-08-06 16:42           ` Jason Gunthorpe
2025-08-07 14:40           ` Baolu Lu
2025-08-07 15:31             ` Dave Hansen
2025-08-08  5:15               ` Baolu Lu
2025-08-10  7:19                 ` Ethan Zhao
2025-08-11  9:15                   ` Uladzislau Rezki [this message]
2025-08-11 12:55                     ` Jason Gunthorpe
2025-08-15  9:23                       ` Baolu Lu
2025-08-11 13:55                     ` Dave Hansen
2025-08-11 14:56                       ` Uladzislau Rezki
2025-08-12  1:17                       ` Ethan Zhao
2025-08-15 14:35                         ` Dave Hansen
2025-08-11 12:57                 ` Jason Gunthorpe
2025-08-13  3:17                   ` Ethan Zhao
2025-08-18  1:34                   ` Baolu Lu
2025-08-07 19:51             ` Jason Gunthorpe
2025-08-08  2:57               ` Tian, Kevin
2025-08-15  9:16                 ` Baolu Lu
2025-08-15  9:46                   ` Tian, Kevin
2025-08-18  5:58                     ` Baolu Lu
2025-08-15 14:31                   ` Dave Hansen
2025-08-18  6:08                     ` Baolu Lu
2025-08-18  6:21                 ` Baolu Lu
2025-08-21  7:05                   ` Tian, Kevin
2025-08-23  3:26                     ` Baolu Lu
2025-08-25 22:36                       ` Dave Hansen
2025-08-26  1:25                         ` Baolu Lu
2025-08-26  2:49                           ` Baolu Lu
2025-08-26 14:22                             ` Dave Hansen
2025-08-26 14:33                               ` Matthew Wilcox
2025-08-26 14:57                                 ` Dave Hansen
2025-08-27 10:58                               ` Baolu Lu
2025-08-27 23:31                                 ` Dave Hansen
2025-08-28  5:31                                   ` Baolu Lu
2025-08-28  7:08                                     ` Tian, Kevin
2025-08-28 18:56                                       ` Dave Hansen
2025-08-28 19:10                                         ` Jason Gunthorpe
2025-08-28 19:31                                           ` Dave Hansen
2025-08-28 19:39                                             ` Matthew Wilcox
2025-08-26 16:21                             ` Dave Hansen
2025-08-27  6:34                               ` Baolu Lu
2025-08-08  5:08               ` Baolu Lu
2025-08-07  6:53   ` Baolu Lu
2025-08-14  4:48 ` Ethan Zhao
2025-08-15  7:48   ` Baolu Lu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aJm0znaAqBRWqOCT@pc636 \
    --to=urezki@gmail.com \
    --cc=apopple@nvidia.com \
    --cc=baolu.lu@linux.intel.com \
    --cc=dave.hansen@intel.com \
    --cc=etzhao1900@gmail.com \
    --cc=iommu@lists.linux.dev \
    --cc=jannh@google.com \
    --cc=jean-philippe@linaro.org \
    --cc=jgg@nvidia.com \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=peterz@infradead.org \
    --cc=robin.murphy@arm.com \
    --cc=security@kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=vasant.hegde@amd.com \
    --cc=will@kernel.org \
    --cc=yi1.lai@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.