* [PATCH] arm64: optimize flush tlb kernel range
From: Kefeng Wang @ 2024-09-14 14:14 UTC
To: Catalin Marinas, Will Deacon
Cc: Ryan Roberts, linux-arm-kernel, Kefeng Wang, Yicong Yang
Currently the kernel TLB is flushed page by page if the target
VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
brutally issue a TLBI ALL.

But we can optimize this when the CPU supports TLB range operations:
convert flush_tlb_kernel_range() to use __flush_tlb_range_nosync(), like
the other TLB range flushes, to improve performance.
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
arch/arm64/mm/contpte.c | 3 ++-
2 files changed, 20 insertions(+), 26 deletions(-)
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 95fbc8c05607..8537fad83999 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -431,12 +431,12 @@ do { \
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
-static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long stride, bool last_level,
- int tlb_level)
+static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
+ unsigned long asid, unsigned long start, unsigned long end,
+ unsigned long stride, bool last_level, int tlb_level)
{
- unsigned long asid, pages;
+ bool tlbi_user = !!asid;
+ unsigned long pages;
start = round_down(start, stride);
end = round_up(end, stride);
@@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
if ((!system_supports_tlb_range() &&
(end - start) >= (MAX_DVM_OPS * stride)) ||
pages > MAX_TLBI_RANGE_PAGES) {
- flush_tlb_mm(vma->vm_mm);
+ if (asid)
+ flush_tlb_mm(mm);
+ else
+ flush_tlb_all();
return;
}
dsb(ishst);
- asid = ASID(vma->vm_mm);
if (last_level)
__flush_tlb_range_op(vale1is, start, pages, stride, asid,
- tlb_level, true, lpa2_is_enabled());
+ tlb_level, tlbi_user, lpa2_is_enabled());
else
- __flush_tlb_range_op(vae1is, start, pages, stride, asid,
- tlb_level, true, lpa2_is_enabled());
+ __flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
+ tlb_level, tlbi_user, lpa2_is_enabled());
- mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
+ if (asid)
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
static inline void __flush_tlb_range(struct vm_area_struct *vma,
@@ -473,8 +476,8 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
unsigned long stride, bool last_level,
int tlb_level)
{
- __flush_tlb_range_nosync(vma, start, end, stride,
- last_level, tlb_level);
+ __flush_tlb_range_nosync(vma->vm_mm, ASID(vma->vm_mm), start, end,
+ stride, last_level, tlb_level);
dsb(ish);
}
@@ -492,19 +495,9 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- unsigned long addr;
- if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
- flush_tlb_all();
- return;
- }
-
- start = __TLBI_VADDR(start, 0);
- end = __TLBI_VADDR(end, 0);
-
- dsb(ishst);
- for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
- __tlbi(vaale1is, addr);
+ __flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
+ TLBI_TTL_UNKNOWN);
dsb(ish);
isb();
}
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 55107d27d3f8..7f93f19dc50b 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -335,7 +335,8 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
* eliding the trailing DSB applies here.
*/
addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
- __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
+ __flush_tlb_range_nosync(vma->vm_mm, ASID(vma->vm_mm),
+ addr, addr + CONT_PTE_SIZE,
PAGE_SIZE, true, 3);
}
--
2.27.0
* Re: [PATCH] arm64: optimize flush tlb kernel range
From: Catalin Marinas @ 2024-09-16 15:08 UTC
To: Kefeng Wang; +Cc: Will Deacon, Ryan Roberts, linux-arm-kernel, Yicong Yang
On Sat, Sep 14, 2024 at 10:14:41PM +0800, Kefeng Wang wrote:
> Currently the kernel TLB is flushed page by page if the target
> VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
> brutally issue a TLBI ALL.
>
> But we can optimize this when the CPU supports TLB range operations:
> convert flush_tlb_kernel_range() to use __flush_tlb_range_nosync(), like
> the other TLB range flushes, to improve performance.
>
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Nit: you need a co-developed-by here for Yicong.
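For example (following the usual Co-developed-by convention, with the tag
immediately followed by the co-author's Signed-off-by):

Co-developed-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>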
> ---
> arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
> arch/arm64/mm/contpte.c | 3 ++-
> 2 files changed, 20 insertions(+), 26 deletions(-)
>
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index 95fbc8c05607..8537fad83999 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -431,12 +431,12 @@ do { \
> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
>
> -static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
> - unsigned long start, unsigned long end,
> - unsigned long stride, bool last_level,
> - int tlb_level)
> +static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
> + unsigned long asid, unsigned long start, unsigned long end,
> + unsigned long stride, bool last_level, int tlb_level)
> {
> - unsigned long asid, pages;
> + bool tlbi_user = !!asid;
> + unsigned long pages;
>
> start = round_down(start, stride);
> end = round_up(end, stride);
> @@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
> if ((!system_supports_tlb_range() &&
> (end - start) >= (MAX_DVM_OPS * stride)) ||
> pages > MAX_TLBI_RANGE_PAGES) {
> - flush_tlb_mm(vma->vm_mm);
> + if (asid)
> + flush_tlb_mm(mm);
> + else
> + flush_tlb_all();
> return;
> }
>
> dsb(ishst);
> - asid = ASID(vma->vm_mm);
>
> if (last_level)
> __flush_tlb_range_op(vale1is, start, pages, stride, asid,
> - tlb_level, true, lpa2_is_enabled());
> + tlb_level, tlbi_user, lpa2_is_enabled());
> else
> - __flush_tlb_range_op(vae1is, start, pages, stride, asid,
> - tlb_level, true, lpa2_is_enabled());
> + __flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
> + tlb_level, tlbi_user, lpa2_is_enabled());
>
> - mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
> + if (asid)
> + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
> }
This is not correct. The flush_tlb_kernel_range() uses the TLBI VAALE1IS
operation while the above would use VALE1IS for the kernel mapping, with
ASID 0.

I also don't like overriding the meaning of asid here to guess whether
it's a user or kernel mapping; it just complicates this function
unnecessarily.
> static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> {
> - unsigned long addr;
>
> - if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
> - flush_tlb_all();
> - return;
> - }
> -
> - start = __TLBI_VADDR(start, 0);
> - end = __TLBI_VADDR(end, 0);
> -
> - dsb(ishst);
> - for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
> - __tlbi(vaale1is, addr);
> + __flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
> + TLBI_TTL_UNKNOWN);
> dsb(ish);
> isb();
> }
Just call __flush_tlb_range_op(vaale1is, ...) directly here.
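For illustration, a minimal sketch of that direct call, reusing the
excess-range check from __flush_tlb_range_nosync() above (the argument
order follows the __flush_s2_tlb_range_op() definition quoted earlier;
the actual respin may differ):

static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	unsigned long pages;

	start = round_down(start, PAGE_SIZE);
	end = round_up(end, PAGE_SIZE);
	pages = (end - start) >> PAGE_SHIFT;

	/* Too many operations to enumerate, fall back to TLBI ALL. */
	if ((!system_supports_tlb_range() &&
	     (end - start) >= (MAX_DVM_OPS * PAGE_SIZE)) ||
	    pages > MAX_TLBI_RANGE_PAGES) {
		flush_tlb_all();
		return;
	}

	dsb(ishst);
	/* VAALE1IS matches all ASIDs, so pass asid 0 and tlbi_user false. */
	__flush_tlb_range_op(vaale1is, start, pages, PAGE_SIZE, 0,
			     TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
	dsb(ish);
	isb();
}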
--
Catalin
* Re: [PATCH] arm64: optimize flush tlb kernel range
From: Kefeng Wang @ 2024-09-18 0:57 UTC
To: Catalin Marinas; +Cc: Will Deacon, Ryan Roberts, linux-arm-kernel, Yicong Yang
On 2024/9/16 23:08, Catalin Marinas wrote:
> On Sat, Sep 14, 2024 at 10:14:41PM +0800, Kefeng Wang wrote:
>> Currently the kernel TLB is flushed page by page if the target
>> VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
>> brutally issue a TLBI ALL.
>>
>> But we can optimize this when the CPU supports TLB range operations:
>> convert flush_tlb_kernel_range() to use __flush_tlb_range_nosync(), like
>> the other TLB range flushes, to improve performance.
>>
>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>
> Nit: you need a co-developed-by here for Yicong.
OK,
>
>> ---
>> arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
>> arch/arm64/mm/contpte.c | 3 ++-
>> 2 files changed, 20 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index 95fbc8c05607..8537fad83999 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -431,12 +431,12 @@ do { \
>> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
>>
>> -static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>> - unsigned long start, unsigned long end,
>> - unsigned long stride, bool last_level,
>> - int tlb_level)
>> +static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
>> + unsigned long asid, unsigned long start, unsigned long end,
>> + unsigned long stride, bool last_level, int tlb_level)
>> {
>> - unsigned long asid, pages;
>> + bool tlbi_user = !!asid;
>> + unsigned long pages;
>>
>> start = round_down(start, stride);
>> end = round_up(end, stride);
>> @@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>> if ((!system_supports_tlb_range() &&
>> (end - start) >= (MAX_DVM_OPS * stride)) ||
>> pages > MAX_TLBI_RANGE_PAGES) {
>> - flush_tlb_mm(vma->vm_mm);
>> + if (asid)
>> + flush_tlb_mm(mm);
>> + else
>> + flush_tlb_all();
>> return;
>> }
>>
>> dsb(ishst);
>> - asid = ASID(vma->vm_mm);
>>
>> if (last_level)
>> __flush_tlb_range_op(vale1is, start, pages, stride, asid,
>> - tlb_level, true, lpa2_is_enabled());
>> + tlb_level, tlbi_user, lpa2_is_enabled());
>> else
>> - __flush_tlb_range_op(vae1is, start, pages, stride, asid,
>> - tlb_level, true, lpa2_is_enabled());
>> + __flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
>> + tlb_level, tlbi_user, lpa2_is_enabled());
>>
>> - mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>> + if (asid)
>> + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
>> }
>
> This is not correct. The flush_tlb_kernel_range() uses the TLBI VAALE1IS
> operation while the above would use VALE1IS for the kernel mapping, with
> ASID 0.
Right, I missed that during the code refactoring.
>
> I also don't like overriding the meaning of asid here to guess whether
> it's a user or kernel mapping; it just complicates this function
> unnecessarily.
>
>> static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>> {
>> - unsigned long addr;
>>
>> - if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
>> - flush_tlb_all();
>> - return;
>> - }
>> -
>> - start = __TLBI_VADDR(start, 0);
>> - end = __TLBI_VADDR(end, 0);
>> -
>> - dsb(ishst);
>> - for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
>> - __tlbi(vaale1is, addr);
>> + __flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
>> + TLBI_TTL_UNKNOWN);
>> dsb(ish);
>> isb();
>> }
>
> Just call __flush_tlb_range_op(vaale1is, ...) directly here.
Our first internal version used __flush_tlb_range_op(), but I wanted to
avoid duplicating code between flush_tlb_kernel_range() and
__flush_tlb_range_nosync(). I will switch back to using
__flush_tlb_range_op(), thanks.
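For the duplicate-code concern, one possibility (a sketch only; the helper
name below is an assumption, not something agreed in this thread) would be
to factor the excess-range check into a small shared helper and let each
caller keep its own TLBI op and fallback:

static inline bool __flush_tlb_range_limit_excess(unsigned long start,
		unsigned long end, unsigned long pages, unsigned long stride)
{
	/*
	 * Tell the caller to fall back to flush_tlb_all() or
	 * flush_tlb_mm() when the range is too large to cover with
	 * per-VA or range TLBI operations.
	 */
	if ((!system_supports_tlb_range() &&
	     (end - start) >= (MAX_DVM_OPS * stride)) ||
	    pages > MAX_TLBI_RANGE_PAGES)
		return true;

	return false;
}

flush_tlb_kernel_range() and __flush_tlb_range_nosync() could then share
the size check while issuing vaale1is and vale1is/vae1is respectively.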