* [PATCH] arm64: optimize flush tlb kernel range
From: Kefeng Wang @ 2024-09-14 14:14 UTC
To: Catalin Marinas, Will Deacon
Cc: Ryan Roberts, linux-arm-kernel, Kefeng Wang, Yicong Yang
Currently the kernel TLB is flushed page by page if the target
VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
brutally issue a TLBI ALL.

But we can optimize this when the CPU supports TLB range operations:
convert flush_tlb_kernel_range() to use __flush_tlb_range_nosync(), like
the other TLB range flushes, to improve performance.
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
arch/arm64/mm/contpte.c | 3 ++-
2 files changed, 20 insertions(+), 26 deletions(-)
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 95fbc8c05607..8537fad83999 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -431,12 +431,12 @@ do { \
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
-static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long stride, bool last_level,
- int tlb_level)
+static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
+ unsigned long asid, unsigned long start, unsigned long end,
+ unsigned long stride, bool last_level, int tlb_level)
{
- unsigned long asid, pages;
+ bool tlbi_user = !!asid;
+ unsigned long pages;
start = round_down(start, stride);
end = round_up(end, stride);
@@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
if ((!system_supports_tlb_range() &&
(end - start) >= (MAX_DVM_OPS * stride)) ||
pages > MAX_TLBI_RANGE_PAGES) {
- flush_tlb_mm(vma->vm_mm);
+ if (asid)
+ flush_tlb_mm(mm);
+ else
+ flush_tlb_all();
return;
}
dsb(ishst);
- asid = ASID(vma->vm_mm);
if (last_level)
__flush_tlb_range_op(vale1is, start, pages, stride, asid,
- tlb_level, true, lpa2_is_enabled());
+ tlb_level, tlbi_user, lpa2_is_enabled());
else
- __flush_tlb_range_op(vae1is, start, pages, stride, asid,
- tlb_level, true, lpa2_is_enabled());
+ __flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
+ tlb_level, tlbi_user, lpa2_is_enabled());
- mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
+ if (asid)
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
static inline void __flush_tlb_range(struct vm_area_struct *vma,
@@ -473,8 +476,8 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
unsigned long stride, bool last_level,
int tlb_level)
{
- __flush_tlb_range_nosync(vma, start, end, stride,
- last_level, tlb_level);
+ __flush_tlb_range_nosync(vma->vm_mm, ASID(vma->vm_mm), start, end,
+ stride, last_level, tlb_level);
dsb(ish);
}
@@ -492,19 +495,9 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- unsigned long addr;
- if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
- flush_tlb_all();
- return;
- }
-
- start = __TLBI_VADDR(start, 0);
- end = __TLBI_VADDR(end, 0);
-
- dsb(ishst);
- for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
- __tlbi(vaale1is, addr);
+ __flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
+ TLBI_TTL_UNKNOWN);
dsb(ish);
isb();
}
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 55107d27d3f8..7f93f19dc50b 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -335,7 +335,8 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
* eliding the trailing DSB applies here.
*/
addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
- __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
+ __flush_tlb_range_nosync(vma->vm_mm, ASID(vma->vm_mm),
+ addr, addr + CONT_PTE_SIZE,
PAGE_SIZE, true, 3);
}
--
2.27.0
* Re: [PATCH] arm64: optimize flush tlb kernel range
From: Catalin Marinas @ 2024-09-16 15:08 UTC
To: Kefeng Wang; +Cc: Will Deacon, Ryan Roberts, linux-arm-kernel, Yicong Yang
On Sat, Sep 14, 2024 at 10:14:41PM +0800, Kefeng Wang wrote:
> Currently the kernel TLB is flushed page by page if the target
> VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
> brutally issue a TLBI ALL.
>
> But we can optimize this when the CPU supports TLB range operations:
> convert flush_tlb_kernel_range() to use __flush_tlb_range_nosync(), like
> the other TLB range flushes, to improve performance.
>
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Nit: you need a co-developed-by here for Yicong.
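For example (following the usual Co-developed-by convention, with the tag
immediately followed by the co-author's Signed-off-by):

Co-developed-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>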
> ---
> arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
> arch/arm64/mm/contpte.c | 3 ++-
> 2 files changed, 20 insertions(+), 26 deletions(-)
>
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index 95fbc8c05607..8537fad83999 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -431,12 +431,12 @@ do { \
> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
>
> -static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
> - unsigned long start, unsigned long end,
> - unsigned long stride, bool last_level,
> - int tlb_level)
> +static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
> + unsigned long asid, unsigned long start, unsigned long end,
> + unsigned long stride, bool last_level, int tlb_level)
> {
> - unsigned long asid, pages;
> + bool tlbi_user = !!asid;
> + unsigned long pages;
>
> start = round_down(start, stride);
> end = round_up(end, stride);
> @@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
> if ((!system_supports_tlb_range() &&
> (end - start) >= (MAX_DVM_OPS * stride)) ||
> pages > MAX_TLBI_RANGE_PAGES) {
> - flush_tlb_mm(vma->vm_mm);
> + if (asid)
> + flush_tlb_mm(mm);
> + else
> + flush_tlb_all();
> return;
> }
>
> dsb(ishst);
> - asid = ASID(vma->vm_mm);
>
> if (last_level)
> __flush_tlb_range_op(vale1is, start, pages, stride, asid,
> - tlb_level, true, lpa2_is_enabled());
> + tlb_level, tlbi_user, lpa2_is_enabled());
> else
> - __flush_tlb_range_op(vae1is, start, pages, stride, asid,
> - tlb_level, true, lpa2_is_enabled());
> + __flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
> + tlb_level, tlbi_user, lpa2_is_enabled());
>
> - mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
> + if (asid)
> + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
> }
This is not correct. The flush_tlb_kernel_range() uses the TLBI VAALE1IS
operation while the above would use VALE1IS for the kernel mapping, with
ASID 0.

I also don't like overriding the meaning of asid here to guess whether
it's a user or kernel mapping; it just complicates this function
unnecessarily.
> static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> {
> - unsigned long addr;
>
> - if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
> - flush_tlb_all();
> - return;
> - }
> -
> - start = __TLBI_VADDR(start, 0);
> - end = __TLBI_VADDR(end, 0);
> -
> - dsb(ishst);
> - for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
> - __tlbi(vaale1is, addr);
> + __flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
> + TLBI_TTL_UNKNOWN);
> dsb(ish);
> isb();
> }
Just call __flush_tlb_range_op(vaale1is, ...) directly here.
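For illustration, a minimal sketch of that direct call, reusing the
excess-range check from __flush_tlb_range_nosync() above (the argument
order follows the __flush_s2_tlb_range_op() definition quoted earlier;
the actual respin may differ):

static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	unsigned long pages;

	start = round_down(start, PAGE_SIZE);
	end = round_up(end, PAGE_SIZE);
	pages = (end - start) >> PAGE_SHIFT;

	/* Too many operations to enumerate, fall back to TLBI ALL. */
	if ((!system_supports_tlb_range() &&
	     (end - start) >= (MAX_DVM_OPS * PAGE_SIZE)) ||
	    pages > MAX_TLBI_RANGE_PAGES) {
		flush_tlb_all();
		return;
	}

	dsb(ishst);
	/* VAALE1IS matches all ASIDs, so pass asid 0 and tlbi_user false. */
	__flush_tlb_range_op(vaale1is, start, pages, PAGE_SIZE, 0,
			     TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
	dsb(ish);
	isb();
}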
--
Catalin
* Re: [PATCH] arm64: optimize flush tlb kernel range
From: Kefeng Wang @ 2024-09-18 0:57 UTC
To: Catalin Marinas; +Cc: Will Deacon, Ryan Roberts, linux-arm-kernel, Yicong Yang
On 2024/9/16 23:08, Catalin Marinas wrote:
> On Sat, Sep 14, 2024 at 10:14:41PM +0800, Kefeng Wang wrote:
>> Currently the kernel TLB is flushed page by page if the target
>> VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
>> brutally issue a TLBI ALL.
>>
>> But we can optimize this when the CPU supports TLB range operations:
>> convert flush_tlb_kernel_range() to use __flush_tlb_range_nosync(), like
>> the other TLB range flushes, to improve performance.
>>
>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>
> Nit: you need a co-developed-by here for Yicong.
OK,
>
>> ---
>> arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
>> arch/arm64/mm/contpte.c | 3 ++-
>> 2 files changed, 20 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index 95fbc8c05607..8537fad83999 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -431,12 +431,12 @@ do { \
>> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
>>
>> -static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>> - unsigned long start, unsigned long end,
>> - unsigned long stride, bool last_level,
>> - int tlb_level)
>> +static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
>> + unsigned long asid, unsigned long start, unsigned long end,
>> + unsigned long stride, bool last_level, int tlb_level)
>> {
>> - unsigned long asid, pages;
>> + bool tlbi_user = !!asid;
>> + unsigned long pages;
>>
>> start = round_down(start, stride);
>> end = round_up(end, stride);
>> @@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>> if ((!system_supports_tlb_range() &&
>> (end - start) >= (MAX_DVM_OPS * stride)) ||
>> pages > MAX_TLBI_RANGE_PAGES) {
>> - flush_tlb_mm(vma->vm_mm);
>> + if (asid)
>> + flush_tlb_mm(mm);
>> + else
>> + flush_tlb_all();
>> return;
>> }
>>
>> dsb(ishst);
>> - asid = ASID(vma->vm_mm);
>>
>> if (last_level)
>> __flush_tlb_range_op(vale1is, start, pages, stride, asid,
>> - tlb_level, true, lpa2_is_enabled());
>> + tlb_level, tlbi_user, lpa2_is_enabled());
>> else
>> - __flush_tlb_range_op(vae1is, start, pages, stride, asid,
>> - tlb_level, true, lpa2_is_enabled());
>> + __flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
>> + tlb_level, tlbi_user, lpa2_is_enabled());
>>
>> - mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>> + if (asid)
>> + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
>> }
>
> This is not correct. The flush_tlb_kernel_range() uses the TLBI VAALE1IS
> operation while the above would use VALE1IS for the kernel mapping, with
> ASID 0.
Right, I missed that during the code refactoring.
>
> I also don't like overriding the meaning of asid here to guess whether
> it's a user or kernel mapping; it just complicates this function
> unnecessarily.
>
>> static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>> {
>> - unsigned long addr;
>>
>> - if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
>> - flush_tlb_all();
>> - return;
>> - }
>> -
>> - start = __TLBI_VADDR(start, 0);
>> - end = __TLBI_VADDR(end, 0);
>> -
>> - dsb(ishst);
>> - for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
>> - __tlbi(vaale1is, addr);
>> + __flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
>> + TLBI_TTL_UNKNOWN);
>> dsb(ish);
>> isb();
>> }
>
> Just call __flush_tlb_range_op(vaale1is, ...) directly here.
Our first internal version used __flush_tlb_range_op(), but I wanted to
avoid duplicating code between flush_tlb_kernel_range() and
__flush_tlb_range_nosync(). I will switch back to using
__flush_tlb_range_op(), thanks.
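For the duplicate-code concern, one possibility (a sketch only; the helper
name below is an assumption, not something agreed in this thread) would be
to factor the excess-range check into a small shared helper and let each
caller keep its own TLBI op and fallback:

static inline bool __flush_tlb_range_limit_excess(unsigned long start,
		unsigned long end, unsigned long pages, unsigned long stride)
{
	/*
	 * Tell the caller to fall back to flush_tlb_all() or
	 * flush_tlb_mm() when the range is too large to cover with
	 * per-VA or range TLBI operations.
	 */
	if ((!system_supports_tlb_range() &&
	     (end - start) >= (MAX_DVM_OPS * stride)) ||
	    pages > MAX_TLBI_RANGE_PAGES)
		return true;

	return false;
}

flush_tlb_kernel_range() and __flush_tlb_range_nosync() could then share
the size check while issuing vaale1is and vale1is/vae1is respectively.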