* Patch [2/2] relax per-cpu TLB requirement to DTC
@ 2006-10-13 17:08 Chen, Kenneth W
2006-10-13 18:54 ` Christoph Lameter
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Chen, Kenneth W @ 2006-10-13 17:08 UTC (permalink / raw)
To: linux-ia64
Instead of pinning per-cpu TLB into a DTR, use DTC. This will free up
one TLB entry for application, or even kernel if access pattern to
per-cpu data area has high temporal locality.
Since per-cpu is mapped at the top of region 7 address, we just need to
add special case in alt_dtlb_miss. The physical address of per-cpu data
is already conveniently stored in IA64_KR(PER_CPU_DATA). Latency for
alt_dtlb_miss is not affected as we can hide all the latency. It was
measured that alt_dtlb_miss handler has 23 cycles latency before and
after the patch.
The performance effect is massive for applications that put a lot of TLB
pressure on the CPU. Workloads such as database online transaction
processing, or applications that use terabytes of memory, would benefit the most.
Measurements with an industry-standard database benchmark showed a gain
of upwards of 1.6%, while smaller workloads such as cpu and java also showed a small
improvement.
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
--- ./include/asm-ia64/kregs.h.orig 2006-10-10 19:51:20.000000000 -0700
+++ ./include/asm-ia64/kregs.h 2006-10-13 10:43:09.000000000 -0700
@@ -29,8 +29,7 @@
*/
#define IA64_TR_KERNEL 0 /* itr0, dtr0: maps kernel image (code & data) */
#define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */
-#define IA64_TR_PERCPU_DATA 1 /* dtr1: percpu data */
-#define IA64_TR_CURRENT_STACK 2 /* dtr2: maps kernel's memory- & register-stacks */
+#define IA64_TR_CURRENT_STACK 1 /* dtr1: maps kernel's memory- & register-stacks */
/* Processor status register bits: */
#define IA64_PSR_BE_BIT 1
--- ./arch/ia64/kernel/ivt.S.orig 2006-10-10 19:51:10.000000000 -0700
+++ ./arch/ia64/kernel/ivt.S 2006-10-13 10:43:09.000000000 -0700
@@ -374,6 +374,7 @@ ENTRY(alt_dtlb_miss)
movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
mov r21=cr.ipsr
mov r31=pr
+ mov r24=PERCPU_ADDR
;;
#ifdef CONFIG_DISABLE_VHPT
shr.u r22=r16,61 // get the region number into r21
@@ -386,22 +387,30 @@ ENTRY(alt_dtlb_miss)
(p8) mov r29=b0 // save b0
(p8) br.cond.dptk dtlb_fault
#endif
+ cmp.ge p10,p11=r16,r24 // access to per_cpu_data?
+ tbit.z p12,p0=r16,61 // access to region 6?
+ mov r25=PERCPU_PAGE_SHIFT << 2
+ mov r26=PERCPU_PAGE_SIZE
+ nop.m 0
+ nop.b 0
+ ;;
+(p10) mov r19=IA64_KR(PER_CPU_DATA)
+(p11) and r19=r19,r16 // clear non-ppn fields
extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field
tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on?
- shr.u r18=r16,57 // move address bit 61 to bit 4
- and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on?
;;
- andcm r18=0x10,r18 // bit 4=~address-bit(61)
+(p10) sub r19=r19,r26
+(p10) mov cr.itir=r25
cmp.ne p8,p0=r0,r23
(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field
+(p12) dep r17=-1,r17,4,1 // set ma=UC for region 6 addr
(p8) br.cond.spnt page_fault
dep r21=-1,r21,IA64_PSR_ED_BIT,1
- or r19=r19,r17 // insert PTE control bits into r19
;;
- or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
+ or r19=r19,r17 // insert PTE control bits into r19
(p6) mov cr.ipsr=r21
;;
(p7) itc.d r19 // insert the TLB entry
--- ./arch/ia64/kernel/mca_asm.S.orig 2006-10-10 19:51:10.000000000 -0700
+++ ./arch/ia64/kernel/mca_asm.S 2006-10-13 10:43:09.000000000 -0700
@@ -101,14 +101,6 @@ ia64_do_tlb_purge:
;;
srlz.d
;;
- // 2. Purge DTR for PERCPU data.
- movl r16=PERCPU_ADDR
- mov r18=PERCPU_PAGE_SHIFT<<2
- ;;
- ptr.d r16,r18
- ;;
- srlz.d
- ;;
// 3. Purge ITR for PAL code.
GET_THIS_PADDR(r2, ia64_mca_pal_base)
;;
@@ -196,22 +188,6 @@ ia64_reload_tr:
srlz.i
srlz.d
;;
- // 2. Reload DTR register for PERCPU data.
- GET_THIS_PADDR(r2, ia64_mca_per_cpu_pte)
- ;;
- movl r16=PERCPU_ADDR // vaddr
- movl r18=PERCPU_PAGE_SHIFT<<2
- ;;
- mov cr.itir=r18
- mov cr.ifa=r16
- ;;
- ld8 r18=[r2] // load per-CPU PTE
- mov r16=IA64_TR_PERCPU_DATA;
- ;;
- itr.d dtr[r16]=r18
- ;;
- srlz.d
- ;;
// 3. Reload ITR for PAL code.
GET_THIS_PADDR(r2, ia64_mca_pal_pte)
;;
--- ./arch/ia64/mm/init.c.orig 2006-10-10 19:51:10.000000000 -0700
+++ ./arch/ia64/mm/init.c 2006-10-13 10:43:09.000000000 -0700
@@ -337,7 +337,7 @@ setup_gate (void)
void __devinit
ia64_mmu_init (void *my_cpu_data)
{
- unsigned long psr, pta, impl_va_bits;
+ unsigned long pta, impl_va_bits;
extern void __devinit tlb_init (void);
#ifdef CONFIG_DISABLE_VHPT
@@ -346,15 +346,6 @@ ia64_mmu_init (void *my_cpu_data)
# define VHPT_ENABLE_BIT 1
#endif
- /* Pin mapping for percpu area into TLB */
- psr = ia64_clear_ic();
- ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
- pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
- PERCPU_PAGE_SHIFT);
-
- ia64_set_psr(psr);
- ia64_srlz_i();
-
/*
* Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
* address space. The IA-64 architecture guarantees that at least 50 bits of
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: Patch [2/2] relax per-cpu TLB requirement to DTC
2006-10-13 17:08 Patch [2/2] relax per-cpu TLB requirement to DTC Chen, Kenneth W
@ 2006-10-13 18:54 ` Christoph Lameter
2006-10-13 19:25 ` Chen, Kenneth W
2006-10-13 19:38 ` Christoph Lameter
2 siblings, 0 replies; 4+ messages in thread
From: Christoph Lameter @ 2006-10-13 18:54 UTC (permalink / raw)
To: linux-ia64
On Fri, 13 Oct 2006, Chen, Kenneth W wrote:
> - andcm r18=0x10,r18 // bit 4=~address-bit(61)
> +(p10) sub r19=r19,r26
> +(p10) mov cr.itir=r25
> cmp.ne p8,p0=r0,r23
This looks somewhat familiar. Any chance that you could merge my mods to
the alt_dtlb_miss handler? Both have to modify ITIR.
Also there may be a conflict since we both use high address bits?
^ permalink raw reply [flat|nested] 4+ messages in thread
* RE: Patch [2/2] relax per-cpu TLB requirement to DTC
2006-10-13 17:08 Patch [2/2] relax per-cpu TLB requirement to DTC Chen, Kenneth W
2006-10-13 18:54 ` Christoph Lameter
@ 2006-10-13 19:25 ` Chen, Kenneth W
2006-10-13 19:38 ` Christoph Lameter
2 siblings, 0 replies; 4+ messages in thread
From: Chen, Kenneth W @ 2006-10-13 19:25 UTC (permalink / raw)
To: linux-ia64
Christoph Lameter wrote on Friday, October 13, 2006 11:54 AM
> On Fri, 13 Oct 2006, Chen, Kenneth W wrote:
>
> > - andcm r18=0x10,r18 // bit 4=~address-bit(61)
> > +(p10) sub r19=r19,r26
> > +(p10) mov cr.itir=r25
> > cmp.ne p8,p0=r0,r23
>
> This look somewhat familiar. Any chance that you could merge my mods to
> the alt_dtlb_miss handler? Both have to modify ITIR.
I suppose so.
> Also there may be a conflict since we both use high address bits?
Yes, there will be conflict, but easily fixable. My patch keys on PERCPU_ADDR,
which should have bits 16-63 set to one. An easy condition to check before
further decoding into variable page size.
There are a few details in your patch that need polishing in the alt_dtlb_miss
handler: I don't think you would want to branch into dtlb_fault, because
there is no vhpt table in region 7. Branching to dtlb_fault will dereference
a hashed address, which will result in a guaranteed nested_dtlb_miss fault.
I also think it is dangerous to reference a hashed vhpt address for a region
7 address, as your patch will fully utilize all the virtual addresses there.
Nonetheless, this double fault should be easy to optimize away.
I'm a bit uneasy about making nested_dtlb_miss a more frequently used
function. The code that walks the page table there isn't really as optimized
as vhpt_miss handler. We either optimize that or tap into vhpt_miss handler.
- Ken
^ permalink raw reply [flat|nested] 4+ messages in thread
* RE: Patch [2/2] relax per-cpu TLB requirement to DTC
2006-10-13 17:08 Patch [2/2] relax per-cpu TLB requirement to DTC Chen, Kenneth W
2006-10-13 18:54 ` Christoph Lameter
2006-10-13 19:25 ` Chen, Kenneth W
@ 2006-10-13 19:38 ` Christoph Lameter
2 siblings, 0 replies; 4+ messages in thread
From: Christoph Lameter @ 2006-10-13 19:38 UTC (permalink / raw)
To: linux-ia64
On Fri, 13 Oct 2006, Chen, Kenneth W wrote:
> > Also there may be a conflict since we both use high address bits?
>
> Yes, there will be conflict, but easily fixable. My patch keys on PERCPU_ADDR,
> which should have bits 16-63 set to one. An easy condition to check before
> further decoding into variable page size.
Hmm...
> There are a few details that your patch needs polishing in alt_dtlb_miss
> handler: I don't think you would want to branch into dtlb_fault, because
> there are no vhpt table in region7. Branching to dtlb_fault will dereference
Right, this is only done in order to get to the nested_dtlb_miss handler.
We could simply avoid that by dereferencing the original address again.
> a hashed address which will result a guaranteed nested_dtlb_miss fault,
> And I think it is also dangerous to reference hashed vhpt address on region
> 7 address as your patch will fully utilize all the virtual address there.
> Nonetheless, this double fault should be easy to optimize away.
I am using the original address not the hashed address in order to make
sure that a nested fault happens.
>
> I'm a bit uneasy about making nested_dtlb_miss now more frequently used
> function. The code that walks the page table there isn't really as optimized
> as vhpt_miss handler. We either optimize that or tap into vhpt_miss handler.
It would be great to optimize that further. However, as you already noted
we cannot generate a hashed vhpt address without causing address
conflicts (plus we will have other problems as well since the page size
is not 16k but 16MB so the translations will be off) and therefore the
vhpt_miss handler may not work.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2006-10-13 19:38 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-10-13 17:08 Patch [2/2] relax per-cpu TLB requirement to DTC Chen, Kenneth W
2006-10-13 18:54 ` Christoph Lameter
2006-10-13 19:25 ` Chen, Kenneth W
2006-10-13 19:38 ` Christoph Lameter
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox