* TLB preloading on 8xx
From: Marcelo Tosatti @ 2005-12-20 17:37 UTC
  To: linux-ppc-embedded



Hi, 

I've been playing with TLB preloading on the 8xx for the past few
weeks, and I must say the results are frustrating.

Most of the TLB setup work involves writing to special-purpose
registers with "mtspr", which is a serializing instruction (it blocks
all execution units).

Add in the costs of disabling interrupts and disabling translation, and
you end up with a slow dog. Damn, the TLB miss exceptions are indeed
efficient.
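
Something like the following (untested; mftbl_now() is just a helper I
made up for illustration, and the arguments are the ones tlb_preload()
below passes) can be used to eyeball the cost of the mtspr-heavy
sequence directly from the timebase:

	/* Read the lower timebase register; returns ticks, not ns. */
	static inline unsigned long mftbl_now(void)
	{
		unsigned long tbl;

		asm volatile("mftbl %0" : "=r" (tbl));
		return tbl;
	}

	unsigned long t0, cost;

	t0 = mftbl_now();
	tlb_data_load(mm->context, address, ptep, pmd_val(*pmd), tmp);
	cost = mftbl_now() - t0;	/* ticks spent in the preload */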

The test used to measure pagefault latency is LMbench's "lat_pagefault".

vanilla:
[root@CAS /]# ./lat_pagefault -N10 out.prof                                     
Pagefaults on out.prof: 36.3728 microseconds               

d-tlb-preload:
[root@CAS /]# ./lat_pagefault -N10 out.prof                                     
Pagefaults on out.prof: 43.7793 microseconds                                   
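
That is about 7.4 microseconds (roughly 20%) worse per fault than
vanilla. The work-in-progress patch follows: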


diff -Nur --exclude-from=linux-2.6-git-dec01/Documentation/dontdiff linux-2.6-git-dec01.orig/arch/ppc/kernel/head_8xx.S linux-2.6-git-dec01/arch/ppc/kernel/head_8xx.S
--- linux-2.6-git-dec01.orig/arch/ppc/kernel/head_8xx.S	2005-12-05 09:47:27.000000000 -0600
+++ linux-2.6-git-dec01/arch/ppc/kernel/head_8xx.S	2005-12-15 12:37:07.449818656 -0600
@@ -804,7 +828,156 @@
 	SYNC
 	blr
 
+
+_GLOBAL(__tlb_data_load)
+	rlwinm	r8, r4, 0, 0, 19	/* extract page address */
+	ori	r8, r8, MD_EVALID	/* set valid bit */
+	rlwimi	r8, r3, 0, 28, 31	/* insert ASID (low 4 bits of r3) */
+#ifdef CONFIG_8xx_CPU6
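+	/* CPU6 errata workaround: write the mtspr "code" for the
+	 * target SPR to a scratch word and read it back before the
+	 * mtspr itself (the same pattern repeats below).
+	 */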
+	li	r9, 0x3780;
+	stw	r9, 4(r7);
+	lwz	r9, 4(r7);
+#endif
+	mtspr	SPRN_MD_EPN, r8
+
+	mfspr	r10, SPRN_M_TWB	/* Get level 1 table entry address */
+	lwz	r11, 0(r10)	/* Get the level 1 entry */
+	ori	r11,r11,1	/* Set valid bit */
+
+	/* Insert the Guarded flag into the TWC from the Linux PTE.
+	 * It is bit 27 of both the Linux PTE and the TWC (at least
+	 * I got that right :-).  It will be better when we can put
+	 * this into the Linux pgd/pmd and load it in the operation
+	 * above.
+	 */
+	mr	r12, r5
+	rlwimi r11, r12, 0, 27, 27
+
+	/* 
+	 * Some fields of MD_TWC are cleared by the CPU on a DTLB miss.
+	 * Must do it manually for TLB preload.
+	 * clear 23-26 (access protection group)
+	 * clear 28-29 (page size) and 30 (write-through)
+	 */
+	li	r12, 0
+	rlwimi r11, r12, 0, 23, 26
+	rlwimi r11, r12, 0, 28, 30
+#ifdef CONFIG_8xx_CPU6
+	li	r9, 0x3b80;
+	stw	r9, 4(r7);
+	lwz	r9, 4(r7);
+#endif
+	mtspr	SPRN_MD_TWC, r11		/* Set segment attributes */
+
+	mr	r8, r5
+	mr	r11, r8
+	rlwinm	r8, r8, 0, 0, 20
+	ori	r8, r8, 1			/* set valid bit */
+	/* Update 'changed', among others. */
+	andi.	r11, r11, _PAGE_RW
+	li	r11, 0x00f0
+	beq	1f
+	ori	r8, r8, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
+//	stw	r8, 0(r5)		/* and update pte in table */
+	ori	r11, r11, _PAGE_HWWRITE
+	/* The Linux PTE won't go exactly into the MMU TLB.
+	 * Software indicator bits 21, 22 and 28 must be clear.
+	 * Software indicator bits 24, 25, 26, and 27 must be
+	 * set.  All other Linux PTE bits control the behavior
+	 * of the MMU.
+	 */
+1:
+	rlwimi	r8, r11, 0, 23, 28	/* Set 24-27, clear 28 */
+					/* 23 is set if page is _PAGE_RW */
+#ifdef CONFIG_8xx_CPU6
+	li	r9, 0x3d80;
+	stw	r9, 4(r7);
+	lwz	r9, 4(r7);
+#endif
+	mtspr	SPRN_MD_RPN, r8		/* Update TLB entry */
+
+	mfmsr	r11
+	lwz	r6, 0(r7)	/* restore Link Register */
+	mtlr	r6
+	li 	r6, 0x7fff
+	rlwimi	r11, r6, 0, 27, 27	/* set DR */
+	mtmsr	r11
+	tovirt(r7, r7)
+	blr
+
+/*
+ * Load a D-TLB entry.
+ * r3: context number
+ * r4: effective address
+ * r5: PTE pointer
+ * r6: PMD (level-1 entry)
+ * r7: temp location 
+ */
+_GLOBAL(tlb_data_load)
+	lwz	r5, 0(r5)
+	mflr	r6
+	stw	r6, 0(r7)	/* save Link Register */
+	mfmsr	r11
+	li 	r6, 0
+	rlwimi	r11, r6, 0, 27, 27	/* clear DR (data translat.)*/
+	mtmsr	r11
+	lis	r6, __tlb_data_load@h
+	ori	r6, r6, __tlb_data_load@l
+	tophys(r7, r7)
+	mtlr	r6
+	blr
+
+/*
+ * Load a I-TLB entry.
+ * r3: context number
+ * r4: effective address
+ * r5: PTE pointer
+ * r6: PMD (level-1 entry)
+ * r7: temp location 
+ */
+_GLOBAL(tlb_instr_load)
+	rlwinm	r8, r4, 0, 0, 19	/* extract page address */
+	ori	r8, r8, MI_EVALID	/* set valid bit */
+	rlwimi	r8, r3, 0, 28, 31	/* insert ASID (low 4 bits of r3) */
 #ifdef CONFIG_8xx_CPU6
+	li	r9, 0x2780;
+	stw	r9, 0(r7);
+	lwz	r9, 0(r7);
+#endif
+	
+	mfspr	r10, SPRN_M_TWB	/* Get level 1 table entry address */
+	tovirt(r10, r10)
+	lwz	r11, 0(r10)	/* Get the level 1 entry */
+	ori	r11,r11,1	/* Set valid bit */
+#ifdef CONFIG_8xx_CPU6
+	li	r9, 0x2b80;
+	stw	r9, 0(r7);
+	lwz	r9, 0(r7);
+#endif
+	mtspr	SPRN_MI_TWC, r11		/* Set segment attributes */
+
+	lwz	r8, 0(r5)
+	rlwinm	r8, r8, 0, 0, 19 
+	ori	r8, r8, 1			/* set valid bit */
+	/* The Linux PTE won't go exactly into the MMU TLB.
+	 * Software indicator bits 21, 22 and 28 must be clear.
+	 * Software indicator bits 24, 25, 26, and 27 must be
+	 * set.  All other Linux PTE bits control the behavior
+	 * of the MMU.
+	 */
+	li	r11, 0x00f0
+	rlwimi	r8, r11, 0, 24, 28	/* Set 24-27, clear 28 */
+#ifdef CONFIG_8xx_CPU6
+	li	r9, 0x2d80;
+	stw	r9, 0(r7);
+	lwz	r9, 0(r7);
+#endif
+	mtspr	SPRN_MI_RPN, r8		/* Update TLB entry */
+	blr
+
 /* It's here because it is unique to the 8xx.
  * It is important we get called with interrupts disabled.  I used to
  * do that, but it appears that all code that calls this already had
@@ -820,7 +993,6 @@
         mtspr   22, r3		/* Update Decrementer */
 	SYNC
 	blr
-#endif
 
 /*
  * We put a few things here that have to be page-aligned.
diff -Nur --exclude-from=linux-2.6-git-dec01/Documentation/dontdiff linux-2.6-git-dec01.orig/arch/ppc/mm/init.c linux-2.6-git-dec01/arch/ppc/mm/init.c
--- linux-2.6-git-dec01.orig/arch/ppc/mm/init.c	2005-12-05 09:47:28.000000000 -0600
+++ linux-2.6-git-dec01/arch/ppc/mm/init.c	2005-12-15 13:16:42.787712408 -0600
@@ -583,6 +583,54 @@
 	kunmap(page);
 }
 
+extern void tlb_data_load(unsigned long id, unsigned long address, pte_t *pte,
+			  unsigned long pmdval, unsigned long *tmpval);
+
+extern void tlb_instr_load(unsigned long id, unsigned long address, pte_t *pte,
+			   unsigned long pmdval, unsigned long *tmpval);
+
+void tlb_preload(struct vm_area_struct *vma, unsigned long address,
+		 pte_t pte)
+{
+	struct mm_struct *mm;
+	pmd_t *pmd;
+	pte_t *ptep;
+	int mapping_executable = 0;
+	unsigned long flags;
+	unsigned long tmp[4];
+
+	if ((vma->vm_flags & VM_EXEC) == VM_EXEC)
+		mapping_executable = 1;
+
+	local_irq_save(flags);
+
+	mm = vma->vm_mm;
+	pmd = pmd_offset(pgd_offset(mm, address), address);
+	if (!pmd_none(*pmd)) {
+		if (mfspr(SPRN_M_CASID) != (mm->context)) {
+			printk(KERN_ERR "CASID:%lx mm->context:%lx\n",
+				mfspr(SPRN_M_CASID), (mm->context));
+			BUG();
+		}
+		ptep = pte_offset_map(pmd, address);
+		if (!pte_present(pte) || !ptep)
+			goto out;
+		if (!mapping_executable)
+			tlb_data_load(mm->context, address, ptep,
+					pmd_val(*pmd), tmp);
+#ifdef NOTYET
+		else 
+			tlb_instr_load(mm->context, address, ptep,
+					pmd_val(*pmd), tmp);
+#endif
+out:
+		pte_unmap(ptep);
+	}
+	local_irq_restore(flags);
+}
+
+extern void tlbie_efficient(unsigned long address, struct vm_area_struct *vma);
+
 /*
  * This is called at the end of handling a user page fault, when the
  * fault has been handled by updating a PTE in the linux page tables.
@@ -614,6 +662,7 @@
 				flush_dcache_icache_page(page);
 			set_bit(PG_arch_1, &page->flags);
 		}
+		tlb_preload(vma, address, pte);
 	}
 
 #ifdef CONFIG_PPC_STD_MMU


* Re: TLB preloading on 8xx
From: Dan Malek @ 2005-12-20 19:45 UTC
  To: Marcelo Tosatti; +Cc: linux-ppc-embedded


On Dec 20, 2005, at 12:37 PM, Marcelo Tosatti wrote:

> Add in the costs of disabling interrupts and disabling translation,
> and you end up with a slow dog. Damn, the TLB miss exceptions are
> indeed efficient.

Like I've always said, make the TLB miss exception path very
short and efficient.  You have to consider the total system impact
of running this code, which includes replacing lots of cache
lines that will affect the performance of the application.

Don't go looking for "tricks" in the exception path; look instead for
ways, outside of it, that we can better structure the page tables so
we can _remove_ code from the exception handler, not add to it.  What
you are doing here is an attempt at that, but you end up executing far
more code to do this preload than the TLB miss exception itself would.
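
As a concrete (made-up) illustration of that direction, not code from
the patch: the PTE->TLB bit shuffling could be done once, when the PTE
is installed, so the miss handler loads it with no fixups.  The
pte_to_hw() helper below is hypothetical; the bit numbers are the ones
the patch comments document.

	/* Hypothetical: precompute a hardware-ready PTE image at
	 * set_pte() time.  Clears software bits 21, 22 and 28 and
	 * sets bits 24-27 (IBM bit numbering), per the MD_RPN
	 * requirements quoted in the patch.
	 */
	static inline unsigned long pte_to_hw(unsigned long pteval)
	{
		pteval &= ~0x608UL;	/* clear bits 21, 22, 28 */
		return pteval | 0xf0UL;	/* set bits 24-27 */
	}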

Keep trying, though, this was a good idea to test :-)

Thanks.

	-- Dan

^ permalink raw reply	[flat|nested] 2+ messages in thread
