* [RFC 0/3] IA64 Long Format VHPT support
@ 2006-05-02 5:25 Ian Wienand
2006-05-02 5:25 ` [RFC 1/3] LVHPT - Fault handler modifications Ian Wienand
` (2 more replies)
0 siblings, 3 replies; 16+ messages in thread
From: Ian Wienand @ 2006-05-02 5:25 UTC (permalink / raw)
To: linux-ia64; +Cc: linux-mm, Ian Wienand
Hi,
Following from this message are patches to enable the Long Format VHPT
on IA64, which I am posting in the hope of community review. They are
against 2.6.17-rc3, and work for machines I have access to. These
patches have long been a chicken-egg problem, but I believe that there
are now multiple people interested in using LVHPT for dynamic page
size support in some form.
There are two papers which reference this work
Itanium Page Tables and TLB
Matthew Chapman, Ian Wienand, Gernot Heiser
http://citeseer.ist.psu.edu/chapman03itanium.html
Itanium - A System Implementor's Tale
Charles Gray, Matthew Chapman, Peter Chubb, David Mosberger-Tang, Gernot Heiser
http://www.usenix.org/events/usenix05/tech/general/gray/gray_html/index.html
Any comments are welcomed.
-i
^ permalink raw reply [flat|nested] 16+ messages in thread
* [RFC 1/3] LVHPT - Fault handler modifications
2006-05-02 5:25 [RFC 0/3] IA64 Long Format VHPT support Ian Wienand
@ 2006-05-02 5:25 ` Ian Wienand
2006-05-02 8:04 ` Keith Owens
2006-05-02 17:40 ` Chen, Kenneth W
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
2006-05-02 5:26 ` [RFC 3/3] LVHPT - LVHPT MM support functions Ian Wienand
2 siblings, 2 replies; 16+ messages in thread
From: Ian Wienand @ 2006-05-02 5:25 UTC (permalink / raw)
To: linux-ia64; +Cc: linux-mm, Ian Wienand
Fault handler changes
The logic behind the two fault paths is graphically layed out
http://www.gelato.unsw.edu.au/IA64wiki/TLBMissFlowchart
Firstly, we have stripped out common code in ivt.S into assembler
macros in ivt-macro.S. The comments before the macros should explain
what each is doing.
The main changes are
vhpt_miss can no longer happen. This fault is only raised when the
walker does not have a mapping for the hashed address; with lvhpt the
hash table is pinned with a single entry.
i/dtlb_miss now has to walk the page tables so that we can insert the
translation into the lvhpt. With short-format the code references the
hashed address, and if the hashed address was not mapped (e.g. a
mapping pointing to a page of PTE entries did not cover it) it would
raise the nested_dtlb_miss handler, which would then walk the page
table and insert a translation for that page of PTE's. However, we
can now no-longer fall into nested_dtlb_miss since the VHPT is always
mapped.
The only other changes are updating the VHPT in fairly obvious places
to make sure it is up to date.
Signed-Off-By: Ian Wienand <ianw@gelato.unsw.edu.au>
---
arch/ia64/kernel/ivt.S | 152 ++++++++++++------------------
include/asm-ia64/ivt-macro.S | 213 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 277 insertions(+), 88 deletions(-)
Index: linux-2.6.17-rc3-lvhpt/arch/ia64/kernel/ivt.S
=================================--- linux-2.6.17-rc3-lvhpt.orig/arch/ia64/kernel/ivt.S 2006-05-02 15:12:35.000000000 +1000
+++ linux-2.6.17-rc3-lvhpt/arch/ia64/kernel/ivt.S 2006-05-02 15:13:23.000000000 +1000
@@ -53,6 +53,9 @@
#include <asm/unistd.h>
#include <asm/errno.h>
+/* Generic macros that can be used in multiple places */
+#include <asm/ivt-macro.S>
+
#if 1
# define PSR_DEFAULT_BITS psr.ac
#else
@@ -103,6 +106,13 @@
* - the faulting virtual address uses unimplemented address bits
* - the faulting virtual address has no valid page table mapping
*/
+
+ /* With LVHPT this fault should not occur, since we have it
+ * permanently mapped
+ */
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ FAULT(0)
+#else
mov r16=cr.ifa // get address that caused the TLB miss
#ifdef CONFIG_HUGETLB_PAGE
movl r18=PAGE_SHIFT
@@ -236,6 +246,7 @@
mov pr=r31,-1 // restore predicate registers
rfi
+#endif /* !CONFIG_IA64_LONG_FORMAT_VHPT */
END(vhpt_miss)
.org ia64_ivt+0x400
@@ -253,15 +264,13 @@
mov r29° // save b0
mov r31=pr // save predicates
.itlb_fault:
- mov r17=cr.iha // get virtual address of PTE
- movl r30\x1f // load nested fault continuation point
- ;;
-1: ld8 r18=[r17] // read *pte
- ;;
- mov b0=r29
- tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
-(p6) br.cond.spnt page_fault
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ LOAD_PTE_MISS r16, r17, r18, r22, page_fault
;;
+ VHPT_INSERT r16, r17, r18, r22
+#else
+ LOAD_PTE_MISS r17, r18, page_fault
+#endif
itc.i r18
;;
#ifdef CONFIG_SMP
@@ -276,6 +285,7 @@
;;
cmp.ne p7,p0=r18,r19
;;
+ VHPT_PURGE p7, r22
(p7) ptc.l r16,r20
#endif
mov pr=r31,-1
@@ -296,16 +306,15 @@
mov r16=cr.ifa // get virtual address
mov r29° // save b0
mov r31=pr // save predicates
-dtlb_fault:
- mov r17=cr.iha // get virtual address of PTE
- movl r30\x1f // load nested fault continuation point
- ;;
-1: ld8 r18=[r17] // read *pte
+.dtlb_fault:
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ LOAD_PTE_MISS r16, r17, r18, r22, page_fault
;;
- mov b0=r29
- tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
-(p6) br.cond.spnt page_fault
+ VHPT_INSERT r16, r17, r18, r22
+#else
+ LOAD_PTE_MISS r17, r18, page_fault
;;
+#endif
itc.d r18
;;
#ifdef CONFIG_SMP
@@ -320,6 +329,7 @@
;;
cmp.ne p7,p0=r18,r19
;;
+ VHPT_PURGE p7, r22
(p7) ptc.l r16,r20
#endif
mov pr=r31,-1
@@ -436,59 +446,17 @@
*
* Clobbered: b0, r18, r19, r21, r22, psr.dt (cleared)
*/
- rsm psr.dt // switch to using physical data addressing
- mov r19=IA64_KR(PT_BASE) // get the page table base address
- shl r21=r16,3 // shift bit 60 into sign bit
- mov r18=cr.itir
- ;;
- shr.u r17=r16,61 // get the region number into r17
- extr.u r18=r18,2,6 // get the faulting page size
- ;;
- cmp.eq p6,p7=5,r17 // is faulting address in region 5?
- add r22=-PAGE_SHIFT,r18 // adjustment for hugetlb address
- add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
- ;;
- shr.u r22=r16,r22
- shr.u r18=r16,r18
-(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
- srlz.d
- LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
-
- .pred.rel "mutex", p6, p7
-(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
-(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
- ;;
-(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
-(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
- cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
-#ifdef CONFIG_PGTABLE_4
- shr.u r18=r22,PUD_SHIFT // shift pud index into position
+ /* This fault should not happen with LVHPT */
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ DBG_FAULT(5)
+ FAULT(5)
#else
- shr.u r18=r22,PMD_SHIFT // shift pmd index into position
-#endif
- ;;
- ld8 r17=[r17] // get *pgd (may be 0)
- ;;
-(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) = NULL?
- dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr)
- ;;
-#ifdef CONFIG_PGTABLE_4
-(p7) ld8 r17=[r17] // get *pud (may be 0)
- shr.u r18=r22,PMD_SHIFT // shift pmd index into position
- ;;
-(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pud_present(*pud) = NULL?
- dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
- ;;
-#endif
-(p7) ld8 r17=[r17] // get *pmd (may be 0)
- shr.u r19=r22,PAGE_SHIFT // shift pte index into position
- ;;
-(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pmd_present(*pmd) = NULL?
- dep r17=r19,r17,3,(PAGE_SHIFT-3) // r17=pte_offset(pmd,addr);
+ FIND_PTE r16, r17, p6, p7
(p6) br.cond.spnt page_fault
mov b0=r30
br.sptk.many b0 // return to continuation point
+#endif
END(nested_dtlb_miss)
.org ia64_ivt+0x1800
@@ -548,16 +516,16 @@
* page table TLB entry isn't present, we take a nested TLB miss hit where we look
* up the physical address of the L3 PTE and then continue at label 1 below.
*/
- mov r16=cr.ifa // get the address that caused the fault
- movl r30\x1f // load continuation point in case of nested fault
- ;;
- thash r17=r16 // compute virtual address of L3 PTE
mov r29° // save b0 in case of nested fault
mov r31=pr // save pr
+ ;;
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ LOAD_PTE_FAULT r16, r17, r18, r22, .dtlb_fault
+#else
+ LOAD_PTE_FAULT r16, r17, r18
+#endif
#ifdef CONFIG_SMP
mov r28=ar.ccv // save ar.ccv
- ;;
-1: ld8 r18=[r17]
;; // avoid RAW on r18
mov ar.ccv=r18 // set compare value for cmpxchg
or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
@@ -568,6 +536,7 @@
;;
(p6) cmp.eq p6,p7=r26,r18 // Only compare if page is present
;;
+ VHPT_UPDATE p6, r18, r22
(p6) itc.d r25 // install updated PTE
;;
/*
@@ -580,17 +549,17 @@
;;
cmp.eq p6,p7=r18,r25 // is it same as the newly installed
;;
+ VHPT_PURGE p7, r22
(p7) ptc.l r16,r24
mov b0=r29 // restore b0
mov ar.ccv=r28
#else
- ;;
-1: ld8 r18=[r17]
;; // avoid RAW on r18
or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
mov b0=r29 // restore b0
;;
st8 [r17]=r18 // store back updated PTE
+ VHPT_UPDATE p0, r18, r22
itc.d r18 // install updated PTE
#endif
mov pr=r31,-1 // restore pr
@@ -604,7 +573,7 @@
DBG_FAULT(9)
// Like Entry 8, except for instruction access
mov r16=cr.ifa // get the address that caused the fault
- movl r30\x1f // load continuation point in case of nested fault
+ mov r29°
mov r31=pr // save predicates
#ifdef CONFIG_ITANIUM
/*
@@ -618,13 +587,16 @@
(p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa
#endif /* CONFIG_ITANIUM */
;;
- thash r17=r16 // compute virtual address of L3 PTE
- mov r29° // save b0 in case of nested fault)
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ LOAD_PTE_FAULT r16, r17, r18, r22, .itlb_fault
+#else
+ LOAD_PTE_FAULT r16, r17, r18
+#endif
+ ;;
+ mov b0=r29 // restore b0
#ifdef CONFIG_SMP
mov r28=ar.ccv // save ar.ccv
;;
-1: ld8 r18=[r17]
- ;;
mov ar.ccv=r18 // set compare value for cmpxchg
or r25=_PAGE_A,r18 // set the accessed bit
tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
@@ -646,17 +618,17 @@
;;
cmp.eq p6,p7=r18,r25 // is it same as the newly installed
;;
+ VHPT_PURGE p7, r22
(p7) ptc.l r16,r24
mov b0=r29 // restore b0
mov ar.ccv=r28
#else /* !CONFIG_SMP */
;;
-1: ld8 r18=[r17]
- ;;
or r18=_PAGE_A,r18 // set the accessed bit
mov b0=r29 // restore b0
;;
st8 [r17]=r18 // store back updated PTE
+ VHPT_UPDATE p0, r18, r22
itc.i r18 // install updated PTE
#endif /* !CONFIG_SMP */
mov pr=r31,-1
@@ -670,15 +642,16 @@
DBG_FAULT(10)
// Like Entry 8, except for data access
mov r16=cr.ifa // get the address that caused the fault
- movl r30\x1f // load continuation point in case of nested fault
- ;;
- thash r17=r16 // compute virtual address of L3 PTE
- mov r31=pr
mov r29° // save b0 in case of nested fault)
+ mov r31=pr
+ ;;
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ LOAD_PTE_FAULT r16, r17, r18, r22, .dtlb_fault
+#else
+ LOAD_PTE_FAULT r16, r17, r18
+#endif
#ifdef CONFIG_SMP
mov r28=ar.ccv // save ar.ccv
- ;;
-1: ld8 r18=[r17]
;; // avoid RAW on r18
mov ar.ccv=r18 // set compare value for cmpxchg
or r25=_PAGE_A,r18 // set the dirty bit
@@ -689,6 +662,7 @@
;;
(p6) cmp.eq p6,p7=r26,r18 // Only if page is present
;;
+ VHPT_UPDATE p6, r18, r22
(p6) itc.d r25 // install updated PTE
/*
* Tell the assemblers dependency-violation checker that the above "itc" instructions
@@ -700,19 +674,21 @@
;;
cmp.eq p6,p7=r18,r25 // is it same as the newly installed
;;
+ VHPT_PURGE p7, r22
(p7) ptc.l r16,r24
+ mov b0=r29 // restore b0
mov ar.ccv=r28
#else
- ;;
-1: ld8 r18=[r17]
;; // avoid RAW on r18
or r18=_PAGE_A,r18 // set the accessed bit
;;
st8 [r17]=r18 // store back updated PTE
+ VHPT_UPDATE p0, r18, r22
itc.d r18 // install updated PTE
#endif
- mov b0=r29 // restore b0
mov pr=r31,-1
+ ;;
+ mov b0=r29 // restore b0
rfi
END(daccess_bit)
Index: linux-2.6.17-rc3-lvhpt/include/asm-ia64/ivt-macro.S
=================================--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.17-rc3-lvhpt/include/asm-ia64/ivt-macro.S 2006-05-02 15:09:48.000000000 +1000
@@ -0,0 +1,213 @@
+/*
+ * Macros for use in ivt.S
+ *
+ * Copyright (C) 2005 see ivt.S for orignal authors
+ * Abstractions some combination of
+ * Matthew Chapman <matthewc@cse.unsw.edu.au>
+ * Darren Williams <darren.williams@nicta.com.au>
+ * Ian Wienand <ianw@gelato.unsw.edu.au>
+ */
+
+/*
+ * FIND_PTE
+ * Walks the page table to find a PTE
+ * @va, register holding virtual address
+ * @ppte, register with pointer to page table entry
+ * @p1, predicate set if found
+ * @p2, predicate set if !found
+ */
+.macro find_pte va, ppte, p1, p2
+ rsm psr.dt // switch to using physical data addressing
+ mov r19=IA64_KR(PT_BASE) // get the page table base address
+ shl r21=\va,3 // shift bit 60 into sign bit
+ mov r18=cr.itir
+ ;;
+ shr.u \ppte=\va,61 // get the region number into ppte
+ extr.u r18=r18,2,6 // get the faulting page size
+ ;;
+ cmp.eq \p1,\p2=5,\ppte // is faulting address in region 5?
+ add r22=-PAGE_SHIFT,r18 // adjustment for hugetlb address
+ add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
+ ;;
+ shr.u r22=\va,r22
+ shr.u r18=\va,r18
+(\p2) dep \ppte=\ppte,r19,(PAGE_SHIFT-3),3 // put region number bits in place
+
+ srlz.d
+ LOAD_PHYSICAL(\p1, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
+
+ .pred.rel "mutex", \p1, \p2
+(\p1) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
+(\p2) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
+ ;;
+(\p1) dep \ppte=r18,r19,3,(PAGE_SHIFT-3) // ppte=pgd_offset for region 5
+(\p2) dep \ppte=r18,\ppte,3,(PAGE_SHIFT-3)-3 // ppte=pgd_offset for region[0-4]
+ cmp.eq \p2,\p1=0,r21 // unused address bits all zeroes?
+#ifdef CONFIG_PGTABLE_4
+ shr.u r18=r22,PUD_SHIFT // shift pud index into position
+#else
+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
+#endif
+ ;;
+ ld8 \ppte=[\ppte] // get *pgd (may be 0)
+ ;;
+(\p2) cmp.eq \p1,\p2=\ppte,r0 // was pgd_present(*pgd) = NULL?
+ dep \ppte=r18,\ppte,3,(PAGE_SHIFT-3) // ppte=p[u|m]d_offset(pgd,addr)
+ ;;
+#ifdef CONFIG_PGTABLE_4
+(\p2) ld8 \ppte=[\ppte] // get *pud (may be 0)
+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
+ ;;
+(\p2) cmp.eq.or.andcm \p1,\p2=\ppte,r0 // was pud_present(*pud) = NULL?
+ dep \ppte=r18,\ppte,3,(PAGE_SHIFT-3) // ppte=pmd_offset(pud,addr)
+ ;;
+#endif
+(\p2) ld8 \ppte=[\ppte] // get *pmd (may be 0)
+ shr.u r19=r22,PAGE_SHIFT // shift pte index into position
+ ;;
+(\p2) cmp.eq.or.andcm \p1,\p2=\ppte,r0 // was pmd_present(*pmd) = NULL?
+ dep \ppte=r19,\ppte,3,(PAGE_SHIFT-3) // ppte=pte_offset(pmd,addr)
+.endm
+
+
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+/*
+ * LOAD_PTE_MISS - load pte into tlb and VHPT
+ * @va, virtual address
+ * @ppte, pointer to the page table entry
+ * @pte, actual pte
+ * @hpte, hash page table entry
+ *
+ * Given a va get the ppte and load its value into pte
+ */
+.macro load_pte_miss va, ppte, pte, hpte, failfn
+ ;;
+ FIND_PTE \va, \ppte, p6, p7
+ ;;
+(p7) ld8 \pte=[\ppte]
+ ;;
+(p7) tbit.z p6,p0=\pte,_PAGE_P_BIT /* page present bit cleared? */
+(p6) br.cond.spnt \failfn
+.endm
+
+/* Since we access the page table physically, we access the long VHPT physically as well
+ * to avoid switching back and forth */
+
+/*
+ * LOAD_PTE_FAULT - get the pte entry from the VHPT for va
+ * @va, virtual address to resolve
+ * @ppte, pointer to the page table entry
+ * @pte, page table entry
+ * @hpte, store pte in this hash page table entry
+ * @failfn, function called if fault not resolved
+ *
+ * Retrieve the pte via the hashed page table and store it in pte=r18
+ * r25 = tag
+ * r26 = htag
+ */
+
+.macro load_pte_fault va, ppte, pte, hpte, failfn
+ thash r28=\va
+ rsm psr.dt
+ ;;
+ tpa \hpte=r28 // make hash address physical
+ ttag r25=\va
+ ;;
+ srlz.d
+ add r24\x16,\hpte
+ add \ppte$,\hpte
+ ;;
+ ld8 r26=[r24] // load tag
+ ld8 \ppte=[\ppte]
+ ;;
+ cmp.ne p6,p7=r26, r25 // verify tag
+ ;;
+(p7) ld8 \pte=[\ppte]
+ ;;
+(p6) mov cr.iha=r28 // set cr.iha only if we are going to take
+(p6) br.cond.spnt \failfn // the failfn fault, it depends on it
+.endm
+
+/*
+ * VHPT_INSERT -
+ * @va, virtual address to be inserted
+ * @ppte, pointer to the page table entry
+ * @pte, page table entry to be inserted
+ * @hpte, insert pte into this hash page table entry
+ *
+ * Insert the va into the VHPT and tlb, the tlb insert
+ * happens in ivt.S for the appropriate fault instruction or data.
+ */
+.macro vhpt_insert va, ppte, pte, hpte
+ mov \hpte=cr.iha
+ mov r26=cr.itir
+ ;;
+ tpa \hpte=\hpte /* make hash address physical */
+ ttag r25=\va
+ ;;
+ add r24\x16,\hpte
+ ;;
+ st8 [\hpte]=\pte,8 /* fill out VHPT entry */
+ st8 [r24]=r25,8
+ ;;
+ st8 [\hpte]=r26,8
+ st8 [r24]=\ppte
+.endm
+
+/*
+ * Update the VHPT with pte value obtained from LOAD_PTE_FAULT
+ */
+.macro vhpt_update cond, pte, hpte
+(\cond) st8 [\hpte]=\pte,16
+.endm
+
+/*
+ * Invalidate the tlb for the VHPT pointing to hpte, this is achieved by
+ * setting the invalid tag bit(63) in the VHPT tag field. A VHPT entry with
+ * ti bit set to one will never be inserted into a processor's TLBs.
+ *
+ */
+.macro vhpt_purge cond, hpte
+(\cond) dep r25=-1,r0,63,1 /* set tag-invalid bit */
+;;
+(\cond) st8 [\hpte]=r25 /* hpte already points to tag (see above) */
+.endm
+
+#else /* !CONFIG_IA64_LONG_FORMAT_VHPT */
+
+/*
+ * LOAD_PTE_MISS
+ * Get a PTE based on the hardware walker's miss address,
+ * branch to the failfn if we can't find it
+ * @ppte, pointer to page table entry
+ * @pte, actual pte
+ * @failfn function to call if PTE not present
+ */
+.macro load_pte_miss ppte, pte, failfn
+ mov \ppte=cr.iha // get virtual address of L3 PTE
+ movl r30\x1f // load nested fault continuation point
+ ;;
+1: ld8 \pte=[\ppte] // read L3 PTE
+ ;;
+ mov b0=r29
+ tbit.z p6,p0=\pte,_PAGE_P_BIT // page present bit cleared?
+(p6) br.cond.spnt \failfn
+.endm
+
+/*
+ * LOAD_PTE_FAULT
+ * get a PTE from the hashed page table
+ * Note we only set r30 and don't save the other registers
+ * as required for nested_dtlb_miss.
+ * @va, register holding virtual address
+ * @ppte, register to hold pointer to pte
+ * @pte, register to hold pte value
+ */
+.macro load_pte_fault va, ppte, pte
+ thash \ppte=\va // get virtual address of L3
+ movl r30\x1f // load continuation for nested_dtlb_miss
+ ;;
+1: ld8 \pte=[\ppte]
+.endm
+
+#endif /* CONFIG_IA64_LONG_FORMAT_VHPT */
^ permalink raw reply [flat|nested] 16+ messages in thread
* [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 5:25 [RFC 0/3] IA64 Long Format VHPT support Ian Wienand
2006-05-02 5:25 ` [RFC 1/3] LVHPT - Fault handler modifications Ian Wienand
@ 2006-05-02 5:25 ` Ian Wienand
2006-05-02 15:03 ` Luck, Tony
` (5 more replies)
2006-05-02 5:26 ` [RFC 3/3] LVHPT - LVHPT MM support functions Ian Wienand
2 siblings, 6 replies; 16+ messages in thread
From: Ian Wienand @ 2006-05-02 5:25 UTC (permalink / raw)
To: linux-ia64; +Cc: linux-mm, Ian Wienand
LVHPT setup
The following patch sets up the LVHPT on boot.
For the initial boot processor, we allocate the VHPT in ia64_mmu_init.
Other CPUs get the LVHPT allocated from do_boot_cpu before they are
woken up.
The logic is per CPU, but it attempts to choose a reasonable size that
can be pinned in the TLB. There are facilities to clamp it to a
specific size.
Signed-Off-By: Ian Wienand <ianw@gelato.unsw.edu.au>
---
Documentation/kernel-parameters.txt | 14 ++
arch/ia64/Kconfig | 10 +
arch/ia64/kernel/setup.c | 30 +++++
arch/ia64/kernel/smpboot.c | 12 ++
arch/ia64/mm/init.c | 187 +++++++++++++++++++++++++++++++++---
include/asm-ia64/pgtable.h | 20 +++
6 files changed, 261 insertions(+), 12 deletions(-)
Index: linux-2.6.17-rc3/arch/ia64/Kconfig
=================================--- linux-2.6.17-rc3.orig/arch/ia64/Kconfig 2006-05-01 15:35:44.000000000 +1000
+++ linux-2.6.17-rc3/arch/ia64/Kconfig 2006-05-01 15:35:51.000000000 +1000
@@ -374,6 +374,16 @@
def_bool y
depends on NEED_MULTIPLE_NODES
+config IA64_LONG_FORMAT_VHPT
+ bool "Long format VHPT"
+ depends on !DISABLE_VHPT
+ help
+ The long format VHPT is an alternative hashed page table. Advantages
+ of the long format VHPT are lower memory usage when there are a large
+ number of processes in the system.
+ The short format page table walker is currently the Linux default.
+ If you're unsure, answer N.
+
config IA32_SUPPORT
bool "Support for Linux/x86 binaries"
help
Index: linux-2.6.17-rc3/arch/ia64/kernel/setup.c
=================================--- linux-2.6.17-rc3.orig/arch/ia64/kernel/setup.c 2006-05-01 15:35:44.000000000 +1000
+++ linux-2.6.17-rc3/arch/ia64/kernel/setup.c 2006-05-01 15:35:51.000000000 +1000
@@ -284,6 +284,18 @@
#endif
}
+static void __init parse_cmdline_early (char ** cmdline_p)
+{
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ char *p;
+ extern int lvhpt_bits_clamp_setup(char *s);
+
+ strlcpy(saved_command_line, *cmdline_p, COMMAND_LINE_SIZE);
+ if ((p = strstr(*cmdline_p, "lvhpt_bits_clamp=")))
+ lvhpt_bits_clamp_setup(p + 17);
+#endif
+}
+
static void __init
io_port_init (void)
{
@@ -400,12 +412,14 @@
void __init
setup_arch (char **cmdline_p)
{
+ extern void __devinit ia64_tlb_early_init(void);
+
unw_init();
ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
*cmdline_p = __va(ia64_boot_param->command_line);
- strlcpy(saved_command_line, *cmdline_p, COMMAND_LINE_SIZE);
+ parse_cmdline_early(cmdline_p);
efi_init();
io_port_init();
@@ -438,6 +452,20 @@
ia64_setup_printk_clock();
+ /* Setup some information about the TLBS */
+ ia64_tlb_early_init();
+
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ /*
+ * put this after all the ACPI walking so we can get the size
+ * of memory on nodes
+ */
+ {
+ extern void compute_vhpt_size(void);
+ compute_vhpt_size();
+ }
+#endif
+
#ifdef CONFIG_SMP
cpu_physical_id(0) = hard_smp_processor_id();
Index: linux-2.6.17-rc3/arch/ia64/kernel/smpboot.c
=================================--- linux-2.6.17-rc3.orig/arch/ia64/kernel/smpboot.c 2006-05-01 15:35:44.000000000 +1000
+++ linux-2.6.17-rc3/arch/ia64/kernel/smpboot.c 2006-05-01 15:35:51.000000000 +1000
@@ -478,6 +478,11 @@
complete(&c_idle->done);
}
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+/* required for do_boot_cpu, defined in init.c */
+extern unsigned int alloc_vhpt(int cpu);
+#endif
+
static int __devinit
do_boot_cpu (int sapicid, int cpu)
{
@@ -512,6 +517,13 @@
do_rest:
task_for_booting_cpu = c_idle.idle;
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ if (alloc_vhpt(cpu)) {
+ panic("Couldn't allocate VHPT on CPU %d\n", cpu);
+ }
+ Dprintk("Allocated long format VHPT for CPU %d at: 0x%lx, size: 0x%lx\n", cpu, vhpt_base[cpu], long_vhpt_size(cpu));
+#endif /* CONFIG_IA64_LONG_FORMAT_VHPT */
+
Dprintk("Sending wakeup vector %lu to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid);
set_brendez_area(cpu);
Index: linux-2.6.17-rc3/arch/ia64/mm/init.c
=================================--- linux-2.6.17-rc3.orig/arch/ia64/mm/init.c 2006-05-01 15:35:44.000000000 +1000
+++ linux-2.6.17-rc3/arch/ia64/mm/init.c 2006-05-01 15:35:51.000000000 +1000
@@ -42,6 +42,11 @@
DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist);
DEFINE_PER_CPU(long, __pgtable_quicklist_size);
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+unsigned long vhpt_base[NR_CPUS];
+unsigned long long_vhpt_bits[MAX_NUMNODES];
+#endif
+
extern void ia64_tlb_init (void);
unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
@@ -335,10 +340,140 @@
ia64_patch_gate();
}
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+/*
+ * This code must be called on a CPU which has it's MMU
+ * initialized. The page allocator seems to depend on it.
+ *
+ * Returns 0 on success.
+ */
+unsigned int
+alloc_vhpt(int cpu)
+{
+ int node = cpu_to_node(cpu);
+
+#ifdef CONFIG_NUMA
+ struct page *page;
+
+ page = alloc_pages_node(node, __GFP_HIGHMEM|GFP_ATOMIC, long_vhpt_bits[node] - PAGE_SHIFT);
+ if (!page)
+ return -1;
+ vhpt_base[cpu] = (unsigned long) page_address(page);
+#else
+ vhpt_base[cpu] = (unsigned long)__get_free_pages(__GFP_HIGHMEM|GFP_ATOMIC,
+ long_vhpt_bits[node] - PAGE_SHIFT);
+#endif
+ return (vhpt_base[cpu] = 0);
+}
+
+static int lvhpt_bits_clamp;
+
+/*
+ * This allows you to clamp the number of bits used for the long
+ * format vhpt. TODO check for invalid values here.
+ */
+
+int __init
+lvhpt_bits_clamp_setup(char *s)
+{
+ if (sscanf(s, "%d", &lvhpt_bits_clamp) <= 0)
+ lvhpt_bits_clamp = 0;
+ return 1;
+}
+
+__setup("lvhpt_bits_clamp=", lvhpt_bits_clamp_setup);
+
+static unsigned long vhpt_addressable_memory[MAX_NUMNODES];
+
+/*
+ * Passed to efi_memmap_walk to simply add up how much memory we have.
+ * This is used to size the LVHPT
+ */
+static int
+get_total_ram(unsigned long start, unsigned long end, void *arg)
+{
+ unsigned long *s = arg;
+ *s += (end - start);
+ return 0;
+}
+
+#ifdef CONFIG_NUMA
+static void compute_vhpt_size_numa(void)
+{
+ int i;
+
+ if (lvhpt_bits_clamp)
+ {
+ printk(KERN_INFO "Clamping LVHPT to %d bits\n", lvhpt_bits_clamp);
+ for (i = 0; i < MAX_NUMNODES; i++)
+ long_vhpt_bits[i] = lvhpt_bits_clamp;
+ return;
+ }
+
+ /* In the NUMA case, we evaluate how much memory each node has
+ * and then try to size it to three times the physical memory
+ * of the node (as this gives us the best coverage. As we pin
+ * this with a TLB entry, we need to make sure the size we
+ * choose is however suitable for the architecture.
+ */
+ for (i = 0; i < num_node_memblks; i++) {
+ printk(KERN_ERR "vhpt_addr_mem[%d] = %lx\n", node_memblk[i].nid, node_memblk[i].size);
+ vhpt_addressable_memory[node_memblk[i].nid] ++ node_memblk[i].size;
+ }
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ {
+ unsigned long size + 3 * (vhpt_addressable_memory[i] >> PAGE_SHIFT);
+ long_vhpt_bits[i] = find_largest_page_size(size);
+ }
+}
+#endif
+
+/* This version in both NUMA and non-NUMA, since we can use it in either. */
+static void compute_vhpt_size_non_numa(void)
+{
+ /*
+ * In the non-NUMA case we just put everything in the first
+ * node space and take a guess.
+ */
+ if (lvhpt_bits_clamp)
+ {
+ long_vhpt_bits[0] = lvhpt_bits_clamp;
+ printk(KERN_INFO "Clamping LVHPT to %d bits\n", lvhpt_bits_clamp);
+ return;
+ }
+ efi_memmap_walk(get_total_ram, &vhpt_addressable_memory);
+ /*
+ * For some reason the above doesn't work with the simulator.
+ * Clamp it to a fairly reasonable 4 megabytes
+ */
+ long_vhpt_bits[0] = max(22, find_largest_page_size(vhpt_addressable_memory[0] >> PAGE_SHIFT));
+}
+
+void __init
+compute_vhpt_size(void)
+{
+#ifdef CONFIG_NUMA
+ /* Machines like the ZX1 don't setup all the node info we
+ * require, but someone might still try to boot a NUMA kernel
+ * on it. In this case, fall back to our non-numa case.*/
+ if (num_node_memblks = 0)
+ compute_vhpt_size_non_numa();
+ else
+ compute_vhpt_size_numa();
+#else
+ compute_vhpt_size_non_numa();
+#endif
+}
+
+#endif /* CONFIG_IA64_LONG_FORMAT_VHPT */
+
void __devinit
ia64_mmu_init (void *my_cpu_data)
{
- unsigned long psr, pta, impl_va_bits;
+ unsigned long psr, pta;
extern void __devinit tlb_init (void);
#ifdef CONFIG_DISABLE_VHPT
@@ -347,16 +482,48 @@
# define VHPT_ENABLE_BIT 1
#endif
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ int cpu = smp_processor_id();
+#ifdef CONFIG_NUMA
+ int node = cpu_to_node_map[cpu];
+#else
+ int node = 0; // only one node
+#endif
+ /* boot CPU is guaranteed to be zero, I read that somewhere */
+ if (cpu = 0)
+ {
+ unsigned long size = long_vhpt_size(0);
+ vhpt_base[cpu] = (unsigned long)__alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
+ if (vhpt_base[cpu] = 0)
+ panic("Couldn't allocate VHPT on CPU %d, size: 0x%lx!\n",
+ cpu, long_vhpt_size(0));
+ printk(KERN_INFO "Allocated long format VHPT for boot processor (CPU %d) at: 0x%lx, size: 0x%lx\n",
+ cpu, vhpt_base[cpu], long_vhpt_size(0));
+ }
+#else /* !CONFIG_IA64_LONG_FORMAT_VHPT */
+ unsigned long impl_va_bits;
+#endif
/* Pin mapping for percpu area into TLB */
psr = ia64_clear_ic();
ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
PERCPU_PAGE_SHIFT);
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ /* Insert the permanent translation for the VHPT */
+ ia64_itr(0x2, IA64_TR_LONG_VHPT, LONG_VHPT_BASE,
+ pte_val(pfn_pte(__pa(vhpt_base[cpu]) >> PAGE_SHIFT, PAGE_KERNEL)), long_vhpt_bits[node]);
+#endif
ia64_set_psr(psr);
ia64_srlz_i();
-
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+# define VHPT_FORMAT_BIT 1
+# define vhpt_bits long_vhpt_bits[node]
+ pta = LONG_VHPT_BASE;
+#else
/*
+ * SHORT FORMAT VHPT (virtually mapped linear pagetable)
+ *
* Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
* address space. The IA-64 architecture guarantees that at least 50 bits of
* virtual address space are implemented but if we pick a large enough page size
@@ -367,6 +534,7 @@
* address space to not permit mappings that would overlap with the VMLPT.
* --davidm 00/12/06
*/
+# define VHPT_FORMAT_BIT 0
# define pte_bits 3
# define mapped_space_bits (3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
/*
@@ -376,28 +544,27 @@
* non-speculative accesses to the virtual page table, so the address range of the
* virtual page table itself needs to be covered by virtual page table.
*/
-# define vmlpt_bits (impl_va_bits - PAGE_SHIFT + pte_bits)
+# define vhpt_bits (impl_va_bits - PAGE_SHIFT + pte_bits)
# define POW2(n) (1ULL << (n))
-
impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
if (impl_va_bits < 51 || impl_va_bits > 61)
panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);
/*
* mapped_space_bits - PAGE_SHIFT is the total number of ptes we need,
- * which must fit into "vmlpt_bits - pte_bits" slots. Second half of
+ * which must fit into "vhpt_bits - pte_bits" slots. Second half of
* the test makes sure that our mapped space doesn't overlap the
* unimplemented hole in the middle of the region.
*/
- if ((mapped_space_bits - PAGE_SHIFT > vmlpt_bits - pte_bits) ||
+ if ((mapped_space_bits - PAGE_SHIFT > vhpt_bits - pte_bits) ||
(mapped_space_bits > impl_va_bits - 1))
panic("Cannot build a big enough virtual-linear page table"
" to cover mapped address space.\n"
" Try using a smaller page size.\n");
-
/* place the VMLPT at the end of each page-table mapped region: */
- pta = POW2(61) - POW2(vmlpt_bits);
+ pta = POW2(61) - POW2(vhpt_bits);
+#endif
/*
* Set the (virtually mapped linear) page table address. Bit
@@ -405,10 +572,8 @@
* size of the table, and bit 0 whether the VHPT walker is
* enabled.
*/
- ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);
-
+ ia64_set_pta(pta | (VHPT_FORMAT_BIT << 8) | (vhpt_bits << 2) | VHPT_ENABLE_BIT);
ia64_tlb_init();
-
#ifdef CONFIG_HUGETLB_PAGE
ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
ia64_srlz_d();
Index: linux-2.6.17-rc3/include/asm-ia64/pgtable.h
=================================--- linux-2.6.17-rc3.orig/include/asm-ia64/pgtable.h 2006-05-01 15:35:44.000000000 +1000
+++ linux-2.6.17-rc3/include/asm-ia64/pgtable.h 2006-05-01 15:35:51.000000000 +1000
@@ -556,6 +556,21 @@
extern void memmap_init (unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn);
# endif /* CONFIG_VIRTUAL_MEM_MAP */
+
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+extern unsigned long vhpt_base[NR_CPUS];
+extern unsigned long long_vhpt_bits[MAX_NUMNODES];
+static inline unsigned long long_vhpt_size(int cpu)
+{
+#ifdef CONFIG_NUMA
+ return (1UL << long_vhpt_bits[cpu_to_node_map[cpu]]);
+#else
+ /* For now, all CPUs in non-numa case have the same size VHPT */
+ return (1UL << long_vhpt_bits[0]);
+#endif
+}
+#endif
+
# endif /* !__ASSEMBLY__ */
/*
@@ -576,6 +591,11 @@
#define KERNEL_TR_PAGE_SIZE (1 << KERNEL_TR_PAGE_SHIFT)
/*
+ * Long format VHPT
+ */
+#define LONG_VHPT_BASE (0xc000000000000000 - long_vhpt_size(smp_processor_id()))
+
+/*
* No page table caches to initialise
*/
#define pgtable_cache_init() do { } while (0)
Index: linux-2.6.17-rc3/Documentation/kernel-parameters.txt
=================================--- linux-2.6.17-rc3.orig/Documentation/kernel-parameters.txt 2006-05-01 15:35:44.000000000 +1000
+++ linux-2.6.17-rc3/Documentation/kernel-parameters.txt 2006-05-01 15:35:51.000000000 +1000
@@ -50,6 +50,7 @@
ISDN Appropriate ISDN support is enabled.
JOY Appropriate joystick support is enabled.
LP Printer support is enabled.
+ LONG_FORMAT_VHPT Long Format VHPT is enabled
LOOP Loopback device support is enabled.
M68k M68k architecture is enabled.
These options have more detailed description inside of
@@ -805,6 +806,19 @@
ltpc= [NET]
Format: <io>,<irq>,<dma>
+ lvhpt_bits_clamp= [IA64,LONG_FORMAT_VHPT]
+ Format: <1-39>
+
+ Clamp the size of the LVHPT (on all
+ nodes for a NUMA system) to 2^n bits.
+ E.g. 2^22 gives a LVHPT of 4MB. We
+ pin a TLB entry of this size, so the
+ size must be valid for the
+ architecture, otherwise your kernel
+ will not boot. By default we take a
+ good guess at sizing this for optimal
+ operation.
+
mac5380= [HW,SCSI] Format:
<can_queue>,<cmd_per_lun>,<sg_tablesize>,<hostid>,<use_tags>
^ permalink raw reply [flat|nested] 16+ messages in thread
* [RFC 3/3] LVHPT - LVHPT MM support functions
2006-05-02 5:25 [RFC 0/3] IA64 Long Format VHPT support Ian Wienand
2006-05-02 5:25 ` [RFC 1/3] LVHPT - Fault handler modifications Ian Wienand
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
@ 2006-05-02 5:26 ` Ian Wienand
2 siblings, 0 replies; 16+ messages in thread
From: Ian Wienand @ 2006-05-02 5:26 UTC (permalink / raw)
To: linux-ia64; +Cc: linux-mm, Ian Wienand
LVHPT support
Functions for dealing with LVHPT flushing and miscellaneous other mm
functions.
Signed-Off-By: Ian Wienand <ianw@gelato.unsw.edu.au>
---
arch/ia64/mm/tlb.c | 78 ++++++++++++++++++++++++++++++++---------
include/asm-ia64/kregs.h | 1
include/asm-ia64/mmu_context.h | 21 ++++++++++-
include/asm-ia64/page.h | 5 ++
include/asm-ia64/tlb.h | 26 +++++++++++++
include/asm-ia64/tlbflush.h | 24 ++++++++++++
6 files changed, 137 insertions(+), 18 deletions(-)
Index: linux-2.6.17-rc3/arch/ia64/mm/tlb.c
=================================--- linux-2.6.17-rc3.orig/arch/ia64/mm/tlb.c 2006-05-01 15:35:43.000000000 +1000
+++ linux-2.6.17-rc3/arch/ia64/mm/tlb.c 2006-05-01 15:35:54.000000000 +1000
@@ -26,11 +26,10 @@
#include <asm/pal.h>
#include <asm/tlbflush.h>
#include <asm/dma.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
-static struct {
- unsigned long mask; /* mask of supported purge page-sizes */
- unsigned long max_bits; /* log2 of largest supported purge page-size */
-} purge;
+struct ia64_page_sizes_t ia64_page_sizes;
struct ia64_ctx ia64_ctx = {
.lock = SPIN_LOCK_UNLOCKED,
@@ -115,6 +114,14 @@
{
unsigned long i, j, flags, count0, count1, stride0, stride1, addr;
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ unsigned long page;
+
+ /* Admittedly 0 is a valid tag, but in that rare case the present bit will save us */
+ for (page = LONG_VHPT_BASE; page < LONG_VHPT_BASE+long_vhpt_size(smp_processor_id()); page += PAGE_SIZE)
+ clear_page((void *)page);
+#endif
+
addr = local_cpu_data->ptce_base;
count0 = local_cpu_data->ptce_count[0];
count1 = local_cpu_data->ptce_count[1];
@@ -133,12 +140,42 @@
ia64_srlz_i(); /* srlz.i implies srlz.d */
}
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+static void
+flush_vhpt_range (struct mm_struct *mm, unsigned long from, unsigned long to)
+{
+ unsigned long addr;
+
+ for (addr = from; addr < to; addr += PAGE_SIZE)
+ flush_vhpt_page(addr);
+
+#ifdef CONFIG_SMP
+ {
+ /* Urgh... flush VHPTs of any other CPUs that have run this mm */
+ extern unsigned long vhpt_base[];
+ unsigned long offset;
+ long_pte_t *hpte;
+ int cpu;
+
+ for_each_cpu_mask(cpu, mm->cpu_vm_mask)
+ {
+ for (addr = from; addr < to; addr += PAGE_SIZE)
+ {
+ offset = ia64_thash(addr) & (long_vhpt_size(cpu)-1);
+ hpte = (long_pte_t *)(vhpt_base[cpu] + offset);
+ hpte->tag = INVALID_TAG;
+ }
+ }
+ }
+#endif
+}
+#endif /* CONFIG_IA64_LONG_FORMAT_VHPT */
+
void
flush_tlb_range (struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long size = end - start;
unsigned long nbits;
#ifndef CONFIG_SMP
@@ -148,12 +185,10 @@
}
#endif
- nbits = ia64_fls(size + 0xfff);
- while (unlikely (((1UL << nbits) & purge.mask) = 0) &&
- (nbits < purge.max_bits))
- ++nbits;
- if (nbits > purge.max_bits)
- nbits = purge.max_bits;
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ flush_vhpt_range(mm, start, end);
+#endif
+ nbits = find_largest_page_size(end-start);
start &= ~((1UL << nbits) - 1);
preempt_disable();
@@ -173,19 +208,28 @@
}
EXPORT_SYMBOL(flush_tlb_range);
+/*
+ * We need this data early in the boot, so it gets called from
+ * setup_arch()
+ */
void __devinit
-ia64_tlb_init (void)
+ia64_tlb_early_init (void)
{
- ia64_ptce_info_t ptce_info;
- unsigned long tr_pgbits;
long status;
+ unsigned long tr_pgbits;
- if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) {
+ if ((status = ia64_pal_vm_page_size(&tr_pgbits, &ia64_page_sizes.mask)) != 0) {
printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld;"
"defaulting to architected purge page-sizes.\n", status);
- purge.mask = 0x115557000UL;
+ ia64_page_sizes.mask = 0x115557000UL;
}
- purge.max_bits = ia64_fls(purge.mask);
+ ia64_page_sizes.max_bits = ia64_fls(ia64_page_sizes.mask);
+}
+
+void __devinit
+ia64_tlb_init (void)
+{
+ ia64_ptce_info_t ptce_info;
ia64_get_ptce(&ptce_info);
local_cpu_data->ptce_base = ptce_info.base;
Index: linux-2.6.17-rc3/include/asm-ia64/kregs.h
=================================--- linux-2.6.17-rc3.orig/include/asm-ia64/kregs.h 2006-05-01 15:35:43.000000000 +1000
+++ linux-2.6.17-rc3/include/asm-ia64/kregs.h 2006-05-01 15:35:54.000000000 +1000
@@ -31,6 +31,7 @@
#define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */
#define IA64_TR_PERCPU_DATA 1 /* dtr1: percpu data */
#define IA64_TR_CURRENT_STACK 2 /* dtr2: maps kernel's memory- & register-stacks */
+#define IA64_TR_LONG_VHPT 3 /* dtr3: maps long format VHPT */
/* Processor status register bits: */
#define IA64_PSR_BE_BIT 1
Index: linux-2.6.17-rc3/include/asm-ia64/mmu_context.h
=================================--- linux-2.6.17-rc3.orig/include/asm-ia64/mmu_context.h 2006-05-01 15:35:43.000000000 +1000
+++ linux-2.6.17-rc3/include/asm-ia64/mmu_context.h 2006-05-01 15:35:54.000000000 +1000
@@ -18,7 +18,21 @@
#define IA64_REGION_ID_KERNEL 0 /* the kernel's region id (tlb.c depends on this being 0) */
-#define ia64_rid(ctx,addr) (((ctx) << 3) | (addr >> 61))
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+/*
+ * Due to a high number of collisions in the long format VHPT walker hash function
+ * when RIDs and similar address space layout occur "eg. fork()". The following is
+ * used to space out the RIDs we present to the hardware without messing with Linux's
+ * sequential allocation scheme.
+ * Refer to 'Intel Itanium Processor Reference Manual for Software Development'
+ * http://www.intel.com/design/itanium/manuals.htm
+ */
+#define redistribute_rid(rid) (((rid) & ~0xffff) | (((rid) << 8) & 0xff00) | (((rid) >> 8) & 0xff))
+#else
+#define redistribute_rid(rid) (rid)
+#endif
+
+#define ia64_rid(ctx,addr) redistribute_rid(((ctx) << 3) | (addr >> 61))
# include <asm/page.h>
# ifndef __ASSEMBLY__
@@ -135,7 +149,12 @@
old_rr4 = ia64_get_rr(RGN_BASE(RGN_HPAGE));
rid = context << 3; /* make space for encoding the region number */
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+ rid = redistribute_rid(rid);
+ rid_incr = 1 << 16;
+#else
rid_incr = 1 << 8;
+#endif
/* encode the region id, preferred page size, and VHPT enable bit: */
rr0 = (rid << 8) | (PAGE_SHIFT << 2) | 1;
Index: linux-2.6.17-rc3/include/asm-ia64/page.h
=================================--- linux-2.6.17-rc3.orig/include/asm-ia64/page.h 2006-05-01 15:35:43.000000000 +1000
+++ linux-2.6.17-rc3/include/asm-ia64/page.h 2006-05-01 15:35:54.000000000 +1000
@@ -175,6 +175,11 @@
return order;
}
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+/* Long format VHPT entry */
+typedef struct { unsigned long pte, itir, tag, ig; } long_pte_t;
+#endif
+
# endif /* __KERNEL__ */
#endif /* !__ASSEMBLY__ */
Index: linux-2.6.17-rc3/include/asm-ia64/tlbflush.h
=================================--- linux-2.6.17-rc3.orig/include/asm-ia64/tlbflush.h 2006-05-01 15:35:43.000000000 +1000
+++ linux-2.6.17-rc3/include/asm-ia64/tlbflush.h 2006-05-01 15:35:54.000000000 +1000
@@ -19,6 +19,21 @@
* can be very expensive, so try to avoid them whenever possible.
*/
+/* Flushing a translation from the long format VHPT */
+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
+# define INVALID_TAG (1UL << 63)
+
+static inline void
+flush_vhpt_page(unsigned long addr)
+{
+ long_pte_t *hpte;
+ hpte = (long_pte_t *)ia64_thash(addr);
+ hpte->tag = INVALID_TAG;
+}
+#else
+# define flush_vhpt_page(addr) do { } while (0)
+#endif
+
/*
* Flush everything (kernel mapping may also have changed due to
* vmalloc/vfree).
@@ -54,6 +69,12 @@
set_bit(mm->context, ia64_ctx.flushmap);
mm->context = 0;
+ /* XXX smp_flush_tlb_mm actually enables and disables preempt
+ * ... maybe we should refactor all this
+ */
+ cpu_clear(get_cpu(), mm->cpu_vm_mask);
+ put_cpu();
+
if (atomic_read(&mm->mm_users) = 0)
return; /* happens as a result of exit_mmap() */
@@ -76,7 +97,10 @@
flush_tlb_range(vma, (addr & PAGE_MASK), (addr & PAGE_MASK) + PAGE_SIZE);
#else
if (vma->vm_mm = current->active_mm)
+ {
+ flush_vhpt_page(addr);
ia64_ptcl(addr, (PAGE_SHIFT << 2));
+ }
else
vma->vm_mm->context = 0;
#endif
Index: linux-2.6.17-rc3/include/asm-ia64/tlb.h
=================================--- linux-2.6.17-rc3.orig/include/asm-ia64/tlb.h 2006-05-01 15:35:43.000000000 +1000
+++ linux-2.6.17-rc3/include/asm-ia64/tlb.h 2006-05-01 15:35:54.000000000 +1000
@@ -107,8 +107,10 @@
vma.vm_mm = tlb->mm;
/* flush the address range from the tlb: */
flush_tlb_range(&vma, start, end);
+#ifndef CONFIG_IA64_LONG_FORMAT_VHPT
/* now flush the virt. page-table area mapping the address range: */
flush_tlb_range(&vma, ia64_thash(start), ia64_thash(end));
+#endif
}
/* lastly, release the freed pages */
@@ -200,6 +202,30 @@
tlb->end_addr = address + PAGE_SIZE;
}
+/*
+ * Find an architecture suitable page size based big enough to map
+ * input size. Return the number of bits; i.e. (1 << nbits) is the
+ * page size in bytes.
+ */
+struct ia64_page_sizes_t {
+ unsigned long mask; /* mask of supported page-sizes */
+ unsigned long max_bits; /* log2 of largest supported page-size */
+};
+extern struct ia64_page_sizes_t ia64_page_sizes; /* initalised in ia64_tlb_init from EFI */
+
+static inline unsigned long
+find_largest_page_size(unsigned long size)
+{
+ int nbits = ia64_fls(size + 0xfff);
+ while (unlikely (((1UL << nbits) & ia64_page_sizes.mask) = 0) &&
+ (nbits < ia64_page_sizes.max_bits))
+ ++nbits;
+ if (nbits > ia64_page_sizes.max_bits)
+ nbits = ia64_page_sizes.max_bits;
+ return nbits;
+}
+
+
#define tlb_migrate_finish(mm) platform_tlb_migrate_finish(mm)
#define tlb_start_vma(tlb, vma) do { } while (0)
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC 1/3] LVHPT - Fault handler modifications
2006-05-02 5:25 ` [RFC 1/3] LVHPT - Fault handler modifications Ian Wienand
@ 2006-05-02 8:04 ` Keith Owens
2006-05-02 17:40 ` Chen, Kenneth W
1 sibling, 0 replies; 16+ messages in thread
From: Keith Owens @ 2006-05-02 8:04 UTC (permalink / raw)
To: Ian Wienand; +Cc: linux-ia64, linux-mm
Ian Wienand (on Tue, 02 May 2006 15:25:51 +1000) wrote:
>Firstly, we have stripped out common code in ivt.S into assembler
>macros in ivt-macro.S. The comments before the macros should explain
>what each is doing.
Make that ivt.h to match the existing codebase, entry.S has entry.h.
ivt-macro.S is not standalone assembler.
These patches contain trailing whitespace on at least 15 lines.
>The main changes are
>
>vhpt_miss can no longer happen. This fault is only raised when the
>walker does not have a mapping for the hashed address; with lvhpt the
>hash table is pinned with a single entry.
ia64_do_tlb_purge() purges the fxed TR entries on an MCA caused by
invalid TLB entries, ia64_reload_tr() then reloads the fixed TR
entries. IA64_TR_LONG_VHPT must be added to both ia64_do_tlb_purge()
and ia64_reload_tr().
compute_vhpt_size_numa() has the comment
/* In the NUMA case, we evaluate how much memory each node has
* and then try to size it to three times the physical memory
* of the node (as this gives us the best coverage. As we pin
* this with a TLB entry, we need to make sure the size we
* choose is however suitable for the architecture.
*/
How will this work with cpu and memory hotplug?
>+#ifdef CONFIG_IA64_LONG_FORMAT_VHPT
>+ LOAD_PTE_MISS r16, r17, r18, r22, page_fault
>...
>+#else
>+ LOAD_PTE_MISS r17, r18, page_fault
>+#endif
I do not like LOAD_PTE_MISS being defined with different numbers and
order of parameters depending on the config. Use one LOAD_PTE_MISS
macro that always takes ppte, pte, failfn, va and hpte (in that order).
Then ignore va and hpte for the short form VHPT, hidden inside the
macro definition.
BTW, load_pte_miss claims to take an hpte parameter, but it is not
used.
It is difficult to see what has really changed in ivt.S because of the
change to macros and the addition of LONG_FORMAT_VHPT at the same time.
Could you split the first patch in two? One patch to add the macros
and a second one to add LONG_FORMAT_VHPT would be much easier to
understand.
The macros use hardcoded work registers like r18, r19 and r21. That is
going to make it really awkward to maintain, I hate macros with hidden
side effects. Either pass the work registers to the macros or document
what registers these macros clobber.
arch/ia64/kernel/setup.c:+ extern int lvhpt_bits_clamp_setup(char *s);
arch/ia64/kernel/setup.c:+ extern void __devinit ia64_tlb_early_init(void);
arch/ia64/kernel/setup.c:+ extern void compute_vhpt_size(void);
arch/ia64/kernel/smpboot.c:+extern unsigned int alloc_vhpt(int cpu);
arch/ia64/mm/tlb.c:+ extern unsigned long vhpt_base[];
Adding more extern to C files, yuck! That's what headers are for.
Could you explain how VHPT_PURGE works with LONG_FORMAT_VHPT=n? I am
puzzled why the patch has VHPT_PURGE not protected by #ifdef
CONFIG_LONG_FORMAT_VHPT.
^ permalink raw reply [flat|nested] 16+ messages in thread
* RE: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
@ 2006-05-02 15:03 ` Luck, Tony
2006-05-02 21:29 ` Ian Wienand
2006-05-02 17:30 ` Chen, Kenneth W
` (4 subsequent siblings)
5 siblings, 1 reply; 16+ messages in thread
From: Luck, Tony @ 2006-05-02 15:03 UTC (permalink / raw)
To: Ian Wienand, linux-ia64; +Cc: linux-mm
Ian,
Thanks for keeping this alive. Previous measurements on long
format VHPT were mostly close to neutral performance-wise with
short format ... so this is still waiting for the killer-app in
the form of another patch that actually uses features of the
long format VHPT to do something that can't easily be done by
the short format to give me an incentive to complicate the code
by adding yet another CONFIG option. In fact, I'd prefer to see
a compelling use case for long format so that it would be clear
that the right thing to do would be to just remove short format
and replace it with long format, but I don't expect that things
will ever be that simple :-(
+ help
+ The long format VHPT is an alternative hashed page table. Advantages
+ of the long format VHPT are lower memory usage when there are a large
+ number of processes in the system.
Is this really true? Don't you still have all of the 3-level (or 4-level)
tree allocated to keep the machine independent code in mm/memory.c
happy in addition to the big block of memory that you are using on
each cpu for the LVHPT? Where is the saving?
-Tony
^ permalink raw reply [flat|nested] 16+ messages in thread
* RE: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
2006-05-02 15:03 ` Luck, Tony
@ 2006-05-02 17:30 ` Chen, Kenneth W
2006-05-03 7:49 ` Ian Wienand
2006-05-02 21:33 ` Luck, Tony
` (3 subsequent siblings)
5 siblings, 1 reply; 16+ messages in thread
From: Chen, Kenneth W @ 2006-05-02 17:30 UTC (permalink / raw)
To: Luck, Tony, Ian Wienand, linux-ia64; +Cc: linux-mm
Luck, Tony wrote on Tuesday, May 02, 2006 8:03 AM
> Thanks for keeping this alive. Previous measurements on long
> format VHPT were mostly close to neutral performance-wise with
> short format ...
This is a fairly gentle comments :-) Digging up my result of performance
evaluation on database workload, the regression is quite big at 2.8%. I'm
not that happy at all :-(
> so this is still waiting for the killer-app in
> the form of another patch that actually uses features of the
> long format VHPT to do something that can't easily be done by
> the short format
Database workload can be the potential killer-app ....
> to give me an incentive to complicate the code
> by adding yet another CONFIG option. In fact, I'd prefer to see
> a compelling use case for long format so that it would be clear
> that the right thing to do would be to just remove short format
> and replace it with long format, but I don't expect that things
> will ever be that simple :-(
Boot time option to the rescue! I have a patch that does just like that.
Though first order of business is to make lvhpt to perform on large
workload. If I recall correctly, lvhpt introduces performance regression
on certain components of cpu2000.
- Ken
^ permalink raw reply [flat|nested] 16+ messages in thread
* RE: [RFC 1/3] LVHPT - Fault handler modifications
2006-05-02 5:25 ` [RFC 1/3] LVHPT - Fault handler modifications Ian Wienand
2006-05-02 8:04 ` Keith Owens
@ 2006-05-02 17:40 ` Chen, Kenneth W
2006-05-03 7:42 ` Ian Wienand
1 sibling, 1 reply; 16+ messages in thread
From: Chen, Kenneth W @ 2006-05-02 17:40 UTC (permalink / raw)
To: 'Ian Wienand', linux-ia64; +Cc: linux-mm
Ian Wienand wrote on Monday, May 01, 2006 10:26 PM
> Firstly, we have stripped out common code in ivt.S into assembler
> macros in ivt-macro.S. The comments before the macros should explain
> what each is doing.
I think at current state, it's way too early to extract out the common
code into macros, because for long format vhpt, the low level handler is
not even optimal. And in the final form, it may not be the same as the
short format vhpt (OK, the linux page table walk will be the same, but
other part maybe not).
I would like to experiment with a few algorithms for lvhpt and best thing
to do in my opinion is to have parallel ivt.S (or ivt table to be precise).
- Ken
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 15:03 ` Luck, Tony
@ 2006-05-02 21:29 ` Ian Wienand
0 siblings, 0 replies; 16+ messages in thread
From: Ian Wienand @ 2006-05-02 21:29 UTC (permalink / raw)
To: Luck, Tony; +Cc: linux-ia64, linux-mm
On Tue, May 02, 2006 at 08:03:16AM -0700, Luck, Tony wrote:
> + help
> + The long format VHPT is an alternative hashed page table. Advantages
> + of the long format VHPT are lower memory usage when there are a large
> + number of processes in the system.
>
> Is this really true? Don't you still have all of the 3-level (or 4-level)
> tree allocated to keep the machine independent code in mm/memory.c
> happy in addition to the big block of memory that you are using on
> each cpu for the LVHPT? Where is the saving?
Yes that does seem a bit miss-leading. I guess the point was that
with short format you dedicate the top areas of your region to page
tables for each process, with long format it is static.
-i
^ permalink raw reply [flat|nested] 16+ messages in thread
* RE: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
2006-05-02 15:03 ` Luck, Tony
2006-05-02 17:30 ` Chen, Kenneth W
@ 2006-05-02 21:33 ` Luck, Tony
2006-05-03 10:53 ` Ian Wienand
` (2 subsequent siblings)
5 siblings, 0 replies; 16+ messages in thread
From: Luck, Tony @ 2006-05-02 21:33 UTC (permalink / raw)
To: Ian Wienand; +Cc: linux-ia64, linux-mm
> Yes that does seem a bit miss-leading. I guess the point was that
> with short format you dedicate the top areas of your region to page
> tables for each process, with long format it is static.
So perhaps adding the word "virtual" (in between the "lower" and the
"memory") into the help description, and dropping the bit "when there
are a large number of processes in the system" would be clearer?
-Tony
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC 1/3] LVHPT - Fault handler modifications
2006-05-02 17:40 ` Chen, Kenneth W
@ 2006-05-03 7:42 ` Ian Wienand
0 siblings, 0 replies; 16+ messages in thread
From: Ian Wienand @ 2006-05-03 7:42 UTC (permalink / raw)
To: linux-ia64; +Cc: linux-mm
On Tue, May 02, 2006 at 10:40:37AM -0700, Chen, Kenneth W wrote:
> I would like to experiment with a few algorithms for lvhpt and best thing
> to do in my opinion is to have parallel ivt.S (or ivt table to be precise).
Ok, I sent an email this morning which I think got dropped for size
(what is the posting limit?). This is an update.
Below is another approach, dynamically creating ivt.S from a template
and input files with a script. Below is only a partial patch in the
hope of it getting through the mail server; the full series is at
http://www.gelato.unsw.edu.au/~ianw/lvhpt/patches/v2
The long format handlers are then implemented without macros, etc, as per
http://www.gelato.unsw.edu.au/~ianw/lvhpt/patches/v2/ivt-long-format.patch
Any feedback on this approach is most welcome.
-i
Signed-Off-By: Ian Wienand <ianw@gelato.unsw.edu.au>
---
kernel/Makefile | 5
kernel/ivt-sfvhpt.in | 444 +++++++++++++++++
kernel/ivt.S.in | 1324 +++++++++++++++++++++++++++++++++++++++++++++++++++
scripts/merge.py | 58 ++
4 files changed, 1831 insertions(+)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.17-rc3-lvhpt-v2-fresh/arch/ia64/scripts/merge.py 2006-05-03 14:24:54.000000000 +1000
@@ -0,0 +1,58 @@
+#!/usr/bin/env python2.4
+# nb. requires python2.4 for string templating
+#
+# Usage: merge.py output template code
+#
+# merge the code in 'code' into 'template', writing out to 'output'
+#
+# Anything between lines __begin_name__ and __end_name__ in 'code'
+# gets inserted into $name in 'template'.
+#
+# Ian Wienand <ianw@gelato.unsw.edu.au>
+#
+import os
+import sys
+from string import Template
+
+if len(sys.argv) != 4:
+ print "Usage: %s output template code" % sys.argv[0]
+ sys.exit(2)
+
+print "Merging %s and template %s to %s" % (sys.argv[2], sys.argv[3], sys.argv[1])
+
+# bring the code file into a dictionary
+# anything between lines __begin_name__ and __end_name__ goes into
+# a dictionary entry of name
+template_dictionary = {}
+am_processing = False
+current_template = ""
+current_template_name = ""
+for line in open(sys.argv[3], 'r').readlines():
+
+ if am_processing:
+ # if this line is the end, stop
+ # XXX check this end is actually the name we are processing
+ if line[:6] = "__end_":
+ template_dictionary[current_template_name] = current_template
+ am_processing = False
+ print "... done"
+ continue
+ # otherwise, add this line to the current template
+ current_template += line
+ continue
+ # if we got here, we are not processing
+ if line[:8] = "__begin_":
+ am_processing = True
+ current_template_name = line[8:-3] #newline
+ print "Processing %s" % (current_template_name),
+ current_template = ""
+ continue
+ # this is some random line
+ continue
+
+#now open the file where we put these templates
+template = Template(open(sys.argv[2],'r').read())
+
+#finally, substitute them all in
+output = open(sys.argv[1],'w')
+output.write(template.substitute(template_dictionary))
Index: linux-2.6.17-rc3-lvhpt-v2-fresh/arch/ia64/kernel/ivt.S.in
=================================--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.17-rc3-lvhpt-v2-fresh/arch/ia64/kernel/ivt.S.in 2006-05-03 15:52:55.000000000 +1000
@@ -0,0 +1,1324 @@
+/*
+ * arch/ia64/kernel/ivt.S
+ *
+ * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger <davidm@hpl.hp.com>
+ * Copyright (C) 2000, 2002-2003 Intel Co
+ * Asit Mallick <asit.k.mallick@intel.com>
+ * Suresh Siddha <suresh.b.siddha@intel.com>
+ * Kenneth Chen <kenneth.w.chen@intel.com>
+ * Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP
+ * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT.
+ */
+/*
+ * This file defines the interruption vector table used by the CPU.
+ * It does not include one entry per possible cause of interruption.
+ *
+ * The first 20 entries of the table contain 64 bundles each while the
+ * remaining 48 entries contain only 16 bundles each.
+ *
+ * The 64 bundles are used to allow inlining the whole handler for critical
+ * interruptions like TLB misses.
+ *
+ * For each entry, the comment is as follows:
+ *
+ * // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
+ * entry offset ----/ / / / /
+ * entry number ---------/ / / /
+ * size of the entry -------------/ / /
+ * vector name -------------------------------------/ /
+ * interruptions triggering this vector ----------------------/
+ *
+ * The table is 32KB in size and must be aligned on 32KB boundary.
+ * (The CPU ignores the 15 lower bits of the address)
+ *
+ * Table is based upon EAS2.6 (Oct 1999)
+ */
+
+#include <linux/config.h>
+
+#include <asm/asmmacro.h>
+#include <asm/break.h>
+#include <asm/ia32.h>
+#include <asm/kregs.h>
+#include <asm/asm-offsets.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/system.h>
+#include <asm/thread_info.h>
+#include <asm/unistd.h>
+#include <asm/errno.h>
+
+#if 1
+# define PSR_DEFAULT_BITS psr.ac
+#else
+# define PSR_DEFAULT_BITS 0
+#endif
+
+#if 0
+ /*
+ * This lets you track the last eight faults that occurred on the CPU. Make sure ar.k2 isn't
+ * needed for something else before enabling this...
+ */
+# define DBG_FAULT(i) mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16
+#else
+# define DBG_FAULT(i)
+#endif
+
+#include "minstate.h"
+
+#define FAULT(n) \
+ mov r31=pr; \
+ mov r19=n;; /* prepare to save predicates */ \
+ br.sptk.many dispatch_to_fault_handler
+
+ .section .text.ivt,"ax"
+
+ .align 32768 // align on 32KB boundary
+ .global ia64_ivt
+ia64_ivt:
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
+ENTRY(vhpt_miss)
+ DBG_FAULT(0)
+ /*
+ * The VHPT vector is invoked when the TLB entry for the virtual page table
+ * is missing. This happens only as a result of a previous
+ * (the "original") TLB miss, which may either be caused by an instruction
+ * fetch or a data access (or non-access).
+ *
+ * What we do here is normal TLB miss handing for the _original_ miss,
+ * followed by inserting the TLB entry for the virtual page table page
+ * that the VHPT walker was attempting to access. The latter gets
+ * inserted as long as page table entry above pte level have valid
+ * mappings for the faulting address. The TLB entry for the original
+ * miss gets inserted only if the pte entry indicates that the page is
+ * present.
+ *
+ * do_page_fault gets invoked in the following cases:
+ * - the faulting virtual address uses unimplemented address bits
+ * - the faulting virtual address has no valid page table mapping
+ */
+$vhpt_miss_handler
+END(vhpt_miss)
+
+ .org ia64_ivt+0x400
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
+ENTRY(itlb_miss)
+ DBG_FAULT(1)
+ /*
+ * The ITLB handler accesses the PTE via the virtually mapped linear
+ * page table. If a nested TLB miss occurs, we switch into physical
+ * mode, walk the page table, and then re-execute the PTE read and
+ * go on normally after that.
+ */
+$itlb_miss_handler
+END(itlb_miss)
+
+ .org ia64_ivt+0x0800
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
+ENTRY(dtlb_miss)
+ DBG_FAULT(2)
+ /*
+ * The DTLB handler accesses the PTE via the virtually mapped linear
+ * page table. If a nested TLB miss occurs, we switch into physical
+ * mode, walk the page table, and then re-execute the PTE read and
+ * go on normally after that.
+ */
+$dtlb_miss_handler
+END(dtlb_miss)
+
+ .org ia64_ivt+0x0c00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
+ENTRY(alt_itlb_miss)
+ DBG_FAULT(3)
+ mov r16=cr.ifa // get address that caused the TLB miss
+ movl r17=PAGE_KERNEL
+ mov r21=cr.ipsr
+ movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
+ mov r31=pr
+ ;;
+#ifdef CONFIG_DISABLE_VHPT
+ shr.u r22=r16,61 // get the region number into r21
+ ;;
+ cmp.gt p8,p0=6,r22 // user mode
+ ;;
+(p8) thash r17=r16
+ ;;
+(p8) mov cr.iha=r17
+(p8) mov r29° // save b0
+(p8) br.cond.dptk .itlb_fault
+#endif
+ extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
+ and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
+ shr.u r18=r16,57 // move address bit 61 to bit 4
+ ;;
+ andcm r18=0x10,r18 // bit 4=~address-bit(61)
+ cmp.ne p8,p0=r0,r23 // psr.cpl != 0?
+ or r19=r17,r19 // insert PTE control bits into r19
+ ;;
+ or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
+(p8) br.cond.spnt page_fault
+ ;;
+ itc.i r19 // insert the TLB entry
+ mov pr=r31,-1
+ rfi
+END(alt_itlb_miss)
+
+ .org ia64_ivt+0x1000
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
+ENTRY(alt_dtlb_miss)
+ DBG_FAULT(4)
+ mov r16=cr.ifa // get address that caused the TLB miss
+ movl r17=PAGE_KERNEL
+ mov r20=cr.isr
+ movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
+ mov r21=cr.ipsr
+ mov r31=pr
+ ;;
+#ifdef CONFIG_DISABLE_VHPT
+ shr.u r22=r16,61 // get the region number into r21
+ ;;
+ cmp.gt p8,p0=6,r22 // access to region 0-5
+ ;;
+(p8) thash r17=r16
+ ;;
+(p8) mov cr.iha=r17
+(p8) mov r29° // save b0
+(p8) br.cond.dptk dtlb_fault
+#endif
+ extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
+ and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field
+ tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on?
+ shr.u r18=r16,57 // move address bit 61 to bit 4
+ and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
+ tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on?
+ ;;
+ andcm r18=0x10,r18 // bit 4=~address-bit(61)
+ cmp.ne p8,p0=r0,r23
+(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field
+(p8) br.cond.spnt page_fault
+
+ dep r21=-1,r21,IA64_PSR_ED_BIT,1
+ or r19=r19,r17 // insert PTE control bits into r19
+ ;;
+ or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
+(p6) mov cr.ipsr=r21
+ ;;
+(p7) itc.d r19 // insert the TLB entry
+ mov pr=r31,-1
+ rfi
+END(alt_dtlb_miss)
+
+ .org ia64_ivt+0x1400
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
+ENTRY(nested_dtlb_miss)
+ /*
+ * In the absence of kernel bugs, we get here when the virtually mapped linear
+ * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
+ * Access-bit, or Data Access-bit faults). If the DTLB entry for the virtual page
+ * table is missing, a nested TLB miss fault is triggered and control is
+ * transferred to this point. When this happens, we lookup the pte for the
+ * faulting address by walking the page table in physical mode and return to the
+ * continuation point passed in register r30 (or call page_fault if the address is
+ * not mapped).
+ *
+ * Input: r16: faulting address
+ * r29: saved b0
+ * r30: continuation address
+ * r31: saved pr
+ *
+ * Output: r17: physical address of PTE of faulting address
+ * r29: saved b0
+ * r30: continuation address
+ * r31: saved pr
+ *
+ * Clobbered: b0, r18, r19, r21, r22, psr.dt (cleared)
+ */
+$nested_dtlb_miss_handler
+END(nested_dtlb_miss)
+
+ .org ia64_ivt+0x1800
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
+ENTRY(ikey_miss)
+ DBG_FAULT(6)
+ FAULT(6)
+END(ikey_miss)
+
+ //-----------------------------------------------------------------------------------
+ // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
+ENTRY(page_fault)
+ ssm psr.dt
+ ;;
+ srlz.i
+ ;;
+ SAVE_MIN_WITH_COVER
+ alloc r15=ar.pfs,0,0,3,0
+ mov out0=cr.ifa
+ mov out1=cr.isr
+ adds r3=8,r2 // set up second base pointer
+ ;;
+ ssm psr.ic | PSR_DEFAULT_BITS
+ ;;
+ srlz.i // guarantee that interruption collectin is on
+ ;;
+(p15) ssm psr.i // restore psr.i
+ movl r14=ia64_leave_kernel
+ ;;
+ SAVE_REST
+ mov rp=r14
+ ;;
+ adds out2\x16,r12 // out2 = pointer to pt_regs
+ br.call.sptk.many b6=ia64_do_page_fault // ignore return address
+END(page_fault)
+
+ .org ia64_ivt+0x1c00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
+ENTRY(dkey_miss)
+ DBG_FAULT(7)
+ FAULT(7)
+END(dkey_miss)
+
+ .org ia64_ivt+0x2000
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
+ENTRY(dirty_bit)
+ DBG_FAULT(8)
+$dirty_bit_handler
+END(dirty_bit)
+
+ .org ia64_ivt+0x2400
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
+ENTRY(iaccess_bit)
+ DBG_FAULT(9)
+$iaccess_bit_handler
+END(iaccess_bit)
+
+ .org ia64_ivt+0x2800
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
+ENTRY(daccess_bit)
+ DBG_FAULT(10)
+$daccess_bit_handler
+END(daccess_bit)
+
+ .org ia64_ivt+0x2c00
--- STRIPPED : rest of the file remains the same ----
Index: linux-2.6.17-rc3-lvhpt-v2-fresh/arch/ia64/kernel/ivt.S
=================================--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.17-rc3-lvhpt-v2-fresh/arch/ia64/kernel/ivt-sfvhpt.in 2006-05-03 15:53:11.000000000 +1000
@@ -0,0 +1,444 @@
+/*
+ * This file is to be processed and inserted into the actual ivt.S
+ *
+ * Any variable $name in ivt.S will be replaced with what is between
+ *__begin_name__ and __end_name__ in this file.
+ *
+ */
+
+//vhpt_miss
+__begin_vhpt_miss_handler__
+ mov r16=cr.ifa // get address that caused the TLB miss
+#ifdef CONFIG_HUGETLB_PAGE
+ movl r18=PAGE_SHIFT
+ mov r25=cr.itir
+#endif
+ ;;
+ rsm psr.dt // use physical addressing for data
+ mov r31=pr // save the predicate registers
+ mov r19=IA64_KR(PT_BASE) // get page table base address
+ shl r21=r16,3 // shift bit 60 into sign bit
+ shr.u r17=r16,61 // get the region number into r17
+ ;;
+ shr.u r22=r21,3
+#ifdef CONFIG_HUGETLB_PAGE
+ extr.u r26=r25,2,6
+ ;;
+ cmp.ne p8,p0=r18,r26
+ sub r27=r26,r18
+ ;;
+(p8) dep r25=r18,r25,2,6
+(p8) shr r22=r22,r27
+#endif
+ ;;
+ cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5?
+ shr.u r18=r22,PGDIR_SHIFT // get bottom portion of pgd index bit
+ ;;
+(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
+
+ srlz.d
+ LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
+
+ .pred.rel "mutex", p6, p7
+(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
+(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
+ ;;
+(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
+(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
+ cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
+#ifdef CONFIG_PGTABLE_4
+ shr.u r28=r22,PUD_SHIFT // shift pud index into position
+#else
+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
+#endif
+ ;;
+ ld8 r17=[r17] // get *pgd (may be 0)
+ ;;
+(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) = NULL?
+#ifdef CONFIG_PGTABLE_4
+ dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
+ ;;
+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
+(p7) ld8 r29=[r28] // get *pud (may be 0)
+ ;;
+(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was pud_present(*pud) = NULL?
+ dep r17=r18,r29,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
+#else
+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr)
+#endif
+ ;;
+(p7) ld8 r20=[r17] // get *pmd (may be 0)
+ shr.u r19=r22,PAGE_SHIFT // shift pte index into position
+ ;;
+(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was pmd_present(*pmd) = NULL?
+ dep r21=r19,r20,3,(PAGE_SHIFT-3) // r21=pte_offset(pmd,addr)
+ ;;
+(p7) ld8 r18=[r21] // read *pte
+ mov r19=cr.isr // cr.isr bit 32 tells us if this is an insn miss
+ ;;
+(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared?
+ mov r22=cr.iha // get the VHPT address that caused the TLB miss
+ ;; // avoid RAW on p7
+(p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss?
+ dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address
+ ;;
+(p10) itc.i r18 // insert the instruction TLB entry
+(p11) itc.d r18 // insert the data TLB entry
+(p6) br.cond.spnt.many page_fault // handle bad address/page not present (page fault)
+ mov cr.ifa=r22
+
+#ifdef CONFIG_HUGETLB_PAGE
+(p8) mov cr.itir=r25 // change to default page-size for VHPT
+#endif
+
+ /*
+ * Now compute and insert the TLB entry for the virtual page table. We never
+ * execute in a page table page so there is no need to set the exception deferral
+ * bit.
+ */
+ adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
+ ;;
+(p7) itc.d r24
+ ;;
+#ifdef CONFIG_SMP
+ /*
+ * Tell the assemblers dependency-violation checker that the above "itc" instructions
+ * cannot possibly affect the following loads:
+ */
+ dv_serialize_data
+
+ /*
+ * Re-check pagetable entry. If they changed, we may have received a ptc.g
+ * between reading the pagetable and the "itc". If so, flush the entry we
+ * inserted and retry. At this point, we have:
+ *
+ * r28 = equivalent of pud_offset(pgd, ifa)
+ * r17 = equivalent of pmd_offset(pud, ifa)
+ * r21 = equivalent of pte_offset(pmd, ifa)
+ *
+ * r29 = *pud
+ * r20 = *pmd
+ * r18 = *pte
+ */
+ ld8 r25=[r21] // read *pte again
+ ld8 r26=[r17] // read *pmd again
+#ifdef CONFIG_PGTABLE_4
+ ld8 r19=[r28] // read *pud again
+#endif
+ cmp.ne p6,p7=r0,r0
+ ;;
+ cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
+#ifdef CONFIG_PGTABLE_4
+ cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change
+#endif
+ mov r27=PAGE_SHIFT<<2
+ ;;
+(p6) ptc.l r22,r27 // purge PTE page translation
+(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did *pte change
+ ;;
+(p6) ptc.l r16,r27 // purge translation
+#endif
+
+ mov pr=r31,-1 // restore predicate registers
+ rfi
+__end_vhpt_miss_handler__
+
+// itlb_miss
+__begin_itlb_miss_handler__
+ mov r16=cr.ifa // get virtual address
+ mov r29° // save b0
+ mov r31=pr // save predicates
+.itlb_fault:
+ mov r17=cr.iha // get virtual address of PTE
+ movl r30\x1f // load nested fault continuation point
+ ;;
+1: ld8 r18=[r17] // read *pte
+ ;;
+ mov b0=r29
+ tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
+(p6) br.cond.spnt page_fault
+ ;;
+ itc.i r18
+ ;;
+#ifdef CONFIG_SMP
+ /*
+ * Tell the assemblers dependency-violation checker that the above "itc" instructions
+ * cannot possibly affect the following loads:
+ */
+ dv_serialize_data
+
+ ld8 r19=[r17] // read *pte again and see if same
+ mov r20=PAGE_SHIFT<<2 // setup page size for purge
+ ;;
+ cmp.ne p7,p0=r18,r19
+ ;;
+(p7) ptc.l r16,r20
+#endif
+ mov pr=r31,-1
+ rfi
+__end_itlb_miss_handler__
+
+// dtlb_miss
+__begin_dtlb_miss_handler__
+ mov r16=cr.ifa // get virtual address
+ mov r29° // save b0
+ mov r31=pr // save predicates
+dtlb_fault:
+ mov r17=cr.iha // get virtual address of PTE
+ movl r30\x1f // load nested fault continuation point
+ ;;
+1: ld8 r18=[r17] // read *pte
+ ;;
+ mov b0=r29
+ tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
+(p6) br.cond.spnt page_fault
+ ;;
+ itc.d r18
+ ;;
+#ifdef CONFIG_SMP
+ /*
+ * Tell the assemblers dependency-violation checker that the above "itc" instructions
+ * cannot possibly affect the following loads:
+ */
+ dv_serialize_data
+
+ ld8 r19=[r17] // read *pte again and see if same
+ mov r20=PAGE_SHIFT<<2 // setup page size for purge
+ ;;
+ cmp.ne p7,p0=r18,r19
+ ;;
+(p7) ptc.l r16,r20
+#endif
+ mov pr=r31,-1
+ rfi
+__end_dtlb_miss_handler__
+
+// nested_dtlb_miss
+__begin_nested_dtlb_miss_handler__
+ rsm psr.dt // switch to using physical data addressing
+ mov r19=IA64_KR(PT_BASE) // get the page table base address
+ shl r21=r16,3 // shift bit 60 into sign bit
+ mov r18=cr.itir
+ ;;
+ shr.u r17=r16,61 // get the region number into r17
+ extr.u r18=r18,2,6 // get the faulting page size
+ ;;
+ cmp.eq p6,p7=5,r17 // is faulting address in region 5?
+ add r22=-PAGE_SHIFT,r18 // adjustment for hugetlb address
+ add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
+ ;;
+ shr.u r22=r16,r22
+ shr.u r18=r16,r18
+(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
+
+ srlz.d
+ LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
+
+ .pred.rel "mutex", p6, p7
+(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
+(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
+ ;;
+(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
+(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
+ cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
+#ifdef CONFIG_PGTABLE_4
+ shr.u r18=r22,PUD_SHIFT // shift pud index into position
+#else
+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
+#endif
+ ;;
+ ld8 r17=[r17] // get *pgd (may be 0)
+ ;;
+(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) = NULL?
+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr)
+ ;;
+#ifdef CONFIG_PGTABLE_4
+(p7) ld8 r17=[r17] // get *pud (may be 0)
+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
+ ;;
+(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pud_present(*pud) = NULL?
+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
+ ;;
+#endif
+(p7) ld8 r17=[r17] // get *pmd (may be 0)
+ shr.u r19=r22,PAGE_SHIFT // shift pte index into position
+ ;;
+(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pmd_present(*pmd) = NULL?
+ dep r17=r19,r17,3,(PAGE_SHIFT-3) // r17=pte_offset(pmd,addr);
+(p6) br.cond.spnt page_fault
+ mov b0=r30
+ br.sptk.many b0 // return to continuation point
+__end_nested_dtlb_miss_handler__
+
+// dirty bit
+__begin_dirty_bit_handler__
+ /*
+ * What we do here is to simply turn on the dirty bit in the PTE. We need to
+ * update both the page-table and the TLB entry. To efficiently access the PTE,
+ * we address it through the virtual page table. Most likely, the TLB entry for
+ * the relevant virtual page table page is still present in the TLB so we can
+ * normally do this without additional TLB misses. In case the necessary virtual
+ * page table TLB entry isn't present, we take a nested TLB miss hit where we look
+ * up the physical address of the L3 PTE and then continue at label 1 below.
+ */
+ mov r16=cr.ifa // get the address that caused the fault
+ movl r30\x1f // load continuation point in case of nested fault
+ ;;
+ thash r17=r16 // compute virtual address of L3 PTE
+ mov r29° // save b0 in case of nested fault
+ mov r31=pr // save pr
+#ifdef CONFIG_SMP
+ mov r28=ar.ccv // save ar.ccv
+ ;;
+1: ld8 r18=[r17]
+ ;; // avoid RAW on r18
+ mov ar.ccv=r18 // set compare value for cmpxchg
+ or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
+ tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
+ ;;
+(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only update if page is present
+ mov r24=PAGE_SHIFT<<2
+ ;;
+(p6) cmp.eq p6,p7=r26,r18 // Only compare if page is present
+ ;;
+(p6) itc.d r25 // install updated PTE
+ ;;
+ /*
+ * Tell the assemblers dependency-violation checker that the above "itc" instructions
+ * cannot possibly affect the following loads:
+ */
+ dv_serialize_data
+
+ ld8 r18=[r17] // read PTE again
+ ;;
+ cmp.eq p6,p7=r18,r25 // is it same as the newly installed
+ ;;
+(p7) ptc.l r16,r24
+ mov b0=r29 // restore b0
+ mov ar.ccv=r28
+#else
+ ;;
+1: ld8 r18=[r17]
+ ;; // avoid RAW on r18
+ or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
+ mov b0=r29 // restore b0
+ ;;
+ st8 [r17]=r18 // store back updated PTE
+ itc.d r18 // install updated PTE
+#endif
+ mov pr=r31,-1 // restore pr
+ rfi
+__end_dirty_bit_handler__
+
+// iaccess bit
+__begin_iaccess_bit_handler__
+ // Like dirty bit handler, except for instruction access
+ mov r16=cr.ifa // get the address that caused the fault
+ movl r30\x1f // load continuation point in case of nested fault
+ mov r31=pr // save predicates
+#ifdef CONFIG_ITANIUM
+ /*
+ * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
+ */
+ mov r17=cr.ipsr
+ ;;
+ mov r18=cr.iip
+ tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set?
+ ;;
+(p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa
+#endif /* CONFIG_ITANIUM */
+ ;;
+ thash r17=r16 // compute virtual address of L3 PTE
+ mov r29° // save b0 in case of nested fault)
+#ifdef CONFIG_SMP
+ mov r28=ar.ccv // save ar.ccv
+ ;;
+1: ld8 r18=[r17]
+ ;;
+ mov ar.ccv=r18 // set compare value for cmpxchg
+ or r25=_PAGE_A,r18 // set the accessed bit
+ tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
+ ;;
+(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page present
+ mov r24=PAGE_SHIFT<<2
+ ;;
+(p6) cmp.eq p6,p7=r26,r18 // Only if page present
+ ;;
+(p6) itc.i r25 // install updated PTE
+ ;;
+ /*
+ * Tell the assemblers dependency-violation checker that the above "itc" instructions
+ * cannot possibly affect the following loads:
+ */
+ dv_serialize_data
+
+ ld8 r18=[r17] // read PTE again
+ ;;
+ cmp.eq p6,p7=r18,r25 // is it same as the newly installed
+ ;;
+(p7) ptc.l r16,r24
+ mov b0=r29 // restore b0
+ mov ar.ccv=r28
+#else /* !CONFIG_SMP */
+ ;;
+1: ld8 r18=[r17]
+ ;;
+ or r18=_PAGE_A,r18 // set the accessed bit
+ mov b0=r29 // restore b0
+ ;;
+ st8 [r17]=r18 // store back updated PTE
+ itc.i r18 // install updated PTE
+#endif /* !CONFIG_SMP */
+ mov pr=r31,-1
+ rfi
+__end_iaccess_bit_handler__
+
+// daccess bit
+__begin_daccess_bit_handler__
+ // Like dirty bit handler, except for data access
+ mov r16=cr.ifa // get the address that caused the fault
+ movl r30\x1f // load continuation point in case of nested fault
+ ;;
+ thash r17=r16 // compute virtual address of L3 PTE
+ mov r31=pr
+ mov r29° // save b0 in case of nested fault)
+#ifdef CONFIG_SMP
+ mov r28=ar.ccv // save ar.ccv
+ ;;
+1: ld8 r18=[r17]
+ ;; // avoid RAW on r18
+ mov ar.ccv=r18 // set compare value for cmpxchg
+ or r25=_PAGE_A,r18 // set the dirty bit
+ tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
+ ;;
+(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page is present
+ mov r24=PAGE_SHIFT<<2
+ ;;
+(p6) cmp.eq p6,p7=r26,r18 // Only if page is present
+ ;;
+(p6) itc.d r25 // install updated PTE
+ /*
+ * Tell the assemblers dependency-violation checker that the above "itc" instructions
+ * cannot possibly affect the following loads:
+ */
+ dv_serialize_data
+ ;;
+ ld8 r18=[r17] // read PTE again
+ ;;
+ cmp.eq p6,p7=r18,r25 // is it same as the newly installed
+ ;;
+(p7) ptc.l r16,r24
+ mov ar.ccv=r28
+#else
+ ;;
+1: ld8 r18=[r17]
+ ;; // avoid RAW on r18
+ or r18=_PAGE_A,r18 // set the accessed bit
+ ;;
+ st8 [r17]=r18 // store back updated PTE
+ itc.d r18 // install updated PTE
+#endif
+ mov b0=r29 // restore b0
+ mov pr=r31,-1
+ rfi
+__end_daccess_bit_handler__
Index: linux-2.6.17-rc3-lvhpt-v2-fresh/arch/ia64/kernel/Makefile
=================================--- linux-2.6.17-rc3-lvhpt-v2-fresh.orig/arch/ia64/kernel/Makefile 2006-05-03 14:24:50.000000000 +1000
+++ linux-2.6.17-rc3-lvhpt-v2-fresh/arch/ia64/kernel/Makefile 2006-05-03 15:52:39.000000000 +1000
@@ -59,3 +59,8 @@
# We must build gate.so before we can assemble it.
# Note: kbuild does not track this dependency due to usage of .incbin
$(obj)/gate-data.o: $(obj)/gate.so
+
+# The real ivt.S needs to be built
+AFLAGS_ivt.o += -I$(srctree)/arch/ia64/kernel
+$(obj)/ivt.S: $(src)/ivt.S.in $(src)/ivt-sfvhpt.in
+ $(srctree)/arch/ia64/scripts/merge.py $@ $^
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 17:30 ` Chen, Kenneth W
@ 2006-05-03 7:49 ` Ian Wienand
2006-05-03 8:07 ` Christian Hildner
0 siblings, 1 reply; 16+ messages in thread
From: Ian Wienand @ 2006-05-03 7:49 UTC (permalink / raw)
To: Chen, Kenneth W; +Cc: Luck, Tony, linux-ia64, linux-mm
On Tue, May 02, 2006 at 10:30:07AM -0700, Chen, Kenneth W wrote:
> Boot time option to the rescue! I have a patch that does just like that.
Being relatively inexperienced, all this dynamic patching (SMP, page
table, this) scares me in that what is executing diverges from what
appears to be in source code, making difficult things even more
difficult to debug. Is there consensus that a long term goal should
be that short and long formats should be dynamically selectable?
-i
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-03 7:49 ` Ian Wienand
@ 2006-05-03 8:07 ` Christian Hildner
0 siblings, 0 replies; 16+ messages in thread
From: Christian Hildner @ 2006-05-03 8:07 UTC (permalink / raw)
To: Ian Wienand; +Cc: Chen, Kenneth W, Luck, Tony, linux-ia64, linux-mm
Ian Wienand schrieb:
>Being relatively inexperienced, all this dynamic patching (SMP, page
>table, this) scares me in that what is executing diverges from what
>appears to be in source code, making difficult things even more
>difficult to debug. Is there consensus that a long term goal should
>be that short and long formats should be dynamically selectable?
>
Yes. So why not picking up Ken's idea of two parallel IVTs. Best
practice and probably the most readable solution might be the usage of
common macros for all the common entires (like EXTERNAL_INTERRUPT_CODE),
so that only the VHPT-specific entries would be coded directly in the
corrensponding ivt.S. Straightforward and without patching, code
generation, ...
Christian
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
` (2 preceding siblings ...)
2006-05-02 21:33 ` Luck, Tony
@ 2006-05-03 10:53 ` Ian Wienand
2006-05-03 11:15 ` Christian Hildner
2006-05-04 16:58 ` Luck, Tony
5 siblings, 0 replies; 16+ messages in thread
From: Ian Wienand @ 2006-05-03 10:53 UTC (permalink / raw)
To: linux-ia64
[-- Attachment #1: Type: text/plain, Size: 491 bytes --]
On Wed, May 03, 2006 at 10:07:35AM +0200, Christian Hildner wrote:
> Yes. So why not picking up Ken's idea of two parallel IVTs.
I have a version of this, which I've tried to post here but I think
the size limit here is very low. It made it through to linux-mm
http://marc.theaimsgroup.com/?l=linux-mm&m=114664224732618&w=2
or the patches are at
http://www.gelato.unsw.edu.au/~ianw/lvhpt/patches/v2
It generates ivt.S using a little script. Any feedback on this
approach welcome.
-i
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 191 bytes --]
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
` (3 preceding siblings ...)
2006-05-03 10:53 ` Ian Wienand
@ 2006-05-03 11:15 ` Christian Hildner
2006-05-04 16:58 ` Luck, Tony
5 siblings, 0 replies; 16+ messages in thread
From: Christian Hildner @ 2006-05-03 11:15 UTC (permalink / raw)
To: linux-ia64
Ian Wienand schrieb:
>On Wed, May 03, 2006 at 10:07:35AM +0200, Christian Hildner wrote:
>
>
>>Yes. So why not picking up Ken's idea of two parallel IVTs.
>>
>>
>
>I have a version of this, which I've tried to post here but I think
>the size limit here is very low. It made it through to linux-mm
>
>http://marc.theaimsgroup.com/?l=linux-mm&m\x114664224732618&w=2
>
>or the patches are at
>
>http://www.gelato.unsw.edu.au/~ianw/lvhpt/patches/v2
>
>It generates ivt.S using a little script. Any feedback on this
>approach welcome.
>
Ian,
I suggested a solution without having any script at all instead working
with something like ivt.h containing macros for all common code and
ivt_svhpt.S and ivt_lvhpt.S in parallel for specific code. It just might
be more readable than your approach and both ways would be treated equal
by the make process. Technically they are the same.
Christian
^ permalink raw reply [flat|nested] 16+ messages in thread
* RE: [RFC 2/3] LVHPT - Setup LVHPT
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
` (4 preceding siblings ...)
2006-05-03 11:15 ` Christian Hildner
@ 2006-05-04 16:58 ` Luck, Tony
5 siblings, 0 replies; 16+ messages in thread
From: Luck, Tony @ 2006-05-04 16:58 UTC (permalink / raw)
To: Ian Wienand, Chen, Kenneth W; +Cc: linux-ia64, linux-mm
> Being relatively inexperienced, all this dynamic patching (SMP, page
> table, this) scares me in that what is executing diverges from what
> appears to be in source code, making difficult things even more
> difficult to debug. Is there consensus that a long term goal should
> be that short and long formats should be dynamically selectable?
I wouldn't rule anything out until I see what can be done, and how
maintainable the code to do it is. Perhaps someone will come up with
the ultimate in dynamic selection and use long format for some processes,
and short format for others (and thus get around the objections that
some workloads perform less well with long format).
-Tony
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2006-05-04 16:58 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-05-02 5:25 [RFC 0/3] IA64 Long Format VHPT support Ian Wienand
2006-05-02 5:25 ` [RFC 1/3] LVHPT - Fault handler modifications Ian Wienand
2006-05-02 8:04 ` Keith Owens
2006-05-02 17:40 ` Chen, Kenneth W
2006-05-03 7:42 ` Ian Wienand
2006-05-02 5:25 ` [RFC 2/3] LVHPT - Setup LVHPT Ian Wienand
2006-05-02 15:03 ` Luck, Tony
2006-05-02 21:29 ` Ian Wienand
2006-05-02 17:30 ` Chen, Kenneth W
2006-05-03 7:49 ` Ian Wienand
2006-05-03 8:07 ` Christian Hildner
2006-05-02 21:33 ` Luck, Tony
2006-05-03 10:53 ` Ian Wienand
2006-05-03 11:15 ` Christian Hildner
2006-05-04 16:58 ` Luck, Tony
2006-05-02 5:26 ` [RFC 3/3] LVHPT - LVHPT MM support functions Ian Wienand
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox