* [Patch 1/1] 4-level page tables v4.
@ 2005-11-10 16:19 Robin Holt
2005-11-10 21:49 ` Luck, Tony
` (10 more replies)
0 siblings, 11 replies; 12+ messages in thread
From: Robin Holt @ 2005-11-10 16:19 UTC (permalink / raw)
To: linux-ia64
This patch introduces 4-level page tables to ia64. I have run
some benchmarks and found nothing interesting. Performance has
consistently fallen within the noise range.
It also introduces a config option (setting the default to 3
levels). The config option prevents having 4 level page
tables with 64k base page size.
Signed-off-by: Robin Holt <holt@sgi.com>
Index: linux-2.6/include/asm-ia64/pgtable.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/pgtable.h 2005-11-10 06:49:03.398374164 -0600
+++ linux-2.6/include/asm-ia64/pgtable.h 2005-11-10 06:50:43.490981172 -0600
@@ -84,32 +84,55 @@
#define __DIRTY_BITS _PAGE_ED | __DIRTY_BITS_NO_ED
/*
- * Definitions for first level:
- *
- * PGDIR_SHIFT determines what a first-level page table entry can map.
+ * How many pointers will a page table level hold expressed in shift
*/
-#define PGDIR_SHIFT (PAGE_SHIFT + 2*(PAGE_SHIFT-3))
-#define PGDIR_SIZE (__IA64_UL(1) << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3))
-#define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */
-#define FIRST_USER_ADDRESS 0
+#define PTRS_PER_PTD_SHIFT (PAGE_SHIFT-3)
/*
- * Definitions for second level:
+ * Definitions for fourth level:
+ */
+#define PTRS_PER_PTE (__IA64_UL(1) << (PTRS_PER_PTD_SHIFT))
+
+/*
+ * Definitions for third level:
*
- * PMD_SHIFT determines the size of the area a second-level page table
+ * PMD_SHIFT determines the size of the area a third-level page table
* can map.
*/
-#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3))
+#define PMD_SHIFT (PAGE_SHIFT + (PTRS_PER_PTD_SHIFT))
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
-#define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3))
+#define PTRS_PER_PMD (1UL << (PTRS_PER_PTD_SHIFT))
+#ifdef CONFIG_PGTABLE_4
/*
- * Definitions for third level:
+ * Definitions for second level:
+ *
+ * PUD_SHIFT determines the size of the area a second-level page table
+ * can map.
*/
-#define PTRS_PER_PTE (__IA64_UL(1) << (PAGE_SHIFT-3))
+#define PUD_SHIFT (PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
+#define PUD_SIZE (1UL << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE-1))
+#define PTRS_PER_PUD (1UL << (PTRS_PER_PTD_SHIFT))
+#endif
+
+/*
+ * Definitions for first level:
+ *
+ * PGDIR_SHIFT determines what a first-level page table entry can map.
+ */
+#ifdef CONFIG_PGTABLE_4
+#define PGDIR_SHIFT (PUD_SHIFT + (PTRS_PER_PTD_SHIFT))
+#else
+#define PGDIR_SHIFT (PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
+#endif
+#define PGDIR_SIZE (__IA64_UL(1) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE-1))
+#define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT
+#define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT)
+#define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */
+#define FIRST_USER_ADDRESS 0
/*
* All the normal masks have the "page accessed" bits on, as any time
@@ -161,6 +184,9 @@
#define __S111 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX)
#define pgd_ERROR(e) printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
+#ifdef CONFIG_PGTABLE_4
+#define pud_ERROR(e) printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
+#endif
#define pmd_ERROR(e) printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
#define pte_ERROR(e) printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
@@ -218,6 +244,9 @@ ia64_phys_addr_valid (unsigned long addr
#define kc_vaddr_to_offset(v) ((v) - RGN_BASE(RGN_GATE))
#define kc_offset_to_vaddr(o) ((o) + RGN_BASE(RGN_GATE))
+#define RGN_MAP_SHIFT (PGDIR_SHIFT + PTRS_PER_PGD_SHIFT - 3)
+#define RGN_MAP_LIMIT ((1UL << RGN_MAP_SHIFT) - PAGE_SIZE) /* per region addr limit */
+
/*
* Conversion functions: convert page frame number (pfn) and a protection value to a page
* table entry (pte).
@@ -254,9 +283,16 @@ ia64_phys_addr_valid (unsigned long addr
#define pud_bad(pud) (!ia64_phys_addr_valid(pud_val(pud)))
#define pud_present(pud) (pud_val(pud) != 0UL)
#define pud_clear(pudp) (pud_val(*(pudp)) = 0UL)
-
#define pud_page(pud) ((unsigned long) __va(pud_val(pud) & _PFN_MASK))
+#ifdef CONFIG_PGTABLE_4
+#define pgd_none(pgd) (!pgd_val(pgd))
+#define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd)))
+#define pgd_present(pgd) (pgd_val(pgd) != 0UL)
+#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL)
+#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK))
+#endif
+
/*
* The following have defined behavior only work if pte_present() is true.
*/
@@ -324,7 +360,13 @@ pgd_offset (struct mm_struct *mm, unsign
here. */
#define pgd_offset_gate(mm, addr) pgd_offset_k(addr)
+#ifdef CONFIG_PGTABLE_4
/* Find an entry in the second-level page table.. */
+#define pud_offset(dir,addr) \
+ ((pud_t *) pgd_page(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
+#endif
+
+/* Find an entry in the third-level page table.. */
#define pmd_offset(dir,addr) \
((pmd_t *) pud_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
@@ -557,7 +599,9 @@ do { \
#define __HAVE_ARCH_PGD_OFFSET_GATE
#define __HAVE_ARCH_LAZY_MMU_PROT_UPDATE
+#ifndef CONFIG_PGTABLE_4
#include <asm-generic/pgtable-nopud.h>
+#endif
#include <asm-generic/pgtable.h>
#endif /* _ASM_IA64_PGTABLE_H */
Index: linux-2.6/include/asm-ia64/pgalloc.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/pgalloc.h 2005-11-10 06:49:03.398374164 -0600
+++ linux-2.6/include/asm-ia64/pgalloc.h 2005-11-10 06:50:43.490981172 -0600
@@ -86,6 +86,25 @@ static inline void pgd_free(pgd_t * pgd)
pgtable_quicklist_free(pgd);
}
+#ifdef CONFIG_PGTABLE_4
+static inline void
+pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
+{
+ pgd_val(*pgd_entry) = __pa(pud);
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return pgtable_quicklist_alloc();
+}
+
+static inline void pud_free(pud_t * pud)
+{
+ pgtable_quicklist_free(pud);
+}
+#define __pud_free_tlb(tlb, pud) pud_free(pud)
+#endif /* CONFIG_PGTABLE_4 */
+
static inline void
pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
{
Index: linux-2.6/include/asm-ia64/page.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/page.h 2005-11-10 06:49:03.398374164 -0600
+++ linux-2.6/include/asm-ia64/page.h 2005-11-10 06:50:43.491957638 -0600
@@ -47,8 +47,6 @@
#define PERCPU_PAGE_SHIFT 16 /* log2() of max. size of per-CPU area */
#define PERCPU_PAGE_SIZE (__IA64_UL_CONST(1) << PERCPU_PAGE_SHIFT)
-#define RGN_MAP_LIMIT ((1UL << (4*PAGE_SHIFT - 12)) - PAGE_SIZE) /* per region addr limit */
-
#ifdef CONFIG_HUGETLB_PAGE
# define HPAGE_REGION_BASE RGN_BASE(RGN_HPAGE)
@@ -175,11 +173,17 @@ get_order (unsigned long size)
*/
typedef struct { unsigned long pte; } pte_t;
typedef struct { unsigned long pmd; } pmd_t;
+#ifdef CONFIG_PGTABLE_4
+ typedef struct { unsigned long pud; } pud_t;
+#endif
typedef struct { unsigned long pgd; } pgd_t;
typedef struct { unsigned long pgprot; } pgprot_t;
# define pte_val(x) ((x).pte)
# define pmd_val(x) ((x).pmd)
+#ifdef CONFIG_PGTABLE_4
+# define pud_val(x) ((x).pud)
+#endif
# define pgd_val(x) ((x).pgd)
# define pgprot_val(x) ((x).pgprot)
Index: linux-2.6/arch/ia64/kernel/ivt.S
===================================================================
--- linux-2.6.orig/arch/ia64/kernel/ivt.S 2005-11-10 06:49:03.399350630 -0600
+++ linux-2.6/arch/ia64/kernel/ivt.S 2005-11-10 10:06:49.272116003 -0600
@@ -114,7 +114,7 @@ ENTRY(vhpt_miss)
shl r21=r16,3 // shift bit 60 into sign bit
shr.u r17=r16,61 // get the region number into r17
;;
- shr r22=r21,3
+ shr.u r22=r21,3
#ifdef CONFIG_HUGETLB_PAGE
extr.u r26=r25,2,6
;;
@@ -140,27 +140,42 @@ ENTRY(vhpt_miss)
(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8
(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
- shr.u r18=r22,PMD_SHIFT // shift L2 index into position
+#ifdef CONFIG_PGTABLE_4
+ shr.u r28=r22,PUD_SHIFT // shift L2 index into position
+#else
+ shr.u r20=r22,PMD_SHIFT // shift L3 index into position
+#endif
;;
ld8 r17=[r17] // fetch the L1 entry (may be 0)
;;
+#ifdef CONFIG_PGTABLE_4
+ dep r28=r28,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
- dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
;;
-(p7) ld8 r20=[r17] // fetch the L2 entry (may be 0)
- shr.u r19=r22,PAGE_SHIFT // shift L3 index into position
+(p7) ld8 r29=[r28] // fetch the L2 entry (may be 0)
+ shr.u r20=r22,PMD_SHIFT // shift L3 index into position
+ ;;
+ dep r30=r20,r29,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was L2 entry NULL?
+#else
+ dep r30=r20,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
+#endif
;;
-(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L2 entry NULL?
- dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+(p7) ld8 r20=[r30] // fetch the L3 entry (may be 0)
+ shr.u r19=r22,PAGE_SHIFT // shift L4 index into position
;;
-(p7) ld8 r18=[r21] // read the L3 PTE
+ dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L4 page table entry
+(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L3 entry NULL?
+ ;;
+(p7) ld8 r18=[r21] // read the L4 PTE
+ dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address
mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss
;;
(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared?
mov r22=cr.iha // get the VHPT address that caused the TLB miss
;; // avoid RAW on p7
(p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss?
- dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address
;;
(p10) itc.i r18 // insert the instruction TLB entry
(p11) itc.d r18 // insert the data TLB entry
@@ -192,14 +207,23 @@ ENTRY(vhpt_miss)
* between reading the pagetable and the "itc". If so, flush the entry we
* inserted and retry.
*/
- ld8 r25=[r21] // read L3 PTE again
- ld8 r26=[r17] // read L2 entry again
+ ld8 r26=[r30] // read L3 entry again
+#ifdef CONFIG_PGTABLE_4
+ ld8 r30=[r28] // read L2 entry again
;;
- cmp.ne p6,p7=r26,r20 // did L2 entry change
+ cmp.ne p6,p7=r30,r29 // did L2 entry change
mov r27=PAGE_SHIFT<<2
;;
+(p7) cmp.ne.or.andcm p6,p7=r26,r20 // did L3 entry change
+#else
+ ;;
+ cmp.ne p6,p7=r26,r20 // did L3 entry change
+ mov r27=PAGE_SHIFT<<2
+#endif
+ ld8 r25=[r21] // read L4 PTE again
+ ;;
(p6) ptc.l r22,r27 // purge PTE page translation
-(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L3 PTE change
+(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L4 PTE change
;;
(p6) ptc.l r16,r27 // purge translation
#endif
@@ -432,18 +456,34 @@ ENTRY(nested_dtlb_miss)
(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8
(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
- shr.u r18=r22,PMD_SHIFT // shift L2 index into position
+#ifdef CONFIG_PGTABLE_4
+ shr.u r19=r22,PUD_SHIFT // shift L2 index into position
+#else
+ shr.u r18=r22,PMD_SHIFT // shift L3 index into position
+#endif
;;
+#ifdef CONFIG_PGTABLE_4
+ shr.u r18=r22,PMD_SHIFT // shift L3 index into position
+#else
+ shr.u r19=r22,PAGE_SHIFT // shift L4 index into position
+#endif
ld8 r17=[r17] // fetch the L1 entry (may be 0)
;;
(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
- dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
+#ifdef CONFIG_PGTABLE_4
+ dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
;;
+ shr.u r19=r22,PAGE_SHIFT // shift L4 index into position
(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0)
- shr.u r19=r22,PAGE_SHIFT // shift L3 index into position
;;
(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL?
- dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+#endif
+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+ ;;
+(p7) ld8 r17=[r17] // fetch the L3 entry (may be 0)
+ ;;
+(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L3 entry NULL?
+ dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L4 page table entry
(p6) br.cond.spnt page_fault
mov b0=r30
br.sptk.many b0 // return to continuation point
Index: linux-2.6/arch/ia64/configs/sn2_defconfig
===================================================================
--- linux-2.6.orig/arch/ia64/configs/sn2_defconfig 2005-11-10 06:49:03.399350630 -0600
+++ linux-2.6/arch/ia64/configs/sn2_defconfig 2005-11-10 06:50:43.514416347 -0600
@@ -80,6 +80,8 @@ CONFIG_MCKINLEY=y
# CONFIG_IA64_PAGE_SIZE_8KB is not set
CONFIG_IA64_PAGE_SIZE_16KB=y
# CONFIG_IA64_PAGE_SIZE_64KB is not set
+# CONFIG_PGTABLE_3 is not set
+CONFIG_PGTABLE_4=y
# CONFIG_HZ_100 is not set
CONFIG_HZ_250=y
# CONFIG_HZ_1000 is not set
Index: linux-2.6/arch/ia64/defconfig
===================================================================
--- linux-2.6.orig/arch/ia64/defconfig 2005-11-10 06:49:03.400327095 -0600
+++ linux-2.6/arch/ia64/defconfig 2005-11-10 06:50:43.515392813 -0600
@@ -82,6 +82,8 @@ CONFIG_MCKINLEY=y
# CONFIG_IA64_PAGE_SIZE_8KB is not set
CONFIG_IA64_PAGE_SIZE_16KB=y
# CONFIG_IA64_PAGE_SIZE_64KB is not set
+CONFIG_PGTABLE_3=y
+# CONFIG_PGTABLE_4 is not set
# CONFIG_HZ_100 is not set
CONFIG_HZ_250=y
# CONFIG_HZ_1000 is not set
Index: linux-2.6/arch/ia64/Kconfig
===================================================================
--- linux-2.6.orig/arch/ia64/Kconfig 2005-11-10 06:49:03.400327095 -0600
+++ linux-2.6/arch/ia64/Kconfig 2005-11-10 06:50:43.515392813 -0600
@@ -164,6 +164,19 @@ config IA64_PAGE_SIZE_64KB
endchoice
+choice
+ prompt "Page Table Levels"
+ default PGTABLE_3
+
+config PGTABLE_3
+ bool "3 Levels"
+
+config PGTABLE_4
+ depends on !IA64_PAGE_SIZE_64KB
+ bool "4 Levels"
+
+endchoice
+
source kernel/Kconfig.hz
config IA64_BRL_EMU
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
@ 2005-11-10 21:49 ` Luck, Tony
2005-11-10 22:38 ` Robin Holt
` (9 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Luck, Tony @ 2005-11-10 21:49 UTC (permalink / raw)
To: linux-ia64
Compiling with three levels, I see some differences in the scheduling
of instructions in the vhpt_miss handler and the nested_dtlb miss
handler. Side-by-side diff of a disassembly included below (original
sequence is on the left, new sequence is on the right). For the vhpt
case the new handler is 3 instructions shorter ... but shorter isn't
always better.
Ken, David ... can you cast an eye across these please.
-Tony
P.S. there are no other instruction differences in the remainder of
the kernel ... a good indication that you managed to dot all the Is
and cross all the Ts when changing the macros in the header files.
a000000100000000 <vhpt_miss>: a000000100000000 <vhpt_miss>:
a000000100000000: [MLX] mov r16=cr20 a000000100000000: [MLX] mov r16=cr20
a000000100000006: movl r18=0xe a000000100000006: movl r18=0xe
a000000100000010: [MMI] mov r25=cr21;; a000000100000010: [MMI] mov r25=cr21;;
a000000100000016: rsm 0x20000 a000000100000016: rsm 0x20000
a00000010000001c: mov r31=pr a00000010000001c: mov r31=pr
a000000100000020: [MII] mov.m r19=ar.k7 a000000100000020: [MII] mov.m r19=ar.k7
a000000100000026: shl r21=r16,3 a000000100000026: shl r21=r16,3
a00000010000002c: shr.u r17=r16,61;; a00000010000002c: shr.u r17=r16,61;;
a000000100000030: [MII] nop.m 0x0 a000000100000030: [MII] nop.m 0x0
a000000100000036: shr r22=r21,3 | a000000100000036: shr.u r22=r21,3
a00000010000003c: extr.u r26=r25,2,6;; a00000010000003c: extr.u r26=r25,2,6;;
a000000100000040: [MII] cmp.eq p0,p8=r18,r26 a000000100000040: [MII] cmp.eq p0,p8=r18,r26
a000000100000046: sub r27=r26,r18;; a000000100000046: sub r27=r26,r18;;
a00000010000004c: (p08) dep r25=r18,r25,2,6 a00000010000004c: (p08) dep r25=r18,r25,2,6
a000000100000050: [MII] nop.m 0x0 a000000100000050: [MII] nop.m 0x0
a000000100000056: (p08) shr r22=r22,r27;; a000000100000056: (p08) shr r22=r22,r27;;
a00000010000005c: cmp.eq p6,p7=5,r17 a00000010000005c: cmp.eq p6,p7=5,r17
a000000100000060: [MII] nop.m 0x0 a000000100000060: [MII] nop.m 0x0
a000000100000066: shr.u r18=r22,36;; a000000100000066: shr.u r18=r22,36;;
a00000010000006c: (p07) dep r17=r17,r19,11,3 a00000010000006c: (p07) dep r17=r17,r19,11,3
a000000100000070: [MLX] srlz.d a000000100000070: [MLX] srlz.d
a000000100000076: (p06) movl r19=0xa0000001008980 a000000100000076: (p06) movl r19=0xa0000001008980
a000000100000080: [MII] nop.m 0x0 a000000100000080: [MII] nop.m 0x0
a000000100000086: (p06) shr.u r21=r21,50 a000000100000086: (p06) shr.u r21=r21,50
a00000010000008c: (p07) shr.u r21=r21,47;; a00000010000008c: (p07) shr.u r21=r21,47;;
a000000100000090: [MII] nop.m 0x0 a000000100000090: [MII] nop.m 0x0
a000000100000096: (p06) dep r17=r18,r19,3,11 a000000100000096: (p06) dep r17=r18,r19,3,11
a00000010000009c: (p07) dep r17=r18,r17,3,8 a00000010000009c: (p07) dep r17=r18,r17,3,8
a0000001000000a0: [MFI] cmp.eq p7,p6=0,r21 a0000001000000a0: [MFI] cmp.eq p7,p6=0,r21
a0000001000000a6: nop.f 0x0 a0000001000000a6: nop.f 0x0
a0000001000000ac: shr.u r18=r22,25;; | a0000001000000ac: shr.u r20=r22,25;;
a0000001000000b0: [MMI] ld8 r17=[r17];; a0000001000000b0: [MMI] ld8 r17=[r17];;
a0000001000000b6: (p07) cmp.eq p6,p7=r17,r0 | a0000001000000b6: nop.m 0x0
a0000001000000bc: dep r17=r18,r17,3,11;; | a0000001000000bc: dep r30=r20,r17,3,11
a0000001000000c0: [MII] (p07) ld8 r20=[r17] | a0000001000000c0: [MMI] (p07) cmp.eq p6,p7=r17,r0;;
a0000001000000c6: shr.u r19=r22,14;; | a0000001000000c6: (p07) ld8 r20=[r30]
a0000001000000cc: (p07) cmp.eq.or.andcm p6,p7=r20 | a0000001000000cc: shr.u r19=r22,14;;
a0000001000000d0: [MFI] nop.m 0x0 | a0000001000000d0: [MII] nop.m 0x0
a0000001000000d6: nop.f 0x0 | a0000001000000d6: dep r21=r19,r20,3,11
a0000001000000dc: dep r21=r19,r20,3,11;; | a0000001000000dc: (p07) cmp.eq.or.andcm p6,p7=r20
a0000001000000e0: [MMI] (p07) ld8 r18=[r21] | a0000001000000e0: [MFI] (p07) ld8 r18=[r21]
a0000001000000e6: mov r19=cr17 | a0000001000000e6: nop.f 0x0
a0000001000000ec: nop.i 0x0;; | a0000001000000ec: dep r23=0,r20,0,14
a0000001000000f0: [MFI] nop.m 0x0 | a0000001000000f0: [MMI] mov r19=cr17;;
a0000001000000f6: nop.f 0x0 | a0000001000000f6: nop.m 0x0
a0000001000000fc: (p07) tbit.z p6,p7=r18,0 a0000001000000fc: (p07) tbit.z p6,p7=r18,0
a000000100000100: [MMI] mov r22=cr25;; a000000100000100: [MMI] mov r22=cr25;;
a000000100000106: nop.m 0x0 a000000100000106: nop.m 0x0
a00000010000010c: (p07) tbit.z.unc p11,p10=r19,32 | a00000010000010c: (p07) tbit.z.unc p11,p10=r19,32
a000000100000110: [MFI] nop.m 0x0 | a000000100000110: [MMI] (p10) itc.i r18;;
a000000100000116: nop.f 0x0 | a000000100000116: nop.m 0x0
a00000010000011c: dep r23=0,r20,0,14;; | a00000010000011c: nop.i 0x0;;
a000000100000120: [MMI] (p10) itc.i r18;; | a000000100000120: [MMI] (p11) itc.d r18;;
a000000100000126: nop.m 0x0 a000000100000126: nop.m 0x0
a00000010000012c: nop.i 0x0;; | a00000010000012c: nop.i 0x0
a000000100000130: [MMI] (p11) itc.d r18;; | a000000100000130: [MFB] nop.m 0x0
a000000100000136: nop.m 0x0 | a000000100000136: nop.f 0x0
a00000010000013c: nop.i 0x0 | a00000010000013c: (p06) br.cond.spnt.many a000000
a000000100000140: [MFB] nop.m 0x0 | a000000100000140: [MMI] mov cr20=r22
a000000100000146: nop.f 0x0 | a000000100000146: (p08) mov cr21=r25
a00000010000014c: (p06) br.cond.spnt.many a000000 | a00000010000014c: adds r24=1121,r23;;
a000000100000150: [MMI] mov cr20=r22 | a000000100000150: [MMI] (p07) itc.d r24;;
a000000100000156: (p08) mov cr21=r25 | a000000100000156: ld8 r26=[r30]
a00000010000015c: adds r24=1121,r23;; | a00000010000015c: nop.i 0x0;;
a000000100000160: [MMI] (p07) itc.d r24;; | a000000100000160: [MFI] cmp.eq p7,p6=r26,r20
a000000100000166: ld8 r25=[r21] | a000000100000166: nop.f 0x0
a00000010000016c: nop.i 0x0 | a00000010000016c: mov r27=56
a000000100000170: [MMI] ld8 r26=[r17];; | a000000100000170: [MMI] ld8 r25=[r21];;
a000000100000176: cmp.eq p7,p6=r26,r20 | a000000100000176: (p06) ptc.l r22,r27
a00000010000017c: mov r27=56;; | a00000010000017c: (p07) cmp.ne.or.andcm p6,p7=r25
a000000100000180: [MFI] (p06) ptc.l r22,r27 | a000000100000180: [MIB] (p06) ptc.l r16,r27
a000000100000186: nop.f 0x0 | a000000100000186: mov pr=r31,0xffffffffffff
a00000010000018c: (p07) cmp.ne.or.andcm p6,p7=r25 | a00000010000018c: rfi;;
a000000100000190: [MIB] (p06) ptc.l r16,r27 <
a000000100000196: mov pr=r31,0xffffffffffff <
a00000010000019c: rfi;; <
a000000100001400 <nested_dtlb_miss>: a000000100001400 <nested_dtlb_miss>:
a000000100001400: [MMI] rsm 0x20000 a000000100001400: [MMI] rsm 0x20000
a000000100001406: mov.m r19=ar.k7 a000000100001406: mov.m r19=ar.k7
a00000010000140c: shl r21=r16,3 a00000010000140c: shl r21=r16,3
a000000100001410: [MMI] mov r18=cr21;; a000000100001410: [MMI] mov r18=cr21;;
a000000100001416: nop.m 0x0 a000000100001416: nop.m 0x0
a00000010000141c: shr.u r17=r16,61 a00000010000141c: shr.u r17=r16,61
a000000100001420: [MII] nop.m 0x0 a000000100001420: [MII] nop.m 0x0
a000000100001426: extr.u r18=r18,2,6;; a000000100001426: extr.u r18=r18,2,6;;
a00000010000142c: cmp.eq p6,p7=5,r17 a00000010000142c: cmp.eq p6,p7=5,r17
a000000100001430: [MII] adds r22=-14,r18 a000000100001430: [MII] adds r22=-14,r18
a000000100001436: adds r18=22,r18;; a000000100001436: adds r18=22,r18;;
a00000010000143c: shr.u r22=r16,r22 a00000010000143c: shr.u r22=r16,r22
a000000100001440: [MII] nop.m 0x0 a000000100001440: [MII] nop.m 0x0
a000000100001446: shr.u r18=r16,r18 a000000100001446: shr.u r18=r16,r18
a00000010000144c: (p07) dep r17=r17,r19,11,3 a00000010000144c: (p07) dep r17=r17,r19,11,3
a000000100001450: [MLX] srlz.d a000000100001450: [MLX] srlz.d
a000000100001456: (p06) movl r19=0xa0000001008980 a000000100001456: (p06) movl r19=0xa0000001008980
a000000100001460: [MII] nop.m 0x0 a000000100001460: [MII] nop.m 0x0
a000000100001466: (p06) shr.u r21=r21,50 a000000100001466: (p06) shr.u r21=r21,50
a00000010000146c: (p07) shr.u r21=r21,47;; a00000010000146c: (p07) shr.u r21=r21,47;;
a000000100001470: [MII] nop.m 0x0 a000000100001470: [MII] nop.m 0x0
a000000100001476: (p06) dep r17=r18,r19,3,11 a000000100001476: (p06) dep r17=r18,r19,3,11
a00000010000147c: (p07) dep r17=r18,r17,3,8 a00000010000147c: (p07) dep r17=r18,r17,3,8
a000000100001480: [MFI] cmp.eq p7,p6=0,r21 | a000000100001480: [MII] cmp.eq p7,p6=0,r21
a000000100001486: nop.f 0x0 | a000000100001486: shr.u r18=r22,25;;
a00000010000148c: shr.u r18=r22,25;; | a00000010000148c: shr.u r19=r22,14
a000000100001490: [MMI] ld8 r17=[r17];; a000000100001490: [MMI] ld8 r17=[r17];;
a000000100001496: (p07) cmp.eq p6,p7=r17,r0 a000000100001496: (p07) cmp.eq p6,p7=r17,r0
a00000010000149c: dep r17=r18,r17,3,11;; a00000010000149c: dep r17=r18,r17,3,11;;
a0000001000014a0: [MII] (p07) ld8 r17=[r17] | a0000001000014a0: [MMI] (p07) ld8 r17=[r17];;
a0000001000014a6: shr.u r19=r22,14;; | a0000001000014a6: (p07) cmp.eq.or.andcm p6,p7=r17
a0000001000014ac: (p07) cmp.eq.or.andcm p6,p7=r17 | a0000001000014ac: dep r17=r19,r17,3,11
a0000001000014b0: [MIB] nop.m 0x0 | a0000001000014b0: [MFB] nop.m 0x0
a0000001000014b6: dep r17=r19,r17,3,11 | a0000001000014b6: nop.f 0x0
a0000001000014bc: (p06) br.cond.spnt.few a0000001 a0000001000014bc: (p06) br.cond.spnt.few a0000001
a0000001000014c0: [MIB] nop.m 0x0 a0000001000014c0: [MIB] nop.m 0x0
a0000001000014c6: mov b0=r30 a0000001000014c6: mov b0=r30
a0000001000014cc: br.many b0;; a0000001000014cc: br.many b0;;
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
2005-11-10 21:49 ` Luck, Tony
@ 2005-11-10 22:38 ` Robin Holt
2005-11-10 23:03 ` Luck, Tony
` (8 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Robin Holt @ 2005-11-10 22:38 UTC (permalink / raw)
To: linux-ia64
On Thu, Nov 10, 2005 at 01:49:26PM -0800, Luck, Tony wrote:
> Compiling with three levels, I see some differences in the scheduling
> of instructions in the vhpt_miss handler and the nested_dtlb miss
> handler. Side-by-side diff of a disassembly included below (original
> sequence is on the left, new sequence is on the right). For the vhpt
> case the new handler is 3 instructions shorter ... but shorter isn't
> always better.
I used the objdump that Jack Steiner pointed me towards to optimize the
vhpt_miss handler and then test. This instruction order gave the best
performance, but we are talking extremely small differences.
Is the goal to make these identical? If so, it should be easy to do,
but I was not aware that was the intent.
I am going to attach the dispersal analysis from the modified objdump that
Jack has produced.
Thanks,
Robin
0000000000000000 <vhpt_miss>: 0000000000000000 <vhpt_miss>:
0: 0 [MLX] mov r16=cr20 0: 0 [MLX] mov r16=cr20
6: 0 movl r18=0xe 6: 0 movl r18=0xe
c: c:
10: 1 R[M2] [MMI] mov r25=cr21;; 10: 1 R[M2] [MMI] mov r25=cr21;;
16: 2 S rsm 0x20000 16: 2 S rsm 0x20000
1c: 2 mov r31=pr 1c: 2 mov r31=pr
20: 3 R[M2] [MII] mov.m r19=ar.k7 20: 3 R[M2] [MII] mov.m r19=ar.k7
26: 3 shl r21=r16,3 26: 3 shl r21=r16,3
2c: 3 shr.u r17=r16,61;; 2c: 3 shr.u r17=r16,61;;
30: 4 S [MII] nop.m 0x0 30: 4 S [MII] nop.m 0x0
36: 4 shr r22=r21,3 | 36: 4 shr.u r22=r21,3
3c: 5 R[I0] extr.u r26=r25,2,6;; 3c: 5 R[I0] extr.u r26=r25,2,6;;
40: 6 S [MII] cmp.eq p0,p8=r18,r26 40: 6 S [MII] cmp.eq p0,p8=r18,r26
46: 6 sub r27=r26,r18;; 46: 6 sub r27=r26,r18;;
4c: 7 S (p08) dep r25=r18,r25,2,6 4c: 7 S (p08) dep r25=r18,r25,2,6
50: 7 [MII] nop.m 0x0 50: 7 [MII] nop.m 0x0
56: 7 (p08) shr r22=r22,r27;; 56: 7 (p08) shr r22=r22,r27;;
5c: 8 S cmp.eq p6,p7=5,r17 5c: 8 S cmp.eq p6,p7=5,r17
60: 8 [MII] nop.m 0x0 60: 8 [MII] nop.m 0x0
66: 8 shr.u r18=r22,36;; 66: 8 shr.u r18=r22,36;;
6c: 9 S (p07) dep r17=r17,r19,11,3 6c: 9 S (p07) dep r17=r17,r19,11,3
70: 9 [MLX] srlz.d 70: 9 [MLX] srlz.d
76: 9 (p06) movl r19=0x0 76: 9 (p06) movl r19=0x0
7c: 7c:
80: 10 nop.m 0x0 80: 10 [MII] nop.m 0x0
86: 10 (p06) shr.u r21=r21,50 86: 10 (p06) shr.u r21=r21,50
8c: 10 (p07) shr.u r21=r21,47;; 8c: 10 (p07) shr.u r21=r21,47;;
90: 11 S nop.m 0x0 90: 11 S [MII] nop.m 0x0
96: 11 (p06) dep r17=r18,r19,3,11 96: 11 (p06) dep r17=r18,r19,3,11
9c: 12 R[I0] (p07) dep r17=r18,r17,3,8 9c: 12 R[I0] (p07) dep r17=r18,r17,3,8
a0: 12 cmp.eq p7,p6=0,r21 a0: 12 [MFI] cmp.eq p7,p6=0,r21
a6: 12 nop.f 0x0 | a6: 12 nop.f 0x0
ac: 12 shr.u r18=r22,25;; | ac: 12 shr.u r20=r22,25;;
b0: 13 ld8 r17=[r17];; | b0: 13 [MMI] ld8 r17=[r17];;
b6: 14 S (p07) cmp.eq p6,p7=r17,r0 | b6: 14 S nop.m 0x0
bc: 14 dep r17=r18,r17,3,11;; | bc: 14 dep r30=r20,r17,3,11
c0: 15 S (p07) ld8 r20=[r17] | c0: 14 [MMI] (p07) cmp.eq p6,p7=r17,r0;;
c6: 15 shr.u r19=r22,14;; | c6: 15 S (p07) ld8 r20=[r30]
cc: 16 S (p07) cmp.eq.or.andcm p6,p7=r20,r0 | cc: 15 shr.u r19=r22,14;;
d0: 16 nop.m 0x0 | d0: 16 S [MII] nop.m 0x0
d6: 16 nop.f 0x0 | d6: 16 dep r21=r19,r20,3,11
dc: 17 R[I0] dep r21=r19,r20,3,11;; | dc: 16 (p07) cmp.eq.or.andcm p6,p7=r20,r0;;
e0: 18 S (p07) ld8 r18=[r21] | e0: 17 S [MFI] (p07) ld8 r18=[r21]
e6: 18 mov r19=cr17 | e6: 17 nop.f 0x0
ec: 18 nop.i 0x0;; | ec: 17 dep r23=0,r20,0,14
f0: 19 S nop.m 0x0 | f0: 17 [MMI] mov r19=cr17;;
f6: 19 nop.f 0x0 | f6: 18 S nop.m 0x0
fc: 19 (p07) tbit.z p6,p7=r18,0 | fc: 18 (p07) tbit.z p6,p7=r18,0
100: 19 mov r22=cr25;; | 100: 18 [MMI] mov r22=cr25;;
106: 20 S nop.m 0x0 | 106: 19 S nop.m 0x0
10c: 20 (p07) tbit.z.unc p11,p10=r19,32 | 10c: 19 (p07) tbit.z.unc p11,p10=r19,32;;
110: 20 nop.m 0x0 | 110: 20 S [MMI] (p10) itc.i r18;;
116: 20 nop.f 0x0 | 116: 21 S nop.m 0x0
11c: 21 R[I0] dep r23=0,r20,0,14;; | 11c: 21 nop.i 0x0;;
120: 22 S (p10) itc.i r18;; | 120: 22 S [MMI] (p11) itc.d r18;;
126: 23 S nop.m 0x0 126: 23 S nop.m 0x0
12c: 23 nop.i 0x0;; | 12c: 23 nop.i 0x0
130: 24 S (p11) itc.d r18;; | 130: 23 [MFB] nop.m 0x0
136: 25 S nop.m 0x0 | 136: 23 nop.f 0x0
13c: 25 nop.i 0x0 | 13c: 23 (p06) br.cond.spnt.many 1820 <page_fault>
140: 25 nop.m 0x0 | 140: 24 [MMI] mov cr20=r22
146: 25 nop.f 0x0 | 146: 25 R[M2] (p08) mov cr21=r25
14c: 25 (p06) br.cond.spnt.many 1820 <page_fault> | 14c: 25 adds r24=1121,r23;;
150: 26 mov cr20=r22 | 150: 26 S [MMI] (p07) itc.d r24;;
156: 27 R[M2] (p08) mov cr21=r25 | 156: 27 S ld8 r26=[r30]
15c: 27 adds r24=1121,r23;; | 15c: 27 nop.i 0x0;;
160: 28 S (p07) itc.d r24;; | 160: 28 S [MFI] cmp.eq p7,p6=r26,r20
166: 29 S ld8 r25=[r21] | 166: 28 nop.f 0x0
16c: 29 nop.i 0x0 | 16c: 28 mov r27=56
170: 29 ld8 r26=[r17];; | 170: 28 [MMI] ld8 r25=[r21];;
176: 30 S cmp.eq p7,p6=r26,r20 | 176: 29 S (p06) ptc.l r22,r27
17c: 30 mov r27=56;; | 17c: 29 (p07) cmp.ne.or.andcm p6,p7=r25,r18;;
180: 31 S (p06) ptc.l r22,r27 | 180: 30 S [MIB] (p06) ptc.l r16,r27
186: 31 nop.f 0x0 | 186: 30 mov pr=r31,0xfffffffffffffffe
18c: 31 (p07) cmp.ne.or.andcm p6,p7=r25,r18;; | 18c: 30 rfi;;
190: 32 S (p06) ptc.l r16,r27 <
196: 32 mov pr=r31,0xfffffffffffffffe <
19c: 32 rfi;; <
...
0000000000001400 <nested_dtlb_miss>: 0000000000001400 <nested_dtlb_miss>:
1400: 0 [MMI] rsm 0x20000 1400: 0 [MMI] rsm 0x20000
1406: 1 R[M2] mov.m r19=ar.k7 1406: 1 R[M2] mov.m r19=ar.k7
140c: 1 shl r21=r16,3 140c: 1 shl r21=r16,3
1410: 2 R[M2] [MMI] mov r18=cr21;; 1410: 2 R[M2] [MMI] mov r18=cr21;;
1416: 3 S nop.m 0x0 1416: 3 S nop.m 0x0
141c: 3 shr.u r17=r16,61 141c: 3 shr.u r17=r16,61
1420: 3 [MII] nop.m 0x0 1420: 3 [MII] nop.m 0x0
1426: 4 R[I0] extr.u r18=r18,2,6;; 1426: 4 R[I0] extr.u r18=r18,2,6;;
142c: 5 S cmp.eq p6,p7=5,r17 142c: 5 S cmp.eq p6,p7=5,r17
1430: 5 [MII] adds r22=-14,r18 1430: 5 [MII] adds r22=-14,r18
1436: 5 adds r18=22,r18;; 1436: 5 adds r18=22,r18;;
143c: 6 S shr.u r22=r16,r22 143c: 6 S shr.u r22=r16,r22
1440: 6 [MII] nop.m 0x0 1440: 6 [MII] nop.m 0x0
1446: 6 shr.u r18=r16,r18 1446: 6 shr.u r18=r16,r18
144c: 7 R[I0] (p07) dep r17=r17,r19,11,3 144c: 7 R[I0] (p07) dep r17=r17,r19,11,3
1450: 7 [MLX] srlz.d 1450: 7 [MLX] srlz.d
1456: 7 (p06) movl r19=0x0 1456: 7 (p06) movl r19=0x0
145c: 145c:
1460: 8 [MII] nop.m 0x0 1460: 8 [MII] nop.m 0x0
1466: 8 (p06) shr.u r21=r21,50 1466: 8 (p06) shr.u r21=r21,50
146c: 8 (p07) shr.u r21=r21,47;; 146c: 8 (p07) shr.u r21=r21,47;;
1470: 9 S [MII] nop.m 0x0 1470: 9 S [MII] nop.m 0x0
1476: 9 (p06) dep r17=r18,r19,3,11 1476: 9 (p06) dep r17=r18,r19,3,11
147c: 10 R[I0] (p07) dep r17=r18,r17,3,8 147c: 10 R[I0] (p07) dep r17=r18,r17,3,8
1480: 10 [MFI] cmp.eq p7,p6=0,r21 | 1480: 10 [MII] cmp.eq p7,p6=0,r21
1486: 10 nop.f 0x0 | 1486: 10 shr.u r18=r22,25;;
148c: 10 shr.u r18=r22,25;; | 148c: 11 S shr.u r19=r22,14
1490: 11 [MMI] ld8 r17=[r17];; 1490: 11 [MMI] ld8 r17=[r17];;
1496: 12 S (p07) cmp.eq p6,p7=r17,r0 1496: 12 S (p07) cmp.eq p6,p7=r17,r0
149c: 12 dep r17=r18,r17,3,11;; 149c: 12 dep r17=r18,r17,3,11;;
14a0: 13 S [MII] (p07) ld8 r17=[r17] | 14a0: 13 S [MMI] (p07) ld8 r17=[r17];;
14a6: 13 shr.u r19=r22,14;; | 14a6: 14 S (p07) cmp.eq.or.andcm p6,p7=r17,r0
14ac: 14 S (p07) cmp.eq.or.andcm p6,p7=r17,r0 | 14ac: 14 dep r17=r19,r17,3,11
14b0: 14 [MIB] nop.m 0x0 | 14b0: 14 [MFB] nop.m 0x0
14b6: 15 R[I0] dep r17=r19,r17,3,11 | 14b6: 14 nop.f 0x0
14bc: 15 (p06) br.cond.spnt.few 1820 <page_fault> | 14bc: 14 (p06) br.cond.spnt.few 1820 <page_fault>
14c0: 16 B [MIB] nop.m 0x0 | 14c0: 15 [MIB] nop.m 0x0
14c6: 16 mov b0=r30 | 14c6: 15 mov b0=r30
14cc: 16 br.many b0;; | 14cc: 15 br.many b0;;
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
2005-11-10 21:49 ` Luck, Tony
2005-11-10 22:38 ` Robin Holt
@ 2005-11-10 23:03 ` Luck, Tony
2005-11-10 23:30 ` Chen, Kenneth W
` (7 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Luck, Tony @ 2005-11-10 23:03 UTC (permalink / raw)
To: linux-ia64
>Is the goal to make these identical? If so, it should be easy to do,
>but I was not aware that was the intent.
Identical is only required if it can be proved that the
original was perfectly optimised to a unique peak point :-)
My post was just requesting David, Ken (and anyone else
who can schedule ia64 instructions in their head) to take
a look to make sure this isn't stalling someplace.
If the new code runs just as fast as the old, the only
possible remaining sticking point would be maintainability
of the code ... assembly code does not lend itself well
to the games we play in C code to keep the #ifdefs under
control. You've added six new #ifdefs to the fifteen
already in ivt.S ... it was already hard to read (which
is why I resorted to compiling and diffing the disassembly
to see what really changed).
-Tony
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
` (2 preceding siblings ...)
2005-11-10 23:03 ` Luck, Tony
@ 2005-11-10 23:30 ` Chen, Kenneth W
2005-11-10 23:54 ` Chen, Kenneth W
` (6 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Chen, Kenneth W @ 2005-11-10 23:30 UTC (permalink / raw)
To: linux-ia64
Robin Holt wrote on Thursday, November 10, 2005 2:39 PM
> On Thu, Nov 10, 2005 at 01:49:26PM -0800, Luck, Tony wrote:
> > Compiling with three levels, I see some differences in the scheduling
> > of instructions in the vhpt_miss handler and the nested_dtlb miss
> > handler. Side-by-side diff of a disassembly included below (original
> > sequence is on the left, new sequence is on the right). For the vhpt
> > case the new handler is 3 instructions shorter ... but shorter isn't
> > always better.
>
> I used the objdump that Jack Steiner pointed me towards to optimize the
> vhpt_miss handler and then test. This instruction order gave the best
> performance, but we are talking extremely small differences.
>
> Is the goal to make these identical? If so, it should be easy to do,
> but I was not aware that was the intent.
I was wondering earlier too why you changed all the register usage etc.
You really don't need to make that big of change since the resource
contention is around dep/cmp. cmp instruction is ALU type and can be
scheduled on all 6 integer units. The easiest way is to just re-order
these two instructions. There is one change you made around tbit/dep on
line 163 (dep r23=0,r20,0,PAGE_SHIFT), but that is outside the 4-level
page table walk. And again, the easiest thing to do is to pull that instruction 2
bundles earlier.
- Ken
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
` (3 preceding siblings ...)
2005-11-10 23:30 ` Chen, Kenneth W
@ 2005-11-10 23:54 ` Chen, Kenneth W
2005-11-11 0:13 ` Chen, Kenneth W
` (5 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Chen, Kenneth W @ 2005-11-10 23:54 UTC (permalink / raw)
To: linux-ia64
Robin Holt wrote on Thursday, November 10, 2005 8:19 AM
> --- linux-2.6.orig/arch/ia64/kernel/ivt.S 2005-11-10 06:49:03.399350630 -0600
> +++ linux-2.6/arch/ia64/kernel/ivt.S 2005-11-10 10:06:49.272116003 -0600
> @@ -114,7 +114,7 @@ ENTRY(vhpt_miss)
> @@ -192,14 +207,23 @@ ENTRY(vhpt_miss)
> * between reading the pagetable and the "itc". If so, flush the entry we
> * inserted and retry.
> */
> - ld8 r25=[r21] // read L3 PTE again
> - ld8 r26=[r17] // read L2 entry again
> + ld8 r26=[r30] // read L3 entry again
> +#ifdef CONFIG_PGTABLE_4
> + ld8 r30=[r28] // read L2 entry again
> ;;
> - cmp.ne p6,p7=r26,r20 // did L2 entry change
> + cmp.ne p6,p7=r30,r29 // did L2 entry change
> mov r27=PAGE_SHIFT<<2
> ;;
> +(p7) cmp.ne.or.andcm p6,p7=r26,r20 // did L3 entry change
> +#else
You can shave off one cycle here by using parallel cmp. Initialize
p6,p7 in the load bundle. Something like:
+#ifdef CONFIG_PGTABLE_4
+ ld8 r30=[r28] // read L2 entry again
+ cmp.ne p6,p7=r0,r0
;;
- cmp.ne p6,p7=r26,r20 // did L2 entry change
+ cmp.ne.or.andcm p6,p7=r30,r29
+ cmp.ne.or.andcm p6,p7=r26,r20
mov r27=PAGE_SHIFT<<2
+#else
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
` (4 preceding siblings ...)
2005-11-10 23:54 ` Chen, Kenneth W
@ 2005-11-11 0:13 ` Chen, Kenneth W
2005-11-11 0:24 ` Jack Steiner
` (4 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Chen, Kenneth W @ 2005-11-11 0:13 UTC (permalink / raw)
To: linux-ia64
Robin Holt wrote on Thursday, November 10, 2005 2:39 PM
> I am going to attach the dispersal analysis from the modified objdump that
> Jack has produced.
> 5c: 8 S cmp.eq p6,p7=5,r17
> 60: 8 [MII] nop.m 0x0
> 66: 8 shr.u r18=r22,36;;
Hmm, I think the dispersal analysis software is buggy. At least for
the above instance. Immediate form of shr is pseudo-op'ed to extr.
It should count the above bundle with 2 cycles instead of one cycle.
- Ken
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
` (5 preceding siblings ...)
2005-11-11 0:13 ` Chen, Kenneth W
@ 2005-11-11 0:24 ` Jack Steiner
2005-11-11 0:58 ` Chen, Kenneth W
` (3 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Jack Steiner @ 2005-11-11 0:24 UTC (permalink / raw)
To: linux-ia64
On Thu, Nov 10, 2005 at 04:13:02PM -0800, Chen, Kenneth W wrote:
> Robin Holt wrote on Thursday, November 10, 2005 2:39 PM
> > I am going to attach the dispersal analysis from the modified objdump that
> > Jack has produced.
>
> > 5c: 8 S cmp.eq p6,p7=5,r17
> > 60: 8 [MII] nop.m 0x0
> > 66: 8 shr.u r18=r22,36;;
>
> Hmm, I think the dispersal analysis software is buggy. At least for
> the above instance. Immediate form of shr is pseudo-op'ed to extr.
> It should count the above bundle with 2 cycles instead of one cycle.
Very possible. The tool is very old & non-supported. We have used it
several times and had good luck. It seems to point out areas of code
that stall unexpectedly. Eliminating the stalls usually improved
performance. However, I would be surprised if there were no
bugs in the code.
I would like to see a standard tool that gives dispersal analysis.
There was a discussion on the IA64 list ~ month ago but so far, no
takers.
>
> - Ken
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Thanks
Jack Steiner (steiner@sgi.com) 651-683-5302
Principal Engineer SGI - Silicon Graphics, Inc.
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
` (6 preceding siblings ...)
2005-11-11 0:24 ` Jack Steiner
@ 2005-11-11 0:58 ` Chen, Kenneth W
2005-11-11 1:19 ` Robin Holt
` (2 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Chen, Kenneth W @ 2005-11-11 0:58 UTC (permalink / raw)
To: linux-ia64
Luck, Tony wrote on Thursday, November 10, 2005 3:03 PM
> If the new code runs just as fast as the old, the only
> possible remaining sticking point would be maintainability
> of the code ... assembly code does not lend itself well
> to the games we play in C code to keep the #ifdefs under
> control. You've added six new #ifdefs to the fifteen
> already in ivt.S ... it was already hard to read (which
> is why I resorted to compiling and diffing the disassembly
> to see what really changed).
Perhaps, this patch for vhpt_miss handler is a bit easier for the tender
eyes out there (including mine :-p btw, this patch has not yet been tested).
The 2nd #ifdef block can be converted to predicated code, though for
3-level page table, it will have 2 cycle penalty. But that can be
recuperated from better instruction scheduling with several bad dep/cmp,
shr/cmp pair. I can do another patch to clean up instruction scheduling.
- Ken
--- ivt.S.orig 2005-11-10 10:26:26.104472218 -0800
+++ ivt.S.ken 2005-11-10 16:47:42.577824794 -0800
@@ -140,12 +140,28 @@
(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8
(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
+#ifdef CONFIG_PGTABLE_4
+ shr.u r18=r22,PUD_SHIFT
+ cmp.eq p9,p0=r0,r0
+#else
shr.u r18=r22,PMD_SHIFT // shift L2 index into position
+ cmp.eq p0,p9=r0,r0
+#endif
;;
ld8 r17=[r17] // fetch the L1 entry (may be 0)
;;
(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
+#ifdef CONFIG_PGTABLE_4
+ dep r28=r18,r17,3,(PAGE_SHIFT-3)
+ ;;
+(p7) ld8 r29=[r28]
+ shr.u r18=r22,PMD_SHIFT
+ ;;
+ dep r17=r18,r29,3,(PAGE_SHIFT-3)
+(p7) cmp.eq.or.andcm p6,p7=r29,r0
+#else
dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
+#endif
;;
(p7) ld8 r20=[r17] // fetch the L2 entry (may be 0)
shr.u r19=r22,PAGE_SHIFT // shift L3 index into position
@@ -194,8 +210,11 @@
*/
ld8 r25=[r21] // read L3 PTE again
ld8 r26=[r17] // read L2 entry again
+(p9) ld8 r18=[r28]
+ cmp.ne p6,p7=r0,r0
;;
- cmp.ne p6,p7=r26,r20 // did L2 entry change
+ cmp.ne.or.andcm p6,p7=r26,r20
+(p9) cmp.ne.or.andcm p6,p7=r18,r28
mov r27=PAGE_SHIFT<<2
;;
(p6) ptc.l r22,r27 // purge PTE page translation
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
` (7 preceding siblings ...)
2005-11-11 0:58 ` Chen, Kenneth W
@ 2005-11-11 1:19 ` Robin Holt
2005-11-11 2:06 ` Chen, Kenneth W
2005-11-11 2:11 ` Robin Holt
10 siblings, 0 replies; 12+ messages in thread
From: Robin Holt @ 2005-11-11 1:19 UTC (permalink / raw)
To: linux-ia64
Here is another go. I removed the p9 check. It all fits fairly
nicely under a #ifdef.
Robin
[holt@attica:linux-2.6] quilt diff arch/ia64/kernel/ivt.S
Index: linux-2.6/arch/ia64/kernel/ivt.S
=================================--- linux-2.6.orig/arch/ia64/kernel/ivt.S 2005-11-10 16:19:31.070347396 -0600
+++ linux-2.6/arch/ia64/kernel/ivt.S 2005-11-10 19:18:08.282399235 -0600
@@ -114,7 +114,7 @@ ENTRY(vhpt_miss)
shl r21=r16,3 // shift bit 60 into sign bit
shr.u r17=r16,61 // get the region number into r17
;;
- shr r22=r21,3
+ shr.u r22=r21,3
#ifdef CONFIG_HUGETLB_PAGE
extr.u r26=r25,2,6
;;
@@ -140,20 +140,32 @@ ENTRY(vhpt_miss)
(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8
(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
- shr.u r18=r22,PMD_SHIFT // shift L2 index into position
+#ifdef CONFIG_PGTABLE_4
+ shr.u r28=r22,PUD_SHIFT // shift L2 index into position
+#endif
+ shr.u r18=r22,PMD_SHIFT // shift L3 index into position
;;
ld8 r17=[r17] // fetch the L1 entry (may be 0)
;;
(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
- dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
+#ifdef CONFIG_PGTABLE_4
+ dep r28=r28,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
;;
-(p7) ld8 r20=[r17] // fetch the L2 entry (may be 0)
- shr.u r19=r22,PAGE_SHIFT // shift L3 index into position
+(p7) ld8 r29=[r28] // fetch the L2 entry (may be 0)
;;
-(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L2 entry NULL?
- dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was L2 entry NULL?
+ dep r17=r18,r29,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+#else
+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+#endif
;;
-(p7) ld8 r18=[r21] // read the L3 PTE
+(p7) ld8 r20=[r17] // fetch the L3 entry (may be 0)
+ shr.u r19=r22,PAGE_SHIFT // shift L4 index into position
+ ;;
+(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L3 entry NULL?
+ dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L4 page table entry
+ ;;
+(p7) ld8 r18=[r21] // read the L4 PTE
mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss
;;
(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared?
@@ -192,14 +204,21 @@ ENTRY(vhpt_miss)
* between reading the pagetable and the "itc". If so, flush the entry we
* inserted and retry.
*/
- ld8 r25=[r21] // read L3 PTE again
- ld8 r26=[r17] // read L2 entry again
+ ld8 r25=[r21] // read L4 entry again
+ ld8 r26=[r17] // read L3 PTE again
+#ifdef CONFIG_PGTABLE_4
+ ld8 r18=[r28] // read L2 entry again
+#endif
+ cmp.ne p6,p7=r0,r0
;;
- cmp.ne p6,p7=r26,r20 // did L2 entry change
+ cmp.ne.or.andcm p6,p7=r26,r20 // did L3 entry change
+#ifdef CONFIG_PGTABLE_4
+ cmp.ne.or.andcm p6,p7=r29,r18 // did L2 entry change
+#endif
mov r27=PAGE_SHIFT<<2
;;
(p6) ptc.l r22,r27 // purge PTE page translation
-(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L3 PTE change
+(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L4 PTE change
;;
(p6) ptc.l r16,r27 // purge translation
#endif
@@ -432,18 +451,30 @@ ENTRY(nested_dtlb_miss)
(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8
(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
- shr.u r18=r22,PMD_SHIFT // shift L2 index into position
+#ifdef CONFIG_PGTABLE_4
+ shr.u r18=r22,PUD_SHIFT // shift L2 index into position
+#else
+ shr.u r18=r22,PMD_SHIFT // shift L3 index into position
+#endif
;;
ld8 r17=[r17] // fetch the L1 entry (may be 0)
;;
(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
;;
+#ifdef CONFIG_PGTABLE_4
(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0)
- shr.u r19=r22,PAGE_SHIFT // shift L3 index into position
+ shr.u r18=r22,PMD_SHIFT // shift L3 index into position
;;
(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL?
- dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+ ;;
+#endif
+(p7) ld8 r17=[r17] // fetch the L3 entry (may be 0)
+ shr.u r19=r22,PAGE_SHIFT // shift L4 index into position
+ ;;
+(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L3 entry NULL?
+ dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L4 page table entry
(p6) br.cond.spnt page_fault
mov b0=r30
br.sptk.many b0 // return to continuation point
On Thu, Nov 10, 2005 at 04:58:01PM -0800, Chen, Kenneth W wrote:
> Luck, Tony wrote on Thursday, November 10, 2005 3:03 PM
> > If the new code runs just as fast as the old, the only
> > possible remaining sticking point would be maintainability
> > of the code ... assembly code does not lend itself well
> > to the games we play in C code to keep the #ifdefs under
> > control. You've added six new #ifdefs to the fifteen
> > already in ivt.S ... it was already hard to read (which
> > is why I resorted to compiling and diffing the disassembly
> > to see what really changed).
>
> Perhaps, this patch for vhpt_miss handler is a bit easier for the tender
> eyes out there (including mine :-p btw, this patch has not yet been tested).
>
> The 2nd #ifdef block can be converted to predicated code, though for
> 3-level page table, it will have 2 cycle penalty. But that can be
> recuperated from better instruction scheduling with several bad dep/cmp,
> shr/cmp pair. I can do another patch to clean up instruction scheduling.
>
> - Ken
>
>
> --- ivt.S.orig 2005-11-10 10:26:26.104472218 -0800
> +++ ivt.S.ken 2005-11-10 16:47:42.577824794 -0800
> @@ -140,12 +140,28 @@
> (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8
> (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
> cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
> +#ifdef CONFIG_PGTABLE_4
> + shr.u r18=r22,PUD_SHIFT
> + cmp.eq p9,p0=r0,r0
> +#else
> shr.u r18=r22,PMD_SHIFT // shift L2 index into position
> + cmp.eq p0,p9=r0,r0
> +#endif
> ;;
> ld8 r17=[r17] // fetch the L1 entry (may be 0)
> ;;
> (p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
> +#ifdef CONFIG_PGTABLE_4
> + dep r28=r18,r17,3,(PAGE_SHIFT-3)
> + ;;
> +(p7) ld8 r29=[r28]
> + shr.u r18=r22,PMD_SHIFT
> + ;;
> + dep r17=r18,r29,3,(PAGE_SHIFT-3)
> +(p7) cmp.eq.or.andcm p6,p7=r29,r0
> +#else
> dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
> +#endif
> ;;
> (p7) ld8 r20=[r17] // fetch the L2 entry (may be 0)
> shr.u r19=r22,PAGE_SHIFT // shift L3 index into position
> @@ -194,8 +210,11 @@
> */
> ld8 r25=[r21] // read L3 PTE again
> ld8 r26=[r17] // read L2 entry again
> +(p9) ld8 r18=[r28]
> + cmp.ne p6,p7=r0,r0
> ;;
> - cmp.ne p6,p7=r26,r20 // did L2 entry change
> + cmp.ne.or.andcm p6,p7=r26,r20
> +(p9) cmp.ne.or.andcm p6,p7=r18,r28
> mov r27=PAGE_SHIFT<<2
> ;;
> (p6) ptc.l r22,r27 // purge PTE page translation
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
` (8 preceding siblings ...)
2005-11-11 1:19 ` Robin Holt
@ 2005-11-11 2:06 ` Chen, Kenneth W
2005-11-11 2:11 ` Robin Holt
10 siblings, 0 replies; 12+ messages in thread
From: Chen, Kenneth W @ 2005-11-11 2:06 UTC (permalink / raw)
To: linux-ia64
Robin Holt wrote on Thursday, November 10, 2005 5:20 PM
> Here is another go. I removed the p9 check. It all fits fairly
> nicely under a #ifdef.
Oh, p9 is introduced to remove some of the #ifdef, which is what
people are grumbling about for readability and maintainability.
I thought it's a neat idea. Apparently not so from your view?
- Ken
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch 1/1] 4-level page tables v4.
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
` (9 preceding siblings ...)
2005-11-11 2:06 ` Chen, Kenneth W
@ 2005-11-11 2:11 ` Robin Holt
10 siblings, 0 replies; 12+ messages in thread
From: Robin Holt @ 2005-11-11 2:11 UTC (permalink / raw)
To: linux-ia64
On Thu, Nov 10, 2005 at 06:06:43PM -0800, Chen, Kenneth W wrote:
> Robin Holt wrote on Thursday, November 10, 2005 5:20 PM
> > Here is another go. I removed the p9 check. It all fits fairly
> > nicely under a #ifdef.
>
> Oh, p9 is introduced to remove some of the #ifdef, which is what
> people are grumbling about for readability and maintainability.
> I thought it's a neat idea. Apparently not so from your view?
I am not sure this adds that much to readability since the setting of
p9 happens fairly early in vhpt_miss and the use of it is fairly late.
The #ifdef just makes it clearer to me. I will put it back if that
makes this more palatable.
I am more concerned about the slowdown I am seeing now. lmbench is now
showing a small slowdown on fork and your touch a lot of pages test is
considerably slower (guess approx 0.5% slower on average of 25 runs).
It used to have samples all over the spectrum but now they all tend to
be at the high side of the noise of the 3-level run.
Any suggestions?
Thanks,
Robin
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2005-11-11 2:11 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-11-10 16:19 [Patch 1/1] 4-level page tables v4 Robin Holt
2005-11-10 21:49 ` Luck, Tony
2005-11-10 22:38 ` Robin Holt
2005-11-10 23:03 ` Luck, Tony
2005-11-10 23:30 ` Chen, Kenneth W
2005-11-10 23:54 ` Chen, Kenneth W
2005-11-11 0:13 ` Chen, Kenneth W
2005-11-11 0:24 ` Jack Steiner
2005-11-11 0:58 ` Chen, Kenneth W
2005-11-11 1:19 ` Robin Holt
2005-11-11 2:06 ` Chen, Kenneth W
2005-11-11 2:11 ` Robin Holt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox