All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] MIPS: Optimize TLB Refill for Octeon/Octeon2
@ 2010-12-28  2:07 David Daney
  2010-12-28  2:07 ` [PATCH 1/2] MIPS: Add LDX and LWX instructions to uasm David Daney
  2010-12-28  2:07 ` [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs David Daney
  0 siblings, 2 replies; 10+ messages in thread
From: David Daney @ 2010-12-28  2:07 UTC (permalink / raw)
  To: linux-mips, ralf; +Cc: David Daney

Octeon and Octeon2 have scratch memory, and/or scratch registers that
allow us to save some instructions in the TLB refill handler.  Octeon2
has indexed load instructions that also can help.

The first patch adds uASM support for the indexed loads.  The second
essentially hand codes the refill handler with a view to optimally
scheduling the instructions to reduce stalls and increase the number
of dual issue slots that can be filled.

David Daney (2):
  MIPS: Add LDX and LWX instructions to uasm.
  MIPS: Optimize TLB handlers for Octeon CPUs

 arch/mips/include/asm/inst.h |   14 ++
 arch/mips/include/asm/uasm.h |    4 +
 arch/mips/mm/tlbex.c         |  361 ++++++++++++++++++++++++++++++++++++------
 arch/mips/mm/uasm.c          |    7 +-
 4 files changed, 334 insertions(+), 52 deletions(-)

-- 
1.7.2.3

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/2] MIPS: Add LDX and LWX instructions to uasm.
  2010-12-28  2:07 [PATCH 0/2] MIPS: Optimize TLB Refill for Octeon/Octeon2 David Daney
@ 2010-12-28  2:07 ` David Daney
  2010-12-28 17:11   ` Ralf Baechle
  2010-12-28  2:07 ` [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs David Daney
  1 sibling, 1 reply; 10+ messages in thread
From: David Daney @ 2010-12-28  2:07 UTC (permalink / raw)
  To: linux-mips, ralf; +Cc: David Daney

Needed by Octeon II optimized TLB handlers.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
---
 arch/mips/include/asm/inst.h |   14 ++++++++++++++
 arch/mips/include/asm/uasm.h |    4 ++++
 arch/mips/mm/uasm.c          |    7 ++++++-
 3 files changed, 24 insertions(+), 1 deletions(-)

diff --git a/arch/mips/include/asm/inst.h b/arch/mips/include/asm/inst.h
index 444ff71..7ebfc39 100644
--- a/arch/mips/include/asm/inst.h
+++ b/arch/mips/include/asm/inst.h
@@ -72,6 +72,7 @@ enum spec2_op {
 enum spec3_op {
 	ext_op, dextm_op, dextu_op, dext_op,
 	ins_op, dinsm_op, dinsu_op, dins_op,
+	lx_op = 0x0a,
 	bshfl_op = 0x20,
 	dbshfl_op = 0x24,
 	rdhwr_op = 0x3b
@@ -179,6 +180,19 @@ enum mad_func {
 };
 
 /*
+ * func field for special3 lx opcodes (Cavium Octeon).
+ */
+enum lx_func {
+	lwx_op	= 0x00,
+	lhx_op	= 0x04,
+	lbux_op	= 0x06,
+	ldx_op	= 0x08,
+	lwux_op	= 0x10,
+	lhux_op	= 0x14,
+	lbx_op	= 0x16,
+};
+
+/*
  * Damn ...  bitfields depend from byteorder :-(
  */
 #ifdef __MIPSEB__
diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index d361df3..dcbd4bb 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -119,6 +119,8 @@ Ip_u2u1msbu3(_dinsm);
 Ip_u1(_syscall);
 Ip_u1u2s3(_bbit0);
 Ip_u1u2s3(_bbit1);
+Ip_u3u1u2(_lwx);
+Ip_u3u1u2(_ldx);
 
 /* Handle labels. */
 struct uasm_label {
@@ -156,6 +158,7 @@ static inline void __uasminit uasm_l##lb(struct uasm_label **lab, u32 *addr) \
 # define UASM_i_SUBU(buf, rs, rt, rd) uasm_i_dsubu(buf, rs, rt, rd)
 # define UASM_i_LL(buf, rs, rt, off) uasm_i_lld(buf, rs, rt, off)
 # define UASM_i_SC(buf, rs, rt, off) uasm_i_scd(buf, rs, rt, off)
+# define UASM_i_LWX(buf, rs, rt, rd) uasm_i_ldx(buf, rs, rt, rd)
 #else
 # define UASM_i_LW(buf, rs, rt, off) uasm_i_lw(buf, rs, rt, off)
 # define UASM_i_SW(buf, rs, rt, off) uasm_i_sw(buf, rs, rt, off)
@@ -170,6 +173,7 @@ static inline void __uasminit uasm_l##lb(struct uasm_label **lab, u32 *addr) \
 # define UASM_i_SUBU(buf, rs, rt, rd) uasm_i_subu(buf, rs, rt, rd)
 # define UASM_i_LL(buf, rs, rt, off) uasm_i_ll(buf, rs, rt, off)
 # define UASM_i_SC(buf, rs, rt, off) uasm_i_sc(buf, rs, rt, off)
+# define UASM_i_LWX(buf, rs, rt, rd) uasm_i_lwx(buf, rs, rt, rd)
 #endif
 
 #define uasm_i_b(buf, off) uasm_i_beq(buf, 0, 0, off)
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index 99f0347..357916d 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -68,7 +68,8 @@ enum opcode {
 	insn_pref, insn_rfe, insn_sc, insn_scd, insn_sd, insn_sll,
 	insn_sra, insn_srl, insn_rotr, insn_subu, insn_sw, insn_tlbp,
 	insn_tlbr, insn_tlbwi, insn_tlbwr, insn_xor, insn_xori,
-	insn_dins, insn_dinsm, insn_syscall, insn_bbit0, insn_bbit1
+	insn_dins, insn_dinsm, insn_syscall, insn_bbit0, insn_bbit1,
+	insn_lwx, insn_ldx
 };
 
 struct insn {
@@ -146,6 +147,8 @@ static struct insn insn_table[] __uasminitdata = {
 	{ insn_syscall, M(spec_op, 0, 0, 0, 0, syscall_op), SCIMM},
 	{ insn_bbit0, M(lwc2_op, 0, 0, 0, 0, 0), RS | RT | BIMM },
 	{ insn_bbit1, M(swc2_op, 0, 0, 0, 0, 0), RS | RT | BIMM },
+	{ insn_lwx, M(spec3_op, 0, 0, 0, lwx_op, lx_op), RS | RT | RD },
+	{ insn_ldx, M(spec3_op, 0, 0, 0, ldx_op, lx_op), RS | RT | RD },
 	{ insn_invalid, 0, 0 }
 };
 
@@ -434,6 +437,8 @@ I_u2u1msb32u3(_dinsm);
 I_u1(_syscall);
 I_u1u2s3(_bbit0);
 I_u1u2s3(_bbit1);
+I_u3u1u2(_lwx)
+I_u3u1u2(_ldx)
 
 #ifdef CONFIG_CPU_CAVIUM_OCTEON
 #include <asm/octeon/octeon.h>
-- 
1.7.2.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs
  2010-12-28  2:07 [PATCH 0/2] MIPS: Optimize TLB Refill for Octeon/Octeon2 David Daney
  2010-12-28  2:07 ` [PATCH 1/2] MIPS: Add LDX and LWX instructions to uasm David Daney
@ 2010-12-28  2:07 ` David Daney
  2010-12-28 17:11   ` Ralf Baechle
  2011-01-19 19:35   ` Jonas Gorski
  1 sibling, 2 replies; 10+ messages in thread
From: David Daney @ 2010-12-28  2:07 UTC (permalink / raw)
  To: linux-mips, ralf; +Cc: David Daney

Octeon can use scratch registers in the TLB handlers.  Octeon II can
use LDX instructions.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
---
 arch/mips/mm/tlbex.c |  361 +++++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 310 insertions(+), 51 deletions(-)

diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 883cf76..083d341 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -77,6 +77,40 @@ static int use_bbit_insns(void)
 	}
 }
 
+static int use_lwx_insns(void)
+{
+	switch (current_cpu_type()) {
+	case CPU_CAVIUM_OCTEON2:
+		return 1;
+	default:
+		return 0;
+	}
+}
+#if defined(CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE) && \
+    CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE > 0
+static bool scratchpad_available(void)
+{
+	return true;
+}
+static int scratchpad_offset(int i)
+{
+	/*
+	 * CVMSEG starts at address -32768 and extends for
+	 * CAVIUM_OCTEON_CVMSEG_SIZE 128 byte cache lines.
+	 */
+	i += 1; /* Kernel use starts at the top and works down. */
+	return CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE * 128 - (8 * i) - 32768;
+}
+#else
+static bool scratchpad_available(void)
+{
+	return false;
+}
+static int scratchpad_offset(int i)
+{
+	BUG();
+}
+#endif
 /*
  * Found by experiment: At least some revisions of the 4kc throw under
  * some circumstances a machine check exception, triggered by invalid
@@ -187,7 +221,7 @@ static struct uasm_reloc relocs[128] __cpuinitdata;
 static int check_for_high_segbits __cpuinitdata;
 #endif
 
-#ifdef CONFIG_MIPS_PGD_C0_CONTEXT
+static int check_for_high_segbits __cpuinitdata;
 
 static unsigned int kscratch_used_mask __cpuinitdata;
 
@@ -208,9 +242,12 @@ static int __cpuinit allocate_kscratch(void)
 	return r;
 }
 
+static int scratch_reg __cpuinitdata;
 static int pgd_reg __cpuinitdata;
+enum vmalloc64_mode {not_refill, refill_scratch, refill_noscratch};
+
+#ifndef CONFIG_MIPS_PGD_C0_CONTEXT
 
-#else /* !CONFIG_MIPS_PGD_C0_CONTEXT*/
 /*
  * CONFIG_MIPS_PGD_C0_CONTEXT implies 64 bit and lack of pgd_current,
  * we cannot do r3000 under these circumstances.
@@ -481,21 +518,43 @@ static __cpuinit __maybe_unused void build_convert_pte_to_entrylo(u32 **p,
 static __cpuinit void build_restore_pagemask(u32 **p,
 					     struct uasm_reloc **r,
 					     unsigned int tmp,
-					     enum label_id lid)
+					     enum label_id lid,
+					     int restore_scratch)
 {
-	/* Reset default page size */
-	if (PM_DEFAULT_MASK >> 16) {
-		uasm_i_lui(p, tmp, PM_DEFAULT_MASK >> 16);
-		uasm_i_ori(p, tmp, tmp, PM_DEFAULT_MASK & 0xffff);
-		uasm_il_b(p, r, lid);
-		uasm_i_mtc0(p, tmp, C0_PAGEMASK);
-	} else if (PM_DEFAULT_MASK) {
-		uasm_i_ori(p, tmp, 0, PM_DEFAULT_MASK);
-		uasm_il_b(p, r, lid);
-		uasm_i_mtc0(p, tmp, C0_PAGEMASK);
+	if (restore_scratch) {
+		/* Reset default page size */
+		if (PM_DEFAULT_MASK >> 16) {
+			uasm_i_lui(p, tmp, PM_DEFAULT_MASK >> 16);
+			uasm_i_ori(p, tmp, tmp, PM_DEFAULT_MASK & 0xffff);
+			uasm_i_mtc0(p, tmp, C0_PAGEMASK);
+			uasm_il_b(p, r, lid);
+		} else if (PM_DEFAULT_MASK) {
+			uasm_i_ori(p, tmp, 0, PM_DEFAULT_MASK);
+			uasm_i_mtc0(p, tmp, C0_PAGEMASK);
+			uasm_il_b(p, r, lid);
+		} else {
+			uasm_i_mtc0(p, 0, C0_PAGEMASK);
+			uasm_il_b(p, r, lid);
+		}
+		if (scratch_reg > 0)
+			UASM_i_MFC0(p, 1, 31, scratch_reg);
+		else
+			UASM_i_LW(p, 1, scratchpad_offset(0), 0);
 	} else {
-		uasm_il_b(p, r, lid);
-		uasm_i_mtc0(p, 0, C0_PAGEMASK);
+		/* Reset default page size */
+		if (PM_DEFAULT_MASK >> 16) {
+			uasm_i_lui(p, tmp, PM_DEFAULT_MASK >> 16);
+			uasm_i_ori(p, tmp, tmp, PM_DEFAULT_MASK & 0xffff);
+			uasm_il_b(p, r, lid);
+			uasm_i_mtc0(p, tmp, C0_PAGEMASK);
+		} else if (PM_DEFAULT_MASK) {
+			uasm_i_ori(p, tmp, 0, PM_DEFAULT_MASK);
+			uasm_il_b(p, r, lid);
+			uasm_i_mtc0(p, tmp, C0_PAGEMASK);
+		} else {
+			uasm_il_b(p, r, lid);
+			uasm_i_mtc0(p, 0, C0_PAGEMASK);
+		}
 	}
 }
 
@@ -503,7 +562,8 @@ static __cpuinit void build_huge_tlb_write_entry(u32 **p,
 						 struct uasm_label **l,
 						 struct uasm_reloc **r,
 						 unsigned int tmp,
-						 enum tlb_write_entry wmode)
+						 enum tlb_write_entry wmode,
+						 int restore_scratch)
 {
 	/* Set huge page tlb entry size */
 	uasm_i_lui(p, tmp, PM_HUGE_MASK >> 16);
@@ -512,7 +572,7 @@ static __cpuinit void build_huge_tlb_write_entry(u32 **p,
 
 	build_tlb_write_entry(p, l, r, wmode);
 
-	build_restore_pagemask(p, r, tmp, label_leave);
+	build_restore_pagemask(p, r, tmp, label_leave, restore_scratch);
 }
 
 /*
@@ -577,7 +637,7 @@ static __cpuinit void build_huge_handler_tail(u32 **p,
 	UASM_i_SW(p, pte, 0, ptr);
 #endif
 	build_huge_update_entries(p, pte, ptr);
-	build_huge_tlb_write_entry(p, l, r, pte, tlb_indexed);
+	build_huge_tlb_write_entry(p, l, r, pte, tlb_indexed, 0);
 }
 #endif /* CONFIG_HUGETLB_PAGE */
 
@@ -674,7 +734,6 @@ build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
 #endif
 }
 
-enum vmalloc64_mode {not_refill, refill};
 /*
  * BVADDR is the faulting address, PTR is scratch.
  * PTR will hold the pgd for vmalloc.
@@ -692,7 +751,7 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
 
 	uasm_l_vmalloc(l, *p);
 
-	if (mode == refill && check_for_high_segbits) {
+	if (mode != not_refill && check_for_high_segbits) {
 		if (single_insn_swpd) {
 			uasm_il_bltz(p, r, bvaddr, label_vmalloc_done);
 			uasm_i_lui(p, ptr, uasm_rel_hi(swpd));
@@ -715,7 +774,7 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
 				uasm_i_daddiu(p, ptr, ptr, uasm_rel_lo(swpd));
 		}
 	}
-	if (mode == refill && check_for_high_segbits) {
+	if (mode != not_refill && check_for_high_segbits) {
 		uasm_l_large_segbits_fault(l, *p);
 		/*
 		 * We get here if we are an xsseg address, or if we are
@@ -731,7 +790,15 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
 		 */
 		UASM_i_LA(p, ptr, (unsigned long)tlb_do_page_fault_0);
 		uasm_i_jr(p, ptr);
-		uasm_i_nop(p);
+
+		if (mode == refill_scratch) {
+			if (scratch_reg > 0)
+				UASM_i_MFC0(p, 1, 31, scratch_reg);
+			else
+				UASM_i_LW(p, 1, scratchpad_offset(0), 0);
+		} else {
+			uasm_i_nop(p);
+		}
 	}
 }
 
@@ -888,6 +955,185 @@ static void __cpuinit build_update_entries(u32 **p, unsigned int tmp,
 #endif
 }
 
+struct mips_huge_tlb_info {
+	int huge_pte;
+	int restore_scratch;
+};
+
+static struct mips_huge_tlb_info __cpuinit
+build_fast_tlb_refill_handler (u32 **p, struct uasm_label **l,
+			       struct uasm_reloc **r, unsigned int tmp,
+			       unsigned int ptr, int c0_scratch)
+{
+	struct mips_huge_tlb_info rv;
+	unsigned int even, odd;
+	int vmalloc_branch_delay_filled = 0;
+	const int scratch = 1; /* Our extra working register */
+
+	rv.huge_pte = scratch;
+	rv.restore_scratch = 0;
+
+	if (check_for_high_segbits) {
+		UASM_i_MFC0(p, tmp, C0_BADVADDR);
+
+		if (pgd_reg != -1)
+			UASM_i_MFC0(p, ptr, 31, pgd_reg);
+		else
+			UASM_i_MFC0(p, ptr, C0_CONTEXT);
+
+		if (c0_scratch >= 0)
+			UASM_i_MTC0(p, scratch, 31, c0_scratch);
+		else
+			UASM_i_SW(p, scratch, scratchpad_offset(0), 0);
+
+		uasm_i_dsrl_safe(p, scratch, tmp,
+				 PGDIR_SHIFT + PGD_ORDER + PAGE_SHIFT - 3);
+		uasm_il_bnez(p, r, scratch, label_vmalloc);
+
+		if (pgd_reg == -1) {
+			vmalloc_branch_delay_filled = 1;
+			/* Clear lower 23 bits of context. */
+			uasm_i_dins(p, ptr, 0, 0, 23);
+		}
+	} else {
+		if (pgd_reg != -1)
+			UASM_i_MFC0(p, ptr, 31, pgd_reg);
+		else
+			UASM_i_MFC0(p, ptr, C0_CONTEXT);
+
+		UASM_i_MFC0(p, tmp, C0_BADVADDR);
+
+		if (c0_scratch >= 0)
+			UASM_i_MTC0(p, scratch, 31, c0_scratch);
+		else
+			UASM_i_SW(p, scratch, scratchpad_offset(0), 0);
+
+		if (pgd_reg == -1)
+			/* Clear lower 23 bits of context. */
+			uasm_i_dins(p, ptr, 0, 0, 23);
+
+		uasm_il_bltz(p, r, tmp, label_vmalloc);
+	}
+
+	if (pgd_reg == -1) {
+		vmalloc_branch_delay_filled = 1;
+		/* 1 0  1 0 1  << 6  xkphys cached */
+		uasm_i_ori(p, ptr, ptr, 0x540);
+		uasm_i_drotr(p, ptr, ptr, 11);
+	}
+
+#ifdef __PAGETABLE_PMD_FOLDED
+#define LOC_PTEP scratch
+#else
+#define LOC_PTEP ptr
+#endif
+
+	if (!vmalloc_branch_delay_filled)
+		/* get pgd offset in bytes */
+		uasm_i_dsrl_safe(p, scratch, tmp, PGDIR_SHIFT - 3);
+
+	uasm_l_vmalloc_done(l, *p);
+
+	/*
+	 *                         tmp          ptr
+	 * fall-through case =   badvaddr  *pgd_current
+	 * vmalloc case      =   badvaddr  swapper_pg_dir
+	 */
+
+	if (vmalloc_branch_delay_filled)
+		/* get pgd offset in bytes */
+		uasm_i_dsrl_safe(p, scratch, tmp, PGDIR_SHIFT - 3);
+
+#ifdef __PAGETABLE_PMD_FOLDED
+	GET_CONTEXT(p, tmp); /* get context reg */
+#endif
+	uasm_i_andi(p, scratch, scratch, (PTRS_PER_PGD - 1) << 3);
+
+	if (use_lwx_insns()) {
+		UASM_i_LWX(p, LOC_PTEP, scratch, ptr);
+	} else {
+		uasm_i_daddu(p, ptr, ptr, scratch); /* add in pgd offset */
+		uasm_i_ld(p, LOC_PTEP, 0, ptr); /* get pmd pointer */
+	}
+
+#ifndef __PAGETABLE_PMD_FOLDED
+	/* get pmd offset in bytes */
+	uasm_i_dsrl_safe(p, scratch, tmp, PMD_SHIFT - 3);
+	uasm_i_andi(p, scratch, scratch, (PTRS_PER_PMD - 1) << 3);
+	GET_CONTEXT(p, tmp); /* get context reg */
+
+	if (use_lwx_insns()) {
+		UASM_i_LWX(p, scratch, scratch, ptr);
+	} else {
+		uasm_i_daddu(p, ptr, ptr, scratch); /* add in pmd offset */
+		UASM_i_LW(p, scratch, 0, ptr);
+	}
+#endif
+	/* Adjust the context during the load latency. */
+	build_adjust_context(p, tmp);
+
+#ifdef CONFIG_HUGETLB_PAGE
+	uasm_il_bbit1(p, r, scratch, ilog2(_PAGE_HUGE), label_tlb_huge_update);
+	/*
+	 * In the LWX case we don't want to do the load in the
+	 * delay slot.  It cannot issue in the same cycle and may be
+	 * speculative and unneeded.
+	 */
+	if (use_lwx_insns())
+		uasm_i_nop(p);
+#endif /* CONFIG_HUGETLB_PAGE */
+
+
+	/* build_update_entries */
+	if (use_lwx_insns()) {
+		even = ptr;
+		odd = tmp;
+		UASM_i_LWX(p, even, scratch, tmp);
+		UASM_i_ADDIU(p, tmp, tmp, sizeof(pte_t));
+		UASM_i_LWX(p, odd, scratch, tmp);
+	} else {
+		UASM_i_ADDU(p, ptr, scratch, tmp); /* add in offset */
+		even = tmp;
+		odd = ptr;
+		UASM_i_LW(p, even, 0, ptr); /* get even pte */
+		UASM_i_LW(p, odd, sizeof(pte_t), ptr); /* get odd pte */
+	}
+	if (kernel_uses_smartmips_rixi) {
+		uasm_i_dsrl_safe(p, even, even, ilog2(_PAGE_NO_EXEC));
+		uasm_i_dsrl_safe(p, odd, odd, ilog2(_PAGE_NO_EXEC));
+		uasm_i_drotr(p, even, even,
+			     ilog2(_PAGE_GLOBAL) - ilog2(_PAGE_NO_EXEC));
+		UASM_i_MTC0(p, even, C0_ENTRYLO0); /* load it */
+		uasm_i_drotr(p, odd, odd,
+			     ilog2(_PAGE_GLOBAL) - ilog2(_PAGE_NO_EXEC));
+	} else {
+		uasm_i_dsrl_safe(p, even, even, ilog2(_PAGE_GLOBAL));
+		UASM_i_MTC0(p, even, C0_ENTRYLO0); /* load it */
+		uasm_i_dsrl_safe(p, odd, odd, ilog2(_PAGE_GLOBAL));
+	}
+	UASM_i_MTC0(p, odd, C0_ENTRYLO1); /* load it */
+
+	if (c0_scratch >= 0) {
+		UASM_i_MFC0(p, scratch, 31, c0_scratch);
+		build_tlb_write_entry(p, l, r, tlb_random);
+		uasm_l_leave(l, *p);
+		rv.restore_scratch = 1;
+	} else if (PAGE_SHIFT == 14 || PAGE_SHIFT == 13)  {
+		build_tlb_write_entry(p, l, r, tlb_random);
+		uasm_l_leave(l, *p);
+		UASM_i_LW(p, scratch, scratchpad_offset(0), 0);
+	} else {
+		UASM_i_LW(p, scratch, scratchpad_offset(0), 0);
+		build_tlb_write_entry(p, l, r, tlb_random);
+		uasm_l_leave(l, *p);
+		rv.restore_scratch = 1;
+	}
+
+	uasm_i_eret(p); /* return from trap */
+
+	return rv;
+}
+
 /*
  * For a 64-bit kernel, we are using the 64-bit XTLB refill exception
  * because EXL == 0.  If we wrap, we can also use the 32 instruction
@@ -903,54 +1149,67 @@ static void __cpuinit build_r4000_tlb_refill_handler(void)
 	struct uasm_reloc *r = relocs;
 	u32 *f;
 	unsigned int final_len;
+	struct mips_huge_tlb_info htlb_info;
+	enum vmalloc64_mode vmalloc_mode;
 
 	memset(tlb_handler, 0, sizeof(tlb_handler));
 	memset(labels, 0, sizeof(labels));
 	memset(relocs, 0, sizeof(relocs));
 	memset(final_handler, 0, sizeof(final_handler));
 
-	/*
-	 * create the plain linear handler
-	 */
-	if (bcm1250_m3_war()) {
-		unsigned int segbits = 44;
+	if (scratch_reg == 0)
+		scratch_reg = allocate_kscratch();
 
-		uasm_i_dmfc0(&p, K0, C0_BADVADDR);
-		uasm_i_dmfc0(&p, K1, C0_ENTRYHI);
-		uasm_i_xor(&p, K0, K0, K1);
-		uasm_i_dsrl_safe(&p, K1, K0, 62);
-		uasm_i_dsrl_safe(&p, K0, K0, 12 + 1);
-		uasm_i_dsll_safe(&p, K0, K0, 64 + 12 + 1 - segbits);
-		uasm_i_or(&p, K0, K0, K1);
-		uasm_il_bnez(&p, &r, K0, label_leave);
-		/* No need for uasm_i_nop */
-	}
+	if ((scratch_reg > 0 || scratchpad_available()) && use_bbit_insns()) {
+		htlb_info = build_fast_tlb_refill_handler(&p, &l, &r, K0, K1,
+							  scratch_reg);
+		vmalloc_mode = refill_scratch;
+	} else {
+		htlb_info.huge_pte = K0;
+		htlb_info.restore_scratch = 0;
+		vmalloc_mode = refill_noscratch;
+		/*
+		 * create the plain linear handler
+		 */
+		if (bcm1250_m3_war()) {
+			unsigned int segbits = 44;
+
+			uasm_i_dmfc0(&p, K0, C0_BADVADDR);
+			uasm_i_dmfc0(&p, K1, C0_ENTRYHI);
+			uasm_i_xor(&p, K0, K0, K1);
+			uasm_i_dsrl_safe(&p, K1, K0, 62);
+			uasm_i_dsrl_safe(&p, K0, K0, 12 + 1);
+			uasm_i_dsll_safe(&p, K0, K0, 64 + 12 + 1 - segbits);
+			uasm_i_or(&p, K0, K0, K1);
+			uasm_il_bnez(&p, &r, K0, label_leave);
+			/* No need for uasm_i_nop */
+		}
 
 #ifdef CONFIG_64BIT
-	build_get_pmde64(&p, &l, &r, K0, K1); /* get pmd in K1 */
+		build_get_pmde64(&p, &l, &r, K0, K1); /* get pmd in K1 */
 #else
-	build_get_pgde32(&p, K0, K1); /* get pgd in K1 */
+		build_get_pgde32(&p, K0, K1); /* get pgd in K1 */
 #endif
 
 #ifdef CONFIG_HUGETLB_PAGE
-	build_is_huge_pte(&p, &r, K0, K1, label_tlb_huge_update);
+		build_is_huge_pte(&p, &r, K0, K1, label_tlb_huge_update);
 #endif
 
-	build_get_ptep(&p, K0, K1);
-	build_update_entries(&p, K0, K1);
-	build_tlb_write_entry(&p, &l, &r, tlb_random);
-	uasm_l_leave(&l, p);
-	uasm_i_eret(&p); /* return from trap */
-
+		build_get_ptep(&p, K0, K1);
+		build_update_entries(&p, K0, K1);
+		build_tlb_write_entry(&p, &l, &r, tlb_random);
+		uasm_l_leave(&l, p);
+		uasm_i_eret(&p); /* return from trap */
+	}
 #ifdef CONFIG_HUGETLB_PAGE
 	uasm_l_tlb_huge_update(&l, p);
-	UASM_i_LW(&p, K0, 0, K1);
-	build_huge_update_entries(&p, K0, K1);
-	build_huge_tlb_write_entry(&p, &l, &r, K0, tlb_random);
+	build_huge_update_entries(&p, htlb_info.huge_pte, K1);
+	build_huge_tlb_write_entry(&p, &l, &r, K0, tlb_random,
+				   htlb_info.restore_scratch);
 #endif
 
 #ifdef CONFIG_64BIT
-	build_get_pgd_vmalloc64(&p, &l, &r, K0, K1, refill);
+	build_get_pgd_vmalloc64(&p, &l, &r, K0, K1, vmalloc_mode);
 #endif
 
 	/*
@@ -1616,7 +1875,7 @@ static void __cpuinit build_r4000_tlb_load_handler(void)
 		 * We clobbered C0_PAGEMASK, restore it.  On the other branch
 		 * it is restored in build_huge_tlb_write_entry.
 		 */
-		build_restore_pagemask(&p, &r, K0, label_nopage_tlbl);
+		build_restore_pagemask(&p, &r, K0, label_nopage_tlbl, 0);
 
 		uasm_l_tlbl_goaround2(&l, p);
 	}
-- 
1.7.2.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] MIPS: Add LDX and LWX instructions to uasm.
  2010-12-28  2:07 ` [PATCH 1/2] MIPS: Add LDX and LWX instructions to uasm David Daney
@ 2010-12-28 17:11   ` Ralf Baechle
  0 siblings, 0 replies; 10+ messages in thread
From: Ralf Baechle @ 2010-12-28 17:11 UTC (permalink / raw)
  To: David Daney; +Cc: linux-mips

The same uasm.h fuzz problem with this patch as previously mentioned.
I'll check on the ordering as you suggested on IRC.

Queued for 2.6.38.  Thanks,

  Ralf

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs
  2010-12-28  2:07 ` [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs David Daney
@ 2010-12-28 17:11   ` Ralf Baechle
  2011-01-19 19:35   ` Jonas Gorski
  1 sibling, 0 replies; 10+ messages in thread
From: Ralf Baechle @ 2010-12-28 17:11 UTC (permalink / raw)
  To: David Daney; +Cc: linux-mips

Queued for 2.6.38.  Thanks,

  Ralf

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs
  2010-12-28  2:07 ` [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs David Daney
  2010-12-28 17:11   ` Ralf Baechle
@ 2011-01-19 19:35   ` Jonas Gorski
  2011-01-19 19:41     ` David Daney
  1 sibling, 1 reply; 10+ messages in thread
From: Jonas Gorski @ 2011-01-19 19:35 UTC (permalink / raw)
  To: David Daney; +Cc: linux-mips, ralf

On 28/12/2010, David Daney <ddaney@caviumnetworks.com> wrote:
> +#if defined(CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE) && \
> +    CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE > 0
> (...)
> +#else
> +static bool scratchpad_available(void)
> +{
> +	return false;
> +}
> +static int scratchpad_offset(int i)
> +{
> +	BUG();
> +}
> +#endif

This seems to have broken the build for any non-octeon mips build:

  CC      arch/mips/mm/tlbex.o
cc1: warnings being treated as errors
arch/mips/mm/tlbex.c: In function 'scratchpad_offset':
arch/mips/mm/tlbex.c:112: error: no return statement in function
returning non-void

Regards,
Jonas

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs
  2011-01-19 19:35   ` Jonas Gorski
@ 2011-01-19 19:41     ` David Daney
  2011-01-19 19:46       ` Jonas Gorski
  0 siblings, 1 reply; 10+ messages in thread
From: David Daney @ 2011-01-19 19:41 UTC (permalink / raw)
  To: Jonas Gorski; +Cc: linux-mips, ralf

On 01/19/2011 11:35 AM, Jonas Gorski wrote:
> On 28/12/2010, David Daney<ddaney@caviumnetworks.com>  wrote:
>> +#if defined(CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE)&&  \
>> +    CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE>  0
>> (...)
>> +#else
>> +static bool scratchpad_available(void)
>> +{
>> +	return false;
>> +}
>> +static int scratchpad_offset(int i)
>> +{
>> +	BUG();
>> +}
>> +#endif
>
> This seems to have broken the build for any non-octeon mips build:
>
>    CC      arch/mips/mm/tlbex.o
> cc1: warnings being treated as errors
> arch/mips/mm/tlbex.c: In function 'scratchpad_offset':
> arch/mips/mm/tlbex.c:112: error: no return statement in function
> returning non-void
>

Can you tell me which version of GCC you are using?

I tested it with gcc-4.5.x; BUG() may have problems if
__builtin_unreachable() is not available.

David Daney

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs
  2011-01-19 19:41     ` David Daney
@ 2011-01-19 19:46       ` Jonas Gorski
  2011-01-19 20:05         ` David Daney
  0 siblings, 1 reply; 10+ messages in thread
From: Jonas Gorski @ 2011-01-19 19:46 UTC (permalink / raw)
  To: David Daney; +Cc: linux-mips, ralf

On 19 January 2011 20:41, David Daney <ddaney@caviumnetworks.com> wrote:
> On 01/19/2011 11:35 AM, Jonas Gorski wrote:
>>
>> On 28/12/2010, David Daney<ddaney@caviumnetworks.com>  wrote:
>>>
>>> +#if defined(CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE)&&  \
>>> +    CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE>  0
>>> (...)
>>> +#else
>>> +static bool scratchpad_available(void)
>>> +{
>>> +       return false;
>>> +}
>>> +static int scratchpad_offset(int i)
>>> +{
>>> +       BUG();
>>> +}
>>> +#endif
>>
>> This seems to have broken the build for any non-octeon mips build:
>>
>>   CC      arch/mips/mm/tlbex.o
>> cc1: warnings being treated as errors
>> arch/mips/mm/tlbex.c: In function 'scratchpad_offset':
>> arch/mips/mm/tlbex.c:112: error: no return statement in function
>> returning non-void
>>
>
> Can you tell me which version of GCC you are using?
>
> I tested it with gcc-4.5.x, BUG() may have problems if builtin_unreachable
> is not available.

That's probably it; it's 4.3.3 (with CodeSourcery extensions, the
OpenWrt default one).

Jonas

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs
  2011-01-19 19:46       ` Jonas Gorski
@ 2011-01-19 20:05         ` David Daney
  2011-01-19 20:44           ` Jonas Gorski
  0 siblings, 1 reply; 10+ messages in thread
From: David Daney @ 2011-01-19 20:05 UTC (permalink / raw)
  To: Jonas Gorski; +Cc: linux-mips, ralf

On 01/19/2011 11:46 AM, Jonas Gorski wrote:
> On 19 January 2011 20:41, David Daney<ddaney@caviumnetworks.com>  wrote:
>> On 01/19/2011 11:35 AM, Jonas Gorski wrote:
>>>
>>> On 28/12/2010, David Daney<ddaney@caviumnetworks.com>    wrote:
>>>>
>>>> +#if defined(CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE)&&    \
>>>> +    CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE>    0
>>>> (...)
>>>> +#else
>>>> +static bool scratchpad_available(void)
>>>> +{
>>>> +       return false;
>>>> +}
>>>> +static int scratchpad_offset(int i)
>>>> +{
>>>> +       BUG();
>>>> +}
>>>> +#endif
>>>
>>> This seems to have broken the build for any non-octeon mips build:
>>>
>>>    CC      arch/mips/mm/tlbex.o
>>> cc1: warnings being treated as errors
>>> arch/mips/mm/tlbex.c: In function 'scratchpad_offset':
>>> arch/mips/mm/tlbex.c:112: error: no return statement in function
>>> returning non-void
>>>
>>
>> Can you tell me which version of GCC you are using?
>>
>> I tested it with gcc-4.5.x, BUG() may have problems if builtin_unreachable
>> is not available.
>
> That's probably it, It's a 4.3.3 (with code sourcery extensions, the
> OpenWrt default one).
>

It is a bug in GCC-4.3.  The proper fix is to add 'return 0;' after that 
BUG() statement.

I will prepare a patch.

David Daney

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs
  2011-01-19 20:05         ` David Daney
@ 2011-01-19 20:44           ` Jonas Gorski
  0 siblings, 0 replies; 10+ messages in thread
From: Jonas Gorski @ 2011-01-19 20:44 UTC (permalink / raw)
  To: David Daney; +Cc: linux-mips, ralf

On 19/01/2011, David Daney <ddaney@caviumnetworks.com> wrote:
> It is a bug in GCC-4.3.  The proper fix is to add 'return 0;' after that
> BUG() statement.
>
> I will prepare a patch.

Yes, that works. Thanks for the quick response.

Jonas

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-01-19 20:45 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-12-28  2:07 [PATCH 0/2] MIPS: Optimize TLB Refill for Octeon/Octeon2 David Daney
2010-12-28  2:07 ` [PATCH 1/2] MIPS: Add LDX and LWX instructions to uasm David Daney
2010-12-28 17:11   ` Ralf Baechle
2010-12-28  2:07 ` [PATCH 2/2] MIPS: Optimize TLB handlers for Octeon CPUs David Daney
2010-12-28 17:11   ` Ralf Baechle
2011-01-19 19:35   ` Jonas Gorski
2011-01-19 19:41     ` David Daney
2011-01-19 19:46       ` Jonas Gorski
2011-01-19 20:05         ` David Daney
2011-01-19 20:44           ` Jonas Gorski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.