* [RFC] IXP4xx little-endian data-coherent support
@ 2010-03-26 0:35 Krzysztof Halasa
0 siblings, 0 replies; only message in thread
From: Krzysztof Halasa @ 2010-03-26 0:35 UTC (permalink / raw)
To: linux-arm-kernel
Hi,
I'm finally trying to eliminate the performance hit caused by the IXP4xx NPE
network engines working in big-endian mode only (which means that on an LE
system, network buffers have to be byte-swapped by the CPU).
I have already booted Linux in LE-DC mode and it generally works, and
the remaining changes needed seem trivial, though a few questions
remain.
The LE data-coherent mode on IXP4xx is achieved by setting a certain bit
(9) in the first-level page table descriptors. This means we can select
data-coherent vs value-coherent mode with a granularity of 1 MB of virtual
address space (LE DC is just hardware byte-swapping).
Options:
1. use DC mode on whole virtual address space.
Pro: simplicity
Con: using value-coherent mode for certain (most) peripherals
(registers) is faster, since there is no byte-unswapping to be done.
2. use DC mode for most devices (including RAM and PCI address space
(but not PCI controller registers)), and value-coherent mode for the
peripherals.
Pro: faster
Con: we have to provide different mappings ("memory types"?) for
different devices. It could use something like ioremap_byteswapped()
- does it make sense?
Another possibility for #2: map all those peripherals at boot (we're
already doing it except for a small QMgr region), set their page table
entry bit statically in MMU code (= value-coherent), then everything
else is DC. Much simpler, though it means the ROM area (EXP bus) has to
be mapped DC as well - perhaps an advantage (few drivers including MTD
and IDE drivers have to be modified).
That's what I personally prefer at this point.
Another question is entering the DC mode. It's only possible with the MMU
enabled, so the "boot" code has to be value-coherent. Then, just before
enabling the MMU, all active memory resources (such as the page table, the
boot loader tags, kernel image and possibly the external initramfs) have to
be byte-swapped. Possibly, the kernel and/or external initramfs can be
pre-swapped (only parts which are running with MMU - not sure if it's
practical with the kernel). I'm currently just byte-swapping the entire
RAM except for a small area in which the swapping code (and MMU-on)
resides. Guess I could use a "trampoline" in QMgr SRAM area (with a
"section" mapping for simplicity).
Comments?
I'm attaching a working patch (core only, no drivers); it isn't pretty
but I think it shows the idea. It needs another trivial patch which adds
the Kbuild option IXP4XX_SUPPORT_425A0 (LE data-coherent mode requires
an IXP425 stepping B0 or later CPU).
--
Krzysztof Halasa
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -649,6 +649,16 @@ config CPU_ENDIAN_BE32
help
Support for the BE-32 (big-endian) mode on pre-ARMv6 processors.
+config CPU_LITTLE_ENDIAN_DATA_COHERENT
+ bool "Data-coherent CPU mode"
+ depends on !CPU_BIG_ENDIAN && ARCH_IXP4XX && !IXP4XX_SUPPORT_425A0
+ help
+ Use data-coherent mode to access peripherals. This will improve
+ performance of certain Ethernet and WAN drivers, at the cost of
+ added complexity. Not very well tested.
+
+ If unsure, say "N".
+
config CPU_HIGH_VECTOR
depends on !MMU && CPU_CP15 && !CPU_ARM740T
bool "Select the High exception vector"
Perhaps MT_DEVICE_VALUE_COHERENT isn't the best name - MT_DEVICE_BYTESWAPPED?
--- a/arch/arm/include/asm/mach/map.h
+++ b/arch/arm/include/asm/mach/map.h
@@ -27,6 +27,7 @@ struct map_desc {
#define MT_MEMORY 9
#define MT_ROM 10
#define MT_MEMORY_NONCACHED 11
+#define MT_DEVICE_VALUE_COHERENT 12
#ifdef CONFIG_MMU
extern void iotable_init(struct map_desc *, int);
The following also removes PMD_BIT4; it seems XScale wants it cleared:
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -20,8 +20,15 @@
#ifdef CONFIG_MMU
+
+#if defined(CONFIG_ARCH_IXP4XX) && defined(CONFIG_CPU_LITTLE_ENDIAN_DATA_COHERENT)
+#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_DOMAIN(DOMAIN_USER) | (1 << 9))
+#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_DOMAIN(DOMAIN_KERNEL) | (1 << 9))
+#else
#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_KERNEL))
+#endif
+
/*
* Since we have only two-level page tables, these are trivial
Since our RAM is big-endian, we have to swap PCI accesses:
--- a/arch/arm/mach-ixp4xx/common-pci.c
+++ b/arch/arm/mach-ixp4xx/common-pci.c
@@ -415,6 +415,6 @@ void __init ixp4xx_pci_preinit(void)
* little-endian PCI and the big-endian AHB bus
*/
-#ifdef __ARMEB__
+#if defined(__ARMEB__) || defined(CONFIG_CPU_LITTLE_ENDIAN_DATA_COHERENT)
*PCI_CSR = PCI_CSR_IC | PCI_CSR_ABE | PCI_CSR_PDS | PCI_CSR_ADS;
#else
*PCI_CSR = PCI_CSR_IC | PCI_CSR_ABE;
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -53,24 +53,29 @@ static struct map_desc ixp4xx_io_desc[] __initdata = {
.virtual = IXP4XX_PERIPHERAL_BASE_VIRT,
.pfn = __phys_to_pfn(IXP4XX_PERIPHERAL_BASE_PHYS),
.length = IXP4XX_PERIPHERAL_REGION_SIZE,
- .type = MT_DEVICE
+ .type = MT_DEVICE_VALUE_COHERENT
}, { /* Expansion Bus Config Registers */
.virtual = IXP4XX_EXP_CFG_BASE_VIRT,
.pfn = __phys_to_pfn(IXP4XX_EXP_CFG_BASE_PHYS),
.length = IXP4XX_EXP_CFG_REGION_SIZE,
- .type = MT_DEVICE
+ .type = MT_DEVICE_VALUE_COHERENT
}, { /* PCI Registers */
.virtual = IXP4XX_PCI_CFG_BASE_VIRT,
.pfn = __phys_to_pfn(IXP4XX_PCI_CFG_BASE_PHYS),
.length = IXP4XX_PCI_CFG_REGION_SIZE,
- .type = MT_DEVICE
+ .type = MT_DEVICE_VALUE_COHERENT
+ }, { /* Queue Manager */
+ .virtual = IXP4XX_QMGR_BASE_VIRT,
+ .pfn = __phys_to_pfn(IXP4XX_QMGR_BASE_PHYS),
+ .length = IXP4XX_QMGR_REGION_SIZE,
+ .type = MT_DEVICE_VALUE_COHERENT
},
#ifdef CONFIG_DEBUG_LL
{ /* Debug UART mapping */
.virtual = IXP4XX_DEBUG_UART_BASE_VIRT,
.pfn = __phys_to_pfn(IXP4XX_DEBUG_UART_BASE_PHYS),
.length = IXP4XX_DEBUG_UART_REGION_SIZE,
- .type = MT_DEVICE
+ .type = MT_DEVICE_VALUE_COHERENT
}
#endif
};
--- a/arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h
+++ b/arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h
@@ -30,19 +30,24 @@
*
* 0x50000000 0x10000000 ioremap'd EXP BUS
*
- * 0x6000000 0x00004000 ioremap'd QMgr
+ * 0xFFA00000 -> 0xFFBFFFFF is value-preserving in little-endian mode
*
- * 0xC0000000 0x00001000 0xffbff000 PCI CFG
+ * 0x60000000 0x00004000 0xffbe7000 QMgr
+ *
+ * 0xC8000000 0x00013000 0xffbeb000 On-Chip Peripherals
*
* 0xC4000000 0x00001000 0xffbfe000 EXP CFG
*
- * 0xC8000000 0x00013000 0xffbeb000 On-Chip Peripherals
+ * 0xC0000000 0x00001000 0xffbff000 PCI CFG
+ *
+ * (this should end on 0xFEFFFFFF, only VMALLOC_END -> 0xFEFFFFFF is for platform usage)
*/
/*
* Queue Manager
*/
#define IXP4XX_QMGR_BASE_PHYS (0x60000000)
+#define IXP4XX_QMGR_BASE_VIRT (0xFFBE7000)
#define IXP4XX_QMGR_REGION_SIZE (0x00004000)
/*
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -257,6 +257,13 @@ static struct mem_type mem_types[] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
+ [MT_DEVICE_VALUE_COHERENT] = {
+ .prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
+ L_PTE_SHARED,
+ .prot_l1 = PMD_TYPE_TABLE,
+ .prot_sect = PROT_SECT_DEVICE | PMD_SECT_S,
+ .domain = DOMAIN_IO,
+ },
};
const struct mem_type *get_mem_type(unsigned int type)
@@ -315,6 +322,12 @@ static void __init build_mem_type_table(void)
for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
mem_types[i].prot_sect &= ~PMD_BIT4;
mem_types[i].prot_l1 &= ~PMD_BIT4;
+#if defined(CONFIG_ARCH_IXP4XX) && defined(CONFIG_CPU_LITTLE_ENDIAN_DATA_COHERENT)
+ if (i != MT_DEVICE_VALUE_COHERENT) {
+ mem_types[i].prot_l1 |= 1 << 9;
+ mem_types[i].prot_sect |= 1 << 9;
+ }
+#endif
}
} else if (cpu_arch < CPU_ARCH_ARMv6) {
for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index 93df472..796eb87 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -23,11 +23,13 @@
#include <linux/linkage.h>
#include <linux/init.h>
#include <asm/assembler.h>
+#include <asm/domain.h>
#include <asm/hwcap.h>
#include <asm/pgtable.h>
#include <asm/pgtable-hwdef.h>
#include <asm/page.h>
#include <asm/ptrace.h>
+#include <asm/system.h>
#include "proc-macros.S"
/*
@@ -480,6 +482,64 @@ __xscale_setup:
mcr p15, 0, ip, c7, c7, 0 @ invalidate I, D caches & BTB
mcr p15, 0, ip, c7, c10, 4 @ Drain Write (& Fill) Buffer
mcr p15, 0, ip, c8, c7, 0 @ invalidate I, D TLBs
+#ifndef __ARMEB__
+ mrc p15, 0, r0, c1, c0, 1
+ orr r0, r0, #2 @ set the page table P bit
+ mcr p15, 0, r0, c1, c0, 1
+
+#define TRAMPOLINE 0x200
+
+ /*
+ * Create identity mapping for on-chip Queue Manager SRAM to cater for
+ * the MMU enable. This identity mapping will be removed by
+ * paging_init(). We use our current program counter to determine
+ * corresponding section base address.
+ */
+#if 0
+ mov r0, #0x60000000
+ orr r0, #0x00000C00
+ orr r0, #0x0000000E
+#endif
+ mov r0, #0x00000C00
+ orr r0, #0x0000000E
+ @ add r3, r4, #0x600 << 2 @ r4 = page table address
+ str r0, [r4] @ identity mapping @ 0x60002100
+
+ adr r6, BSYM(__xscale_setup_moved)
+ @mov r7, #0x60000000
+ @orr r7, #0x00002100
+ mov r7, #TRAMPOLINE
+ mov r3, #0x100
+1: ldr r0, [r6], #4
+ str r0, [r7], #4
+ subs r3, r3, #1
+ bne 1b
+ @mov r7, #0x60000000
+ @orr r7, #0x00002100
+ mov r7, #TRAMPOLINE
+ mov pc, r7
+
+__xscale_setup_moved:
+ mov r6, #0 @ base address to swap
+2: ldr r0, [r6]
+ eor r7, r0, r0, ror #16
+ bic r7, r7, #0x00ff0000
+ mov r0, r0, ror #8
+ eor r0, r0, r7, lsr #8
+ str r0, [r6], #4
+ cmp r6, #0x200 @ end address
+ bne 2b
+
+ mov r6, #0x400 @ base address to swap
+3: ldr r0, [r6]
+ eor r7, r0, r0, ror #16
+ bic r7, r7, #0x00ff0000
+ mov r0, r0, ror #8
+ eor r0, r0, r7, lsr #8
+ str r0, [r6], #4
+ cmp r6, #64 * 1024 * 1024 @ end address
+ bne 3b
+#endif
mov r0, #1 << 6 @ cp6 for IOP3xx and Bulverde
orr r0, r0, #1 << 13 @ Its undefined whether this
mcr p15, 0, r0, c15, c1, 0 @ affects USR or SVC modes
@@ -489,7 +549,57 @@ __xscale_setup:
mrc p15, 0, r0, c1, c0, 0 @ get control register
bic r0, r0, r5
orr r0, r0, r6
+
+#ifdef __ARMEB__
mov pc, lr
+#else
+/*
+ * Setup common bits before finally enabling the MMU. Essentially
+ * this is just loading the page table pointer and domain access
+ * registers.
+ */
+#ifdef CONFIG_ALIGNMENT_TRAP
+ orr r0, r0, #CR_A
+#else
+ bic r0, r0, #CR_A
+#endif
+#ifdef CONFIG_CPU_DCACHE_DISABLE
+ bic r0, r0, #CR_C
+#endif
+#ifdef CONFIG_CPU_BPREDICT_DISABLE
+ bic r0, r0, #CR_Z
+#endif
+#ifdef CONFIG_CPU_ICACHE_DISABLE
+ bic r0, r0, #CR_I
+#endif
+ mov r5, #(domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
+ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
+ domain_val(DOMAIN_TABLE, DOMAIN_MANAGER) | \
+ domain_val(DOMAIN_IO, DOMAIN_CLIENT))
+ mcr p15, 0, r5, c3, c0, 0 @ load domain access register
+ mcr p15, 0, r4, c2, c0, 0 @ load page table pointer
+ b __xscale_turn_mmu_on
+
+/*
+ * Enable the MMU. This completely changes the structure of the visible
+ * memory space. You will not be able to trace execution through this.
+ * If you have an enquiry about this, *please* check the linux-arm-kernel
+ * mailing list archives BEFORE sending another post to the list.
+ *
+ * r0 = cp#15 control register
+ * r13 = *virtual* address to jump to upon completion
+ *
+ * other registers depend on the function called upon completion
+ */
+ .align 5
+__xscale_turn_mmu_on:
+ mov r0, r0
+ mcr p15, 0, r0, c1, c0, 0 @ write control reg
+ mrc p15, 0, r3, c0, c0, 0 @ read id reg
+ mov r3, r3
+ mov r3, r13
+ mov pc, r3
+#endif
.size __xscale_setup, . - __xscale_setup
/*
@@ -823,7 +933,7 @@ __ixp42x_proc_info:
PMD_SECT_BUFFERABLE | \
PMD_SECT_CACHEABLE | \
PMD_SECT_AP_WRITE | \
- PMD_SECT_AP_READ
+ PMD_SECT_AP_READ | (1 << 9)
.long PMD_TYPE_SECT | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2010-03-26 0:35 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-26 0:35 [RFC] IXP4xx little-endian data-coherent support Krzysztof Halasa
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.