linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
@ 2010-06-26  8:47 eric.miao at canonical.com
  2010-06-26  8:50 ` Eric Miao
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: eric.miao at canonical.com @ 2010-06-26  8:47 UTC (permalink / raw)
  To: linux-arm-kernel

From: Eric Miao <eric.miao@canonical.com>

In most cases, the delta between PHYS_OFFSET and PAGE_OFFSET is normally
16MiB aligned, which means the difference can be handled by a simple ADD
or SUB instruction with an immediate shift operand in ARM.  This will be
a bit more efficient and generic when PHYS_OFFSET becomes a run-time value.

This idea can be made generic to allow more conversions than phys_to_virt
and virt_to_phys. A stub instruction is inserted where applicable, and it
has a form of 'add rn, rd, #imm', where the lowest 8-bit of #imm is used
to identify the type of patching.  Currently, only two types are defined,
but could be expanded in my POV to definitions like __io(), __mem_pci()
and so on. A __patch_table section is introduced to include the addresses
of all these stub instructions.

There are several places for improvement:

1. constant parameters which could be optimized by the compiler now need
   one additional instruction (although the optimization is no longer
   possible once PHYS_OFFSET becomes a variable)

2. flush_cache_all() when patching is done seems brute-force, but is simple
   enough here in this patch to show a proof of concept

Anything else?

PS: the general idea comes from Nicolas Pitre, and is drafted at
    https://wiki.ubuntu.com/Specs/ARMSingleKernel

Cc: Nicolas Pitre <nicolas.pitre@canonical.com>
Signed-off-by: Eric Miao <eric.miao@canonical.com>
---
 arch/arm/Kconfig              |    4 +++
 arch/arm/include/asm/memory.h |   32 ++++++++++++++++++++++++++
 arch/arm/kernel/setup.c       |   50 +++++++++++++++++++++++++++++++++++++++++
 arch/arm/kernel/vmlinux.lds.S |    4 +++
 4 files changed, 90 insertions(+), 0 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1f254bd..0c171c2 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -189,6 +189,9 @@ config VECTORS_BASE
 	help
 	  The base address of exception vectors.
 
+config PATCH_PHYS_VIRT
+	def_bool n
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
@@ -579,6 +582,7 @@ config ARCH_PXA
 	select GENERIC_CLOCKEVENTS
 	select TICK_ONESHOT
 	select PLAT_PXA
+	select PATCH_PHYS_VIRT if !XIP_KERNEL
 	help
 	  Support for Intel/Marvell's PXA2xx/PXA3xx processor line.
 
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 4312ee5..a5f84bc 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -173,6 +173,37 @@
  */
 #define PHYS_PFN_OFFSET	(PHYS_OFFSET >> PAGE_SHIFT)
 
+#ifdef CONFIG_PATCH_PHYS_VIRT
+
+#define PATCH_TYPE_PHYS_TO_VIRT		(0)
+#define PATCH_TYPE_VIRT_TO_PHYS		(1)
+
+#define __patch_stub(from,to,type)			\
+	__asm__ __volatile__(				\
+	"1:	add	%0, %1, %2\n"			\
+	"\n"						\
+	"	.pushsection __patch_table,\"a\"\n"	\
+	"	.long	1b\n"				\
+	"	.popsection\n"				\
+	: "=r" (to)					\
+	: "r" (from), "i" (type))
+
+static inline unsigned long virt_to_phys(void *x)
+{
+	unsigned long t;
+
+	__patch_stub(x, t, PATCH_TYPE_VIRT_TO_PHYS);
+	return t;
+}
+
+static inline void *phys_to_virt(unsigned long x)
+{
+	void *t;
+
+	__patch_stub(x, t, PATCH_TYPE_PHYS_TO_VIRT);
+	return t;
+}
+#else
 /*
  * These are *only* valid on the kernel direct mapped RAM memory.
  * Note: Drivers should NOT use these.  They are the wrong
@@ -188,6 +219,7 @@ static inline void *phys_to_virt(unsigned long x)
 {
 	return (void *)(__phys_to_virt((unsigned long)(x)));
 }
+#endif
 
 /*
  * Drivers should NOT use these either.
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 122d999..d265b50 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -663,12 +663,62 @@ static int __init customize_machine(void)
 }
 arch_initcall(customize_machine);
 
+#ifdef CONFIG_PATCH_PHYS_VIRT
+
+#define PATCH_INSTR_ADD		(0x00800000)
+#define PATCH_INSTR_SUB		(0x00400000)
+
+#define PATCH_STUB_MASK		(0xffe000ff)
+#define PATCH_STUB_PHYS_TO_VIRT	(0xe2800000 | PATCH_TYPE_PHYS_TO_VIRT)
+#define PATCH_STUB_VIRT_TO_PHYS	(0xe2800000 | PATCH_TYPE_VIRT_TO_PHYS)
+
+/* patch_phys_virt - patch the stub instructions with the delta between
+ * PHYS_OFFSET and PAGE_OFFSET, which is assumed to be 16MiB aligned and
+ * can be expressed by an immediate shifter operand. The stub instruction
+ * has a form of 'add rn, rd, #imm', where the lowest 8-bit of #imm is
+ * used to identify the type of patching.
+ */
+static void __init patch_phys_virt(void)
+{
+	extern unsigned int *__patch_table_begin, *__patch_table_end;
+	unsigned int **p;
+	unsigned int imm, instr[2];
+
+	if (PHYS_OFFSET & 0x00ffffff)
+		panic("Physical memory start is not 16MiB aligned\n");
+
+	if (likely(PHYS_OFFSET < PAGE_OFFSET)) {
+		imm = 0x400 | ((PAGE_OFFSET >> 24) - (PHYS_OFFSET >> 24));
+		instr[0] = PATCH_INSTR_ADD | imm;
+		instr[1] = PATCH_INSTR_SUB | imm;
+	} else {
+		imm = 0x400 | ((PHYS_OFFSET >> 24) - (PAGE_OFFSET >> 24));
+		instr[0] = PATCH_INSTR_SUB | imm;
+		instr[1] = PATCH_INSTR_ADD | imm;
+	}
+
+	for (p = &__patch_table_begin; p < &__patch_table_end; p++) {
+		unsigned int *inptr = *p;
+
+		if ((*inptr & PATCH_STUB_MASK) == PATCH_STUB_PHYS_TO_VIRT)
+			*inptr = (*inptr & ~0x00e00fff) | instr[0];
+		if ((*inptr & PATCH_STUB_MASK) == PATCH_STUB_VIRT_TO_PHYS)
+			*inptr = (*inptr & ~0x00e00fff) | instr[1];
+	}
+	flush_cache_all();
+}
+#else
+static inline void patch_phys_virt(void) {}
+#endif
+
 void __init setup_arch(char **cmdline_p)
 {
 	struct tag *tags = (struct tag *)&init_tags;
 	struct machine_desc *mdesc;
 	char *from = default_command_line;
 
+	patch_phys_virt();
+
 	unwind_init();
 
 	setup_processor();
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index b16c079..c48c754 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -41,6 +41,10 @@ SECTIONS
 			*(.taglist.init)
 		__tagtable_end = .;
 
+		__patch_table_begin = .;
+			*(__patch_table)
+		__patch_table_end = .;
+
 		INIT_SETUP(16)
 
 		INIT_CALLS
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-06-26  8:47 [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa eric.miao at canonical.com
@ 2010-06-26  8:50 ` Eric Miao
  2010-06-26  9:54 ` Russell King - ARM Linux
  2010-07-02 15:06 ` Rob Herring
  2 siblings, 0 replies; 15+ messages in thread
From: Eric Miao @ 2010-06-26  8:50 UTC (permalink / raw)
  To: linux-arm-kernel

> +config PATCH_PHYS_VIRT
> +	def_bool n
> +
>  source "init/Kconfig"
>
>  source "kernel/Kconfig.freezer"
> @@ -579,6 +582,7 @@ config ARCH_PXA
>  	select GENERIC_CLOCKEVENTS
>  	select TICK_ONESHOT
>  	select PLAT_PXA
> +	select PATCH_PHYS_VIRT if !XIP_KERNEL

Tested on PXA, so ignore this change at the moment.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-06-26  8:47 [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa eric.miao at canonical.com
  2010-06-26  8:50 ` Eric Miao
@ 2010-06-26  9:54 ` Russell King - ARM Linux
  2010-06-26 18:22   ` Nicolas Pitre
  2010-07-02 15:06 ` Rob Herring
  2 siblings, 1 reply; 15+ messages in thread
From: Russell King - ARM Linux @ 2010-06-26  9:54 UTC (permalink / raw)
  To: linux-arm-kernel

Mostly like this patch, but there's a couple of issues with it which
can be worked around by documentation...

On Sat, Jun 26, 2010 at 04:47:05PM +0800, eric.miao at canonical.com wrote:
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 1f254bd..0c171c2 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -189,6 +189,9 @@ config VECTORS_BASE
>  	help
>  	  The base address of exception vectors.
>  
> +config PATCH_PHYS_VIRT

ARM_PATCH_PHYS_TO_VIRT might be a better name?

> +	def_bool n
> +
>  source "init/Kconfig"
>  
>  source "kernel/Kconfig.freezer"
> @@ -579,6 +582,7 @@ config ARCH_PXA
>  	select GENERIC_CLOCKEVENTS
>  	select TICK_ONESHOT
>  	select PLAT_PXA
> +	select PATCH_PHYS_VIRT if !XIP_KERNEL

It needs to be documented that PATCH_PHYS_VIRT is only for non-XIP and
non-Thumb2 kernels.  I suggest it goes in the help text for the config
option, or a comment before the option.

>  	help
>  	  Support for Intel/Marvell's PXA2xx/PXA3xx processor line.
>  
> diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
> index 4312ee5..a5f84bc 100644
> --- a/arch/arm/include/asm/memory.h
> +++ b/arch/arm/include/asm/memory.h
> @@ -173,6 +173,37 @@
>   */
>  #define PHYS_PFN_OFFSET	(PHYS_OFFSET >> PAGE_SHIFT)
>  
> +#ifdef CONFIG_PATCH_PHYS_VIRT
> +
> +#define PATCH_TYPE_PHYS_TO_VIRT		(0)
> +#define PATCH_TYPE_VIRT_TO_PHYS		(1)
> +
> +#define __patch_stub(from,to,type)			\
> +	__asm__ __volatile__(				\
> +	"1:	add	%0, %1, %2\n"			\
> +	"\n"						\
> +	"	.pushsection __patch_table,\"a\"\n"	\
> +	"	.long	1b\n"				\
> +	"	.popsection\n"				\
> +	: "=r" (to)					\
> +	: "r" (from), "i" (type))

This should be "I" not "i":

    `I'
          Integer that is valid as an immediate operand in a data
          processing instruction.  That is, an integer in the range 0
          to 255 rotated by a multiple of 2

which is what 'add' takes.  "i" means any integer of any size.

> +/* patch_phys_virt - patch the stub instructions with the delta between
> + * PHYS_OFFSET and PAGE_OFFSET, which is assumed to be 16MiB aligned and
> + * can be expressed by an immediate shifter operand. The stub instruction
> + * has a form of 'add rn, rd, #imm', where the lowest 8-bit of #imm is

'add rd, rn, #imm'

> + * used to identify the type of patching.
> + */
> +static void __init patch_phys_virt(void)
> +{
> +	extern unsigned int *__patch_table_begin, *__patch_table_end;
> +	unsigned int **p;
> +	unsigned int imm, instr[2];
> +
> +	if (PHYS_OFFSET & 0x00ffffff)
> +		panic("Physical memory start is not 16MiB aligned\n");
> +
> +	if (likely(PHYS_OFFSET < PAGE_OFFSET)) {
> +		imm = 0x400 | ((PAGE_OFFSET >> 24) - (PHYS_OFFSET >> 24));
> +		instr[0] = PATCH_INSTR_ADD | imm;
> +		instr[1] = PATCH_INSTR_SUB | imm;
> +	} else {
> +		imm = 0x400 | ((PHYS_OFFSET >> 24) - (PAGE_OFFSET >> 24));
> +		instr[0] = PATCH_INSTR_SUB | imm;
> +		instr[1] = PATCH_INSTR_ADD | imm;
> +	}
> +
> +	for (p = &__patch_table_begin; p < &__patch_table_end; p++) {
> +		unsigned int *inptr = *p;
> +
> +		if ((*inptr & PATCH_STUB_MASK) == PATCH_STUB_PHYS_TO_VIRT)
> +			*inptr = (*inptr & ~0x00e00fff) | instr[0];
> +		if ((*inptr & PATCH_STUB_MASK) == PATCH_STUB_VIRT_TO_PHYS)
> +			*inptr = (*inptr & ~0x00e00fff) | instr[1];
> +	}
> +	flush_cache_all();

That's not a good idea before the page tables are setup - there is CPU
support which needs to read data in order to writeback dirty entries in
the cache.  (eg, StrongARM CPUs - which corresponds with ebsa110,
footbridge, rpc, sa1100, and shark.)

This mapping does not exist before paging_init() has completed - which
presents a catch-22 situation here.  This also needs to be documented
against the config option.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-06-26  9:54 ` Russell King - ARM Linux
@ 2010-06-26 18:22   ` Nicolas Pitre
  0 siblings, 0 replies; 15+ messages in thread
From: Nicolas Pitre @ 2010-06-26 18:22 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, 26 Jun 2010, Russell King - ARM Linux wrote:

> It needs to be documented that PATCH_PHYS_VIRT is only for non-XIP and
> non-Thumb2 kernels.  I suggest it goes in the help text for the config
> option, or a comment before the option.

The idea is to do the same for Thumb2 as well.

Obviously this is incompatible with XIP so should be documented as such.  
And I think we should try to make this option non user selectable if 
possible, but rather make it depend on the variable phys offset and 
linear memory layout options.


Nicolas

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-06-26  8:47 [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa eric.miao at canonical.com
  2010-06-26  8:50 ` Eric Miao
  2010-06-26  9:54 ` Russell King - ARM Linux
@ 2010-07-02 15:06 ` Rob Herring
  2010-07-20 21:16   ` Russell King - ARM Linux
  2 siblings, 1 reply; 15+ messages in thread
From: Rob Herring @ 2010-07-02 15:06 UTC (permalink / raw)
  To: linux-arm-kernel

Eric,

On Sat, 2010-06-26 at 16:47 +0800, eric.miao at canonical.com wrote:
> From: Eric Miao <eric.miao@canonical.com>
> 
> In most cases, the delta between PHYS_OFFSET and PAGE_OFFSET is normally
> 16MiB aligned, which means the difference can be handled by a simple ADD
> or SUB instruction with an immediate shift operand in ARM.  This will be
> a bit more efficient and generic when PHYS_OFFSET goes run-time.
> 
> This idea can be made generic to allow conversions more than phys_to_virt
> and virt_to_phys. A stub instruction is inserted where applicable, and it
> has a form of 'add rn, rd, #imm', where the lowest 8-bit of #imm is used
> to identify the type of patching.  Currently, only two types are defined,
> but could be expanded in my POV to definitions like __io(), __mem_pci()
> and so on. A __patch_table section is introduced to include the addresses
> of all these stub instructions.
> 
> There are several places for improvement:
> 
> 1. constant parameters which can be optimized by the compiler now needs
>    one additional instruction (although the optimization is neither
>    possible when PHYS_OFFSET goes a variable)
> 
> 2. flush_cache_all() when patching is done seems to be brute but simple
>    enough here in this patch to show a proof concept
> 
> Any thing else?
> 
> PS: the general idea comes from Nicolas Pitre, and is drafted at
>     https://wiki.ubuntu.com/Specs/ARMSingleKernel
> 

Should something more generic like the x86 alternative code be done for
ARM? It's very likely we will need patching in other places like for UP
and SMP in one kernel. 

Rob

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-07-02 15:06 ` Rob Herring
@ 2010-07-20 21:16   ` Russell King - ARM Linux
  2010-08-05  8:45     ` Eric Miao
  0 siblings, 1 reply; 15+ messages in thread
From: Russell King - ARM Linux @ 2010-07-20 21:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Jul 02, 2010 at 10:06:04AM -0500, Rob Herring wrote:
> Should something more generic like the x86 alternative code be done for
> ARM? It's very likely we will need patching in other places like for UP
> and SMP in one kernel. 

I think yes, we need to - I can't see how we can sensibly combine
kernels without this.

However, the effect of this is that XIP can't use this - as the
kernel text is read-only.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-07-20 21:16   ` Russell King - ARM Linux
@ 2010-08-05  8:45     ` Eric Miao
  2010-08-05  9:45       ` Russell King - ARM Linux
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Miao @ 2010-08-05  8:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Jul 21, 2010 at 5:16 AM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Fri, Jul 02, 2010 at 10:06:04AM -0500, Rob Herring wrote:
>> Should something more generic like the x86 alternative code be done for
>> ARM? It's very likely we will need patching in other places like for UP
>> and SMP in one kernel.
>
> I think yes, we need to - I can't see how we can sensibly combine
> kernels without this.
>
> However, the effect of this is that XIP can't use this - as the
> kernel text is read-only.
>

Hi Russell,

Sorry for late update on this. I've addressed your comments as below,
and really hope this can get into this merge window, so I'm also sending
this to your patch system right now ;-)

    [ARM] Introduce patching of phys_to_virt and vice versa

    In most cases, the delta between PHYS_OFFSET and PAGE_OFFSET is normally
    16MiB aligned, which means the difference can be handled by a simple ADD
    or SUB instruction with an immediate shift operand in ARM.  This will be
    a bit more efficient and generic when PHYS_OFFSET goes run-time.

    This idea can be made generic to allow conversions more than phys_to_virt
    and virt_to_phys. A stub instruction is inserted where applicable, and it
    has a form of 'add rd, rn, #imm', where the lowest 8 bits of #imm are used
    to identify the type of patching.  Currently, only two types are defined,
    but could be expanded in my POV to definitions like __io(), __mem_pci()
    and so on. A __patch_table section is introduced to include the addresses
    of all these stub instructions.

    There are several places for improvement:

    1. constant parameters which could be optimized by the compiler now need
       one additional instruction (although the optimization is no longer
       possible once PHYS_OFFSET becomes a variable)

    2. flush_cache_all() when patching is done is brute-force but simple enough
       here, provided it's done only once during startup.

    3. thumb2 can be supported in a same way, but will leave that for future
       enhancement.

    The general idea comes from Nicolas Pitre, and is drafted at
        https://wiki.ubuntu.com/Specs/ARMSingleKernel

    Signed-off-by: Nicolas Pitre <nicolas.pitre@canonical.com>
    Signed-off-by: Eric Miao <eric.miao@canonical.com>

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 98922f7..4e87b16 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -189,6 +189,16 @@ config VECTORS_BASE
 	help
 	  The base address of exception vectors.

+config ARM_PATCH_PHYS_VIRT
+	def_bool n
+	help
+	  Note this is only for non-XIP and non-Thumb2 kernels. And there
+	  is CPU support which needs to read data in order to writeback
+	  dirty entries in the cache. (e.g. StrongARM, ebsa110, footbridge,
+	  rpc, sa1100, and shark). The mapping in the above case does not
+	  exist before paging_init() has completed. Thus this option does
+	  not support these CPUs at this moment.
+
 source "init/Kconfig"

 source "kernel/Kconfig.freezer"
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 4312ee5..a91fe15 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -173,6 +173,37 @@
  */
 #define PHYS_PFN_OFFSET	(PHYS_OFFSET >> PAGE_SHIFT)

+#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
+
+#define PATCH_TYPE_PHYS_TO_VIRT		(0)
+#define PATCH_TYPE_VIRT_TO_PHYS		(1)
+
+#define __patch_stub(from,to,type)			\
+	__asm__ __volatile__(				\
+	"1:	add	%0, %1, %2\n"			\
+	"\n"						\
+	"	.pushsection __patch_table,\"a\"\n"	\
+	"	.long	1b\n"				\
+	"	.popsection\n"				\
+	: "=r" (to)					\
+	: "r" (from), "I" (type))
+
+static inline unsigned long virt_to_phys(void *x)
+{
+	unsigned long t;
+
+	__patch_stub(x, t, PATCH_TYPE_VIRT_TO_PHYS);
+	return t;
+}
+
+static inline void *phys_to_virt(unsigned long x)
+{
+	void *t;
+
+	__patch_stub(x, t, PATCH_TYPE_PHYS_TO_VIRT);
+	return t;
+}
+#else
 /*
  * These are *only* valid on the kernel direct mapped RAM memory.
  * Note: Drivers should NOT use these.  They are the wrong
@@ -188,6 +219,7 @@ static inline void *phys_to_virt(unsigned long x)
 {
 	return (void *)(__phys_to_virt((unsigned long)(x)));
 }
+#endif

 /*
  * Drivers should NOT use these either.
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 122d999..6e16067 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -663,12 +663,62 @@ static int __init customize_machine(void)
 }
 arch_initcall(customize_machine);

+#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
+
+#define PATCH_INSTR_ADD		(0x00800000)
+#define PATCH_INSTR_SUB		(0x00400000)
+
+#define PATCH_STUB_MASK		(0xffe000ff)
+#define PATCH_STUB_PHYS_TO_VIRT	(0xe2800000 | PATCH_TYPE_PHYS_TO_VIRT)
+#define PATCH_STUB_VIRT_TO_PHYS	(0xe2800000 | PATCH_TYPE_VIRT_TO_PHYS)
+
+/* patch_phys_virt - patch the stub instructions with the delta between
+ * PHYS_OFFSET and PAGE_OFFSET, which is assumed to be 16MiB aligned and
+ * can be expressed by an immediate shifter operand. The stub instruction
+ * has a form of 'add rd, rn, #imm', where the lowest 8-bit of #imm is
+ * used to identify the type of patching.
+ */
+static void __init patch_phys_virt(void)
+{
+	extern unsigned int *__patch_table_begin, *__patch_table_end;
+	unsigned int **p;
+	unsigned int imm, instr[2];
+
+	if (PHYS_OFFSET & 0x00ffffff)
+		panic("Physical memory start is not 16MiB aligned\n");
+
+	if (likely(PHYS_OFFSET < PAGE_OFFSET)) {
+		imm = 0x400 | ((PAGE_OFFSET >> 24) - (PHYS_OFFSET >> 24));
+		instr[0] = PATCH_INSTR_ADD | imm;
+		instr[1] = PATCH_INSTR_SUB | imm;
+	} else {
+		imm = 0x400 | ((PHYS_OFFSET >> 24) - (PAGE_OFFSET >> 24));
+		instr[0] = PATCH_INSTR_SUB | imm;
+		instr[1] = PATCH_INSTR_ADD | imm;
+	}
+
+	for (p = &__patch_table_begin; p < &__patch_table_end; p++) {
+		unsigned int *inptr = *p;
+
+		if ((*inptr & PATCH_STUB_MASK) == PATCH_STUB_PHYS_TO_VIRT)
+			*inptr = (*inptr & ~0x00e00fff) | instr[0];
+		if ((*inptr & PATCH_STUB_MASK) == PATCH_STUB_VIRT_TO_PHYS)
+			*inptr = (*inptr & ~0x00e00fff) | instr[1];
+	}
+	flush_cache_all();
+}
+#else
+static inline void patch_phys_virt(void) {}
+#endif
+
 void __init setup_arch(char **cmdline_p)
 {
 	struct tag *tags = (struct tag *)&init_tags;
 	struct machine_desc *mdesc;
 	char *from = default_command_line;

+	patch_phys_virt();
+
 	unwind_init();

 	setup_processor();
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index b16c079..c48c754 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -41,6 +41,10 @@ SECTIONS
 			*(.taglist.init)
 		__tagtable_end = .;

+		__patch_table_begin = .;
+			*(__patch_table)
+		__patch_table_end = .;
+
 		INIT_SETUP(16)

 		INIT_CALLS

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-08-05  8:45     ` Eric Miao
@ 2010-08-05  9:45       ` Russell King - ARM Linux
  2010-08-05 13:01         ` Eric Miao
  2010-08-06 17:07         ` Nicolas Pitre
  0 siblings, 2 replies; 15+ messages in thread
From: Russell King - ARM Linux @ 2010-08-05  9:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Aug 05, 2010 at 04:45:25PM +0800, Eric Miao wrote:
> On Wed, Jul 21, 2010 at 5:16 AM, Russell King - ARM Linux
> <linux@arm.linux.org.uk> wrote:
> > On Fri, Jul 02, 2010 at 10:06:04AM -0500, Rob Herring wrote:
> >> Should something more generic like the x86 alternative code be done for
> >> ARM? It's very likely we will need patching in other places like for UP
> >> and SMP in one kernel.
> >
> > I think yes, we need to - I can't see how we can sensibly combine
> > kernels without this.
> >
> > However, the effect of this is that XIP can't use this - as the
> > kernel text is read-only.
> >
> 
> Hi Russell,
> 
> Sorry for late update on this. I've addressed your comments as below,
> and really hope this can get into this merge window, so I'm also sending
> this to your patch system right now ;-)

How does this affect the compiler's instruction scheduling w.r.t. previous
load instructions?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-08-05  9:45       ` Russell King - ARM Linux
@ 2010-08-05 13:01         ` Eric Miao
  2010-08-06 17:11           ` Nicolas Pitre
  2010-08-06 17:07         ` Nicolas Pitre
  1 sibling, 1 reply; 15+ messages in thread
From: Eric Miao @ 2010-08-05 13:01 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Aug 5, 2010 at 5:45 PM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Thu, Aug 05, 2010 at 04:45:25PM +0800, Eric Miao wrote:
>> On Wed, Jul 21, 2010 at 5:16 AM, Russell King - ARM Linux
>> <linux@arm.linux.org.uk> wrote:
>> > On Fri, Jul 02, 2010 at 10:06:04AM -0500, Rob Herring wrote:
>> >> Should something more generic like the x86 alternative code be done for
>> >> ARM? It's very likely we will need patching in other places like for UP
>> >> and SMP in one kernel.
>> >
>> > I think yes, we need to - I can't see how we can sensibly combine
>> > kernels without this.
>> >
>> > However, the effect of this is that XIP can't use this - as the
>> > kernel text is read-only.
>> >
>>
>> Hi Russell,
>>
>> Sorry for late update on this. I've addressed your comments as below,
>> and really hope this can get into this merge window, so I'm also sending
>> this to your patch system right now ;-)
>
> How does this affect the compiler's instruction scheduling w.r.t. previous
> load instructions?
>

I'm not really sure; is there some test or benchmark program with which
this can be measured?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-08-05  9:45       ` Russell King - ARM Linux
  2010-08-05 13:01         ` Eric Miao
@ 2010-08-06 17:07         ` Nicolas Pitre
  2010-08-08 22:24           ` Russell King - ARM Linux
  1 sibling, 1 reply; 15+ messages in thread
From: Nicolas Pitre @ 2010-08-06 17:07 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, 5 Aug 2010, Russell King - ARM Linux wrote:

> On Thu, Aug 05, 2010 at 04:45:25PM +0800, Eric Miao wrote:
> > On Wed, Jul 21, 2010 at 5:16 AM, Russell King - ARM Linux
> > <linux@arm.linux.org.uk> wrote:
> > > On Fri, Jul 02, 2010 at 10:06:04AM -0500, Rob Herring wrote:
> > >> Should something more generic like the x86 alternative code be done for
> > >> ARM? It's very likely we will need patching in other places like for UP
> > >> and SMP in one kernel.
> > >
> > > I think yes, we need to - I can't see how we can sensibly combine
> > > kernels without this.
> > >
> > > However, the effect of this is that XIP can't use this - as the
> > > kernel text is read-only.
> > >
> > 
> > Hi Russell,
> > 
> > Sorry for late update on this. I've addressed your comments as below,
> > and really hope this can get into this merge window, so I'm also sending
> > this to your patch system right now ;-)
> 
> How does this affect the compiler's instruction scheduling w.r.t. previous
> load instructions?

No idea.  That depends how gcc is considering inline asm.  Given there 
is one input operand, I suppose gcc would account for the delay slot 
before that operand is actually available.  The inline asm itself is an 
ALU instruction but there is no way to indicate that fact to gcc, and 
whether or not it uses the output operand right away won't matter in 
this case.  So all that could be missing is for gcc to schedule this ALU 
instruction into a possible delay slot for another ldr instruction or 
the like.

In any case, this can only be an improvement over the alternative which 
is to have PHYS_OFFSET in a global variable.


Nicolas

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-08-05 13:01         ` Eric Miao
@ 2010-08-06 17:11           ` Nicolas Pitre
  0 siblings, 0 replies; 15+ messages in thread
From: Nicolas Pitre @ 2010-08-06 17:11 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, 5 Aug 2010, Eric Miao wrote:

> On Thu, Aug 5, 2010 at 5:45 PM, Russell King - ARM Linux
> <linux@arm.linux.org.uk> wrote:
> > On Thu, Aug 05, 2010 at 04:45:25PM +0800, Eric Miao wrote:
> >> On Wed, Jul 21, 2010 at 5:16 AM, Russell King - ARM Linux
> >> <linux@arm.linux.org.uk> wrote:
> >> > On Fri, Jul 02, 2010 at 10:06:04AM -0500, Rob Herring wrote:
> >> >> Should something more generic like the x86 alternative code be done for
> >> >> ARM? It's very likely we will need patching in other places like for UP
> >> >> and SMP in one kernel.
> >> >
> >> > I think yes, we need to - I can't see how we can sensibly combine
> >> > kernels without this.
> >> >
> >> > However, the effect of this is that XIP can't use this - as the
> >> > kernel text is read-only.
> >> >
> >>
> >> Hi Russell,
> >>
> >> Sorry for late update on this. I've addressed your comments as below,
> >> and really hope this can get into this merge window, so I'm also sending
> >> this to your patch system right now ;-)
> >
> > How does this affect the compiler's instruction scheduling w.r.t. previous
> > load instructions?
> >
> 
> I'm not really sure, so is there some testing benchmarking program
> this can be measured?

I doubt this could be measured with conclusive results.  It would be 
best to simply perform some inspection on the generated assembly, and 
inquire gcc people for the actual scheduling decision in this case.  
That would only be informative as there isn't much we can do on our side 
anyway.


Nicolas

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-08-06 17:07         ` Nicolas Pitre
@ 2010-08-08 22:24           ` Russell King - ARM Linux
  2010-08-09 16:55             ` Nicolas Pitre
  0 siblings, 1 reply; 15+ messages in thread
From: Russell King - ARM Linux @ 2010-08-08 22:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Aug 06, 2010 at 01:07:46PM -0400, Nicolas Pitre wrote:
> No idea.  That depends how gcc is considering inline asm.  Given there 
> is one input operand, I suppose gcc would account for the delay slot 
> before that operand is actually available.

For something which could very well become by default enabled across
the board - due to the push to have a single kernel binary for lots
of significantly different platforms - it seems little research has
been carried out on this point.

While I agree that other obvious solutions would be more expensive, I
think it makes sense to have the commit which is introducing this
method include a proper evaluation, not least so that people who need
to make the decision to enable this aren't repeating the same research
that someone else has done (or, worse, just decide to enable it 'just
because' without any understanding of what the effect may be.)

So, I'd be interested in hearing whether we do see any preceding ldr
delay slot scheduling for these asm instructions.

I think we also need to consider whether the volatile is really required.
This asm() doesn't have any side effects other than its output operand,
so I suspect that the volatile may get in the way of some optimisations
(such as deleting the operation if the output is not actually used.)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-08-08 22:24           ` Russell King - ARM Linux
@ 2010-08-09 16:55             ` Nicolas Pitre
  2010-09-28 13:35               ` Eric Miao
  0 siblings, 1 reply; 15+ messages in thread
From: Nicolas Pitre @ 2010-08-09 16:55 UTC (permalink / raw)
  To: linux-arm-kernel

On Sun, 8 Aug 2010, Russell King - ARM Linux wrote:

> On Fri, Aug 06, 2010 at 01:07:46PM -0400, Nicolas Pitre wrote:
> > No idea.  That depends how gcc is considering inline asm.  Given there 
> > is one input operand, I suppose gcc would account for the delay slot 
> > before that operand is actually available.
> 
> For something which could very well become by default enabled across
> the board - due to the push to have a single kernel binary for lots
> of significantly different platforms - it seems little research has
> been carried out on this point.
> 
> While I agree that other obvious solutions would be more expensive, I
> think it makes sense to have the commit which is introducing this
> method include a proper evaluation, not least so that people who need
> to make the decision to enable this aren't repeating the same research
> that someone else has done (or, worse, just decide to enable it 'just
> because' without any understanding of what the effect may be.)

Sure, I agree.  I'll contact gcc people for a definitive answer.

> I think we also need to consider whether the volatile is really required.
> This asm() doesn't have any side effects other than its output operand,
> so I suspect that the volatile may get in the way of some optimisations
> (such as deleting the operation if the output is not actually used.)

Indeed, the volatile should go.  This is even getting in the way of 
better scheduling.


Nicolas

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-08-09 16:55             ` Nicolas Pitre
@ 2010-09-28 13:35               ` Eric Miao
  2010-09-28 17:51                 ` Nicolas Pitre
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Miao @ 2010-09-28 13:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Aug 10, 2010 at 12:55 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Sun, 8 Aug 2010, Russell King - ARM Linux wrote:
>
>> On Fri, Aug 06, 2010 at 01:07:46PM -0400, Nicolas Pitre wrote:
>> > No idea.  That depends how gcc is considering inline asm.  Given there
>> > is one input operand, I suppose gcc would account for the delay slot
>> > before that operand is actually available.
>>
>> For something which could very well become by default enabled across
>> the board - due to the push to have a single kernel binary for lots
>> of significantly different platforms - it seems little research has
>> been carried out on this point.
>>
>> While I agree that other obvious solutions would be more expensive, I
>> think it makes sense to have the commit which is introducing this
>> method include a proper evaluation, not least so that people who need
>> to make the decision to enable this aren't repeating the same research
>> that someone else has done (or, worse, just decide to enable it 'just
>> because' without any understanding of what the effect may be.)
>
> Sure, I agree.  I'll contact gcc people for a definitive answer.
>

Nico, any feedback?

>> I think we also need to consider whether the volatile is really required.
>> This asm() doesn't have any side effects other than its output operand,
>> so I suspect that the volatile may get in the way of some optimisations
>> (such as deleting the operation if the output is not actually used.)
>
> Indeed, the volatile should go.  This is even getting in the way of
> better scheduling.
>

Yep. I don't see the necessity here for the volatile. Actually, after removing
it, it resulted in better instruction scheduling.

I took example of arch/arm/mach-pxa/mioa701.c

static int mioa701_sys_suspend(struct sys_device *sysdev, pm_message_t state)
{
	int i = 0, is_bt_on;
	u32 *mem_resume_vector	= phys_to_virt(RESUME_VECTOR_ADDR);
	u32 *mem_resume_enabler = phys_to_virt(RESUME_ENABLE_ADDR);
	u32 *mem_resume_bt	= phys_to_virt(RESUME_BT_ADDR);
	u32 *mem_resume_unknown	= phys_to_virt(RESUME_UNKNOWN_ADDR);

with __volatile__ gcc gave me something like:

c004de44 <mioa701_sys_suspend>:
c004de44:	e1a0c00d 	mov	ip, sp
c004de48:	e92dd9f8 	push	{r3, r4, r5, r6, r7, r8, fp, ip, lr, pc}
c004de4c:	e24cb004 	sub	fp, ip, #4
c004de50:	e59f80d8 	ldr	r8, [pc, #216]	; c004df30 <mioa701_sys_suspend+0xec>
c004de54:	e2888000 	add	r8, r8, #0
c004de58:	e59f50d4 	ldr	r5, [pc, #212]	; c004df34 <mioa701_sys_suspend+0xf0>
c004de5c:	e2855000 	add	r5, r5, #0
c004de60:	e59f40d0 	ldr	r4, [pc, #208]	; c004df38 <mioa701_sys_suspend+0xf4>
c004de64:	e2844000 	add	r4, r4, #0
c004de68:	e59f70cc 	ldr	r7, [pc, #204]	; c004df3c <mioa701_sys_suspend+0xf8>
c004de6c:	e2877000 	add	r7, r7, #0
c004de70:	e59f30c8 	ldr	r3, [pc, #200]	; c004df40 <mioa701_sys_suspend+0xfc>
c004de74:	e3a00053 	mov	r0, #83	; 0x53


below was what generated after removing __volatile__:

c004de44 <mioa701_sys_suspend>:
c004de44:	e1a0c00d 	mov	ip, sp
c004de48:	e92dd9f8 	push	{r3, r4, r5, r6, r7, r8, fp, ip, lr, pc}
c004de4c:	e24cb004 	sub	fp, ip, #4
c004de50:	e59f30d8 	ldr	r3, [pc, #216]	; c004df30 <mioa701_sys_suspend+0xec>
c004de54:	e3a00053 	mov	r0, #83	; 0x53
c004de58:	e59f80d4 	ldr	r8, [pc, #212]	; c004df34 <mioa701_sys_suspend+0xf0>
c004de5c:	e5936008 	ldr	r6, [r3, #8]
c004de60:	e59f50d0 	ldr	r5, [pc, #208]	; c004df38 <mioa701_sys_suspend+0xf4>
c004de64:	e59f40d0 	ldr	r4, [pc, #208]	; c004df3c <mioa701_sys_suspend+0xf8>
c004de68:	e1a069a6 	lsr	r6, r6, #19
c004de6c:	e2166001 	ands	r6, r6, #1
c004de70:	13a01802 	movne	r1, #131072	; 0x20000
c004de74:	03a01801 	moveq	r1, #65536	; 0x10000
c004de78:	ebffee0a 	bl	c00496a8 <pxa2xx_mfp_set_lpm>
c004de7c:	e2888000 	add	r8, r8, #0
c004de80:	e2855000 	add	r5, r5, #0
c004de84:	e2844000 	add	r4, r4, #0
c004de88:	e59f30b0 	ldr	r3, [pc, #176]	; c004df40 <mioa701_sys_suspend+0xfc>
c004de8c:	e59f70b0 	ldr	r7, [pc, #176]	; c004df44 <mioa701_sys_suspend+0x100>
c004de90:	e2877000 	add	r7, r7, #0

Note those 'add rN, rN, #0' instructions are those stub instructions
for patching.

Does this also show something w.r.t. optimal instruction scheduling?

>
> Nicolas
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa
  2010-09-28 13:35               ` Eric Miao
@ 2010-09-28 17:51                 ` Nicolas Pitre
  0 siblings, 0 replies; 15+ messages in thread
From: Nicolas Pitre @ 2010-09-28 17:51 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 28 Sep 2010, Eric Miao wrote:

> On Tue, Aug 10, 2010 at 12:55 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > On Sun, 8 Aug 2010, Russell King - ARM Linux wrote:
> >
> >> On Fri, Aug 06, 2010 at 01:07:46PM -0400, Nicolas Pitre wrote:
> >> > No idea.  That depends how gcc is considering inline asm.  Given there
> >> > is one input operand, I suppose gcc would account for the delay slot
> >> > before that operand is actually available.
> >>
> >> For something which could very well become by default enabled across
> >> the board - due to the push to have a single kernel binary for lots
> >> of significantly different platforms - it seems little research has
> >> been carried out on this point.
> >>
> >> While I agree that other obvious solutions would be more expensive, I
> >> think it makes sense to have the commit which is introducing this
> >> method include a proper evaluation, not least so that people who need
> >> to make the decision to enable this aren't repeating the same research
> >> that someone else has done (or, worse, just decide to enable it 'just
> >> because' without any understanding of what the effect may be.)
> >
> > Sure, I agree.  I'll contact gcc people for a definitive answer.
> >
> 
> Nico, any feedback?

Yes.  As far as scheduling goes, the current gcc implementation simply 
considers any inline assembly as a pipeline flush.  This is not perfect, 
but without the volatile this is still acceptable as demonstrated by 
your comparison example, probably better than dereferencing a global 
variable.  I'll ask the Linaro toolchain group to see if it is 
possible to improve gcc for this, but I suspect the improvement would be 
marginal compared to the current results.

> >> I think we also need to consider whether the volatile is really required.
> >> This asm() doesn't have any side effects other than its output operand,
> >> so I suspect that the volatile may get in the way of some optimisations
> >> (such as deleting the operation if the output is not actually used.)
> >
> > Indeed, the volatile should go.  This is even getting in the way of
> > better scheduling.
> >
> 
> Yep. I don't see the necessity here for the volatile. Actually after removing
> it, it resulted a better instruction scheduling.

Indeed.  The volatile qualifier for inline assembly is a big hammer 
preventing any code movement around and across them.


Nicolas

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2010-09-28 17:51 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-26  8:47 [PATCH] [ARM] Introduce patching of phys_to_virt and vice versa eric.miao at canonical.com
2010-06-26  8:50 ` Eric Miao
2010-06-26  9:54 ` Russell King - ARM Linux
2010-06-26 18:22   ` Nicolas Pitre
2010-07-02 15:06 ` Rob Herring
2010-07-20 21:16   ` Russell King - ARM Linux
2010-08-05  8:45     ` Eric Miao
2010-08-05  9:45       ` Russell King - ARM Linux
2010-08-05 13:01         ` Eric Miao
2010-08-06 17:11           ` Nicolas Pitre
2010-08-06 17:07         ` Nicolas Pitre
2010-08-08 22:24           ` Russell King - ARM Linux
2010-08-09 16:55             ` Nicolas Pitre
2010-09-28 13:35               ` Eric Miao
2010-09-28 17:51                 ` Nicolas Pitre

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).