* new text patching for review
@ 2007-07-19 9:05 Andi Kleen
2007-07-19 13:38 ` Mathieu Desnoyers
` (2 more replies)
0 siblings, 3 replies; 27+ messages in thread
From: Andi Kleen @ 2007-07-19 9:05 UTC (permalink / raw)
To: mathieu.desnoyers, jbeulich, S. P. Prasanna; +Cc: linux-kernel, patches
Hallo,
I had some second thoughts about the text patching of DEBUG_RODATA kernels
using change_page_attr(). Change_page_attr is intrusive and slow and using
a separate mapping is a little more gentle. I came up with this patch.
For your review and testing pleasure.
The main quirk is that it doesn't fully follow the cross-modifying code
recommendations, but I'm not sure that's really needed.
Also I admit nop_out is quite inefficient, but that's a very slow path
and it was easier to do it this way than handle all the corner cases
explicitly.
Comments,
-Andi
x86: Fix alternatives and kprobes to remap write-protected kernel text
Signed-off-by: Andi Kleen <ak@suse.de>
---
arch/i386/kernel/alternative.c | 33 +++++++++++++++++++++++++++++++--
arch/i386/kernel/kprobes.c | 9 +++------
arch/i386/mm/init.c | 14 +++-----------
arch/x86_64/kernel/kprobes.c | 10 +++-------
arch/x86_64/mm/init.c | 10 ----------
arch/x86_64/mm/pageattr.c | 2 +-
include/asm-i386/alternative.h | 2 ++
include/asm-x86_64/alternative.h | 2 ++
include/asm-x86_64/pgtable.h | 2 ++
9 files changed, 47 insertions(+), 37 deletions(-)
Index: linux/arch/x86_64/kernel/kprobes.c
===================================================================
--- linux.orig/arch/x86_64/kernel/kprobes.c
+++ linux/arch/x86_64/kernel/kprobes.c
@@ -39,9 +39,9 @@
#include <linux/module.h>
#include <linux/kdebug.h>
-#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
+#include <asm/alternative.h>
void jprobe_return_end(void);
static void __kprobes arch_copy_kprobe(struct kprobe *p);
@@ -209,16 +209,12 @@ static void __kprobes arch_copy_kprobe(s
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
- *p->addr = BREAKPOINT_INSTRUCTION;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ text_poke(p->addr, BREAKPOINT_INSTRUCTION);
}
void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
- *p->addr = p->opcode;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ text_poke(p->addr, p->opcode);
}
void __kprobes arch_remove_kprobe(struct kprobe *p)
Index: linux/arch/i386/kernel/alternative.c
===================================================================
--- linux.orig/arch/i386/kernel/alternative.c
+++ linux/arch/i386/kernel/alternative.c
@@ -2,8 +2,12 @@
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/kprobes.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <asm/alternative.h>
#include <asm/sections.h>
+#include <asm/pgtable.h>
#ifdef CONFIG_HOTPLUG_CPU
static int smp_alt_once;
@@ -145,12 +149,15 @@ static unsigned char** find_nop_table(vo
static void nop_out(void *insns, unsigned int len)
{
unsigned char **noptable = find_nop_table();
+ int i;
while (len > 0) {
unsigned int noplen = len;
if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;
- memcpy(insns, noptable[noplen], noplen);
+ /* Very inefficient */
+ for (i = 0; i < noplen; i++)
+ text_poke(insns + i, noptable[noplen][i]);
insns += noplen;
len -= noplen;
}
@@ -202,7 +209,7 @@ static void alternatives_smp_lock(u8 **s
continue;
if (*ptr > text_end)
continue;
- **ptr = 0xf0; /* lock prefix */
+ text_poke(ptr, 0xf0); /* lock prefix */
};
}
@@ -406,3 +413,25 @@ void __init alternative_instructions(voi
apply_paravirt(__parainstructions, __parainstructions_end);
local_irq_restore(flags);
}
+
+/*
+ * RED-PEN Intel recommends stopping the other CPUs for such
+ * "cross-modifying code".
+ */
+void __kprobes text_poke(void *oaddr, u8 opcode)
+{
+ u8 *addr = oaddr;
+ if (!pte_write(*lookup_address((unsigned long)addr))) {
+ struct page *p = virt_to_page(addr);
+ addr = vmap(&p, 1, VM_MAP, PAGE_KERNEL);
+ if (!addr)
+ return;
+ addr += ((unsigned long)oaddr) % PAGE_SIZE;
+ }
+ *addr = opcode;
+ /* Not strictly needed, but can speed CPU recovery up */
+ if (cpu_has_clflush)
+ asm("clflush (%0) " :: "r" (addr) : "memory");
+ if (addr != oaddr)
+ vunmap(addr);
+}
Index: linux/arch/x86_64/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86_64/mm/pageattr.c
+++ linux/arch/x86_64/mm/pageattr.c
@@ -13,7 +13,7 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
-static inline pte_t *lookup_address(unsigned long address)
+pte_t *lookup_address(unsigned long address)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
Index: linux/include/asm-i386/alternative.h
===================================================================
--- linux.orig/include/asm-i386/alternative.h
+++ linux/include/asm-i386/alternative.h
@@ -149,4 +149,6 @@ apply_paravirt(struct paravirt_patch_sit
#define __parainstructions_end NULL
#endif
+extern void text_poke(void *addr, u8 opcode);
+
#endif /* _I386_ALTERNATIVE_H */
Index: linux/include/asm-x86_64/alternative.h
===================================================================
--- linux.orig/include/asm-x86_64/alternative.h
+++ linux/include/asm-x86_64/alternative.h
@@ -154,4 +154,6 @@ apply_paravirt(struct paravirt_patch *st
#define __parainstructions_end NULL
#endif
+extern void text_poke(void *addr, u8 opcode);
+
#endif /* _X86_64_ALTERNATIVE_H */
Index: linux/include/asm-x86_64/pgtable.h
===================================================================
--- linux.orig/include/asm-x86_64/pgtable.h
+++ linux/include/asm-x86_64/pgtable.h
@@ -403,6 +403,8 @@ extern struct list_head pgd_list;
extern int kern_addr_valid(unsigned long addr);
+pte_t *lookup_address(unsigned long addr);
+
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
remap_pfn_range(vma, vaddr, pfn, size, prot)
Index: linux/arch/i386/kernel/kprobes.c
===================================================================
--- linux.orig/arch/i386/kernel/kprobes.c
+++ linux/arch/i386/kernel/kprobes.c
@@ -35,6 +35,7 @@
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/uaccess.h>
+#include <asm/alternative.h>
void jprobe_return_end(void);
@@ -169,16 +170,12 @@ int __kprobes arch_prepare_kprobe(struct
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
- *p->addr = BREAKPOINT_INSTRUCTION;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ text_poke(p->addr, BREAKPOINT_INSTRUCTION);
}
void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
- *p->addr = p->opcode;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ text_poke(p->addr, p->opcode);
}
void __kprobes arch_remove_kprobe(struct kprobe *p)
Index: linux/arch/i386/mm/init.c
===================================================================
--- linux.orig/arch/i386/mm/init.c
+++ linux/arch/i386/mm/init.c
@@ -801,17 +801,9 @@ void mark_rodata_ro(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
-#ifndef CONFIG_KPROBES
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() <= 1)
-#endif
- {
- change_page_attr(virt_to_page(start),
- size >> PAGE_SHIFT, PAGE_KERNEL_RX);
- printk("Write protecting the kernel text: %luk\n", size >> 10);
- }
-#endif
+ change_page_attr(virt_to_page(start),
+ size >> PAGE_SHIFT, PAGE_KERNEL_RX);
+ printk("Write protecting the kernel text: %luk\n", size >> 10);
start += size;
size = (unsigned long)__end_rodata - start;
change_page_attr(virt_to_page(start),
Index: linux/arch/x86_64/mm/init.c
===================================================================
--- linux.orig/arch/x86_64/mm/init.c
+++ linux/arch/x86_64/mm/init.c
@@ -600,16 +600,6 @@ void mark_rodata_ro(void)
{
unsigned long start = (unsigned long)_stext, end;
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() > 1)
- start = (unsigned long)_etext;
-#endif
-
-#ifdef CONFIG_KPROBES
- start = (unsigned long)__start_rodata;
-#endif
-
end = (unsigned long)__end_rodata;
start = (start + PAGE_SIZE - 1) & PAGE_MASK;
end &= PAGE_MASK;
* Re: new text patching for review
@ 2007-07-19 12:25 Jan Beulich
2007-07-19 12:41 ` Andi Kleen
0 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2007-07-19 12:25 UTC (permalink / raw)
To: prasanna, mathieu.desnoyers, ak; +Cc: linux-kernel, patches
I like this approach much better, just one small note (I think I
had the same mistake in my changes initially):
>@@ -202,7 +209,7 @@ static void alternatives_smp_lock(u8 **s
> continue;
> if (*ptr > text_end)
> continue;
>- **ptr = 0xf0; /* lock prefix */
>+ text_poke(ptr, 0xf0); /* lock prefix */
> };
> }
You probably want *ptr here.
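In other words, the call in alternatives_smp_lock() would presumably become:

	text_poke(*ptr, 0xf0);	/* poke the byte *ptr points at, not the pointer itself */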
Jan
* Re: new text patching for review
2007-07-19 12:25 Jan Beulich
@ 2007-07-19 12:41 ` Andi Kleen
0 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2007-07-19 12:41 UTC (permalink / raw)
To: Jan Beulich; +Cc: prasanna, mathieu.desnoyers, linux-kernel, patches
On Thursday 19 July 2007 14:25:59 Jan Beulich wrote:
> I like this approach much better, just one small note (I think I
> had the same mistake in my changes initially):
>
> >@@ -202,7 +209,7 @@ static void alternatives_smp_lock(u8 **s
> > continue;
> > if (*ptr > text_end)
> > continue;
> >- **ptr = 0xf0; /* lock prefix */
> >+ text_poke(ptr, 0xf0); /* lock prefix */
> > };
> > }
>
> You probably want *ptr here.
Thanks fixed
-Andi
* Re: new text patching for review
2007-07-19 9:05 new text patching for review Andi Kleen
@ 2007-07-19 13:38 ` Mathieu Desnoyers
2007-07-19 13:46 ` Andi Kleen
2007-07-20 0:37 ` Zachary Amsden
2007-08-10 19:00 ` Mathieu Desnoyers
2 siblings, 1 reply; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-07-19 13:38 UTC (permalink / raw)
To: Andi Kleen; +Cc: jbeulich, S. P. Prasanna, linux-kernel, patches
Hi Andi,
* Andi Kleen (ak@suse.de) wrote:
>
> Hallo,
>
> I had some second thoughts about the text patching of DEBUG_RODATA kernels
> using change_page_attr(). Change_page_attr is intrusive and slow and using
> a separate mapping is a little more gentle. I came up with this patch.
> For your review and testing pleasure.
>
> The main quirk is that it doesn't fully follow the cross-modifying code
> recommendations, but I'm not sure that's really needed.
>
> Also I admit nop_out is quite inefficient, but that's a very slow path
> and it was easier to do it this way than handle all the corner cases
> explicitly.
>
> Comments,
>
> -Andi
>
> x86: Fix alternatives and kprobes to remap write-protected kernel text
>
> Signed-off-by: Andi Kleen <ak@suse.de>
>
> ---
> arch/i386/kernel/alternative.c | 33 +++++++++++++++++++++++++++++++--
> arch/i386/kernel/kprobes.c | 9 +++------
> arch/i386/mm/init.c | 14 +++-----------
> arch/x86_64/kernel/kprobes.c | 10 +++-------
> arch/x86_64/mm/init.c | 10 ----------
> arch/x86_64/mm/pageattr.c | 2 +-
> include/asm-i386/alternative.h | 2 ++
> include/asm-x86_64/alternative.h | 2 ++
> include/asm-x86_64/pgtable.h | 2 ++
> 9 files changed, 47 insertions(+), 37 deletions(-)
>
> Index: linux/arch/x86_64/kernel/kprobes.c
> ===================================================================
> --- linux.orig/arch/x86_64/kernel/kprobes.c
> +++ linux/arch/x86_64/kernel/kprobes.c
> @@ -39,9 +39,9 @@
> #include <linux/module.h>
> #include <linux/kdebug.h>
>
> -#include <asm/cacheflush.h>
> #include <asm/pgtable.h>
> #include <asm/uaccess.h>
> +#include <asm/alternative.h>
>
> void jprobe_return_end(void);
> static void __kprobes arch_copy_kprobe(struct kprobe *p);
> @@ -209,16 +209,12 @@ static void __kprobes arch_copy_kprobe(s
>
> void __kprobes arch_arm_kprobe(struct kprobe *p)
> {
> - *p->addr = BREAKPOINT_INSTRUCTION;
> - flush_icache_range((unsigned long) p->addr,
> - (unsigned long) p->addr + sizeof(kprobe_opcode_t));
> + text_poke(p->addr, BREAKPOINT_INSTRUCTION);
> }
>
> void __kprobes arch_disarm_kprobe(struct kprobe *p)
> {
> - *p->addr = p->opcode;
> - flush_icache_range((unsigned long) p->addr,
> - (unsigned long) p->addr + sizeof(kprobe_opcode_t));
> + text_poke(p->addr, p->opcode);
> }
>
> void __kprobes arch_remove_kprobe(struct kprobe *p)
> Index: linux/arch/i386/kernel/alternative.c
> ===================================================================
> --- linux.orig/arch/i386/kernel/alternative.c
> +++ linux/arch/i386/kernel/alternative.c
> @@ -2,8 +2,12 @@
> #include <linux/sched.h>
> #include <linux/spinlock.h>
> #include <linux/list.h>
> +#include <linux/kprobes.h>
> +#include <linux/mm.h>
> +#include <linux/vmalloc.h>
> #include <asm/alternative.h>
> #include <asm/sections.h>
> +#include <asm/pgtable.h>
>
> #ifdef CONFIG_HOTPLUG_CPU
> static int smp_alt_once;
> @@ -145,12 +149,15 @@ static unsigned char** find_nop_table(vo
> static void nop_out(void *insns, unsigned int len)
> {
> unsigned char **noptable = find_nop_table();
> + int i;
>
> while (len > 0) {
> unsigned int noplen = len;
> if (noplen > ASM_NOP_MAX)
> noplen = ASM_NOP_MAX;
> - memcpy(insns, noptable[noplen], noplen);
> + /* Very inefficient */
> + for (i = 0; i < noplen; i++)
> + text_poke(insns + i, noptable[noplen][i]);
Ewwwwwwwwwww.... you plan to run this in SMP ? So you actually go byte
by byte changing pieces of instructions non atomically and doing
non-Intel's errata friendly XMC. You are really looking for trouble
there :) Two distinct errors can occur:
1 - Invalid instruction (due to partial instruction modification)
2 - GPF (or something like it), due to Intel's errata.
I don't see why we _need_ to do the copy operation within a wrapper. I
am always frowning when I see an API that could do a simple task (and do
it well) turn into a more sophisticated interface (here: dealing with
write permissions _and_ doing the copy). It seems to me that it will
just limit the programmer's options about what code they want to change,
how, on how many bytes at a time...
> insns += noplen;
> len -= noplen;
> }
> @@ -202,7 +209,7 @@ static void alternatives_smp_lock(u8 **s
> continue;
> if (*ptr > text_end)
> continue;
> - **ptr = 0xf0; /* lock prefix */
> + text_poke(ptr, 0xf0); /* lock prefix */
> };
> }
>
> @@ -406,3 +413,25 @@ void __init alternative_instructions(voi
> apply_paravirt(__parainstructions, __parainstructions_end);
> local_irq_restore(flags);
> }
> +
> +/*
> + * RED-PEN Intel recommends stopping the other CPUs for such
> + * "cross-modifying code".
> + */
Yeah, actually, you don't want them to busy loop with interrupts
disabled during this. Even then, you can get an NMI. Dealing with this
properly at the kernel level requires a little bit more thought than
what has been given to Intel's proposed algorithm. See DJprobes and the
immediate values for that.
Note that it does not apply to the breakpoint instruction set/setting
the original byte back.
> +void __kprobes text_poke(void *oaddr, u8 opcode)
> +{
> + u8 *addr = oaddr;
> + if (!pte_write(*lookup_address((unsigned long)addr))) {
> + struct page *p = virt_to_page(addr);
> + addr = vmap(&p, 1, VM_MAP, PAGE_KERNEL);
Oh, this part is nice.. you remap the same physical page in a writable
mapping, and then remove the mapping after the operation. I like this :)
What I don't like about this particular implementation is that it does
not support "poking" more than 1 byte. In order to support this, you
would have to deal with the case where the address range spans over more
than one page.
Also, doing the copy in the same interface seems a bit awkward.
I would much prefer something like:
void *map_shadow_write(void *addr, size_t len);
(returns a pointer to the shadow writable pages, at the same page offset
as "addr")
int unmap_shadow_write(void *shadow_addr, size_t len);
(unmap the shadow pages)
Then, the in-kernel user is free to modify their pages as they like.
Since we cannot foresee each modification pattern, I think that leaving
this kind of flexibility is useful.
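For illustration, a minimal sketch of how such a shadow-write interface could be built on top of vmap()/vunmap() (map_shadow_write/unmap_shadow_write are only the names proposed above, not an existing kernel API; error handling and the exact headers are assumptions, and len is assumed small enough to span at most two pages, which holds for instruction-sized patches):

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Return a writable alias of [addr, addr + len), at the same page offset. */
static void *map_shadow_write(void *addr, size_t len)
{
	struct page *pages[2];
	unsigned long offset = (unsigned long)addr & ~PAGE_MASK;
	int nr = (offset + len > PAGE_SIZE) ? 2 : 1;
	void *vaddr;

	pages[0] = virt_to_page(addr);
	if (nr == 2)
		pages[1] = virt_to_page((char *)addr + PAGE_SIZE);

	vaddr = vmap(pages, nr, VM_MAP, PAGE_KERNEL);
	if (!vaddr)
		return NULL;
	return (char *)vaddr + offset;
}

/* Tear down the alias returned by map_shadow_write(). */
static int unmap_shadow_write(void *shadow_addr, size_t len)
{
	/* vunmap() wants the page-aligned address vmap() returned */
	vunmap((void *)((unsigned long)shadow_addr & PAGE_MASK));
	return 0;
}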
Mathieu
> + if (!addr)
> + return;
> + addr += ((unsigned long)oaddr) % PAGE_SIZE;
> + }
> + *addr = opcode;
> + /* Not strictly needed, but can speed CPU recovery up */
> + if (cpu_has_clflush)
> + asm("clflush (%0) " :: "r" (addr) : "memory");
> + if (addr != oaddr)
> + vunmap(addr);
> +}
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Re: new text patching for review
2007-07-19 13:38 ` Mathieu Desnoyers
@ 2007-07-19 13:46 ` Andi Kleen
2007-07-19 17:35 ` Mathieu Desnoyers
0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2007-07-19 13:46 UTC (permalink / raw)
To: Mathieu Desnoyers; +Cc: jbeulich, S. P. Prasanna, linux-kernel, patches
> Ewwwwwwwwwww.... you plan to run this in SMP ? So you actually go byte
> by byte changing pieces of instructions non atomically and doing
> non-Intel's errata friendly XMC. You are really looking for trouble
> there :) Two distinct errors can occur:
In this case it is ok because this only happens when transitioning
from 1 CPU to 2 CPUs or vice versa and in both cases the other CPUs
are essentially stopped.
All the other manipulations currently are single byte.
I suppose for your immediate value patches something stronger is needed,
but we can worry about that post .23.
> What I don't like about this particular implementation is that it does
> not support "poking" more than 1 byte. In order to support this, you
> would have to deal with the case where the address range spans over more
> than one page.
I considered it, but the function would have been at least twice as big
to handle all the corner cases. And for the current callers it's all fine.
> Also, doing the copy in the same interface seems a bit awkward.
Splitting it would also seem quite awkward.
>
> I would much prefer something like:
>
> void *map_shadow_write(void *addr, size_t len);
> (returns a pointer to the shadow writable pages, at the same page offset
> as "addr")
>
> int unmap_shadow_write(void *shadow_addr, size_t len);
> (unmap the shadow pages)
>
> Then, the in-kernel user is free to modify their pages as they like.
> Since we cannot foresee each modification pattern, I think that leaving
> this kind of flexibility is useful.
You could as well call vmap directly then; it's not that much
more complicated. I don't really see much value in complicating
it right now.
-Andi
* Re: new text patching for review
2007-07-19 13:46 ` Andi Kleen
@ 2007-07-19 17:35 ` Mathieu Desnoyers
2007-07-19 21:14 ` Andi Kleen
0 siblings, 1 reply; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-07-19 17:35 UTC (permalink / raw)
To: Andi Kleen; +Cc: jbeulich, S. P. Prasanna, linux-kernel, patches
* Andi Kleen (ak@suse.de) wrote:
>
> > Ewwwwwwwwwww.... you plan to run this in SMP ? So you actually go byte
> > by byte changing pieces of instructions non atomically and doing
> > non-Intel's errata friendly XMC. You are really looking for trouble
> > there :) Two distinct errors can occur:
>
> In this case it is ok because this only happens when transitioning
> from 1 CPU to 2 CPUs or vice versa and in both cases the other CPUs
> are essentially stopped.
>
I agree that it's ok with SMP, but another problem arises: it's not only
a matter of being protected from SMP access, but also a matter of
reentrancy wrt interrupt handlers.
i.e.: if, as we are patching nops non atomically, we have a non maskable
interrupt coming which calls get_cycles_sync() which uses the
alternatives for cpuid when the NOP instructions are not in a correct
state, we can end up doing an illegal instruction.
I see that IRQs are disabled in alternative_instructions(), but it does
not protect against NMIs, which could come at a very inappropriate
moment. MCE and SMIs would potentially cause the same kind of trouble.
So unless you can guarantee that any code from NMI handler won't call
basic things such as get_cycles() (nor MCE, nor SMIs), you can't insure
it won't execute an illegal instruction. Also, the option of temporarily
disabling the NMI for the duration of the update simply adds unwanted
latency to the NMI handler which could be unacceptable in some setups.
Another potential problem comes from preemption: if a thread is
preempted in the middle of the instructions you are modifying, let's say
we originally have 4 * 1 byte instructions that we replace with 1 * 4
byte instruction, a thread could iret in the middle of the new
instruction when it will be scheduled again, thus executing an illegal
instruction. This problem could be overcomed by putting the threads in
the freezer. See the djprobe project for details about this.
In the end, I think the safest solution would be to limit ourselves to
one of these 2 solutions:
- If the update can be done atomically and leaves the site in a coherent
state after each atomic modification, while keeping the same
instruction size, it could be done on the spot.
- If not, care should be taken to first do a copy of the code to modify
into a "bypass", making sure that instructions can be relocated, so
that any context that would try to execute the site during the
modification would execute a breakpoint which would execute the bypass
while we modify the instructions. Care should be taken to make sure
that threads are in the freezer while we do this. This is basically
what djprobes does. You case is only a bit simpler because you don't
worry about SMP.
> All the other manipulations currently are single byte.
>
For breakpoint, yes, this one is easy.
> I suppose for your immediate value patches something stronger is needed,
> but we can worry about that post .23.
>
> > What I don't like about this particular implementation is that it does
> > not support "poking" more than 1 byte. In order to support this, you
> > would have to deal with the case where the address range spans over more
> > than one page.
>
> I considered it, but the function would have been at least twice as big
> to handle all the corner cases. And for the current callers it's all fine.
>
> > Also, doing the copy in the same interface seems a bit awkward.
>
> Splitting it would also seem quite awkward.
>
Because of the tricks that must be done to do code modification on a
live system (as explained above), I don't think it makes sense to
provide a falsely-safe infrastructure that would allow code modification
in a non-atomic manner.
> >
> > I would much prefer something like:
> >
> > void *map_shadow_write(void *addr, size_t len);
> > (returns a pointer to the shadow writable pages, at the same page offset
> > as "addr")
> >
> > int unmap_shadow_write(void *shadow_addr, size_t len);
> > (unmap the shadow pages)
> >
> > Then, the in-kernel user is free to modify their pages as they like.
> > Since we cannot foresee each modification pattern, I think that leaving
> > this kind of flexibility is useful.
>
> You could as well call vmap directly then; it's not that much
> more complicated. I don't really see much value in complicating
> it right now.
>
Right about the direct vmap call, it is quite simple and elegant, I
guess I'll use it in my next version of kernel text lock.
Mathieu
> -Andi
>
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Re: new text patching for review
2007-07-19 21:14 ` Andi Kleen
@ 2007-07-19 20:30 ` Jeremy Fitzhardinge
2007-07-19 20:46 ` [patches] " Andi Kleen
2007-07-19 23:51 ` new text patching for review Mathieu Desnoyers
2007-07-19 23:49 ` Mathieu Desnoyers
1 sibling, 2 replies; 27+ messages in thread
From: Jeremy Fitzhardinge @ 2007-07-19 20:30 UTC (permalink / raw)
To: Andi Kleen
Cc: Mathieu Desnoyers, jbeulich, S. P. Prasanna, linux-kernel,
patches, Zachary Amsden, Chris Wright, Rusty Russell
Andi Kleen wrote:
> Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
>
>> I see that IRQs are disabled in alternative_instructions(), but it does
>> not protect against NMIs, which could come at a very inappropriate
>> moment. MCE and SMIs would potentially cause the same kind of trouble.
>>
>> So unless you can guarantee that any code from NMI handler won't call
>> basic things such as get_cycles() (nor MCE, nor SMIs), you can't insure
>> it won't execute an illegal instruction. Also, the option of temporarily
>> disabling the NMI for the duration of the update simply adds unwanted
>> latency to the NMI handler which could be unacceptable in some setups.
>>
>
> Ok it's a fair point. But how would you address it ?
>
> Even if we IPIed the other CPUs NMIs or MCEs could still happen.
>
> BTW Jeremy, have you ever considered that problem with paravirt ops
> patching?
>
I remember Zach was thinking about it when he was thinking of making vmi
a kernel module, but I don't think we discussed it with respect to the
current patching mechanism. Though he did discover that at one point
alternative_instructions() was being run with interrupts enabled, which
caused surprisingly few problems...
But, yeah, it seems like it could be a problem.
> - smp lock patching only ever changes a single byte (lock prefix) of
> a single instruction
> - kprobes only ever change a single byte
>
> For the immediate value patching it also cannot happen because
> you'll never modify multiple instructions and all immediate values
> can be changed atomically.
>
Are misaligned/cross-cache-line updates atomic?
J
* Re: [patches] new text patching for review
2007-07-19 20:30 ` Jeremy Fitzhardinge
@ 2007-07-19 20:46 ` Andi Kleen
2007-07-19 20:51 ` Jeremy Fitzhardinge
` (2 more replies)
2007-07-19 23:51 ` new text patching for review Mathieu Desnoyers
1 sibling, 3 replies; 27+ messages in thread
From: Andi Kleen @ 2007-07-19 20:46 UTC (permalink / raw)
To: patches
Cc: Jeremy Fitzhardinge, Zachary Amsden, Mathieu Desnoyers,
Rusty Russell, linux-kernel, S. P. Prasanna, Chris Wright
On Thursday 19 July 2007 22:30:12 Jeremy Fitzhardinge wrote:
> Andi Kleen wrote:
> > Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
> >
> >> I see that IRQs are disabled in alternative_instructions(), but it does
> >> not protect against NMIs, which could come at a very inappropriate
> >> moment. MCE and SMIs would potentially cause the same kind of trouble.
> >>
> >> So unless you can guarantee that any code from NMI handler won't call
> >> basic things such as get_cycles() (nor MCE, nor SMIs), you can't insure
> >> it won't execute an illegal instruction. Also, the option of temporarily
> >> disabling the NMI for the duration of the update simply adds unwanted
> >> latency to the NMI handler which could be unacceptable in some setups.
> >>
> >
> > Ok it's a fair point. But how would you address it ?
> >
> > Even if we IPIed the other CPUs NMIs or MCEs could still happen.
> >
> > BTW Jeremy, have you ever considered that problem with paravirt ops
> > patching?
> >
>
> I remember Zach was thinking about it when he was thinking of making vmi
> a kernel module, but I don't think we discussed it with respect to the
> current patching mechanism. Though he did discover that at one point
> alternative_instructions() was being run with interrupts enabled, which
> caused surprisingly few problems...
>
> But, yeah, it seems like it could be a problem.
Normally there are not that many NMIs or MCEs at boot, but it would
be still good to avoid the very rare crash by auditing the code first
[better than try to debug it on some production system later]
> > - smp lock patching only ever changes a single byte (lock prefix) of
> > a single instruction
> > - kprobes only ever change a single byte
> >
> > For the immediate value patching it also cannot happen because
> > you'll never modify multiple instructions and all immediate values
> > can be changed atomically.
> >
>
> Are misaligned/cross-cache-line updates atomic?
In theory yes, in practice there can be errata of course. There tend
to be a couple with self modifying code, especially cross modifying
(from another CPU) -- but you don't do that.
-Andi
* Re: [patches] new text patching for review
2007-07-19 20:46 ` [patches] " Andi Kleen
@ 2007-07-19 20:51 ` Jeremy Fitzhardinge
2007-07-19 21:06 ` Andi Kleen
2007-07-19 23:53 ` Mathieu Desnoyers
2007-08-20 0:55 ` Non atomic unaligned writes Mathieu Desnoyers
2 siblings, 1 reply; 27+ messages in thread
From: Jeremy Fitzhardinge @ 2007-07-19 20:51 UTC (permalink / raw)
To: Andi Kleen
Cc: patches, Zachary Amsden, Mathieu Desnoyers, Rusty Russell,
linux-kernel, S. P. Prasanna, Chris Wright
Andi Kleen wrote:
> Normally there are not that many NMIs or MCEs at boot, but it would
> be still good to avoid the very rare crash by auditing the code first
> [better than try to debug it on some production system later]
>
Auditing it for what? If we want to make patching safe against NMI/MCE,
I guess we need to make sure those handlers don't use any pvops, but
that seems unreasonable if they want to poke at MSRs and so on.
> In theory yes, in practice there can be errata of course. There tend
> to be a couple with self modifying code, especially cross modifying
> (from another CPU) -- but you don't do that.
>
No, but the pv-ops patching code should have no requirement for
atomicity at all; we shouldn't be trying to patch a live instruction stream.
J
* Re: [patches] new text patching for review
2007-07-19 20:51 ` Jeremy Fitzhardinge
@ 2007-07-19 21:06 ` Andi Kleen
2007-07-19 21:08 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2007-07-19 21:06 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: patches, Zachary Amsden, Mathieu Desnoyers, Rusty Russell,
linux-kernel, S. P. Prasanna, Chris Wright
On Thursday 19 July 2007 22:51:51 Jeremy Fitzhardinge wrote:
> Andi Kleen wrote:
> > Normally there are not that many NMIs or MCEs at boot, but it would
> > be still good to avoid the very rare crash by auditing the code first
> > [better than try to debug it on some production system later]
> >
>
> Auditing it for what? If we want to make patching safe against NMI/MCE,
> I guess we need to make sure those handlers don't use any pvops, but
> that seems unreasonable if they want to poke at MSRs and so on.
Either not use any pvops or make sure all the pvops patching is atomic
on the local CPU.
Ok you can avoid MCEs by not enabling them until after you patch (which I think
is the case currently), but it's more difficult with NMIs.
The plain NMI handler is probably auditable.
Or set a flag that makes the NMI handler just return during patching?
>
> > In theory yes, in practice there can be errata of course. There tend
> > to be a couple with self modifying code, especially cross modifying
> > (from another CPU) -- but you don't do that.
> >
>
> No, but the pv-ops patching code should have no requirement for
> atomicity at all; we shouldn't be trying to patch a live instruction stream.
Yes, but NMI could happen inbetween
-Andi
* Re: [patches] new text patching for review
2007-07-19 21:06 ` Andi Kleen
@ 2007-07-19 21:08 ` Jeremy Fitzhardinge
0 siblings, 0 replies; 27+ messages in thread
From: Jeremy Fitzhardinge @ 2007-07-19 21:08 UTC (permalink / raw)
To: Andi Kleen
Cc: patches, Zachary Amsden, Mathieu Desnoyers, Rusty Russell,
linux-kernel, S. P. Prasanna, Chris Wright
Andi Kleen wrote:
> Either not use any pvops or make sure all the pvops patching is atomic
> on the local CPU.
>
Erk, not really keen on that. Sounds complicated, unless there's a nice
general algorithm.
> Ok you can avoid MCEs by not enabling them until after you patch (which I think
> is the case currently), but it's more difficult with NMIs.
>
> The plain NMI handler is probably auditable.
> Or set a flag that makes the NMI handler just return during patching?
>
I think that's probably the simplest, if nobody will miss it.
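A rough sketch of that idea (the flag and helper names here are made up for illustration, not existing kernel code):

/* Set by the patching code, with suitable barriers, around the actual
 * text modification. */
static volatile int text_patching_in_progress;

/* Called at the very top of the NMI handler. */
static int nmi_should_be_ignored(void)
{
	/* just drop the NMI while kernel text is being patched */
	return text_patching_in_progress;
}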
J
* Re: new text patching for review
2007-07-19 17:35 ` Mathieu Desnoyers
@ 2007-07-19 21:14 ` Andi Kleen
2007-07-19 20:30 ` Jeremy Fitzhardinge
2007-07-19 23:49 ` Mathieu Desnoyers
0 siblings, 2 replies; 27+ messages in thread
From: Andi Kleen @ 2007-07-19 21:14 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: jbeulich, S. P. Prasanna, linux-kernel, patches,
Jeremy Fitzhardinge
Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
> * Andi Kleen (ak@suse.de) wrote:
> >
> > > Ewwwwwwwwwww.... you plan to run this in SMP ? So you actually go byte
> > > by byte changing pieces of instructions non atomically and doing
> > > non-Intel's errata friendly XMC. You are really looking for trouble
> > > there :) Two distinct errors can occur:
> >
> > In this case it is ok because this only happens when transitioning
> > from 1 CPU to 2 CPUs or vice versa and in both cases the other CPUs
> > are essentially stopped.
> >
>
> I agree that it's ok with SMP, but another problem arises: it's not only
> a matter of being protected from SMP access, but also a matter of
> reentrancy wrt interrupt handlers.
>
> i.e.: if, as we are patching nops non atomically, we have a non maskable
> interrupt coming which calls get_cycles_sync() which uses the
Hmm, i didn't think NMI handlers called that. e.g. nmi watchdog just
uses jiffies.
get_cycles_sync patching happens only relatively early at boot, so oprofile
cannot be running yet.
> I see that IRQs are disabled in alternative_instructions(), but it does
> not protect against NMIs, which could come at a very inappropriate
> moment. MCE and SMIs would potentially cause the same kind of trouble.
>
> So unless you can guarantee that any code from NMI handler won't call
> basic things such as get_cycles() (nor MCE, nor SMIs), you can't insure
> it won't execute an illegal instruction. Also, the option of temporarily
> disabling the NMI for the duration of the update simply adds unwanted
> latency to the NMI handler which could be unacceptable in some setups.
Ok it's a fair point. But how would you address it ?
Even if we IPIed the other CPUs NMIs or MCEs could still happen.
BTW Jeremy, have you ever considered that problem with paravirt ops
patching?
>
> Another potential problem comes from preemption: if a thread is
> preempted in the middle of the instructions you are modifying, let's say
> we originally have 4 * 1 byte instructions that we replace with 1 * 4
> byte instruction, a thread could iret in the middle of the new
> instruction when it will be scheduled again, thus executing an illegal
> instruction. This problem could be overcomed by putting the threads in
> the freezer. See the djprobe project for details about this.
This cannot happen for the current code:
- full alternative patching happen only at boot when the other CPUs
are not running
- smp lock patching only ever changes a single byte (lock prefix) of
a single instruction
- kprobes only ever change a single byte
For the immediate value patching it also cannot happen because
you'll never modify multiple instructions and all immediate values
can be changed atomically.
> In the end, I think the safest solution would be to limit ourselves to
> one of these 2 solutions:
> - If the update can be done atomically and leaves the site in a coherent
> state after each atomic modification, while keeping the same
> instruction size, it could be done on the spot.
> - If not, care should be taken to first do a copy of the code to modify
> into a "bypass", making sure that instructions can be relocated, so
> that any context that would try to execute the site during the
> modification would execute a breakpoint which would execute the bypass
> while we modify the instructions.
kprobes does that, but to write the break point it uses text_poke()
with my patch.
> Because of the tricks that must be done to do code modification on a
> live system (as explained above), I don't think it makes sense to
> provide a falsely-safe infrastructure that would allow code modification
> in a non-atomic manner.
Hmm, perhaps it would make sense to add a comment warning about
the limitations of the current text_poke. Can you supply one?
-Andi
* Re: new text patching for review
2007-07-19 21:14 ` Andi Kleen
2007-07-19 20:30 ` Jeremy Fitzhardinge
@ 2007-07-19 23:49 ` Mathieu Desnoyers
2007-07-20 1:15 ` Zachary Amsden
2007-07-20 8:28 ` Andi Kleen
1 sibling, 2 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-07-19 23:49 UTC (permalink / raw)
To: Andi Kleen
Cc: jbeulich, S. P. Prasanna, linux-kernel, patches,
Jeremy Fitzhardinge
* Andi Kleen (andi@firstfloor.org) wrote:
> Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
>
> > * Andi Kleen (ak@suse.de) wrote:
> > >
> > > > Ewwwwwwwwwww.... you plan to run this in SMP ? So you actually go byte
> > > > by byte changing pieces of instructions non atomically and doing
> > > > non-Intel's errata friendly XMC. You are really looking for trouble
> > > > there :) Two distinct errors can occur:
> > >
> > > In this case it is ok because this only happens when transitioning
> > > from 1 CPU to 2 CPUs or vice versa and in both cases the other CPUs
> > > are essentially stopped.
> > >
> >
> > I agree that it's ok with SMP, but another problem arises: it's not only
> > a matter of being protected from SMP access, but also a matter of
> > reentrancy wrt interrupt handlers.
> >
> > i.e.: if, as we are patching nops non atomically, we have a non maskable
> > interrupt coming which calls get_cycles_sync() which uses the
>
> Hmm, i didn't think NMI handlers called that. e.g. nmi watchdog just
> uses jiffies.
>
> get_cycles_sync patching happens only relatively early at boot, so oprofile
> cannot be running yet.
Actually, the nmi handler does use the get_cycles(), and also uses the
spinlock code:
arch/i386/kernel/nmi.c:
__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
...
static DEFINE_SPINLOCK(lock); /* Serialise the printks */
spin_lock(&lock);
printk("NMI backtrace for cpu %d\n", cpu);
...
spin_unlock(&lock);
If A - we change the spinlock code non-atomically, it would break.
B - printk reads the TSC to get a timestamp, it breaks: it calls
printk_clock() -> sched_clock() -> get_cycles_sync() on x86_64.
>
> > I see that IRQs are disabled in alternative_instructions(), but it does
> > not protect against NMIs, which could come at a very inappropriate
> > moment. MCE and SMIs would potentially cause the same kind of trouble.
> >
> > So unless you can guarantee that any code from NMI handler won't call
> > basic things such as get_cycles() (nor MCE, nor SMIs), you can't insure
> > it won't execute an illegal instruction. Also, the option of temporarily
> > disabling the NMI for the duration of the update simply adds unwanted
> > latency to the NMI handler which could be unacceptable in some setups.
>
> Ok it's a fair point. But how would you address it ?
>
> Even if we IPIed the other CPUs NMIs or MCEs could still happen.
>
Yeah, that's a mess. That's why I always consider patching the code
in a way that will let the NMI handler run through it in a sane manner
_while_ the code is being patched. It implies _at least_ to do the
updates atomically with atomic aligned memory writes that keeps the site
being patched in a coherent state. Using a int3-based bypass is also
required on Intel because of the erratum regarding instruction cache.
Using these techniques, I can atomically modify the execution stream. It
would be relatively simple to re-use the framework I have done in the
Immediate Values as long as no non-relocatable instructions are
involved.
> BTW Jeremy, have you ever considered that problem with paravirt ops
> patching?
>
> >
> > Another potential problem comes from preemption: if a thread is
> > preempted in the middle of the instructions you are modifying, let's say
> > we originally have 4 * 1 byte instructions that we replace with 1 * 4
> > byte instruction, a thread could iret in the middle of the new
> > instruction when it will be scheduled again, thus executing an illegal
> > instruction. This problem could be overcomed by putting the threads in
> > the freezer. See the djprobe project for details about this.
>
> This cannot happen for the current code:
> - full alternative patching happen only at boot when the other CPUs
> are not running
Should be checked if NMIs and MCEs are active at that moment.
> - smp lock patching only ever changes a single byte (lock prefix) of
> a single instruction
Yup, as long as we are just adding/removing a f0 prefix, it's clearly
atomic.
> - kprobes only ever change a single byte
>
I see the mb()/rmb()/wmb() also uses alternatives, they should be
checked for boot-time racing against NMIs and MCEs.
init/main.c:start_kernel()
parse_args() (where the nmi watchdog is enabled it seems) would probably
execute the smp-alt-boot and nmi_watchdog arguments in the order in which
they are given as kernel arguments. So I guess it could race.
the "mce" kernel argument is also parsed in parse_args(), which leads to
the same problem.
> For the immediate value patching it also cannot happen because
> you'll never modify multiple instructions and all immediate values
> can be changed atomically.
>
Exactly, I always make sure that the immediate value within the
instruction is aligned (so a 5 bytes movl must have an offset of +3
compared to a 4 bytes alignment).
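Concretely: a 5-byte "movl $imm32, %eax" is one opcode byte followed by the 4-byte immediate, so starting the instruction at an address equal to 3 (mod 4) puts the immediate itself on a 4-byte boundary, where one aligned store can replace it atomically. A minimal sketch of the update under that assumption (not code from the immediate values patch; it also assumes the text page is writable, or has been remapped writable as discussed above):

#include <linux/kernel.h>
#include <linux/types.h>

static void update_imm32(void *imm_addr, u32 new_imm)
{
	BUG_ON((unsigned long)imm_addr & 3);	/* immediate must be 4-byte aligned */
	*(volatile u32 *)imm_addr = new_imm;	/* single aligned store, atomic on x86 */
}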
>
>
> > In the end, I think the safest solution would be to limit ourselves to
> > one of these 2 solutions:
> > - If the update can be done atomically and leaves the site in a coherent
> > state after each atomic modification, while keeping the same
> > instruction size, it could be done on the spot.
> > - If not, care should be taken to first do a copy of the code to modify
> > into a "bypass", making sure that instructions can be relocated, so
> > that any context that would try to execute the site during the
> > modification would execute a breakpoint which would execute the bypass
> > while we modify the instructions.
>
> kprobes does that, but to write the break point it uses text_poke()
> with my patch.
Yes, kprobes is case 1: atomic update. And we don't even have to bother
about Intel's erratum. This one is ok. That's mainly the
alternatives/paravirt code I worry about.
>
> > Because of the tricks that must be done to do code modification on a
> > live system (as explained above), I don't think it makes sense to
> > provide a falsely-safe infrastructure that would allow code modification
> > in a non-atomic manner.
>
> Hmm, perhaps it would make sense to add a comment warning about
> the limitations of the current text_poke. Can you supply one?
>
Sure, something like:
Make sure this API is used only to modify code meeting these
requirements (those are the ones I remember off the top of my head):
Case 1) Inserting/removing a breakpoint
- Do as you please. These updates are atomic and SMP correct.
Just don't put a breakpoint in code called from the breakpoint
handler, please.
Boot time:
Case 2) Changing 1 byte of a larger instruction at boot time, splicing
it in 2 instructions or leaving it as 1 instruction
- Only one CPU must be live on the system.
- If you plan to modify code that could be executed on top of
you (NMI handlers, MCE, SMIs), make sure you do an atomic
update to a "legal" instruction, or else an illegal
instruction will be executed. Spinlocks, memory barriers and
rdtsc are examples of code executed by such handlers. The
other option is to force application of alternatives before
these interrupts are enabled. Make sure that maskable
interrupts are disabled.
Case 3) Changing 1 byte, turning 2 instructions into one at boot time
- Follow same conditions as 2.
- Make sure that no threads could have been preempted in the
instructions you are modifying.
- Study the code to make sure no jump is targeting the middle of
your new instruction.
Case 4) Changing more than 1 byte of an instruction at boot time
- Same as in 2-3 (depending if you leave it as one instruction
or splice it).
- Make sure that the bytes you are modifying are aligned and
that you do an atomic update. It may involve playing with the
instruction's alignment a bit.
Live:
Case 5) Changing one instruction live on SMP (ok wrt Intel's errata about
XMC)
- Insert a breakpoint to run a "bypass" while you play with the
code. This bypass will single-step the original instruction
out of line.
- Make sure the instructions to be copied into the bypass are
relocatable.
- Send an IPI to each CPU so they execute a synchronizing
instruction (sync_core()) before you remove your breakpoint.
Else, a CPU could see two different instructions at the same
location without ever executing the breakpoint. The
sync_core() makes sure we don't fall into Intel's XMC errata.
- Think about how you plan to teardown your bypass (especially
considering preemption within the bypass)
- Since we use a bypass, the update does not have to be atomic,
but keep in mind that without a bypass, it would have to.
i.e. the bypass should not be required for AMD processors
(AFAIK).
Case 6) Changing instructions (i.e. nops into one instruction..) live on
SMP (ok wrt Intel's errata about XMC)
- Use the same bypass technique as in case 5.
- Make sure that no thread can be preempted in the nops, as they
could iret in the middle of the new instruction. (use the
freezer)
- Make sure that no interrupt handler could iret in the modified
code (start a worker thread on each CPU, wait for them to
run.)
However, this technique is not perfect: if an interrupt
handler returns to the code being modified, then we get
preempted again, schedule the worker thread, come back to
the original thread and reexecute another interrupt handler
with an iret within the modified instructions, it could
return in the wrong spot. Putting every thread in the
freezer seems to overcome this problem.
- Don't modify code of threads that cannot be put in the freezer
(idle thread?).
- Make sure that no hypervisor could iret in the modified code
(that's hard).
- Study the code to make sure no jump is targeting the middle of
your new instruction.
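For case 5, the IPI/serialization step could look roughly like this (a sketch assuming the four-argument on_each_cpu() of this kernel era; sync_core() is the cpuid-based serializing helper mentioned above):

#include <linux/smp.h>
#include <asm/processor.h>

static void do_sync_core(void *unused)
{
	sync_core();		/* cpuid: serialize instruction fetch on this CPU */
}

/* After the new code is in place, before the breakpoint/bypass is removed: */
static void text_poke_sync_cpus(void)
{
	on_each_cpu(do_sync_core, NULL, 1, 1);	/* run on every CPU, wait */
}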
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Re: new text patching for review
2007-07-19 20:30 ` Jeremy Fitzhardinge
2007-07-19 20:46 ` [patches] " Andi Kleen
@ 2007-07-19 23:51 ` Mathieu Desnoyers
1 sibling, 0 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-07-19 23:51 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Andi Kleen, jbeulich, S. P. Prasanna, linux-kernel, patches,
Zachary Amsden, Chris Wright, Rusty Russell
* Jeremy Fitzhardinge (jeremy@goop.org) wrote:
> Andi Kleen wrote:
> > Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
> >
> >> I see that IRQs are disabled in alternative_instructions(), but it does
> >> not protect against NMIs, which could come at a very inappropriate
> >> moment. MCE and SMIs would potentially cause the same kind of trouble.
> >>
> >> So unless you can guarantee that any code from NMI handler won't call
> >> basic things such as get_cycles() (nor MCE, nor SMIs), you can't insure
> >> it won't execute an illegal instruction. Also, the option of temporarily
> >> disabling the NMI for the duration of the update simply adds unwanted
> >> latency to the NMI handler which could be unacceptable in some setups.
> >>
> >
> > Ok it's a fair point. But how would you address it ?
> >
> > Even if we IPIed the other CPUs NMIs or MCEs could still happen.
> >
> > BTW Jeremy, have you ever considered that problem with paravirt ops
> > patching?
> >
>
> I remember Zach was thinking about it when he was thinking of making vmi
> a kernel module, but I don't think we discussed it with respect to the
> current patching mechanism. Though he did discover that at one point
> alternative_instructions() was being run with interrupts enabled, which
> caused surprisingly few problems...
>
> But, yeah, it seems like it could be a problem.
>
> > - smp lock patching only ever changes a single byte (lock prefix) of
> > a single instruction
> > - kprobes only ever change a single byte
> >
> > For the immediate value patching it also cannot happen because
> > you'll never modify multiple instructions and all immediate values
> > can be changed atomically.
> >
>
> Are misaligned/cross-cache-line updates atomic?
>
I align the "immediate values" within the mov instructions on multiples
of the immediate value size so I can update it atomically.
> J
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Re: [patches] new text patching for review
2007-07-19 20:46 ` [patches] " Andi Kleen
2007-07-19 20:51 ` Jeremy Fitzhardinge
@ 2007-07-19 23:53 ` Mathieu Desnoyers
2007-08-20 0:55 ` Non atomic unaligned writes Mathieu Desnoyers
2 siblings, 0 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-07-19 23:53 UTC (permalink / raw)
To: Andi Kleen
Cc: patches, Jeremy Fitzhardinge, Zachary Amsden, Rusty Russell,
linux-kernel, S. P. Prasanna, Chris Wright
* Andi Kleen (ak@suse.de) wrote:
> On Thursday 19 July 2007 22:30:12 Jeremy Fitzhardinge wrote:
> > Andi Kleen wrote:
> > > Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
> > >
> > >> I see that IRQs are disabled in alternative_instructions(), but it does
> > >> not protect against NMIs, which could come at a very inappropriate
> > >> moment. MCE and SMIs would potentially cause the same kind of trouble.
> > >>
> > >> So unless you can guarantee that any code from NMI handler won't call
> > >> basic things such as get_cycles() (nor MCE, nor SMIs), you can't insure
> > >> it won't execute an illegal instruction. Also, the option of temporarily
> > >> disabling the NMI for the duration of the update simply adds unwanted
> > >> latency to the NMI handler which could be unacceptable in some setups.
> > >>
> > >
> > > Ok it's a fair point. But how would you address it ?
> > >
> > > Even if we IPIed the other CPUs NMIs or MCEs could still happen.
> > >
> > > BTW Jeremy, have you ever considered that problem with paravirt ops
> > > patching?
> > >
> >
> > I remember Zach was thinking about it when he was thinking of making vmi
> > a kernel module, but I don't think we discussed it with respect to the
> > current patching mechanism. Though he did discover that at one point
> > alternative_instructions() was being run with interrupts enabled, which
> > caused surprisingly few problems...
> >
> > But, yeah, it seems like it could be a problem.
>
> Normally there are not that many NMIs or MCEs at boot, but it would
> be still good to avoid the very rare crash by auditing the code first
> [better than try to debug it on some production system later]
>
> > > - smp lock patching only ever changes a single byte (lock prefix) of
> > > a single instruction
> > > - kprobes only ever change a single byte
> > >
> > > For the immediate value patching it also cannot happen because
> > > you'll never modify multiple instructions and all immediate values
> > > can be changed atomically.
> > >
> >
> > Are misaligned/cross-cache-line updates atomic?
>
> In theory yes, in practice there can be errata of course. There tend
> to be a couple with self modifying code, especially cross modifying
> (from another CPU) -- but you don't do that.
>
Hrm, changing instructions in multiple memory accesses does not seem to
be atomic to me (unaligned case).
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Re: new text patching for review
2007-07-19 9:05 new text patching for review Andi Kleen
2007-07-19 13:38 ` Mathieu Desnoyers
@ 2007-07-20 0:37 ` Zachary Amsden
2007-07-20 8:23 ` Andi Kleen
2007-08-10 19:00 ` Mathieu Desnoyers
2 siblings, 1 reply; 27+ messages in thread
From: Zachary Amsden @ 2007-07-20 0:37 UTC (permalink / raw)
To: Andi Kleen
Cc: mathieu.desnoyers, jbeulich, S. P. Prasanna, linux-kernel,
patches
Andi Kleen wrote:
> + *addr = opcode;
> + /* Not strictly needed, but can speed CPU recovery up */
> + if (cpu_has_clflush)
> + asm("clflush (%0) " :: "r" (addr) : "memory");
> + if (addr != oaddr)
> + vunmap(addr);
>
clflush should take oaddr.
If you had to remap, note that the processor does not know that the linear
address you wrote to may be aliased by another mapping in the icache. In
that case, you'll need a serializing instruction (cpuid) to
resynchronize the caches.
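Roughly what the kernel's sync_core() does, sketched here for
illustration (in text_poke() it would go after the store, before the
vunmap):

	/* CPUID is a serializing instruction: it drains prefetched and
	 * already-decoded instructions, so the CPU will re-fetch the
	 * freshly patched bytes. */
	static inline void sync_core(void)
	{
		int tmp;
		asm volatile("cpuid"
			     : "=a" (tmp)
			     : "0" (1)
			     : "ebx", "ecx", "edx", "memory");
	}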
I don't think any of these are SMP problems since either the code is
patched before AP bringup or it is single byte patching.
Zach
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: new text patching for review
2007-07-19 23:49 ` Mathieu Desnoyers
@ 2007-07-20 1:15 ` Zachary Amsden
2007-07-20 7:37 ` Andi Kleen
2007-07-20 15:17 ` Mathieu Desnoyers
2007-07-20 8:28 ` Andi Kleen
1 sibling, 2 replies; 27+ messages in thread
From: Zachary Amsden @ 2007-07-20 1:15 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Andi Kleen, jbeulich, S. P. Prasanna, linux-kernel, patches,
Jeremy Fitzhardinge
Mathieu Desnoyers wrote:
> Yes, kprobes is case 1: atomic update. And we don't even have to bother
> about Intel's erratum. This one is ok. That's mainly the
> alternatives/paravirt code I worry about.
>
Paravirt and alternatives should all be ok because they are done before
SMP bringup and with NMIs disabled. NMI watchdog is not setup until
smp_prepare_cpus/check_nmi_watchdog, which happens way later, not during
parse_args/setup_nmi_watchdog, which just decides which type of watchdog
to setup.
I originally considered the NMI problem for paravirt-ops patching done
during module load, and found that I would need to modify
stop_machine_run to have an architecture-specific callout to mask and
unmask NMIs. I didn't imagine that would be very popular, and VMI was
the only paravirt-ops client considering module-load-time patching,
so I flushed it.
You get some other nasty issues as well with run-time switching, like
missing early init calls (in particular, we would have to go to some
heroics to retake the land surrounding the APIC local timer interrupt).
Zach
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: new text patching for review
2007-07-20 1:15 ` Zachary Amsden
@ 2007-07-20 7:37 ` Andi Kleen
2007-07-20 15:17 ` Mathieu Desnoyers
1 sibling, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2007-07-20 7:37 UTC (permalink / raw)
To: Zachary Amsden
Cc: Mathieu Desnoyers, Andi Kleen, jbeulich, S. P. Prasanna,
linux-kernel, patches, Jeremy Fitzhardinge
On Thu, Jul 19, 2007 at 06:15:46PM -0700, Zachary Amsden wrote:
> Mathieu Desnoyers wrote:
> >Yes, kprobes is case 1: atomic update. And we don't even have to bother
> >about Intel's erratum. This one is ok. That's mainly the
> >alternatives/paravirt code I worry about.
> >
>
> Paravirt and alternatives should all be ok because they are done before
> SMP bringup and with NMIs disabled. NMI watchdog is not setup until
> smp_prepare_cpus/check_nmi_watchdog, which happens way later, not during
> parse_args/setup_nmi_watchdog, which just decides which type of watchdog
> to setup.
There could be other NMIs too - e.g. caused by IO devices or NMI
buttons - but they're relatively unlikely.
>
> I originally considered the NMI problem for paravirt-ops patching done
> during module load, and found that I would need to modify
> stop_machine_run to have an architecture specific callout to mask and
> unmask NMIs.
When your virtual machine never injects NMIs or MCEs you're ok.
-Andi
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: new text patching for review
2007-07-20 0:37 ` Zachary Amsden
@ 2007-07-20 8:23 ` Andi Kleen
0 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2007-07-20 8:23 UTC (permalink / raw)
To: Zachary Amsden
Cc: mathieu.desnoyers, jbeulich, S. P. Prasanna, linux-kernel,
patches
On Friday 20 July 2007 02:37:20 Zachary Amsden wrote:
> Andi Kleen wrote:
> > + *addr = opcode;
> > + /* Not strictly needed, but can speed CPU recovery up */
> > + if (cpu_has_clflush)
> > + asm("clflush (%0) " :: "r" (addr) : "memory");
> > + if (addr != oaddr)
> > + vunmap(addr);
> >
>
> clflush should take oaddr.
Thanks
> If you had to remap, note that the processor does not know the linear
> address you wrote to could be matched by another mapping in icache. In
> that case, you'll need a serializing instruction (cpuid) to
> resynchronize caches.
We've already got one in the alternative patching path, but you're right,
it's safer to do it in text_poke.
-Andi
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: new text patching for review
2007-07-19 23:49 ` Mathieu Desnoyers
2007-07-20 1:15 ` Zachary Amsden
@ 2007-07-20 8:28 ` Andi Kleen
2007-07-20 14:36 ` Mathieu Desnoyers
1 sibling, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2007-07-20 8:28 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Andi Kleen, jbeulich, S. P. Prasanna, linux-kernel, patches,
Jeremy Fitzhardinge
On Thu, Jul 19, 2007 at 07:49:12PM -0400, Mathieu Desnoyers wrote:
> * Andi Kleen (andi@firstfloor.org) wrote:
> > Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
> >
> > > * Andi Kleen (ak@suse.de) wrote:
> > > >
> > > > > Ewwwwwwwwwww.... you plan to run this in SMP ? So you actually go byte
> > > > > by byte changing pieces of instructions non atomically and doing
> > > > > non-Intel's errata friendly XMC. You are really looking for trouble
> > > > > there :) Two distinct errors can occur:
> > > >
> > > > In this case it is ok because this only happens when transitioning
> > > > from 1 CPU to 2 CPUs or vice versa and in both cases the other CPUs
> > > > are essentially stopped.
> > > >
> > >
> > > I agree that it's ok with SMP, but another problem arises: it's not only
> > > a matter of being protected from SMP access, but also a matter of
> > > reentrancy wrt interrupt handlers.
> > >
> > > i.e.: if, as we are patching nops non atomically, we have a non maskable
> > > interrupt coming which calls get_cycles_sync() which uses the
> >
> > Hmm, i didn't think NMI handlers called that. e.g. nmi watchdog just
> > uses jiffies.
> >
> > get_cycles_sync patching happens only relatively early at boot, so oprofile
> > cannot be running yet.
>
> Actually, the nmi handler does use the get_cycles(), and also uses the
>
> spinlock code:
>
> arch/i386/kernel/nmi.c:
> __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
> ...
> static DEFINE_SPINLOCK(lock); /* Serialise the printks */
> spin_lock(&lock);
> printk("NMI backtrace for cpu %d\n", cpu);
> ...
> spin_unlock(&lock);
>
> If A - we change the spinlock code non atomically it would break.
It only has its lock prefixes twiddled, which should be ok.
> B - printk reads the TSC to get a timestamp, it breaks:
> it calls:
> printk_clock(void) -> sched_clock(); -> get_cycles_sync() on x86_64.
Are we reading the same source? sched_clock has never used get_cycles_sync(),
just ordinary get_cycles() which is not patched. In fact it mostly
used rdtscll() directly.
The main problem is alternative() nopping things out, e.g. the prefetches
which could hide in every list_for_each; but from a quick look the current
early NMI code doesn't do that.
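For reference, prefetch() on i386 is built on alternative_input(),
roughly like this (sketched from memory, not quoted source):

	/* Patched at boot: a 4-byte nop on CPUs without SSE, a real
	 * prefetchnta on CPUs with X86_FEATURE_XMM.  This is the
	 * "hidden" alternative inside list_for_each(). */
	static inline void prefetch(const void *x)
	{
		alternative_input(ASM_NOP4,
				  "prefetchnta (%1)",
				  X86_FEATURE_XMM,
				  "r" (x));
	}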
> Yeah, that's a mess. That's why I always consider patching the code
> in a way that will let the NMI handler run through it in a sane manner
> _while_ the code is being patched. It implies _at least_ to do the
> updates atomically with atomic aligned memory writes that keeps the site
> being patched in a coherent state. Using a int3-based bypass is also
> required on Intel because of the erratum regarding instruction cache.
That's only for cross modifying code, no?
> > This cannot happen for the current code:
> > - full alternative patching happen only at boot when the other CPUs
> > are not running
>
> Should be checked if NMIs and MCEs are active at that moment.
They are probably both.
I guess we could disable them again. I will cook up a patch.
> I see the mb()/rmb()/wmb() also uses alternatives, they should be
> checked for boot-time racing against NMIs and MCEs.
Patch above would take care of it.
>
> init/main.c:start_kernel()
>
> parse_args() (where the nmi watchdog is enabled it seems) would probably
> execute the smp-alt-boot and nmi_watchdog arguments in the order in which
> they are given as kernel arguments. So I guess it could race.
Not sure I see your point here. How can arguments race?
>
> the "mce" kernel argument is also parsed in parse_args(), which leads to
> the same problem.
?
>
> > For the immediate value patching it also cannot happen because
> > you'll never modify multiple instructions and all immediate values
> > can be changed atomically.
> >
>
> Exactly, I always make sure that the immediate value within the
> instruction is aligned (so a 5 bytes movl must have an offset of +3
> compared to a 4 bytes alignment).
The x86 architecture doesn't require alignment for atomic updates.
> Make sure this API is used only to modify code meeting these
> requirements (those are the ones I remember from the top of my head):
Umm, that's far too complicated. Nobody will understand it anyways.
I'll cook up something simpler.
-Andi
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: new text patching for review
2007-07-20 8:28 ` Andi Kleen
@ 2007-07-20 14:36 ` Mathieu Desnoyers
0 siblings, 0 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20 14:36 UTC (permalink / raw)
To: Andi Kleen
Cc: jbeulich, S. P. Prasanna, linux-kernel, patches,
Jeremy Fitzhardinge
* Andi Kleen (andi@firstfloor.org) wrote:
> On Thu, Jul 19, 2007 at 07:49:12PM -0400, Mathieu Desnoyers wrote:
> > * Andi Kleen (andi@firstfloor.org) wrote:
> > > Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
> > >
> > > > * Andi Kleen (ak@suse.de) wrote:
> > > > >
> > > > > > Ewwwwwwwwwww.... you plan to run this in SMP ? So you actually go byte
> > > > > > by byte changing pieces of instructions non atomically and doing
> > > > > > non-Intel's errata friendly XMC. You are really looking for trouble
> > > > > > there :) Two distinct errors can occur:
> > > > >
> > > > > In this case it is ok because this only happens when transitioning
> > > > > from 1 CPU to 2 CPUs or vice versa and in both cases the other CPUs
> > > > > are essentially stopped.
> > > > >
> > > >
> > > > I agree that it's ok with SMP, but another problem arises: it's not only
> > > > a matter of being protected from SMP access, but also a matter of
> > > > reentrancy wrt interrupt handlers.
> > > >
> > > > i.e.: if, as we are patching nops non atomically, we have a non maskable
> > > > interrupt coming which calls get_cycles_sync() which uses the
> > >
> > > Hmm, i didn't think NMI handlers called that. e.g. nmi watchdog just
> > > uses jiffies.
> > >
> > > get_cycles_sync patching happens only relatively early at boot, so oprofile
> > > cannot be running yet.
> >
> > Actually, the nmi handler does use the get_cycles(), and also uses the
> >
> > spinlock code:
> >
> > arch/i386/kernel/nmi.c:
> > __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
> > ...
> > static DEFINE_SPINLOCK(lock); /* Serialise the printks */
> > spin_lock(&lock);
> > printk("NMI backtrace for cpu %d\n", cpu);
> > ...
> > spin_unlock(&lock);
> >
> > If A - we change the spinlock code non atomically it would break.
>
> It only has its lock prefixes twiddled, which should be ok.
>
Yes, this is a special case where both the lock-prefixed and the
non-lock-prefixed instructions are valid, so it does not matter
if a thread is preempted right after executing the NOP that was turned
into a 0xf0. However, if such a case happens when passing from UP to SMP, a
thread could be scheduled in and try to access a spinlock with the
non-locked instruction. There should be some kind of "teardown" to make
sure that no such case can happen.
> > B - printk reads the TSC to get a timestamp, it breaks:
> > it calls:
> > printk_clock(void) -> sched_clock(); -> get_cycles_sync() on x86_64.
>
> Are we reading the same source? sched_clock has never used get_cycles_sync(),
> just ordinary get_cycles() which is not patched. In fact it mostly
> used rdtscll() directly.
>
Yes, you are right.. I am thinking more about other clients, such as a
tracer, which could want the precision given by get_cycles_sync() and
may execute in NMI context. It does not apply to the current kernel
source. It's just that reading a timestamp counter is an operation so
common that it should not come with restrictions about which context it
could be called from due to the alternatives mechanism.
> The main problem is alternative() nopify, e.g. for prefetches which
> could hide in every list_for_each; but from a quick look the current
> early NMI code doesn't do that.
Yup.. well.. my tracer will ;) I use a list_for_each_rcu() to iterate on
active traces. That's another example of a very basic piece of
infrastructure for which we don't want to bother about alternatives
patching when using it.
>
> > Yeah, that's a mess. That's why I always consider patching the code
> > in a way that will let the NMI handler run through it in a sane manner
> > _while_ the code is being patched. It implies _at least_ to do the
> > updates atomically with atomic aligned memory writes that keeps the site
> > being patched in a coherent state. Using a int3-based bypass is also
> > required on Intel because of the erratum regarding instruction cache.
>
> That's only for cross modifying code, no?
>
No. It also applies to UP modification. Since it is hard to ensure that
no non-maskable interrupt handler will run on top of you, it helps to
leave the code in a valid state at every moment.
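Roughly, such an int3-based bypass looks like this (an illustrative
outline only; memcpy_to_text() and sync_cores() are assumed helpers, not
existing kernel functions):

	/*
	 * Patch a multi-byte instruction while NMI/MCE handlers may still
	 * execute it, keeping the site valid at every step:
	 *  1. install a breakpoint handler that makes the faulting IP skip
	 *     over (or emulate) the instruction being patched;
	 *  2. atomically write 0xcc (int3) over the first opcode byte;
	 *  3. serialize all CPUs so the int3 is globally visible;
	 *  4. write the new tail bytes (everything after the first byte);
	 *  5. serialize again;
	 *  6. atomically write the new first byte, replacing the int3.
	 */
	static void text_poke_int3_bypass(u8 *addr, const u8 *newcode, int len)
	{
		text_poke(addr, 0xcc);                           /* step 2 */
		sync_cores();                                    /* step 3 (assumed helper) */
		memcpy_to_text(addr + 1, newcode + 1, len - 1);  /* step 4 (assumed helper) */
		sync_cores();                                    /* step 5 */
		text_poke(addr, newcode[0]);                     /* step 6 */
		sync_cores();
	}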
> > > This cannot happen for the current code:
> > > - full alternative patching happen only at boot when the other CPUs
> > > are not running
> >
> > Should be checked if NMIs and MCEs are active at that moment.
>
> They are probably both.
>
> I guess we could disable them again. I will cook up a patch.
>
I guess we could, although I wouldn't recommend doing it on a live
system, only at boot time.
> > I see the mb()/rmb()/wmb() also uses alternatives, they should be
> > checked for boot-time racing against NMIs and MCEs.
>
> Patch above would take care of it.
>
> >
> > init/main.c:start_kernel()
> >
> > parse_args() (where the nmi watchdog is enabled it seems) would probably
> > execute the smp-alt-boot and nmi_watchdog arguments in the order in which
> > they are given as kernel arguments. So I guess it could race.
>
> Not sure I see your point here. How can arguments race?
>
I thought parse_args() started the NMIs, but it seems to just take the
arguments and saves them for later.
> >
> > the "mce" kernel argument is also parsed in parse_args(), which leads to
> > the same problem.
>
> ?
>
Same as above.
>
> >
> > > For the immediate value patching it also cannot happen because
> > > you'll never modify multiple instructions and all immediate values
> > > can be changed atomically.
> > >
> >
> > Exactly, I always make sure that the immediate value within the
> > instruction is aligned (so a 5 bytes movl must have an offset of +3
> > compared to a 4 bytes alignment).
>
> The x86 architecture doesn't require alignment for atomic updates.
>
You mean for atomicity wrt the local CPU or across CPUs?
> > Make sure this API is used only to modify code meeting these
> > requirements (those are the ones I remember from the top of my head):
>
> Umm, that's far too complicated. Nobody will understand it anyways.
> I'll cook up something simpler.
>
ok :)
Mathieu
> -Andi
>
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: new text patching for review
2007-07-20 1:15 ` Zachary Amsden
2007-07-20 7:37 ` Andi Kleen
@ 2007-07-20 15:17 ` Mathieu Desnoyers
2007-07-21 6:19 ` Andi Kleen
1 sibling, 1 reply; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20 15:17 UTC (permalink / raw)
To: Zachary Amsden
Cc: Andi Kleen, jbeulich, S. P. Prasanna, linux-kernel, patches,
Jeremy Fitzhardinge
* Zachary Amsden (zach@vmware.com) wrote:
> Mathieu Desnoyers wrote:
> >Yes, kprobes is case 1: atomic update. And we don't even have to bother
> >about Intel's erratum. This one is ok. That's mainly the
> >alternatives/paravirt code I worry about.
> >
>
> Paravirt and alternatives should all be ok because they are done before
> SMP bringup and with NMIs disabled. NMI watchdog is not setup until
> smp_prepare_cpus/check_nmi_watchdog, which happens way later, not during
> parse_args/setup_nmi_watchdog, which just decides which type of watchdog
> to setup.
>
I'm not so sure about this. You are right in that it has nothing to do
with parse_args, but I just went in detail through the source, and the
order seems to be:
1 - NMI is activated
2 - MCE is activated
3 - alternatives are applied
In detail:
start_kernel()
...
include/asm-i386/smp.h:smp_prepare_boot_cpu()
smp_ops.smp_prepare_boot_cpu()
arch/i386/kernel/smpboot.c:native_smp_prepare_boot_cpu()
arch/i386/kernel/smpboot.c:native_smp_prepare_cpus()
arch/i386/kernel/smpboot.c:smp_boot_cpus()
arch/i386/kernel/apic.c:setup_local_APIC()
arch/i386/kernel/nmi.c:setup_apic_nmi_watchdog()
arch/i386/kernel/cpu/perfctr-watchdog.c:lapic_watchdog_init()
wd_ops->setup() (e.g. setup_intel_arch_watchdog)
...
arch/i386/kernel/cpu/bugs.c:check_bugs()
arch/i386/kernel/cpu/common.c:identify_boot_cpu()
identify_cpu()
arch/i386/kernel/cpu/mcheck/mce.c:mcheck_init()
arch/i386/kernel/cpu/mcheck/p4.c:intel_p4_mcheck_init()
...
arch/i386/kernel/alternative.c:alternative_instructions()
...
Therefore, applying alternative instructions seems to be done after NMI
and MCE init at boot time, or am I misunderstanding something?
> I originally considered the NMI problem for paravirt-ops patching done
> during module load, and found that I would need to modify
> stop_machine_run to have an architecture specific callout to mask and
> unmask NMIs. I didn't imagine that would be very popular, and VMI was
> the only paravirt-ops that were considering module load time patching,
> so I flushed it.
>
> You get some other nasty issues as well with run-time switching, like
> missing early init calls (in particular, we would have to go to some
> heroics to retake the land surrounding the APIC local timer interrupt).
>
> Zach
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: new text patching for review
2007-07-20 15:17 ` Mathieu Desnoyers
@ 2007-07-21 6:19 ` Andi Kleen
0 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2007-07-21 6:19 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Zachary Amsden, Andi Kleen, jbeulich, S. P. Prasanna,
linux-kernel, patches, Jeremy Fitzhardinge
On Fri, Jul 20, 2007 at 11:17:49AM -0400, Mathieu Desnoyers wrote:
> * Zachary Amsden (zach@vmware.com) wrote:
> > Mathieu Desnoyers wrote:
> > >Yes, kprobes is case 1: atomic update. And we don't even have to bother
> > >about Intel's erratum. This one is ok. That's mainly the
> > >alternatives/paravirt code I worry about.
> > >
> >
> > Paravirt and alternatives should all be ok because they are done before
> > SMP bringup and with NMIs disabled. NMI watchdog is not setup until
> > smp_prepare_cpus/check_nmi_watchdog, which happens way later, not during
> > parse_args/setup_nmi_watchdog, which just decides which type of watchdog
> > to setup.
> >
>
> I'm not so sure about this. You are right in that it has nothing to do
> with parse_args, but I just went in detail through the source, and the
> order seems to be:
>
> 1 - NMI is activated
> 2 - MCE is activated
> 3 - alternatives are applied
Yes I was wrong on this. I now added code to disable them again -- see
the later patch I posted
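In outline, something along these lines (illustrative sketch only; the
stop/restart helpers are placeholders, not quoted from the actual patch):

	void __init alternative_instructions(void)
	{
		unsigned long flags;

		/* Keep NMIs and MCEs away while instructions are rewritten
		 * non-atomically; both are already active this early. */
		local_irq_save(flags);
		stop_nmi();			/* placeholder helper */
		stop_mce();			/* placeholder helper */

		apply_alternatives(__alt_instructions, __alt_instructions_end);
		/* ... smp alternatives, paravirt patching ... */

		restart_mce();
		restart_nmi();
		local_irq_restore(flags);
	}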
-Andi
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: new text patching for review
2007-07-19 9:05 new text patching for review Andi Kleen
2007-07-19 13:38 ` Mathieu Desnoyers
2007-07-20 0:37 ` Zachary Amsden
@ 2007-08-10 19:00 ` Mathieu Desnoyers
2 siblings, 0 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-08-10 19:00 UTC (permalink / raw)
To: Andi Kleen; +Cc: jbeulich, S. P. Prasanna, linux-kernel, patches
* Andi Kleen (ak@suse.de) wrote:
>
> Hallo,
>
> I had some second thoughts about the text patching of DEBUG_RODATA kernels
> using change_page_attr(). Change_page_attr is intrusive and slow and using
> a separate mapping is a little more gentle. I came up with this patch.
> For your review and testing pleasure.
>
> The main quirk is that it doesn't fully follow the cross-modifying code
> recommendations; but i'm not sure that's really needed.
>
> Also I admit nop_out is quite inefficient, but that's a very slow path
> and it was easier to do it this way than handle all the corner cases
> explicitely.
>
> Comments,
>
> -Andi
>
> x86: Fix alternatives and kprobes to remap write-protected kernel text
>
> Signed-off-by: Andi Kleen <ak@suse.de>
>
> +/*
> + * RED-PEN Intel recommends stopping the other CPUs for such
> + * "cross-modifying code".
> + */
> +void __kprobes text_poke(void *oaddr, u8 opcode)
> +{
> + u8 *addr = oaddr;
> + if (!pte_write(*lookup_address((unsigned long)addr))) {
> + struct page *p = virt_to_page(addr);
> + addr = vmap(&p, 1, VM_MAP, PAGE_KERNEL);
> + if (!addr)
> + return;
> + addr += ((unsigned long)oaddr) % PAGE_SIZE;
> + }
> + *addr = opcode;
> + /* Not strictly needed, but can speed CPU recovery up */
> + if (cpu_has_clflush)
> + asm("clflush (%0) " :: "r" (addr) : "memory");
> + if (addr != oaddr)
> + vunmap(addr);
> +}
Hi Andi,
I was trying to make this work, but I found that it is incredibly
slow when I call it multiple times. Trying to understand why, I came up
with a few questions:
- Is this supposed to work with large pages?
- Is this supposed to work on module text?
Thanks,
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
^ permalink raw reply [flat|nested] 27+ messages in thread
* Non atomic unaligned writes
2007-07-19 20:46 ` [patches] " Andi Kleen
2007-07-19 20:51 ` Jeremy Fitzhardinge
2007-07-19 23:53 ` Mathieu Desnoyers
@ 2007-08-20 0:55 ` Mathieu Desnoyers
2007-08-20 5:03 ` Arjan van de Ven
2007-08-20 10:23 ` Andi Kleen
2 siblings, 2 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2007-08-20 0:55 UTC (permalink / raw)
To: Andi Kleen
Cc: patches, Jeremy Fitzhardinge, Zachary Amsden, Rusty Russell,
linux-kernel, S. P. Prasanna, Chris Wright
* Andi Kleen (ak@suse.de) wrote:
> Normally there are not that many NMIs or MCEs at boot, but it would
> be still good to avoid the very rare crash by auditing the code first
> [better than try to debug it on some production system later]
>
> > > - smp lock patching only ever changes a single byte (lock prefix) of
> > > a single instruction
> > > - kprobes only ever change a single byte
> > >
> > > For the immediate value patching it also cannot happen because
> > > you'll never modify multiple instructions and all immediate values
> > > can be changed atomically.
> > >
> >
> > Are misaligned/cross-cache-line updates atomic?
>
> In theory yes, in practice there can be errata of course. There tend
> to be a couple with self modifying code, especially cross modifying
> (from another CPU) -- but you don't do that.
>
> -Andi
I must disagree with Andi on this point. Considering the quoted
paragraph below, misaligned/cross-cache-line updates are not atomic.
This is why I align the immediate values in such a way that the
immediate value within the mov instruction is itself aligned.
Intel System Programming Guide
7.1.1 Guaranteed Atomic Operations
The Intel386™, Intel486™, Pentium®, and P6 family processors guarantee
that the following basic memory operations will always be carried out
atomically:
• Reading or writing a byte.
• Reading or writing a word aligned on a 16-bit boundary.
• Reading or writing a doubleword aligned on a 32-bit boundary.
The P6 family processors guarantee that the following additional memory
operations will always be carried out atomically:
• Reading or writing a quadword aligned on a 64-bit boundary. (This
operation is also guaranteed on the Pentium® processor.)
• 16-bit accesses to uncached memory locations that fit within a 32-bit
data bus.
• 16-, 32-, and 64-bit accesses to cached memory that fit within a
32-Byte cache line.
Accesses to cacheable memory that are split across bus widths, cache
lines, and page boundaries are not guaranteed to be atomic by the
Intel486™, Pentium®, or P6 family processors. The P6 family processors
provide bus control signals that permit external memory subsystems to
make split accesses atomic; however, nonaligned data accesses will
seriously impact the performance of the processor and should be avoided
where possible.
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: Non atomic unaligned writes
2007-08-20 0:55 ` Non atomic unaligned writes Mathieu Desnoyers
@ 2007-08-20 5:03 ` Arjan van de Ven
2007-08-20 10:23 ` Andi Kleen
1 sibling, 0 replies; 27+ messages in thread
From: Arjan van de Ven @ 2007-08-20 5:03 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Andi Kleen, patches, Jeremy Fitzhardinge, Zachary Amsden,
Rusty Russell, linux-kernel, S. P. Prasanna, Chris Wright
> > > Are misaligned/cross-cache-line updates atomic?
> >
> > In theory yes, in practice there can be errata of course.
Afaik the theory is "Platform specific; most x86 will be in practice";
it's up to the system vendor and chipset vendor.
The practice is "mixed ball, slow as h*ck anyway, just never ever do it"
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: Non atomic unaligned writes
2007-08-20 0:55 ` Non atomic unaligned writes Mathieu Desnoyers
2007-08-20 5:03 ` Arjan van de Ven
@ 2007-08-20 10:23 ` Andi Kleen
1 sibling, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2007-08-20 10:23 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Andi Kleen, patches, Jeremy Fitzhardinge, Zachary Amsden,
Rusty Russell, linux-kernel, S. P. Prasanna, Chris Wright
> Intel486™, Pentium®, or P6 family processors. The P6 family processors
> provide bus control signals that permit external memory subsystems to
> make split accesses atomic; however, nonaligned data accesses will
Normally in standard SMP systems they are atomic between CPUs AFAIK
(modulo bugs).
You're right -- I'm not sure about P5 SMP.
> seriously impact the performance of the processor and should be avoided
> where possible.
But they're also slow yes.
-Andi
^ permalink raw reply [flat|nested] 27+ messages in thread
end of thread, other threads:[~2007-08-20 9:29 UTC | newest]
Thread overview: 27+ messages -- links below jump to the message on this page --
2007-07-19 9:05 new text patching for review Andi Kleen
2007-07-19 13:38 ` Mathieu Desnoyers
2007-07-19 13:46 ` Andi Kleen
2007-07-19 17:35 ` Mathieu Desnoyers
2007-07-19 21:14 ` Andi Kleen
2007-07-19 20:30 ` Jeremy Fitzhardinge
2007-07-19 20:46 ` [patches] " Andi Kleen
2007-07-19 20:51 ` Jeremy Fitzhardinge
2007-07-19 21:06 ` Andi Kleen
2007-07-19 21:08 ` Jeremy Fitzhardinge
2007-07-19 23:53 ` Mathieu Desnoyers
2007-08-20 0:55 ` Non atomic unaligned writes Mathieu Desnoyers
2007-08-20 5:03 ` Arjan van de Ven
2007-08-20 10:23 ` Andi Kleen
2007-07-19 23:51 ` new text patching for review Mathieu Desnoyers
2007-07-19 23:49 ` Mathieu Desnoyers
2007-07-20 1:15 ` Zachary Amsden
2007-07-20 7:37 ` Andi Kleen
2007-07-20 15:17 ` Mathieu Desnoyers
2007-07-21 6:19 ` Andi Kleen
2007-07-20 8:28 ` Andi Kleen
2007-07-20 14:36 ` Mathieu Desnoyers
2007-07-20 0:37 ` Zachary Amsden
2007-07-20 8:23 ` Andi Kleen
2007-08-10 19:00 ` Mathieu Desnoyers
-- strict thread matches above, loose matches on Subject: below --
2007-07-19 12:25 Jan Beulich
2007-07-19 12:41 ` Andi Kleen