public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Jeremy Fitzhardinge <jeremy@goop.org>
To: Mathieu Desnoyers <compudj@krystal.dyndns.org>
Cc: Ingo Molnar <mingo@elte.hu>, Andi Kleen <andi@firstfloor.org>,
	akpm@osdl.org, "H. Peter Anvin" <hpa@zytor.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	"Frank Ch. Eigler" <fche@redhat.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v3)
Date: Wed, 16 Apr 2008 10:57:45 -0700	[thread overview]
Message-ID: <48063E19.9090701@goop.org> (raw)
In-Reply-To: <20080416162810.GA20450@Krystal>

Mathieu Desnoyers wrote:
> "This way lies madness. Don't go there."
>   

It is a large amount of... stuff.  This immediate-values thing makes a 
big improvement, then?

> Index: linux-2.6-lttng/include/linux/hardirq.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/linux/hardirq.h	2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/include/linux/hardirq.h	2008-04-16 11:29:30.000000000 -0400
> @@ -22,10 +22,13 @@
>   * PREEMPT_MASK: 0x000000ff
>   * SOFTIRQ_MASK: 0x0000ff00
>   * HARDIRQ_MASK: 0x0fff0000
> + * HARDNMI_MASK: 0x40000000
>   */
>  #define PREEMPT_BITS	8
>  #define SOFTIRQ_BITS	8
>  
> +#define HARDNMI_BITS	1
> +
>  #ifndef HARDIRQ_BITS
>  #define HARDIRQ_BITS	12
>  
> @@ -45,16 +48,19 @@
>  #define PREEMPT_SHIFT	0
>  #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
>  #define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDNMI_SHIFT	(30)
>   

Why at 30, rather than HARDIRQ_SHIFT+HARDIRQ_BITS?

>  
>  #define __IRQ_MASK(x)	((1UL << (x))-1)
>  
>  #define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
>  #define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
>  #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
> +#define HARDNMI_MASK	(__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)
>  
>  #define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
>  #define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
>  #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
> +#define HARDNMI_OFFSET	(1UL << HARDNMI_SHIFT)
>  
>  #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
>  #error PREEMPT_ACTIVE is too low!
> @@ -63,6 +69,7 @@
>  #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
>  #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
>  #define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
> +#define hardnmi_count()	(preempt_count() & HARDNMI_MASK)
>  
>  /*
>   * Are we doing bottom half or hardware interrupt processing?
> @@ -71,6 +78,7 @@
>  #define in_irq()		(hardirq_count())
>  #define in_softirq()		(softirq_count())
>  #define in_interrupt()		(irq_count())
> +#define in_nmi()		(hardnmi_count())
>  
>  /*
>   * Are we running in atomic context?  WARNING: this macro cannot
> @@ -159,7 +167,19 @@ extern void irq_enter(void);
>   */
>  extern void irq_exit(void);
>  
> -#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
> -#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
> +#define nmi_enter()					\
> +	do {						\
> +		lockdep_off();				\
> +		BUG_ON(hardnmi_count());		\
> +		add_preempt_count(HARDNMI_OFFSET);	\
> +		__irq_enter();				\
> +	} while (0)
> +
> +#define nmi_exit()					\
> +	do {						\
> +		__irq_exit();				\
> +		sub_preempt_count(HARDNMI_OFFSET);	\
> +		lockdep_on();				\
> +	} while (0)
>  
>  #endif /* LINUX_HARDIRQ_H */
> Index: linux-2.6-lttng/arch/x86/kernel/entry_32.S
> ===================================================================
> --- linux-2.6-lttng.orig/arch/x86/kernel/entry_32.S	2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/arch/x86/kernel/entry_32.S	2008-04-16 12:06:30.000000000 -0400
> @@ -79,7 +79,6 @@ VM_MASK		= 0x00020000
>  #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
>  #else
>  #define preempt_stop(clobbers)
> -#define resume_kernel		restore_nocheck
>  #endif
>  
>  .macro TRACE_IRQS_IRET
> @@ -265,6 +264,8 @@ END(ret_from_exception)
>  #ifdef CONFIG_PREEMPT
>  ENTRY(resume_kernel)
>  	DISABLE_INTERRUPTS(CLBR_ANY)
> +	testl $0x40000000,TI_preempt_count(%ebp)	# nested over NMI ?
> +	jnz return_to_nmi
>  	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
>  	jnz restore_nocheck
>  need_resched:
> @@ -276,6 +277,12 @@ need_resched:
>  	call preempt_schedule_irq
>  	jmp need_resched
>  END(resume_kernel)
> +#else
> +ENTRY(resume_kernel)
> +	testl $0x40000000,TI_preempt_count(%ebp)	# nested over NMI ?
>   

HARDNMI_MASK?

> +	jnz return_to_nmi
> +	jmp restore_nocheck
> +END(resume_kernel)
>  #endif
>  	CFI_ENDPROC
>  
> @@ -411,6 +418,22 @@ restore_nocheck_notrace:
>  	CFI_ADJUST_CFA_OFFSET -4
>  irq_return:
>  	INTERRUPT_RETURN
> +return_to_nmi:
> +	testl $X86_EFLAGS_TF, PT_EFLAGS(%esp)
> +	jnz restore_nocheck		/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	TRACE_IRQS_IRET
> +	RESTORE_REGS
> +	addl $4, %esp			# skip orig_eax/error_code
> +	CFI_ADJUST_CFA_OFFSET -4
> +	INTERRUPT_RETURN_NMI_SAFE
> +
>  .section .fixup,"ax"
>  iret_exc:
>  	pushl $0			# no error code
> Index: linux-2.6-lttng/arch/x86/kernel/entry_64.S
> ===================================================================
> --- linux-2.6-lttng.orig/arch/x86/kernel/entry_64.S	2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/arch/x86/kernel/entry_64.S	2008-04-16 12:06:31.000000000 -0400
> @@ -581,12 +581,27 @@ retint_restore_args:	/* return to kernel
>  	 * The iretq could re-enable interrupts:
>  	 */
>  	TRACE_IRQS_IRETQ
> +	testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
>   

HARDNMI_MASK?  (ditto below)

> +	jnz return_to_nmi
>  restore_args:
>  	RESTORE_ARGS 0,8,0
>  
>  irq_return:
>  	INTERRUPT_RETURN
>  
> +return_to_nmi:				/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	bt $8,EFLAGS-ARGOFFSET(%rsp)	/* trap flag? */
>   

test[bwl] is a bit more usual; then you can use X86_EFLAGS_TF.

> +	jc restore_args
> +	RESTORE_ARGS 0,8,0
> +	INTERRUPT_RETURN_NMI_SAFE
> +
>  	.section __ex_table, "a"
>  	.quad irq_return, bad_iret
>  	.previous
> @@ -802,6 +817,10 @@ END(spurious_interrupt)
>  	.macro paranoidexit trace=1
>  	/* ebx:	no swapgs flag */
>  paranoid_exit\trace:
> +	GET_THREAD_INFO(%rcx)
> +	testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
> +	jnz paranoid_return_to_nmi\trace
> +paranoid_exit_no_nmi\trace:
>  	testl %ebx,%ebx				/* swapgs needed? */
>  	jnz paranoid_restore\trace
>  	testl $3,CS(%rsp)
> @@ -814,6 +833,18 @@ paranoid_swapgs\trace:
>  paranoid_restore\trace:
>  	RESTORE_ALL 8
>  	jmp irq_return
> +paranoid_return_to_nmi\trace:		/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	bt $8,EFLAGS-0(%rsp)		/* trap flag? */
> +	jc paranoid_exit_no_nmi\trace
> +	RESTORE_ALL 8
> +	INTERRUPT_RETURN_NMI_SAFE
>   

Does this need to be repeated verbatim?

>  paranoid_userspace\trace:
>  	GET_THREAD_INFO(%rcx)
>  	movl threadinfo_flags(%rcx),%ebx
> Index: linux-2.6-lttng/include/asm-x86/irqflags.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/asm-x86/irqflags.h	2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/include/asm-x86/irqflags.h	2008-04-16 11:29:30.000000000 -0400
> @@ -138,12 +138,73 @@ static inline unsigned long __raw_local_
>  
>  #ifdef CONFIG_X86_64
>  #define INTERRUPT_RETURN	iretq
> +
> +/*
> + * Only returns from a trap or exception to a NMI context (intra-privilege
> + * level near return) to the same SS and CS segments. Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued. It takes care of modifying the eflags, rsp and returning to the
> + * previous function.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(rsp)  RIP
> + * 8(rsp)  CS
> + * 16(rsp) EFLAGS
> + * 24(rsp) RSP
> + * 32(rsp) SS
> + *
> + * Upon execution :
> + * Copy EIP to the top of the return stack
> + * Update top of return stack address
> + * Pop eflags into the eflags register
> + * Make the return stack current
> + * Near return (popping the return address from the return stack)
> + */
> +#define INTERRUPT_RETURN_NMI_SAFE	pushq %rax;		\
> +					pushq %rbx;		\
> +					movq 40(%rsp), %rax;	\
> +					movq 16(%rsp), %rbx;	\
>   

Use X+16(%rsp) notation here, so that the offsets correspond to the 
comment above.

> +					subq $8, %rax;		\
> +					movq %rbx, (%rax);	\
> +					movq %rax, 40(%rsp);	\
> +					popq %rbx;		\
> +					popq %rax;		\
> +					addq $16, %rsp;		\
> +					popfq;			\
> +					movq (%rsp), %rsp;	\
> +					ret;			\
>   

How about something like

	pushq	%rax
	mov	%rsp, %rax		/* old stack */
	mov	24+8(%rax), %rsp	/* switch stacks */
	pushq	 0+8(%rax)		/* push return rip */
	pushq	16+8(%rax)		/* push return rflags */
	movq	(%rax), %rax		/* restore %rax */
	popfq				/* restore flags */
	ret				/* restore rip */


> +
>  #define ENABLE_INTERRUPTS_SYSCALL_RET			\
>  			movq	%gs:pda_oldrsp, %rsp;	\
>  			swapgs;				\
>  			sysretq;
>  #else
>  #define INTERRUPT_RETURN		iret
> +
> +/*
> + * Protected mode only, no V8086. Implies that protected mode must
> + * be entered before NMIs or MCEs are enabled. Only returns from a trap or
> + * exception to a NMI context (intra-privilege level far return). Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(esp) EIP
> + * 4(esp) CS
> + * 8(esp) EFLAGS
> + *
> + * Upon execution :
> + * Copy the stack eflags to top of stack
> + * Pop eflags into the eflags register
> + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS.
> + */
> +#define INTERRUPT_RETURN_NMI_SAFE	pushl 8(%esp);	\
> +					popfl;		\
> +					.byte 0xCA;	\
> +					.word 4;
>   

Why not "lret $4"?

    J

  reply	other threads:[~2008-04-16 17:58 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20080414230344.GA16061@Krystal>
2008-04-14 23:05 ` [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v2) Mathieu Desnoyers
2008-04-16 13:06   ` Ingo Molnar
2008-04-16 13:47     ` [TEST PATCH] Test NMI kprobe modules Mathieu Desnoyers
2008-04-16 14:34       ` Ingo Molnar
2008-04-16 14:54         ` Mathieu Desnoyers
2008-04-16 15:10     ` [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v2) Ingo Molnar
2008-04-16 15:18       ` H. Peter Anvin
2008-04-16 15:37       ` Mathieu Desnoyers
2008-04-16 16:03       ` Jeremy Fitzhardinge
2008-04-18  0:48         ` Mathieu Desnoyers
2008-04-18  9:49           ` Jeremy Fitzhardinge
2008-04-16 16:28       ` [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v3) Mathieu Desnoyers
2008-04-16 17:57         ` Jeremy Fitzhardinge [this message]
2008-04-17 16:29           ` Mathieu Desnoyers
2008-04-17 16:45             ` Andi Kleen
2008-04-18  0:05               ` Mathieu Desnoyers
2008-04-18  8:27                 ` Andi Kleen
2008-04-19 21:18                   ` Mathieu Desnoyers
2008-04-20 12:58                     ` Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=48063E19.9090701@goop.org \
    --to=jeremy@goop.org \
    --cc=akpm@osdl.org \
    --cc=andi@firstfloor.org \
    --cc=compudj@krystal.dyndns.org \
    --cc=fche@redhat.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=rostedt@goodmis.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox