All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeremy Fitzhardinge <jeremy@goop.org>
To: Mathieu Desnoyers <compudj@krystal.dyndns.org>
Cc: Ingo Molnar <mingo@elte.hu>, Andi Kleen <andi@firstfloor.org>,
	akpm@osdl.org, "H. Peter Anvin" <hpa@zytor.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	"Frank Ch. Eigler" <fche@redhat.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v3)
Date: Wed, 16 Apr 2008 10:57:45 -0700	[thread overview]
Message-ID: <48063E19.9090701@goop.org> (raw)
In-Reply-To: <20080416162810.GA20450@Krystal>

Mathieu Desnoyers wrote:
> "This way lies madness. Don't go there."
>   

It is a large amount of... stuff.  This immediate values thing makes a 
big improvement then?

> Index: linux-2.6-lttng/include/linux/hardirq.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/linux/hardirq.h	2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/include/linux/hardirq.h	2008-04-16 11:29:30.000000000 -0400
> @@ -22,10 +22,13 @@
>   * PREEMPT_MASK: 0x000000ff
>   * SOFTIRQ_MASK: 0x0000ff00
>   * HARDIRQ_MASK: 0x0fff0000
> + * HARDNMI_MASK: 0x40000000
>   */
>  #define PREEMPT_BITS	8
>  #define SOFTIRQ_BITS	8
>  
> +#define HARDNMI_BITS	1
> +
>  #ifndef HARDIRQ_BITS
>  #define HARDIRQ_BITS	12
>  
> @@ -45,16 +48,19 @@
>  #define PREEMPT_SHIFT	0
>  #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
>  #define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDNMI_SHIFT	(30)
>   

Why at 30, rather than HARDIRQ_SHIFT+HARDIRQ_BITS?

>  
>  #define __IRQ_MASK(x)	((1UL << (x))-1)
>  
>  #define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
>  #define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
>  #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
> +#define HARDNMI_MASK	(__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)
>  
>  #define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
>  #define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
>  #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
> +#define HARDNMI_OFFSET	(1UL << HARDNMI_SHIFT)
>  
>  #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
>  #error PREEMPT_ACTIVE is too low!
> @@ -63,6 +69,7 @@
>  #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
>  #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
>  #define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
> +#define hardnmi_count()	(preempt_count() & HARDNMI_MASK)
>  
>  /*
>   * Are we doing bottom half or hardware interrupt processing?
> @@ -71,6 +78,7 @@
>  #define in_irq()		(hardirq_count())
>  #define in_softirq()		(softirq_count())
>  #define in_interrupt()		(irq_count())
> +#define in_nmi()		(hardnmi_count())
>  
>  /*
>   * Are we running in atomic context?  WARNING: this macro cannot
> @@ -159,7 +167,19 @@ extern void irq_enter(void);
>   */
>  extern void irq_exit(void);
>  
> -#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
> -#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
> +#define nmi_enter()					\
> +	do {						\
> +		lockdep_off();				\
> +		BUG_ON(hardnmi_count());		\
> +		add_preempt_count(HARDNMI_OFFSET);	\
> +		__irq_enter();				\
> +	} while (0)
> +
> +#define nmi_exit()					\
> +	do {						\
> +		__irq_exit();				\
> +		sub_preempt_count(HARDNMI_OFFSET);	\
> +		lockdep_on();				\
> +	} while (0)
>  
>  #endif /* LINUX_HARDIRQ_H */
> Index: linux-2.6-lttng/arch/x86/kernel/entry_32.S
> ===================================================================
> --- linux-2.6-lttng.orig/arch/x86/kernel/entry_32.S	2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/arch/x86/kernel/entry_32.S	2008-04-16 12:06:30.000000000 -0400
> @@ -79,7 +79,6 @@ VM_MASK		= 0x00020000
>  #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
>  #else
>  #define preempt_stop(clobbers)
> -#define resume_kernel		restore_nocheck
>  #endif
>  
>  .macro TRACE_IRQS_IRET
> @@ -265,6 +264,8 @@ END(ret_from_exception)
>  #ifdef CONFIG_PREEMPT
>  ENTRY(resume_kernel)
>  	DISABLE_INTERRUPTS(CLBR_ANY)
> +	testl $0x40000000,TI_preempt_count(%ebp)	# nested over NMI ?
> +	jnz return_to_nmi
>  	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
>  	jnz restore_nocheck
>  need_resched:
> @@ -276,6 +277,12 @@ need_resched:
>  	call preempt_schedule_irq
>  	jmp need_resched
>  END(resume_kernel)
> +#else
> +ENTRY(resume_kernel)
> +	testl $0x40000000,TI_preempt_count(%ebp)	# nested over NMI ?
>   

HARDNMI_MASK?

> +	jnz return_to_nmi
> +	jmp restore_nocheck
> +END(resume_kernel)
>  #endif
>  	CFI_ENDPROC
>  
> @@ -411,6 +418,22 @@ restore_nocheck_notrace:
>  	CFI_ADJUST_CFA_OFFSET -4
>  irq_return:
>  	INTERRUPT_RETURN
> +return_to_nmi:
> +	testl $X86_EFLAGS_TF, PT_EFLAGS(%esp)
> +	jnz restore_nocheck		/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	TRACE_IRQS_IRET
> +	RESTORE_REGS
> +	addl $4, %esp			# skip orig_eax/error_code
> +	CFI_ADJUST_CFA_OFFSET -4
> +	INTERRUPT_RETURN_NMI_SAFE
> +
>  .section .fixup,"ax"
>  iret_exc:
>  	pushl $0			# no error code
> Index: linux-2.6-lttng/arch/x86/kernel/entry_64.S
> ===================================================================
> --- linux-2.6-lttng.orig/arch/x86/kernel/entry_64.S	2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/arch/x86/kernel/entry_64.S	2008-04-16 12:06:31.000000000 -0400
> @@ -581,12 +581,27 @@ retint_restore_args:	/* return to kernel
>  	 * The iretq could re-enable interrupts:
>  	 */
>  	TRACE_IRQS_IRETQ
> +	testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
>   

HARDNMI_MASK?  (ditto below)

> +	jnz return_to_nmi
>  restore_args:
>  	RESTORE_ARGS 0,8,0
>  
>  irq_return:
>  	INTERRUPT_RETURN
>  
> +return_to_nmi:				/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	bt $8,EFLAGS-ARGOFFSET(%rsp)	/* trap flag? */
>   

test[bwl] is a bit more usual; then you can use X86_EFLAGS_TF.

> +	jc restore_args
> +	RESTORE_ARGS 0,8,0
> +	INTERRUPT_RETURN_NMI_SAFE
> +
>  	.section __ex_table, "a"
>  	.quad irq_return, bad_iret
>  	.previous
> @@ -802,6 +817,10 @@ END(spurious_interrupt)
>  	.macro paranoidexit trace=1
>  	/* ebx:	no swapgs flag */
>  paranoid_exit\trace:
> +	GET_THREAD_INFO(%rcx)
> +	testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
> +	jnz paranoid_return_to_nmi\trace
> +paranoid_exit_no_nmi\trace:
>  	testl %ebx,%ebx				/* swapgs needed? */
>  	jnz paranoid_restore\trace
>  	testl $3,CS(%rsp)
> @@ -814,6 +833,18 @@ paranoid_swapgs\trace:
>  paranoid_restore\trace:
>  	RESTORE_ALL 8
>  	jmp irq_return
> +paranoid_return_to_nmi\trace:		/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	bt $8,EFLAGS-0(%rsp)		/* trap flag? */
> +	jc paranoid_exit_no_nmi\trace
> +	RESTORE_ALL 8
> +	INTERRUPT_RETURN_NMI_SAFE
>   

Does this need to be repeated verbatim?

>  paranoid_userspace\trace:
>  	GET_THREAD_INFO(%rcx)
>  	movl threadinfo_flags(%rcx),%ebx
> Index: linux-2.6-lttng/include/asm-x86/irqflags.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/asm-x86/irqflags.h	2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/include/asm-x86/irqflags.h	2008-04-16 11:29:30.000000000 -0400
> @@ -138,12 +138,73 @@ static inline unsigned long __raw_local_
>  
>  #ifdef CONFIG_X86_64
>  #define INTERRUPT_RETURN	iretq
> +
> +/*
> + * Only returns from a trap or exception to a NMI context (intra-privilege
> + * level near return) to the same SS and CS segments. Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued. It takes care of modifying the eflags, rsp and returning to the
> + * previous function.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(rsp)  RIP
> + * 8(rsp)  CS
> + * 16(rsp) EFLAGS
> + * 24(rsp) RSP
> + * 32(rsp) SS
> + *
> + * Upon execution :
> + * Copy EIP to the top of the return stack
> + * Update top of return stack address
> + * Pop eflags into the eflags register
> + * Make the return stack current
> + * Near return (popping the return address from the return stack)
> + */
> +#define INTERRUPT_RETURN_NMI_SAFE	pushq %rax;		\
> +					pushq %rbx;		\
> +					movq 40(%rsp), %rax;	\
> +					movq 16(%rsp), %rbx;	\
>   

Use X+16(%rsp) notation here, so that the offsets correspond to the 
comment above.

> +					subq $8, %rax;		\
> +					movq %rbx, (%rax);	\
> +					movq %rax, 40(%rsp);	\
> +					popq %rbx;		\
> +					popq %rax;		\
> +					addq $16, %rsp;		\
> +					popfq;			\
> +					movq (%rsp), %rsp;	\
> +					ret;			\
>   

How about something like

	pushq	%rax
	mov	%rsp, %rax		/* old stack */
	mov	24+8(%rax), %rsp	/* switch stacks */
	pushq	 0+8(%rax)		/* push return rip */
	pushq	16+8(%rax)		/* push return rflags */
	movq	(%rax), %rax		/* restore %rax */
	popfq				/* restore flags */
	ret				/* restore rip */


> +
>  #define ENABLE_INTERRUPTS_SYSCALL_RET			\
>  			movq	%gs:pda_oldrsp, %rsp;	\
>  			swapgs;				\
>  			sysretq;
>  #else
>  #define INTERRUPT_RETURN		iret
> +
> +/*
> + * Protected mode only, no V8086. Implies that protected mode must
> + * be entered before NMIs or MCEs are enabled. Only returns from a trap or
> + * exception to a NMI context (intra-privilege level far return). Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(esp) EIP
> + * 4(esp) CS
> + * 8(esp) EFLAGS
> + *
> + * Upon execution :
> + * Copy the stack eflags to top of stack
> + * Pop eflags into the eflags register
> + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS.
> + */
> +#define INTERRUPT_RETURN_NMI_SAFE	pushl 8(%esp);	\
> +					popfl;		\
> +					.byte 0xCA;	\
> +					.word 4;
>   

Why not "lret $4"?

    J

  reply	other threads:[~2008-04-16 17:58 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20080414230344.GA16061@Krystal>
2008-04-14 23:05 ` [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v2) Mathieu Desnoyers
2008-04-16 13:06   ` Ingo Molnar
2008-04-16 13:47     ` [TEST PATCH] Test NMI kprobe modules Mathieu Desnoyers
2008-04-16 14:34       ` Ingo Molnar
2008-04-16 14:54         ` Mathieu Desnoyers
2008-04-16 15:10     ` [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v2) Ingo Molnar
2008-04-16 15:18       ` H. Peter Anvin
2008-04-16 15:37       ` Mathieu Desnoyers
2008-04-16 16:03       ` Jeremy Fitzhardinge
2008-04-18  0:48         ` Mathieu Desnoyers
2008-04-18  9:49           ` Jeremy Fitzhardinge
2008-04-16 16:28       ` [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v3) Mathieu Desnoyers
2008-04-16 17:57         ` Jeremy Fitzhardinge [this message]
2008-04-17 16:29           ` Mathieu Desnoyers
2008-04-17 16:45             ` Andi Kleen
2008-04-18  0:05               ` Mathieu Desnoyers
2008-04-18  8:27                 ` Andi Kleen
2008-04-19 21:18                   ` Mathieu Desnoyers
2008-04-20 12:58                     ` Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=48063E19.9090701@goop.org \
    --to=jeremy@goop.org \
    --cc=akpm@osdl.org \
    --cc=andi@firstfloor.org \
    --cc=compudj@krystal.dyndns.org \
    --cc=fche@redhat.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=rostedt@goodmis.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.