public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [GIT PULL] x86/fpu for v2.6.32
@ 2009-09-11 19:44 Ingo Molnar
  2009-09-11 21:30 ` Jesper Juhl
  0 siblings, 1 reply; 4+ messages in thread
From: Ingo Molnar @ 2009-09-11 19:44 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, H. Peter Anvin, Thomas Gleixner

Linus,

Please pull the x86-fpu-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git x86-fpu-for-linus

 Thanks,

	Ingo

------------------>
Jeremy Fitzhardinge (4):
      x86: split out core __math_state_restore
      x86-32: make sure clts is batched during context switch
      x86-64: move unlazy_fpu() into lazy cpu state part of context switch
      x86-64: move clts into batch cpu state updates when preloading fpu


 arch/x86/include/asm/i387.h  |    1 +
 arch/x86/kernel/process_32.c |   27 ++++++++++++++++-----------
 arch/x86/kernel/process_64.c |   33 +++++++++++++++++++++------------
 arch/x86/kernel/traps.c      |   33 +++++++++++++++++++++++----------
 4 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 175adf5..2e75292 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -26,6 +26,7 @@ extern void fpu_init(void);
 extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
 extern asmlinkage void math_state_restore(void);
+extern void __math_state_restore(void);
 extern void init_thread_xstate(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 59f4524..a80eddd 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -350,14 +350,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 				 *next = &next_p->thread;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	bool preload_fpu;
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	__unlazy_fpu(prev_p);
+	/*
+	 * If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
 
+	__unlazy_fpu(prev_p);
 
 	/* we're going to use this soon, after a few expensive things */
-	if (next_p->fpu_counter > 5)
+	if (preload_fpu)
 		prefetch(next->xstate);
 
 	/*
@@ -398,6 +405,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
 		__switch_to_xtra(prev_p, next_p, tss);
 
+	/* If we're going to preload the fpu context, make sure clts
+	   is run while we're batching the cpu state updates. */
+	if (preload_fpu)
+		clts();
+
 	/*
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
@@ -407,15 +419,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	arch_end_context_switch(next_p);
 
-	/* If the task has used fpu the last 5 timeslices, just do a full
-	 * restore of the math state immediately to avoid the trap; the
-	 * chances of needing FPU soon are obviously high now
-	 *
-	 * tsk_used_math() checks prevent calling math_state_restore(),
-	 * which can sleep in the case of !tsk_used_math()
-	 */
-	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
-		math_state_restore();
+	if (preload_fpu)
+		__math_state_restore();
 
 	/*
 	 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ebefb54..a28279d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -386,9 +386,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 	unsigned fsindex, gsindex;
+	bool preload_fpu;
+
+	/*
+	 * If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
 
 	/* we're going to use this soon, after a few expensive things */
-	if (next_p->fpu_counter > 5)
+	if (preload_fpu)
 		prefetch(next->xstate);
 
 	/*
@@ -419,6 +427,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	load_TLS(next, cpu);
 
+	/* Must be after DS reload */
+	unlazy_fpu(prev_p);
+
+	/* Make sure cpu is ready for new context */
+	if (preload_fpu)
+		clts();
+
 	/*
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
@@ -459,9 +474,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
 	prev->gsindex = gsindex;
 
-	/* Must be after DS reload */
-	unlazy_fpu(prev_p);
-
 	/*
 	 * Switch the PDA and FPU contexts.
 	 */
@@ -480,15 +492,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 		__switch_to_xtra(prev_p, next_p, tss);
 
-	/* If the task has used fpu the last 5 timeslices, just do a full
-	 * restore of the math state immediately to avoid the trap; the
-	 * chances of needing FPU soon are obviously high now
-	 *
-	 * tsk_used_math() checks prevent calling math_state_restore(),
-	 * which can sleep in the case of !tsk_used_math()
+	/*
+	 * Preload the FPU context, now that we've determined that the
+	 * task is likely to be using it. 
 	 */
-	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
-		math_state_restore();
+	if (preload_fpu)
+		__math_state_restore();
 	return prev_p;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5f935f0..71b9166 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -814,6 +814,28 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 }
 
 /*
+ * __math_state_restore assumes that cr0.TS is already clear and the
+ * fpu state is all ready for use.  Used during context switch.
+ */
+void __math_state_restore(void)
+{
+	struct thread_info *thread = current_thread_info();
+	struct task_struct *tsk = thread->task;
+
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		stts();
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
+
+	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
+	tsk->fpu_counter++;
+}
+
+/*
  * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
  *
@@ -844,17 +866,8 @@ asmlinkage void math_state_restore(void)
 	}
 
 	clts();				/* Allow maths ops (or we recurse) */
-	/*
-	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
-	 */
-	if (unlikely(restore_fpu_checking(tsk))) {
-		stts();
-		force_sig(SIGSEGV, tsk);
-		return;
-	}
 
-	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
-	tsk->fpu_counter++;
+	__math_state_restore();
 }
 EXPORT_SYMBOL_GPL(math_state_restore);
 

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [GIT PULL] x86/fpu for v2.6.32
  2009-09-11 19:44 [GIT PULL] x86/fpu for v2.6.32 Ingo Molnar
@ 2009-09-11 21:30 ` Jesper Juhl
  2009-09-12  3:47   ` Arjan van de Ven
  0 siblings, 1 reply; 4+ messages in thread
From: Jesper Juhl @ 2009-09-11 21:30 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, linux-kernel, H. Peter Anvin, Thomas Gleixner


Hi Ingo,

First of all, I want to say that I have no real objections to the patch 
what-so-ever. The following is merely to satisfy my own personal 
curiosity - and I'm sure you have better things to do than satisfy my 
curiosity, so if you want to ignore me; feel free :-)

On Fri, 11 Sep 2009, Ingo Molnar wrote:

[...]
> +	bool preload_fpu;
>  
>  	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
>  
> -	__unlazy_fpu(prev_p);
> +	/*
> +	 * If the task has used fpu the last 5 timeslices, just do a full
> +	 * restore of the math state immediately to avoid the trap; the
> +	 * chances of needing FPU soon are obviously high now
> +	 */
> +	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
>  
> +	__unlazy_fpu(prev_p);
>  
[...]
> +	 * If the task has used fpu the last 5 timeslices, just do a full
> +	 * restore of the math state immediately to avoid the trap; the
> +	 * chances of needing FPU soon are obviously high now
> +	 */
> +	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
>  

I'm wondering about two things:

1) Where did that magic constant "5" come from?
Is there some fundamental thing about CPU's, cache layout, scheduling, 
benchmarks or something else that I just don't know that makes 5 the magic 
"right number"?  Why not 2, 3, 9 or 42?

2) Instead of writing that constant "5" multiple places, wouldn't it be 
nicer to use a  '#define FPU_RESTORE_TIMESLICES 5'  or  'static const 
unsigned int FPU_RESTORE_TIMESLICES = 5;'  instead?
Personally I hate magic numbers that are used in more than one location. 
I'd much rather have a constant with a sane name defined once (with a 
comment explaining it) and then see the name used in multiple places.


That's all.  :)


-- 
Jesper Juhl <jj@chaosbits.net>             http://www.chaosbits.net/
Plain text mails only, please      http://www.expita.com/nomime.html
Don't top-post  http://www.catb.org/~esr/jargon/html/T/top-post.html


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [GIT PULL] x86/fpu for v2.6.32
  2009-09-11 21:30 ` Jesper Juhl
@ 2009-09-12  3:47   ` Arjan van de Ven
  2009-09-12 21:53     ` Jesper Juhl
  0 siblings, 1 reply; 4+ messages in thread
From: Arjan van de Ven @ 2009-09-12  3:47 UTC (permalink / raw)
  To: Jesper Juhl
  Cc: Ingo Molnar, Linus Torvalds, linux-kernel, H. Peter Anvin,
	Thomas Gleixner

On Fri, 11 Sep 2009 23:30:42 +0200 (CEST)
Jesper Juhl <jj@chaosbits.net> wrote:

> 
> Hi Ingo,
> 
> First of all, I want to say that I have no real objections to the
> patch what-so-ever. The following is merely to satisfy my own
> personal curiosity - and I'm sure you have better things to do than
> satisfy my curiosity, so if you want to ignore me; feel free :-)
> 
> On Fri, 11 Sep 2009, Ingo Molnar wrote:
> 
> [...]
> > +	bool preload_fpu;
> >  
> >  	/* never put a printk in __switch_to... printk() calls
> > wake_up*() indirectly */ 
> > -	__unlazy_fpu(prev_p);
> > +	/*
> > +	 * If the task has used fpu the last 5 timeslices, just do
> > a full
> > +	 * restore of the math state immediately to avoid the
> > trap; the
> > +	 * chances of needing FPU soon are obviously high now
> > +	 */
> > +	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter
> > > 5; 
> > +	__unlazy_fpu(prev_p);
> >  
> [...]
> > +	 * If the task has used fpu the last 5 timeslices, just do
> > a full
> > +	 * restore of the math state immediately to avoid the
> > trap; the
> > +	 * chances of needing FPU soon are obviously high now
> > +	 */
> > +	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter
> > > 5; 
> 
> I'm wondering about two things:
> 

(btw this is not new code)

> 1) Where did that magic constant "5" come from?

from me running a bunch of experiments noticing that tasks that do 5
do on average many more. Could it be 4? Perhaps. 

> Is there some fundamental thing about CPU's, cache layout,
> scheduling, benchmarks or something else that I just don't know that
> makes 5 the magic "right number"?  Why not 2, 3, 9 or 42?

At some point *a* number must be taken. That is currently 5.
Show us that any other number is better and we'll switch ;-)




-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [GIT PULL] x86/fpu for v2.6.32
  2009-09-12  3:47   ` Arjan van de Ven
@ 2009-09-12 21:53     ` Jesper Juhl
  0 siblings, 0 replies; 4+ messages in thread
From: Jesper Juhl @ 2009-09-12 21:53 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, Linus Torvalds, linux-kernel, H. Peter Anvin,
	Thomas Gleixner

On Sat, 12 Sep 2009, Arjan van de Ven wrote:

> On Fri, 11 Sep 2009 23:30:42 +0200 (CEST)
> Jesper Juhl <jj@chaosbits.net> wrote:
> 
> > 
> > Hi Ingo,
> > 
> > First of all, I want to say that I have no real objections to the
> > patch what-so-ever. The following is merely to satisfy my own
> > personal curiosity - and I'm sure you have better things to do than
> > satisfy my curiosity, so if you want to ignore me; feel free :-)
> > 
> > On Fri, 11 Sep 2009, Ingo Molnar wrote:
> > 
> > [...]
> > > +	bool preload_fpu;
> > >  
> > >  	/* never put a printk in __switch_to... printk() calls
> > > wake_up*() indirectly */ 
> > > -	__unlazy_fpu(prev_p);
> > > +	/*
> > > +	 * If the task has used fpu the last 5 timeslices, just do
> > > a full
> > > +	 * restore of the math state immediately to avoid the
> > > trap; the
> > > +	 * chances of needing FPU soon are obviously high now
> > > +	 */
> > > +	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter
> > > > 5; 
> > > +	__unlazy_fpu(prev_p);
> > >  
> > [...]
> > > +	 * If the task has used fpu the last 5 timeslices, just do
> > > a full
> > > +	 * restore of the math state immediately to avoid the
> > > trap; the
> > > +	 * chances of needing FPU soon are obviously high now
> > > +	 */
> > > +	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter
> > > > 5; 
> > 
> > I'm wondering about two things:
> > 
> 
> (btw this is not new code)
> 
> > 1) Where did that magic constant "5" come from?
> 
> from me running a bunch of experiments noticing that tasks that do 5
> do on average many more. Could it be 4? Perhaps. 
> 
> > Is there some fundamental thing about CPU's, cache layout,
> > scheduling, benchmarks or something else that I just don't know that
> > makes 5 the magic "right number"?  Why not 2, 3, 9 or 42?
> 
> At some point *a* number must be taken. That is currently 5.
> Show us that any other number is better and we'll switch ;-)
> 
Thank you for taking the time to answer and explain.
I'm sure 5 is fine, I was merely curious :-)

-- 
Jesper Juhl <jj@chaosbits.net>             http://www.chaosbits.net/
Plain text mails only, please      http://www.expita.com/nomime.html
Don't top-post  http://www.catb.org/~esr/jargon/html/T/top-post.html


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2009-09-12 21:53 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-09-11 19:44 [GIT PULL] x86/fpu for v2.6.32 Ingo Molnar
2009-09-11 21:30 ` Jesper Juhl
2009-09-12  3:47   ` Arjan van de Ven
2009-09-12 21:53     ` Jesper Juhl

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox