[PATCHv1] x86: don't schedule when handling #NM exception

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCHv1] x86: don't schedule when handling #NM exception
@ 2014-03-10 16:17 David Vrabel
  2014-03-10 16:40 ` H. Peter Anvin
                   ` (3 more replies)
  0 siblings, 4 replies; 67+ messages in thread
From: David Vrabel @ 2014-03-10 16:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: x86, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, xen-devel,
	David Vrabel

math_state_restore() is called from the #NM exception handler.  It may
do a GFP_KERNEL allocation (in init_fpu()) which may schedule.

Change this allocation to GFP_ATOMIC, but leave all the other callers
of init_fpu() or fpu_alloc() using GFP_KERNEL.

do_group_exit() will also call schedule() so replace the call with
force_sig(SIGKILL, tsk) instead.

Scheduling in math_state_restore() is particularly bad in Xen PV
guests since the Xen clears CR0.TS before raising #NM exception (in
the expectation that the #NM handler always clears TS).  If task A is
descheduled and task B is scheduled.  Task B may end up with CR0.TS
unexpectedly clear and any FPU instructions will not raise #NM and
will corrupt task A's FPU state instead.

Reported-by: Zhu Yanhai <zhu.yanhai@gmail.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/include/asm/fpu-internal.h |    4 ++--
 arch/x86/include/asm/i387.h         |    2 +-
 arch/x86/include/asm/xsave.h        |    2 +-
 arch/x86/kernel/i387.c              |   16 ++++++++--------
 arch/x86/kernel/process.c           |    2 +-
 arch/x86/kernel/traps.c             |    9 ++-------
 arch/x86/kernel/xsave.c             |    4 ++--
 arch/x86/kvm/x86.c                  |    2 +-
 8 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index cea1c76..e32a73c 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -567,11 +567,11 @@ static bool fpu_allocated(struct fpu *fpu)
 	return fpu->state != NULL;
 }
 
-static inline int fpu_alloc(struct fpu *fpu)
+static inline int fpu_alloc(struct fpu *fpu, gfp_t gfp)
 {
 	if (fpu_allocated(fpu))
 		return 0;
-	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, gfp);
 	if (!fpu->state)
 		return -ENOMEM;
 	WARN_ON((unsigned long)fpu->state & 15);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ed8089d..5559c80 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -18,7 +18,7 @@
 struct pt_regs;
 struct user_i387_struct;
 
-extern int init_fpu(struct task_struct *child);
+extern int init_fpu(struct task_struct *child, gfp_t gfp);
 extern void fpu_finit(struct fpu *fpu);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 extern void math_state_restore(void);
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 5547389..e6d6610 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -44,7 +44,7 @@ extern struct xsave_struct *init_xstate_buf;
 
 extern void xsave_init(void);
 extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
-extern int init_fpu(struct task_struct *child);
+extern int init_fpu(struct task_struct *child, gfp_t gfp);
 
 static inline int fpu_xrstor_checking(struct xsave_struct *fx)
 {
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e8368c6..baf94ad 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -217,7 +217,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
  * value at reset if we support XMM instructions and then
  * remember the current task has used the FPU.
  */
-int init_fpu(struct task_struct *tsk)
+int init_fpu(struct task_struct *tsk, gfp_t gfp)
 {
 	int ret;
 
@@ -231,7 +231,7 @@ int init_fpu(struct task_struct *tsk)
 	/*
 	 * Memory allocation at the first usage of the FPU and other state.
 	 */
-	ret = fpu_alloc(&tsk->thread.fpu);
+	ret = fpu_alloc(&tsk->thread.fpu, gfp);
 	if (ret)
 		return ret;
 
@@ -266,7 +266,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 	if (!cpu_has_fxsr)
 		return -ENODEV;
 
-	ret = init_fpu(target);
+	ret = init_fpu(target, GFP_KERNEL);
 	if (ret)
 		return ret;
 
@@ -285,7 +285,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (!cpu_has_fxsr)
 		return -ENODEV;
 
-	ret = init_fpu(target);
+	ret = init_fpu(target, GFP_KERNEL);
 	if (ret)
 		return ret;
 
@@ -318,7 +318,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	if (!cpu_has_xsave)
 		return -ENODEV;
 
-	ret = init_fpu(target);
+	ret = init_fpu(target, GFP_KERNEL);
 	if (ret)
 		return ret;
 
@@ -348,7 +348,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (!cpu_has_xsave)
 		return -ENODEV;
 
-	ret = init_fpu(target);
+	ret = init_fpu(target, GFP_KERNEL);
 	if (ret)
 		return ret;
 
@@ -515,7 +515,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
 	struct user_i387_ia32_struct env;
 	int ret;
 
-	ret = init_fpu(target);
+	ret = init_fpu(target, GFP_KERNEL);
 	if (ret)
 		return ret;
 
@@ -546,7 +546,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	struct user_i387_ia32_struct env;
 	int ret;
 
-	ret = init_fpu(target);
+	ret = init_fpu(target, GFP_KERNEL);
 	if (ret)
 		return ret;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95..10705a6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -69,7 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	*dst = *src;
 	if (fpu_allocated(&src->thread.fpu)) {
 		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
-		ret = fpu_alloc(&dst->thread.fpu);
+		ret = fpu_alloc(&dst->thread.fpu, GFP_KERNEL);
 		if (ret)
 			return ret;
 		fpu_copy(dst, src);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 57409f6..c8078d2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -624,18 +624,13 @@ void math_state_restore(void)
 	struct task_struct *tsk = current;
 
 	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(tsk)) {
+		if (init_fpu(tsk, GFP_ATOMIC)) {
 			/*
 			 * ran out of memory!
 			 */
-			do_group_exit(SIGKILL);
+			force_sig(SIGKILL, tsk);
 			return;
 		}
-		local_irq_disable();
 	}
 
 	__thread_fpu_begin(tsk);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c..0512744 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -347,7 +347,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
 	if (!access_ok(VERIFY_READ, buf, size))
 		return -EACCES;
 
-	if (!used_math() && init_fpu(tsk))
+	if (!used_math() && init_fpu(tsk, GFP_KERNEL))
 		return -1;
 
 	if (!static_cpu_has(X86_FEATURE_FPU))
@@ -628,7 +628,7 @@ void eager_fpu_init(void)
 	 * This is same as math_state_restore(). But use_xsave() is
 	 * not yet patched to use math_state_restore().
 	 */
-	init_fpu(current);
+	init_fpu(current, GFP_KERNEL);
 	__thread_fpu_begin(current);
 	if (cpu_has_xsave)
 		xrstor_state(init_xstate_buf, -1);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b85784..fc74b68 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6619,7 +6619,7 @@ int fx_init(struct kvm_vcpu *vcpu)
 {
 	int err;
 
-	err = fpu_alloc(&vcpu->arch.guest_fpu);
+	err = fpu_alloc(&vcpu->arch.guest_fpu, gfp);
 	if (err)
 		return err;
 
-- 
1.7.2.5


^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-10 16:17 [PATCHv1] x86: don't schedule when handling #NM exception David Vrabel
@ 2014-03-10 16:40 ` H. Peter Anvin
  2014-03-10 17:15   ` David Vrabel
  2014-03-10 17:15   ` David Vrabel
  2014-03-10 16:40 ` H. Peter Anvin
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-10 16:40 UTC (permalink / raw)
  To: David Vrabel, linux-kernel
  Cc: x86, Thomas Gleixner, Ingo Molnar, xen-devel, Sarah Newman

On 03/10/2014 09:17 AM, David Vrabel wrote:
> math_state_restore() is called from the #NM exception handler.  It may
> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
> 
> Change this allocation to GFP_ATOMIC, but leave all the other callers
> of init_fpu() or fpu_alloc() using GFP_KERNEL.

And what the [Finnish] do you do if GFP_ATOMIC fails?

> do_group_exit() will also call schedule() so replace the call with
> force_sig(SIGKILL, tsk) instead.
> 
> Scheduling in math_state_restore() is particularly bad in Xen PV
> guests since the Xen clears CR0.TS before raising #NM exception (in
> the expectation that the #NM handler always clears TS).  If task A is
> descheduled and task B is scheduled.  Task B may end up with CR0.TS
> unexpectedly clear and any FPU instructions will not raise #NM and
> will corrupt task A's FPU state instead.

Yes, we know Xen is completely broken in this respect.

Anyway, I have a patchset from Sarah Newman which I have been reviewing
privately so far (which looks good and should be posted publicly -- the
holdup has not been Sarah's code but a combination of my bandwidth and
trying to get some preexisting bugs in the eagerfpu code dealt with,
which Suresh Siddha fortunately stepped up to do and which we now have a
solution for.)

Sarah's patchset switches Xen PV to use eagerfpu unconditionally, which
removes the dependency on #NM and is the right thing to do.

Sarah, could you post the latest patchset to LKML so it can be publicly
reviewed?  I'm sorry for the slow response time on my end.

	-hpa


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-10 16:40 ` H. Peter Anvin
@ 2014-03-10 17:15   ` David Vrabel
  2014-03-10 17:25       ` H. Peter Anvin
  2014-03-17  3:13       ` Sarah Newman
  2014-03-10 17:15   ` David Vrabel
  1 sibling, 2 replies; 67+ messages in thread
From: David Vrabel @ 2014-03-10 17:15 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, xen-devel,
	Sarah Newman

On 10/03/14 16:40, H. Peter Anvin wrote:
> On 03/10/2014 09:17 AM, David Vrabel wrote:
>> math_state_restore() is called from the #NM exception handler.  It may
>> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
>>
>> Change this allocation to GFP_ATOMIC, but leave all the other callers
>> of init_fpu() or fpu_alloc() using GFP_KERNEL.
> 
> And what the [Finnish] do you do if GFP_ATOMIC fails?

The same thing it used to do -- kill the task with SIGKILL.  I haven't
changed this behaviour.

> Sarah's patchset switches Xen PV to use eagerfpu unconditionally, which
> removes the dependency on #NM and is the right thing to do.

Ok. I'll wait for this series and not pursue this patch any further.

David

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-10 17:15   ` David Vrabel
@ 2014-03-10 17:25       ` H. Peter Anvin
  2014-03-17  3:13       ` Sarah Newman
  1 sibling, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-10 17:25 UTC (permalink / raw)
  To: David Vrabel
  Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, xen-devel,
	Sarah Newman

On 03/10/2014 10:15 AM, David Vrabel wrote:
>>
>> And what the [Finnish] do you do if GFP_ATOMIC fails?
> 
> The same thing it used to do -- kill the task with SIGKILL.  I haven't
> changed this behaviour.
> 

No, you just made it many orders of magnitude more likely.

	-hpa


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
@ 2014-03-10 17:25       ` H. Peter Anvin
  0 siblings, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-10 17:25 UTC (permalink / raw)
  To: David Vrabel
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel, Sarah Newman,
	xen-devel

On 03/10/2014 10:15 AM, David Vrabel wrote:
>>
>> And what the [Finnish] do you do if GFP_ATOMIC fails?
> 
> The same thing it used to do -- kill the task with SIGKILL.  I haven't
> changed this behaviour.
> 

No, you just made it many orders of magnitude more likely.

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-10 17:15   ` David Vrabel
@ 2014-03-17  3:13       ` Sarah Newman
  2014-03-17  3:13       ` Sarah Newman
  1 sibling, 0 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  3:13 UTC (permalink / raw)
  To: David Vrabel, H. Peter Anvin
  Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, xen-devel

On 03/10/2014 10:15 AM, David Vrabel wrote:
> On 10/03/14 16:40, H. Peter Anvin wrote:
>> On 03/10/2014 09:17 AM, David Vrabel wrote:
>>> math_state_restore() is called from the #NM exception handler.  It may
>>> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
>>>
>>> Change this allocation to GFP_ATOMIC, but leave all the other callers
>>> of init_fpu() or fpu_alloc() using GFP_KERNEL.
>>
>> And what the [Finnish] do you do if GFP_ATOMIC fails?
> 
> The same thing it used to do -- kill the task with SIGKILL.  I haven't
> changed this behaviour.
> 
>> Sarah's patchset switches Xen PV to use eagerfpu unconditionally, which
>> removes the dependency on #NM and is the right thing to do.
> 
> Ok. I'll wait for this series and not pursue this patch any further.

Sorry, this got swallowed by my mail filter.

I did some more testing and I think eagerfpu is going to noticeably slow things down. When I ran
"time sysbench --num-threads=64 --test=threads run" I saw on the order of 15% more time spent in
system mode and this seemed consistent over different runs.

As for GFP_ATOMIC, unfortunately I don't know a sanctioned test here so I rolled my own. This test
sequentially allocated math-using processes in the background until it could not any more.  On a
64MB instance, I saw 10% fewer processes allocated with GFP_ATOMIC compared to GFP_KERNEL when I
continually allocated new processes up to OOM conditions (256 vs 228.)  A similar test on a
different RFS and a kernel using GFP_NOWAIT showed pretty much no difference in how many processes I
could allocate. This doesn't seem too bad unless there is some kind of fragmentation over time which
would cause worse performance.

Since performance degradation applies at all times and not just under extreme conditions, I think
the lesser evil will actually be GFP_ATOMIC.  But it's not necessary to always use GFP_ATOMIC, only
under certain conditions - IE when the xen PVABI forces us to.

Patches will be supplied shortly.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
@ 2014-03-17  3:13       ` Sarah Newman
  0 siblings, 0 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  3:13 UTC (permalink / raw)
  To: David Vrabel, H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel, xen-devel

On 03/10/2014 10:15 AM, David Vrabel wrote:
> On 10/03/14 16:40, H. Peter Anvin wrote:
>> On 03/10/2014 09:17 AM, David Vrabel wrote:
>>> math_state_restore() is called from the #NM exception handler.  It may
>>> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
>>>
>>> Change this allocation to GFP_ATOMIC, but leave all the other callers
>>> of init_fpu() or fpu_alloc() using GFP_KERNEL.
>>
>> And what the [Finnish] do you do if GFP_ATOMIC fails?
> 
> The same thing it used to do -- kill the task with SIGKILL.  I haven't
> changed this behaviour.
> 
>> Sarah's patchset switches Xen PV to use eagerfpu unconditionally, which
>> removes the dependency on #NM and is the right thing to do.
> 
> Ok. I'll wait for this series and not pursue this patch any further.

Sorry, this got swallowed by my mail filter.

I did some more testing and I think eagerfpu is going to noticeably slow things down. When I ran
"time sysbench --num-threads=64 --test=threads run" I saw on the order of 15% more time spent in
system mode and this seemed consistent over different runs.

As for GFP_ATOMIC, unfortunately I don't know a sanctioned test here so I rolled my own. This test
sequentially allocated math-using processes in the background until it could not any more.  On a
64MB instance, I saw 10% fewer processes allocated with GFP_ATOMIC compared to GFP_KERNEL when I
continually allocated new processes up to OOM conditions (256 vs 228.)  A similar test on a
different RFS and a kernel using GFP_NOWAIT showed pretty much no difference in how many processes I
could allocate. This doesn't seem too bad unless there is some kind of fragmentation over time which
would cause worse performance.

Since performance degradation applies at all times and not just under extreme conditions, I think
the lesser evil will actually be GFP_ATOMIC.  But it's not necessary to always use GFP_ATOMIC, only
under certain conditions - IE when the xen PVABI forces us to.

Patches will be supplied shortly.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17  3:13       ` Sarah Newman
  (?)
@ 2014-03-17  3:30       ` Sarah Newman
  2014-03-17  8:38         ` Jan Beulich
  -1 siblings, 1 reply; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  3:30 UTC (permalink / raw)
  To: konrad.wilk, boris.ostrovsky, david.vrabel, xen-devel, zhu.yanhai
  Cc: Sarah Newman

The device not available trap handler in the mainline Linux kernel
has not been PVABI compliant since 2.6.26, leading to FPU register
corruption under extremely rare circumstances.  While this should and
hopefully will be fixed, the performance of the fix vs. the original
behavior may lead to one or the other being desirable under different
workloads.

Add an option, dev_na_ts_allowed, which on a per dom0/U basis breaks
PVABI by keeping CR0 TS set during the trap device not available.
PVABI compatibility is advertised using CPUID such that the OS can
choose its behavior accordingly.

This option is disabled by default.

Reported-by: Zhu Yanhai <zhu.yanhai@gmail.com>
Signed-off-by: Sarah Newman <srn@prgmr.com>
---
 docs/man/xl.cfg.pod.5                   |   20 ++++++++++++++++++++
 tools/libxc/xc_dom_x86.c                |    8 ++++++++
 tools/libxc/xenctrl.h                   |    1 +
 tools/libxl/libxl_create.c              |    1 +
 tools/libxl/libxl_types.idl             |    1 +
 tools/libxl/libxl_x86.c                 |    4 ++++
 tools/libxl/xl_cmdimpl.c                |    1 +
 tools/libxl/xl_sxp.c                    |    2 ++
 tools/python/xen/lowlevel/xc/xc.c       |   21 +++++++++++++++++++++
 tools/python/xen/xend/XendConfig.py     |    4 ++++
 tools/python/xen/xend/XendDomainInfo.py |    4 ++++
 tools/python/xen/xm/create.py           |   11 ++++++++++-
 tools/python/xen/xm/xenapi_create.py    |    1 +
 xen/arch/x86/domain_build.c             |    6 ++++++
 xen/arch/x86/domctl.c                   |    6 ++++++
 xen/arch/x86/traps.c                    |   15 ++++++++++++---
 xen/include/asm-x86/domain.h            |    2 ++
 xen/include/public/arch-x86/cpuid.h     |    5 +++++
 xen/include/public/domctl.h             |   11 +++++++++++
 19 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index a6663b9..4fc3953 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -696,6 +696,26 @@ anyway.
 =item B<pvh=BOOLEAN>
 Selects whether to run this PV guest in an HVM container. Default is 0.
 
+=item B<dev_na_ts_allowed=BOOLEAN>
+
+On the x86 platform, selects whether to break PVABI compatibility by 
+setting the task switch bit in control register 0 before entering 
+the device not available trap.
+
+Linux kernels derived from mainline between v2.6.26 through at least 3.14
+may in very rare circumstances have their FPU registers corrupted without
+this enabled.  
+
+Kernels that have the flag 'eagerfpu' in the flags in /proc/cpuinfo do not
+require this to be enabled, but it may be advisable to enable this anyway
+for performance reasons. On machines with the 'fxsr' flag (all x86 CPUs 
+starting from a Pentium 2) and a kernel version of v3.10 or newer 
+may also manually add 'eagerfpu=on' to their kernel command line 
+in order to avoid this bug.
+
+Leaving this on should not cause issues for any kernels derived from
+mainline.
+
 =back
 
 =head2 Fully-virtualised (HVM) Guest Specific Options
diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c
index e034d62..83ebf7b 100644
--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -950,6 +950,14 @@ int xc_dom_feature_translated(struct xc_dom_image *dom)
     return elf_xen_feature_get(XENFEAT_auto_translated_physmap, dom->f_active);
 }
 
+int xc_domain_dev_na_ts_allowed(xc_interface *xch, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+    domctl.cmd = XEN_DOMCTL_dev_na_ts_allowed;
+    domctl.domain = (domid_t)domid;
+    domctl.u.dev_na_ts_allowed.allowed = 1;
+    return do_domctl(xch, &domctl);
+}
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 13f816b..2947c00 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -1913,6 +1913,7 @@ int xc_cpuid_apply_policy(xc_interface *xch,
 void xc_cpuid_to_str(const unsigned int *regs,
                      char **strs); /* some strs[] may be NULL if ENOMEM */
 int xc_mca_op(xc_interface *xch, struct xen_mc *mc);
+int xc_domain_dev_na_ts_allowed(xc_interface *xch, uint32_t domid);
 #endif
 
 struct xc_px_val {
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index d015cf4..3a09cf6 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -341,6 +341,7 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
         break;
     case LIBXL_DOMAIN_TYPE_PV:
         libxl_defbool_setdefault(&b_info->u.pv.e820_host, false);
+        libxl_defbool_setdefault(&b_info->u.pv.dev_na_ts_allowed, false);
         if (b_info->shadow_memkb == LIBXL_MEMKB_DEFAULT)
             b_info->shadow_memkb = 0;
         if (b_info->u.pv.slack_memkb == LIBXL_MEMKB_DEFAULT)
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 612645c..17b1ba2 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -383,6 +383,7 @@ libxl_domain_build_info = Struct("domain_build_info",[
                                       ("features", string, {'const': True}),
                                       # Use host's E820 for PCI passthrough.
                                       ("e820_host", libxl_defbool),
+                                      ("dev_na_ts_allowed", libxl_defbool),
                                       ])),
                  ("invalid", None),
                  ], keyvar_init_val = "LIBXL_DOMAIN_TYPE_INVALID")),
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index b11d036..6032d99 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -308,6 +308,10 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
         }
     }
 
+    if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_PV &&
+                libxl_defbool_val(d_config->b_info.u.pv.dev_na_ts_allowed))
+        xc_domain_dev_na_ts_allowed(ctx->xch, domid);
+
     return ret;
 }
 
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 6b1ebfa..0016512 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -1481,6 +1481,7 @@ skip_vfb:
      * after guest starts is done (with PCI devices passed in). */
     if (c_info->type == LIBXL_DOMAIN_TYPE_PV) {
         xlu_cfg_get_defbool(config, "e820_host", &b_info->u.pv.e820_host, 0);
+        xlu_cfg_get_defbool(config, "dev_na_ts_allowed", &b_info->u.pv.dev_na_ts_allowed, 0);
     }
 
     if (!xlu_cfg_get_list (config, "pci", &pcis, 0, 0)) {
diff --git a/tools/libxl/xl_sxp.c b/tools/libxl/xl_sxp.c
index a16a025..a673614 100644
--- a/tools/libxl/xl_sxp.c
+++ b/tools/libxl/xl_sxp.c
@@ -152,6 +152,8 @@ void printf_info_sexp(int domid, libxl_domain_config *d_config)
         printf("\t\t\t(ramdisk %s)\n", b_info->u.pv.ramdisk);
         printf("\t\t\t(e820_host %s)\n",
                libxl_defbool_to_string(b_info->u.pv.e820_host));
+        printf("\t\t\t(dev_na_ts_allowed %s)\n",
+               libxl_defbool_to_string(b_info->u.pv.dev_na_ts_allowed));
         printf("\t\t)\n");
         break;
     default:
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 737bdac..afeaea4 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -1779,6 +1779,20 @@ static PyObject *pyxc_domain_disable_migrate(XcObject *self, PyObject *args)
     return zero;
 }
 
+static PyObject *pyxc_domain_dev_na_ts_allowed(XcObject *self, PyObject *args)
+{
+    uint32_t dom;
+
+    if (!PyArg_ParseTuple(args, "i", &dom))
+        return NULL;
+
+    if (xc_domain_dev_na_ts_allowed(self->xc_handle, dom) != 0)
+        return pyxc_error_to_exception(self->xc_handle);
+
+    Py_INCREF(zero);
+    return zero;
+}
+
 static PyObject *pyxc_domain_send_trigger(XcObject *self,
                                           PyObject *args,
                                           PyObject *kwds)
@@ -2731,6 +2745,13 @@ static PyMethodDef pyxc_methods[] = {
       " dom        [int]: Domain whose TSC mode is being set.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
+    { "domain_dev_na_ts_allowed",
+      (PyCFunction)pyxc_domain_dev_na_ts_allowed,
+      METH_VARARGS, "\n"
+      "Marks domain as needing task switching enabled during x86 device na trap\n"
+      " dom        [int]: Identifier of domain.\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
     { "domain_send_trigger",
       (PyCFunction)pyxc_domain_send_trigger,
       METH_VARARGS | METH_KEYWORDS, "\n"
diff --git a/tools/python/xen/xend/XendConfig.py b/tools/python/xen/xend/XendConfig.py
index 4a226a7..4e5c880 100644
--- a/tools/python/xen/xend/XendConfig.py
+++ b/tools/python/xen/xend/XendConfig.py
@@ -158,6 +158,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
     'monitor_path': str,
     'nographic': int,
     'nomigrate': int,
+    'dev_na_ts_allowed' : int,
     'pae' : int,
     'rtc_timeoffset': int,
     'parallel': str,
@@ -511,6 +512,9 @@ class XendConfig(dict):
         if 'nomigrate' not in self['platform']:
             self['platform']['nomigrate'] = 0
 
+        if 'dev_na_ts_allowed' not in self['platform']:
+            self['platform']['dev_na_ts_allowed'] = 0
+
         if self.is_hvm():
             if 'timer_mode' not in self['platform']:
                 self['platform']['timer_mode'] = 1
diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py
index 8d4ff5c..2646291 100644
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -2606,6 +2606,10 @@ class XendDomainInfo:
         if nomigrate is not None and long(nomigrate) != 0:
             xc.domain_disable_migrate(self.domid)
 
+        dev_na_ts_allowed = self.info["platform"].get("dev_na_ts_allowed")
+        if arch.type == "x86" and not hvm and long(dev_na_ts_allowed) != 0:
+            xc.domain_dev_na_ts_allowed(self.domid)
+
         # Optionally enable virtual HPET
         hpet = self.info["platform"].get("hpet")
         if hvm and hpet is not None:
diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py
index 22841aa..451f7bf 100644
--- a/tools/python/xen/xm/create.py
+++ b/tools/python/xen/xm/create.py
@@ -233,6 +233,12 @@ gopts.var('nomigrate', val='NOMIGRATE',
           fn=set_int, default=0,
           use="""migratability (0=migration enabled, 1=migration disabled).""")
 
+gopts.var('dev_na_ts_allowed', val='DEV_NA_TS_ALLOWED',
+          fn=set_int, default=0,
+          use="""Enable task switch during device not available trap
+          (Default is 0).
+          Recommended for x86 Linux kernels derived from mainline from v2.6.26 on.""")
+
 gopts.var('vpt_align', val='VPT_ALIGN',
           fn=set_int, default=1,
           use="Enable aligning all periodic vpt to reduce timer interrupts.")
@@ -776,6 +782,9 @@ def configure_image(vals):
     if vals.nomigrate is not None:
         config_image.append(['nomigrate', vals.nomigrate])
 
+    if vals.dev_na_ts_allowed is not None:
+        config_image.append(['dev_na_ts_allowed', vals.dev_na_ts_allowed])
+
     return config_image
     
 def configure_disks(config_devs, vals):
@@ -1094,7 +1103,7 @@ def make_config(vals):
                    'on_reboot', 'on_crash', 'features', 'on_xend_start',
                    'on_xend_stop', 'target', 'cpuid', 'cpuid_check',
                    'machine_address_size', 'suppress_spurious_page_faults',
-                   'description'])
+                   'description', 'dev_na_ts_allowed', ])
 
     vcpu_conf()
     if vals.uuid is not None:
diff --git a/tools/python/xen/xm/xenapi_create.py b/tools/python/xen/xm/xenapi_create.py
index 346ff20..dd30b41 100644
--- a/tools/python/xen/xm/xenapi_create.py
+++ b/tools/python/xen/xm/xenapi_create.py
@@ -1074,6 +1074,7 @@ class sxp2xml:
             'xen_platform_pci',
             'tsc_mode'
             'description',
+            'dev_na_ts_allowed',
             'nomigrate'
         ]
 
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 84ce392..0e8186f 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -126,6 +126,9 @@ boolean_param("dom0_shadow", opt_dom0_shadow);
 static char __initdata opt_dom0_ioports_disable[200] = "";
 string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
 
+static bool_t __initdata opt_dom0_dev_na_ts_allowed;
+boolean_param("dev_na_ts_allowed", opt_dom0_dev_na_ts_allowed);
+
 /* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
 #define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
 #define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
@@ -1061,6 +1064,9 @@ int __init construct_dom0(
         if ( paging_enable(d, PG_SH_enable) == 0 ) 
             paging_update_paging_modes(v);
 
+    if ( opt_dom0_dev_na_ts_allowed )
+        d->arch.pv_domain.dev_na_ts_allowed = 1;
+
     if ( supervisor_mode_kernel )
     {
         v->arch.pv_vcpu.kernel_ss &= ~3;
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 26635ff..0ad1708 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1256,6 +1256,12 @@ long arch_do_domctl(
     }
     break;
 
+    case XEN_DOMCTL_dev_na_ts_allowed:
+    {
+        d->arch.pv_domain.dev_na_ts_allowed = domctl->u.dev_na_ts_allowed.allowed;
+    }
+    break;
+
     default:
         ret = iommu_do_domctl(domctl, d, u_domctl);
         break;
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index ed4ae2d..cfe7c8d 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -713,8 +713,12 @@ int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
             *ebx = 0x40000200;
         *ecx = 0;          /* Features 1 */
         *edx = 0;          /* Features 2 */
-        if ( is_pv_vcpu(current) )
+        if ( is_pv_vcpu(current) ) {
             *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
+
+            if (current->domain->arch.pv_domain.dev_na_ts_allowed)
+                *ecx |= XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED;
+        }
         break;
 
     case 3:
@@ -3269,8 +3273,13 @@ void do_device_not_available(struct cpu_user_regs *regs)
 
     if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS )
     {
-        do_guest_trap(TRAP_no_device, regs, 0);
-        curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
+        if (!current->domain->arch.pv_domain.dev_na_ts_allowed) {
+            do_guest_trap(TRAP_no_device, regs, 0);
+            curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
+        } else {
+            stts();
+            do_guest_trap(TRAP_no_device, regs, 0);
+        }
     }
     else
         TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 4ff89f0..75d47c9 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -232,6 +232,8 @@ struct pv_domain
      * unmask the event channel */
     bool_t auto_unmask;
 
+    bool_t dev_na_ts_allowed;
+
     /* map_domain_page() mapping cache. */
     struct mapcache_domain mapcache;
 };
diff --git a/xen/include/public/arch-x86/cpuid.h b/xen/include/public/arch-x86/cpuid.h
index d9bd627..3a165ee 100644
--- a/xen/include/public/arch-x86/cpuid.h
+++ b/xen/include/public/arch-x86/cpuid.h
@@ -65,4 +65,9 @@
 #define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
 #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
 
+/* Will the host not automatically clear CR0.TS after exiting ? */
+#define _XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED 1
+#define XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED \
+    (1u<<_XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED)
+
 #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index f22fe2e..95b83fd 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -896,6 +896,13 @@ struct xen_domctl_cacheflush {
 typedef struct xen_domctl_cacheflush xen_domctl_cacheflush_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_cacheflush_t);
 
+#if defined(__i386__) || defined(__x86_64__)
+/* XEN_DOMCTL_dev_na_ts_allowed */
+typedef struct xen_domctl_dev_na_ts_allowed {
+    uint32_t allowed; /* IN: 1: allow task switch during device NA trap */
+} xen_domctl_dev_na_ts_allowed_t;
+#endif
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -966,6 +973,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_getnodeaffinity               69
 #define XEN_DOMCTL_set_max_evtchn                70
 #define XEN_DOMCTL_cacheflush                    71
+#define XEN_DOMCTL_dev_na_ts_allowed             72
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -1025,6 +1033,9 @@ struct xen_domctl {
         struct xen_domctl_gdbsx_memio       gdbsx_guest_memio;
         struct xen_domctl_set_broken_page_p2m set_broken_page_p2m;
         struct xen_domctl_cacheflush        cacheflush;
+#if defined(__i386__) || defined(__x86_64__)
+        struct xen_domctl_dev_na_ts_allowed dev_na_ts_allowed;
+#endif
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
         uint8_t                             pad[128];
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17  3:30       ` [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed Sarah Newman
@ 2014-03-17  8:38         ` Jan Beulich
  2014-03-17 12:42           ` George Dunlap
  2014-03-17 12:44           ` George Dunlap
  0 siblings, 2 replies; 67+ messages in thread
From: Jan Beulich @ 2014-03-17  8:38 UTC (permalink / raw)
  To: Sarah Newman; +Cc: boris.ostrovsky, xen-devel, zhu.yanhai, david.vrabel

>>> On 17.03.14 at 04:30, Sarah Newman <srn@prgmr.com> wrote:

Not being convinced at all that this is the right approach (in
particular it remains unclear how an affected guest should deal with
running on a hypervisor not supporting the new interface), a couple
of mechanical comments nevertheless:

> --- a/xen/arch/x86/domctl.c
> +++ b/xen/arch/x86/domctl.c
> @@ -1256,6 +1256,12 @@ long arch_do_domctl(
>      }
>      break;
>  
> +    case XEN_DOMCTL_dev_na_ts_allowed:
> +    {
> +        d->arch.pv_domain.dev_na_ts_allowed = domctl->u.dev_na_ts_allowed.allowed;
> +    }
> +    break;

Pointless braces. Also you're converting a 32-bit flags value to
bool_t here, which you ought to do more carefully: Either any non-
zero value of the input means "boolean true", or bit 0 of it is the
intended designator. The decision here may also make necessary
an adjustment to the interface definition further down.

> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -713,8 +713,12 @@ int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
>              *ebx = 0x40000200;
>          *ecx = 0;          /* Features 1 */
>          *edx = 0;          /* Features 2 */
> -        if ( is_pv_vcpu(current) )
> +        if ( is_pv_vcpu(current) ) {
>              *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
> +
> +            if (current->domain->arch.pv_domain.dev_na_ts_allowed)
> +                *ecx |= XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED;
> +        }

Coding style.

> @@ -3269,8 +3273,13 @@ void do_device_not_available(struct cpu_user_regs *regs)
>  
>      if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS )
>      {
> -        do_guest_trap(TRAP_no_device, regs, 0);
> -        curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
> +        if (!current->domain->arch.pv_domain.dev_na_ts_allowed) {
> +            do_guest_trap(TRAP_no_device, regs, 0);
> +            curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
> +        } else {
> +            stts();
> +            do_guest_trap(TRAP_no_device, regs, 0);
> +        }

No need to call do_guest_trap() separately in both branches -
leave the call where it was, and only make the other operation
conditional.

Also coding style again.

> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -232,6 +232,8 @@ struct pv_domain
>       * unmask the event channel */
>      bool_t auto_unmask;
>  
> +    bool_t dev_na_ts_allowed;

The naming of this (here and elsewhere) is rather odd: I assume
you mean "dev_na" to be an abbreviation of "device not available",
but I'd much prefer the CPU exception abbreviation (#NM) to be
used in such a case. However, in the case here this still wouldn't
be a precise description of the behavior you establish: TS being
set isn't allowed, but required to be set upon guest #NM. I'd
therefore suggest naming this along the lines of "nm_enforce_ts".

Jan

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17  8:38         ` Jan Beulich
@ 2014-03-17 12:42           ` George Dunlap
  2014-03-17 13:35             ` Jan Beulich
  2014-03-17 12:44           ` George Dunlap
  1 sibling, 1 reply; 67+ messages in thread
From: George Dunlap @ 2014-03-17 12:42 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Boris Ostrovsky, David Vrabel, Yanhai Zhu,
	Sarah Newman

On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 17.03.14 at 04:30, Sarah Newman <srn@prgmr.com> wrote:
>
> Not being convinced at all that this is the right approach (in
> particular it remains unclear how an affected guest should deal with
> running on a hypervisor not supporting the new interface)

It looks like the intention of this patch was that if the dom0
administrator enables the new option, then it will be on by default,
*but* the guest can disable the new behavior.  That way, if an admin
knows that she's running all PVOPS kernels (no "classic Xen" kernels),
she can enable it system-wide.  Older PVOPS kernels will behave
correctly (but a bit slowly), and newer PVOPS kernels will switch to
the PVABI behavior and reap the performance benefit.

Newer PVOPS kernels running on older hypervisors will simply use the
PVABI behavior.  Older PVOPS kernels (without the kernel-side fix
backported) running on older hypervisors (without this patch
backported) will be buggy, but there's nothing we can do about that.
:-)

 -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17 12:42           ` George Dunlap
@ 2014-03-17 13:35             ` Jan Beulich
  2014-03-17 14:05               ` George Dunlap
  0 siblings, 1 reply; 67+ messages in thread
From: Jan Beulich @ 2014-03-17 13:35 UTC (permalink / raw)
  To: George Dunlap
  Cc: Boris Ostrovsky, xen-devel, Yanhai Zhu, David Vrabel,
	Sarah Newman

>>> On 17.03.14 at 13:42, George Dunlap <George.Dunlap@eu.citrix.com> wrote:
> On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>> On 17.03.14 at 04:30, Sarah Newman <srn@prgmr.com> wrote:
>>
>> Not being convinced at all that this is the right approach (in
>> particular it remains unclear how an affected guest should deal with
>> running on a hypervisor not supporting the new interface)
> 
> It looks like the intention of this patch was that if the dom0
> administrator enables the new option, then it will be on by default,
> *but* the guest can disable the new behavior.  That way, if an admin
> knows that she's running all PVOPS kernels (no "classic Xen" kernels),
> she can enable it system-wide.  Older PVOPS kernels will behave
> correctly (but a bit slowly), and newer PVOPS kernels will switch to
> the PVABI behavior and reap the performance benefit.
> 
> Newer PVOPS kernels running on older hypervisors will simply use the
> PVABI behavior.

But if that works correctly, then there's no hypervisor/tools
change needed in the first place.

Jan

>  Older PVOPS kernels (without the kernel-side fix
> backported) running on older hypervisors (without this patch
> backported) will be buggy, but there's nothing we can do about that.
> :-)
> 
>  -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17 13:35             ` Jan Beulich
@ 2014-03-17 14:05               ` George Dunlap
  2014-03-17 14:18                 ` George Dunlap
  0 siblings, 1 reply; 67+ messages in thread
From: George Dunlap @ 2014-03-17 14:05 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Boris Ostrovsky, xen-devel, Yanhai Zhu, David Vrabel,
	Sarah Newman

On 03/17/2014 01:35 PM, Jan Beulich wrote:
>>>> On 17.03.14 at 13:42, George Dunlap <George.Dunlap@eu.citrix.com> wrote:
>> On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>> On 17.03.14 at 04:30, Sarah Newman <srn@prgmr.com> wrote:
>>> Not being convinced at all that this is the right approach (in
>>> particular it remains unclear how an affected guest should deal with
>>> running on a hypervisor not supporting the new interface)
>> It looks like the intention of this patch was that if the dom0
>> administrator enables the new option, then it will be on by default,
>> *but* the guest can disable the new behavior.  That way, if an admin
>> knows that she's running all PVOPS kernels (no "classic Xen" kernels),
>> she can enable it system-wide.  Older PVOPS kernels will behave
>> correctly (but a bit slowly), and newer PVOPS kernels will switch to
>> the PVABI behavior and reap the performance benefit.
>>
>> Newer PVOPS kernels running on older hypervisors will simply use the
>> PVABI behavior.
> But if that works correctly, then there's no hypervisor/tools
> change needed in the first place.

Yes, there's still a need to run *old* PVOPS kernels on *new* 
hypervisors.  That (as I understand it) is the point of this patch.

  -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17 14:05               ` George Dunlap
@ 2014-03-17 14:18                 ` George Dunlap
  2014-03-17 15:28                   ` Jan Beulich
  2014-03-18 18:07                   ` Sarah Newman
  0 siblings, 2 replies; 67+ messages in thread
From: George Dunlap @ 2014-03-17 14:18 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Boris Ostrovsky, xen-devel, Yanhai Zhu, David Vrabel,
	Sarah Newman

On 03/17/2014 02:05 PM, George Dunlap wrote:
> On 03/17/2014 01:35 PM, Jan Beulich wrote:
>>>>> On 17.03.14 at 13:42, George Dunlap <George.Dunlap@eu.citrix.com> 
>>>>> wrote:
>>> On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>> On 17.03.14 at 04:30, Sarah Newman <srn@prgmr.com> wrote:
>>>> Not being convinced at all that this is the right approach (in
>>>> particular it remains unclear how an affected guest should deal with
>>>> running on a hypervisor not supporting the new interface)
>>> It looks like the intention of this patch was that if the dom0
>>> administrator enables the new option, then it will be on by default,
>>> *but* the guest can disable the new behavior.  That way, if an admin
>>> knows that she's running all PVOPS kernels (no "classic Xen" kernels),
>>> she can enable it system-wide.  Older PVOPS kernels will behave
>>> correctly (but a bit slowly), and newer PVOPS kernels will switch to
>>> the PVABI behavior and reap the performance benefit.
>>>
>>> Newer PVOPS kernels running on older hypervisors will simply use the
>>> PVABI behavior.
>> But if that works correctly, then there's no hypervisor/tools
>> change needed in the first place.
>
> Yes, there's still a need to run *old* PVOPS kernels on *new* 
> hypervisors.  That (as I understand it) is the point of this patch.

So we have old hypervisors, new hypervisors with this disabled, and new 
hypervisors with this enabled.  New hypervisors with this disabled 
behave just like old hypervisors.  And we have old pvops kernels, new 
pvops kernels, and "classic Xen" kernels.  And we have "correctness" and 
"performance".  Then we have the following combinations:

* Old hypervisor / New hypervisor w/ mode disabled:
  - Old hypervisor, classic kernel: correct and fast.
  - Old hypervisor, old pvops kernel: fast but buggy.
  - Old hypervisor, new pvops kernel: correct and fast.
* New hypervisor (w/ mode enabled):
  - classic kernel: broken (since it's expecting PVABI TS behavior)
  - old pvops: correct but slow
  - new pvops kernel: correct and fast (since it will opt-in to the 
faster PVABI)

Is there a way for Xen to tell if it's running a "classic Xen" port 
(which expects PVABI TS behavior), or a pvops kernel (which will either 
expect "hardware" TS behavior, or will know how to opt-in to PVABI TS 
behavior)?  That would relieve the admin of trying to figure out for 
each guest whether he had a classic or a pvops kernel.

  -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17 14:18                 ` George Dunlap
@ 2014-03-17 15:28                   ` Jan Beulich
  2014-03-18 18:07                   ` Sarah Newman
  1 sibling, 0 replies; 67+ messages in thread
From: Jan Beulich @ 2014-03-17 15:28 UTC (permalink / raw)
  To: George Dunlap
  Cc: Boris Ostrovsky, xen-devel, Yanhai Zhu, David Vrabel,
	Sarah Newman

>>> On 17.03.14 at 15:18, George Dunlap <george.dunlap@eu.citrix.com> wrote:
> On 03/17/2014 02:05 PM, George Dunlap wrote:
>> On 03/17/2014 01:35 PM, Jan Beulich wrote:
>>>>>> On 17.03.14 at 13:42, George Dunlap <George.Dunlap@eu.citrix.com> 
>>>>>> wrote:
>>>> On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>>> On 17.03.14 at 04:30, Sarah Newman <srn@prgmr.com> wrote:
>>>>> Not being convinced at all that this is the right approach (in
>>>>> particular it remains unclear how an affected guest should deal with
>>>>> running on a hypervisor not supporting the new interface)
>>>> It looks like the intention of this patch was that if the dom0
>>>> administrator enables the new option, then it will be on by default,
>>>> *but* the guest can disable the new behavior.  That way, if an admin
>>>> knows that she's running all PVOPS kernels (no "classic Xen" kernels),
>>>> she can enable it system-wide.  Older PVOPS kernels will behave
>>>> correctly (but a bit slowly), and newer PVOPS kernels will switch to
>>>> the PVABI behavior and reap the performance benefit.
>>>>
>>>> Newer PVOPS kernels running on older hypervisors will simply use the
>>>> PVABI behavior.
>>> But if that works correctly, then there's no hypervisor/tools
>>> change needed in the first place.
>>
>> Yes, there's still a need to run *old* PVOPS kernels on *new* 
>> hypervisors.  That (as I understand it) is the point of this patch.
> 
> So we have old hypervisors, new hypervisors with this disabled, and new 
> hypervisors with this enabled.  New hypervisors with this disabled 
> behave just like old hypervisors.  And we have old pvops kernels, new 
> pvops kernels, and "classic Xen" kernels.  And we have "correctness" and 
> "performance".  Then we have the following combinations:
> 
> * Old hypervisor / New hypervisor w/ mode disabled:
>   - Old hypervisor, classic kernel: correct and fast.
>   - Old hypervisor, old pvops kernel: fast but buggy.
>   - Old hypervisor, new pvops kernel: correct and fast.

If it's really that way, then again - what's the point of making a
hypervisor change when the kernel alone can deal with it in its
entirety?

> * New hypervisor (w/ mode enabled):
>   - classic kernel: broken (since it's expecting PVABI TS behavior)
>   - old pvops: correct but slow
>   - new pvops kernel: correct and fast (since it will opt-in to the 
> faster PVABI)
> 
> Is there a way for Xen to tell if it's running a "classic Xen" port 
> (which expects PVABI TS behavior), or a pvops kernel (which will either 
> expect "hardware" TS behavior, or will know how to opt-in to PVABI TS 
> behavior)?  That would relieve the admin of trying to figure out for 
> each guest whether he had a classic or a pvops kernel.

I think Xen should conceptually not try to guess things like this,
even more so that you're leaving out of the discussion other
(perhaps non-Linux) PV kernels.

Jan

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17 14:18                 ` George Dunlap
  2014-03-17 15:28                   ` Jan Beulich
@ 2014-03-18 18:07                   ` Sarah Newman
  2014-03-18 19:14                     ` David Vrabel
  1 sibling, 1 reply; 67+ messages in thread
From: Sarah Newman @ 2014-03-18 18:07 UTC (permalink / raw)
  To: George Dunlap, Jan Beulich
  Cc: xen-devel, Boris Ostrovsky, Yanhai Zhu, David Vrabel

On 03/17/2014 07:18 AM, George Dunlap wrote:
> On 03/17/2014 02:05 PM, George Dunlap wrote:
>> On 03/17/2014 01:35 PM, Jan Beulich wrote:
>>>>>> On 17.03.14 at 13:42, George Dunlap <George.Dunlap@eu.citrix.com> wrote:
>>>> On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>>> On 17.03.14 at 04:30, Sarah Newman <srn@prgmr.com> wrote:
>>>>> Not being convinced at all that this is the right approach (in
>>>>> particular it remains unclear how an affected guest should deal with
>>>>> running on a hypervisor not supporting the new interface)
>>>> It looks like the intention of this patch was that if the dom0
>>>> administrator enables the new option, then it will be on by default,
>>>> *but* the guest can disable the new behavior.  That way, if an admin
>>>> knows that she's running all PVOPS kernels (no "classic Xen" kernels),
>>>> she can enable it system-wide.  Older PVOPS kernels will behave
>>>> correctly (but a bit slowly), and newer PVOPS kernels will switch to
>>>> the PVABI behavior and reap the performance benefit.

The guest cannot enable or disable this behavior but it can detect the behavior using CPUID.  I
considered making the behavior configurable from within the guest, but did not find a clean way of
implementing it since any decision should really happen very early in the boot process.  Suggestions
on how to do this are welcome.

>>>>
>>>> Newer PVOPS kernels running on older hypervisors will simply use the
>>>> PVABI behavior.
>>> But if that works correctly, then there's no hypervisor/tools
>>> change needed in the first place.
>>
>> Yes, there's still a need to run *old* PVOPS kernels on *new* hypervisors.  That (as I understand
>> it) is the point of this patch.

My assumption is that the accepted linux fix will be more slow or use memory less effectively in
order to integrate well with other x86 implementations. So I would like new kernels to be able to
detect that they can use clts/stts safely and only use the workaround if they can't.  This is why I
added a CPUID field that advertises nm_hardware_ts.

Given almost all of our customers run linux, my long term plan is turn this option on for everyone
by default and let individual users turn it off if they are running a classic kernel or a different
OS which is PVABI compliant.

> 
> So we have old hypervisors, new hypervisors with this disabled, and new hypervisors with this
> enabled.  New hypervisors with this disabled behave just like old hypervisors.  And we have old
> pvops kernels, new pvops kernels, and "classic Xen" kernels.  And we have "correctness" and
> "performance".  Then we have the following combinations:
> 
> * Old hypervisor / New hypervisor w/ mode disabled:
>  - Old hypervisor, classic kernel: correct and fast.
Yes

>  - Old hypervisor, old pvops kernel: fast but buggy.
Yes

>  - Old hypervisor, new pvops kernel: correct and fast.
Likely not fast if eagerfpu is the solution instead of eager allocation or atomic allocation.

> * New hypervisor (w/ mode enabled):
>  - classic kernel: broken (since it's expecting PVABI TS behavior)
Broken, yes

>  - old pvops: correct but slow
Correct and as fast as it was, because its behavior will not change with regards to clts/stts.

>  - new pvops kernel: correct and fast (since it will opt-in to the faster PVABI)
Correct and as fast as it was.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-18 18:07                   ` Sarah Newman
@ 2014-03-18 19:14                     ` David Vrabel
  0 siblings, 0 replies; 67+ messages in thread
From: David Vrabel @ 2014-03-18 19:14 UTC (permalink / raw)
  To: Sarah Newman
  Cc: George Dunlap, David Vrabel, Jan Beulich, xen-devel,
	Boris Ostrovsky, Yanhai Zhu

On 18/03/14 18:07, Sarah Newman wrote:
> 
> The guest cannot enable or disable this behavior but it can detect the behavior using CPUID.  I
> considered making the behavior configurable from within the guest, but did not find a clean way of
> implementing it since any decision should really happen very early in the boot process.  Suggestions
> on how to do this are welcome.

Consider using an Xen ELF note feature (see public/elfnote.h and
public/features.h).

e.g., XENFEAT_nm_hardware_ts

This would be a trivial one-liner to the kernel and would require no
user configurable option in the toolstack and compatibility would be
retained for guests needing the current ABI.

David

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17  8:38         ` Jan Beulich
  2014-03-17 12:42           ` George Dunlap
@ 2014-03-17 12:44           ` George Dunlap
  2014-03-17 13:35             ` Jan Beulich
  1 sibling, 1 reply; 67+ messages in thread
From: George Dunlap @ 2014-03-17 12:44 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Boris Ostrovsky, David Vrabel, Yanhai Zhu,
	Sarah Newman

On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
> The naming of this (here and elsewhere) is rather odd: I assume
> you mean "dev_na" to be an abbreviation of "device not available",
> but I'd much prefer the CPU exception abbreviation (#NM) to be
> used in such a case. However, in the case here this still wouldn't
> be a precise description of the behavior you establish: TS being
> set isn't allowed, but required to be set upon guest #NM. I'd
> therefore suggest naming this along the lines of "nm_enforce_ts".

nm_hardware_ts, perhaps, since the TS is mimicking the native hardware
interface?

 -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17 12:44           ` George Dunlap
@ 2014-03-17 13:35             ` Jan Beulich
  2014-03-18 17:48               ` Sarah Newman
  0 siblings, 1 reply; 67+ messages in thread
From: Jan Beulich @ 2014-03-17 13:35 UTC (permalink / raw)
  To: George Dunlap
  Cc: Boris Ostrovsky, xen-devel, Yanhai Zhu, David Vrabel,
	Sarah Newman

>>> On 17.03.14 at 13:44, George Dunlap <George.Dunlap@eu.citrix.com> wrote:
> On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>> The naming of this (here and elsewhere) is rather odd: I assume
>> you mean "dev_na" to be an abbreviation of "device not available",
>> but I'd much prefer the CPU exception abbreviation (#NM) to be
>> used in such a case. However, in the case here this still wouldn't
>> be a precise description of the behavior you establish: TS being
>> set isn't allowed, but required to be set upon guest #NM. I'd
>> therefore suggest naming this along the lines of "nm_enforce_ts".
> 
> nm_hardware_ts, perhaps, since the TS is mimicking the native hardware
> interface?

Would be fine with me too.

Jan

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed
  2014-03-17 13:35             ` Jan Beulich
@ 2014-03-18 17:48               ` Sarah Newman
  0 siblings, 0 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-18 17:48 UTC (permalink / raw)
  To: Jan Beulich, George Dunlap
  Cc: xen-devel, Boris Ostrovsky, Yanhai Zhu, David Vrabel

On 03/17/2014 06:35 AM, Jan Beulich wrote:
>>>> On 17.03.14 at 13:44, George Dunlap <George.Dunlap@eu.citrix.com> wrote:
>> On Mon, Mar 17, 2014 at 8:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>> The naming of this (here and elsewhere) is rather odd: I assume
>>> you mean "dev_na" to be an abbreviation of "device not available",
>>> but I'd much prefer the CPU exception abbreviation (#NM) to be
>>> used in such a case. However, in the case here this still wouldn't
>>> be a precise description of the behavior you establish: TS being
>>> set isn't allowed, but required to be set upon guest #NM. I'd
>>> therefore suggest naming this along the lines of "nm_enforce_ts".
>>
>> nm_hardware_ts, perhaps, since the TS is mimicking the native hardware
>> interface?
> 
> Would be fine with me too.

OK, I'll make a name change from dev_na_ts_allowed to nm_hardware_ts.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* [PATCH] x86, fpu, xen: Allocate fpu state for xen pv based on PVABI behavior
  2014-03-17  3:13       ` Sarah Newman
  (?)
  (?)
@ 2014-03-17  3:32       ` Sarah Newman
  -1 siblings, 0 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  3:32 UTC (permalink / raw)
  To: konrad.wilk, boris.ostrovsky, david.vrabel, xen-devel, zhu.yanhai,
	hpa, tglx, mingo, linux-kernel
  Cc: Sarah Newman

The xen PVABI dictates that CR0 TS will be automatically cleared for
the device not available trap.  This means it is not safe to task
switch with the default PVABI behavior.

One method of working around this is to disallow scheduling when
allocating memory for the fpu state, but in extremely low memory
circumstances this may fail. Therefore only require this behavior
when xen pv mode is active and the xen PVABI does not allow task
switching.

One other solution, enabling eagerfpu, was explored but eventually
discarded due to notable performance impact.

Reported-by: Zhu Yanhai <zhu.yanhai@gmail.com>
Signed-off-by: Sarah Newman <srn@prgmr.com>
---
 arch/x86/include/asm/fpu-internal.h |    2 +-
 arch/x86/include/asm/processor.h    |    5 +++++
 arch/x86/kernel/i387.c              |   13 +++++++++++++
 arch/x86/kernel/traps.c             |    2 --
 arch/x86/xen/enlighten.c            |    1 +
 arch/x86/xen/setup.c                |   27 +++++++++++++++++++++++++++
 6 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index cea1c76..9ec236c 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -571,7 +571,7 @@ static inline int fpu_alloc(struct fpu *fpu)
 {
 	if (fpu_allocated(fpu))
 		return 0;
-	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	fpu_ops.fpu_state_alloc(fpu);
 	if (!fpu->state)
 		return -ENOMEM;
 	WARN_ON((unsigned long)fpu->state & 15);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fdedd38..941b55d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -413,6 +413,11 @@ struct fpu {
 	union thread_xstate *state;
 };
 
+struct fpu_ops {
+	void (*fpu_state_alloc)(struct fpu *fpu);
+};
+extern struct fpu_ops fpu_ops;
+
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5dd808..24ce161 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -157,6 +157,19 @@ static void init_thread_xstate(void)
 		xstate_size = sizeof(struct i387_fsave_struct);
 }
 
+static void native_fpu_state_alloc(struct fpu *fpu)
+{
+	unsigned long flags;
+	local_save_flags(flags);
+	local_irq_enable();
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	local_irq_restore(flags);
+}
+
+struct fpu_ops fpu_ops = {
+		.fpu_state_alloc = native_fpu_state_alloc,
+};
+
 /*
  * Called at bootup to set up the initial FPU state that is later cloned
  * into all processes.
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 57409f6..97479d6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -624,7 +624,6 @@ void math_state_restore(void)
 	struct task_struct *tsk = current;
 
 	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
 		/*
 		 * does a slab alloc which can sleep
 		 */
@@ -635,7 +634,6 @@ void math_state_restore(void)
 			do_group_exit(SIGKILL);
 			return;
 		}
-		local_irq_disable();
 	}
 
 	__thread_fpu_begin(tsk);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 201d09a..fb3aa30 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -69,6 +69,7 @@
 #include <asm/mwait.h>
 #include <asm/pci_x86.h>
 #include <asm/pat.h>
+#include <asm/processor.h>
 
 #ifdef CONFIG_ACPI
 #include <linux/acpi.h>
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 0982233..4e65b52 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -11,6 +11,7 @@
 #include <linux/memblock.h>
 #include <linux/cpuidle.h>
 #include <linux/cpufreq.h>
+#include <linux/slab.h>
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
@@ -18,6 +19,7 @@
 #include <asm/setup.h>
 #include <asm/acpi.h>
 #include <asm/numa.h>
+#include <asm/processor.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
@@ -598,6 +600,28 @@ void __init xen_pvmmu_arch_setup(void)
 	xen_enable_nmi();
 }
 
+static void xen_fpu_state_alloc(struct fpu *fpu)
+{
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_NOWAIT);
+}
+
+static const struct fpu_ops xen_fpu_ops __initconst = {
+		.fpu_state_alloc = xen_fpu_state_alloc,
+};
+
+#define _XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED 1
+#define XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED \
+	(1u<<_XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED)
+static bool __init xen_check_dev_na_ts_allowed(void)
+{
+	uint32_t pages, msr, feat1, feat2, base;
+
+	base = xen_cpuid_base();
+	cpuid(base + 2, &pages, &msr, &feat1, &feat2);
+
+	return !!(feat1 & XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED);
+}
+
 /* This function is not called for HVM domains */
 void __init xen_arch_setup(void)
 {
@@ -605,6 +629,9 @@ void __init xen_arch_setup(void)
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 		xen_pvmmu_arch_setup();
 
+	if (!xen_check_dev_na_ts_allowed())
+		fpu_ops = xen_fpu_ops;
+
 #ifdef CONFIG_ACPI
 	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 67+ messages in thread

* [PATCH] x86, fpu, xen: Allocate fpu state for xen pv based on PVABI behavior
  2014-03-17  3:13       ` Sarah Newman
                         ` (2 preceding siblings ...)
  (?)
@ 2014-03-17  3:32       ` Sarah Newman
  -1 siblings, 0 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  3:32 UTC (permalink / raw)
  To: konrad.wilk, boris.ostrovsky, david.vrabel, xen-devel, zhu.yanhai,
	hpa, tglx, mingo, linux-kernel
  Cc: Sarah Newman

The xen PVABI dictates that CR0 TS will be automatically cleared for
the device not available trap.  This means it is not safe to task
switch with the default PVABI behavior.

One method of working around this is to disallow scheduling when
allocating memory for the fpu state, but in extremely low memory
circumstances this may fail. Therefore only require this behavior
when xen pv mode is active and the xen PVABI does not allow task
switching.

One other solution, enabling eagerfpu, was explored but eventually
discarded due to notable performance impact.

Reported-by: Zhu Yanhai <zhu.yanhai@gmail.com>
Signed-off-by: Sarah Newman <srn@prgmr.com>
---
 arch/x86/include/asm/fpu-internal.h |    2 +-
 arch/x86/include/asm/processor.h    |    5 +++++
 arch/x86/kernel/i387.c              |   13 +++++++++++++
 arch/x86/kernel/traps.c             |    2 --
 arch/x86/xen/enlighten.c            |    1 +
 arch/x86/xen/setup.c                |   27 +++++++++++++++++++++++++++
 6 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index cea1c76..9ec236c 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -571,7 +571,7 @@ static inline int fpu_alloc(struct fpu *fpu)
 {
 	if (fpu_allocated(fpu))
 		return 0;
-	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	fpu_ops.fpu_state_alloc(fpu);
 	if (!fpu->state)
 		return -ENOMEM;
 	WARN_ON((unsigned long)fpu->state & 15);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fdedd38..941b55d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -413,6 +413,11 @@ struct fpu {
 	union thread_xstate *state;
 };
 
+struct fpu_ops {
+	void (*fpu_state_alloc)(struct fpu *fpu);
+};
+extern struct fpu_ops fpu_ops;
+
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5dd808..24ce161 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -157,6 +157,19 @@ static void init_thread_xstate(void)
 		xstate_size = sizeof(struct i387_fsave_struct);
 }
 
+static void native_fpu_state_alloc(struct fpu *fpu)
+{
+	unsigned long flags;
+	local_save_flags(flags);
+	local_irq_enable();
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	local_irq_restore(flags);
+}
+
+struct fpu_ops fpu_ops = {
+		.fpu_state_alloc = native_fpu_state_alloc,
+};
+
 /*
  * Called at bootup to set up the initial FPU state that is later cloned
  * into all processes.
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 57409f6..97479d6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -624,7 +624,6 @@ void math_state_restore(void)
 	struct task_struct *tsk = current;
 
 	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
 		/*
 		 * does a slab alloc which can sleep
 		 */
@@ -635,7 +634,6 @@ void math_state_restore(void)
 			do_group_exit(SIGKILL);
 			return;
 		}
-		local_irq_disable();
 	}
 
 	__thread_fpu_begin(tsk);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 201d09a..fb3aa30 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -69,6 +69,7 @@
 #include <asm/mwait.h>
 #include <asm/pci_x86.h>
 #include <asm/pat.h>
+#include <asm/processor.h>
 
 #ifdef CONFIG_ACPI
 #include <linux/acpi.h>
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 0982233..4e65b52 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -11,6 +11,7 @@
 #include <linux/memblock.h>
 #include <linux/cpuidle.h>
 #include <linux/cpufreq.h>
+#include <linux/slab.h>
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
@@ -18,6 +19,7 @@
 #include <asm/setup.h>
 #include <asm/acpi.h>
 #include <asm/numa.h>
+#include <asm/processor.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
@@ -598,6 +600,28 @@ void __init xen_pvmmu_arch_setup(void)
 	xen_enable_nmi();
 }
 
+static void xen_fpu_state_alloc(struct fpu *fpu)
+{
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_NOWAIT);
+}
+
+static const struct fpu_ops xen_fpu_ops __initconst = {
+		.fpu_state_alloc = xen_fpu_state_alloc,
+};
+
+#define _XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED 1
+#define XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED \
+	(1u<<_XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED)
+static bool __init xen_check_dev_na_ts_allowed(void)
+{
+	uint32_t pages, msr, feat1, feat2, base;
+
+	base = xen_cpuid_base();
+	cpuid(base + 2, &pages, &msr, &feat1, &feat2);
+
+	return !!(feat1 & XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED);
+}
+
 /* This function is not called for HVM domains */
 void __init xen_arch_setup(void)
 {
@@ -605,6 +629,9 @@ void __init xen_arch_setup(void)
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 		xen_pvmmu_arch_setup();
 
+	if (!xen_check_dev_na_ts_allowed())
+		fpu_ops = xen_fpu_ops;
+
 #ifdef CONFIG_ACPI
 	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:13       ` Sarah Newman
                         ` (3 preceding siblings ...)
  (?)
@ 2014-03-17  3:33       ` H. Peter Anvin
  -1 siblings, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17  3:33 UTC (permalink / raw)
  To: Sarah Newman, David Vrabel
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel, xen-devel

No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a workaround for the legacy hypervisor versions.

GFP_ATOMIC -> SIGKILL is definitely a NAK.

On March 16, 2014 8:13:05 PM PDT, Sarah Newman <srn@prgmr.com> wrote:
>On 03/10/2014 10:15 AM, David Vrabel wrote:
>> On 10/03/14 16:40, H. Peter Anvin wrote:
>>> On 03/10/2014 09:17 AM, David Vrabel wrote:
>>>> math_state_restore() is called from the #NM exception handler.  It
>may
>>>> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
>>>>
>>>> Change this allocation to GFP_ATOMIC, but leave all the other
>callers
>>>> of init_fpu() or fpu_alloc() using GFP_KERNEL.
>>>
>>> And what the [Finnish] do you do if GFP_ATOMIC fails?
>> 
>> The same thing it used to do -- kill the task with SIGKILL.  I
>haven't
>> changed this behaviour.
>> 
>>> Sarah's patchset switches Xen PV to use eagerfpu unconditionally,
>which
>>> removes the dependency on #NM and is the right thing to do.
>> 
>> Ok. I'll wait for this series and not pursue this patch any further.
>
>Sorry, this got swallowed by my mail filter.
>
>I did some more testing and I think eagerfpu is going to noticeably
>slow things down. When I ran
>"time sysbench --num-threads=64 --test=threads run" I saw on the order
>of 15% more time spent in
>system mode and this seemed consistent over different runs.
>
>As for GFP_ATOMIC, unfortunately I don't know a sanctioned test here so
>I rolled my own. This test
>sequentially allocated math-using processes in the background until it
>could not any more.  On a
>64MB instance, I saw 10% fewer processes allocated with GFP_ATOMIC
>compared to GFP_KERNEL when I
>continually allocated new processes up to OOM conditions (256 vs 228.) 
>A similar test on a
>different RFS and a kernel using GFP_NOWAIT showed pretty much no
>difference in how many processes I
>could allocate. This doesn't seem too bad unless there is some kind of
>fragmentation over time which
>would cause worse performance.
>
>Since performance degradation applies at all times and not just under
>extreme conditions, I think
>the lesser evil will actually be GFP_ATOMIC.  But it's not necessary to
>always use GFP_ATOMIC, only
>under certain conditions - IE when the xen PVABI forces us to.
>
>Patches will be supplied shortly.

-- 
Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:13       ` Sarah Newman
                         ` (4 preceding siblings ...)
  (?)
@ 2014-03-17  3:33       ` H. Peter Anvin
  2014-03-17  3:35         ` [Xen-devel] " Sarah Newman
                           ` (3 more replies)
  -1 siblings, 4 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17  3:33 UTC (permalink / raw)
  To: Sarah Newman, David Vrabel
  Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, xen-devel

No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a workaround for the legacy hypervisor versions.

GFP_ATOMIC -> SIGKILL is definitely a NAK.

On March 16, 2014 8:13:05 PM PDT, Sarah Newman <srn@prgmr.com> wrote:
>On 03/10/2014 10:15 AM, David Vrabel wrote:
>> On 10/03/14 16:40, H. Peter Anvin wrote:
>>> On 03/10/2014 09:17 AM, David Vrabel wrote:
>>>> math_state_restore() is called from the #NM exception handler.  It
>may
>>>> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
>>>>
>>>> Change this allocation to GFP_ATOMIC, but leave all the other
>callers
>>>> of init_fpu() or fpu_alloc() using GFP_KERNEL.
>>>
>>> And what the [Finnish] do you do if GFP_ATOMIC fails?
>> 
>> The same thing it used to do -- kill the task with SIGKILL.  I
>haven't
>> changed this behaviour.
>> 
>>> Sarah's patchset switches Xen PV to use eagerfpu unconditionally,
>which
>>> removes the dependency on #NM and is the right thing to do.
>> 
>> Ok. I'll wait for this series and not pursue this patch any further.
>
>Sorry, this got swallowed by my mail filter.
>
>I did some more testing and I think eagerfpu is going to noticeably
>slow things down. When I ran
>"time sysbench --num-threads=64 --test=threads run" I saw on the order
>of 15% more time spent in
>system mode and this seemed consistent over different runs.
>
>As for GFP_ATOMIC, unfortunately I don't know a sanctioned test here so
>I rolled my own. This test
>sequentially allocated math-using processes in the background until it
>could not any more.  On a
>64MB instance, I saw 10% fewer processes allocated with GFP_ATOMIC
>compared to GFP_KERNEL when I
>continually allocated new processes up to OOM conditions (256 vs 228.) 
>A similar test on a
>different RFS and a kernel using GFP_NOWAIT showed pretty much no
>difference in how many processes I
>could allocate. This doesn't seem too bad unless there is some kind of
>fragmentation over time which
>would cause worse performance.
>
>Since performance degradation applies at all times and not just under
>extreme conditions, I think
>the lesser evil will actually be GFP_ATOMIC.  But it's not necessary to
>always use GFP_ATOMIC, only
>under certain conditions - IE when the xen PVABI forces us to.
>
>Patches will be supplied shortly.

-- 
Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:33       ` H. Peter Anvin
@ 2014-03-17  3:35         ` Sarah Newman
  2014-03-17  3:43           ` H. Peter Anvin
  2014-03-17  3:43           ` H. Peter Anvin
  2014-03-17  3:35         ` Sarah Newman
                           ` (2 subsequent siblings)
  3 siblings, 2 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  3:35 UTC (permalink / raw)
  To: H. Peter Anvin, David Vrabel
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel, xen-devel

Can you please review my patch first?  It's only enabled when absolutely required.

On 03/16/2014 08:33 PM, H. Peter Anvin wrote:
> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a workaround for the legacy hypervisor versions.
> 
> GFP_ATOMIC -> SIGKILL is definitely a NAK.
> 
> On March 16, 2014 8:13:05 PM PDT, Sarah Newman <srn@prgmr.com> wrote:
>> On 03/10/2014 10:15 AM, David Vrabel wrote:
>>> On 10/03/14 16:40, H. Peter Anvin wrote:
>>>> On 03/10/2014 09:17 AM, David Vrabel wrote:
>>>>> math_state_restore() is called from the #NM exception handler.  It
>> may
>>>>> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
>>>>>
>>>>> Change this allocation to GFP_ATOMIC, but leave all the other
>> callers
>>>>> of init_fpu() or fpu_alloc() using GFP_KERNEL.
>>>>
>>>> And what the [Finnish] do you do if GFP_ATOMIC fails?
>>>
>>> The same thing it used to do -- kill the task with SIGKILL.  I
>> haven't
>>> changed this behaviour.
>>>
>>>> Sarah's patchset switches Xen PV to use eagerfpu unconditionally,
>> which
>>>> removes the dependency on #NM and is the right thing to do.
>>>
>>> Ok. I'll wait for this series and not pursue this patch any further.
>>
>> Sorry, this got swallowed by my mail filter.
>>
>> I did some more testing and I think eagerfpu is going to noticeably
>> slow things down. When I ran
>> "time sysbench --num-threads=64 --test=threads run" I saw on the order
>> of 15% more time spent in
>> system mode and this seemed consistent over different runs.
>>
>> As for GFP_ATOMIC, unfortunately I don't know a sanctioned test here so
>> I rolled my own. This test
>> sequentially allocated math-using processes in the background until it
>> could not any more.  On a
>> 64MB instance, I saw 10% fewer processes allocated with GFP_ATOMIC
>> compared to GFP_KERNEL when I
>> continually allocated new processes up to OOM conditions (256 vs 228.) 
>> A similar test on a
>> different RFS and a kernel using GFP_NOWAIT showed pretty much no
>> difference in how many processes I
>> could allocate. This doesn't seem too bad unless there is some kind of
>> fragmentation over time which
>> would cause worse performance.
>>
>> Since performance degradation applies at all times and not just under
>> extreme conditions, I think
>> the lesser evil will actually be GFP_ATOMIC.  But it's not necessary to
>> always use GFP_ATOMIC, only
>> under certain conditions - IE when the xen PVABI forces us to.
>>
>> Patches will be supplied shortly.
> 


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:35         ` [Xen-devel] " Sarah Newman
@ 2014-03-17  3:43           ` H. Peter Anvin
  2014-03-17  4:12             ` Sarah Newman
                               ` (3 more replies)
  2014-03-17  3:43           ` H. Peter Anvin
  1 sibling, 4 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17  3:43 UTC (permalink / raw)
  To: Sarah Newman, David Vrabel
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel, xen-devel,
	Suresh Siddha

On 03/16/2014 08:35 PM, Sarah Newman wrote:
> Can you please review my patch first?  It's only enabled when absolutely required.

It doesn't help.  It means you're running on Xen, and you will have
processes subjected to random SIGKILL because they happen to touch the
FPU when the atomic pool is low.

However, there is probably a happy medium: you don't actually need eager
FPU restore, you just need eager FPU *allocation*.  We have been
intending to allocate the FPU state at task creation time for eagerfpu,
and Suresh Siddha has already produced such a patch; it just needs some
minor fixups due to an __init failure.

http://lkml.kernel.org/r/1391325599.6481.5.camel@europa

In the Xen case we could turn on eager allocation but not eager fpu.  In
fact, it might be justified to *always* do eager allocation...

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:43           ` H. Peter Anvin
@ 2014-03-17  4:12             ` Sarah Newman
  2014-03-17  4:12             ` [Xen-devel] " Sarah Newman
                               ` (2 subsequent siblings)
  3 siblings, 0 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  4:12 UTC (permalink / raw)
  To: H. Peter Anvin, David Vrabel
  Cc: Thomas Gleixner, Ingo Molnar, xen-devel, linux-kernel,
	Suresh Siddha

On 03/16/2014 08:43 PM, H. Peter Anvin wrote:
> On 03/16/2014 08:35 PM, Sarah Newman wrote:
>> Can you please review my patch first?  It's only enabled when absolutely required.
> 
> It doesn't help.  It means you're running on Xen, and you will have
> processes subjected to random SIGKILL because they happen to touch the
> FPU when the atomic pool is low.
> 
> However, there is probably a happy medium: you don't actually need eager
> FPU restore, you just need eager FPU *allocation*.  We have been
> intending to allocate the FPU state at task creation time for eagerfpu,
> and Suresh Siddha has already produced such a patch; it just needs some
> minor fixups due to an __init failure.
> 
> http://lkml.kernel.org/r/1391325599.6481.5.camel@europa
> 
> In the Xen case we could turn on eager allocation but not eager fpu.  In
> fact, it might be justified to *always* do eager allocation...

Unconditional eager allocation works. Can xen users count on this being included and applied to the
stable kernels?

Thanks, Sarah

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:43           ` H. Peter Anvin
  2014-03-17  4:12             ` Sarah Newman
@ 2014-03-17  4:12             ` Sarah Newman
  2014-03-17  4:23               ` H. Peter Anvin
  2014-03-17  4:23               ` H. Peter Anvin
  2014-03-17 13:29             ` [Xen-devel] " David Vrabel
  2014-03-17 13:29             ` David Vrabel
  3 siblings, 2 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  4:12 UTC (permalink / raw)
  To: H. Peter Anvin, David Vrabel
  Cc: Suresh Siddha, Thomas Gleixner, Ingo Molnar, linux-kernel,
	xen-devel

On 03/16/2014 08:43 PM, H. Peter Anvin wrote:
> On 03/16/2014 08:35 PM, Sarah Newman wrote:
>> Can you please review my patch first?  It's only enabled when absolutely required.
> 
> It doesn't help.  It means you're running on Xen, and you will have
> processes subjected to random SIGKILL because they happen to touch the
> FPU when the atomic pool is low.
> 
> However, there is probably a happy medium: you don't actually need eager
> FPU restore, you just need eager FPU *allocation*.  We have been
> intending to allocate the FPU state at task creation time for eagerfpu,
> and Suresh Siddha has already produced such a patch; it just needs some
> minor fixups due to an __init failure.
> 
> http://lkml.kernel.org/r/1391325599.6481.5.camel@europa
> 
> In the Xen case we could turn on eager allocation but not eager fpu.  In
> fact, it might be justified to *always* do eager allocation...

Unconditional eager allocation works. Can xen users count on this being included and applied to the
stable kernels?

Thanks, Sarah

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  4:12             ` [Xen-devel] " Sarah Newman
@ 2014-03-17  4:23               ` H. Peter Anvin
  2014-03-20  0:00                 ` Greg Kroah-Hartman
  2014-03-20  0:00                 ` [Xen-devel] " Greg Kroah-Hartman
  2014-03-17  4:23               ` H. Peter Anvin
  1 sibling, 2 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17  4:23 UTC (permalink / raw)
  To: Sarah Newman, David Vrabel
  Cc: Suresh Siddha, Thomas Gleixner, Ingo Molnar, linux-kernel,
	xen-devel, Greg Kroah-Hartman

On 03/16/2014 09:12 PM, Sarah Newman wrote:
> 
> Unconditional eager allocation works. Can xen users count on this being included and applied to the
> stable kernels?
> 

I don't know.  If we state that it is a bug fix for Xen it might be
possible, but it would be up to Greg (Cc:'d) and the rest of the stable
team.

	-hpa


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  4:23               ` H. Peter Anvin
@ 2014-03-20  0:00                 ` Greg Kroah-Hartman
  2014-03-20  0:00                 ` [Xen-devel] " Greg Kroah-Hartman
  1 sibling, 0 replies; 67+ messages in thread
From: Greg Kroah-Hartman @ 2014-03-20  0:00 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, David Vrabel,
	Suresh Siddha, Thomas Gleixner

On Sun, Mar 16, 2014 at 09:23:01PM -0700, H. Peter Anvin wrote:
> On 03/16/2014 09:12 PM, Sarah Newman wrote:
> > 
> > Unconditional eager allocation works. Can xen users count on this being included and applied to the
> > stable kernels?
> > 
> 
> I don't know.  If we state that it is a bug fix for Xen it might be
> possible, but it would be up to Greg (Cc:'d) and the rest of the stable
> team.

If someone sends the git id of the commit in Linus's tree to the
stable@vger.kernel.org address, we will be glad to review it at that
point in time.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  4:23               ` H. Peter Anvin
  2014-03-20  0:00                 ` Greg Kroah-Hartman
@ 2014-03-20  0:00                 ` Greg Kroah-Hartman
  2014-03-20  2:29                   ` H. Peter Anvin
  2014-03-20  2:29                   ` [Xen-devel] " H. Peter Anvin
  1 sibling, 2 replies; 67+ messages in thread
From: Greg Kroah-Hartman @ 2014-03-20  0:00 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Sarah Newman, David Vrabel, Suresh Siddha, Thomas Gleixner,
	Ingo Molnar, linux-kernel, xen-devel

On Sun, Mar 16, 2014 at 09:23:01PM -0700, H. Peter Anvin wrote:
> On 03/16/2014 09:12 PM, Sarah Newman wrote:
> > 
> > Unconditional eager allocation works. Can xen users count on this being included and applied to the
> > stable kernels?
> > 
> 
> I don't know.  If we state that it is a bug fix for Xen it might be
> possible, but it would be up to Greg (Cc:'d) and the rest of the stable
> team.

If someone sends the git id of the commit in Linus's tree to the
stable@vger.kernel.org address, we will be glad to review it at that
point in time.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-20  0:00                 ` [Xen-devel] " Greg Kroah-Hartman
@ 2014-03-20  2:29                   ` H. Peter Anvin
  2014-03-20  2:29                   ` [Xen-devel] " H. Peter Anvin
  1 sibling, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-20  2:29 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, David Vrabel,
	Suresh Siddha, Thomas Gleixner

Sounds good.

On March 19, 2014 5:00:11 PM PDT, Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote:
>On Sun, Mar 16, 2014 at 09:23:01PM -0700, H. Peter Anvin wrote:
>> On 03/16/2014 09:12 PM, Sarah Newman wrote:
>> > 
>> > Unconditional eager allocation works. Can xen users count on this
>being included and applied to the
>> > stable kernels?
>> > 
>> 
>> I don't know.  If we state that it is a bug fix for Xen it might be
>> possible, but it would be up to Greg (Cc:'d) and the rest of the
>stable
>> team.
>
>If someone sends the git id of the commit in Linus's tree to the
>stable@vger.kernel.org address, we will be glad to review it at that
>point in time.
>
>thanks,
>
>greg k-h

-- 
Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-20  0:00                 ` [Xen-devel] " Greg Kroah-Hartman
  2014-03-20  2:29                   ` H. Peter Anvin
@ 2014-03-20  2:29                   ` H. Peter Anvin
  1 sibling, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-20  2:29 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Sarah Newman, David Vrabel, Suresh Siddha, Thomas Gleixner,
	Ingo Molnar, linux-kernel, xen-devel

Sounds good.

On March 19, 2014 5:00:11 PM PDT, Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote:
>On Sun, Mar 16, 2014 at 09:23:01PM -0700, H. Peter Anvin wrote:
>> On 03/16/2014 09:12 PM, Sarah Newman wrote:
>> > 
>> > Unconditional eager allocation works. Can xen users count on this
>being included and applied to the
>> > stable kernels?
>> > 
>> 
>> I don't know.  If we state that it is a bug fix for Xen it might be
>> possible, but it would be up to Greg (Cc:'d) and the rest of the
>stable
>> team.
>
>If someone sends the git id of the commit in Linus's tree to the
>stable@vger.kernel.org address, we will be glad to review it at that
>point in time.
>
>thanks,
>
>greg k-h

-- 
Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  4:12             ` [Xen-devel] " Sarah Newman
  2014-03-17  4:23               ` H. Peter Anvin
@ 2014-03-17  4:23               ` H. Peter Anvin
  1 sibling, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17  4:23 UTC (permalink / raw)
  To: Sarah Newman, David Vrabel
  Cc: Greg Kroah-Hartman, linux-kernel, xen-devel, Ingo Molnar,
	Suresh Siddha, Thomas Gleixner

On 03/16/2014 09:12 PM, Sarah Newman wrote:
> 
> Unconditional eager allocation works. Can xen users count on this being included and applied to the
> stable kernels?
> 

I don't know.  If we state that it is a bug fix for Xen it might be
possible, but it would be up to Greg (Cc:'d) and the rest of the stable
team.

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:43           ` H. Peter Anvin
  2014-03-17  4:12             ` Sarah Newman
  2014-03-17  4:12             ` [Xen-devel] " Sarah Newman
@ 2014-03-17 13:29             ` David Vrabel
  2014-03-19 13:21               ` Konrad Rzeszutek Wilk
  2014-03-19 13:21               ` Konrad Rzeszutek Wilk
  2014-03-17 13:29             ` David Vrabel
  3 siblings, 2 replies; 67+ messages in thread
From: David Vrabel @ 2014-03-17 13:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Sarah Newman, Thomas Gleixner, Ingo Molnar, linux-kernel,
	xen-devel, Suresh Siddha

On 17/03/14 03:43, H. Peter Anvin wrote:
> On 03/16/2014 08:35 PM, Sarah Newman wrote:
>> Can you please review my patch first?  It's only enabled when absolutely required.
> 
> It doesn't help.  It means you're running on Xen, and you will have
> processes subjected to random SIGKILL because they happen to touch the
> FPU when the atomic pool is low.
> 
> However, there is probably a happy medium: you don't actually need eager
> FPU restore, you just need eager FPU *allocation*.  We have been
> intending to allocate the FPU state at task creation time for eagerfpu,
> and Suresh Siddha has already produced such a patch; it just needs some
> minor fixups due to an __init failure.
> 
> http://lkml.kernel.org/r/1391325599.6481.5.camel@europa
> 
> In the Xen case we could turn on eager allocation but not eager fpu.  In
> fact, it might be justified to *always* do eager allocation...

The following patch does the always eager allocation.  It's a fixup of
Suresh's original patch.

v2:
- Allocate initial fpu state after xsave_init().
- Only allocate initial FPU state on boot cpu.

8<-----------------------
x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage

From: Suresh Siddha <sbsiddha@gmail.com>

For non-eager fpu mode, thread's fpu state is allocated during the first
fpu usage (in the context of device not available exception). This can be
a blocking call and hence we enable interrupts (which were originally
disabled when the exception happened), allocate memory and disable
interrupts etc. While this saves 512 bytes or so per-thread, there
are some issues in general.

a.  Most of the future cases will be anyway using eager
FPU (because of processor features like xsaveopt, LWP, MPX etc) and
they do the allocation at the thread creation itself. Nice to have
one common mechanism as all the state save/restore code is
shared. Avoids the confusion and minimizes the subtle bugs
in the core piece involved with context-switch.

b. If a parent thread uses FPU, during fork() we allocate
the FPU state in the child and copy the state etc. Shortly after this,
during exec() we free it up, so that we can later allocate during
the first usage of FPU. So this free/allocate might be slower
for some workloads.

c. math_state_restore() is called from multiple places
and it is error pone if the caller expects interrupts to be disabled
throughout the execution of math_state_restore(). Can lead to subtle
bugs like Ubuntu bug #1265841.

Memory savings will be small anyways and the code complexity
introducing subtle bugs is not worth it. So remove
the logic of non-eager fpu mem allocation at the first usage.

Signed-off-by: Suresh Siddha <sbsiddha@gmail.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/kernel/i387.c    |   22 +++++++++++++---------
 arch/x86/kernel/process.c |    6 ------
 arch/x86/kernel/traps.c   |   16 ++--------------
 arch/x86/kernel/xsave.c   |    2 --
 4 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e8368c6..05aeae2 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -5,6 +5,7 @@
  *  General FPU state handling cleanups
  *	Gareth Hughes <gareth@valinux.com>, May 2000
  */
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/regset.h>
 #include <linux/sched.h>
@@ -153,8 +154,15 @@ static void init_thread_xstate(void)
  * into all processes.
  */
 
+static void __init fpu_init_boot_cpu(void)
+{
+	current->thread.fpu.state =
+		alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
+}
+
 void fpu_init(void)
 {
+	static __refdata void (*boot_func)(void) = fpu_init_boot_cpu;
 	unsigned long cr0;
 	unsigned long cr4_mask = 0;
 
@@ -189,6 +197,11 @@ void fpu_init(void)
 	mxcsr_feature_mask_init();
 	xsave_init();
 	eager_fpu_init();
+
+	if (boot_func) {
+		boot_func();
+		boot_func = NULL;
+	}
 }
 
 void fpu_finit(struct fpu *fpu)
@@ -219,8 +232,6 @@ EXPORT_SYMBOL_GPL(fpu_finit);
  */
 int init_fpu(struct task_struct *tsk)
 {
-	int ret;
-
 	if (tsk_used_math(tsk)) {
 		if (cpu_has_fpu && tsk == current)
 			unlazy_fpu(tsk);
@@ -228,13 +239,6 @@ int init_fpu(struct task_struct *tsk)
 		return 0;
 	}
 
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	ret = fpu_alloc(&tsk->thread.fpu);
-	if (ret)
-		return ret;
-
 	fpu_finit(&tsk->thread.fpu);
 
 	set_stopped_child_used_math(tsk);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95..cd9c190 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -128,12 +128,6 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	drop_init_fpu(tsk);
-	/*
-	 * Free the FPU state for non xsave platforms. They get reallocated
-	 * lazily at the first use.
-	 */
-	if (!use_eager_fpu())
-		free_thread_xstate(tsk);
 }
 
 static void hard_disable_TSC(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 57409f6..3265429 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -623,20 +623,8 @@ void math_state_restore(void)
 {
 	struct task_struct *tsk = current;
 
-	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(tsk)) {
-			/*
-			 * ran out of memory!
-			 */
-			do_group_exit(SIGKILL);
-			return;
-		}
-		local_irq_disable();
-	}
+	if (!tsk_used_math(tsk))
+		init_fpu(tsk);
 
 	__thread_fpu_begin(tsk);
 
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c..46f6266 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -598,8 +598,6 @@ void xsave_init(void)
 
 static inline void __init eager_fpu_init_bp(void)
 {
-	current->thread.fpu.state =
-	    alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
 	if (!init_xstate_buf)
 		setup_init_fpu_buf();
 }
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 13:29             ` [Xen-devel] " David Vrabel
@ 2014-03-19 13:21               ` Konrad Rzeszutek Wilk
  2014-03-19 15:02                 ` H. Peter Anvin
  2014-03-19 15:02                 ` H. Peter Anvin
  2014-03-19 13:21               ` Konrad Rzeszutek Wilk
  1 sibling, 2 replies; 67+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-03-19 13:21 UTC (permalink / raw)
  To: David Vrabel, hpa
  Cc: H. Peter Anvin, Sarah Newman, linux-kernel, xen-devel,
	Ingo Molnar, Suresh Siddha, Thomas Gleixner

On Mon, Mar 17, 2014 at 01:29:03PM +0000, David Vrabel wrote:
> On 17/03/14 03:43, H. Peter Anvin wrote:
> > On 03/16/2014 08:35 PM, Sarah Newman wrote:
> >> Can you please review my patch first?  It's only enabled when absolutely required.
> > 
> > It doesn't help.  It means you're running on Xen, and you will have
> > processes subjected to random SIGKILL because they happen to touch the
> > FPU when the atomic pool is low.
> > 
> > However, there is probably a happy medium: you don't actually need eager
> > FPU restore, you just need eager FPU *allocation*.  We have been
> > intending to allocate the FPU state at task creation time for eagerfpu,
> > and Suresh Siddha has already produced such a patch; it just needs some
> > minor fixups due to an __init failure.
> > 
> > http://lkml.kernel.org/r/1391325599.6481.5.camel@europa
> > 
> > In the Xen case we could turn on eager allocation but not eager fpu.  In
> > fact, it might be justified to *always* do eager allocation...
> 
> The following patch does the always eager allocation.  It's a fixup of
> Suresh's original patch.
> 

Hey Peter,

I think this is the solution you were looking for?

Or are there some other subtle issues that you think lurk around?

Thanks!
> v2:
> - Allocate initial fpu state after xsave_init().
> - Only allocate initial FPU state on boot cpu.
> 
> 8<-----------------------
> x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage
> 
> From: Suresh Siddha <sbsiddha@gmail.com>
> 
> For non-eager fpu mode, thread's fpu state is allocated during the first
> fpu usage (in the context of device not available exception). This can be
> a blocking call and hence we enable interrupts (which were originally
> disabled when the exception happened), allocate memory and disable
> interrupts etc. While this saves 512 bytes or so per-thread, there
> are some issues in general.
> 
> a.  Most of the future cases will be anyway using eager
> FPU (because of processor features like xsaveopt, LWP, MPX etc) and
> they do the allocation at the thread creation itself. Nice to have
> one common mechanism as all the state save/restore code is
> shared. Avoids the confusion and minimizes the subtle bugs
> in the core piece involved with context-switch.
> 
> b. If a parent thread uses FPU, during fork() we allocate
> the FPU state in the child and copy the state etc. Shortly after this,
> during exec() we free it up, so that we can later allocate during
> the first usage of FPU. So this free/allocate might be slower
> for some workloads.
> 
> c. math_state_restore() is called from multiple places
> and it is error pone if the caller expects interrupts to be disabled
> throughout the execution of math_state_restore(). Can lead to subtle
> bugs like Ubuntu bug #1265841.
> 
> Memory savings will be small anyways and the code complexity
> introducing subtle bugs is not worth it. So remove
> the logic of non-eager fpu mem allocation at the first usage.
> 
> Signed-off-by: Suresh Siddha <sbsiddha@gmail.com>
> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
> ---
>  arch/x86/kernel/i387.c    |   22 +++++++++++++---------
>  arch/x86/kernel/process.c |    6 ------
>  arch/x86/kernel/traps.c   |   16 ++--------------
>  arch/x86/kernel/xsave.c   |    2 --
>  4 files changed, 15 insertions(+), 31 deletions(-)
> 
> diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
> index e8368c6..05aeae2 100644
> --- a/arch/x86/kernel/i387.c
> +++ b/arch/x86/kernel/i387.c
> @@ -5,6 +5,7 @@
>   *  General FPU state handling cleanups
>   *	Gareth Hughes <gareth@valinux.com>, May 2000
>   */
> +#include <linux/bootmem.h>
>  #include <linux/module.h>
>  #include <linux/regset.h>
>  #include <linux/sched.h>
> @@ -153,8 +154,15 @@ static void init_thread_xstate(void)
>   * into all processes.
>   */
>  
> +static void __init fpu_init_boot_cpu(void)
> +{
> +	current->thread.fpu.state =
> +		alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
> +}
> +
>  void fpu_init(void)
>  {
> +	static __refdata void (*boot_func)(void) = fpu_init_boot_cpu;
>  	unsigned long cr0;
>  	unsigned long cr4_mask = 0;
>  
> @@ -189,6 +197,11 @@ void fpu_init(void)
>  	mxcsr_feature_mask_init();
>  	xsave_init();
>  	eager_fpu_init();
> +
> +	if (boot_func) {
> +		boot_func();
> +		boot_func = NULL;
> +	}
>  }
>  
>  void fpu_finit(struct fpu *fpu)
> @@ -219,8 +232,6 @@ EXPORT_SYMBOL_GPL(fpu_finit);
>   */
>  int init_fpu(struct task_struct *tsk)
>  {
> -	int ret;
> -
>  	if (tsk_used_math(tsk)) {
>  		if (cpu_has_fpu && tsk == current)
>  			unlazy_fpu(tsk);
> @@ -228,13 +239,6 @@ int init_fpu(struct task_struct *tsk)
>  		return 0;
>  	}
>  
> -	/*
> -	 * Memory allocation at the first usage of the FPU and other state.
> -	 */
> -	ret = fpu_alloc(&tsk->thread.fpu);
> -	if (ret)
> -		return ret;
> -
>  	fpu_finit(&tsk->thread.fpu);
>  
>  	set_stopped_child_used_math(tsk);
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 3fb8d95..cd9c190 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -128,12 +128,6 @@ void flush_thread(void)
>  	flush_ptrace_hw_breakpoint(tsk);
>  	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
>  	drop_init_fpu(tsk);
> -	/*
> -	 * Free the FPU state for non xsave platforms. They get reallocated
> -	 * lazily at the first use.
> -	 */
> -	if (!use_eager_fpu())
> -		free_thread_xstate(tsk);
>  }
>  
>  static void hard_disable_TSC(void)
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 57409f6..3265429 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -623,20 +623,8 @@ void math_state_restore(void)
>  {
>  	struct task_struct *tsk = current;
>  
> -	if (!tsk_used_math(tsk)) {
> -		local_irq_enable();
> -		/*
> -		 * does a slab alloc which can sleep
> -		 */
> -		if (init_fpu(tsk)) {
> -			/*
> -			 * ran out of memory!
> -			 */
> -			do_group_exit(SIGKILL);
> -			return;
> -		}
> -		local_irq_disable();
> -	}
> +	if (!tsk_used_math(tsk))
> +		init_fpu(tsk);
>  
>  	__thread_fpu_begin(tsk);
>  
> diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
> index a4b451c..46f6266 100644
> --- a/arch/x86/kernel/xsave.c
> +++ b/arch/x86/kernel/xsave.c
> @@ -598,8 +598,6 @@ void xsave_init(void)
>  
>  static inline void __init eager_fpu_init_bp(void)
>  {
> -	current->thread.fpu.state =
> -	    alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
>  	if (!init_xstate_buf)
>  		setup_init_fpu_buf();
>  }
> -- 
> 1.7.2.5
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-19 13:21               ` Konrad Rzeszutek Wilk
@ 2014-03-19 15:02                 ` H. Peter Anvin
  2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
  2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
  2014-03-19 15:02                 ` H. Peter Anvin
  1 sibling, 2 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-19 15:02 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, David Vrabel
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, Suresh Siddha,
	Thomas Gleixner

On 03/19/2014 06:21 AM, Konrad Rzeszutek Wilk wrote:
>>
>> The following patch does the always eager allocation.  It's a fixup of
>> Suresh's original patch.
>>
> 
> Hey Peter,
> 
> I think this is the solution you were looking for?
> 
> Or are there some other subtle issues that you think lurk around?
> 

Ah, I managed to miss it (mostly because it was buried *inside* another
email and didn't change the subject line... I really dislike that mode
of delivering a patch.

Let me see if the issues have been fixed.  Still wondering if there is a
way we can get away without the boot_func hack...

	-hpa



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-19 15:02                 ` H. Peter Anvin
@ 2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
  2015-03-05 22:08                     ` H. Peter Anvin
  2015-03-05 22:08                     ` [PATCHv1] x86: don't schedule when handling #NM exception H. Peter Anvin
  2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
  1 sibling, 2 replies; 67+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-06-23 13:08 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: David Vrabel, Sarah Newman, linux-kernel, xen-devel, Ingo Molnar,
	Suresh Siddha, Thomas Gleixner

On Wed, Mar 19, 2014 at 08:02:22AM -0700, H. Peter Anvin wrote:
> On 03/19/2014 06:21 AM, Konrad Rzeszutek Wilk wrote:
> >>
> >> The following patch does the always eager allocation.  It's a fixup of
> >> Suresh's original patch.
> >>
> > 
> > Hey Peter,
> > 
> > I think this is the solution you were looking for?
> > 
> > Or are there some other subtle issues that you think lurk around?
> > 
> 
> Ah, I managed to miss it (mostly because it was buried *inside* another
> email and didn't change the subject line... I really dislike that mode
> of delivering a patch.

Let me roll up some of these patchset and send them as git send-email.

> 
> Let me see if the issues have been fixed.  Still wondering if there is a
> way we can get away without the boot_func hack...

I have to confesss I don't even remember what the 'if the issues have been
fixed' is referring to?

> 
> 	-hpa
> 
> 

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
@ 2015-03-05 22:08                     ` H. Peter Anvin
  2015-03-06 11:46                       ` [PATCHv4] x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage David Vrabel
  2015-03-06 11:46                       ` David Vrabel
  2015-03-05 22:08                     ` [PATCHv1] x86: don't schedule when handling #NM exception H. Peter Anvin
  1 sibling, 2 replies; 67+ messages in thread
From: H. Peter Anvin @ 2015-03-05 22:08 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: David Vrabel, Sarah Newman, linux-kernel, xen-devel, Ingo Molnar,
	Suresh Siddha, Thomas Gleixner

On 06/23/2014 06:08 AM, Konrad Rzeszutek Wilk wrote:
> On Wed, Mar 19, 2014 at 08:02:22AM -0700, H. Peter Anvin wrote:
>> On 03/19/2014 06:21 AM, Konrad Rzeszutek Wilk wrote:
>>>>
>>>> The following patch does the always eager allocation.  It's a fixup of
>>>> Suresh's original patch.
>>>>
>>>
>>> Hey Peter,
>>>
>>> I think this is the solution you were looking for?
>>>
>>> Or are there some other subtle issues that you think lurk around?
>>>
>>
>> Ah, I managed to miss it (mostly because it was buried *inside* another
>> email and didn't change the subject line... I really dislike that mode
>> of delivering a patch.
> 
> Let me roll up some of these patchset and send them as git send-email.
> 
>>
>> Let me see if the issues have been fixed.  Still wondering if there is a
>> way we can get away without the boot_func hack...
> 
> I have to confesss I don't even remember what the 'if the issues have been
> fixed' is referring to?
> 

Hi Konrad... it looks like this got left waiting for you and got forgotten?

	-hpa



^ permalink raw reply	[flat|nested] 67+ messages in thread

* [PATCHv4] x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage
  2015-03-05 22:08                     ` H. Peter Anvin
@ 2015-03-06 11:46                       ` David Vrabel
  2015-03-06 11:46                       ` David Vrabel
  1 sibling, 0 replies; 67+ messages in thread
From: David Vrabel @ 2015-03-06 11:46 UTC (permalink / raw)
  To: H. Peter Anvin, Konrad Rzeszutek Wilk
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, Suresh Siddha,
	Thomas Gleixner

On 05/03/15 22:08, H. Peter Anvin wrote:
> On 06/23/2014 06:08 AM, Konrad Rzeszutek Wilk wrote:
>> On Wed, Mar 19, 2014 at 08:02:22AM -0700, H. Peter Anvin wrote:
>>> On 03/19/2014 06:21 AM, Konrad Rzeszutek Wilk wrote:
>>>>>
>>>>> The following patch does the always eager allocation.  It's a fixup of
>>>>> Suresh's original patch.
>>>>>
>>>>
>>>> Hey Peter,
>>>>
>>>> I think this is the solution you were looking for?
>>>>
>>>> Or are there some other subtle issues that you think lurk around?
>>>>
>>>
>>> Ah, I managed to miss it (mostly because it was buried *inside* another
>>> email and didn't change the subject line... I really dislike that mode
>>> of delivering a patch.
>>
>> Let me roll up some of these patchset and send them as git send-email.
>>
>>>
>>> Let me see if the issues have been fixed.  Still wondering if there is a
>>> way we can get away without the boot_func hack...
>>
>> I have to confesss I don't even remember what the 'if the issues have been
>> fixed' is referring to?
>>
> 
> Hi Konrad... it looks like this got left waiting for you and got forgotten?

8<--------------------------------
x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage

For non-eager fpu mode, thread's fpu state is allocated during the first
fpu usage (in the context of device not available exception). This can be
a blocking call and hence we enable interrupts (which were originally
disabled when the exception happened), allocate memory and disable
interrupts etc. While this saves 512 bytes or so per-thread, there
are some issues in general.

a.  Most of the future cases will be anyway using eager
FPU (because of processor features like xsaveopt, LWP, MPX etc) and
they do the allocation at the thread creation itself. Nice to have
one common mechanism as all the state save/restore code is
shared. Avoids the confusion and minimizes the subtle bugs
in the core piece involved with context-switch.

b. If a parent thread uses FPU, during fork() we allocate
the FPU state in the child and copy the state etc. Shortly after this,
during exec() we free it up, so that we can later allocate during
the first usage of FPU. So this free/allocate might be slower
for some workloads.

c. math_state_restore() is called from multiple places
and it is error pone if the caller expects interrupts to be disabled
throughout the execution of math_state_restore(). Can lead to subtle
bugs like Ubuntu bug #1265841.

Memory savings will be small anyways and the code complexity
introducing subtle bugs is not worth it. So remove
the logic of non-eager fpu mem allocation at the first usage.

Signed-off-by: Suresh Siddha <sbsiddha@gmail.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
v4:
- add __ref to fpu_init() which needs to allocate the fpu state memory
  for the first task on the boot cpu.

v3:
- Rebase on 3.17-rc3.

v2:
- Tweak condition for allocating the first thread's FPU state.
---
 arch/x86/kernel/i387.c    |   20 +++++++++++---------
 arch/x86/kernel/process.c |    6 ------
 arch/x86/kernel/traps.c   |   16 ++--------------
 arch/x86/kernel/xsave.c   |    2 --
 4 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fc..089defc 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -5,6 +5,7 @@
  *  General FPU state handling cleanups
  *	Gareth Hughes <gareth@valinux.com>, May 2000
  */
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/regset.h>
 #include <linux/sched.h>
@@ -211,6 +212,16 @@ void fpu_init(void)
 
 	mxcsr_feature_mask_init();
 	xsave_init();
+
+	/*
+	 * Now we know the final size of the xsave data, allocate the
+	 * FPU state area for the first task. All other tasks have
+	 * this allocated in arch_dup_task_struct().
+	 */
+	if (!current->thread.fpu.state)
+		current->thread.fpu.state = alloc_bootmem_align(xstate_size,
+				__alignof__(struct xsave_struct));
+
 	eager_fpu_init();
 }
 
@@ -242,8 +253,6 @@ EXPORT_SYMBOL_GPL(fpu_finit);
  */
 int init_fpu(struct task_struct *tsk)
 {
-	int ret;
-
 	if (tsk_used_math(tsk)) {
 		if (cpu_has_fpu && tsk == current)
 			unlazy_fpu(tsk);
@@ -251,13 +260,6 @@ int init_fpu(struct task_struct *tsk)
 		return 0;
 	}
 
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	ret = fpu_alloc(&tsk->thread.fpu);
-	if (ret)
-		return ret;
-
 	fpu_finit(&tsk->thread.fpu);
 
 	set_stopped_child_used_math(tsk);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d6..5f4e746 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -132,12 +132,6 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	drop_init_fpu(tsk);
-	/*
-	 * Free the FPU state for non xsave platforms. They get reallocated
-	 * lazily at the first use.
-	 */
-	if (!use_eager_fpu())
-		free_thread_xstate(tsk);
 }
 
 static void hard_disable_TSC(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9d2073e..dbbf5e0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -844,20 +844,8 @@ void math_state_restore(void)
 {
 	struct task_struct *tsk = current;
 
-	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(tsk)) {
-			/*
-			 * ran out of memory!
-			 */
-			do_group_exit(SIGKILL);
-			return;
-		}
-		local_irq_disable();
-	}
+	if (!tsk_used_math(tsk))
+		init_fpu(tsk);
 
 	/* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
 	kernel_fpu_disable();
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 34f66e5..5e21dd9 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -679,8 +679,6 @@ void xsave_init(void)
 
 static inline void __init eager_fpu_init_bp(void)
 {
-	current->thread.fpu.state =
-	    alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
 	if (!init_xstate_buf)
 		setup_init_fpu_buf();
 }
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 67+ messages in thread

* [PATCHv4] x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage
  2015-03-05 22:08                     ` H. Peter Anvin
  2015-03-06 11:46                       ` [PATCHv4] x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage David Vrabel
@ 2015-03-06 11:46                       ` David Vrabel
  1 sibling, 0 replies; 67+ messages in thread
From: David Vrabel @ 2015-03-06 11:46 UTC (permalink / raw)
  To: H. Peter Anvin, Konrad Rzeszutek Wilk
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, Suresh Siddha,
	Thomas Gleixner

On 05/03/15 22:08, H. Peter Anvin wrote:
> On 06/23/2014 06:08 AM, Konrad Rzeszutek Wilk wrote:
>> On Wed, Mar 19, 2014 at 08:02:22AM -0700, H. Peter Anvin wrote:
>>> On 03/19/2014 06:21 AM, Konrad Rzeszutek Wilk wrote:
>>>>>
>>>>> The following patch does the always eager allocation.  It's a fixup of
>>>>> Suresh's original patch.
>>>>>
>>>>
>>>> Hey Peter,
>>>>
>>>> I think this is the solution you were looking for?
>>>>
>>>> Or are there some other subtle issues that you think lurk around?
>>>>
>>>
>>> Ah, I managed to miss it (mostly because it was buried *inside* another
>>> email and didn't change the subject line... I really dislike that mode
>>> of delivering a patch.
>>
>> Let me roll up some of these patchset and send them as git send-email.
>>
>>>
>>> Let me see if the issues have been fixed.  Still wondering if there is a
>>> way we can get away without the boot_func hack...
>>
>> I have to confesss I don't even remember what the 'if the issues have been
>> fixed' is referring to?
>>
> 
> Hi Konrad... it looks like this got left waiting for you and got forgotten?

8<--------------------------------
x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage

For non-eager fpu mode, thread's fpu state is allocated during the first
fpu usage (in the context of device not available exception). This can be
a blocking call and hence we enable interrupts (which were originally
disabled when the exception happened), allocate memory and disable
interrupts etc. While this saves 512 bytes or so per-thread, there
are some issues in general.

a.  Most of the future cases will be anyway using eager
FPU (because of processor features like xsaveopt, LWP, MPX etc) and
they do the allocation at the thread creation itself. Nice to have
one common mechanism as all the state save/restore code is
shared. Avoids the confusion and minimizes the subtle bugs
in the core piece involved with context-switch.

b. If a parent thread uses FPU, during fork() we allocate
the FPU state in the child and copy the state etc. Shortly after this,
during exec() we free it up, so that we can later allocate during
the first usage of FPU. So this free/allocate might be slower
for some workloads.

c. math_state_restore() is called from multiple places
and it is error pone if the caller expects interrupts to be disabled
throughout the execution of math_state_restore(). Can lead to subtle
bugs like Ubuntu bug #1265841.

Memory savings will be small anyways and the code complexity
introducing subtle bugs is not worth it. So remove
the logic of non-eager fpu mem allocation at the first usage.

Signed-off-by: Suresh Siddha <sbsiddha@gmail.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
v4:
- add __ref to fpu_init() which needs to allocate the fpu state memory
  for the first task on the boot cpu.

v3:
- Rebase on 3.17-rc3.

v2:
- Tweak condition for allocating the first thread's FPU state.
---
 arch/x86/kernel/i387.c    |   20 +++++++++++---------
 arch/x86/kernel/process.c |    6 ------
 arch/x86/kernel/traps.c   |   16 ++--------------
 arch/x86/kernel/xsave.c   |    2 --
 4 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fc..089defc 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -5,6 +5,7 @@
  *  General FPU state handling cleanups
  *	Gareth Hughes <gareth@valinux.com>, May 2000
  */
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/regset.h>
 #include <linux/sched.h>
@@ -211,6 +212,16 @@ void fpu_init(void)
 
 	mxcsr_feature_mask_init();
 	xsave_init();
+
+	/*
+	 * Now we know the final size of the xsave data, allocate the
+	 * FPU state area for the first task. All other tasks have
+	 * this allocated in arch_dup_task_struct().
+	 */
+	if (!current->thread.fpu.state)
+		current->thread.fpu.state = alloc_bootmem_align(xstate_size,
+				__alignof__(struct xsave_struct));
+
 	eager_fpu_init();
 }
 
@@ -242,8 +253,6 @@ EXPORT_SYMBOL_GPL(fpu_finit);
  */
 int init_fpu(struct task_struct *tsk)
 {
-	int ret;
-
 	if (tsk_used_math(tsk)) {
 		if (cpu_has_fpu && tsk == current)
 			unlazy_fpu(tsk);
@@ -251,13 +260,6 @@ int init_fpu(struct task_struct *tsk)
 		return 0;
 	}
 
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	ret = fpu_alloc(&tsk->thread.fpu);
-	if (ret)
-		return ret;
-
 	fpu_finit(&tsk->thread.fpu);
 
 	set_stopped_child_used_math(tsk);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d6..5f4e746 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -132,12 +132,6 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	drop_init_fpu(tsk);
-	/*
-	 * Free the FPU state for non xsave platforms. They get reallocated
-	 * lazily at the first use.
-	 */
-	if (!use_eager_fpu())
-		free_thread_xstate(tsk);
 }
 
 static void hard_disable_TSC(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9d2073e..dbbf5e0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -844,20 +844,8 @@ void math_state_restore(void)
 {
 	struct task_struct *tsk = current;
 
-	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(tsk)) {
-			/*
-			 * ran out of memory!
-			 */
-			do_group_exit(SIGKILL);
-			return;
-		}
-		local_irq_disable();
-	}
+	if (!tsk_used_math(tsk))
+		init_fpu(tsk);
 
 	/* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
 	kernel_fpu_disable();
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 34f66e5..5e21dd9 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -679,8 +679,6 @@ void xsave_init(void)
 
 static inline void __init eager_fpu_init_bp(void)
 {
-	current->thread.fpu.state =
-	    alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
 	if (!init_xstate_buf)
 		setup_init_fpu_buf();
 }
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
  2015-03-05 22:08                     ` H. Peter Anvin
@ 2015-03-05 22:08                     ` H. Peter Anvin
  1 sibling, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2015-03-05 22:08 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, David Vrabel,
	Suresh Siddha, Thomas Gleixner

On 06/23/2014 06:08 AM, Konrad Rzeszutek Wilk wrote:
> On Wed, Mar 19, 2014 at 08:02:22AM -0700, H. Peter Anvin wrote:
>> On 03/19/2014 06:21 AM, Konrad Rzeszutek Wilk wrote:
>>>>
>>>> The following patch does the always eager allocation.  It's a fixup of
>>>> Suresh's original patch.
>>>>
>>>
>>> Hey Peter,
>>>
>>> I think this is the solution you were looking for?
>>>
>>> Or are there some other subtle issues that you think lurk around?
>>>
>>
>> Ah, I managed to miss it (mostly because it was buried *inside* another
>> email and didn't change the subject line... I really dislike that mode
>> of delivering a patch.
> 
> Let me roll up some of these patchset and send them as git send-email.
> 
>>
>> Let me see if the issues have been fixed.  Still wondering if there is a
>> way we can get away without the boot_func hack...
> 
> I have to confesss I don't even remember what the 'if the issues have been
> fixed' is referring to?
> 

Hi Konrad... it looks like this got left waiting for you and got forgotten?

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-19 15:02                 ` H. Peter Anvin
  2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
@ 2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 67+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-06-23 13:08 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, David Vrabel,
	Suresh Siddha, Thomas Gleixner

On Wed, Mar 19, 2014 at 08:02:22AM -0700, H. Peter Anvin wrote:
> On 03/19/2014 06:21 AM, Konrad Rzeszutek Wilk wrote:
> >>
> >> The following patch does the always eager allocation.  It's a fixup of
> >> Suresh's original patch.
> >>
> > 
> > Hey Peter,
> > 
> > I think this is the solution you were looking for?
> > 
> > Or are there some other subtle issues that you think lurk around?
> > 
> 
> Ah, I managed to miss it (mostly because it was buried *inside* another
> email and didn't change the subject line... I really dislike that mode
> of delivering a patch.

Let me roll up some of these patchset and send them as git send-email.

> 
> Let me see if the issues have been fixed.  Still wondering if there is a
> way we can get away without the boot_func hack...

I have to confesss I don't even remember what the 'if the issues have been
fixed' is referring to?

> 
> 	-hpa
> 
> 

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-19 13:21               ` Konrad Rzeszutek Wilk
  2014-03-19 15:02                 ` H. Peter Anvin
@ 2014-03-19 15:02                 ` H. Peter Anvin
  1 sibling, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-19 15:02 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, David Vrabel
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, Suresh Siddha,
	Thomas Gleixner

On 03/19/2014 06:21 AM, Konrad Rzeszutek Wilk wrote:
>>
>> The following patch does the always eager allocation.  It's a fixup of
>> Suresh's original patch.
>>
> 
> Hey Peter,
> 
> I think this is the solution you were looking for?
> 
> Or are there some other subtle issues that you think lurk around?
> 

Ah, I managed to miss it (mostly because it was buried *inside* another
email and didn't change the subject line... I really dislike that mode
of delivering a patch.

Let me see if the issues have been fixed.  Still wondering if there is a
way we can get away without the boot_func hack...

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 13:29             ` [Xen-devel] " David Vrabel
  2014-03-19 13:21               ` Konrad Rzeszutek Wilk
@ 2014-03-19 13:21               ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 67+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-03-19 13:21 UTC (permalink / raw)
  To: David Vrabel
  Cc: H. Peter Anvin, Sarah Newman, linux-kernel, xen-devel,
	Ingo Molnar, Suresh Siddha, Thomas Gleixner

On Mon, Mar 17, 2014 at 01:29:03PM +0000, David Vrabel wrote:
> On 17/03/14 03:43, H. Peter Anvin wrote:
> > On 03/16/2014 08:35 PM, Sarah Newman wrote:
> >> Can you please review my patch first?  It's only enabled when absolutely required.
> > 
> > It doesn't help.  It means you're running on Xen, and you will have
> > processes subjected to random SIGKILL because they happen to touch the
> > FPU when the atomic pool is low.
> > 
> > However, there is probably a happy medium: you don't actually need eager
> > FPU restore, you just need eager FPU *allocation*.  We have been
> > intending to allocate the FPU state at task creation time for eagerfpu,
> > and Suresh Siddha has already produced such a patch; it just needs some
> > minor fixups due to an __init failure.
> > 
> > http://lkml.kernel.org/r/1391325599.6481.5.camel@europa
> > 
> > In the Xen case we could turn on eager allocation but not eager fpu.  In
> > fact, it might be justified to *always* do eager allocation...
> 
> The following patch does the always eager allocation.  It's a fixup of
> Suresh's original patch.
> 

Hey Peter,

I think this is the solution you were looking for?

Or are there some other subtle issues that you think lurk around?

Thanks!
> v2:
> - Allocate initial fpu state after xsave_init().
> - Only allocate initial FPU state on boot cpu.
> 
> 8<-----------------------
> x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage
> 
> From: Suresh Siddha <sbsiddha@gmail.com>
> 
> For non-eager fpu mode, thread's fpu state is allocated during the first
> fpu usage (in the context of device not available exception). This can be
> a blocking call and hence we enable interrupts (which were originally
> disabled when the exception happened), allocate memory and disable
> interrupts etc. While this saves 512 bytes or so per-thread, there
> are some issues in general.
> 
> a.  Most of the future cases will be anyway using eager
> FPU (because of processor features like xsaveopt, LWP, MPX etc) and
> they do the allocation at the thread creation itself. Nice to have
> one common mechanism as all the state save/restore code is
> shared. Avoids the confusion and minimizes the subtle bugs
> in the core piece involved with context-switch.
> 
> b. If a parent thread uses FPU, during fork() we allocate
> the FPU state in the child and copy the state etc. Shortly after this,
> during exec() we free it up, so that we can later allocate during
> the first usage of FPU. So this free/allocate might be slower
> for some workloads.
> 
> c. math_state_restore() is called from multiple places
> and it is error pone if the caller expects interrupts to be disabled
> throughout the execution of math_state_restore(). Can lead to subtle
> bugs like Ubuntu bug #1265841.
> 
> Memory savings will be small anyways and the code complexity
> introducing subtle bugs is not worth it. So remove
> the logic of non-eager fpu mem allocation at the first usage.
> 
> Signed-off-by: Suresh Siddha <sbsiddha@gmail.com>
> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
> ---
>  arch/x86/kernel/i387.c    |   22 +++++++++++++---------
>  arch/x86/kernel/process.c |    6 ------
>  arch/x86/kernel/traps.c   |   16 ++--------------
>  arch/x86/kernel/xsave.c   |    2 --
>  4 files changed, 15 insertions(+), 31 deletions(-)
> 
> diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
> index e8368c6..05aeae2 100644
> --- a/arch/x86/kernel/i387.c
> +++ b/arch/x86/kernel/i387.c
> @@ -5,6 +5,7 @@
>   *  General FPU state handling cleanups
>   *	Gareth Hughes <gareth@valinux.com>, May 2000
>   */
> +#include <linux/bootmem.h>
>  #include <linux/module.h>
>  #include <linux/regset.h>
>  #include <linux/sched.h>
> @@ -153,8 +154,15 @@ static void init_thread_xstate(void)
>   * into all processes.
>   */
>  
> +static void __init fpu_init_boot_cpu(void)
> +{
> +	current->thread.fpu.state =
> +		alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
> +}
> +
>  void fpu_init(void)
>  {
> +	static __refdata void (*boot_func)(void) = fpu_init_boot_cpu;
>  	unsigned long cr0;
>  	unsigned long cr4_mask = 0;
>  
> @@ -189,6 +197,11 @@ void fpu_init(void)
>  	mxcsr_feature_mask_init();
>  	xsave_init();
>  	eager_fpu_init();
> +
> +	if (boot_func) {
> +		boot_func();
> +		boot_func = NULL;
> +	}
>  }
>  
>  void fpu_finit(struct fpu *fpu)
> @@ -219,8 +232,6 @@ EXPORT_SYMBOL_GPL(fpu_finit);
>   */
>  int init_fpu(struct task_struct *tsk)
>  {
> -	int ret;
> -
>  	if (tsk_used_math(tsk)) {
>  		if (cpu_has_fpu && tsk == current)
>  			unlazy_fpu(tsk);
> @@ -228,13 +239,6 @@ int init_fpu(struct task_struct *tsk)
>  		return 0;
>  	}
>  
> -	/*
> -	 * Memory allocation at the first usage of the FPU and other state.
> -	 */
> -	ret = fpu_alloc(&tsk->thread.fpu);
> -	if (ret)
> -		return ret;
> -
>  	fpu_finit(&tsk->thread.fpu);
>  
>  	set_stopped_child_used_math(tsk);
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 3fb8d95..cd9c190 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -128,12 +128,6 @@ void flush_thread(void)
>  	flush_ptrace_hw_breakpoint(tsk);
>  	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
>  	drop_init_fpu(tsk);
> -	/*
> -	 * Free the FPU state for non xsave platforms. They get reallocated
> -	 * lazily at the first use.
> -	 */
> -	if (!use_eager_fpu())
> -		free_thread_xstate(tsk);
>  }
>  
>  static void hard_disable_TSC(void)
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 57409f6..3265429 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -623,20 +623,8 @@ void math_state_restore(void)
>  {
>  	struct task_struct *tsk = current;
>  
> -	if (!tsk_used_math(tsk)) {
> -		local_irq_enable();
> -		/*
> -		 * does a slab alloc which can sleep
> -		 */
> -		if (init_fpu(tsk)) {
> -			/*
> -			 * ran out of memory!
> -			 */
> -			do_group_exit(SIGKILL);
> -			return;
> -		}
> -		local_irq_disable();
> -	}
> +	if (!tsk_used_math(tsk))
> +		init_fpu(tsk);
>  
>  	__thread_fpu_begin(tsk);
>  
> diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
> index a4b451c..46f6266 100644
> --- a/arch/x86/kernel/xsave.c
> +++ b/arch/x86/kernel/xsave.c
> @@ -598,8 +598,6 @@ void xsave_init(void)
>  
>  static inline void __init eager_fpu_init_bp(void)
>  {
> -	current->thread.fpu.state =
> -	    alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
>  	if (!init_xstate_buf)
>  		setup_init_fpu_buf();
>  }
> -- 
> 1.7.2.5
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:43           ` H. Peter Anvin
                               ` (2 preceding siblings ...)
  2014-03-17 13:29             ` [Xen-devel] " David Vrabel
@ 2014-03-17 13:29             ` David Vrabel
  3 siblings, 0 replies; 67+ messages in thread
From: David Vrabel @ 2014-03-17 13:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Sarah Newman, linux-kernel, xen-devel, Ingo Molnar, Suresh Siddha,
	Thomas Gleixner

On 17/03/14 03:43, H. Peter Anvin wrote:
> On 03/16/2014 08:35 PM, Sarah Newman wrote:
>> Can you please review my patch first?  It's only enabled when absolutely required.
> 
> It doesn't help.  It means you're running on Xen, and you will have
> processes subjected to random SIGKILL because they happen to touch the
> FPU when the atomic pool is low.
> 
> However, there is probably a happy medium: you don't actually need eager
> FPU restore, you just need eager FPU *allocation*.  We have been
> intending to allocate the FPU state at task creation time for eagerfpu,
> and Suresh Siddha has already produced such a patch; it just needs some
> minor fixups due to an __init failure.
> 
> http://lkml.kernel.org/r/1391325599.6481.5.camel@europa
> 
> In the Xen case we could turn on eager allocation but not eager fpu.  In
> fact, it might be justified to *always* do eager allocation...

The following patch does the always eager allocation.  It's a fixup of
Suresh's original patch.

v2:
- Allocate initial fpu state after xsave_init().
- Only allocate initial FPU state on boot cpu.

8<-----------------------
x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage

From: Suresh Siddha <sbsiddha@gmail.com>

For non-eager fpu mode, thread's fpu state is allocated during the first
fpu usage (in the context of device not available exception). This can be
a blocking call and hence we enable interrupts (which were originally
disabled when the exception happened), allocate memory and disable
interrupts etc. While this saves 512 bytes or so per-thread, there
are some issues in general.

a.  Most of the future cases will be anyway using eager
FPU (because of processor features like xsaveopt, LWP, MPX etc) and
they do the allocation at the thread creation itself. Nice to have
one common mechanism as all the state save/restore code is
shared. Avoids the confusion and minimizes the subtle bugs
in the core piece involved with context-switch.

b. If a parent thread uses FPU, during fork() we allocate
the FPU state in the child and copy the state etc. Shortly after this,
during exec() we free it up, so that we can later allocate during
the first usage of FPU. So this free/allocate might be slower
for some workloads.

c. math_state_restore() is called from multiple places
and it is error pone if the caller expects interrupts to be disabled
throughout the execution of math_state_restore(). Can lead to subtle
bugs like Ubuntu bug #1265841.

Memory savings will be small anyways and the code complexity
introducing subtle bugs is not worth it. So remove
the logic of non-eager fpu mem allocation at the first usage.

Signed-off-by: Suresh Siddha <sbsiddha@gmail.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/kernel/i387.c    |   22 +++++++++++++---------
 arch/x86/kernel/process.c |    6 ------
 arch/x86/kernel/traps.c   |   16 ++--------------
 arch/x86/kernel/xsave.c   |    2 --
 4 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e8368c6..05aeae2 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -5,6 +5,7 @@
  *  General FPU state handling cleanups
  *	Gareth Hughes <gareth@valinux.com>, May 2000
  */
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/regset.h>
 #include <linux/sched.h>
@@ -153,8 +154,15 @@ static void init_thread_xstate(void)
  * into all processes.
  */
 
+static void __init fpu_init_boot_cpu(void)
+{
+	current->thread.fpu.state =
+		alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
+}
+
 void fpu_init(void)
 {
+	static __refdata void (*boot_func)(void) = fpu_init_boot_cpu;
 	unsigned long cr0;
 	unsigned long cr4_mask = 0;
 
@@ -189,6 +197,11 @@ void fpu_init(void)
 	mxcsr_feature_mask_init();
 	xsave_init();
 	eager_fpu_init();
+
+	if (boot_func) {
+		boot_func();
+		boot_func = NULL;
+	}
 }
 
 void fpu_finit(struct fpu *fpu)
@@ -219,8 +232,6 @@ EXPORT_SYMBOL_GPL(fpu_finit);
  */
 int init_fpu(struct task_struct *tsk)
 {
-	int ret;
-
 	if (tsk_used_math(tsk)) {
 		if (cpu_has_fpu && tsk == current)
 			unlazy_fpu(tsk);
@@ -228,13 +239,6 @@ int init_fpu(struct task_struct *tsk)
 		return 0;
 	}
 
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	ret = fpu_alloc(&tsk->thread.fpu);
-	if (ret)
-		return ret;
-
 	fpu_finit(&tsk->thread.fpu);
 
 	set_stopped_child_used_math(tsk);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95..cd9c190 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -128,12 +128,6 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	drop_init_fpu(tsk);
-	/*
-	 * Free the FPU state for non xsave platforms. They get reallocated
-	 * lazily at the first use.
-	 */
-	if (!use_eager_fpu())
-		free_thread_xstate(tsk);
 }
 
 static void hard_disable_TSC(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 57409f6..3265429 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -623,20 +623,8 @@ void math_state_restore(void)
 {
 	struct task_struct *tsk = current;
 
-	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(tsk)) {
-			/*
-			 * ran out of memory!
-			 */
-			do_group_exit(SIGKILL);
-			return;
-		}
-		local_irq_disable();
-	}
+	if (!tsk_used_math(tsk))
+		init_fpu(tsk);
 
 	__thread_fpu_begin(tsk);
 
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c..46f6266 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -598,8 +598,6 @@ void xsave_init(void)
 
 static inline void __init eager_fpu_init_bp(void)
 {
-	current->thread.fpu.state =
-	    alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
 	if (!init_xstate_buf)
 		setup_init_fpu_buf();
 }
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:35         ` [Xen-devel] " Sarah Newman
  2014-03-17  3:43           ` H. Peter Anvin
@ 2014-03-17  3:43           ` H. Peter Anvin
  1 sibling, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17  3:43 UTC (permalink / raw)
  To: Sarah Newman, David Vrabel
  Cc: Suresh Siddha, Thomas Gleixner, Ingo Molnar, linux-kernel,
	xen-devel

On 03/16/2014 08:35 PM, Sarah Newman wrote:
> Can you please review my patch first?  It's only enabled when absolutely required.

It doesn't help.  It means you're running on Xen, and you will have
processes subjected to random SIGKILL because they happen to touch the
FPU when the atomic pool is low.

However, there is probably a happy medium: you don't actually need eager
FPU restore, you just need eager FPU *allocation*.  We have been
intending to allocate the FPU state at task creation time for eagerfpu,
and Suresh Siddha has already produced such a patch; it just needs some
minor fixups due to an __init failure.

http://lkml.kernel.org/r/1391325599.6481.5.camel@europa

In the Xen case we could turn on eager allocation but not eager fpu.  In
fact, it might be justified to *always* do eager allocation...

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:33       ` H. Peter Anvin
  2014-03-17  3:35         ` [Xen-devel] " Sarah Newman
@ 2014-03-17  3:35         ` Sarah Newman
  2014-03-17 12:19         ` George Dunlap
  2014-03-17 12:19         ` [Xen-devel] " George Dunlap
  3 siblings, 0 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-17  3:35 UTC (permalink / raw)
  To: H. Peter Anvin, David Vrabel
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel, xen-devel

Can you please review my patch first?  It's only enabled when absolutely required.

On 03/16/2014 08:33 PM, H. Peter Anvin wrote:
> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a workaround for the legacy hypervisor versions.
> 
> GFP_ATOMIC -> SIGKILL is definitely a NAK.
> 
> On March 16, 2014 8:13:05 PM PDT, Sarah Newman <srn@prgmr.com> wrote:
>> On 03/10/2014 10:15 AM, David Vrabel wrote:
>>> On 10/03/14 16:40, H. Peter Anvin wrote:
>>>> On 03/10/2014 09:17 AM, David Vrabel wrote:
>>>>> math_state_restore() is called from the #NM exception handler.  It
>> may
>>>>> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
>>>>>
>>>>> Change this allocation to GFP_ATOMIC, but leave all the other
>> callers
>>>>> of init_fpu() or fpu_alloc() using GFP_KERNEL.
>>>>
>>>> And what the [Finnish] do you do if GFP_ATOMIC fails?
>>>
>>> The same thing it used to do -- kill the task with SIGKILL.  I
>> haven't
>>> changed this behaviour.
>>>
>>>> Sarah's patchset switches Xen PV to use eagerfpu unconditionally,
>> which
>>>> removes the dependency on #NM and is the right thing to do.
>>>
>>> Ok. I'll wait for this series and not pursue this patch any further.
>>
>> Sorry, this got swallowed by my mail filter.
>>
>> I did some more testing and I think eagerfpu is going to noticeably
>> slow things down. When I ran
>> "time sysbench --num-threads=64 --test=threads run" I saw on the order
>> of 15% more time spent in
>> system mode and this seemed consistent over different runs.
>>
>> As for GFP_ATOMIC, unfortunately I don't know a sanctioned test here so
>> I rolled my own. This test
>> sequentially allocated math-using processes in the background until it
>> could not any more.  On a
>> 64MB instance, I saw 10% fewer processes allocated with GFP_ATOMIC
>> compared to GFP_KERNEL when I
>> continually allocated new processes up to OOM conditions (256 vs 228.) 
>> A similar test on a
>> different RFS and a kernel using GFP_NOWAIT showed pretty much no
>> difference in how many processes I
>> could allocate. This doesn't seem too bad unless there is some kind of
>> fragmentation over time which
>> would cause worse performance.
>>
>> Since performance degradation applies at all times and not just under
>> extreme conditions, I think
>> the lesser evil will actually be GFP_ATOMIC.  But it's not necessary to
>> always use GFP_ATOMIC, only
>> under certain conditions - IE when the xen PVABI forces us to.
>>
>> Patches will be supplied shortly.
> 

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:33       ` H. Peter Anvin
  2014-03-17  3:35         ` [Xen-devel] " Sarah Newman
  2014-03-17  3:35         ` Sarah Newman
@ 2014-03-17 12:19         ` George Dunlap
  2014-03-17 12:19         ` [Xen-devel] " George Dunlap
  3 siblings, 0 replies; 67+ messages in thread
From: George Dunlap @ 2014-03-17 12:19 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Linux Kernel Mailing List, xen-devel@lists.xen.org, Ingo Molnar,
	David Vrabel, Sarah Newman, Thomas Gleixner

On Mon, Mar 17, 2014 at 3:33 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a workaround for the legacy hypervisor versions.

The interface wasn't an accident.  In the most common case you'll want
to clear the bit anyway. In PV mode clearing it would require an extra
trip up into the hypervisor.  So this saves one trip up into the
hypervisor on every context switch which involves an FPU, at the
expense of not being able to context-switch away when handling the
trap.

 -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17  3:33       ` H. Peter Anvin
                           ` (2 preceding siblings ...)
  2014-03-17 12:19         ` George Dunlap
@ 2014-03-17 12:19         ` George Dunlap
  2014-03-17 16:55           ` H. Peter Anvin
  2014-03-17 16:55           ` H. Peter Anvin
  3 siblings, 2 replies; 67+ messages in thread
From: George Dunlap @ 2014-03-17 12:19 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Sarah Newman, David Vrabel, Thomas Gleixner, Ingo Molnar,
	Linux Kernel Mailing List, xen-devel@lists.xen.org

On Mon, Mar 17, 2014 at 3:33 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a workaround for the legacy hypervisor versions.

The interface wasn't an accident.  In the most common case you'll want
to clear the bit anyway. In PV mode clearing it would require an extra
trip up into the hypervisor.  So this saves one trip up into the
hypervisor on every context switch which involves an FPU, at the
expense of not being able to context-switch away when handling the
trap.

 -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 12:19         ` [Xen-devel] " George Dunlap
@ 2014-03-17 16:55           ` H. Peter Anvin
  2014-03-17 17:05             ` Jan Beulich
  2014-03-17 17:05             ` [Xen-devel] " Jan Beulich
  2014-03-17 16:55           ` H. Peter Anvin
  1 sibling, 2 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17 16:55 UTC (permalink / raw)
  To: George Dunlap
  Cc: Sarah Newman, David Vrabel, Thomas Gleixner, Ingo Molnar,
	Linux Kernel Mailing List, xen-devel@lists.xen.org

On 03/17/2014 05:19 AM, George Dunlap wrote:
> On Mon, Mar 17, 2014 at 3:33 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a workaround for the legacy hypervisor versions.
> 
> The interface wasn't an accident.  In the most common case you'll want
> to clear the bit anyway. In PV mode clearing it would require an extra
> trip up into the hypervisor.  So this saves one trip up into the
> hypervisor on every context switch which involves an FPU, at the
> expense of not being able to context-switch away when handling the
> trap.
> 
>  -George
> 

The interface was a complete faceplant, because it caused failures.
You're not infinitely unconstrained since you want to play in the same
sandbox as the native architecture, and if you want to have a hope of
avoiding these kinds of failures you really need to avoid making random
"improvements", certainly not without an explicit guest opt-in (the same
we do for the native CPU architecture when adding new features.)

So if this interface wasn't an accident it was active negligence and
incompetence.

	-hpa


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 16:55           ` H. Peter Anvin
@ 2014-03-17 17:05             ` Jan Beulich
  2014-03-17 17:05             ` [Xen-devel] " Jan Beulich
  1 sibling, 0 replies; 67+ messages in thread
From: Jan Beulich @ 2014-03-17 17:05 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: George Dunlap, Linux Kernel Mailing List, xen-devel@lists.xen.org,
	Ingo Molnar, David Vrabel, Sarah Newman, Thomas Gleixner

>>> On 17.03.14 at 17:55, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 03/17/2014 05:19 AM, George Dunlap wrote:
>> On Mon, Mar 17, 2014 at 3:33 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a 
> workaround for the legacy hypervisor versions.
>> 
>> The interface wasn't an accident.  In the most common case you'll want
>> to clear the bit anyway. In PV mode clearing it would require an extra
>> trip up into the hypervisor.  So this saves one trip up into the
>> hypervisor on every context switch which involves an FPU, at the
>> expense of not being able to context-switch away when handling the
>> trap.
> 
> The interface was a complete faceplant, because it caused failures.
> You're not infinitely unconstrained since you want to play in the same
> sandbox as the native architecture, and if you want to have a hope of
> avoiding these kinds of failures you really need to avoid making random
> "improvements", certainly not without an explicit guest opt-in (the same
> we do for the native CPU architecture when adding new features.)
> 
> So if this interface wasn't an accident it was active negligence and
> incompetence.

I don't think so - while it (as we now see) disallows certain things
inside the guest, back at the time when this was designed there was
no sign of any sort of allocation/scheduling being done inside the
#NM handler. And furthermore, a PV specification is by its nature
allowed to define deviations from real hardware behavior, or else it
wouldn't be needed in the first place.

Jan

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 16:55           ` H. Peter Anvin
  2014-03-17 17:05             ` Jan Beulich
@ 2014-03-17 17:05             ` Jan Beulich
  2014-03-17 17:12               ` H. Peter Anvin
                                 ` (3 more replies)
  1 sibling, 4 replies; 67+ messages in thread
From: Jan Beulich @ 2014-03-17 17:05 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: David Vrabel, George Dunlap, Thomas Gleixner,
	xen-devel@lists.xen.org, Sarah Newman, Ingo Molnar,
	Linux Kernel Mailing List

>>> On 17.03.14 at 17:55, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 03/17/2014 05:19 AM, George Dunlap wrote:
>> On Mon, Mar 17, 2014 at 3:33 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a 
> workaround for the legacy hypervisor versions.
>> 
>> The interface wasn't an accident.  In the most common case you'll want
>> to clear the bit anyway. In PV mode clearing it would require an extra
>> trip up into the hypervisor.  So this saves one trip up into the
>> hypervisor on every context switch which involves an FPU, at the
>> expense of not being able to context-switch away when handling the
>> trap.
> 
> The interface was a complete faceplant, because it caused failures.
> You're not infinitely unconstrained since you want to play in the same
> sandbox as the native architecture, and if you want to have a hope of
> avoiding these kinds of failures you really need to avoid making random
> "improvements", certainly not without an explicit guest opt-in (the same
> we do for the native CPU architecture when adding new features.)
> 
> So if this interface wasn't an accident it was active negligence and
> incompetence.

I don't think so - while it (as we now see) disallows certain things
inside the guest, back at the time when this was designed there was
no sign of any sort of allocation/scheduling being done inside the
#NM handler. And furthermore, a PV specification is by its nature
allowed to define deviations from real hardware behavior, or else it
wouldn't be needed in the first place.

Jan



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 17:05             ` [Xen-devel] " Jan Beulich
@ 2014-03-17 17:12               ` H. Peter Anvin
  2014-03-18  8:14                 ` Ingo Molnar
  2014-03-18  8:14                 ` [Xen-devel] " Ingo Molnar
  2014-03-17 17:12               ` H. Peter Anvin
                                 ` (2 subsequent siblings)
  3 siblings, 2 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17 17:12 UTC (permalink / raw)
  To: Jan Beulich
  Cc: David Vrabel, George Dunlap, Thomas Gleixner,
	xen-devel@lists.xen.org, Sarah Newman, Ingo Molnar,
	Linux Kernel Mailing List

On 03/17/2014 10:05 AM, Jan Beulich wrote:
> 
> I don't think so - while it (as we now see) disallows certain things
> inside the guest, back at the time when this was designed there was
> no sign of any sort of allocation/scheduling being done inside the
> #NM handler. And furthermore, a PV specification is by its nature
> allowed to define deviations from real hardware behavior, or else it
> wouldn't be needed in the first place.
> 

And this is exactly the sort of thing about Xen that make me want to go
on murderous rampage.  You think you can just take the current Linux
implementation at whatever time you implement the code and later come
back and say "don't change that, we hard-coded it in Xen."

Calling that "active negligence and incompetence" is probably on the
mild side.

	-hpa



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 17:12               ` H. Peter Anvin
@ 2014-03-18  8:14                 ` Ingo Molnar
  2014-03-18  8:14                 ` [Xen-devel] " Ingo Molnar
  1 sibling, 0 replies; 67+ messages in thread
From: Ingo Molnar @ 2014-03-18  8:14 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: George Dunlap, Linux Kernel Mailing List, xen-devel@lists.xen.org,
	Ingo Molnar, David Vrabel, Jan Beulich, Sarah Newman,
	Thomas Gleixner


* H. Peter Anvin <hpa@zytor.com> wrote:

> On 03/17/2014 10:05 AM, Jan Beulich wrote:
> > 
> > I don't think so - while it (as we now see) disallows certain things
> > inside the guest, back at the time when this was designed there was
> > no sign of any sort of allocation/scheduling being done inside the
> > #NM handler. And furthermore, a PV specification is by its nature
> > allowed to define deviations from real hardware behavior, or else it
> > wouldn't be needed in the first place.
> > 
> 
> And this is exactly the sort of thing about Xen that make me want to 
> go on murderous rampage.  You think you can just take the current 
> Linux implementation at whatever time you implement the code and 
> later come back and say "don't change that, we hard-coded it in 
> Xen."

And the solution is that we just ignore that kind of crap in the 
native kernel and let Xen sort it out as best as it can.

When Xen (and PV) was merged it was promised that a PV interface can 
always adopt to whatever Linux does, without restricting the kernel on 
the native side in any fashion - time to check on that promise.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 17:12               ` H. Peter Anvin
  2014-03-18  8:14                 ` Ingo Molnar
@ 2014-03-18  8:14                 ` Ingo Molnar
  1 sibling, 0 replies; 67+ messages in thread
From: Ingo Molnar @ 2014-03-18  8:14 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jan Beulich, George Dunlap, Linux Kernel Mailing List,
	xen-devel@lists.xen.org, Ingo Molnar, David Vrabel, Sarah Newman,
	Thomas Gleixner


* H. Peter Anvin <hpa@zytor.com> wrote:

> On 03/17/2014 10:05 AM, Jan Beulich wrote:
> > 
> > I don't think so - while it (as we now see) disallows certain things
> > inside the guest, back at the time when this was designed there was
> > no sign of any sort of allocation/scheduling being done inside the
> > #NM handler. And furthermore, a PV specification is by its nature
> > allowed to define deviations from real hardware behavior, or else it
> > wouldn't be needed in the first place.
> > 
> 
> And this is exactly the sort of thing about Xen that make me want to 
> go on murderous rampage.  You think you can just take the current 
> Linux implementation at whatever time you implement the code and 
> later come back and say "don't change that, we hard-coded it in 
> Xen."

And the solution is that we just ignore that kind of crap in the 
native kernel and let Xen sort it out as best as it can.

When Xen (and PV) was merged it was promised that a PV interface can 
always adopt to whatever Linux does, without restricting the kernel on 
the native side in any fashion - time to check on that promise.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 17:05             ` [Xen-devel] " Jan Beulich
  2014-03-17 17:12               ` H. Peter Anvin
@ 2014-03-17 17:12               ` H. Peter Anvin
  2014-03-17 17:14               ` George Dunlap
  2014-03-17 17:14               ` [Xen-devel] " George Dunlap
  3 siblings, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17 17:12 UTC (permalink / raw)
  To: Jan Beulich
  Cc: George Dunlap, Linux Kernel Mailing List, xen-devel@lists.xen.org,
	Ingo Molnar, David Vrabel, Sarah Newman, Thomas Gleixner

On 03/17/2014 10:05 AM, Jan Beulich wrote:
> 
> I don't think so - while it (as we now see) disallows certain things
> inside the guest, back at the time when this was designed there was
> no sign of any sort of allocation/scheduling being done inside the
> #NM handler. And furthermore, a PV specification is by its nature
> allowed to define deviations from real hardware behavior, or else it
> wouldn't be needed in the first place.
> 

And this is exactly the sort of thing about Xen that make me want to go
on murderous rampage.  You think you can just take the current Linux
implementation at whatever time you implement the code and later come
back and say "don't change that, we hard-coded it in Xen."

Calling that "active negligence and incompetence" is probably on the
mild side.

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 17:05             ` [Xen-devel] " Jan Beulich
  2014-03-17 17:12               ` H. Peter Anvin
  2014-03-17 17:12               ` H. Peter Anvin
@ 2014-03-17 17:14               ` George Dunlap
  2014-03-17 17:14               ` [Xen-devel] " George Dunlap
  3 siblings, 0 replies; 67+ messages in thread
From: George Dunlap @ 2014-03-17 17:14 UTC (permalink / raw)
  To: Jan Beulich, H. Peter Anvin
  Cc: Linux Kernel Mailing List, xen-devel@lists.xen.org, Ingo Molnar,
	David Vrabel, Sarah Newman, Thomas Gleixner

On 03/17/2014 05:05 PM, Jan Beulich wrote:
>>>> On 17.03.14 at 17:55, "H. Peter Anvin" <hpa@zytor.com> wrote:
>> On 03/17/2014 05:19 AM, George Dunlap wrote:
>>> On Mon, Mar 17, 2014 at 3:33 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>>>> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a
>> workaround for the legacy hypervisor versions.
>>> The interface wasn't an accident.  In the most common case you'll want
>>> to clear the bit anyway. In PV mode clearing it would require an extra
>>> trip up into the hypervisor.  So this saves one trip up into the
>>> hypervisor on every context switch which involves an FPU, at the
>>> expense of not being able to context-switch away when handling the
>>> trap.
>> The interface was a complete faceplant, because it caused failures.
>> You're not infinitely unconstrained since you want to play in the same
>> sandbox as the native architecture, and if you want to have a hope of
>> avoiding these kinds of failures you really need to avoid making random
>> "improvements", certainly not without an explicit guest opt-in (the same
>> we do for the native CPU architecture when adding new features.)
>>
>> So if this interface wasn't an accident it was active negligence and
>> incompetence.
> I don't think so - while it (as we now see) disallows certain things
> inside the guest, back at the time when this was designed there was
> no sign of any sort of allocation/scheduling being done inside the
> #NM handler. And furthermore, a PV specification is by its nature
> allowed to define deviations from real hardware behavior, or else it
> wouldn't be needed in the first place.

But it's certainly the case that deviating from the hardware in *this* 
way by default was always very likely to case the exact kind of bug 
we've seen here.  It is an "interface trap" that was bound to be tripped 
over (much like Intel's infamous sysret vulnerability).

Making it opt-in would have been a much better idea.  But the people who 
made that decision are long gone, and we now need to deal with the 
situation as we have it.

  -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 17:05             ` [Xen-devel] " Jan Beulich
                                 ` (2 preceding siblings ...)
  2014-03-17 17:14               ` George Dunlap
@ 2014-03-17 17:14               ` George Dunlap
  2014-03-18 18:17                 ` Sarah Newman
  2014-03-18 18:17                 ` [Xen-devel] " Sarah Newman
  3 siblings, 2 replies; 67+ messages in thread
From: George Dunlap @ 2014-03-17 17:14 UTC (permalink / raw)
  To: Jan Beulich, H. Peter Anvin
  Cc: David Vrabel, Thomas Gleixner, xen-devel@lists.xen.org,
	Sarah Newman, Ingo Molnar, Linux Kernel Mailing List

On 03/17/2014 05:05 PM, Jan Beulich wrote:
>>>> On 17.03.14 at 17:55, "H. Peter Anvin" <hpa@zytor.com> wrote:
>> On 03/17/2014 05:19 AM, George Dunlap wrote:
>>> On Mon, Mar 17, 2014 at 3:33 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>>>> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a
>> workaround for the legacy hypervisor versions.
>>> The interface wasn't an accident.  In the most common case you'll want
>>> to clear the bit anyway. In PV mode clearing it would require an extra
>>> trip up into the hypervisor.  So this saves one trip up into the
>>> hypervisor on every context switch which involves an FPU, at the
>>> expense of not being able to context-switch away when handling the
>>> trap.
>> The interface was a complete faceplant, because it caused failures.
>> You're not infinitely unconstrained since you want to play in the same
>> sandbox as the native architecture, and if you want to have a hope of
>> avoiding these kinds of failures you really need to avoid making random
>> "improvements", certainly not without an explicit guest opt-in (the same
>> we do for the native CPU architecture when adding new features.)
>>
>> So if this interface wasn't an accident it was active negligence and
>> incompetence.
> I don't think so - while it (as we now see) disallows certain things
> inside the guest, back at the time when this was designed there was
> no sign of any sort of allocation/scheduling being done inside the
> #NM handler. And furthermore, a PV specification is by its nature
> allowed to define deviations from real hardware behavior, or else it
> wouldn't be needed in the first place.

But it's certainly the case that deviating from the hardware in *this* 
way by default was always very likely to case the exact kind of bug 
we've seen here.  It is an "interface trap" that was bound to be tripped 
over (much like Intel's infamous sysret vulnerability).

Making it opt-in would have been a much better idea.  But the people who 
made that decision are long gone, and we now need to deal with the 
situation as we have it.

  -George

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 17:14               ` [Xen-devel] " George Dunlap
@ 2014-03-18 18:17                 ` Sarah Newman
  2014-03-18 18:17                 ` [Xen-devel] " Sarah Newman
  1 sibling, 0 replies; 67+ messages in thread
From: Sarah Newman @ 2014-03-18 18:17 UTC (permalink / raw)
  To: George Dunlap, Jan Beulich, H. Peter Anvin
  Cc: David Vrabel, Thomas Gleixner, Ingo Molnar,
	Linux Kernel Mailing List, xen-devel@lists.xen.org

On 03/17/2014 10:14 AM, George Dunlap wrote:
> On 03/17/2014 05:05 PM, Jan Beulich wrote:
>>>>> On 17.03.14 at 17:55, "H. Peter Anvin" <hpa@zytor.com> wrote:
>>> So if this interface wasn't an accident it was active negligence and
>>> incompetence.
>> I don't think so - while it (as we now see) disallows certain things
>> inside the guest, back at the time when this was designed there was
>> no sign of any sort of allocation/scheduling being done inside the
>> #NM handler. And furthermore, a PV specification is by its nature
>> allowed to define deviations from real hardware behavior, or else it
>> wouldn't be needed in the first place.
> 
> But it's certainly the case that deviating from the hardware in *this* way by default was always
> very likely to case the exact kind of bug we've seen here.  It is an "interface trap" that was bound
> to be tripped over (much like Intel's infamous sysret vulnerability).
> 
> Making it opt-in would have been a much better idea.  But the people who made that decision are long
> gone, and we now need to deal with the situation as we have it.

Should or has there been a review of the current xen PVABI to look for any other such deviations?

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 17:14               ` [Xen-devel] " George Dunlap
  2014-03-18 18:17                 ` Sarah Newman
@ 2014-03-18 18:17                 ` Sarah Newman
  2014-03-18 18:27                     ` H. Peter Anvin
  1 sibling, 1 reply; 67+ messages in thread
From: Sarah Newman @ 2014-03-18 18:17 UTC (permalink / raw)
  To: George Dunlap, Jan Beulich, H. Peter Anvin
  Cc: Linux Kernel Mailing List, xen-devel@lists.xen.org, Ingo Molnar,
	David Vrabel, Thomas Gleixner

On 03/17/2014 10:14 AM, George Dunlap wrote:
> On 03/17/2014 05:05 PM, Jan Beulich wrote:
>>>>> On 17.03.14 at 17:55, "H. Peter Anvin" <hpa@zytor.com> wrote:
>>> So if this interface wasn't an accident it was active negligence and
>>> incompetence.
>> I don't think so - while it (as we now see) disallows certain things
>> inside the guest, back at the time when this was designed there was
>> no sign of any sort of allocation/scheduling being done inside the
>> #NM handler. And furthermore, a PV specification is by its nature
>> allowed to define deviations from real hardware behavior, or else it
>> wouldn't be needed in the first place.
> 
> But it's certainly the case that deviating from the hardware in *this* way by default was always
> very likely to case the exact kind of bug we've seen here.  It is an "interface trap" that was bound
> to be tripped over (much like Intel's infamous sysret vulnerability).
> 
> Making it opt-in would have been a much better idea.  But the people who made that decision are long
> gone, and we now need to deal with the situation as we have it.

Should or has there been a review of the current xen PVABI to look for any other such deviations?

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [Xen-devel] [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-18 18:17                 ` [Xen-devel] " Sarah Newman
@ 2014-03-18 18:27                     ` H. Peter Anvin
  0 siblings, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-18 18:27 UTC (permalink / raw)
  To: Sarah Newman, George Dunlap, Jan Beulich
  Cc: Linux Kernel Mailing List, xen-devel@lists.xen.org, Ingo Molnar,
	David Vrabel, Thomas Gleixner, Konrad Rzeszutek Wilk

On 03/18/2014 11:17 AM, Sarah Newman wrote:
> 
> Should or has there been a review of the current xen PVABI to look for any other such deviations?
> 

It would be a very good thing to do.  First of all, the PVABI needs to
be **documented** because without that there is no hope.  I would like
to specifically call out the efforts of Konrad Wilk in this general
area, but he obviously doesn't scale, either...

The other aspect is to get rid of as much of the PVABI as possible.  HPV
is a big step in that direction, getting rid of some of the most
invasive hooks, but I think we can minimize further.

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
@ 2014-03-18 18:27                     ` H. Peter Anvin
  0 siblings, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-18 18:27 UTC (permalink / raw)
  To: Sarah Newman, George Dunlap, Jan Beulich
  Cc: Linux Kernel Mailing List, xen-devel@lists.xen.org, Ingo Molnar,
	David Vrabel, Thomas Gleixner

On 03/18/2014 11:17 AM, Sarah Newman wrote:
> 
> Should or has there been a review of the current xen PVABI to look for any other such deviations?
> 

It would be a very good thing to do.  First of all, the PVABI needs to
be **documented** because without that there is no hope.  I would like
to specifically call out the efforts of Konrad Wilk in this general
area, but he obviously doesn't scale, either...

The other aspect is to get rid of as much of the PVABI as possible.  HPV
is a big step in that direction, getting rid of some of the most
invasive hooks, but I think we can minimize further.

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-17 12:19         ` [Xen-devel] " George Dunlap
  2014-03-17 16:55           ` H. Peter Anvin
@ 2014-03-17 16:55           ` H. Peter Anvin
  1 sibling, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-17 16:55 UTC (permalink / raw)
  To: George Dunlap
  Cc: Linux Kernel Mailing List, xen-devel@lists.xen.org, Ingo Molnar,
	David Vrabel, Sarah Newman, Thomas Gleixner

On 03/17/2014 05:19 AM, George Dunlap wrote:
> On Mon, Mar 17, 2014 at 3:33 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> No, the right thing is to unf*ck the Xen braindamage and use eagerfpu as a workaround for the legacy hypervisor versions.
> 
> The interface wasn't an accident.  In the most common case you'll want
> to clear the bit anyway. In PV mode clearing it would require an extra
> trip up into the hypervisor.  So this saves one trip up into the
> hypervisor on every context switch which involves an FPU, at the
> expense of not being able to context-switch away when handling the
> trap.
> 
>  -George
> 

The interface was a complete faceplant, because it caused failures.
You're not infinitely unconstrained since you want to play in the same
sandbox as the native architecture, and if you want to have a hope of
avoiding these kinds of failures you really need to avoid making random
"improvements", certainly not without an explicit guest opt-in (the same
we do for the native CPU architecture when adding new features.)

So if this interface wasn't an accident it was active negligence and
incompetence.

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-10 16:40 ` H. Peter Anvin
  2014-03-10 17:15   ` David Vrabel
@ 2014-03-10 17:15   ` David Vrabel
  1 sibling, 0 replies; 67+ messages in thread
From: David Vrabel @ 2014-03-10 17:15 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, linux-kernel, Sarah Newman,
	xen-devel

On 10/03/14 16:40, H. Peter Anvin wrote:
> On 03/10/2014 09:17 AM, David Vrabel wrote:
>> math_state_restore() is called from the #NM exception handler.  It may
>> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
>>
>> Change this allocation to GFP_ATOMIC, but leave all the other callers
>> of init_fpu() or fpu_alloc() using GFP_KERNEL.
> 
> And what the [Finnish] do you do if GFP_ATOMIC fails?

The same thing it used to do -- kill the task with SIGKILL.  I haven't
changed this behaviour.

> Sarah's patchset switches Xen PV to use eagerfpu unconditionally, which
> removes the dependency on #NM and is the right thing to do.

Ok. I'll wait for this series and not pursue this patch any further.

David

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-10 16:17 [PATCHv1] x86: don't schedule when handling #NM exception David Vrabel
  2014-03-10 16:40 ` H. Peter Anvin
@ 2014-03-10 16:40 ` H. Peter Anvin
  2014-03-10 16:45 ` H. Peter Anvin
  2014-03-10 16:45 ` H. Peter Anvin
  3 siblings, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-10 16:40 UTC (permalink / raw)
  To: David Vrabel, linux-kernel
  Cc: Thomas Gleixner, Ingo Molnar, x86, Sarah Newman, xen-devel

On 03/10/2014 09:17 AM, David Vrabel wrote:
> math_state_restore() is called from the #NM exception handler.  It may
> do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
> 
> Change this allocation to GFP_ATOMIC, but leave all the other callers
> of init_fpu() or fpu_alloc() using GFP_KERNEL.

And what the [Finnish] do you do if GFP_ATOMIC fails?

> do_group_exit() will also call schedule() so replace the call with
> force_sig(SIGKILL, tsk) instead.
> 
> Scheduling in math_state_restore() is particularly bad in Xen PV
> guests since the Xen clears CR0.TS before raising #NM exception (in
> the expectation that the #NM handler always clears TS).  If task A is
> descheduled and task B is scheduled.  Task B may end up with CR0.TS
> unexpectedly clear and any FPU instructions will not raise #NM and
> will corrupt task A's FPU state instead.

Yes, we know Xen is completely broken in this respect.

Anyway, I have a patchset from Sarah Newman which I have been reviewing
privately so far (which looks good and should be posted publicly -- the
holdup has not been Sarah's code but a combination of my bandwidth and
trying to get some preexisting bugs in the eagerfpu code dealt with,
which Suresh Siddha fortunately stepped up to do and which we now have a
solution for.)

Sarah's patchset switches Xen PV to use eagerfpu unconditionally, which
removes the dependency on #NM and is the right thing to do.

Sarah, could you post the latest patchset to LKML so it can be publicly
reviewed?  I'm sorry for the slow response time on my end.

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-10 16:17 [PATCHv1] x86: don't schedule when handling #NM exception David Vrabel
  2014-03-10 16:40 ` H. Peter Anvin
  2014-03-10 16:40 ` H. Peter Anvin
@ 2014-03-10 16:45 ` H. Peter Anvin
  2014-03-10 16:45 ` H. Peter Anvin
  3 siblings, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-10 16:45 UTC (permalink / raw)
  To: David Vrabel, linux-kernel; +Cc: x86, Thomas Gleixner, Ingo Molnar, xen-devel

On 03/10/2014 09:17 AM, David Vrabel wrote:
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 57409f6..c8078d2 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -624,18 +624,13 @@ void math_state_restore(void)
>  	struct task_struct *tsk = current;
>  
>  	if (!tsk_used_math(tsk)) {
> -		local_irq_enable();
> -		/*
> -		 * does a slab alloc which can sleep
> -		 */
> -		if (init_fpu(tsk)) {
> +		if (init_fpu(tsk, GFP_ATOMIC)) {
>  			/*
>  			 * ran out of memory!
>  			 */
> -			do_group_exit(SIGKILL);
> +			force_sig(SIGKILL, tsk);
>  			return;
>  		}
> -		local_irq_disable();
>  	}
>  

OK, answering my own question... you're randomly SIGKILLing processes
because the kernel doesn't have enough memory on hand.

In other words, because Xen is broken you want to break the rest of the
universe.

This is NAKed so hard it isn't even funny.

	-hpa



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCHv1] x86: don't schedule when handling #NM exception
  2014-03-10 16:17 [PATCHv1] x86: don't schedule when handling #NM exception David Vrabel
                   ` (2 preceding siblings ...)
  2014-03-10 16:45 ` H. Peter Anvin
@ 2014-03-10 16:45 ` H. Peter Anvin
  3 siblings, 0 replies; 67+ messages in thread
From: H. Peter Anvin @ 2014-03-10 16:45 UTC (permalink / raw)
  To: David Vrabel, linux-kernel; +Cc: Thomas Gleixner, Ingo Molnar, x86, xen-devel

On 03/10/2014 09:17 AM, David Vrabel wrote:
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 57409f6..c8078d2 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -624,18 +624,13 @@ void math_state_restore(void)
>  	struct task_struct *tsk = current;
>  
>  	if (!tsk_used_math(tsk)) {
> -		local_irq_enable();
> -		/*
> -		 * does a slab alloc which can sleep
> -		 */
> -		if (init_fpu(tsk)) {
> +		if (init_fpu(tsk, GFP_ATOMIC)) {
>  			/*
>  			 * ran out of memory!
>  			 */
> -			do_group_exit(SIGKILL);
> +			force_sig(SIGKILL, tsk);
>  			return;
>  		}
> -		local_irq_disable();
>  	}
>  

OK, answering my own question... you're randomly SIGKILLing processes
because the kernel doesn't have enough memory on hand.

In other words, because Xen is broken you want to break the rest of the
universe.

This is NAKed so hard it isn't even funny.

	-hpa

^ permalink raw reply	[flat|nested] 67+ messages in thread

end of thread, other threads:[~2015-03-06 11:46 UTC | newest]

Thread overview: 67+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-03-10 16:17 [PATCHv1] x86: don't schedule when handling #NM exception David Vrabel
2014-03-10 16:40 ` H. Peter Anvin
2014-03-10 17:15   ` David Vrabel
2014-03-10 17:25     ` H. Peter Anvin
2014-03-10 17:25       ` H. Peter Anvin
2014-03-17  3:13     ` Sarah Newman
2014-03-17  3:13       ` Sarah Newman
2014-03-17  3:30       ` [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed Sarah Newman
2014-03-17  8:38         ` Jan Beulich
2014-03-17 12:42           ` George Dunlap
2014-03-17 13:35             ` Jan Beulich
2014-03-17 14:05               ` George Dunlap
2014-03-17 14:18                 ` George Dunlap
2014-03-17 15:28                   ` Jan Beulich
2014-03-18 18:07                   ` Sarah Newman
2014-03-18 19:14                     ` David Vrabel
2014-03-17 12:44           ` George Dunlap
2014-03-17 13:35             ` Jan Beulich
2014-03-18 17:48               ` Sarah Newman
2014-03-17  3:32       ` [PATCH] x86, fpu, xen: Allocate fpu state for xen pv based on PVABI behavior Sarah Newman
2014-03-17  3:32       ` Sarah Newman
2014-03-17  3:33       ` [PATCHv1] x86: don't schedule when handling #NM exception H. Peter Anvin
2014-03-17  3:33       ` H. Peter Anvin
2014-03-17  3:35         ` [Xen-devel] " Sarah Newman
2014-03-17  3:43           ` H. Peter Anvin
2014-03-17  4:12             ` Sarah Newman
2014-03-17  4:12             ` [Xen-devel] " Sarah Newman
2014-03-17  4:23               ` H. Peter Anvin
2014-03-20  0:00                 ` Greg Kroah-Hartman
2014-03-20  0:00                 ` [Xen-devel] " Greg Kroah-Hartman
2014-03-20  2:29                   ` H. Peter Anvin
2014-03-20  2:29                   ` [Xen-devel] " H. Peter Anvin
2014-03-17  4:23               ` H. Peter Anvin
2014-03-17 13:29             ` [Xen-devel] " David Vrabel
2014-03-19 13:21               ` Konrad Rzeszutek Wilk
2014-03-19 15:02                 ` H. Peter Anvin
2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
2015-03-05 22:08                     ` H. Peter Anvin
2015-03-06 11:46                       ` [PATCHv4] x86, fpu: remove the logic of non-eager fpu mem allocation at the first usage David Vrabel
2015-03-06 11:46                       ` David Vrabel
2015-03-05 22:08                     ` [PATCHv1] x86: don't schedule when handling #NM exception H. Peter Anvin
2014-06-23 13:08                   ` Konrad Rzeszutek Wilk
2014-03-19 15:02                 ` H. Peter Anvin
2014-03-19 13:21               ` Konrad Rzeszutek Wilk
2014-03-17 13:29             ` David Vrabel
2014-03-17  3:43           ` H. Peter Anvin
2014-03-17  3:35         ` Sarah Newman
2014-03-17 12:19         ` George Dunlap
2014-03-17 12:19         ` [Xen-devel] " George Dunlap
2014-03-17 16:55           ` H. Peter Anvin
2014-03-17 17:05             ` Jan Beulich
2014-03-17 17:05             ` [Xen-devel] " Jan Beulich
2014-03-17 17:12               ` H. Peter Anvin
2014-03-18  8:14                 ` Ingo Molnar
2014-03-18  8:14                 ` [Xen-devel] " Ingo Molnar
2014-03-17 17:12               ` H. Peter Anvin
2014-03-17 17:14               ` George Dunlap
2014-03-17 17:14               ` [Xen-devel] " George Dunlap
2014-03-18 18:17                 ` Sarah Newman
2014-03-18 18:17                 ` [Xen-devel] " Sarah Newman
2014-03-18 18:27                   ` H. Peter Anvin
2014-03-18 18:27                     ` H. Peter Anvin
2014-03-17 16:55           ` H. Peter Anvin
2014-03-10 17:15   ` David Vrabel
2014-03-10 16:40 ` H. Peter Anvin
2014-03-10 16:45 ` H. Peter Anvin
2014-03-10 16:45 ` H. Peter Anvin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.