* [PATCH 3/3, v2] x86/xor: make virtualization friendly
@ 2012-11-02 14:21 Jan Beulich
2012-11-02 17:30 ` H. Peter Anvin
2013-01-25 10:43 ` [tip:x86/asm] x86/xor: Make " tip-bot for Jan Beulich
0 siblings, 2 replies; 11+ messages in thread
From: Jan Beulich @ 2012-11-02 14:21 UTC (permalink / raw)
To: mingo, tglx, hpa; +Cc: Konrad Rzeszutek Wilk, linux-kernel
In virtualized environments, the CR0.TS management needed here can be a
lot slower than anticipated by the original authors of this code, which
particularly means that in such cases forcing the use of SSE- (or MMX-)
based implementations is not desirable - actual measurements should
always be done in that case.
For consistency, pull into the shared (32- and 64-bit) header not only
the inclusion of the generic code, but also that of the AVX variants.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/include/asm/xor.h | 8 +++++++-
arch/x86/include/asm/xor_32.h | 22 ++++++++++------------
arch/x86/include/asm/xor_64.h | 10 ++++++----
3 files changed, 23 insertions(+), 17 deletions(-)
--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor.h
+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor.h
@@ -487,6 +487,12 @@ static struct xor_block_template xor_blo
#undef XOR_CONSTANT_CONSTRAINT
+/* Also try the AVX routines */
+#include <asm/xor_avx.h>
+
+/* Also try the generic routines. */
+#include <asm-generic/xor.h>
+
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
@@ -494,6 +500,6 @@ static struct xor_block_template xor_blo
#endif
#define XOR_SELECT_TEMPLATE(FASTEST) \
- AVX_SELECT(FASTEST)
+ (cpu_has_hypervisor ? (FASTEST) : AVX_SELECT(FASTEST))
#endif /* _ASM_X86_XOR_H */
--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor_32.h
+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor_32.h
@@ -537,12 +537,6 @@ static struct xor_block_template xor_blo
.do_5 = xor_sse_5,
};
-/* Also try the AVX routines */
-#include <asm/xor_avx.h>
-
-/* Also try the generic routines. */
-#include <asm-generic/xor.h>
-
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
@@ -553,15 +547,19 @@ do { \
if (cpu_has_xmm) { \
xor_speed(&xor_block_pIII_sse); \
xor_speed(&xor_block_sse_pf64); \
- } else if (cpu_has_mmx) { \
+ if (!cpu_has_hypervisor) \
+ break; \
+ } \
+ if (cpu_has_mmx) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
- } else { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
+ if (!cpu_has_hypervisor) \
+ break; \
} \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
} while (0)
#endif /* _ASM_X86_XOR_32_H */
--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor_64.h
+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor_64.h
@@ -9,10 +9,6 @@ static struct xor_block_template xor_blo
.do_5 = xor_sse_5,
};
-
-/* Also try the AVX routines */
-#include <asm/xor_avx.h>
-
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
@@ -22,6 +18,12 @@ do { \
AVX_XOR_SPEED; \
xor_speed(&xor_block_sse_pf64); \
xor_speed(&xor_block_sse); \
+ if (cpu_has_hypervisor) { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
+ } \
} while (0)
#endif /* _ASM_X86_XOR_64_H */
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/3, v2] x86/xor: make virtualization friendly
2012-11-02 14:21 [PATCH 3/3, v2] x86/xor: make virtualization friendly Jan Beulich
@ 2012-11-02 17:30 ` H. Peter Anvin
2012-11-05 9:10 ` Jan Beulich
2013-01-25 10:43 ` [tip:x86/asm] x86/xor: Make " tip-bot for Jan Beulich
1 sibling, 1 reply; 11+ messages in thread
From: H. Peter Anvin @ 2012-11-02 17:30 UTC (permalink / raw)
To: Jan Beulich, mingo, tglx; +Cc: Konrad Rzeszutek Wilk, linux-kernel
Aren't we actually talking just about PV here?
If so the test is wrong.
Jan Beulich <JBeulich@suse.com> wrote:
>In virtualized environments, the CR0.TS management needed here can be a
>lot slower than anticipated by the original authors of this code, which
>particularly means that in such cases forcing the use of SSE- (or MMX-)
>based implementations is not desirable - actual measurements should
>always be done in that case.
>
>For consistency, pull into the shared (32- and 64-bit) header not only
>the inclusion of the generic code, but also that of the AVX variants.
>
>Signed-off-by: Jan Beulich <jbeulich@suse.com>
>Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>
>---
> arch/x86/include/asm/xor.h | 8 +++++++-
> arch/x86/include/asm/xor_32.h | 22 ++++++++++------------
> arch/x86/include/asm/xor_64.h | 10 ++++++----
> 3 files changed, 23 insertions(+), 17 deletions(-)
>
>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor.h
>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor.h
>@@ -487,6 +487,12 @@ static struct xor_block_template xor_blo
>
> #undef XOR_CONSTANT_CONSTRAINT
>
>+/* Also try the AVX routines */
>+#include <asm/xor_avx.h>
>+
>+/* Also try the generic routines. */
>+#include <asm-generic/xor.h>
>+
> #ifdef CONFIG_X86_32
> # include <asm/xor_32.h>
> #else
>@@ -494,6 +500,6 @@ static struct xor_block_template xor_blo
> #endif
>
> #define XOR_SELECT_TEMPLATE(FASTEST) \
>- AVX_SELECT(FASTEST)
>+ (cpu_has_hypervisor ? (FASTEST) : AVX_SELECT(FASTEST))
>
> #endif /* _ASM_X86_XOR_H */
>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor_32.h
>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor_32.h
>@@ -537,12 +537,6 @@ static struct xor_block_template xor_blo
> .do_5 = xor_sse_5,
> };
>
>-/* Also try the AVX routines */
>-#include <asm/xor_avx.h>
>-
>-/* Also try the generic routines. */
>-#include <asm-generic/xor.h>
>-
>/* We force the use of the SSE xor block because it can write around
>L2.
> We may also be able to load into the L1 only depending on how the cpu
> deals with a load to a line that is being prefetched. */
>@@ -553,15 +547,19 @@ do { \
> if (cpu_has_xmm) { \
> xor_speed(&xor_block_pIII_sse); \
> xor_speed(&xor_block_sse_pf64); \
>- } else if (cpu_has_mmx) { \
>+ if (!cpu_has_hypervisor) \
>+ break; \
>+ } \
>+ if (cpu_has_mmx) { \
> xor_speed(&xor_block_pII_mmx); \
> xor_speed(&xor_block_p5_mmx); \
>- } else { \
>- xor_speed(&xor_block_8regs); \
>- xor_speed(&xor_block_8regs_p); \
>- xor_speed(&xor_block_32regs); \
>- xor_speed(&xor_block_32regs_p); \
>+ if (!cpu_has_hypervisor) \
>+ break; \
> } \
>+ xor_speed(&xor_block_8regs); \
>+ xor_speed(&xor_block_8regs_p); \
>+ xor_speed(&xor_block_32regs); \
>+ xor_speed(&xor_block_32regs_p); \
> } while (0)
>
> #endif /* _ASM_X86_XOR_32_H */
>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor_64.h
>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor_64.h
>@@ -9,10 +9,6 @@ static struct xor_block_template xor_blo
> .do_5 = xor_sse_5,
> };
>
>-
>-/* Also try the AVX routines */
>-#include <asm/xor_avx.h>
>-
>/* We force the use of the SSE xor block because it can write around
>L2.
> We may also be able to load into the L1 only depending on how the cpu
> deals with a load to a line that is being prefetched. */
>@@ -22,6 +18,12 @@ do { \
> AVX_XOR_SPEED; \
> xor_speed(&xor_block_sse_pf64); \
> xor_speed(&xor_block_sse); \
>+ if (cpu_has_hypervisor) { \
>+ xor_speed(&xor_block_8regs); \
>+ xor_speed(&xor_block_8regs_p); \
>+ xor_speed(&xor_block_32regs); \
>+ xor_speed(&xor_block_32regs_p); \
>+ } \
> } while (0)
>
> #endif /* _ASM_X86_XOR_64_H */
--
Sent from my mobile phone. Please excuse brevity and lack of formatting.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/3, v2] x86/xor: make virtualization friendly
2012-11-02 17:30 ` H. Peter Anvin
@ 2012-11-05 9:10 ` Jan Beulich
0 siblings, 0 replies; 11+ messages in thread
From: Jan Beulich @ 2012-11-05 9:10 UTC (permalink / raw)
To: H. Peter Anvin; +Cc: mingo, tglx, Konrad Rzeszutek Wilk, linux-kernel
>>> On 02.11.12 at 18:30, "H. Peter Anvin" <hpa@zytor.com> wrote:
> Aren't we actually talking just about PV here?
>
> If so the test is wrong.
No - this equally can affect "fully" virtualized guests (where the
CR0.TS accesses can involve VMEXIT-s).
Jan
> Jan Beulich <JBeulich@suse.com> wrote:
>
>>In virtualized environments, the CR0.TS management needed here can be a
>>lot slower than anticipated by the original authors of this code, which
>>particularly means that in such cases forcing the use of SSE- (or MMX-)
>>based implementations is not desirable - actual measurements should
>>always be done in that case.
>>
>>For consistency, pull into the shared (32- and 64-bit) header not only
>>the inclusion of the generic code, but also that of the AVX variants.
>>
>>Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>>
>>---
>> arch/x86/include/asm/xor.h | 8 +++++++-
>> arch/x86/include/asm/xor_32.h | 22 ++++++++++------------
>> arch/x86/include/asm/xor_64.h | 10 ++++++----
>> 3 files changed, 23 insertions(+), 17 deletions(-)
>>
>>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor.h
>>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor.h
>>@@ -487,6 +487,12 @@ static struct xor_block_template xor_blo
>>
>> #undef XOR_CONSTANT_CONSTRAINT
>>
>>+/* Also try the AVX routines */
>>+#include <asm/xor_avx.h>
>>+
>>+/* Also try the generic routines. */
>>+#include <asm-generic/xor.h>
>>+
>> #ifdef CONFIG_X86_32
>> # include <asm/xor_32.h>
>> #else
>>@@ -494,6 +500,6 @@ static struct xor_block_template xor_blo
>> #endif
>>
>> #define XOR_SELECT_TEMPLATE(FASTEST) \
>>- AVX_SELECT(FASTEST)
>>+ (cpu_has_hypervisor ? (FASTEST) : AVX_SELECT(FASTEST))
>>
>> #endif /* _ASM_X86_XOR_H */
>>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor_32.h
>>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor_32.h
>>@@ -537,12 +537,6 @@ static struct xor_block_template xor_blo
>> .do_5 = xor_sse_5,
>> };
>>
>>-/* Also try the AVX routines */
>>-#include <asm/xor_avx.h>
>>-
>>-/* Also try the generic routines. */
>>-#include <asm-generic/xor.h>
>>-
>>/* We force the use of the SSE xor block because it can write around
>>L2.
>> We may also be able to load into the L1 only depending on how the cpu
>> deals with a load to a line that is being prefetched. */
>>@@ -553,15 +547,19 @@ do { \
>> if (cpu_has_xmm) { \
>> xor_speed(&xor_block_pIII_sse); \
>> xor_speed(&xor_block_sse_pf64); \
>>- } else if (cpu_has_mmx) { \
>>+ if (!cpu_has_hypervisor) \
>>+ break; \
>>+ } \
>>+ if (cpu_has_mmx) { \
>> xor_speed(&xor_block_pII_mmx); \
>> xor_speed(&xor_block_p5_mmx); \
>>- } else { \
>>- xor_speed(&xor_block_8regs); \
>>- xor_speed(&xor_block_8regs_p); \
>>- xor_speed(&xor_block_32regs); \
>>- xor_speed(&xor_block_32regs_p); \
>>+ if (!cpu_has_hypervisor) \
>>+ break; \
>> } \
>>+ xor_speed(&xor_block_8regs); \
>>+ xor_speed(&xor_block_8regs_p); \
>>+ xor_speed(&xor_block_32regs); \
>>+ xor_speed(&xor_block_32regs_p); \
>> } while (0)
>>
>> #endif /* _ASM_X86_XOR_32_H */
>>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor_64.h
>>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor_64.h
>>@@ -9,10 +9,6 @@ static struct xor_block_template xor_blo
>> .do_5 = xor_sse_5,
>> };
>>
>>-
>>-/* Also try the AVX routines */
>>-#include <asm/xor_avx.h>
>>-
>>/* We force the use of the SSE xor block because it can write around
>>L2.
>> We may also be able to load into the L1 only depending on how the cpu
>> deals with a load to a line that is being prefetched. */
>>@@ -22,6 +18,12 @@ do { \
>> AVX_XOR_SPEED; \
>> xor_speed(&xor_block_sse_pf64); \
>> xor_speed(&xor_block_sse); \
>>+ if (cpu_has_hypervisor) { \
>>+ xor_speed(&xor_block_8regs); \
>>+ xor_speed(&xor_block_8regs_p); \
>>+ xor_speed(&xor_block_32regs); \
>>+ xor_speed(&xor_block_32regs_p); \
>>+ } \
>> } while (0)
>>
>> #endif /* _ASM_X86_XOR_64_H */
>
> --
> Sent from my mobile phone. Please excuse brevity and lack of formatting.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [tip:x86/asm] x86/xor: Make virtualization friendly
2012-11-02 14:21 [PATCH 3/3, v2] x86/xor: make virtualization friendly Jan Beulich
2012-11-02 17:30 ` H. Peter Anvin
@ 2013-01-25 10:43 ` tip-bot for Jan Beulich
2013-01-25 22:11 ` H. Peter Anvin
1 sibling, 1 reply; 11+ messages in thread
From: tip-bot for Jan Beulich @ 2013-01-25 10:43 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, hpa, mingo, konrad.wilk, torvalds, jbeulich,
JBeulich, tglx
Commit-ID: 05fbf4d6fc6a3c0c3e63b77979c9311596716d10
Gitweb: http://git.kernel.org/tip/05fbf4d6fc6a3c0c3e63b77979c9311596716d10
Author: Jan Beulich <JBeulich@suse.com>
AuthorDate: Fri, 2 Nov 2012 14:21:23 +0000
Committer: Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 25 Jan 2013 09:23:51 +0100
x86/xor: Make virtualization friendly
In virtualized environments, the CR0.TS management needed here
can be a lot slower than anticipated by the original authors of
this code, which particularly means that in such cases forcing
the use of SSE- (or MMX-) based implementations is not desirable
- actual measurements should always be done in that case.
For consistency, pull into the shared (32- and 64-bit) header
not only the inclusion of the generic code, but also that of the
AVX variants.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Link: http://lkml.kernel.org/r/5093E4F302000078000A6162@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/include/asm/xor.h | 8 +++++++-
arch/x86/include/asm/xor_32.h | 22 ++++++++++------------
arch/x86/include/asm/xor_64.h | 10 ++++++----
3 files changed, 23 insertions(+), 17 deletions(-)
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index d882975..55cd464 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -487,6 +487,12 @@ static struct xor_block_template xor_block_sse_pf64 = {
#undef XOR_CONSTANT_CONSTRAINT
+/* Also try the AVX routines */
+#include <asm/xor_avx.h>
+
+/* Also try the generic routines. */
+#include <asm-generic/xor.h>
+
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
@@ -494,6 +500,6 @@ static struct xor_block_template xor_block_sse_pf64 = {
#endif
#define XOR_SELECT_TEMPLATE(FASTEST) \
- AVX_SELECT(FASTEST)
+ (cpu_has_hypervisor ? (FASTEST) : AVX_SELECT(FASTEST))
#endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index ce05722..fe7a277 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -537,12 +537,6 @@ static struct xor_block_template xor_block_pIII_sse = {
.do_5 = xor_sse_5,
};
-/* Also try the AVX routines */
-#include <asm/xor_avx.h>
-
-/* Also try the generic routines. */
-#include <asm-generic/xor.h>
-
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
@@ -553,15 +547,19 @@ do { \
if (cpu_has_xmm) { \
xor_speed(&xor_block_pIII_sse); \
xor_speed(&xor_block_sse_pf64); \
- } else if (cpu_has_mmx) { \
+ if (!cpu_has_hypervisor) \
+ break; \
+ } \
+ if (cpu_has_mmx) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
- } else { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
+ if (!cpu_has_hypervisor) \
+ break; \
} \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
} while (0)
#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 546f1e3..30f9c43 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -9,10 +9,6 @@ static struct xor_block_template xor_block_sse = {
.do_5 = xor_sse_5,
};
-
-/* Also try the AVX routines */
-#include <asm/xor_avx.h>
-
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
@@ -22,6 +18,12 @@ do { \
AVX_XOR_SPEED; \
xor_speed(&xor_block_sse_pf64); \
xor_speed(&xor_block_sse); \
+ if (cpu_has_hypervisor) { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
+ } \
} while (0)
#endif /* _ASM_X86_XOR_64_H */
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [tip:x86/asm] x86/xor: Make virtualization friendly
2013-01-25 10:43 ` [tip:x86/asm] x86/xor: Make " tip-bot for Jan Beulich
@ 2013-01-25 22:11 ` H. Peter Anvin
2013-01-25 22:15 ` H. Peter Anvin
2013-01-28 9:04 ` Jan Beulich
0 siblings, 2 replies; 11+ messages in thread
From: H. Peter Anvin @ 2013-01-25 22:11 UTC (permalink / raw)
To: mingo, hpa, linux-kernel, konrad.wilk, torvalds, JBeulich,
jbeulich, tglx
Cc: linux-tip-commits
On 01/25/2013 02:43 AM, tip-bot for Jan Beulich wrote:
> Commit-ID: 05fbf4d6fc6a3c0c3e63b77979c9311596716d10
> Gitweb: http://git.kernel.org/tip/05fbf4d6fc6a3c0c3e63b77979c9311596716d10
> Author: Jan Beulich <JBeulich@suse.com>
> AuthorDate: Fri, 2 Nov 2012 14:21:23 +0000
> Committer: Ingo Molnar <mingo@kernel.org>
> CommitDate: Fri, 25 Jan 2013 09:23:51 +0100
>
> x86/xor: Make virtualization friendly
>
> In virtualized environments, the CR0.TS management needed here
> can be a lot slower than anticipated by the original authors of
> this code, which particularly means that in such cases forcing
> the use of SSE- (or MMX-) based implementations is not desirable
> - actual measurements should always be done in that case.
>
> For consistency, pull into the shared (32- and 64-bit) header
> not only the inclusion of the generic code, but also that of the
> AVX variants.
>
This patch is wrong and should be dropped. I verified it with the KVM
people that they do NOT want this change. It is a Xen-specific problem.
-hpa
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [tip:x86/asm] x86/xor: Make virtualization friendly
2013-01-25 22:11 ` H. Peter Anvin
@ 2013-01-25 22:15 ` H. Peter Anvin
2013-01-26 1:05 ` H. Peter Anvin
2013-01-26 12:10 ` Ingo Molnar
2013-01-28 9:04 ` Jan Beulich
1 sibling, 2 replies; 11+ messages in thread
From: H. Peter Anvin @ 2013-01-25 22:15 UTC (permalink / raw)
To: H. Peter Anvin
Cc: mingo, linux-kernel, konrad.wilk, torvalds, JBeulich, tglx,
linux-tip-commits
On 01/25/2013 02:11 PM, H. Peter Anvin wrote:
> On 01/25/2013 02:43 AM, tip-bot for Jan Beulich wrote:
>> Commit-ID: 05fbf4d6fc6a3c0c3e63b77979c9311596716d10
>> Gitweb: http://git.kernel.org/tip/05fbf4d6fc6a3c0c3e63b77979c9311596716d10
>> Author: Jan Beulich <JBeulich@suse.com>
>> AuthorDate: Fri, 2 Nov 2012 14:21:23 +0000
>> Committer: Ingo Molnar <mingo@kernel.org>
>> CommitDate: Fri, 25 Jan 2013 09:23:51 +0100
>>
>> x86/xor: Make virtualization friendly
>>
>> In virtualized environments, the CR0.TS management needed here
>> can be a lot slower than anticipated by the original authors of
>> this code, which particularly means that in such cases forcing
>> the use of SSE- (or MMX-) based implementations is not desirable
>> - actual measurements should always be done in that case.
>>
>> For consistency, pull into the shared (32- and 64-bit) header
>> not only the inclusion of the generic code, but also that of the
>> AVX variants.
>>
>
> This patch is wrong and should be dropped. I verified it with the KVM
> people that they do NOT want this change. It is a Xen-specific problem.
>
FWIW: I have dropped this patch from tip:x86/asm.
-hpa
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [tip:x86/asm] x86/xor: Make virtualization friendly
2013-01-25 22:15 ` H. Peter Anvin
@ 2013-01-26 1:05 ` H. Peter Anvin
2013-01-26 16:49 ` KY Srinivasan
2013-01-26 12:10 ` Ingo Molnar
1 sibling, 1 reply; 11+ messages in thread
From: H. Peter Anvin @ 2013-01-26 1:05 UTC (permalink / raw)
To: H. Peter Anvin
Cc: mingo, linux-kernel, konrad.wilk, torvalds, JBeulich, tglx,
linux-tip-commits, Marcelo Tosatti, K. Y. Srinivasan,
Haiyang Zhang
On 01/25/2013 02:15 PM, H. Peter Anvin wrote:
> On 01/25/2013 02:11 PM, H. Peter Anvin wrote:
>> On 01/25/2013 02:43 AM, tip-bot for Jan Beulich wrote:
>>> Commit-ID: 05fbf4d6fc6a3c0c3e63b77979c9311596716d10
>>> Gitweb: http://git.kernel.org/tip/05fbf4d6fc6a3c0c3e63b77979c9311596716d10
>>> Author: Jan Beulich <JBeulich@suse.com>
>>> AuthorDate: Fri, 2 Nov 2012 14:21:23 +0000
>>> Committer: Ingo Molnar <mingo@kernel.org>
>>> CommitDate: Fri, 25 Jan 2013 09:23:51 +0100
>>>
>>> x86/xor: Make virtualization friendly
>>>
>>> In virtualized environments, the CR0.TS management needed here
>>> can be a lot slower than anticipated by the original authors of
>>> this code, which particularly means that in such cases forcing
>>> the use of SSE- (or MMX-) based implementations is not desirable
>>> - actual measurements should always be done in that case.
>>>
>>> For consistency, pull into the shared (32- and 64-bit) header
>>> not only the inclusion of the generic code, but also that of the
>>> AVX variants.
>>>
>>
>> This patch is wrong and should be dropped. I verified it with the KVM
>> people that they do NOT want this change. It is a Xen-specific problem.
>>
>
> FWIW: I have dropped this patch from tip:x86/asm.
>
The bottom line, I guess, is that we need something like
cpu_has_slow_kernel_fpu or something like that, and set it for
specifically affected hypervisors?
Do we know if Hyper-V has performance issues with CR0.TS?
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
^ permalink raw reply [flat|nested] 11+ messages in thread
* RE: [tip:x86/asm] x86/xor: Make virtualization friendly
2013-01-26 1:05 ` H. Peter Anvin
@ 2013-01-26 16:49 ` KY Srinivasan
0 siblings, 0 replies; 11+ messages in thread
From: KY Srinivasan @ 2013-01-26 16:49 UTC (permalink / raw)
To: H. Peter Anvin
Cc: mingo@kernel.org, linux-kernel@vger.kernel.org,
konrad.wilk@oracle.com, torvalds@linux-foundation.org,
JBeulich@suse.com, tglx@linutronix.de,
linux-tip-commits@vger.kernel.org, Marcelo Tosatti, Haiyang Zhang
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 2364 bytes --]
> -----Original Message-----
> From: H. Peter Anvin [mailto:hpa@zytor.com]
> Sent: Friday, January 25, 2013 8:05 PM
> To: H. Peter Anvin
> Cc: mingo@kernel.org; linux-kernel@vger.kernel.org; konrad.wilk@oracle.com;
> torvalds@linux-foundation.org; JBeulich@suse.com; tglx@linutronix.de; linux-
> tip-commits@vger.kernel.org; Marcelo Tosatti; KY Srinivasan; Haiyang Zhang
> Subject: Re: [tip:x86/asm] x86/xor: Make virtualization friendly
>
> On 01/25/2013 02:15 PM, H. Peter Anvin wrote:
> > On 01/25/2013 02:11 PM, H. Peter Anvin wrote:
> >> On 01/25/2013 02:43 AM, tip-bot for Jan Beulich wrote:
> >>> Commit-ID: 05fbf4d6fc6a3c0c3e63b77979c9311596716d10
> >>> Gitweb:
> http://git.kernel.org/tip/05fbf4d6fc6a3c0c3e63b77979c9311596716d10
> >>> Author: Jan Beulich <JBeulich@suse.com>
> >>> AuthorDate: Fri, 2 Nov 2012 14:21:23 +0000
> >>> Committer: Ingo Molnar <mingo@kernel.org>
> >>> CommitDate: Fri, 25 Jan 2013 09:23:51 +0100
> >>>
> >>> x86/xor: Make virtualization friendly
> >>>
> >>> In virtualized environments, the CR0.TS management needed here
> >>> can be a lot slower than anticipated by the original authors of
> >>> this code, which particularly means that in such cases forcing
> >>> the use of SSE- (or MMX-) based implementations is not desirable
> >>> - actual measurements should always be done in that case.
> >>>
> >>> For consistency, pull into the shared (32- and 64-bit) header
> >>> not only the inclusion of the generic code, but also that of the
> >>> AVX variants.
> >>>
> >>
> >> This patch is wrong and should be dropped. I verified it with the KVM
> >> people that they do NOT want this change. It is a Xen-specific problem.
> >>
> >
> > FWIW: I have dropped this patch from tip:x86/asm.
> >
>
> The bottom line, I guess, is that we need something like
> cpu_has_slow_kernel_fpu or something like that, and set it for
> specifically affected hypervisors?
>
> Do we know if Hyper-V has performance issues with CR0.TS?
Checking with the Hyper-V developers, Hyper-V does not have performance issues
with CR0.TS.
Regards,
K. Y
>
> -hpa
>
> --
> H. Peter Anvin, Intel Open Source Technology Center
> I work for Intel. I don't speak on their behalf.
>
>
[-- Trailing bytes of the decoded attachment were garbled and are not recoverable --]
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [tip:x86/asm] x86/xor: Make virtualization friendly
2013-01-25 22:15 ` H. Peter Anvin
2013-01-26 1:05 ` H. Peter Anvin
@ 2013-01-26 12:10 ` Ingo Molnar
1 sibling, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2013-01-26 12:10 UTC (permalink / raw)
To: H. Peter Anvin
Cc: linux-kernel, konrad.wilk, torvalds, JBeulich, tglx,
linux-tip-commits
* H. Peter Anvin <hpa@zytor.com> wrote:
> On 01/25/2013 02:11 PM, H. Peter Anvin wrote:
> > On 01/25/2013 02:43 AM, tip-bot for Jan Beulich wrote:
> >> Commit-ID: 05fbf4d6fc6a3c0c3e63b77979c9311596716d10
> >> Gitweb: http://git.kernel.org/tip/05fbf4d6fc6a3c0c3e63b77979c9311596716d10
> >> Author: Jan Beulich <JBeulich@suse.com>
> >> AuthorDate: Fri, 2 Nov 2012 14:21:23 +0000
> >> Committer: Ingo Molnar <mingo@kernel.org>
> >> CommitDate: Fri, 25 Jan 2013 09:23:51 +0100
> >>
> >> x86/xor: Make virtualization friendly
> >>
> >> In virtualized environments, the CR0.TS management needed here
> >> can be a lot slower than anticipated by the original authors of
> >> this code, which particularly means that in such cases forcing
> >> the use of SSE- (or MMX-) based implementations is not desirable
> >> - actual measurements should always be done in that case.
> >>
> >> For consistency, pull into the shared (32- and 64-bit) header
> >> not only the inclusion of the generic code, but also that of the
> >> AVX variants.
> >>
> >
> > This patch is wrong and should be dropped. I verified it with the KVM
> > people that they do NOT want this change. It is a Xen-specific problem.
> >
>
> FWIW: I have dropped this patch from tip:x86/asm.
Thanks - and the other two patches are fine, right?
Ingo
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [tip:x86/asm] x86/xor: Make virtualization friendly
2013-01-25 22:11 ` H. Peter Anvin
2013-01-25 22:15 ` H. Peter Anvin
@ 2013-01-28 9:04 ` Jan Beulich
2013-01-28 15:26 ` H. Peter Anvin
1 sibling, 1 reply; 11+ messages in thread
From: Jan Beulich @ 2013-01-28 9:04 UTC (permalink / raw)
To: H. Peter Anvin; +Cc: mingo, tglx, torvalds, konrad.wilk, linux-kernel
>>> On 25.01.13 at 23:11, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/25/2013 02:43 AM, tip-bot for Jan Beulich wrote:
>> Commit-ID: 05fbf4d6fc6a3c0c3e63b77979c9311596716d10
>> Gitweb:
> http://git.kernel.org/tip/05fbf4d6fc6a3c0c3e63b77979c9311596716d10
>> Author: Jan Beulich <JBeulich@suse.com>
>> AuthorDate: Fri, 2 Nov 2012 14:21:23 +0000
>> Committer: Ingo Molnar <mingo@kernel.org>
>> CommitDate: Fri, 25 Jan 2013 09:23:51 +0100
>>
>> x86/xor: Make virtualization friendly
>>
>> In virtualized environments, the CR0.TS management needed here
>> can be a lot slower than anticipated by the original authors of
>> this code, which particularly means that in such cases forcing
>> the use of SSE- (or MMX-) based implementations is not desirable
>> - actual measurements should always be done in that case.
>>
>> For consistency, pull into the shared (32- and 64-bit) header
>> not only the inclusion of the generic code, but also that of the
>> AVX variants.
>>
>
> This patch is wrong and should be dropped. I verified it with the KVM
> people that they do NOT want this change. It is a Xen-specific problem.
I don't follow: The patch doesn't penalize anyone, it merely
widens the set of methods tried on virtualized platforms. I.e.
if other hypervisors have no problem here, then the best
performing one should still turn out to be the SSE or AVX one.
Or if it doesn't, it ought to be to their advantage (I would even
question why this extra probing isn't done on native too, e.g.
to cope with eventual bad vector implementations, say on
low-power/low-cost CPUs).
Jan
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [tip:x86/asm] x86/xor: Make virtualization friendly
2013-01-28 9:04 ` Jan Beulich
@ 2013-01-28 15:26 ` H. Peter Anvin
0 siblings, 0 replies; 11+ messages in thread
From: H. Peter Anvin @ 2013-01-28 15:26 UTC (permalink / raw)
To: Jan Beulich; +Cc: mingo, tglx, torvalds, konrad.wilk, linux-kernel
It adds substantial boot time, and it has no value when the cache priority rules force the non-cache-polluting version even if somewhat slower... which can and does happen.
Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 25.01.13 at 23:11, "H. Peter Anvin" <hpa@zytor.com> wrote:
>> On 01/25/2013 02:43 AM, tip-bot for Jan Beulich wrote:
>>> Commit-ID: 05fbf4d6fc6a3c0c3e63b77979c9311596716d10
>>> Gitweb:
>> http://git.kernel.org/tip/05fbf4d6fc6a3c0c3e63b77979c9311596716d10
>>> Author: Jan Beulich <JBeulich@suse.com>
>>> AuthorDate: Fri, 2 Nov 2012 14:21:23 +0000
>>> Committer: Ingo Molnar <mingo@kernel.org>
>>> CommitDate: Fri, 25 Jan 2013 09:23:51 +0100
>>>
>>> x86/xor: Make virtualization friendly
>>>
>>> In virtualized environments, the CR0.TS management needed here
>>> can be a lot slower than anticipated by the original authors of
>>> this code, which particularly means that in such cases forcing
>>> the use of SSE- (or MMX-) based implementations is not desirable
>>> - actual measurements should always be done in that case.
>>>
>>> For consistency, pull into the shared (32- and 64-bit) header
>>> not only the inclusion of the generic code, but also that of the
>>> AVX variants.
>>>
>>
>> This patch is wrong and should be dropped. I verified it with the
>KVM
>> people that they do NOT want this change. It is a Xen-specific
>problem.
>
>I don't follow: The patch doesn't penalize anyone, it merely
>widens the set of methods tried on virtualized platforms. I.e.
>if other hypervisors have no problem here, then the best
>performing one should still turn out to be the SSE or AVX one.
>Or if it doesn't, it ought to be to their advantage (I would even
>question why this extra probing isn't done on native too, e.g.
>to cope with eventual bad vector implementations, say on
>low-power/low-cost CPUs).
>
>Jan
--
Sent from my mobile phone. Please excuse brevity and lack of formatting.
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2013-01-28 15:27 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-11-02 14:21 [PATCH 3/3, v2] x86/xor: make virtualization friendly Jan Beulich
2012-11-02 17:30 ` H. Peter Anvin
2012-11-05 9:10 ` Jan Beulich
2013-01-25 10:43 ` [tip:x86/asm] x86/xor: Make " tip-bot for Jan Beulich
2013-01-25 22:11 ` H. Peter Anvin
2013-01-25 22:15 ` H. Peter Anvin
2013-01-26 1:05 ` H. Peter Anvin
2013-01-26 16:49 ` KY Srinivasan
2013-01-26 12:10 ` Ingo Molnar
2013-01-28 9:04 ` Jan Beulich
2013-01-28 15:26 ` H. Peter Anvin
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox