* [PATCH] x86: also use tzcnt instead of bsf in __scanbit()
@ 2015-01-26 12:03 Jan Beulich
2015-01-26 13:08 ` Andrew Cooper
0 siblings, 1 reply; 2+ messages in thread
From: Jan Beulich @ 2015-01-26 12:03 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 4563 bytes --]
... when available, i.e. by runtime patching. This saves the
conditional move, having a back-to-back dependency on BSF's (EFLAGS)
result.
The need to include asm/cpufeatures.h from asm/bitops.h requires a
workaround for an otherwise resulting circular header file dependency:
Provide a mode by which the including site of the former header can
request to only get the X86_FEATURE_* defines (and very little more)
from it, allowing it to nevertheless be included in its entirety later
on.
While doing this I also noticed that the function's "max" parameter was
pointlessly "unsigned long" - the function only returning
"unsigned int", this can't be of any use, and hence gets converted at
once, along with the necessary adjustments to CMOVZ's output operands.
Note that while only alternative_io() is needed by this change (and
hence gets pulled over from Linux), for completeness its input-only
counterpart alternative_input() gets added as well.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/include/asm-x86/alternative.h
+++ b/xen/include/asm-x86/alternative.h
@@ -12,6 +12,7 @@
.byte \alt_len
.endm
#else
+#include <xen/stringify.h>
#include <xen/types.h>
struct alt_instr {
@@ -73,6 +74,26 @@ extern void alternative_instructions(voi
#define alternative(oldinstr, newinstr, feature) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+/*
+ * Alternative inline assembly with input.
+ *
+ * Pecularities:
+ * No memory clobber here.
+ * Argument numbers start with 1.
+ * Best is to use constraints that are fixed size (like (%1) ... "r")
+ * If you use variable sized constraints like "m" or "g" in the
+ * replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : : "i" (0), ## input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, newinstr, feature, output, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : output : "i" (0), ## input)
+
#endif /* __ASSEMBLY__ */
#endif /* __X86_ALTERNATIVE_H__ */
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -5,7 +5,9 @@
* Copyright 1992, Linus Torvalds.
*/
-#include <xen/config.h>
+#include <asm/alternative.h>
+#define X86_FEATURES_ONLY
+#include <asm/cpufeature.h>
/*
* We specify the memory operand as both input and output because the memory
@@ -313,9 +315,17 @@ extern unsigned int __find_first_zero_bi
extern unsigned int __find_next_zero_bit(
const unsigned long *addr, unsigned int size, unsigned int offset);
-static inline unsigned int __scanbit(unsigned long val, unsigned long max)
+static inline unsigned int __scanbit(unsigned long val, unsigned int max)
{
- asm ( "bsf %1,%0 ; cmovz %2,%0" : "=&r" (val) : "r" (val), "r" (max) );
+ if ( __builtin_constant_p(max) && max == BITS_PER_LONG )
+ alternative_io("bsf %[in],%[out]; cmovz %[max],%k[out]",
+ "rep; bsf %[in],%[out]",
+ X86_FEATURE_BMI1,
+ [out] "=&r" (val),
+ [in] "r" (val), [max] "r" (max));
+ else
+ asm ( "bsf %1,%0 ; cmovz %2,%k0"
+ : "=&r" (val) : "r" (val), "r" (max) );
return (unsigned int)val;
}
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -5,10 +5,8 @@
*/
#ifndef __ASM_I386_CPUFEATURE_H
+#ifndef X86_FEATURES_ONLY
#define __ASM_I386_CPUFEATURE_H
-
-#ifndef __ASSEMBLY__
-#include <xen/bitops.h>
#endif
#define NCAPINTS 8 /* N 32-bit words worth of info */
@@ -155,7 +153,9 @@
#define X86_FEATURE_ADX (7*32+19) /* ADCX, ADOX instructions */
#define X86_FEATURE_SMAP (7*32+20) /* Supervisor Mode Access Prevention */
-#ifndef __ASSEMBLY__
+#if !defined(__ASSEMBLY__) && !defined(X86_FEATURES_ONLY)
+#include <xen/bitops.h>
+
#define cpu_has(c, bit) test_bit(bit, (c)->x86_capability)
#define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability)
#define cpufeat_mask(idx) (1u << ((idx) & 31))
@@ -262,6 +262,8 @@ struct cpuid4_info {
int cpuid4_cache_lookup(int index, struct cpuid4_info *this_leaf);
#endif
+#undef X86_FEATURES_ONLY
+
#endif /* __ASM_I386_CPUFEATURE_H */
/*
[-- Attachment #2: x86-alt-tzcnt.patch --]
[-- Type: text/plain, Size: 4612 bytes --]
x86: also use tzcnt instead of bsf in __scanbit()
... when available, i.e. by runtime patching. This saves the
conditional move, having a back-to-back dependency on BSF's (EFLAGS)
result.
The need to include asm/cpufeatures.h from asm/bitops.h requires a
workaround for an otherwise resulting circular header file dependency:
Provide a mode by which the including site of the former header can
request to only get the X86_FEATURE_* defines (and very little more)
from it, allowing it to nevertheless be included in its entirety later
on.
While doing this I also noticed that the function's "max" parameter was
pointlessly "unsigned long" - the function only returning
"unsigned int", this can't be of any use, and hence gets converted at
once, along with the necessary adjustments to CMOVZ's output operands.
Note that while only alternative_io() is needed by this change (and
hence gets pulled over from Linux), for completeness its input-only
counterpart alternative_input() gets added as well.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/include/asm-x86/alternative.h
+++ b/xen/include/asm-x86/alternative.h
@@ -12,6 +12,7 @@
.byte \alt_len
.endm
#else
+#include <xen/stringify.h>
#include <xen/types.h>
struct alt_instr {
@@ -73,6 +74,26 @@ extern void alternative_instructions(voi
#define alternative(oldinstr, newinstr, feature) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+/*
+ * Alternative inline assembly with input.
+ *
+ * Pecularities:
+ * No memory clobber here.
+ * Argument numbers start with 1.
+ * Best is to use constraints that are fixed size (like (%1) ... "r")
+ * If you use variable sized constraints like "m" or "g" in the
+ * replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : : "i" (0), ## input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, newinstr, feature, output, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : output : "i" (0), ## input)
+
#endif /* __ASSEMBLY__ */
#endif /* __X86_ALTERNATIVE_H__ */
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -5,7 +5,9 @@
* Copyright 1992, Linus Torvalds.
*/
-#include <xen/config.h>
+#include <asm/alternative.h>
+#define X86_FEATURES_ONLY
+#include <asm/cpufeature.h>
/*
* We specify the memory operand as both input and output because the memory
@@ -313,9 +315,17 @@ extern unsigned int __find_first_zero_bi
extern unsigned int __find_next_zero_bit(
const unsigned long *addr, unsigned int size, unsigned int offset);
-static inline unsigned int __scanbit(unsigned long val, unsigned long max)
+static inline unsigned int __scanbit(unsigned long val, unsigned int max)
{
- asm ( "bsf %1,%0 ; cmovz %2,%0" : "=&r" (val) : "r" (val), "r" (max) );
+ if ( __builtin_constant_p(max) && max == BITS_PER_LONG )
+ alternative_io("bsf %[in],%[out]; cmovz %[max],%k[out]",
+ "rep; bsf %[in],%[out]",
+ X86_FEATURE_BMI1,
+ [out] "=&r" (val),
+ [in] "r" (val), [max] "r" (max));
+ else
+ asm ( "bsf %1,%0 ; cmovz %2,%k0"
+ : "=&r" (val) : "r" (val), "r" (max) );
return (unsigned int)val;
}
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -5,10 +5,8 @@
*/
#ifndef __ASM_I386_CPUFEATURE_H
+#ifndef X86_FEATURES_ONLY
#define __ASM_I386_CPUFEATURE_H
-
-#ifndef __ASSEMBLY__
-#include <xen/bitops.h>
#endif
#define NCAPINTS 8 /* N 32-bit words worth of info */
@@ -155,7 +153,9 @@
#define X86_FEATURE_ADX (7*32+19) /* ADCX, ADOX instructions */
#define X86_FEATURE_SMAP (7*32+20) /* Supervisor Mode Access Prevention */
-#ifndef __ASSEMBLY__
+#if !defined(__ASSEMBLY__) && !defined(X86_FEATURES_ONLY)
+#include <xen/bitops.h>
+
#define cpu_has(c, bit) test_bit(bit, (c)->x86_capability)
#define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability)
#define cpufeat_mask(idx) (1u << ((idx) & 31))
@@ -262,6 +262,8 @@ struct cpuid4_info {
int cpuid4_cache_lookup(int index, struct cpuid4_info *this_leaf);
#endif
+#undef X86_FEATURES_ONLY
+
#endif /* __ASM_I386_CPUFEATURE_H */
/*
[-- Attachment #3: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] x86: also use tzcnt instead of bsf in __scanbit()
2015-01-26 12:03 [PATCH] x86: also use tzcnt instead of bsf in __scanbit() Jan Beulich
@ 2015-01-26 13:08 ` Andrew Cooper
0 siblings, 0 replies; 2+ messages in thread
From: Andrew Cooper @ 2015-01-26 13:08 UTC (permalink / raw)
To: Jan Beulich, xen-devel; +Cc: Keir Fraser
On 26/01/15 12:03, Jan Beulich wrote:
> ... when available, i.e. by runtime patching. This saves the
> conditional move, having a back-to-back dependency on BSF's (EFLAGS)
> result.
>
> The need to include asm/cpufeatures.h from asm/bitops.h requires a
> workaround for an otherwise resulting circular header file dependency:
> Provide a mode by which the including site of the former header can
> request to only get the X86_FEATURE_* defines (and very little more)
> from it, allowing it to nevertheless be included in its entirety later
> on.
>
> While doing this I also noticed that the function's "max" parameter was
> pointlessly "unsigned long" - the function only returning
> "unsigned int", this can't be of any use, and hence gets converted at
> once, along with the necessary adjustments to CMOVZ's output operands.
>
> Note that while only alternative_io() is needed by this change (and
> hence gets pulled over from Linux), for completeness its input-only
> counterpart alternative_input() gets added as well.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
>
> --- a/xen/include/asm-x86/alternative.h
> +++ b/xen/include/asm-x86/alternative.h
> @@ -12,6 +12,7 @@
> .byte \alt_len
> .endm
> #else
> +#include <xen/stringify.h>
> #include <xen/types.h>
>
> struct alt_instr {
> @@ -73,6 +74,26 @@ extern void alternative_instructions(voi
> #define alternative(oldinstr, newinstr, feature) \
> asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
>
> +/*
> + * Alternative inline assembly with input.
> + *
> + * Pecularities:
> + * No memory clobber here.
> + * Argument numbers start with 1.
> + * Best is to use constraints that are fixed size (like (%1) ... "r")
> + * If you use variable sized constraints like "m" or "g" in the
> + * replacement make sure to pad to the worst case length.
> + * Leaving an unused argument 0 to keep API compatibility.
> + */
> +#define alternative_input(oldinstr, newinstr, feature, input...) \
> + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
> + : : "i" (0), ## input)
> +
> +/* Like alternative_input, but with a single output argument */
> +#define alternative_io(oldinstr, newinstr, feature, output, input...) \
> + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
> + : output : "i" (0), ## input)
> +
> #endif /* __ASSEMBLY__ */
>
> #endif /* __X86_ALTERNATIVE_H__ */
> --- a/xen/include/asm-x86/bitops.h
> +++ b/xen/include/asm-x86/bitops.h
> @@ -5,7 +5,9 @@
> * Copyright 1992, Linus Torvalds.
> */
>
> -#include <xen/config.h>
> +#include <asm/alternative.h>
> +#define X86_FEATURES_ONLY
> +#include <asm/cpufeature.h>
>
> /*
> * We specify the memory operand as both input and output because the memory
> @@ -313,9 +315,17 @@ extern unsigned int __find_first_zero_bi
> extern unsigned int __find_next_zero_bit(
> const unsigned long *addr, unsigned int size, unsigned int offset);
>
> -static inline unsigned int __scanbit(unsigned long val, unsigned long max)
> +static inline unsigned int __scanbit(unsigned long val, unsigned int max)
> {
> - asm ( "bsf %1,%0 ; cmovz %2,%0" : "=&r" (val) : "r" (val), "r" (max) );
> + if ( __builtin_constant_p(max) && max == BITS_PER_LONG )
> + alternative_io("bsf %[in],%[out]; cmovz %[max],%k[out]",
> + "rep; bsf %[in],%[out]",
> + X86_FEATURE_BMI1,
> + [out] "=&r" (val),
> + [in] "r" (val), [max] "r" (max));
> + else
> + asm ( "bsf %1,%0 ; cmovz %2,%k0"
> + : "=&r" (val) : "r" (val), "r" (max) );
> return (unsigned int)val;
> }
>
> --- a/xen/include/asm-x86/cpufeature.h
> +++ b/xen/include/asm-x86/cpufeature.h
> @@ -5,10 +5,8 @@
> */
>
> #ifndef __ASM_I386_CPUFEATURE_H
> +#ifndef X86_FEATURES_ONLY
> #define __ASM_I386_CPUFEATURE_H
> -
> -#ifndef __ASSEMBLY__
> -#include <xen/bitops.h>
> #endif
>
> #define NCAPINTS 8 /* N 32-bit words worth of info */
> @@ -155,7 +153,9 @@
> #define X86_FEATURE_ADX (7*32+19) /* ADCX, ADOX instructions */
> #define X86_FEATURE_SMAP (7*32+20) /* Supervisor Mode Access Prevention */
>
> -#ifndef __ASSEMBLY__
> +#if !defined(__ASSEMBLY__) && !defined(X86_FEATURES_ONLY)
> +#include <xen/bitops.h>
> +
> #define cpu_has(c, bit) test_bit(bit, (c)->x86_capability)
> #define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability)
> #define cpufeat_mask(idx) (1u << ((idx) & 31))
> @@ -262,6 +262,8 @@ struct cpuid4_info {
> int cpuid4_cache_lookup(int index, struct cpuid4_info *this_leaf);
> #endif
>
> +#undef X86_FEATURES_ONLY
> +
> #endif /* __ASM_I386_CPUFEATURE_H */
>
> /*
>
>
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2015-01-26 13:08 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-01-26 12:03 [PATCH] x86: also use tzcnt instead of bsf in __scanbit() Jan Beulich
2015-01-26 13:08 ` Andrew Cooper
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.