All of lore.kernel.org
 help / color / mirror / Atom feed
* [GIT PULL] x86/fpu changes for v4.4
@ 2015-11-03 11:08 Ingo Molnar
  0 siblings, 0 replies; only message in thread
From: Ingo Molnar @ 2015-11-03 11:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Thomas Gleixner, H. Peter Anvin, Borislav Petkov,
	Andrew Morton, Denys Vlasenko

Linus,

Please pull the latest x86-fpu-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-fpu-for-linus

   # HEAD: 158ecc39185b885420e5136b803b29be2bbec7fb x86/fpu: Fixup uninitialized feature_name warning

There are two main areas of changes:

 - Rework of the extended FPU state code to robustify the kernel's usage of cpuid 
   provided xstate sizes - and related changes. (Dave Hansen)

 - math emulation enhancements: new modern FPU instructions support, with 
   testcases, plus cleanups. (Denys Vlasnko)

 Thanks,

	Ingo

------------------>
Borislav Petkov (1):
      x86/fpu: Fixup uninitialized feature_name warning

Dave Hansen (15):
      x86/fpu: Print xfeature buffer size in decimal
      x86/fpu: Move XSAVE-disabling code to a helper
      x86/fpu: Remove XSTATE_RESERVE
      x86/fpu: Remove partial LWP support definitions
      x86/fpu: Rename XSAVE macros
      x86/fpu: Rename XFEATURES_NR_MAX
      x86/fpu: Rework XSTATE_* macros to remove magic '2'
      x86/fpu: Remove 'xfeature_nr'
      x86/fpu: Add xfeature_enabled() helper instead of test_bit()
      x86/fpu/mpx: Rework MPX 'xstate' types
      x86/fpu: Rework YMM definition
      x86/fpu: Add C structures for AVX-512 state components
      x86/fpu: Correct and check XSAVE xstate size calculations
      x86/fpu: Check to ensure increasing-offset xstate offsets
      x86/fpu: Check CPU-provided sizes against struct declarations

Denys Vlasenko (7):
      x86/fpu/math-emu: Remove !NO_UNDOC_CODE
      x86/fpu/math-emu, selftests: Add tests for FCMOV and FCOMI insns
      x86/fpu/math-emu: Remove define layer for undocumented opcodes
      x86/fpu/math-emu: Add support for F[U]COMI[P] insns
      x86/fpu/math-emu: Add support for FCMOVcc insns
      x86/fpu/math-emu, selftests: Add test for FISTTP instructions
      x86/fpu/math-emu: Add support for FISTTP instructions


 arch/x86/crypto/camellia_aesni_avx2_glue.c |   3 +-
 arch/x86/crypto/camellia_aesni_avx_glue.c  |   3 +-
 arch/x86/crypto/cast5_avx_glue.c           |   3 +-
 arch/x86/crypto/cast6_avx_glue.c           |   3 +-
 arch/x86/crypto/chacha20_glue.c            |   2 +-
 arch/x86/crypto/poly1305_glue.c            |   2 +-
 arch/x86/crypto/serpent_avx2_glue.c        |   3 +-
 arch/x86/crypto/serpent_avx_glue.c         |   3 +-
 arch/x86/crypto/sha1_ssse3_glue.c          |   2 +-
 arch/x86/crypto/sha256_ssse3_glue.c        |   2 +-
 arch/x86/crypto/sha512_ssse3_glue.c        |   2 +-
 arch/x86/crypto/twofish_avx_glue.c         |   2 +-
 arch/x86/include/asm/fpu/types.h           | 148 +++++++----
 arch/x86/include/asm/fpu/xstate.h          |  15 +-
 arch/x86/include/asm/trace/mpx.h           |   7 +-
 arch/x86/kernel/fpu/init.c                 |  18 +-
 arch/x86/kernel/fpu/regset.c               |   4 +-
 arch/x86/kernel/fpu/signal.c               |   6 +-
 arch/x86/kernel/fpu/xstate.c               | 398 +++++++++++++++++++++++++----
 arch/x86/kernel/traps.c                    |   4 +-
 arch/x86/kvm/cpuid.c                       |   4 +-
 arch/x86/kvm/x86.c                         |  27 +-
 arch/x86/kvm/x86.h                         |   6 +-
 arch/x86/math-emu/fpu_aux.c                |  70 +++++
 arch/x86/math-emu/fpu_entry.c              |  96 +++----
 arch/x86/math-emu/fpu_proto.h              |  12 +
 arch/x86/math-emu/load_store.c             |  63 ++++-
 arch/x86/math-emu/reg_compare.c            | 128 ++++++++++
 arch/x86/mm/mpx.c                          |  15 +-
 tools/testing/selftests/x86/Makefile       |   5 +-
 tools/testing/selftests/x86/test_FCMOV.c   |  93 +++++++
 tools/testing/selftests/x86/test_FCOMI.c   | 331 ++++++++++++++++++++++++
 tools/testing/selftests/x86/test_FISTTP.c  | 137 ++++++++++
 33 files changed, 1374 insertions(+), 243 deletions(-)
 create mode 100644 tools/testing/selftests/x86/test_FCMOV.c
 create mode 100644 tools/testing/selftests/x86/test_FCOMI.c
 create mode 100644 tools/testing/selftests/x86/test_FISTTP.c

diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 4c65c70e628b..d84456924563 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -567,7 +567,8 @@ static int __init camellia_aesni_init(void)
 		return -ENODEV;
 	}
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 80a0e4389c9a..12e729bfe71b 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -554,7 +554,8 @@ static int __init camellia_aesni_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index be00aa48b2b5..8648158f3916 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -469,7 +469,8 @@ static int __init cast5_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 5dbba7224221..fca459578c35 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -591,7 +591,8 @@ static int __init cast6_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index effe2160b7c5..722bacea040e 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -130,7 +130,7 @@ static int __init chacha20_simd_mod_init(void)
 
 #ifdef CONFIG_AS_AVX2
 	chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
-			    cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
+			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
 #endif
 	return crypto_register_alg(&alg);
 }
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index f7170d764f32..4264a3d59589 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -184,7 +184,7 @@ static int __init poly1305_simd_mod_init(void)
 
 #ifdef CONFIG_AS_AVX2
 	poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
-			    cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
+			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
 	alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
 	if (poly1305_use_avx2)
 		alg.descsize += 10 * sizeof(u32);
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 7d838dc4d888..6d198342e2de 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -542,7 +542,8 @@ static int __init init(void)
 		pr_info("AVX2 instructions are not detected.\n");
 		return -ENODEV;
 	}
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index da7dafc9b16d..5dc37026c7ce 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -597,7 +597,8 @@ static int __init serpent_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7c48e8b20848..00212c32d4db 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -121,7 +121,7 @@ static struct shash_alg alg = {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
 		if (cpu_has_avx)
 			pr_info("AVX detected but unusable.\n");
 		return false;
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index f8097fc0d1d1..0e0e85aea634 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -130,7 +130,7 @@ static struct shash_alg algs[] = { {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
 		if (cpu_has_avx)
 			pr_info("AVX detected but unusable.\n");
 		return false;
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 2edad7b81870..0c8c38c101ac 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -129,7 +129,7 @@ static struct shash_alg algs[] = { {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
 		if (cpu_has_avx)
 			pr_info("AVX detected but unusable.\n");
 		return false;
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index c2bd0ce718ee..b7a3904b953c 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -558,7 +558,7 @@ static int __init twofish_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index c49c5173158e..1c6f6ac52ad0 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -95,63 +95,122 @@ struct swregs_state {
 /*
  * List of XSAVE features Linux knows about:
  */
-enum xfeature_bit {
-	XSTATE_BIT_FP,
-	XSTATE_BIT_SSE,
-	XSTATE_BIT_YMM,
-	XSTATE_BIT_BNDREGS,
-	XSTATE_BIT_BNDCSR,
-	XSTATE_BIT_OPMASK,
-	XSTATE_BIT_ZMM_Hi256,
-	XSTATE_BIT_Hi16_ZMM,
-
-	XFEATURES_NR_MAX,
+enum xfeature {
+	XFEATURE_FP,
+	XFEATURE_SSE,
+	/*
+	 * Values above here are "legacy states".
+	 * Those below are "extended states".
+	 */
+	XFEATURE_YMM,
+	XFEATURE_BNDREGS,
+	XFEATURE_BNDCSR,
+	XFEATURE_OPMASK,
+	XFEATURE_ZMM_Hi256,
+	XFEATURE_Hi16_ZMM,
+
+	XFEATURE_MAX,
 };
 
-#define XSTATE_FP		(1 << XSTATE_BIT_FP)
-#define XSTATE_SSE		(1 << XSTATE_BIT_SSE)
-#define XSTATE_YMM		(1 << XSTATE_BIT_YMM)
-#define XSTATE_BNDREGS		(1 << XSTATE_BIT_BNDREGS)
-#define XSTATE_BNDCSR		(1 << XSTATE_BIT_BNDCSR)
-#define XSTATE_OPMASK		(1 << XSTATE_BIT_OPMASK)
-#define XSTATE_ZMM_Hi256	(1 << XSTATE_BIT_ZMM_Hi256)
-#define XSTATE_Hi16_ZMM		(1 << XSTATE_BIT_Hi16_ZMM)
+#define XFEATURE_MASK_FP		(1 << XFEATURE_FP)
+#define XFEATURE_MASK_SSE		(1 << XFEATURE_SSE)
+#define XFEATURE_MASK_YMM		(1 << XFEATURE_YMM)
+#define XFEATURE_MASK_BNDREGS		(1 << XFEATURE_BNDREGS)
+#define XFEATURE_MASK_BNDCSR		(1 << XFEATURE_BNDCSR)
+#define XFEATURE_MASK_OPMASK		(1 << XFEATURE_OPMASK)
+#define XFEATURE_MASK_ZMM_Hi256		(1 << XFEATURE_ZMM_Hi256)
+#define XFEATURE_MASK_Hi16_ZMM		(1 << XFEATURE_Hi16_ZMM)
+
+#define XFEATURE_MASK_FPSSE		(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
+#define XFEATURE_MASK_AVX512		(XFEATURE_MASK_OPMASK \
+					 | XFEATURE_MASK_ZMM_Hi256 \
+					 | XFEATURE_MASK_Hi16_ZMM)
+
+#define FIRST_EXTENDED_XFEATURE	XFEATURE_YMM
 
-#define XSTATE_FPSSE		(XSTATE_FP | XSTATE_SSE)
-#define XSTATE_AVX512		(XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+struct reg_128_bit {
+	u8      regbytes[128/8];
+};
+struct reg_256_bit {
+	u8	regbytes[256/8];
+};
+struct reg_512_bit {
+	u8	regbytes[512/8];
+};
 
 /*
+ * State component 2:
+ *
  * There are 16x 256-bit AVX registers named YMM0-YMM15.
  * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
- * and are stored in 'struct fxregs_state::xmm_space[]'.
+ * and are stored in 'struct fxregs_state::xmm_space[]' in the
+ * "legacy" area.
  *
- * The high 128 bits are stored here:
- *    16x 128 bits == 256 bytes.
+ * The high 128 bits are stored here.
  */
 struct ymmh_struct {
-	u8				ymmh_space[256];
-};
-
-/* We don't support LWP yet: */
-struct lwp_struct {
-	u8				reserved[128];
-};
+	struct reg_128_bit              hi_ymm[16];
+} __packed;
 
 /* Intel MPX support: */
-struct bndreg {
+
+struct mpx_bndreg {
 	u64				lower_bound;
 	u64				upper_bound;
 } __packed;
+/*
+ * State component 3 is used for the 4 128-bit bounds registers
+ */
+struct mpx_bndreg_state {
+	struct mpx_bndreg		bndreg[4];
+} __packed;
 
-struct bndcsr {
+/*
+ * State component 4 is used for the 64-bit user-mode MPX
+ * configuration register BNDCFGU and the 64-bit MPX status
+ * register BNDSTATUS.  We call the pair "BNDCSR".
+ */
+struct mpx_bndcsr {
 	u64				bndcfgu;
 	u64				bndstatus;
 } __packed;
 
-struct mpx_struct {
-	struct bndreg			bndreg[4];
-	struct bndcsr			bndcsr;
-};
+/*
+ * The BNDCSR state is padded out to be 64-bytes in size.
+ */
+struct mpx_bndcsr_state {
+	union {
+		struct mpx_bndcsr		bndcsr;
+		u8				pad_to_64_bytes[64];
+	};
+} __packed;
+
+/* AVX-512 Components: */
+
+/*
+ * State component 5 is used for the 8 64-bit opmask registers
+ * k0-k7 (opmask state).
+ */
+struct avx_512_opmask_state {
+	u64				opmask_reg[8];
+} __packed;
+
+/*
+ * State component 6 is used for the upper 256 bits of the
+ * registers ZMM0-ZMM15. These 16 256-bit values are denoted
+ * ZMM0_H-ZMM15_H (ZMM_Hi256 state).
+ */
+struct avx_512_zmm_uppers_state {
+	struct reg_256_bit		zmm_upper[16];
+} __packed;
+
+/*
+ * State component 7 is used for the 16 512-bit registers
+ * ZMM16-ZMM31 (Hi16_ZMM state).
+ */
+struct avx_512_hi16_state {
+	struct reg_512_bit		hi16_zmm[16];
+} __packed;
 
 struct xstate_header {
 	u64				xfeatures;
@@ -159,22 +218,19 @@ struct xstate_header {
 	u64				reserved[6];
 } __attribute__((packed));
 
-/* New processor state extensions should be added here: */
-#define XSTATE_RESERVE			(sizeof(struct ymmh_struct) + \
-					 sizeof(struct lwp_struct)  + \
-					 sizeof(struct mpx_struct)  )
 /*
  * This is our most modern FPU state format, as saved by the XSAVE
  * and restored by the XRSTOR instructions.
  *
  * It consists of a legacy fxregs portion, an xstate header and
- * subsequent fixed size areas as defined by the xstate header.
- * Not all CPUs support all the extensions.
+ * subsequent areas as defined by the xstate header.  Not all CPUs
+ * support all the extensions, so the size of the extended area
+ * can vary quite a bit between CPUs.
  */
 struct xregs_state {
 	struct fxregs_state		i387;
 	struct xstate_header		header;
-	u8				__reserved[XSTATE_RESERVE];
+	u8				extended_state_area[0];
 } __attribute__ ((packed, aligned (64)));
 
 /*
@@ -182,7 +238,9 @@ struct xregs_state {
  * put together, so that we can pick the right one runtime.
  *
  * The size of the structure is determined by the largest
- * member - which is the xsave area:
+ * member - which is the xsave area.  The padding is there
+ * to ensure that statically-allocated task_structs (just
+ * the init_task today) have enough space.
  */
 union fpregs_state {
 	struct fregs_state		fsave;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 4656b25bb9a7..3a6c89b70307 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -6,7 +6,7 @@
 #include <linux/uaccess.h>
 
 /* Bit 63 of XCR0 is reserved for future expansion */
-#define XSTATE_EXTEND_MASK	(~(XSTATE_FPSSE | (1ULL << 63)))
+#define XFEATURE_MASK_EXTEND	(~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
 
 #define XSTATE_CPUID		0x0000000d
 
@@ -19,14 +19,18 @@
 #define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
 
 /* Supported features which support lazy state saving */
-#define XSTATE_LAZY	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM		      \
-			| XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+#define XFEATURE_MASK_LAZY	(XFEATURE_MASK_FP | \
+				 XFEATURE_MASK_SSE | \
+				 XFEATURE_MASK_YMM | \
+				 XFEATURE_MASK_OPMASK |	\
+				 XFEATURE_MASK_ZMM_Hi256 | \
+				 XFEATURE_MASK_Hi16_ZMM)
 
 /* Supported features which require eager state saving */
-#define XSTATE_EAGER	(XSTATE_BNDREGS | XSTATE_BNDCSR)
+#define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
 
 /* All currently supported features */
-#define XCNTXT_MASK	(XSTATE_LAZY | XSTATE_EAGER)
+#define XCNTXT_MASK	(XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
 
 #ifdef CONFIG_X86_64
 #define REX_PREFIX	"0x48, "
@@ -40,6 +44,7 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
 extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
 
+void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 
diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h
index 173dd3ba108c..0f492fc50bce 100644
--- a/arch/x86/include/asm/trace/mpx.h
+++ b/arch/x86/include/asm/trace/mpx.h
@@ -11,7 +11,7 @@
 TRACE_EVENT(mpx_bounds_register_exception,
 
 	TP_PROTO(void *addr_referenced,
-		 const struct bndreg *bndreg),
+		 const struct mpx_bndreg *bndreg),
 	TP_ARGS(addr_referenced, bndreg),
 
 	TP_STRUCT__entry(
@@ -44,7 +44,7 @@ TRACE_EVENT(mpx_bounds_register_exception,
 
 TRACE_EVENT(bounds_exception_mpx,
 
-	TP_PROTO(const struct bndcsr *bndcsr),
+	TP_PROTO(const struct mpx_bndcsr *bndcsr),
 	TP_ARGS(bndcsr),
 
 	TP_STRUCT__entry(
@@ -116,7 +116,8 @@ TRACE_EVENT(mpx_new_bounds_table,
 /*
  * This gets used outside of MPX-specific code, so we need a stub.
  */
-static inline void trace_bounds_exception_mpx(const struct bndcsr *bndcsr)
+static inline
+void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr)
 {
 }
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index d14e9ac3235a..be39b5fde4b9 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -290,11 +290,11 @@ static void __init fpu__init_system_ctx_switch(void)
 	if (cpu_has_xsaveopt && eagerfpu != DISABLE)
 		eagerfpu = ENABLE;
 
-	if (xfeatures_mask & XSTATE_EAGER) {
+	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
 		if (eagerfpu == DISABLE) {
 			pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
-			       xfeatures_mask & XSTATE_EAGER);
-			xfeatures_mask &= ~XSTATE_EAGER;
+			       xfeatures_mask & XFEATURE_MASK_EAGER);
+			xfeatures_mask &= ~XFEATURE_MASK_EAGER;
 		} else {
 			eagerfpu = ENABLE;
 		}
@@ -354,17 +354,7 @@ static int __init x86_noxsave_setup(char *s)
 	if (strlen(s))
 		return 0;
 
-	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	setup_clear_cpu_cap(X86_FEATURE_AVX);
-	setup_clear_cpu_cap(X86_FEATURE_AVX2);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
-	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	fpu__xstate_clear_all_cpu_caps();
 
 	return 1;
 }
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index dc60810c1c74..0bc3490420c5 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -66,7 +66,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 * presence of FP and SSE state.
 	 */
 	if (cpu_has_xsave)
-		fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE;
+		fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
 
 	return ret;
 }
@@ -326,7 +326,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 * presence of FP.
 	 */
 	if (cpu_has_xsave)
-		fpu->state.xsave.header.xfeatures |= XSTATE_FP;
+		fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
 	return ret;
 }
 
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 50ec9af1bd51..eb032677f939 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -107,7 +107,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
 	 * header as well as change any contents in the memory layout.
 	 * xrestore as part of sigreturn will capture all the changes.
 	 */
-	xfeatures |= XSTATE_FPSSE;
+	xfeatures |= XFEATURE_MASK_FPSSE;
 
 	err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures);
 
@@ -207,7 +207,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
 		 * layout and not enabled by the OS.
 		 */
 		if (fx_only)
-			header->xfeatures = XSTATE_FPSSE;
+			header->xfeatures = XFEATURE_MASK_FPSSE;
 		else
 			header->xfeatures &= (xfeatures_mask & xfeatures);
 	}
@@ -230,7 +230,7 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_
 {
 	if (use_xsave()) {
 		if ((unsigned long)buf % 64 || fx_only) {
-			u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE;
+			u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 			return copy_user_to_fxregs(buf);
 		} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 62fc001c7846..6454f2731b56 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -31,12 +31,28 @@ static const char *xfeature_names[] =
  */
 u64 xfeatures_mask __read_mostly;
 
-static unsigned int xstate_offsets[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
-static unsigned int xstate_sizes[XFEATURES_NR_MAX]   = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
+static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] = -1};
 static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
 
-/* The number of supported xfeatures in xfeatures_mask: */
-static unsigned int xfeatures_nr;
+/*
+ * Clear all of the X86_FEATURE_* bits that are unavailable
+ * when the CPU has no XSAVE support.
+ */
+void fpu__xstate_clear_all_cpu_caps(void)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
+}
 
 /*
  * Return whether the system supports a given xfeature.
@@ -53,7 +69,7 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
 		/*
 		 * So we use FLS here to be able to print the most advanced
 		 * feature that was requested but is missing. So if a driver
-		 * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the
+		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
 		 * missing AVX feature - this is the most informative message
 		 * to users:
 		 */
@@ -112,7 +128,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 	/*
 	 * FP is in init state
 	 */
-	if (!(xfeatures & XSTATE_FP)) {
+	if (!(xfeatures & XFEATURE_MASK_FP)) {
 		fx->cwd = 0x37f;
 		fx->swd = 0;
 		fx->twd = 0;
@@ -125,7 +141,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 	/*
 	 * SSE is in init state
 	 */
-	if (!(xfeatures & XSTATE_SSE))
+	if (!(xfeatures & XFEATURE_MASK_SSE))
 		memset(&fx->xmm_space[0], 0, 256);
 
 	/*
@@ -169,25 +185,43 @@ void fpu__init_cpu_xstate(void)
 }
 
 /*
+ * Note that in the future we will likely need a pair of
+ * functions here: one for user xstates and the other for
+ * system xstates.  For now, they are the same.
+ */
+static int xfeature_enabled(enum xfeature xfeature)
+{
+	return !!(xfeatures_mask & (1UL << xfeature));
+}
+
+/*
  * Record the offsets and sizes of various xstates contained
  * in the XSAVE state memory layout.
- *
- * ( Note that certain features might be non-present, for them
- *   we'll have 0 offset and 0 size. )
  */
 static void __init setup_xstate_features(void)
 {
-	u32 eax, ebx, ecx, edx, leaf;
-
-	xfeatures_nr = fls64(xfeatures_mask);
-
-	for (leaf = 2; leaf < xfeatures_nr; leaf++) {
-		cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
-
-		xstate_offsets[leaf] = ebx;
-		xstate_sizes[leaf] = eax;
+	u32 eax, ebx, ecx, edx, i;
+	/* start at the beginnning of the "extended state" */
+	unsigned int last_good_offset = offsetof(struct xregs_state,
+						 extended_state_area);
+
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		if (!xfeature_enabled(i))
+			continue;
+
+		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+		xstate_offsets[i] = ebx;
+		xstate_sizes[i] = eax;
+		/*
+		 * In our xstate size checks, we assume that the
+		 * highest-numbered xstate feature has the
+		 * highest offset in the buffer.  Ensure it does.
+		 */
+		WARN_ONCE(last_good_offset > xstate_offsets[i],
+			"x86/fpu: misordered xstate at %d\n", last_good_offset);
+		last_good_offset = xstate_offsets[i];
 
-		printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax);
+		printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax);
 	}
 }
 
@@ -204,14 +238,14 @@ static void __init print_xstate_feature(u64 xstate_mask)
  */
 static void __init print_xstate_features(void)
 {
-	print_xstate_feature(XSTATE_FP);
-	print_xstate_feature(XSTATE_SSE);
-	print_xstate_feature(XSTATE_YMM);
-	print_xstate_feature(XSTATE_BNDREGS);
-	print_xstate_feature(XSTATE_BNDCSR);
-	print_xstate_feature(XSTATE_OPMASK);
-	print_xstate_feature(XSTATE_ZMM_Hi256);
-	print_xstate_feature(XSTATE_Hi16_ZMM);
+	print_xstate_feature(XFEATURE_MASK_FP);
+	print_xstate_feature(XFEATURE_MASK_SSE);
+	print_xstate_feature(XFEATURE_MASK_YMM);
+	print_xstate_feature(XFEATURE_MASK_BNDREGS);
+	print_xstate_feature(XFEATURE_MASK_BNDCSR);
+	print_xstate_feature(XFEATURE_MASK_OPMASK);
+	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
+	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
 }
 
 /*
@@ -233,8 +267,8 @@ static void __init setup_xstate_comp(void)
 	xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
 
 	if (!cpu_has_xsaves) {
-		for (i = 2; i < xfeatures_nr; i++) {
-			if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
+		for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+			if (xfeature_enabled(i)) {
 				xstate_comp_offsets[i] = xstate_offsets[i];
 				xstate_comp_sizes[i] = xstate_sizes[i];
 			}
@@ -242,15 +276,16 @@ static void __init setup_xstate_comp(void)
 		return;
 	}
 
-	xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+	xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
+		FXSAVE_SIZE + XSAVE_HDR_SIZE;
 
-	for (i = 2; i < xfeatures_nr; i++) {
-		if (test_bit(i, (unsigned long *)&xfeatures_mask))
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		if (xfeature_enabled(i))
 			xstate_comp_sizes[i] = xstate_sizes[i];
 		else
 			xstate_comp_sizes[i] = 0;
 
-		if (i > 2)
+		if (i > FIRST_EXTENDED_XFEATURE)
 			xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
 					+ xstate_comp_sizes[i-1];
 
@@ -290,27 +325,280 @@ static void __init setup_init_fpu_buf(void)
 	copy_xregs_to_kernel_booting(&init_fpstate.xsave);
 }
 
+static int xfeature_is_supervisor(int xfeature_nr)
+{
+	/*
+	 * We currently do not support supervisor states, but if
+	 * we did, we could find out like this.
+	 *
+	 * SDM says: If state component i is a user state component,
+	 * ECX[0] return 0; if state component i is a supervisor
+	 * state component, ECX[0] returns 1.
+	u32 eax, ebx, ecx, edx;
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx;
+	return !!(ecx & 1);
+	*/
+	return 0;
+}
+/*
+static int xfeature_is_user(int xfeature_nr)
+{
+	return !xfeature_is_supervisor(xfeature_nr);
+}
+*/
+
+/*
+ * This check is important because it is easy to get XSTATE_*
+ * confused with XSTATE_BIT_*.
+ */
+#define CHECK_XFEATURE(nr) do {		\
+	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
+	WARN_ON(nr >= XFEATURE_MAX);	\
+} while (0)
+
+/*
+ * We could cache this like xstate_size[], but we only use
+ * it here, so it would be a waste of space.
+ */
+static int xfeature_is_aligned(int xfeature_nr)
+{
+	u32 eax, ebx, ecx, edx;
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+	/*
+	 * The value returned by ECX[1] indicates the alignment
+	 * of state component i when the compacted format
+	 * of the extended region of an XSAVE area is used
+	 */
+	return !!(ecx & 2);
+}
+
+static int xfeature_uncompacted_offset(int xfeature_nr)
+{
+	u32 eax, ebx, ecx, edx;
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+	return ebx;
+}
+
+static int xfeature_size(int xfeature_nr)
+{
+	u32 eax, ebx, ecx, edx;
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+	return eax;
+}
+
+/*
+ * 'XSAVES' implies two different things:
+ * 1. saving of supervisor/system state
+ * 2. using the compacted format
+ *
+ * Use this function when dealing with the compacted format so
+ * that it is obvious which aspect of 'XSAVES' is being handled
+ * by the calling code.
+ */
+static int using_compacted_format(void)
+{
+	return cpu_has_xsaves;
+}
+
+static void __xstate_dump_leaves(void)
+{
+	int i;
+	u32 eax, ebx, ecx, edx;
+	static int should_dump = 1;
+
+	if (!should_dump)
+		return;
+	should_dump = 0;
+	/*
+	 * Dump out a few leaves past the ones that we support
+	 * just in case there are some goodies up there
+	 */
+	for (i = 0; i < XFEATURE_MAX + 10; i++) {
+		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+			XSTATE_CPUID, i, eax, ebx, ecx, edx);
+	}
+}
+
+#define XSTATE_WARN_ON(x) do {							\
+	if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) {	\
+		__xstate_dump_leaves();						\
+	}									\
+} while (0)
+
+#define XCHECK_SZ(sz, nr, nr_macro, __struct) do {			\
+	if ((nr == nr_macro) &&						\
+	    WARN_ONCE(sz != sizeof(__struct),				\
+		"%s: struct is %zu bytes, cpu state %d bytes\n",	\
+		__stringify(nr_macro), sizeof(__struct), sz)) {		\
+		__xstate_dump_leaves();					\
+	}								\
+} while (0)
+
+/*
+ * We have a C struct for each 'xstate'.  We need to ensure
+ * that our software representation matches what the CPU
+ * tells us about the state's size.
+ */
+static void check_xstate_against_struct(int nr)
+{
+	/*
+	 * Ask the CPU for the size of the state.
+	 */
+	int sz = xfeature_size(nr);
+	/*
+	 * Match each CPU state with the corresponding software
+	 * structure.
+	 */
+	XCHECK_SZ(sz, nr, XFEATURE_YMM,       struct ymmh_struct);
+	XCHECK_SZ(sz, nr, XFEATURE_BNDREGS,   struct mpx_bndreg_state);
+	XCHECK_SZ(sz, nr, XFEATURE_BNDCSR,    struct mpx_bndcsr_state);
+	XCHECK_SZ(sz, nr, XFEATURE_OPMASK,    struct avx_512_opmask_state);
+	XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
+	XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM,  struct avx_512_hi16_state);
+
+	/*
+	 * Make *SURE* to add any feature numbers in below if
+	 * there are "holes" in the xsave state component
+	 * numbers.
+	 */
+	if ((nr < XFEATURE_YMM) ||
+	    (nr >= XFEATURE_MAX)) {
+		WARN_ONCE(1, "no structure for xstate: %d\n", nr);
+		XSTATE_WARN_ON(1);
+	}
+}
+
+/*
+ * This essentially double-checks what the cpu told us about
+ * how large the XSAVE buffer needs to be.  We are recalculating
+ * it to be safe.
+ */
+static void do_extra_xstate_size_checks(void)
+{
+	int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+	int i;
+
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		if (!xfeature_enabled(i))
+			continue;
+
+		check_xstate_against_struct(i);
+		/*
+		 * Supervisor state components can be managed only by
+		 * XSAVES, which is compacted-format only.
+		 */
+		if (!using_compacted_format())
+			XSTATE_WARN_ON(xfeature_is_supervisor(i));
+
+		/* Align from the end of the previous feature */
+		if (xfeature_is_aligned(i))
+			paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
+		/*
+		 * The offset of a given state in the non-compacted
+		 * format is given to us in a CPUID leaf.  We check
+		 * them for being ordered (increasing offsets) in
+		 * setup_xstate_features().
+		 */
+		if (!using_compacted_format())
+			paranoid_xstate_size = xfeature_uncompacted_offset(i);
+		/*
+		 * The compacted-format offset always depends on where
+		 * the previous state ended.
+		 */
+		paranoid_xstate_size += xfeature_size(i);
+	}
+	XSTATE_WARN_ON(paranoid_xstate_size != xstate_size);
+}
+
 /*
  * Calculate total size of enabled xstates in XCR0/xfeatures_mask.
+ *
+ * Note the SDM's wording here.  "sub-function 0" only enumerates
+ * the size of the *user* states.  If we use it to size a buffer
+ * that we use 'XSAVES' on, we could potentially overflow the
+ * buffer because 'XSAVES' saves system states too.
+ *
+ * Note that we do not currently set any bits on IA32_XSS so
+ * 'XCR0 | IA32_XSS == XCR0' for now.
  */
-static void __init init_xstate_size(void)
+static unsigned int __init calculate_xstate_size(void)
 {
 	unsigned int eax, ebx, ecx, edx;
-	int i;
+	unsigned int calculated_xstate_size;
 
 	if (!cpu_has_xsaves) {
+		/*
+		 * - CPUID function 0DH, sub-function 0:
+		 *    EBX enumerates the size (in bytes) required by
+		 *    the XSAVE instruction for an XSAVE area
+		 *    containing all the *user* state components
+		 *    corresponding to bits currently set in XCR0.
+		 */
 		cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-		xstate_size = ebx;
-		return;
+		calculated_xstate_size = ebx;
+	} else {
+		/*
+		 * - CPUID function 0DH, sub-function 1:
+		 *    EBX enumerates the size (in bytes) required by
+		 *    the XSAVES instruction for an XSAVE area
+		 *    containing all the state components
+		 *    corresponding to bits currently set in
+		 *    XCR0 | IA32_XSS.
+		 */
+		cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+		calculated_xstate_size = ebx;
 	}
+	return calculated_xstate_size;
+}
 
-	xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-	for (i = 2; i < 64; i++) {
-		if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
-			cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
-			xstate_size += eax;
-		}
-	}
+/*
+ * Will the runtime-enumerated 'xstate_size' fit in the init
+ * task's statically-allocated buffer?
+ */
+static bool is_supported_xstate_size(unsigned int test_xstate_size)
+{
+	if (test_xstate_size <= sizeof(union fpregs_state))
+		return true;
+
+	pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
+			sizeof(union fpregs_state), test_xstate_size);
+	return false;
+}
+
+static int init_xstate_size(void)
+{
+	/* Recompute the context size for enabled features: */
+	unsigned int possible_xstate_size = calculate_xstate_size();
+
+	/* Ensure we have the space to store all enabled: */
+	if (!is_supported_xstate_size(possible_xstate_size))
+		return -EINVAL;
+
+	/*
+	 * The size is OK, we are definitely going to use xsave,
+	 * make it known to the world that we need more space.
+	 */
+	xstate_size = possible_xstate_size;
+	do_extra_xstate_size_checks();
+	return 0;
+}
+
+/*
+ * We enabled the XSAVE hardware, but something went wrong and
+ * we can not use it.  Disable it.
+ */
+static void fpu__init_disable_system_xstate(void)
+{
+	xfeatures_mask = 0;
+	cr4_clear_bits(X86_CR4_OSXSAVE);
+	fpu__xstate_clear_all_cpu_caps();
 }
 
 /*
@@ -321,6 +609,7 @@ void __init fpu__init_system_xstate(void)
 {
 	unsigned int eax, ebx, ecx, edx;
 	static int on_boot_cpu = 1;
+	int err;
 
 	WARN_ON_FPU(!on_boot_cpu);
 	on_boot_cpu = 0;
@@ -338,7 +627,7 @@ void __init fpu__init_system_xstate(void)
 	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
 	xfeatures_mask = eax + ((u64)edx << 32);
 
-	if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+	if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
 		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
 		BUG();
 	}
@@ -348,16 +637,19 @@ void __init fpu__init_system_xstate(void)
 
 	/* Enable xstate instructions to be able to continue with initialization: */
 	fpu__init_cpu_xstate();
-
-	/* Recompute the context size for enabled features: */
-	init_xstate_size();
+	err = init_xstate_size();
+	if (err) {
+		/* something went wrong, boot without any XSAVE support */
+		fpu__init_disable_system_xstate();
+		return;
+	}
 
 	update_regset_xstate_info(xstate_size, xfeatures_mask);
 	fpu__init_prepare_fx_sw_frame();
 	setup_init_fpu_buf();
 	setup_xstate_comp();
 
-	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n",
+	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
 		xfeatures_mask,
 		xstate_size,
 		cpu_has_xsaves ? "compacted" : "standard");
@@ -388,7 +680,7 @@ void fpu__resume_cpu(void)
  * Inputs:
  *	xstate: the thread's storage area for all FPU data
  *	xstate_feature: state which is defined in xsave.h (e.g.
- *	XSTATE_FP, XSTATE_SSE, etc...)
+ *	XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
  * Output:
  *	address of the state in the xsave area, or NULL if the
  *	field is not present in the xsave buffer.
@@ -439,8 +731,8 @@ EXPORT_SYMBOL_GPL(get_xsave_addr);
  * Note that this only works on the current task.
  *
  * Inputs:
- *	@xsave_state: state which is defined in xsave.h (e.g. XSTATE_FP,
- *	XSTATE_SSE, etc...)
+ *	@xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
+ *	XFEATURE_MASK_SSE, etc...)
  * Output:
  *	address of the state in the xsave area or NULL if the state
  *	is not present or is in its 'init state'.
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 346eec73f7db..ade185a46b1d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
-	const struct bndcsr *bndcsr;
+	const struct mpx_bndcsr *bndcsr;
 	siginfo_t *info;
 
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
@@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	 * which is all zeros which indicates MPX was not
 	 * responsible for the exception.
 	 */
-	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 	if (!bndcsr)
 		goto exit_trap;
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2fbea2544f24..156441bcaac8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -30,7 +30,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 	int feature_bit = 0;
 	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 
-	xstate_bv &= XSTATE_EXTEND_MASK;
+	xstate_bv &= XFEATURE_MASK_EXTEND;
 	while (xstate_bv) {
 		if (xstate_bv & 0x1) {
 		        u32 eax, ebx, ecx, edx, offset;
@@ -51,7 +51,7 @@ u64 kvm_supported_xcr0(void)
 	u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
 
 	if (!kvm_x86_ops->mpx_supported())
-		xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
+		xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 
 	return xcr0;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a60bdbccff51..2d4e54db49af 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -662,9 +662,9 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
 	if (index != XCR_XFEATURE_ENABLED_MASK)
 		return 1;
-	if (!(xcr0 & XSTATE_FP))
+	if (!(xcr0 & XFEATURE_MASK_FP))
 		return 1;
-	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
 		return 1;
 
 	/*
@@ -672,23 +672,24 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 	 * saving.  However, xcr0 bit 0 is always set, even if the
 	 * emulated CPU does not support XSAVE (see fx_init).
 	 */
-	valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
+	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
 	if (xcr0 & ~valid_bits)
 		return 1;
 
-	if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
+	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
 		return 1;
 
-	if (xcr0 & XSTATE_AVX512) {
-		if (!(xcr0 & XSTATE_YMM))
+	if (xcr0 & XFEATURE_MASK_AVX512) {
+		if (!(xcr0 & XFEATURE_MASK_YMM))
 			return 1;
-		if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
+		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
 			return 1;
 	}
 	kvm_put_guest_xcr0(vcpu);
 	vcpu->arch.xcr0 = xcr0;
 
-	if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
 		kvm_update_cpuid(vcpu);
 	return 0;
 }
@@ -2906,7 +2907,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 	 * Copy each region from the possibly compacted offset to the
 	 * non-compacted offset.
 	 */
-	valid = xstate_bv & ~XSTATE_FPSSE;
+	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
 	while (valid) {
 		u64 feature = valid & -valid;
 		int index = fls64(feature) - 1;
@@ -2944,7 +2945,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 	 * Copy each region from the non-compacted offset to the
 	 * possibly compacted offset.
 	 */
-	valid = xstate_bv & ~XSTATE_FPSSE;
+	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
 	while (valid) {
 		u64 feature = valid & -valid;
 		int index = fls64(feature) - 1;
@@ -2972,7 +2973,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 			&vcpu->arch.guest_fpu.state.fxsave,
 			sizeof(struct fxregs_state));
 		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
-			XSTATE_FPSSE;
+			XFEATURE_MASK_FPSSE;
 	}
 }
 
@@ -2992,7 +2993,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 		load_xsave(vcpu, (u8 *)guest_xsave->region);
 	} else {
-		if (xstate_bv & ~XSTATE_FPSSE)
+		if (xstate_bv & ~XFEATURE_MASK_FPSSE)
 			return -EINVAL;
 		memcpy(&vcpu->arch.guest_fpu.state.fxsave,
 			guest_xsave->region, sizeof(struct fxregs_state));
@@ -7001,7 +7002,7 @@ static void fx_init(struct kvm_vcpu *vcpu)
 	/*
 	 * Ensure guest xcr0 is valid for loading
 	 */
-	vcpu->arch.xcr0 = XSTATE_FP;
+	vcpu->arch.xcr0 = XFEATURE_MASK_FP;
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
 }
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2f822cd886c2..f2afa5fe48a6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -180,9 +180,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 					  int page_num);
 
-#define KVM_SUPPORTED_XCR0     (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
-				| XSTATE_BNDREGS | XSTATE_BNDCSR \
-				| XSTATE_AVX512)
+#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
 extern u64 host_xcr0;
 
 extern u64 kvm_supported_xcr0(void);
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index dd76a05729b0..024f6e971174 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -169,6 +169,76 @@ void fxch_i(void)
 	fpu_tag_word = tag_word;
 }
 
+static void fcmovCC(void)
+{
+	/* fcmovCC st(i) */
+	int i = FPU_rm;
+	FPU_REG *st0_ptr = &st(0);
+	FPU_REG *sti_ptr = &st(i);
+	long tag_word = fpu_tag_word;
+	int regnr = top & 7;
+	int regnri = (top + i) & 7;
+	u_char sti_tag = (tag_word >> (regnri * 2)) & 3;
+
+	if (sti_tag == TAG_Empty) {
+		FPU_stack_underflow();
+		clear_C1();
+		return;
+	}
+	reg_copy(sti_ptr, st0_ptr);
+	tag_word &= ~(3 << (regnr * 2));
+	tag_word |= (sti_tag << (regnr * 2));
+	fpu_tag_word = tag_word;
+}
+
+void fcmovb(void)
+{
+	if (FPU_EFLAGS & X86_EFLAGS_CF)
+		fcmovCC();
+}
+
+void fcmove(void)
+{
+	if (FPU_EFLAGS & X86_EFLAGS_ZF)
+		fcmovCC();
+}
+
+void fcmovbe(void)
+{
+	if (FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF))
+		fcmovCC();
+}
+
+void fcmovu(void)
+{
+	if (FPU_EFLAGS & X86_EFLAGS_PF)
+		fcmovCC();
+}
+
+void fcmovnb(void)
+{
+	if (!(FPU_EFLAGS & X86_EFLAGS_CF))
+		fcmovCC();
+}
+
+void fcmovne(void)
+{
+	if (!(FPU_EFLAGS & X86_EFLAGS_ZF))
+		fcmovCC();
+}
+
+void fcmovnbe(void)
+{
+	if (!(FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF)))
+		fcmovCC();
+}
+
+void fcmovnu(void)
+{
+	if (!(FPU_EFLAGS & X86_EFLAGS_PF))
+		fcmovCC();
+}
+
 void ffree_(void)
 {
 	/* ffree st(i) */
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 3d8f2e421466..e945fedf1de2 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -40,49 +40,33 @@
 
 #define __BAD__ FPU_illegal	/* Illegal on an 80486, causes SIGILL */
 
-#ifndef NO_UNDOC_CODE		/* Un-documented FPU op-codes supported by default. */
+/* fcmovCC and f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */
 
-/* WARNING: These codes are not documented by Intel in their 80486 manual
-   and may not work on FPU clones or later Intel FPUs. */
-
-/* Changes to support the un-doc codes provided by Linus Torvalds. */
-
-#define _d9_d8_ fstp_i		/* unofficial code (19) */
-#define _dc_d0_ fcom_st		/* unofficial code (14) */
-#define _dc_d8_ fcompst		/* unofficial code (1c) */
-#define _dd_c8_ fxch_i		/* unofficial code (0d) */
-#define _de_d0_ fcompst		/* unofficial code (16) */
-#define _df_c0_ ffreep		/* unofficial code (07) ffree + pop */
-#define _df_c8_ fxch_i		/* unofficial code (0f) */
-#define _df_d0_ fstp_i		/* unofficial code (17) */
-#define _df_d8_ fstp_i		/* unofficial code (1f) */
+/* WARNING: "u" entries are not documented by Intel in their 80486 manual
+   and may not work on FPU clones or later Intel FPUs.
+   Changes to support them provided by Linus Torvalds. */
 
 static FUNC const st_instr_table[64] = {
-	fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_,
-	fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_,
-	fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_,
-	fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_,
-	fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
-	fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
-	fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
-	fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
+/* Opcode:	d8		d9		da		db */
+/*		dc		dd		de		df */
+/* c0..7 */	fadd__,		fld_i_,		fcmovb,		fcmovnb,
+/* c0..7 */	fadd_i,		ffree_,		faddp_,		ffreep,/*u*/
+/* c8..f */	fmul__,		fxch_i,		fcmove,		fcmovne,
+/* c8..f */	fmul_i,		fxch_i,/*u*/	fmulp_,		fxch_i,/*u*/
+/* d0..7 */	fcom_st,	fp_nop,		fcmovbe,	fcmovnbe,
+/* d0..7 */	fcom_st,/*u*/	fst_i_,		fcompst,/*u*/	fstp_i,/*u*/
+/* d8..f */	fcompst,	fstp_i,/*u*/	fcmovu,		fcmovnu,
+/* d8..f */	fcompst,/*u*/	fstp_i,		fcompp,		fstp_i,/*u*/
+/* e0..7 */	fsub__,		FPU_etc,	__BAD__,	finit_,
+/* e0..7 */	fsubri,		fucom_,		fsubrp,		fstsw_,
+/* e8..f */	fsubr_,		fconst,		fucompp,	fucomi_,
+/* e8..f */	fsub_i,		fucomp,		fsubp_,		fucomip,
+/* f0..7 */	fdiv__,		FPU_triga,	__BAD__,	fcomi_,
+/* f0..7 */	fdivri,		__BAD__,	fdivrp,		fcomip,
+/* f8..f */	fdivr_,		FPU_trigb,	__BAD__,	__BAD__,
+/* f8..f */	fdiv_i,		__BAD__,	fdivp_,		__BAD__,
 };
 
-#else /* Support only documented FPU op-codes */
-
-static FUNC const st_instr_table[64] = {
-	fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__,
-	fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__,
-	fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__,
-	fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__,
-	fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
-	fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
-	fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
-	fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
-};
-
-#endif /* NO_UNDOC_CODE */
-
 #define _NONE_ 0		/* Take no special action */
 #define _REG0_ 1		/* Need to check for not empty st(0) */
 #define _REGI_ 2		/* Need to check for not empty st(0) and st(rm) */
@@ -94,36 +78,18 @@ static FUNC const st_instr_table[64] = {
 #define _REGIc 0		/* Compare st(0) and st(rm) */
 #define _REGIn 0		/* Uses st(0) and st(rm), but handle checks later */
 
-#ifndef NO_UNDOC_CODE
-
-/* Un-documented FPU op-codes supported by default. (see above) */
-
 static u_char const type_table[64] = {
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
-	_REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
-	_REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
-	_REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
-	_REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
-	_REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
+/* Opcode:	d8	d9	da	db	dc	dd	de	df */
+/* c0..7 */	_REGI_, _NONE_, _REGIn, _REGIn, _REGIi, _REGi_, _REGIp, _REGi_,
+/* c8..f */	_REGI_, _REGIn, _REGIn, _REGIn, _REGIi, _REGI_, _REGIp, _REGI_,
+/* d0..7 */	_REGIc, _NONE_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_,
+/* d8..f */	_REGIc, _REG0_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_,
+/* e0..7 */	_REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
+/* e8..f */	_REGI_, _NONE_, _REGIc, _REGIc, _REGIi, _REGIc, _REGIp, _REGIc,
+/* f0..7 */	_REGI_, _NONE_, _null_, _REGIc, _REGIi, _null_, _REGIp, _REGIc,
+/* f8..f */	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
 };
 
-#else /* Support only documented FPU op-codes */
-
-static u_char const type_table[64] = {
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
-	_REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-	_REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
-	_REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
-	_REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
-	_REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
-};
-
-#endif /* NO_UNDOC_CODE */
-
 #ifdef RE_ENTRANT_CHECKING
 u_char emulating = 0;
 #endif /* RE_ENTRANT_CHECKING */
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
index 9779df436b7d..caff438b9c1d 100644
--- a/arch/x86/math-emu/fpu_proto.h
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -46,6 +46,14 @@ extern void fstsw_(void);
 extern void fp_nop(void);
 extern void fld_i_(void);
 extern void fxch_i(void);
+extern void fcmovb(void);
+extern void fcmove(void);
+extern void fcmovbe(void);
+extern void fcmovu(void);
+extern void fcmovnb(void);
+extern void fcmovne(void);
+extern void fcmovnbe(void);
+extern void fcmovnu(void);
 extern void ffree_(void);
 extern void ffreep(void);
 extern void fst_i_(void);
@@ -108,6 +116,10 @@ extern void fcompp(void);
 extern void fucom_(void);
 extern void fucomp(void);
 extern void fucompp(void);
+extern void fcomi_(void);
+extern void fcomip(void);
+extern void fucomi_(void);
+extern void fucomip(void);
 /* reg_constant.c */
 extern void fconst(void);
 /* reg_ld_str.c */
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
index 2931ff355218..95228ff042c0 100644
--- a/arch/x86/math-emu/load_store.c
+++ b/arch/x86/math-emu/load_store.c
@@ -33,11 +33,12 @@
 
 #define pop_0()	{ FPU_settag0(TAG_Empty); top++; }
 
+/* index is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */
 static u_char const type_table[32] = {
-	_PUSH_, _PUSH_, _PUSH_, _PUSH_,
-	_null_, _null_, _null_, _null_,
-	_REG0_, _REG0_, _REG0_, _REG0_,
-	_REG0_, _REG0_, _REG0_, _REG0_,
+	_PUSH_, _PUSH_, _PUSH_, _PUSH_, /* /0: d9:fld f32,  db:fild m32,  dd:fld f64,  df:fild m16 */
+	_null_, _REG0_, _REG0_, _REG0_, /* /1: d9:undef,    db,dd,df:fisttp m32/64/16 */
+	_REG0_, _REG0_, _REG0_, _REG0_, /* /2: d9:fst f32,  db:fist m32,  dd:fst f64,  df:fist m16 */
+	_REG0_, _REG0_, _REG0_, _REG0_, /* /3: d9:fstp f32, db:fistp m32, dd:fstp f64, df:fistp m16 */
 	_NONE_, _null_, _NONE_, _PUSH_,
 	_NONE_, _PUSH_, _null_, _PUSH_,
 	_NONE_, _null_, _NONE_, _REG0_,
@@ -45,15 +46,19 @@ static u_char const type_table[32] = {
 };
 
 u_char const data_sizes_16[32] = {
-	4, 4, 8, 2, 0, 0, 0, 0,
-	4, 4, 8, 2, 4, 4, 8, 2,
+	4, 4, 8, 2,
+	0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */
+	4, 4, 8, 2,
+	4, 4, 8, 2,
 	14, 0, 94, 10, 2, 10, 0, 8,
 	14, 0, 94, 10, 2, 10, 2, 8
 };
 
 static u_char const data_sizes_32[32] = {
-	4, 4, 8, 2, 0, 0, 0, 0,
-	4, 4, 8, 2, 4, 4, 8, 2,
+	4, 4, 8, 2,
+	0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */
+	4, 4, 8, 2,
+	4, 4, 8, 2,
 	28, 0, 108, 10, 2, 10, 0, 8,
 	28, 0, 108, 10, 2, 10, 2, 8
 };
@@ -65,6 +70,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 	FPU_REG *st0_ptr;
 	u_char st0_tag = TAG_Empty;	/* This is just to stop a gcc warning. */
 	u_char loaded_tag;
+	int sv_cw;
 
 	st0_ptr = NULL;		/* Initialized just to stop compiler warnings. */
 
@@ -111,7 +117,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 	}
 
 	switch (type) {
-	case 000:		/* fld m32real */
+	/* type is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */
+	case 000:		/* fld m32real (d9 /0) */
 		clear_C1();
 		loaded_tag =
 		    FPU_load_single((float __user *)data_address, &loaded_data);
@@ -123,13 +130,13 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 		}
 		FPU_copy_to_reg0(&loaded_data, loaded_tag);
 		break;
-	case 001:		/* fild m32int */
+	case 001:		/* fild m32int (db /0) */
 		clear_C1();
 		loaded_tag =
 		    FPU_load_int32((long __user *)data_address, &loaded_data);
 		FPU_copy_to_reg0(&loaded_data, loaded_tag);
 		break;
-	case 002:		/* fld m64real */
+	case 002:		/* fld m64real (dd /0) */
 		clear_C1();
 		loaded_tag =
 		    FPU_load_double((double __user *)data_address,
@@ -142,12 +149,44 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 		}
 		FPU_copy_to_reg0(&loaded_data, loaded_tag);
 		break;
-	case 003:		/* fild m16int */
+	case 003:		/* fild m16int (df /0) */
 		clear_C1();
 		loaded_tag =
 		    FPU_load_int16((short __user *)data_address, &loaded_data);
 		FPU_copy_to_reg0(&loaded_data, loaded_tag);
 		break;
+	/* case 004: undefined (d9 /1) */
+	/* fisttp are enabled if CPUID(1).ECX(0) "sse3" is set */
+	case 005:		/* fisttp m32int (db /1) */
+		clear_C1();
+		sv_cw = control_word;
+		control_word |= RC_CHOP;
+		if (FPU_store_int32
+		    (st0_ptr, st0_tag, (long __user *)data_address))
+			pop_0();	/* pop only if the number was actually stored
+					   (see the 80486 manual p16-28) */
+		control_word = sv_cw;
+		break;
+	case 006:		/* fisttp m64int (dd /1) */
+		clear_C1();
+		sv_cw = control_word;
+		control_word |= RC_CHOP;
+		if (FPU_store_int64
+		    (st0_ptr, st0_tag, (long long __user *)data_address))
+			pop_0();	/* pop only if the number was actually stored
+					   (see the 80486 manual p16-28) */
+		control_word = sv_cw;
+		break;
+	case 007:		/* fisttp m16int (df /1) */
+		clear_C1();
+		sv_cw = control_word;
+		control_word |= RC_CHOP;
+		if (FPU_store_int16
+		    (st0_ptr, st0_tag, (short __user *)data_address))
+			pop_0();	/* pop only if the number was actually stored
+					   (see the 80486 manual p16-28) */
+		control_word = sv_cw;
+		break;
 	case 010:		/* fst m32real */
 		clear_C1();
 		FPU_store_single(st0_ptr, st0_tag,
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
index ecce55fc2e2e..b77360fdbf4a 100644
--- a/arch/x86/math-emu/reg_compare.c
+++ b/arch/x86/math-emu/reg_compare.c
@@ -249,6 +249,54 @@ static int compare_st_st(int nr)
 	return 0;
 }
 
+static int compare_i_st_st(int nr)
+{
+	int f, c;
+	FPU_REG *st_ptr;
+
+	if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+		FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+		/* Stack fault */
+		EXCEPTION(EX_StackUnder);
+		return !(control_word & CW_Invalid);
+	}
+
+	partial_status &= ~SW_C0;
+	st_ptr = &st(nr);
+	c = compare(st_ptr, FPU_gettagi(nr));
+	if (c & COMP_NaN) {
+		FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+		EXCEPTION(EX_Invalid);
+		return !(control_word & CW_Invalid);
+	}
+
+	switch (c & 7) {
+	case COMP_A_lt_B:
+		f = X86_EFLAGS_CF;
+		break;
+	case COMP_A_eq_B:
+		f = X86_EFLAGS_ZF;
+		break;
+	case COMP_A_gt_B:
+		f = 0;
+		break;
+	case COMP_No_Comp:
+		f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
+		break;
+#ifdef PARANOID
+	default:
+		EXCEPTION(EX_INTERNAL | 0x122);
+		f = 0;
+		break;
+#endif /* PARANOID */
+	}
+	FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
+	if (c & COMP_Denormal) {
+		return denormal_operand() < 0;
+	}
+	return 0;
+}
+
 static int compare_u_st_st(int nr)
 {
 	int f = 0, c;
@@ -299,6 +347,58 @@ static int compare_u_st_st(int nr)
 	return 0;
 }
 
+static int compare_ui_st_st(int nr)
+{
+	int f = 0, c;
+	FPU_REG *st_ptr;
+
+	if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+		FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+		/* Stack fault */
+		EXCEPTION(EX_StackUnder);
+		return !(control_word & CW_Invalid);
+	}
+
+	partial_status &= ~SW_C0;
+	st_ptr = &st(nr);
+	c = compare(st_ptr, FPU_gettagi(nr));
+	if (c & COMP_NaN) {
+		FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+		if (c & COMP_SNaN) {	/* This is the only difference between
+					   un-ordered and ordinary comparisons */
+			EXCEPTION(EX_Invalid);
+			return !(control_word & CW_Invalid);
+		}
+		return 0;
+	}
+
+	switch (c & 7) {
+	case COMP_A_lt_B:
+		f = X86_EFLAGS_CF;
+		break;
+	case COMP_A_eq_B:
+		f = X86_EFLAGS_ZF;
+		break;
+	case COMP_A_gt_B:
+		f = 0;
+		break;
+	case COMP_No_Comp:
+		f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
+		break;
+#ifdef PARANOID
+	default:
+		EXCEPTION(EX_INTERNAL | 0x123);
+		f = 0;
+		break;
+#endif /* PARANOID */
+	}
+	FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
+	if (c & COMP_Denormal) {
+		return denormal_operand() < 0;
+	}
+	return 0;
+}
+
 /*---------------------------------------------------------------------------*/
 
 void fcom_st(void)
@@ -348,3 +448,31 @@ void fucompp(void)
 	} else
 		FPU_illegal();
 }
+
+/* P6+ compare-to-EFLAGS ops */
+
+void fcomi_(void)
+{
+	/* fcomi st(i) */
+	compare_i_st_st(FPU_rm);
+}
+
+void fcomip(void)
+{
+	/* fcomip st(i) */
+	if (!compare_i_st_st(FPU_rm))
+		FPU_pop();
+}
+
+void fucomi_(void)
+{
+	/* fucomi st(i) */
+	compare_ui_st_st(FPU_rm);
+}
+
+void fucomip(void)
+{
+	/* fucomip st(i) */
+	if (!compare_ui_st_st(FPU_rm))
+		FPU_pop();
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 134948b0926f..b0ae85f90f10 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -237,7 +237,8 @@ static int mpx_insn_decode(struct insn *insn,
  */
 siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
-	const struct bndreg *bndregs, *bndreg;
+	const struct mpx_bndreg_state *bndregs;
+	const struct mpx_bndreg *bndreg;
 	siginfo_t *info = NULL;
 	struct insn insn;
 	uint8_t bndregno;
@@ -258,13 +259,13 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 		goto err_out;
 	}
 	/* get bndregs field from current task's xsave area */
-	bndregs = get_xsave_field_ptr(XSTATE_BNDREGS);
+	bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS);
 	if (!bndregs) {
 		err = -EINVAL;
 		goto err_out;
 	}
 	/* now go select the individual register in the set of 4 */
-	bndreg = &bndregs[bndregno];
+	bndreg = &bndregs->bndreg[bndregno];
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info) {
@@ -306,7 +307,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 
 static __user void *mpx_get_bounds_dir(void)
 {
-	const struct bndcsr *bndcsr;
+	const struct mpx_bndcsr *bndcsr;
 
 	if (!cpu_feature_enabled(X86_FEATURE_MPX))
 		return MPX_INVALID_BOUNDS_DIR;
@@ -315,7 +316,7 @@ static __user void *mpx_get_bounds_dir(void)
 	 * The bounds directory pointer is stored in a register
 	 * only accessible if we first do an xsave.
 	 */
-	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 	if (!bndcsr)
 		return MPX_INVALID_BOUNDS_DIR;
 
@@ -489,10 +490,10 @@ static int allocate_bt(struct mm_struct *mm, long __user *bd_entry)
 static int do_mpx_bt_fault(void)
 {
 	unsigned long bd_entry, bd_base;
-	const struct bndcsr *bndcsr;
+	const struct mpx_bndcsr *bndcsr;
 	struct mm_struct *mm = current->mm;
 
-	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 	if (!bndcsr)
 		return -EINVAL;
 	/*
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 29089b24d18b..7145b3d9030c 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -5,7 +5,8 @@ include ../lib.mk
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn \
+			test_FCMOV test_FCOMI test_FISTTP
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
@@ -35,7 +36,7 @@ all_64: $(BINARIES_64)
 	$(RM) $(BINARIES_32) $(BINARIES_64)
 
 $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
-	$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
+	$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
 
 $(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
 	$(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
diff --git a/tools/testing/selftests/x86/test_FCMOV.c b/tools/testing/selftests/x86/test_FCMOV.c
new file mode 100644
index 000000000000..4adcca0c80c4
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FCMOV.c
@@ -0,0 +1,93 @@
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+
+#define TEST(insn) \
+long double __attribute__((noinline)) insn(long flags) \
+{						\
+	long double out;			\
+	asm ("\n"				\
+	"	push	%1""\n"			\
+	"	popf""\n"			\
+	"	fldpi""\n"			\
+	"	fld1""\n"			\
+	"	" #insn " %%st(1), %%st" "\n"	\
+	"	ffree	%%st(1)" "\n"		\
+	: "=t" (out)				\
+	: "r" (flags)				\
+	);					\
+	return out;				\
+}
+
+TEST(fcmovb)
+TEST(fcmove)
+TEST(fcmovbe)
+TEST(fcmovu)
+TEST(fcmovnb)
+TEST(fcmovne)
+TEST(fcmovnbe)
+TEST(fcmovnu)
+
+enum {
+	CF = 1 << 0,
+	PF = 1 << 2,
+	ZF = 1 << 6,
+};
+
+void sighandler(int sig)
+{
+	printf("[FAIL]\tGot signal %d, exiting\n", sig);
+	exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+	int err = 0;
+
+	/* SIGILL triggers on 32-bit kernels w/o fcomi emulation
+	 * when run with "no387 nofxsr". Other signals are caught
+	 * just in case.
+	 */
+	signal(SIGILL, sighandler);
+	signal(SIGFPE, sighandler);
+	signal(SIGSEGV, sighandler);
+
+	printf("[RUN]\tTesting fcmovCC instructions\n");
+	/* If fcmovCC() returns 1.0, the move wasn't done */
+	err |= !(fcmovb(0)   == 1.0); err |= !(fcmovnb(0)  != 1.0);
+	err |= !(fcmove(0)   == 1.0); err |= !(fcmovne(0)  != 1.0);
+	err |= !(fcmovbe(0)  == 1.0); err |= !(fcmovnbe(0) != 1.0);
+	err |= !(fcmovu(0)   == 1.0); err |= !(fcmovnu(0)  != 1.0);
+
+	err |= !(fcmovb(CF)  != 1.0); err |= !(fcmovnb(CF)  == 1.0);
+	err |= !(fcmove(CF)  == 1.0); err |= !(fcmovne(CF)  != 1.0);
+	err |= !(fcmovbe(CF) != 1.0); err |= !(fcmovnbe(CF) == 1.0);
+	err |= !(fcmovu(CF)  == 1.0); err |= !(fcmovnu(CF)  != 1.0);
+
+	err |= !(fcmovb(ZF)  == 1.0); err |= !(fcmovnb(ZF)  != 1.0);
+	err |= !(fcmove(ZF)  != 1.0); err |= !(fcmovne(ZF)  == 1.0);
+	err |= !(fcmovbe(ZF) != 1.0); err |= !(fcmovnbe(ZF) == 1.0);
+	err |= !(fcmovu(ZF)  == 1.0); err |= !(fcmovnu(ZF)  != 1.0);
+
+	err |= !(fcmovb(PF)  == 1.0); err |= !(fcmovnb(PF)  != 1.0);
+	err |= !(fcmove(PF)  == 1.0); err |= !(fcmovne(PF)  != 1.0);
+	err |= !(fcmovbe(PF) == 1.0); err |= !(fcmovnbe(PF) != 1.0);
+	err |= !(fcmovu(PF)  != 1.0); err |= !(fcmovnu(PF)  == 1.0);
+
+        if (!err)
+                printf("[OK]\tfcmovCC\n");
+	else
+		printf("[FAIL]\tfcmovCC errors: %d\n", err);
+
+	return err;
+}
diff --git a/tools/testing/selftests/x86/test_FCOMI.c b/tools/testing/selftests/x86/test_FCOMI.c
new file mode 100644
index 000000000000..db4933e31af9
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FCOMI.c
@@ -0,0 +1,331 @@
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <fenv.h>
+
+enum {
+	CF = 1 << 0,
+	PF = 1 << 2,
+	ZF = 1 << 6,
+	ARITH = CF | PF | ZF,
+};
+
+long res_fcomi_pi_1;
+long res_fcomi_1_pi;
+long res_fcomi_1_1;
+long res_fcomi_nan_1;
+/* sNaN is s|111 1111 1|1xx xxxx xxxx xxxx xxxx xxxx */
+/* qNaN is s|111 1111 1|0xx xxxx xxxx xxxx xxxx xxxx (some x must be nonzero) */
+int snan = 0x7fc11111;
+int qnan = 0x7f811111;
+unsigned short snan1[5];
+/* sNaN80 is s|111 1111 1111 1111 |10xx xx...xx (some x must be nonzero) */
+unsigned short snan80[5] = { 0x1111, 0x1111, 0x1111, 0x8111, 0x7fff };
+
+int test(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fld1""\n"
+	"	fldpi""\n"
+	"	fcomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_1_pi""\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fldpi""\n"
+	"	fld1""\n"
+	"	fcomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_pi_1""\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fld1""\n"
+	"	fld1""\n"
+	"	fcomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_1_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_1_pi & ARITH) != (0)) {
+		printf("[BAD]\tfcomi_1_pi with flags:%lx\n", flags);
+		return 1;
+	}
+	if ((res_fcomi_pi_1 & ARITH) != (CF)) {
+		printf("[BAD]\tfcomi_pi_1 with flags:%lx->%lx\n", flags, res_fcomi_pi_1 & ARITH);
+		return 1;
+	}
+	if ((res_fcomi_1_1 & ARITH) != (ZF)) {
+		printf("[BAD]\tfcomi_1_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != 0) {
+		printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int test_qnan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+	"	flds	qnan""\n"
+	"	fld1""\n"
+	"	fnclex""\n"		// fld of a qnan raised FE_INVALID, clear it
+	"	fcomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != FE_INVALID) {
+		printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testu_qnan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+	"	flds	qnan""\n"
+	"	fld1""\n"
+	"	fnclex""\n"		// fld of a qnan raised FE_INVALID, clear it
+	"	fucomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != 0) {
+		printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testu_snan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+//	"	flds	snan""\n"	// WRONG, this will convert 32-bit fp snan to a *qnan* in 80-bit fp register!
+//	"	fstpt	snan1""\n"	// if uncommented, it prints "snan1:7fff c111 1100 0000 0000" - c111, not 8111!
+//	"	fnclex""\n"		// flds of a snan raised FE_INVALID, clear it
+	"	fldt	snan80""\n"	// fldt never raise FE_INVALID
+	"	fld1""\n"
+	"	fucomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+//	printf("snan:%x snan1:%04x %04x %04x %04x %04x\n", snan, snan1[4], snan1[3], snan1[2], snan1[1], snan1[0]);
+	if (fetestexcept(FE_INVALID) != FE_INVALID) {
+		printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testp(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fld1""\n"
+	"	fldpi""\n"
+	"	fcomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_1_pi""\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fldpi""\n"
+	"	fld1""\n"
+	"	fcomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_pi_1""\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fld1""\n"
+	"	fld1""\n"
+	"	fcomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_1_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_1_pi & ARITH) != (0)) {
+		printf("[BAD]\tfcomi_1_pi with flags:%lx\n", flags);
+		return 1;
+	}
+	if ((res_fcomi_pi_1 & ARITH) != (CF)) {
+		printf("[BAD]\tfcomi_pi_1 with flags:%lx->%lx\n", flags, res_fcomi_pi_1 & ARITH);
+		return 1;
+	}
+	if ((res_fcomi_1_1 & ARITH) != (ZF)) {
+		printf("[BAD]\tfcomi_1_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != 0) {
+		printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testp_qnan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+	"	flds	qnan""\n"
+	"	fld1""\n"
+	"	fnclex""\n"		// fld of a qnan raised FE_INVALID, clear it
+	"	fcomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != FE_INVALID) {
+		printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testup_qnan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+	"	flds	qnan""\n"
+	"	fld1""\n"
+	"	fnclex""\n"		// fld of a qnan raised FE_INVALID, clear it
+	"	fucomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != 0) {
+		printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+void sighandler(int sig)
+{
+	printf("[FAIL]\tGot signal %d, exiting\n", sig);
+	exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+	int err = 0;
+
+	/* SIGILL triggers on 32-bit kernels w/o fcomi emulation
+	 * when run with "no387 nofxsr". Other signals are caught
+	 * just in case.
+	 */
+	signal(SIGILL, sighandler);
+	signal(SIGFPE, sighandler);
+	signal(SIGSEGV, sighandler);
+
+	printf("[RUN]\tTesting f[u]comi[p] instructions\n");
+	err |= test(0);
+	err |= test_qnan(0);
+	err |= testu_qnan(0);
+	err |= testu_snan(0);
+	err |= test(CF|ZF|PF);
+	err |= test_qnan(CF|ZF|PF);
+	err |= testu_qnan(CF|ZF|PF);
+	err |= testu_snan(CF|ZF|PF);
+	err |= testp(0);
+	err |= testp_qnan(0);
+	err |= testup_qnan(0);
+	err |= testp(CF|ZF|PF);
+	err |= testp_qnan(CF|ZF|PF);
+	err |= testup_qnan(CF|ZF|PF);
+	if (!err)
+		printf("[OK]\tf[u]comi[p]\n");
+	else
+		printf("[FAIL]\tf[u]comi[p] errors: %d\n", err);
+
+	return err;
+}
diff --git a/tools/testing/selftests/x86/test_FISTTP.c b/tools/testing/selftests/x86/test_FISTTP.c
new file mode 100644
index 000000000000..b8e61a047f6b
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FISTTP.c
@@ -0,0 +1,137 @@
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <fenv.h>
+
+unsigned long long res64 = -1;
+unsigned int res32 = -1;
+unsigned short res16 = -1;
+
+int test(void)
+{
+	int ex;
+
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	asm volatile ("\n"
+	"	fld1""\n"
+	"	fisttp	res16""\n"
+	"	fld1""\n"
+	"	fisttpl	res32""\n"
+	"	fld1""\n"
+	"	fisttpll res64""\n"
+	: : : "memory"
+	);
+	if (res16 != 1 || res32 != 1 || res64 != 1) {
+		printf("[BAD]\tfisttp 1\n");
+		return 1;
+	}
+	ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	if (ex != 0) {
+		printf("[BAD]\tfisttp 1: wrong exception state\n");
+		return 1;
+	}
+
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	asm volatile ("\n"
+	"	fldpi""\n"
+	"	fisttp	res16""\n"
+	"	fldpi""\n"
+	"	fisttpl	res32""\n"
+	"	fldpi""\n"
+	"	fisttpll res64""\n"
+	: : : "memory"
+	);
+	if (res16 != 3 || res32 != 3 || res64 != 3) {
+		printf("[BAD]\tfisttp pi\n");
+		return 1;
+	}
+	ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	if (ex != FE_INEXACT) {
+		printf("[BAD]\tfisttp pi: wrong exception state\n");
+		return 1;
+	}
+
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	asm volatile ("\n"
+	"	fldpi""\n"
+	"	fchs""\n"
+	"	fisttp	res16""\n"
+	"	fldpi""\n"
+	"	fchs""\n"
+	"	fisttpl	res32""\n"
+	"	fldpi""\n"
+	"	fchs""\n"
+	"	fisttpll res64""\n"
+	: : : "memory"
+	);
+	if (res16 != 0xfffd || res32 != 0xfffffffd || res64 != 0xfffffffffffffffdULL) {
+		printf("[BAD]\tfisttp -pi\n");
+		return 1;
+	}
+	ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	if (ex != FE_INEXACT) {
+		printf("[BAD]\tfisttp -pi: wrong exception state\n");
+		return 1;
+	}
+
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	asm volatile ("\n"
+	"	fldln2""\n"
+	"	fisttp	res16""\n"
+	"	fldln2""\n"
+	"	fisttpl	res32""\n"
+	"	fldln2""\n"
+	"	fisttpll res64""\n"
+	: : : "memory"
+	);
+	/* Test truncation to zero (round-to-nearest would give 1 here) */
+	if (res16 != 0 || res32 != 0 || res64 != 0) {
+		printf("[BAD]\tfisttp ln2\n");
+		return 1;
+	}
+	ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	if (ex != FE_INEXACT) {
+		printf("[BAD]\tfisttp ln2: wrong exception state\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+void sighandler(int sig)
+{
+	printf("[FAIL]\tGot signal %d, exiting\n", sig);
+	exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+	int err = 0;
+
+	/* SIGILL triggers on 32-bit kernels w/o fisttp emulation
+	 * when run with "no387 nofxsr". Other signals are caught
+	 * just in case.
+	 */
+	signal(SIGILL, sighandler);
+	signal(SIGFPE, sighandler);
+	signal(SIGSEGV, sighandler);
+
+	printf("[RUN]\tTesting fisttp instructions\n");
+	err |= test();
+	if (!err)
+		printf("[OK]\tfisttp\n");
+	else
+		printf("[FAIL]\tfisttp errors: %d\n", err);
+
+	return err;
+}

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2015-11-03 11:09 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-11-03 11:08 [GIT PULL] x86/fpu changes for v4.4 Ingo Molnar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.