Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH RESEND v9 12/36] x86/fred: Update MSR_IA32_FRED_RSP0 during task switch
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

MSR_IA32_FRED_RSP0 is used during ring 3 event delivery, and needs to
be updated to point to the top of the next task's stack during a task
switch.

Update MSR_IA32_FRED_RSP0 with the WRMSR instruction for now; this will
be switched to WRMSRNS/WRMSRLIST for performance once support for those
instructions is upstreamed.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/switch_to.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index f42dbf17f52b..6c911fd400b2 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -70,9 +70,16 @@ static inline void update_task_stack(struct task_struct *task)
 #ifdef CONFIG_X86_32
 	this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-	/* Xen PV enters the kernel on the thread stack. */
-	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+		/*
+		 * Will use WRMSRNS/WRMSRLIST for performance once it's upstreamed.
+		 */
+		wrmsrl(MSR_IA32_FRED_RSP0,
+		       (unsigned long)task_stack_page(task) + THREAD_SIZE);
+	} else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
+		/* Xen PV enters the kernel on the thread stack. */
 		load_sp0(task_top_of_stack(task));
+	}
 #endif
 }
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 09/36] x86/fred: Make unions for the cs and ss fields in struct pt_regs
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Make the cs and ss fields in struct pt_regs unions between the actual
selector and the unsigned long stack slot. FRED uses this space to
store additional flags.

The printk changes are needed simply because the cs and ss fields have
changed from unsigned long to unsigned short.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---

Changes since v8:
* Reflect stack frame definition changes from FRED spec 3.0 to 5.0.
* Use __packed instead of __attribute__((__packed__)) (Borislav Petkov).
* Put all comments above the members, like the rest of the file does
  (Borislav Petkov).

Changes since v3:
* Rename csl/ssl of the pt_regs structure to csx/ssx (x for extended)
  (Andrew Cooper).
---
 arch/x86/entry/vsyscall/vsyscall_64.c |  2 +-
 arch/x86/include/asm/ptrace.h         | 57 +++++++++++++++++++++++++--
 arch/x86/kernel/process_64.c          |  2 +-
 3 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index e0ca8120aea8..a3c0df11d0e6 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 	if (!show_unhandled_signals)
 		return;
 
-	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
 			   level, current->comm, task_pid_nr(current),
 			   message, regs->ip, regs->cs,
 			   regs->sp, regs->ax, regs->si, regs->di);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index f4db78b09c8f..f1690beffd15 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -80,15 +80,66 @@ struct pt_regs {
 /*
  * On syscall entry, this is syscall#. On CPU exception, this is error code.
  * On hw interrupt, it's IRQ number:
+ *
+ * A FRED stack frame starts here:
+ *   1) It _always_ includes an error code;
+ *   2) The return frame for eretu/erets starts here.
  */
 	unsigned long orig_ax;
 /* Return frame for iretq */
 	unsigned long ip;
-	unsigned long cs;
+	union {
+/* CS extended: CS + any fields above it */
+		unsigned long csx;
+		struct {
+/* CS selector proper */
+			unsigned short cs;
+/* The stack level (SL) at the time the event occurred */
+			unsigned int sl		: 2;
+/* Set to indicate that indirect branch tracker in WAIT_FOR_ENDBRANCH state */
+			unsigned int wfe	: 1;
+			unsigned int __csx_resv1: 13;
+			unsigned int __csx_resv2: 32;
+		} __packed;
+	};
 	unsigned long flags;
 	unsigned long sp;
-	unsigned long ss;
-/* top of stack page */
+	union {
+/* SS extended: SS + any fields above it */
+		unsigned long ssx;
+		struct {
+/* SS selector proper */
+			unsigned short ss;
+/* Set to indicate that interrupt blocking by STI was in effect */
+			unsigned int sti	: 1;
+/* For SYSCALL, SYSENTER, or INT n (for any value of n) */
+			unsigned int sys	: 1;
+			unsigned int nmi	: 1;
+			unsigned int __ssx_resv1: 13;
+/* Event information fields, ignored by the FRED return instructions */
+			unsigned int vector	: 8;
+			unsigned int __ssx_resv2: 8;
+			unsigned int type	: 4;
+			unsigned int __ssx_resv3: 4;
+/* Set to indicate that the event was incident to enclave execution */
+			unsigned int enc	: 1;
+/* Set to indicate that the logical processor had been in 64-bit mode */
+			unsigned int l		: 1;
+/*
+ * Set to indicate the event is a nested exception encountered during FRED
+ * event delivery of another event. This bit is not set if the event is
+ * double fault (#DF).
+ */
+			unsigned int nst	: 1;
+			unsigned int __ssx_resv4: 1;
+/* The length of the instruction causing the event */
+			unsigned int instr_len	: 4;
+		} __packed;
+	};
+/*
+ * Top of stack page on IDT systems, while FRED systems have extra fields
+ * defined above, see <asm/fred.h>.
+ */
 };
 
 #endif /* !__i386__ */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d181c16a2f6..265ab8fcb146 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,7 +117,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
 
 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
-	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
+	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
 		log_lvl, regs->cs, ds, es, cr0);
 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
 		log_lvl, cr2, cr3, cr4);
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 10/36] x86/fred: Add a new header file for FRED definitions
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add a header file for FRED prototypes and definitions.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---

Changes since v6:
* Replace pt_regs csx flags prefix FRED_CSL_ with FRED_CSX_.
---
 arch/x86/include/asm/fred.h | 104 ++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 arch/x86/include/asm/fred.h

diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
new file mode 100644
index 000000000000..d76e681a806f
--- /dev/null
+++ b/arch/x86/include/asm/fred.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Macros for Flexible Return and Event Delivery (FRED)
+ */
+
+#ifndef ASM_X86_FRED_H
+#define ASM_X86_FRED_H
+
+#include <linux/const.h>
+#include <asm/asm.h>
+
+/*
+ * FRED return instructions
+ *
+ * Replace with "ERETS"/"ERETU" once binutils support FRED return instructions.
+ * The binutils version supporting FRED instructions is still TBD, and will
+ * update once we have it.
+ */
+#define ERETS			_ASM_BYTES(0xf2,0x0f,0x01,0xca)
+#define ERETU			_ASM_BYTES(0xf3,0x0f,0x01,0xca)
+
+/*
+ * RSP is aligned to a 64-byte boundary before a new stack frame is pushed
+ */
+#define FRED_STACK_FRAME_RSP_MASK	_AT(unsigned long, (~0x3f))
+
+/*
+ * Event stack level macro for the FRED_STKLVLS MSR.
+ * Usage example: FRED_STKLVL(X86_TRAP_DF, 3)
+ * Multiple values can be ORd together.
+ */
+#define FRED_STKLVL(v,l)	(_AT(unsigned long, l) << (2*(v)))
+
+/* FRED_CONFIG MSR */
+#define FRED_CONFIG_CSL_MASK		0x3
+/*
+ * Used for the return address for call emulation during code patching,
+ * and measured in 64-byte cache lines.
+ */
+#define FRED_CONFIG_REDZONE_AMOUNT	1
+#define FRED_CONFIG_REDZONE		(_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6)
+#define FRED_CONFIG_INT_STKLVL(l)	(_AT(unsigned long, l) << 9)
+#define FRED_CONFIG_ENTRYPOINT(p)	_AT(unsigned long, (p))
+
+/*
+ * FRED event type and vector bit width and counts.
+ *
+ * There is space in the stack frame making it possible to extend event type
+ * and vector fields in the future.
+ */
+#define FRED_EVENT_TYPE_BITS		3
+#define FRED_EVENT_TYPE_COUNT		_BITUL(FRED_EVENT_TYPE_BITS)
+#define FRED_EVENT_VECTOR_BITS		8
+#define FRED_EVENT_VECTOR_COUNT		_BITUL(FRED_EVENT_VECTOR_BITS)
+
+/* FRED EVENT_TYPE_OTHER vector numbers */
+#define FRED_SYSCALL			1
+#define FRED_SYSENTER			2
+#define FRED_NUM_OTHER_VECTORS		3
+
+/* Flags above the SS selector (regs->ssx) */
+#define FRED_SSX_INTERRUPT_SHADOW_BIT	16
+#define FRED_SSX_INTERRUPT_SHADOW	_BITUL(FRED_SSX_INTERRUPT_SHADOW_BIT)
+#define FRED_SSX_SOFTWARE_INITIATED_BIT	17
+#define FRED_SSX_SOFTWARE_INITIATED	_BITUL(FRED_SSX_SOFTWARE_INITIATED_BIT)
+#define FRED_SSX_NMI_BIT		18
+#define FRED_SSX_NMI			_BITUL(FRED_SSX_NMI_BIT)
+#define FRED_SSX_64_BIT_MODE_BIT	57
+#define FRED_SSX_64_BIT_MODE		_BITUL(FRED_SSX_64_BIT_MODE_BIT)
+
+#ifdef CONFIG_X86_FRED
+
+#ifndef __ASSEMBLY__
+
+#include <linux/kernel.h>
+#include <asm/ptrace.h>
+
+struct fred_info {
+	/* Event data: CR2, DR6, ... */
+	unsigned long edata;
+	unsigned long resv;
+};
+
+/* Full format of the FRED stack frame */
+struct fred_frame {
+	struct pt_regs   regs;
+	struct fred_info info;
+};
+
+static __always_inline struct fred_info *fred_info(struct pt_regs *regs)
+{
+	return &container_of(regs, struct fred_frame, regs)->info;
+}
+
+static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
+{
+	return fred_info(regs)->edata;
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_X86_FRED */
+
+#endif /* ASM_X86_FRED_H */
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 11/36] x86/fred: Reserve space for the FRED stack frame
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

When using FRED, reserve space (two extra words) at the top of the
kernel stack for the longer FRED stack frame, just like i386 does.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/thread_info.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d63b02940747..089cab875cba 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -31,7 +31,9 @@
  * In vm86 mode, the hardware frame is much longer still, so add 16
  * bytes to make room for the real-mode segments.
  *
- * x86_64 has a fixed-length stack frame.
+ * x86-64 has a fixed-length stack frame, but it depends on whether
+ * or not FRED is enabled. Future versions of FRED might make this
+ * dynamic, but for now it is always 2 words longer.
  */
 #ifdef CONFIG_X86_32
 # ifdef CONFIG_VM86
@@ -39,8 +41,12 @@
 # else
 #  define TOP_OF_KERNEL_STACK_PADDING 8
 # endif
-#else
-# define TOP_OF_KERNEL_STACK_PADDING 0
+#else /* x86-64 */
+# ifdef CONFIG_X86_FRED
+#  define TOP_OF_KERNEL_STACK_PADDING (2*8)
+# else
+#  define TOP_OF_KERNEL_STACK_PADDING 0
+# endif
 #endif
 
 /*
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 07/36] x86/cpu: Add X86_CR4_FRED macro
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add X86_CR4_FRED macro for the FRED bit in %cr4. This bit should be a
pinned bit, not to be changed after initialization.

CR4 macros are defined in arch/x86/include/uapi/asm/processor-flags.h,
which is uapi, and thus cannot depend on CONFIG_X86_64.

Using _BITUL() causes build errors on 32 bits, and there is no
guarantee that user space applications (e.g. something like Qemu)
will not want to use this declaration even when building for i386 or
x32.

However, %cr4 is a machine word (unsigned long), so to avoid build
warnings on 32 bits, explicitly cast the value to unsigned long,
truncating upper 32 bits.

The other alternative would be to use CONFIG_X86_64 around the
definition of cr4_pinned_mask. It is probably not desirable to make
cr4_pinned_mask non-const.

Another option, which may be preferable, to be honest: explicitly
enumerate the CR4 bits which *may* be changed (a whitelist), instead
of the ones that may not. That would be a separate, pre-FRED, patch,
and would automatically resolve this problem as a side effect.

The following flags probably should have been in this set all along,
as they are all controls affecting the kernel runtime environment as
opposed to user space:

X86_CR4_DE, X86_CR4_PAE, X86_CR4_PSE, X86_CR4_MCE, X86_CR4_PGE,
X86_CR4_OSFXSR, X86_CR4_OSXMMEXCPT, X86_CR4_LA57, X86_CR4_PCIDE,
X86_CR4_LAM_SUP

Possibly X86_CR4_VMXE as well, which seems harmless even if KVM is
not loaded; X86_CR4_PKE can be fixed as long as the PKE configuration
registers are at least initialized to disabled.

It is relatively simple to do an audit of which flags are allowed to
be modified at runtime and whitelist only those. There is no reason
why we should allow bits in CR4 to be toggled by default.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/uapi/asm/processor-flags.h | 2 ++
 arch/x86/kernel/cpu/common.c                | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index d898432947ff..ce08c2ca70b5 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -138,6 +138,8 @@
 #define X86_CR4_CET		_BITUL(X86_CR4_CET_BIT)
 #define X86_CR4_LAM_SUP_BIT	28 /* LAM for supervisor pointers */
 #define X86_CR4_LAM_SUP		_BITUL(X86_CR4_LAM_SUP_BIT)
+#define X86_CR4_FRED_BIT	32 /* enable FRED kernel entry */
+#define X86_CR4_FRED		_BITULL(X86_CR4_FRED_BIT)
 
 /*
  * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0ba1067f4e5f..331b06d19f7f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -402,8 +402,9 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c)
 
 /* These bits should not change their value after CPU init is finished. */
 static const unsigned long cr4_pinned_mask =
-	X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
-	X86_CR4_FSGSBASE | X86_CR4_CET;
+	(unsigned long)
+	(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+	 X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED);
 static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
 static unsigned long cr4_pinned_bits __ro_after_init;
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 04/36] x86/cpufeatures: Add the cpu feature bit for FRED
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add the CPU feature bit for FRED.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/cpufeatures.h       | 1 +
 tools/arch/x86/include/asm/cpufeatures.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cb8ca46213be..fd3ddd5c0283 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -317,6 +317,7 @@
 #define X86_FEATURE_FZRM		(12*32+10) /* "" Fast zero-length REP MOVSB */
 #define X86_FEATURE_FSRS		(12*32+11) /* "" Fast short REP STOSB */
 #define X86_FEATURE_FSRC		(12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_FRED		(12*32+17) /* Flexible Return and Event Delivery */
 #define X86_FEATURE_LKGS		(12*32+18) /* "" Load "kernel" (userspace) GS */
 #define X86_FEATURE_AMX_FP16		(12*32+21) /* "" AMX fp16 Support */
 #define X86_FEATURE_AVX_IFMA            (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index cb8ca46213be..fd3ddd5c0283 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -317,6 +317,7 @@
 #define X86_FEATURE_FZRM		(12*32+10) /* "" Fast zero-length REP MOVSB */
 #define X86_FEATURE_FSRS		(12*32+11) /* "" Fast short REP STOSB */
 #define X86_FEATURE_FSRC		(12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_FRED		(12*32+17) /* Flexible Return and Event Delivery */
 #define X86_FEATURE_LKGS		(12*32+18) /* "" Load "kernel" (userspace) GS */
 #define X86_FEATURE_AMX_FP16		(12*32+21) /* "" AMX fp16 Support */
 #define X86_FEATURE_AVX_IFMA            (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 03/36] x86/fred: Disable FRED support if CONFIG_X86_FRED is disabled
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add CONFIG_X86_FRED to <asm/disabled-features.h> to make
cpu_feature_enabled() work correctly with FRED.

Originally-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/disabled-features.h       | 8 +++++++-
 tools/arch/x86/include/asm/disabled-features.h | 8 +++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index fafe9be7a6f4..85fd67c67ce1 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -105,6 +105,12 @@
 # define DISABLE_TDX_GUEST	(1 << (X86_FEATURE_TDX_GUEST & 31))
 #endif
 
+#ifdef CONFIG_X86_FRED
+# define DISABLE_FRED 0
+#else
+# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -122,7 +128,7 @@
 #define DISABLED_MASK11	(DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
 			 DISABLE_CALL_DEPTH_TRACKING)
 #define DISABLED_MASK12	(DISABLE_LAM)
-#define DISABLED_MASK13	0
+#define DISABLED_MASK13	(DISABLE_FRED)
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index fafe9be7a6f4..85fd67c67ce1 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -105,6 +105,12 @@
 # define DISABLE_TDX_GUEST	(1 << (X86_FEATURE_TDX_GUEST & 31))
 #endif
 
+#ifdef CONFIG_X86_FRED
+# define DISABLE_FRED 0
+#else
+# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -122,7 +128,7 @@
 #define DISABLED_MASK11	(DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
 			 DISABLE_CALL_DEPTH_TRACKING)
 #define DISABLED_MASK12	(DISABLE_LAM)
-#define DISABLED_MASK13	0
+#define DISABLED_MASK13	(DISABLE_FRED)
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 06/36] x86/objtool: Teach objtool about ERETU and ERETS
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Update the objtool decoder to know about the ERETU and ERETS
instructions (type INSN_CONTEXT_SWITCH).

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 tools/objtool/arch/x86/decode.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 2e1caabecb18..a486485cff20 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -509,11 +509,20 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 
 		if (op2 == 0x01) {
 
-			if (modrm == 0xca)
-				insn->type = INSN_CLAC;
-			else if (modrm == 0xcb)
-				insn->type = INSN_STAC;
-
+			switch (insn_last_prefix_id(&ins)) {
+			case INAT_PFX_REPE:
+			case INAT_PFX_REPNE:
+				if (modrm == 0xca)
+					/* eretu/erets */
+					insn->type = INSN_CONTEXT_SWITCH;
+				break;
+			default:
+				if (modrm == 0xca)
+					insn->type = INSN_CLAC;
+				else if (modrm == 0xcb)
+					insn->type = INSN_STAC;
+				break;
+			}
 		} else if (op2 >= 0x80 && op2 <= 0x8f) {
 
 			insn->type = INSN_JUMP_CONDITIONAL;
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 08/36] x86/cpu: Add MSR numbers for FRED configuration
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add MSR numbers for the FRED configuration registers.

Originally-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/msr-index.h       | 13 ++++++++++++-
 tools/arch/x86/include/asm/msr-index.h | 13 ++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a00a53e15ab7..111fb76f6dbe 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,19 @@
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
 #define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
 
-/* Intel MSRs. Some also available on other CPUs */
+/* FRED MSRs */
+#define MSR_IA32_FRED_RSP0	0x1cc /* Level 0 stack pointer */
+#define MSR_IA32_FRED_RSP1	0x1cd /* Level 1 stack pointer */
+#define MSR_IA32_FRED_RSP2	0x1ce /* Level 2 stack pointer */
+#define MSR_IA32_FRED_RSP3	0x1cf /* Level 3 stack pointer */
+#define MSR_IA32_FRED_STKLVLS	0x1d0 /* Exception stack levels */
+#define MSR_IA32_FRED_SSP0	MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */
+#define MSR_IA32_FRED_SSP1	0x1d1 /* Level 1 shadow stack pointer */
+#define MSR_IA32_FRED_SSP2	0x1d2 /* Level 2 shadow stack pointer */
+#define MSR_IA32_FRED_SSP3	0x1d3 /* Level 3 shadow stack pointer */
+#define MSR_IA32_FRED_CONFIG	0x1d4 /* Entrypoint and interrupt stack level */
 
+/* Intel MSRs. Some also available on other CPUs */
 #define MSR_TEST_CTRL				0x00000033
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT	29
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT		BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT)
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 3aedae61af4f..565cade0785a 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,19 @@
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
 #define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
 
-/* Intel MSRs. Some also available on other CPUs */
+/* FRED MSRs */
+#define MSR_IA32_FRED_RSP0	0x1cc /* Level 0 stack pointer */
+#define MSR_IA32_FRED_RSP1	0x1cd /* Level 1 stack pointer */
+#define MSR_IA32_FRED_RSP2	0x1ce /* Level 2 stack pointer */
+#define MSR_IA32_FRED_RSP3	0x1cf /* Level 3 stack pointer */
+#define MSR_IA32_FRED_STKLVLS	0x1d0 /* Exception stack levels */
+#define MSR_IA32_FRED_SSP0	MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */
+#define MSR_IA32_FRED_SSP1	0x1d1 /* Level 1 shadow stack pointer */
+#define MSR_IA32_FRED_SSP2	0x1d2 /* Level 2 shadow stack pointer */
+#define MSR_IA32_FRED_SSP3	0x1d3 /* Level 3 shadow stack pointer */
+#define MSR_IA32_FRED_CONFIG	0x1d4 /* Entrypoint and interrupt stack level */
 
+/* Intel MSRs. Some also available on other CPUs */
 #define MSR_TEST_CTRL				0x00000033
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT	29
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT		BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT)
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 05/36] x86/opcode: Add ERETU, ERETS instructions to x86-opcode-map
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add instruction opcodes used by FRED ERETU/ERETS to x86-opcode-map.

Opcode numbers are per FRED spec v5.0.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/lib/x86-opcode-map.txt       | 2 +-
 tools/arch/x86/lib/x86-opcode-map.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 5168ee0360b2..7a269e269dc0 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1052,7 +1052,7 @@ EndTable
 
 GrpTable: Grp7
 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B)
 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
 3: LIDT Ms
 4: SMSW Mw/Rv
diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt
index 5168ee0360b2..7a269e269dc0 100644
--- a/tools/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/arch/x86/lib/x86-opcode-map.txt
@@ -1052,7 +1052,7 @@ EndTable
 
 GrpTable: Grp7
 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B)
 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
 3: LIDT Ms
 4: SMSW Mw/Rv
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 02/36] x86/fred: Add Kconfig option for FRED (CONFIG_X86_FRED)
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add the configuration option CONFIG_X86_FRED to enable FRED.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/Kconfig | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7422db409770..700d94cb8330 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -494,6 +494,15 @@ config X86_CPU_RESCTRL
 
 	  Say N if unsure.
 
+config X86_FRED
+	bool "Flexible Return and Event Delivery"
+	depends on X86_64
+	help
+	  When enabled, try to use Flexible Return and Event Delivery
+	  instead of the legacy SYSCALL/SYSENTER/IDT architecture for
+	  ring transitions and exception/interrupt handling if the
+	  system supports it.
+
 if X86_32
 config X86_BIGSMP
 	bool "Support for big SMP systems with more than 8 CPUs"
-- 
2.34.1


^ permalink raw reply related

* [PATCH RESEND v9 00/36] x86: enable FRED for x86-64
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy

Resend because the mail system failed to deliver some messages yesterday.

This patch set enables the Intel flexible return and event delivery
(FRED) architecture for x86-64.

The FRED architecture defines simple new transitions that change
privilege level (ring transitions). The FRED architecture was
designed with the following goals:

1) Improve overall performance and response time by replacing event
   delivery through the interrupt descriptor table (IDT event
   delivery) and event return by the IRET instruction with lower
   latency transitions.

2) Improve software robustness by ensuring that event delivery
   establishes the full supervisor context and that event return
   establishes the full user context.

The new transitions defined by the FRED architecture are FRED event
delivery and, for returning from events, two FRED return instructions.
FRED event delivery can effect a transition from ring 3 to ring 0, but
it is used also to deliver events incident to ring 0. One FRED
instruction (ERETU) effects a return from ring 0 to ring 3, while the
other (ERETS) returns while remaining in ring 0. Collectively, FRED
event delivery and the FRED return instructions are FRED transitions.

Search for the latest FRED spec in most search engines with this search pattern:

  site:intel.com FRED (flexible return and event delivery) specification

As of now there is no publicly available CPU supporting FRED, so the Intel
Simics® Simulator is used as the software development and testing vehicle.
It can be downloaded from:
  https://www.intel.com/content/www/us/en/developer/articles/tool/simics-simulator.html

To enable FRED, the Simics package 8112 QSP-CPU needs to be installed with CPU
model configured as:
	$cpu_comp_class = "x86-experimental-fred"


Changes since v8:
* Move the FRED initialization patch after all required changes are in
  place (Thomas Gleixner).
* Don't do syscall early out in fred_entry_from_user() before there are
  proper performance numbers and justifications (Thomas Gleixner).
* Add the control exception handler to the FRED exception handler table
  (Thomas Gleixner).
* Introduce a macro sysvec_install() to derive the asm handler name from
  a C handler, which simplifies the code and avoids an ugly typecast
  (Thomas Gleixner).
* Remove junk code that assumes no local APIC on x86_64 (Thomas Gleixner).
* Put IDTENTRY changes in a separate patch (Thomas Gleixner).
* Use high-order 48 bits above the lowest 16 bit SS only when FRED is
  enabled (Thomas Gleixner).
* Explain why writing directly to the IA32_KERNEL_GS_BASE MSR is
  doing the right thing (Thomas Gleixner).
* Reword some patch descriptions (Thomas Gleixner).
* Add a new macro VMX_DO_FRED_EVENT_IRQOFF for FRED instead of
  refactoring VMX_DO_EVENT_IRQOFF (Sean Christopherson).
* Do NOT use a trampoline, just LEA+PUSH the return RIP, PUSH the error
  code, and jump to the FRED kernel entry point for NMI or call
  external_interrupt() for IRQs (Sean Christopherson).
* Call external_interrupt() only when FRED is enabled, and convert the
  non-FRED handling to external_interrupt() after FRED lands (Sean
  Christopherson).
* Use __packed instead of __attribute__((__packed__)) (Borislav Petkov).
* Put all comments above the members, like the rest of the file does
  (Borislav Petkov).
* Reflect the FRED spec 5.0 change that ERETS and ERETU add 8 to %rsp
  before popping the return context from the stack.
* Reflect stack frame definition changes from FRED spec 3.0 to 5.0.
* Add ENDBR to the FRED_ENTER asm macro after kernel IBT is added to
  FRED base line in FRED spec 5.0.
* Add a document which briefly introduces FRED features.
* Remove 2 patches, "allow FRED systems to use interrupt vectors
  0x10-0x1f" and "allow dynamic stack frame size", from this patch set,
  as they are "optimizations" only.
* Send 2 patches, "header file for event types" and "do not modify the
  DPL bits for a null selector", as pre-FRED patches.

Changes since v7:
* Always call external_interrupt() for VMX IRQ handling on x86_64, thus avoid
  re-entering the noinstr code.
* Create a FRED stack frame when FRED is compiled-in but not enabled, which
  uses some extra stack space but simplifies the code.
* Add a log message when FRED is enabled.

Changes since v6:
* Add a comment to explain why it is safe to write to a previous FRED stack
  frame. (Lai Jiangshan).
* Export fred_entrypoint_kernel(), required when kvm-intel built as a module.
* Reserve a REDZONE for CALL emulation and Align RSP to a 64-byte boundary
  before pushing a new FRED stack frame.
* Replace pt_regs csx flags prefix FRED_CSL_ with FRED_CSX_.

Changes since v5:
* Initialize system_interrupt_handlers with dispatch_table_spurious_interrupt()
  instead of NULL to get rid of a branch (Peter Zijlstra).
* Disallow #DB inside #MCE for robustness sake (Peter Zijlstra).
* Add a comment for FRED stack level settings (Lai Jiangshan).
* Move the NMI bit from an invalid stack frame, which caused ERETU to fault,
  to the fault handler's stack frame, thus to unblock NMI ASAP if NMI is blocked
  (Lai Jiangshan).
* Refactor VMX_DO_EVENT_IRQOFF to handle IRQ/NMI in IRQ/NMI induced VM exits
  when FRED is enabled (Sean Christopherson).

Changes since v4:
* Do NOT use the term "injection", which in the KVM context means to
  reinject an event into the guest (Sean Christopherson).
* Add the explanation of why to execute "int $2" to invoke the NMI handler
  in NMI caused VM exits (Sean Christopherson).
* Use cs/ss instead of csx/ssx when initializing the pt_regs structure
  for calling external_interrupt(), otherwise it breaks i386 build.

Changes since v3:
* Call external_interrupt() to handle IRQ in IRQ caused VM exits.
* Execute "int $2" to handle NMI in NMI caused VM exits.
* Rename csl/ssl of the pt_regs structure to csx/ssx (x for extended)
  (Andrew Cooper).

Changes since v2:
* Improve comments for changes in arch/x86/include/asm/idtentry.h.

Changes since v1:
* call irqentry_nmi_{enter,exit}() in both IDT and FRED debug fault kernel
  handler (Peter Zijlstra).
* Initialize a FRED exception handler to fred_bad_event() instead of NULL
  if no FRED handler defined for an exception vector (Peter Zijlstra).
* Push calling irqentry_{enter,exit}() and instrumentation_{begin,end}()
  down into individual FRED exception handlers, instead of in the dispatch
  framework (Peter Zijlstra).

H. Peter Anvin (Intel) (22):
  x86/fred: Add Kconfig option for FRED (CONFIG_X86_FRED)
  x86/fred: Disable FRED support if CONFIG_X86_FRED is disabled
  x86/cpufeatures: Add the cpu feature bit for FRED
  x86/opcode: Add ERETU, ERETS instructions to x86-opcode-map
  x86/objtool: Teach objtool about ERETU and ERETS
  x86/cpu: Add X86_CR4_FRED macro
  x86/cpu: Add MSR numbers for FRED configuration
  x86/fred: Make unions for the cs and ss fields in struct pt_regs
  x86/fred: Add a new header file for FRED definitions
  x86/fred: Reserve space for the FRED stack frame
  x86/fred: Update MSR_IA32_FRED_RSP0 during task switch
  x86/fred: Let ret_from_fork_asm() jmp to fred_exit_user when FRED is
    enabled
  x86/fred: Disallow the swapgs instruction when FRED is enabled
  x86/fred: No ESPFIX needed when FRED is enabled
  x86/fred: Allow single-step trap and NMI when starting a new task
  x86/fred: Add a page fault entry stub for FRED
  x86/fred: Add a debug fault entry stub for FRED
  x86/fred: Add a NMI entry stub for FRED
  x86/traps: Add a system interrupt handler table for system interrupt
    dispatch
  x86/traps: Add external_interrupt() to dispatch external interrupts
  x86/fred: FRED entry/exit and dispatch code
  x86/fred: FRED initialization code

Xin Li (14):
  Documentation/x86/64: Add documentation for FRED
  x86/fred: Define a common function type fred_handler
  x86/fred: Add a machine check entry stub for FRED
  x86/fred: Add a double fault entry stub for FRED
  x86/entry: Remove idtentry_sysvec from entry_{32,64}.S
  x86/idtentry: Incorporate definitions/declarations of the FRED
    external interrupt handler type
  x86/traps: Add sysvec_install() to install a system interrupt handler
  x86/idtentry: Incorporate declaration/definition of the FRED exception
    handler type
  x86/fred: Fixup fault on ERETU by jumping to fred_entrypoint_user
  x86/traps: Export external_interrupt() for handling IRQ in IRQ induced
    VM exits
  x86/fred: Export fred_entrypoint_kernel() for handling NMI in NMI
    induced VM exits
  KVM: VMX: Add VMX_DO_FRED_EVENT_IRQOFF for IRQ/NMI handling
  x86/syscall: Split IDT syscall setup code into idt_syscall_init()
  x86/fred: Disable FRED by default in its early stage

 .../admin-guide/kernel-parameters.txt         |   4 +
 Documentation/arch/x86/x86_64/fred.rst        | 102 ++++++++
 Documentation/arch/x86/x86_64/index.rst       |   1 +
 arch/x86/Kconfig                              |   9 +
 arch/x86/entry/Makefile                       |   5 +-
 arch/x86/entry/entry_32.S                     |   4 -
 arch/x86/entry/entry_64.S                     |  14 +-
 arch/x86/entry/entry_64_fred.S                |  58 +++++
 arch/x86/entry/entry_fred.c                   | 220 ++++++++++++++++++
 arch/x86/entry/vsyscall/vsyscall_64.c         |   2 +-
 arch/x86/include/asm/asm-prototypes.h         |   1 +
 arch/x86/include/asm/cpufeatures.h            |   1 +
 arch/x86/include/asm/disabled-features.h      |   8 +-
 arch/x86/include/asm/extable_fixup_types.h    |   4 +-
 arch/x86/include/asm/fred.h                   | 157 +++++++++++++
 arch/x86/include/asm/idtentry.h               | 115 ++++++++-
 arch/x86/include/asm/msr-index.h              |  13 +-
 arch/x86/include/asm/ptrace.h                 |  57 ++++-
 arch/x86/include/asm/switch_to.h              |  11 +-
 arch/x86/include/asm/thread_info.h            |  12 +-
 arch/x86/include/asm/traps.h                  |  23 ++
 arch/x86/include/uapi/asm/processor-flags.h   |   2 +
 arch/x86/kernel/Makefile                      |   1 +
 arch/x86/kernel/cpu/acrn.c                    |   5 +-
 arch/x86/kernel/cpu/common.c                  |  47 +++-
 arch/x86/kernel/cpu/mce/core.c                |  15 ++
 arch/x86/kernel/cpu/mshyperv.c                |  16 +-
 arch/x86/kernel/espfix_64.c                   |   8 +
 arch/x86/kernel/fred.c                        |  67 ++++++
 arch/x86/kernel/irqinit.c                     |   7 +-
 arch/x86/kernel/kvm.c                         |   2 +-
 arch/x86/kernel/nmi.c                         |  19 ++
 arch/x86/kernel/process_64.c                  |  31 ++-
 arch/x86/kernel/traps.c                       | 153 ++++++++++--
 arch/x86/kvm/vmx/vmenter.S                    |  88 +++++++
 arch/x86/kvm/vmx/vmx.c                        |  19 +-
 arch/x86/lib/x86-opcode-map.txt               |   2 +-
 arch/x86/mm/extable.c                         |  79 +++++++
 arch/x86/mm/fault.c                           |  18 +-
 drivers/xen/events/events_base.c              |   3 +-
 tools/arch/x86/include/asm/cpufeatures.h      |   1 +
 .../arch/x86/include/asm/disabled-features.h  |   8 +-
 tools/arch/x86/include/asm/msr-index.h        |  13 +-
 tools/arch/x86/lib/x86-opcode-map.txt         |   2 +-
 tools/objtool/arch/x86/decode.c               |  19 +-
 45 files changed, 1348 insertions(+), 98 deletions(-)
 create mode 100644 Documentation/arch/x86/x86_64/fred.rst
 create mode 100644 arch/x86/entry/entry_64_fred.S
 create mode 100644 arch/x86/entry/entry_fred.c
 create mode 100644 arch/x86/include/asm/fred.h
 create mode 100644 arch/x86/kernel/fred.c

-- 
2.34.1


^ permalink raw reply

* [PATCH RESEND v9 01/36] Documentation/x86/64: Add documentation for FRED
From: Xin Li @ 2023-08-01  8:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Babu Moger,
	Jim Mattson, Sandipan Das, Lai Jiangshan, Hans de Goede,
	Reinette Chatre, Daniel Sneddon, Breno Leitao, Nikunj A Dadhania,
	Brian Gerst, Sami Tolvanen, Alexander Potapenko, Andrew Morton,
	Arnd Bergmann, Eric W . Biederman, Kees Cook, Masami Hiramatsu,
	Masahiro Yamada, Ze Gao, Fei Li, Conghui, Ashok Raj,
	Jason A . Donenfeld, Mark Rutland, Jacob Pan, Jiapeng Chong,
	Jane Malalane, David Woodhouse, Boris Ostrovsky,
	Arnaldo Carvalho de Melo, Yantengsi, Christophe Leroy,
	Sathvika Vasireddy
In-Reply-To: <20230801083318.8363-1-xin3.li@intel.com>

Briefly introduce FRED, its advantages compared to IDT, and its
Linux enabling.

Signed-off-by: Xin Li <xin3.li@intel.com>
---
 Documentation/arch/x86/x86_64/fred.rst  | 102 ++++++++++++++++++++++++
 Documentation/arch/x86/x86_64/index.rst |   1 +
 2 files changed, 103 insertions(+)
 create mode 100644 Documentation/arch/x86/x86_64/fred.rst

diff --git a/Documentation/arch/x86/x86_64/fred.rst b/Documentation/arch/x86/x86_64/fred.rst
new file mode 100644
index 000000000000..27c980e882ba
--- /dev/null
+++ b/Documentation/arch/x86/x86_64/fred.rst
@@ -0,0 +1,102 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Flexible Return and Event Delivery (FRED)
+=========================================
+
+Overview
+========
+
+The FRED architecture defines simple new transitions that change
+privilege level (ring transitions). The FRED architecture was
+designed with the following goals:
+
+1) Improve overall performance and response time by replacing event
+   delivery through the interrupt descriptor table (IDT event
+   delivery) and event return by the IRET instruction with lower
+   latency transitions.
+
+2) Improve software robustness by ensuring that event delivery
+   establishes the full supervisor context and that event return
+   establishes the full user context.
+
+The new transitions defined by the FRED architecture are FRED event
+delivery and, for returning from events, two FRED return instructions.
+FRED event delivery can effect a transition from ring 3 to ring 0, but
+it is used also to deliver events incident to ring 0. One FRED
+instruction (ERETU) effects a return from ring 0 to ring 3, while the
+other (ERETS) returns while remaining in ring 0. Collectively, FRED
+event delivery and the FRED return instructions are FRED transitions.
+
+In addition to these transitions, the FRED architecture defines a new
+instruction (LKGS) for managing the state of the GS segment register.
+The LKGS instruction can be used by 64-bit operating systems that do
+not use the new FRED transitions.
+
+Software based event dispatching
+================================
+
+FRED operates differently from IDT in terms of event handling. Instead
+of directly dispatching an event to its handler based on the event
+vector, FRED requires the software to dispatch an event to its handler
+based on both the event's type and vector. Therefore, an event
+dispatch framework must be implemented to facilitate the
+event-to-handler dispatch process. The FRED event dispatch framework
+assumes control once an event is delivered, starting from two FRED
+entry points, after which several event dispatch tables are introduced
+to facilitate the dispatching.
+
+The first level dispatching is event type based, and two tables need
+to be defined, one for ring 3 event dispatching, and the other
+for ring 0.
+
+The second level dispatching is event vector based, and
+several tables need to be defined, e.g., an exception handler table
+for exception dispatching.
+
+Full supervisor/user context
+============================
+
+FRED event delivery atomically saves and restores full supervisor/user
+context upon event delivery and return. This avoids the problem of
+transient states due to %cr2 and/or %dr6, so it is no longer necessary
+to handle all the ugly corner cases caused by half baked CPU states.
+
+FRED allows explicit unblock of NMI with new event return instructions
+ERETS/ERETU, avoiding the mess caused by IRET which unconditionally
+unblocks NMI, when an exception happens during NMI handling.
+
+FRED always restores the full value of %rsp, thus ESPFIX is no longer
+needed when FRED is enabled.
+
+LKGS
+====
+
+LKGS behaves like the MOV to GS instruction except that it loads the
+base address into the IA32_KERNEL_GS_BASE MSR instead of the GS
+segment's descriptor cache, which is exactly what the Linux kernel does
+to load the user level GS base. Using LKGS thus avoids mucking with
+the kernel GS.
+
+Because FRED event delivery from ring 3 swaps the value of the GS base
+address and that of the IA32_KERNEL_GS_BASE MSR, and ERETU swaps the
+value of the GS base address and that of the IA32_KERNEL_GS_BASE MSR,
+plus the introduction of LKGS instruction, the SWAPGS instruction is
+no longer needed when FRED is enabled, thus is disallowed (#UD).
+
+Stack levels
+============
+
+4 stack levels 0~3 are introduced to replace the non-reentrant IST for
+handling events. Each stack level could be configured to use a
+dedicated stack.
+
+The current stack level could be unchanged or go higher upon FRED
+event delivery. If unchanged, the CPU keeps using the current event
+stack. If higher, the CPU switches to a new stack specified by the
+stack MSR of the new stack level.
+
+Only execution of a FRED return instruction ERETU or ERETS could lower
+the current stack level, causing the CPU to switch back to the stack
+it was on before a previous event delivery that promoted the stack
+level.
diff --git a/Documentation/arch/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst
index a56070fc8e77..ad15e9bd623f 100644
--- a/Documentation/arch/x86/x86_64/index.rst
+++ b/Documentation/arch/x86/x86_64/index.rst
@@ -15,3 +15,4 @@ x86_64 Support
    cpu-hotplug-spec
    machinecheck
    fsgs
+   fred
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH RFC net-next v5 00/14] virtio/vsock: support datagrams
From: Bobby Eshleman @ 2023-08-01  5:30 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Bobby Eshleman, linux-hyperv, Stefan Hajnoczi, kvm,
	VMware PV-Drivers Reviewers, Simon Horman, virtualization,
	Eric Dumazet, Dan Carpenter, Xuan Zhuo, Wei Liu, Dexuan Cui,
	Bryan Tan, Jakub Kicinski, Paolo Abeni, Haiyang Zhang,
	Krasnov Arseniy, Vishnu Dasa, Jiang Wang, netdev, linux-kernel,
	bpf, David S. Miller
In-Reply-To: <20230727035004-mutt-send-email-mst@kernel.org>

On Thu, Jul 27, 2023 at 03:51:42AM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 19, 2023 at 12:50:04AM +0000, Bobby Eshleman wrote:
> > Hey all!
> > 
> > This series introduces support for datagrams to virtio/vsock.
> > 
> > It is a spin-off (and smaller version) of this series from the summer:
> >   https://lore.kernel.org/all/cover.1660362668.git.bobby.eshleman@bytedance.com/
> > 
> > Please note that this is an RFC and should not be merged until
> > associated changes are made to the virtio specification, which will
> > follow after discussion from this series.
> > 
> > Another aside, the v4 of the series has only been mildly tested with a
> > run of tools/testing/vsock/vsock_test. Some code likely needs cleaning
> > up, but I'm hoping to get some of the design choices agreed upon before
> > spending too much time making it pretty.
> > 
> > This series first supports datagrams in a basic form for virtio, and
> > then optimizes the sendpath for all datagram transports.
> > 
> > The result is a very fast datagram communication protocol that
> > outperforms even UDP on multi-queue virtio-net w/ vhost on a variety
> > of multi-threaded workload samples.
> > 
> > For those that are curious, some summary data comparing UDP and VSOCK
> > DGRAM (N=5):
> > 
> > 	vCPUS: 16
> > 	virtio-net queues: 16
> > 	payload size: 4KB
> > 	Setup: bare metal + vm (non-nested)
> > 
> > 	UDP: 287.59 MB/s
> > 	VSOCK DGRAM: 509.2 MB/s
> > 
> > Some notes about the implementation...
> > 
> > This datagram implementation forces datagrams to self-throttle according
> > to the threshold set by sk_sndbuf. It behaves similar to the credits
> > used by streams in its effect on throughput and memory consumption, but
> > it is not influenced by the receiving socket as credits are.
> > 
> > The device drops packets silently.
> > 
> > As discussed previously, this series introduces datagrams and defers
> > fairness to future work. See discussion in v2 for more context around
> > datagrams, fairness, and this implementation.
> 
> it's a big thread - can't you summarize here?
> 

Sure, no problem. I'll add that in the next rev. For the sake of readers
here, the fairness of vsock streams and vsock datagrams per this
implementation was experimentally demonstrated to be nearly equal.

Fairness was measured as a percentage reduction of throughput on an
active and concurrent stream flow. The socket type under test (datagram
or stream) was overprovisioned into a large pool of sockets, which were
exercised to maximum sending throughput. Each socket was given a unique
port and single-threaded sender to avoid any scalability differences
between datagrams and streams. Meanwhile, the throughput of a single,
lone stream socket was measured before and throughout the lifetime of the
pool of sockets, to detect fairness as an amount of reduced throughput.
It was demonstrated that there was no real difference in this fairness
characteristic of datagrams and streams for vsock. In fact, datagrams
fared better (that is, datagrams were nicer to streams than streams
were to other streams), although the effect was not statistically
significant. From the design perspective, the queuing policy is always
FIFO regardless of socket type. Credits, despite being a perfect
mechanism for synchronizing send and receive buffer sizes, have no
effect on queuing fairness either.

> 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> 
> 
> could you give a bit more motivation? which applications do
> you have in mind? for example, on localhost loopback datagrams
> are actually reliable and a bunch of apps came to depend
> on that even if they shouldn't.
> 
> 

Our use case is sending various metrics from VMs to the host.
Ultimately, we just like the performance numbers we get from this
datagram implementation compared to what we get from UDP.

Currently the system is:
  producers <-> UDS <-> guest proxy <-> UDP <-> host <-> UDS <-> consumers
  ^-------- guest ----------------^ ^------------ host ------------------^

And the numbers look really promising when using vsock dgram:
  producers <-> UDS <-> guest proxy <-> VSOCK dgram <-> host <-> UDS <-> consumers
  ^-------- guest ----------------^ ^------------ host ---------------------------^

The numbers also look really promising when using sockmap in lieu of the
proxies.

Best,
Bobby

> 
> > ---
> > Changes in v5:
> > - teach vhost to drop dgram when a datagram exceeds the receive buffer
> >   - now uses MSG_ERRQUEUE and depends on Arseniy's zerocopy patch:
> > 	"vsock: read from socket's error queue"
> > - replace multiple ->dgram_* callbacks with single ->dgram_addr_init()
> >   callback
> > - refactor virtio dgram skb allocator to reduce conflicts w/ zerocopy series
> > - add _fallback/_FALLBACK suffix to dgram transport variables/macros
> > - add WARN_ONCE() for table_size / VSOCK_HASH issue
> > - add static to vsock_find_bound_socket_common
> > - dedupe code in vsock_dgram_sendmsg() using module_got var
> > - drop concurrent sendmsg() for dgram and defer to future series
> > - Add more tests
> >   - test EHOSTUNREACH in errqueue
> >   - test stream + dgram address collision
> > - improve clarity of dgram msg bounds test code
> > - Link to v4: https://lore.kernel.org/r/20230413-b4-vsock-dgram-v4-0-0cebbb2ae899@bytedance.com
> > 
> > Changes in v4:
> > - style changes
> >   - vsock: use sk_vsock(vsk) in vsock_dgram_recvmsg instead of
> >     &sk->vsk
> >   - vsock: fix xmas tree declaration
> >   - vsock: fix spacing issues
> >   - virtio/vsock: virtio_transport_recv_dgram returns void because err
> >     unused
> > - sparse analysis warnings/errors
> > >   - virtio/vsock: fix uninitialized skerr on destroy
> >   - virtio/vsock: fix uninitialized err var on goto out
> >   - vsock: fix declarations that need static
> >   - vsock: fix __rcu annotation order
> > - bugs
> >   - vsock: fix null ptr in remote_info code
> >   - vsock/dgram: make transport_dgram a fallback instead of first
> >     priority
> >   - vsock: remove redundant rcu read lock acquire in getname()
> > - tests
> >   - add more tests (message bounds and more)
> >   - add vsock_dgram_bind() helper
> >   - add vsock_dgram_connect() helper
> > 
> > Changes in v3:
> > - Support multi-transport dgram, changing logic in connect/bind
> >   to support VMCI case
> > - Support per-pkt transport lookup for sendto() case
> > - Fix dgram_allow() implementation
> > - Fix dgram feature bit number (now it is 3)
> > - Fix binding so dgram and connectible (cid,port) spaces are
> >   non-overlapping
> > - RCU protect transport ptr so connect() calls never leave
> >   a lockless read of the transport and remote_addr are always
> >   in sync
> > - Link to v2: https://lore.kernel.org/r/20230413-b4-vsock-dgram-v2-0-079cc7cee62e@bytedance.com
> > 
> > ---
> > Bobby Eshleman (13):
> >       af_vsock: generalize vsock_dgram_recvmsg() to all transports
> >       af_vsock: refactor transport lookup code
> >       af_vsock: support multi-transport datagrams
> >       af_vsock: generalize bind table functions
> >       af_vsock: use a separate dgram bind table
> >       virtio/vsock: add VIRTIO_VSOCK_TYPE_DGRAM
> >       virtio/vsock: add common datagram send path
> >       af_vsock: add vsock_find_bound_dgram_socket()
> >       virtio/vsock: add common datagram recv path
> >       virtio/vsock: add VIRTIO_VSOCK_F_DGRAM feature bit
> >       vhost/vsock: implement datagram support
> >       vsock/loopback: implement datagram support
> >       virtio/vsock: implement datagram support
> > 
> > Jiang Wang (1):
> >       test/vsock: add vsock dgram tests
> > 
> >  drivers/vhost/vsock.c                   |  64 ++-
> >  include/linux/virtio_vsock.h            |  10 +-
> >  include/net/af_vsock.h                  |  14 +-
> >  include/uapi/linux/virtio_vsock.h       |   2 +
> >  net/vmw_vsock/af_vsock.c                | 281 ++++++++++---
> >  net/vmw_vsock/hyperv_transport.c        |  13 -
> >  net/vmw_vsock/virtio_transport.c        |  26 +-
> >  net/vmw_vsock/virtio_transport_common.c | 190 +++++++--
> >  net/vmw_vsock/vmci_transport.c          |  60 +--
> >  net/vmw_vsock/vsock_loopback.c          |  10 +-
> >  tools/testing/vsock/util.c              | 141 ++++++-
> >  tools/testing/vsock/util.h              |   6 +
> >  tools/testing/vsock/vsock_test.c        | 680 ++++++++++++++++++++++++++++++++
> >  13 files changed, 1320 insertions(+), 177 deletions(-)
> > ---
> > base-commit: 37cadc266ebdc7e3531111c2b3304fa01b2131e8
> > change-id: 20230413-b4-vsock-dgram-3b6eba6a64e5
> > 
> > Best regards,
> > -- 
> > Bobby Eshleman <bobby.eshleman@bytedance.com>
> 
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC net-next v5 10/14] virtio/vsock: add VIRTIO_VSOCK_F_DGRAM feature bit
From: Bobby Eshleman @ 2023-08-01  4:30 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Michael S. Tsirkin, Bobby Eshleman, linux-hyperv, Stefan Hajnoczi,
	kvm, VMware PV-Drivers Reviewers, Simon Horman, virtualization,
	Eric Dumazet, Dan Carpenter, Xuan Zhuo, Wei Liu, Dexuan Cui,
	Bryan Tan, Jakub Kicinski, Paolo Abeni, Haiyang Zhang,
	Krasnov Arseniy, Vishnu Dasa, Jiang Wang, netdev, linux-kernel,
	bpf, David S. Miller
In-Reply-To: <tpwk67lij7t7hquduogxzyox5wvq73yriv7vqiizqoxxtxvfwq@jzkcmq4kv3b4>

On Thu, Jul 27, 2023 at 09:48:21AM +0200, Stefano Garzarella wrote:
> On Wed, Jul 26, 2023 at 02:38:08PM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 19, 2023 at 12:50:14AM +0000, Bobby Eshleman wrote:
> > > This commit adds a feature bit for virtio vsock to support datagrams.
> > > 
> > > Signed-off-by: Jiang Wang <jiang.wang@bytedance.com>
> > > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> > > ---
> > >  include/uapi/linux/virtio_vsock.h | 1 +
> > >  1 file changed, 1 insertion(+)
> > > 
> > > diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
> > > index 331be28b1d30..27b4b2b8bf13 100644
> > > --- a/include/uapi/linux/virtio_vsock.h
> > > +++ b/include/uapi/linux/virtio_vsock.h
> > > @@ -40,6 +40,7 @@
> > > 
> > >  /* The feature bitmap for virtio vsock */
> > >  #define VIRTIO_VSOCK_F_SEQPACKET	1	/* SOCK_SEQPACKET supported */
> > > +#define VIRTIO_VSOCK_F_DGRAM		3	/* SOCK_DGRAM supported */
> > > 
> > >  struct virtio_vsock_config {
> > >  	__le64 guest_cid;
> > 
> > pls do not add interface without first getting it accepted in the
> > virtio spec.
> 
> Yep, fortunately this series is still an RFC.
> I think by now we've seen that the implementation is doable, so we
> should discuss the changes to the specification ASAP. Then we can
> merge the series.
> 
> @Bobby can you start the discussion about spec changes?
> 

No problem at all. Am I right to assume that a new patch to the spec is
the standard starting point for discussion?

> Thanks,
> Stefano
> 
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC net-next v5 11/14] vhost/vsock: implement datagram support
From: Bobby Eshleman @ 2023-08-01  4:26 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Bobby Eshleman, linux-hyperv, Stefan Hajnoczi, kvm,
	VMware PV-Drivers Reviewers, Simon Horman, virtualization,
	Eric Dumazet, Dan Carpenter, Xuan Zhuo, Wei Liu, Dexuan Cui,
	Bryan Tan, Jakub Kicinski, Paolo Abeni, Haiyang Zhang,
	Krasnov Arseniy, Vishnu Dasa, netdev, linux-kernel, bpf,
	David S. Miller
In-Reply-To: <20230726143850-mutt-send-email-mst@kernel.org>

On Wed, Jul 26, 2023 at 02:40:22PM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 19, 2023 at 12:50:15AM +0000, Bobby Eshleman wrote:
> > This commit implements datagram support for vhost/vsock by teaching
> > vhost to use the common virtio transport datagram functions.
> > 
> > If the virtio RX buffer is too small, then the transmission is
> > abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
> > error queue.
> > 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> 
> EHOSTUNREACH?
> 

Yes, in the v4 thread we decided to try to mimic UDP/ICMP behavior when
IP packets are lost.

If an IP packet is dropped and the full UDP segment is not assembled,
then ICMP_TIME_EXCEEDED ICMP_EXC_FRAGTIME is sent. The sending stack
propagates this up the socket as EHOSTUNREACH. ENOBUFS/ENOMEM is already
used for local buffers, so EHOSTUNREACH distinctly points to the remote
end of the flow as well.

> 
> > ---
> >  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
> >  net/vmw_vsock/af_vsock.c |  5 +++-
> >  2 files changed, 63 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index d5d6a3c3f273..da14260c6654 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -8,6 +8,7 @@
> >   */
> >  #include <linux/miscdevice.h>
> >  #include <linux/atomic.h>
> > +#include <linux/errqueue.h>
> >  #include <linux/module.h>
> >  #include <linux/mutex.h>
> >  #include <linux/vmalloc.h>
> > @@ -32,7 +33,8 @@
> >  enum {
> >  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
> >  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
> > -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
> > +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
> > +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
> >  };
> >  
> >  enum {
> > @@ -56,6 +58,7 @@ struct vhost_vsock {
> >  	atomic_t queued_replies;
> >  
> >  	u32 guest_cid;
> > +	bool dgram_allow;
> >  	bool seqpacket_allow;
> >  };
> >  
> > @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> >  	return NULL;
> >  }
> >  
> > +/* Claims ownership of the skb, do not free the skb after calling! */
> > +static void
> > +vhost_transport_error(struct sk_buff *skb, int err)
> > +{
> > +	struct sock_exterr_skb *serr;
> > +	struct sock *sk = skb->sk;
> > +	struct sk_buff *clone;
> > +
> > +	serr = SKB_EXT_ERR(skb);
> > +	memset(serr, 0, sizeof(*serr));
> > +	serr->ee.ee_errno = err;
> > +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
> > +
> > +	clone = skb_clone(skb, GFP_KERNEL);
> > +	if (!clone)
> > +		return;
> > +
> > +	if (sock_queue_err_skb(sk, clone))
> > +		kfree_skb(clone);
> > +
> > +	sk->sk_err = err;
> > +	sk_error_report(sk);
> > +
> > +	kfree_skb(skb);
> > +}
> > +
> >  static void
> >  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >  			    struct vhost_virtqueue *vq)
> > @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >  		hdr = virtio_vsock_hdr(skb);
> >  
> >  		/* If the packet is greater than the space available in the
> > -		 * buffer, we split it using multiple buffers.
> > +		 * buffer, we split it using multiple buffers for connectible
> > +		 * sockets and drop the packet for datagram sockets.
> >  		 */
> 
> won't this break things like recently proposed zerocopy?
> I think splitup has to be supported for all types.
> 
> 
> >  		if (payload_len > iov_len - sizeof(*hdr)) {
> > +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
> > +				vhost_transport_error(skb, EHOSTUNREACH);
> > +				continue;
> > +			}
> > +
> >  			payload_len = iov_len - sizeof(*hdr);
> >  
> >  			/* As we are copying pieces of large packet's buffer to
> > @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
> >  	return val < vq->num;
> >  }
> >  
> > +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
> >  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
> >  
> >  static struct virtio_transport vhost_transport = {
> > @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
> >  		.cancel_pkt               = vhost_transport_cancel_pkt,
> >  
> >  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
> > -		.dgram_allow              = virtio_transport_dgram_allow,
> > +		.dgram_allow              = vhost_transport_dgram_allow,
> > +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
> >  
> >  		.stream_enqueue           = virtio_transport_stream_enqueue,
> >  		.stream_dequeue           = virtio_transport_stream_dequeue,
> > @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
> >  	.send_pkt = vhost_transport_send_pkt,
> >  };
> >  
> > +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
> > +{
> > +	struct vhost_vsock *vsock;
> > +	bool dgram_allow = false;
> > +
> > +	rcu_read_lock();
> > +	vsock = vhost_vsock_get(cid);
> > +
> > +	if (vsock)
> > +		dgram_allow = vsock->dgram_allow;
> > +
> > +	rcu_read_unlock();
> > +
> > +	return dgram_allow;
> > +}
> > +
> >  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
> >  {
> >  	struct vhost_vsock *vsock;
> > @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
> >  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
> >  		vsock->seqpacket_allow = true;
> >  
> > +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
> > +		vsock->dgram_allow = true;
> > +
> >  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
> >  		vq = &vsock->vqs[i];
> >  		mutex_lock(&vq->mutex);
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index e73f3b2c52f1..449ed63ac2b0 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> > @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> >  		return prot->recvmsg(sk, msg, len, flags, NULL);
> >  #endif
> >  
> > -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
> > +	if (unlikely(flags & MSG_OOB))
> >  		return -EOPNOTSUPP;
> >  
> > +	if (unlikely(flags & MSG_ERRQUEUE))
> > +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
> > +
> >  	transport = vsk->transport;
> >  
> >  	/* Retrieve the head sk_buff from the socket's receive queue. */
> > 
> > -- 
> > 2.30.2
> 
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC net-next v5 13/14] virtio/vsock: implement datagram support
From: Bobby Eshleman @ 2023-08-01  4:14 UTC (permalink / raw)
  To: Arseniy Krasnov
  Cc: Bobby Eshleman, Stefan Hajnoczi, Stefano Garzarella,
	Michael S. Tsirkin, Jason Wang, Xuan Zhuo, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	VMware PV-Drivers Reviewers, Dan Carpenter, Simon Horman, kvm,
	virtualization, netdev, linux-kernel, linux-hyperv, bpf
In-Reply-To: <f04d2aa5-32d8-cdc4-3b51-f15b0f42a1e8@gmail.com>

On Thu, Jul 27, 2023 at 11:09:21AM +0300, Arseniy Krasnov wrote:
> 
> 
> On 26.07.2023 20:58, Bobby Eshleman wrote:
> > On Sat, Jul 22, 2023 at 11:45:29AM +0300, Arseniy Krasnov wrote:
> >>
> >>
> >> On 19.07.2023 03:50, Bobby Eshleman wrote:
> >>> This commit implements datagram support for virtio/vsock by teaching
> >>> virtio to use the general virtio transport ->dgram_addr_init() function
> >>> and implementing a new version of ->dgram_allow().
> >>>
> >>> Additionally, it drops virtio_transport_dgram_allow() as an exported
> >>> symbol because it is no longer used in other transports.
> >>>
> >>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> >>> ---
> >>>  include/linux/virtio_vsock.h            |  1 -
> >>>  net/vmw_vsock/virtio_transport.c        | 24 +++++++++++++++++++++++-
> >>>  net/vmw_vsock/virtio_transport_common.c |  6 ------
> >>>  3 files changed, 23 insertions(+), 8 deletions(-)
> >>>
> >>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> >>> index b3856b8a42b3..d0a4f08b12c1 100644
> >>> --- a/include/linux/virtio_vsock.h
> >>> +++ b/include/linux/virtio_vsock.h
> >>> @@ -211,7 +211,6 @@ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val);
> >>>  u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk);
> >>>  bool virtio_transport_stream_is_active(struct vsock_sock *vsk);
> >>>  bool virtio_transport_stream_allow(u32 cid, u32 port);
> >>> -bool virtio_transport_dgram_allow(u32 cid, u32 port);
> >>>  void virtio_transport_dgram_addr_init(struct sk_buff *skb,
> >>>  				      struct sockaddr_vm *addr);
> >>>  
> >>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> >>> index ac2126c7dac5..713718861bd4 100644
> >>> --- a/net/vmw_vsock/virtio_transport.c
> >>> +++ b/net/vmw_vsock/virtio_transport.c
> >>> @@ -63,6 +63,7 @@ struct virtio_vsock {
> >>>  
> >>>  	u32 guest_cid;
> >>>  	bool seqpacket_allow;
> >>> +	bool dgram_allow;
> >>>  };
> >>>  
> >>>  static u32 virtio_transport_get_local_cid(void)
> >>> @@ -413,6 +414,7 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
> >>>  	queue_work(virtio_vsock_workqueue, &vsock->rx_work);
> >>>  }
> >>>  
> >>> +static bool virtio_transport_dgram_allow(u32 cid, u32 port);
> >>
> >> May be add body here? Without prototyping? Same for loopback and vhost.
> >>
> > 
> > Sounds okay with me, but this seems to go against the pattern
> > established by seqpacket. Any reason why?
> 
> Stefano Garzarella <sgarzare@redhat.com> commented on my patch with the same approach:
> 
> https://lore.kernel.org/netdev/lex6l5suez7azhirt22lidndtjomkbagfbpvvi5p7c2t7klzas@4l2qly7at37c/
> 
> Thanks, Arseniy
> 

Gotcha, sounds good.

Thanks,
Bobby
> 
> > 
> >>>  static bool virtio_transport_seqpacket_allow(u32 remote_cid);
> >>>  
> >>>  static struct virtio_transport virtio_transport = {
> >>> @@ -430,6 +432,7 @@ static struct virtio_transport virtio_transport = {
> >>>  
> >>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
> >>>  		.dgram_allow              = virtio_transport_dgram_allow,
> >>> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
> >>>  
> >>>  		.stream_dequeue           = virtio_transport_stream_dequeue,
> >>>  		.stream_enqueue           = virtio_transport_stream_enqueue,
> >>> @@ -462,6 +465,21 @@ static struct virtio_transport virtio_transport = {
> >>>  	.send_pkt = virtio_transport_send_pkt,
> >>>  };
> >>>  
> >>> +static bool virtio_transport_dgram_allow(u32 cid, u32 port)
> >>> +{
> >>> +	struct virtio_vsock *vsock;
> >>> +	bool dgram_allow;
> >>> +
> >>> +	dgram_allow = false;
> >>> +	rcu_read_lock();
> >>> +	vsock = rcu_dereference(the_virtio_vsock);
> >>> +	if (vsock)
> >>> +		dgram_allow = vsock->dgram_allow;
> >>> +	rcu_read_unlock();
> >>> +
> >>> +	return dgram_allow;
> >>> +}
> >>> +
> >>>  static bool virtio_transport_seqpacket_allow(u32 remote_cid)
> >>>  {
> >>>  	struct virtio_vsock *vsock;
> >>> @@ -655,6 +673,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
> >>>  	if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET))
> >>>  		vsock->seqpacket_allow = true;
> >>>  
> >>> +	if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_DGRAM))
> >>> +		vsock->dgram_allow = true;
> >>> +
> >>>  	vdev->priv = vsock;
> >>>  
> >>>  	ret = virtio_vsock_vqs_init(vsock);
> >>> @@ -747,7 +768,8 @@ static struct virtio_device_id id_table[] = {
> >>>  };
> >>>  
> >>>  static unsigned int features[] = {
> >>> -	VIRTIO_VSOCK_F_SEQPACKET
> >>> +	VIRTIO_VSOCK_F_SEQPACKET,
> >>> +	VIRTIO_VSOCK_F_DGRAM
> >>>  };
> >>>  
> >>>  static struct virtio_driver virtio_vsock_driver = {
> >>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> >>> index 96118e258097..77898f5325cd 100644
> >>> --- a/net/vmw_vsock/virtio_transport_common.c
> >>> +++ b/net/vmw_vsock/virtio_transport_common.c
> >>> @@ -783,12 +783,6 @@ bool virtio_transport_stream_allow(u32 cid, u32 port)
> >>>  }
> >>>  EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
> >>>  
> >>> -bool virtio_transport_dgram_allow(u32 cid, u32 port)
> >>> -{
> >>> -	return false;
> >>> -}
> >>> -EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow);
> >>> -
> >>>  int virtio_transport_connect(struct vsock_sock *vsk)
> >>>  {
> >>>  	struct virtio_vsock_pkt_info info = {
> >>>
> >>
> >> Thanks, Arseniy
> > 
> > Thanks,
> > Bobby

^ permalink raw reply

* Re: [PATCH V4,net-next] net: mana: Add page pool for RX buffers
From: Jakub Kicinski @ 2023-08-01  0:31 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: linux-hyperv, netdev, decui, kys, paulros, olaf, vkuznets, davem,
	wei.liu, edumazet, pabeni, leon, longli, ssengar, linux-rdma,
	daniel, john.fastabend, bpf, ast, sharmaajay, hawk, tglx,
	shradhagupta, linux-kernel
In-Reply-To: <1690580767-18937-1-git-send-email-haiyangz@microsoft.com>

On Fri, 28 Jul 2023 14:46:07 -0700 Haiyang Zhang wrote:
>  static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
> -			     dma_addr_t *da, bool is_napi)
> +			     dma_addr_t *da, bool *from_pool, bool is_napi)
>  {
>  	struct page *page;
>  	void *va;
>  
> +	*from_pool = false;
> +
>  	/* Reuse XDP dropped page if available */
>  	if (rxq->xdp_save_va) {
>  		va = rxq->xdp_save_va;
> @@ -1533,17 +1543,22 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
>  			return NULL;
>  		}
>  	} else {
> -		page = dev_alloc_page();
> +		page = page_pool_dev_alloc_pages(rxq->page_pool);
>  		if (!page)
>  			return NULL;
>  
> +		*from_pool = true;
>  		va = page_to_virt(page);
>  	}
>  
>  	*da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
>  			     DMA_FROM_DEVICE);
>  	if (dma_mapping_error(dev, *da)) {
> -		put_page(virt_to_head_page(va));
> +		if (*from_pool)
> +			page_pool_put_full_page(rxq->page_pool, page, is_napi);

AFAICT you only pass the is_napi to recycle in case of error?
It's fine to always pass in false, passing true enables some
optimizations but it's not worth trying to optimize error paths.

Otherwise you may be passing in true, even tho budget was 0,
see the recently added warnings in this doc:

https://www.kernel.org/doc/html/next/networking/napi.html

In general the driver seems to be processing Rx regardless
of budget? This looks like a bug which should be fixed with
a separate patch for the net tree..
-- 
pw-bot: cr

^ permalink raw reply

* RE: [PATCH v9 00/36] x86: enable FRED for x86-64
From: Li, Xin3 @ 2023-07-31 23:56 UTC (permalink / raw)
  To: Christopherson, Sean
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-edac@vger.kernel.org, linux-hyperv@vger.kernel.org,
	kvm@vger.kernel.org, xen-devel@lists.xenproject.org,
	Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86@kernel.org, H . Peter Anvin, Lutomirski, Andy,
	Oleg Nesterov, Luck, Tony, K . Y . Srinivasan, Haiyang Zhang,
	Wei Liu, Cui, Dexuan, Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov,
	Peter Zijlstra, Gross, Jurgen, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Chatre, Reinette, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Li, Fei1, Conghui, Raj, Ashok, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	Woodhouse, David, Ostrovsky, Boris, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <ZMhBDoTxqghvF7G7@google.com>

> > Are you talking about that you only got a subset of this patch set?
> 
> No, I'm saying I don't want to waste a bunch of time tracking down exactly which
> commit a 36 patch series is based on.  E.g. I just refreshed tip/master and still
> get:
> 
> Applying: x86/idtentry: Incorporate definitions/declarations of the FRED external
> interrupt handler type
> error: sha1 information is lacking or useless (arch/x86/include/asm/idtentry.h).
> error: could not build fake ancestor
> Patch failed at 0024 x86/idtentry: Incorporate definitions/declarations of the FRED
> external interrupt handler type
> hint: Use 'git am --show-current-patch=diff' to see the failed patch

That is because the following patch set (originally from tglx) is not
merged yet:

https://lore.kernel.org/lkml/20230621171248.6805-1-xin3.li@intel.com/

Sigh, I should have mentioned it in the cover letter.

As mentioned in the cover letter, 2 patches are sent out separately
as pre-FRED patches:
https://lore.kernel.org/lkml/20230706051443.2054-1-xin3.li@intel.com/
https://lore.kernel.org/lkml/20230706052231.2183-1-xin3.li@intel.com/

Sorry it's a bit complicated.

I should mention, in case you want to try out FRED, that the current
public Intel Simics emulator has not been updated to support FRED 5.0 yet
(it only supports FRED 3.0). The update is planned for late Q3 or early Q4.

^ permalink raw reply

* Re: [PATCH v9 00/36] x86: enable FRED for x86-64
From: Sean Christopherson @ 2023-07-31 23:17 UTC (permalink / raw)
  To: Xin3 Li
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-edac@vger.kernel.org, linux-hyperv@vger.kernel.org,
	kvm@vger.kernel.org, xen-devel@lists.xenproject.org,
	Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86@kernel.org, H . Peter Anvin, Andy Lutomirski,
	Oleg Nesterov, Tony Luck, K . Y . Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov,
	Peter Zijlstra, Jurgen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei1 Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <SA1PR11MB6734A02EEFD83969F1965A8FA805A@SA1PR11MB6734.namprd11.prod.outlook.com>

On Mon, Jul 31, 2023, Xin3 Li wrote:
> > > This patch set enables the Intel flexible return and event delivery
> > > (FRED) architecture for x86-64.
> > 
> > ...
> > 
> > > --
> > > 2.34.1
> > 
> > What is this based on?
> 
> The tip tree master branch.
> 
> > FYI, you're using a version of git that will (mostly)
> > automatically generate the based, e.g. I do
> > 
> >   git format-patch --base=HEAD~$nr ...
> > 
> > in my scripts, where $nr is the number of patches I am sending.  My specific
> > approaches requires HEAD-$nr to be a publicly visible object/commit, but that
> > should be the case the vast majority of the time anyways.
> 
> Are you talking about that you only got a subset of this patch set?

No, I'm saying I don't want to waste a bunch of time tracking down exactly which
commit a 36 patch series is based on.  E.g. I just refreshed tip/master and still
get:

Applying: x86/idtentry: Incorporate definitions/declarations of the FRED external interrupt handler type
error: sha1 information is lacking or useless (arch/x86/include/asm/idtentry.h).
error: could not build fake ancestor
Patch failed at 0024 x86/idtentry: Incorporate definitions/declarations of the FRED external interrupt handler type
hint: Use 'git am --show-current-patch=diff' to see the failed patch

> HPA told me he only got patches 0-25/36.
> 
> And I got several undeliverable email notifications, saying
> "
> The following message to <tglx@linutronix.de> was undeliverable.
> The reason for the problem:
> 5.x.1 - Maximum number of delivery attempts exceeded. [Default] 450-'4.7.25 Client host rejected: cannot find your hostname, [134.134.136.31]'
> "
> 
> I guess there were some problems with the Intel mail system last night,
> probably I should resend this patch set later.

Yes, lore also appears to be missing patches.  I grabbed the mbox off of KVM's
patchwork instance.

^ permalink raw reply

* RE: [PATCH v9 00/36] x86: enable FRED for x86-64
From: Li, Xin3 @ 2023-07-31 23:10 UTC (permalink / raw)
  To: Christopherson, Sean
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-edac@vger.kernel.org, linux-hyperv@vger.kernel.org,
	kvm@vger.kernel.org, xen-devel@lists.xenproject.org,
	Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86@kernel.org, H . Peter Anvin, Lutomirski, Andy,
	Oleg Nesterov, Luck, Tony, K . Y . Srinivasan, Haiyang Zhang,
	Wei Liu, Cui, Dexuan, Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov,
	Peter Zijlstra, Gross, Jurgen, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Chatre, Reinette, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Li, Fei1, Conghui, Raj, Ashok, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	Woodhouse, David, Ostrovsky, Boris, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <ZMg1sD7IamB0INVs@google.com>

> > This patch set enables the Intel flexible return and event delivery
> > (FRED) architecture for x86-64.
> 
> ...
> 
> > --
> > 2.34.1
> 
> What is this based on?

The tip tree master branch.

> FYI, you're using a version of git that will (mostly)
> automatically generate the based, e.g. I do
> 
>   git format-patch --base=HEAD~$nr ...
> 
> in my scripts, where $nr is the number of patches I am sending.  My specific
> approaches requires HEAD-$nr to be a publicly visible object/commit, but that
> should be the case the vast majority of the time anyways.

Are you saying that you only got a subset of this patch set?

HPA told me he only got patches 0-25/36.

And I got several undeliverable email notifications, saying
"
The following message to <tglx@linutronix.de> was undeliverable.
The reason for the problem:
5.x.1 - Maximum number of delivery attempts exceeded. [Default] 450-'4.7.25 Client host rejected: cannot find your hostname, [134.134.136.31]'
"

I guess there were some problems with the Intel mail system last night,
probably I should resend this patch set later.

^ permalink raw reply

* Re: [PATCH v9 00/36] x86: enable FRED for x86-64
From: Sean Christopherson @ 2023-07-31 22:29 UTC (permalink / raw)
  To: Xin Li
  Cc: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel,
	Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Peter Zijlstra,
	Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko,
	Josh Poimboeuf, Paul E . McKenney, Catalin Marinas, Randy Dunlap,
	Steven Rostedt, Kim Phillips, Hyeonggon Yoo, Liam R . Howlett,
	Sebastian Reichel, Kirill A . Shutemov, Suren Baghdasaryan,
	Pawan Gupta, Jiaxi Chen, Babu Moger, Jim Mattson, Sandipan Das,
	Lai Jiangshan, Hans de Goede, Reinette Chatre, Daniel Sneddon,
	Breno Leitao, Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

On Sun, Jul 30, 2023, Xin Li wrote:
> This patch set enables the Intel flexible return and event delivery
> (FRED) architecture for x86-64.

...

> -- 
> 2.34.1

What is this based on?  FYI, you're using a version of git that will (mostly)
automatically generate the base, e.g. I do

  git format-patch --base=HEAD~$nr ...

in my scripts, where $nr is the number of patches I am sending.  My specific
approach requires HEAD~$nr to be a publicly visible object/commit, but that
should be the case the vast majority of the time anyways.

^ permalink raw reply

* Re: [PATCH v9 29/36] x86/fred: FRED entry/exit and dispatch code
From: H. Peter Anvin @ 2023-07-31 22:07 UTC (permalink / raw)
  To: Xin Li, linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm,
	xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Andy Lutomirski, Oleg Nesterov, Tony Luck,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731064119.3870-1-xin3.li@intel.com>

On 7/30/23 23:41, Xin Li wrote:
> +static DEFINE_FRED_HANDLER(fred_other_default)
> +{
> +	regs->vector = X86_TRAP_UD;
> +	fred_emulate_fault(regs);
> +}
> +
> +static DEFINE_FRED_HANDLER(fred_syscall)
> +{
> +	regs->orig_ax = regs->ax;
> +	regs->ax = -ENOSYS;
> +	do_syscall_64(regs, regs->orig_ax);
> +}
> +
> +#if IS_ENABLED(CONFIG_IA32_EMULATION)
> +/*
> + * Emulate SYSENTER if applicable. This is not the preferred system
> + * call in 32-bit mode under FRED, rather int $0x80 is preferred and
> + * exported in the vdso.
> + */
> +static DEFINE_FRED_HANDLER(fred_sysenter)
> +{
> +	regs->orig_ax = regs->ax;
> +	regs->ax = -ENOSYS;
> +	do_fast_syscall_32(regs);
> +}
> +#else
> +#define fred_sysenter fred_other_default
> +#endif
> +
> +static DEFINE_FRED_HANDLER(fred_other)
> +{
> +	static const fred_handler user_other_handlers[FRED_NUM_OTHER_VECTORS] =
> +	{
> +		/*
> +		 * Vector 0 of the other event type is not used
> +		 * per FRED spec 5.0.
> +		 */
> +		[0]		= fred_other_default,
> +		[FRED_SYSCALL]	= fred_syscall,
> +		[FRED_SYSENTER]	= fred_sysenter
> +	};
> +
> +	user_other_handlers[regs->vector](regs);
> +}

OK, this is wrong.

Dispatching like fred_syscall() is only valid for syscall64, which means 
you have to check regs->l is set in addition to the correct regs->vector 
to determine validity.

Similarly, sysenter is only valid if regs->l is clear.

The best way is probably to drop the dispatch table here and just do an 
if ... else if ... else statement; gcc is smart enough that it will 
combine the vector test and the L bit test into a single mask and 
compare. This also allows stubs to be inlined.

However, emulating #UD on events other than wrong mode of SYSCALL and 
SYSENTER may be a bad idea. It would probably be better to invoke 
fred_bad_event() in that case.

Something like this:

+static DEFINE_FRED_HANDLER(fred_other_default)
+{
+	regs->vector = X86_TRAP_UD;
+	fred_emulate_fault(regs);
+}

1) rename this to fred_emulate_ud (since that is what it actually does.)

... then ...

	/* The compiler can fold these into a single test */

	if (likely(regs->vector == FRED_SYSCALL && regs->l)) {
		fred_syscall64(regs);
	} else if (likely(regs->vector == FRED_SYSENTER && !regs->l)) {
		fred_sysenter32(regs);
	} else if (regs->vector == FRED_SYSCALL ||
		   regs->vector == FRED_SYSENTER) {
		/* Invalid SYSCALL or SYSENTER instruction */
		fred_emulate_ud(regs);
	} else {
		/* Unknown event */
		fred_bad_event(regs);
	}

... or the SYSCALL64 and SYSENTER32 can be inlined with the appropriate 
comment (gcc will do so regardless.)

	-hpa

^ permalink raw reply

* Re: [PATCH v9 29/36] x86/fred: FRED entry/exit and dispatch code
From: H. Peter Anvin @ 2023-07-31 21:44 UTC (permalink / raw)
  To: Xin Li, linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm,
	xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Andy Lutomirski, Oleg Nesterov, Tony Luck,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731064119.3870-1-xin3.li@intel.com>

On 7/30/23 23:41, Xin Li wrote:
> +
> +static DEFINE_FRED_HANDLER(fred_sw_interrupt_user)
> +{
> +	/*
> +	 * In compat mode INT $0x80 (32bit system call) is
> +	 * performance-critical. Handle it first.
> +	 */
> +	if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
> +	    likely(regs->vector == IA32_SYSCALL_VECTOR)) {
> +		regs->orig_ax = regs->ax;
> +		regs->ax = -ENOSYS;
> +		return do_int80_syscall_32(regs);
> +	}

We can presumably drop the early out here as well...

> +
> +	/*
> +	 * Some software exceptions can also be triggered as
> +	 * int instructions, for historical reasons.
> +	 */
> +	switch (regs->vector) {
> +	case X86_TRAP_BP:
> +	case X86_TRAP_OF:
> +		fred_emulate_trap(regs);
> +		break;
> +	default:
> +		regs->vector = X86_TRAP_GP;
> +		fred_emulate_fault(regs);
> +		break;
> +	}
> +}
> +


^ permalink raw reply

* Re: [PATCH] hv_balloon: Update the balloon driver to use the SBRM API
From: Boqun Feng @ 2023-07-31 21:25 UTC (permalink / raw)
  To: levymitchell0
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	linux-hyperv, linux-kernel, mikelly, peterz
In-Reply-To: <20230726-master-v1-1-b2ce6a4538db@gmail.com>

Hi Mitchell,

On Wed, Jul 26, 2023 at 12:23:31AM +0000, Mitchell Levy via B4 Relay wrote:
> From: Mitchell Levy <levymitchell0@gmail.com>
> 
> 
> 
> ---

I don't know whether it's a tool issue or something else, but all words
after the "---" line in the email will be discarded from a commit log.
You can try to apply this patch yourself and see the result:

	b4 shazam 20230726-master-v1-1-b2ce6a4538db@gmail.com 

> This patch is intended as a proof-of-concept for the new SBRM
> machinery[1]. For some brief background, the idea behind SBRM is using
> the __cleanup__ attribute to automatically unlock locks (or otherwise
> release resources) when they go out of scope, similar to C++ style RAII.
> This promises some benefits such as making code simpler (particularly
> where you have lots of goto fail; type constructs) as well as reducing
> the surface area for certain kinds of bugs.
> 
> The changes in this patch should not result in any difference in how the
> code actually runs (i.e., it's purely an exercise in this new syntax
> sugar). In one instance SBRM was not appropriate, so I left that part
> alone, but all other locking/unlocking is handled automatically in this
> patch.
> 
> Link: https://lore.kernel.org/all/20230626125726.GU4253@hirez.programming.kicks-ass.net/ [1]
> 
> Suggested-by: Boqun Feng <boqun.feng@gmail.com>
> Signed-off-by: "Mitchell Levy (Microsoft)" <levymitchell0@gmail.com>

Besides the above format issue, the code looks good to me, nice job!

Feel free to add:

Reviewed-by: Boqun Feng <boqun.feng@gmail.com>

Regards,
Boqun

> ---
>  drivers/hv/hv_balloon.c | 82 +++++++++++++++++++++++--------------------------
>  1 file changed, 38 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index dffcc894f117..2812601e84da 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -8,6 +8,7 @@
>  
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>  
> +#include <linux/cleanup.h>
>  #include <linux/kernel.h>
>  #include <linux/jiffies.h>
>  #include <linux/mman.h>
> @@ -646,7 +647,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
>  			      void *v)
>  {
>  	struct memory_notify *mem = (struct memory_notify *)v;
> -	unsigned long flags, pfn_count;
> +	unsigned long pfn_count;
>  
>  	switch (val) {
>  	case MEM_ONLINE:
> @@ -655,21 +656,22 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
>  		break;
>  
>  	case MEM_OFFLINE:
> -		spin_lock_irqsave(&dm_device.ha_lock, flags);
> -		pfn_count = hv_page_offline_check(mem->start_pfn,
> -						  mem->nr_pages);
> -		if (pfn_count <= dm_device.num_pages_onlined) {
> -			dm_device.num_pages_onlined -= pfn_count;
> -		} else {
> -			/*
> -			 * We're offlining more pages than we managed to online.
> -			 * This is unexpected. In any case don't let
> -			 * num_pages_onlined wrap around zero.
> -			 */
> -			WARN_ON_ONCE(1);
> -			dm_device.num_pages_onlined = 0;
> +		scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
> +			pfn_count = hv_page_offline_check(mem->start_pfn,
> +							  mem->nr_pages);
> +			if (pfn_count <= dm_device.num_pages_onlined) {
> +				dm_device.num_pages_onlined -= pfn_count;
> +			} else {
> +				/*
> +				 * We're offlining more pages than we
> +				 * managed to online. This is
> +				 * unexpected. In any case don't let
> +				 * num_pages_onlined wrap around zero.
> +				 */
> +				WARN_ON_ONCE(1);
> +				dm_device.num_pages_onlined = 0;
> +			}
>  		}
> -		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
>  		break;
>  	case MEM_GOING_ONLINE:
>  	case MEM_GOING_OFFLINE:
> @@ -721,24 +723,23 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
>  	unsigned long start_pfn;
>  	unsigned long processed_pfn;
>  	unsigned long total_pfn = pfn_count;
> -	unsigned long flags;
>  
>  	for (i = 0; i < (size/HA_CHUNK); i++) {
>  		start_pfn = start + (i * HA_CHUNK);
>  
> -		spin_lock_irqsave(&dm_device.ha_lock, flags);
> -		has->ha_end_pfn +=  HA_CHUNK;
> +		scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
> +			has->ha_end_pfn +=  HA_CHUNK;
>  
> -		if (total_pfn > HA_CHUNK) {
> -			processed_pfn = HA_CHUNK;
> -			total_pfn -= HA_CHUNK;
> -		} else {
> -			processed_pfn = total_pfn;
> -			total_pfn = 0;
> -		}
> +			if (total_pfn > HA_CHUNK) {
> +				processed_pfn = HA_CHUNK;
> +				total_pfn -= HA_CHUNK;
> +			} else {
> +				processed_pfn = total_pfn;
> +				total_pfn = 0;
> +			}
>  
> -		has->covered_end_pfn +=  processed_pfn;
> -		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +			has->covered_end_pfn +=  processed_pfn;
> +		}
>  
>  		reinit_completion(&dm_device.ol_waitevent);
>  
> @@ -758,10 +759,10 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
>  				 */
>  				do_hot_add = false;
>  			}
> -			spin_lock_irqsave(&dm_device.ha_lock, flags);
> -			has->ha_end_pfn -= HA_CHUNK;
> -			has->covered_end_pfn -=  processed_pfn;
> -			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +			scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
> +				has->ha_end_pfn -= HA_CHUNK;
> +				has->covered_end_pfn -=  processed_pfn;
> +			}
>  			break;
>  		}
>  
> @@ -781,10 +782,9 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
>  static void hv_online_page(struct page *pg, unsigned int order)
>  {
>  	struct hv_hotadd_state *has;
> -	unsigned long flags;
>  	unsigned long pfn = page_to_pfn(pg);
>  
> -	spin_lock_irqsave(&dm_device.ha_lock, flags);
> +	guard(spinlock_irqsave)(&dm_device.ha_lock);
>  	list_for_each_entry(has, &dm_device.ha_region_list, list) {
>  		/* The page belongs to a different HAS. */
>  		if ((pfn < has->start_pfn) ||
> @@ -794,7 +794,6 @@ static void hv_online_page(struct page *pg, unsigned int order)
>  		hv_bring_pgs_online(has, pfn, 1UL << order);
>  		break;
>  	}
> -	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
>  }
>  
>  static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
> @@ -803,9 +802,8 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
>  	struct hv_hotadd_gap *gap;
>  	unsigned long residual, new_inc;
>  	int ret = 0;
> -	unsigned long flags;
>  
> -	spin_lock_irqsave(&dm_device.ha_lock, flags);
> +	guard(spinlock_irqsave)(&dm_device.ha_lock);
>  	list_for_each_entry(has, &dm_device.ha_region_list, list) {
>  		/*
>  		 * If the pfn range we are dealing with is not in the current
> @@ -852,7 +850,6 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
>  		ret = 1;
>  		break;
>  	}
> -	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
>  
>  	return ret;
>  }
> @@ -947,7 +944,6 @@ static unsigned long process_hot_add(unsigned long pg_start,
>  {
>  	struct hv_hotadd_state *ha_region = NULL;
>  	int covered;
> -	unsigned long flags;
>  
>  	if (pfn_cnt == 0)
>  		return 0;
> @@ -979,9 +975,9 @@ static unsigned long process_hot_add(unsigned long pg_start,
>  		ha_region->covered_end_pfn = pg_start;
>  		ha_region->end_pfn = rg_start + rg_size;
>  
> -		spin_lock_irqsave(&dm_device.ha_lock, flags);
> -		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
> -		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +		scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
> +			list_add_tail(&ha_region->list, &dm_device.ha_region_list);
> +		}
>  	}
>  
>  do_pg_range:
> @@ -2047,7 +2043,6 @@ static void balloon_remove(struct hv_device *dev)
>  	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
>  	struct hv_hotadd_state *has, *tmp;
>  	struct hv_hotadd_gap *gap, *tmp_gap;
> -	unsigned long flags;
>  
>  	if (dm->num_pages_ballooned != 0)
>  		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
> @@ -2073,7 +2068,7 @@ static void balloon_remove(struct hv_device *dev)
>  #endif
>  	}
>  
> -	spin_lock_irqsave(&dm_device.ha_lock, flags);
> +	guard(spinlock_irqsave)(&dm_device.ha_lock);
>  	list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
>  		list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
>  			list_del(&gap->list);
> @@ -2082,7 +2077,6 @@ static void balloon_remove(struct hv_device *dev)
>  		list_del(&has->list);
>  		kfree(has);
>  	}
> -	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
>  }
>  
>  static int balloon_suspend(struct hv_device *hv_dev)
> 
> ---
> base-commit: 3f01e9fed8454dcd89727016c3e5b2fbb8f8e50c
> change-id: 20230725-master-bbcd9205758b
> 
> Best regards,
> -- 
> Mitchell Levy <levymitchell0@gmail.com>
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox