* [PATCH 1/5] x86/xen: Move Xen upcall handler
2025-03-13 18:22 [PATCH 0/5] x86/entry: Break up common.c Brian Gerst
@ 2025-03-13 18:22 ` Brian Gerst
2025-03-14 7:37 ` Juergen Gross
2025-03-14 9:47 ` [tip: x86/cpu] x86/xen: Move Xen upcall handler to Xen specific code files tip-bot2 for Brian Gerst
2025-03-13 18:22 ` [PATCH 2/5] x86/syscall/32: Move 32-bit syscall dispatch code Brian Gerst
` (3 subsequent siblings)
4 siblings, 2 replies; 22+ messages in thread
From: Brian Gerst @ 2025-03-13 18:22 UTC (permalink / raw)
To: linux-kernel, x86
Cc: Ingo Molnar, H . Peter Anvin, Thomas Gleixner, Borislav Petkov,
Andy Lutomirski, Juergen Gross, Boris Ostrovsky, Brian Gerst
Move the upcall handler to Xen-specific files.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
arch/x86/entry/common.c | 72 -------------------------------------
arch/x86/xen/enlighten_pv.c | 46 ++++++++++++++++++++++++
include/xen/xen-ops.h | 19 ++++++++++
3 files changed, 65 insertions(+), 72 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3514bf2978ee..ce4d88eda693 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -21,11 +21,6 @@
#include <linux/uaccess.h>
#include <linux/init.h>
-#ifdef CONFIG_XEN_PV
-#include <xen/xen-ops.h>
-#include <xen/events.h>
-#endif
-
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -455,70 +450,3 @@ SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
}
-
-#ifdef CONFIG_XEN_PV
-#ifndef CONFIG_PREEMPTION
-/*
- * Some hypercalls issued by the toolstack can take many 10s of
- * seconds. Allow tasks running hypercalls via the privcmd driver to
- * be voluntarily preempted even if full kernel preemption is
- * disabled.
- *
- * Such preemptible hypercalls are bracketed by
- * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
- * calls.
- */
-DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
-EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
-
-/*
- * In case of scheduling the flag must be cleared and restored after
- * returning from schedule as the task might move to a different CPU.
- */
-static __always_inline bool get_and_clear_inhcall(void)
-{
- bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
-
- __this_cpu_write(xen_in_preemptible_hcall, false);
- return inhcall;
-}
-
-static __always_inline void restore_inhcall(bool inhcall)
-{
- __this_cpu_write(xen_in_preemptible_hcall, inhcall);
-}
-#else
-static __always_inline bool get_and_clear_inhcall(void) { return false; }
-static __always_inline void restore_inhcall(bool inhcall) { }
-#endif
-
-static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- inc_irq_stat(irq_hv_callback_count);
-
- xen_evtchn_do_upcall();
-
- set_irq_regs(old_regs);
-}
-
-__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
-{
- irqentry_state_t state = irqentry_enter(regs);
- bool inhcall;
-
- instrumentation_begin();
- run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
-
- inhcall = get_and_clear_inhcall();
- if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
- irqentry_exit_cond_resched();
- instrumentation_end();
- restore_inhcall(inhcall);
- } else {
- instrumentation_end();
- irqentry_exit(regs, state);
- }
-}
-#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5e57835e999d..af9e43c47b07 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -73,6 +73,7 @@
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
+#include <asm/irq_stack.h>
#ifdef CONFIG_X86_IOPL_IOPERM
#include <asm/io_bitmap.h>
#endif
@@ -94,6 +95,21 @@ void *xen_initial_gdt;
static int xen_cpu_up_prepare_pv(unsigned int cpu);
static int xen_cpu_dead_pv(unsigned int cpu);
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+#endif
+
struct tls_descs {
struct desc_struct desc[3];
};
@@ -687,6 +703,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
}
#endif
+static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_evtchn_do_upcall();
+
+ set_irq_regs(old_regs);
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ bool inhcall;
+
+ instrumentation_begin();
+ run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
+
+ inhcall = get_and_clear_inhcall();
+ if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
+ irqentry_exit_cond_resched();
+ instrumentation_end();
+ restore_inhcall(inhcall);
+ } else {
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ }
+}
+
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 47f11bec5e90..174ef8e4600f 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -208,10 +208,29 @@ static inline void xen_preemptible_hcall_end(void)
__this_cpu_write(xen_in_preemptible_hcall, false);
}
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+
#else
static inline void xen_preemptible_hcall_begin(void) { }
static inline void xen_preemptible_hcall_end(void) { }
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
#endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */
--
2.48.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH 1/5] x86/xen: Move Xen upcall handler
2025-03-13 18:22 ` [PATCH 1/5] x86/xen: Move Xen upcall handler Brian Gerst
@ 2025-03-14 7:37 ` Juergen Gross
2025-03-14 9:47 ` [tip: x86/cpu] x86/xen: Move Xen upcall handler to Xen specific code files tip-bot2 for Brian Gerst
1 sibling, 0 replies; 22+ messages in thread
From: Juergen Gross @ 2025-03-14 7:37 UTC (permalink / raw)
To: Brian Gerst, linux-kernel, x86
Cc: Ingo Molnar, H . Peter Anvin, Thomas Gleixner, Borislav Petkov,
Andy Lutomirski, Boris Ostrovsky
[-- Attachment #1.1.1: Type: text/plain, Size: 6585 bytes --]
On 13.03.25 19:22, Brian Gerst wrote:
> Move the upcall handler to Xen-specific files.
>
> No functional changes.
>
> Signed-off-by: Brian Gerst <brgerst@gmail.com>
> Cc: Juergen Gross <jgross@suse.com>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> ---
> arch/x86/entry/common.c | 72 -------------------------------------
> arch/x86/xen/enlighten_pv.c | 46 ++++++++++++++++++++++++
> include/xen/xen-ops.h | 19 ++++++++++
> 3 files changed, 65 insertions(+), 72 deletions(-)
>
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 3514bf2978ee..ce4d88eda693 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -21,11 +21,6 @@
> #include <linux/uaccess.h>
> #include <linux/init.h>
>
> -#ifdef CONFIG_XEN_PV
> -#include <xen/xen-ops.h>
> -#include <xen/events.h>
> -#endif
> -
> #include <asm/apic.h>
> #include <asm/desc.h>
> #include <asm/traps.h>
> @@ -455,70 +450,3 @@ SYSCALL_DEFINE0(ni_syscall)
> {
> return -ENOSYS;
> }
> -
> -#ifdef CONFIG_XEN_PV
> -#ifndef CONFIG_PREEMPTION
> -/*
> - * Some hypercalls issued by the toolstack can take many 10s of
> - * seconds. Allow tasks running hypercalls via the privcmd driver to
> - * be voluntarily preempted even if full kernel preemption is
> - * disabled.
> - *
> - * Such preemptible hypercalls are bracketed by
> - * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
> - * calls.
> - */
> -DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
> -EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
> -
> -/*
> - * In case of scheduling the flag must be cleared and restored after
> - * returning from schedule as the task might move to a different CPU.
> - */
> -static __always_inline bool get_and_clear_inhcall(void)
> -{
> - bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
> -
> - __this_cpu_write(xen_in_preemptible_hcall, false);
> - return inhcall;
> -}
> -
> -static __always_inline void restore_inhcall(bool inhcall)
> -{
> - __this_cpu_write(xen_in_preemptible_hcall, inhcall);
> -}
> -#else
> -static __always_inline bool get_and_clear_inhcall(void) { return false; }
> -static __always_inline void restore_inhcall(bool inhcall) { }
> -#endif
> -
> -static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> -{
> - struct pt_regs *old_regs = set_irq_regs(regs);
> -
> - inc_irq_stat(irq_hv_callback_count);
> -
> - xen_evtchn_do_upcall();
> -
> - set_irq_regs(old_regs);
> -}
> -
> -__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> -{
> - irqentry_state_t state = irqentry_enter(regs);
> - bool inhcall;
> -
> - instrumentation_begin();
> - run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
> -
> - inhcall = get_and_clear_inhcall();
> - if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
> - irqentry_exit_cond_resched();
> - instrumentation_end();
> - restore_inhcall(inhcall);
> - } else {
> - instrumentation_end();
> - irqentry_exit(regs, state);
> - }
> -}
> -#endif /* CONFIG_XEN_PV */
> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
> index 5e57835e999d..af9e43c47b07 100644
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -73,6 +73,7 @@
> #include <asm/mwait.h>
> #include <asm/pci_x86.h>
> #include <asm/cpu.h>
> +#include <asm/irq_stack.h>
> #ifdef CONFIG_X86_IOPL_IOPERM
> #include <asm/io_bitmap.h>
> #endif
> @@ -94,6 +95,21 @@ void *xen_initial_gdt;
> static int xen_cpu_up_prepare_pv(unsigned int cpu);
> static int xen_cpu_dead_pv(unsigned int cpu);
>
> +#ifndef CONFIG_PREEMPTION
> +/*
> + * Some hypercalls issued by the toolstack can take many 10s of
> + * seconds. Allow tasks running hypercalls via the privcmd driver to
> + * be voluntarily preempted even if full kernel preemption is
> + * disabled.
> + *
> + * Such preemptible hypercalls are bracketed by
> + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
> + * calls.
> + */
> +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
> +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
> +#endif
> +
> struct tls_descs {
> struct desc_struct desc[3];
> };
> @@ -687,6 +703,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
> }
> #endif
>
> +static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> +{
> + struct pt_regs *old_regs = set_irq_regs(regs);
> +
> + inc_irq_stat(irq_hv_callback_count);
> +
> + xen_evtchn_do_upcall();
> +
> + set_irq_regs(old_regs);
> +}
> +
> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> +{
> + irqentry_state_t state = irqentry_enter(regs);
> + bool inhcall;
> +
> + instrumentation_begin();
> + run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
> +
> + inhcall = get_and_clear_inhcall();
> + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
> + irqentry_exit_cond_resched();
> + instrumentation_end();
> + restore_inhcall(inhcall);
> + } else {
> + instrumentation_end();
> + irqentry_exit(regs, state);
> + }
> +}
> +
> struct trap_array_entry {
> void (*orig)(void);
> void (*xen)(void);
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index 47f11bec5e90..174ef8e4600f 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -208,10 +208,29 @@ static inline void xen_preemptible_hcall_end(void)
> __this_cpu_write(xen_in_preemptible_hcall, false);
> }
>
> +/*
> + * In case of scheduling the flag must be cleared and restored after
> + * returning from schedule as the task might move to a different CPU.
> + */
> +static __always_inline bool get_and_clear_inhcall(void)
> +{
> + bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
> +
> + __this_cpu_write(xen_in_preemptible_hcall, false);
> + return inhcall;
> +}
> +
> +static __always_inline void restore_inhcall(bool inhcall)
> +{
> + __this_cpu_write(xen_in_preemptible_hcall, inhcall);
> +}
> +
> #else
>
> static inline void xen_preemptible_hcall_begin(void) { }
> static inline void xen_preemptible_hcall_end(void) { }
> +static __always_inline bool get_and_clear_inhcall(void) { return false; }
> +static __always_inline void restore_inhcall(bool inhcall) { }
>
> #endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */
>
I don't see a reason to put those two functions into xen-ops.h, as
they are used by xen_pv_evtchn_do_upcall() only.
Please move them to enlighten_pv.c, too.
Juergen
[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]
^ permalink raw reply [flat|nested] 22+ messages in thread
* [tip: x86/cpu] x86/xen: Move Xen upcall handler to Xen specific code files
2025-03-13 18:22 ` [PATCH 1/5] x86/xen: Move Xen upcall handler Brian Gerst
2025-03-14 7:37 ` Juergen Gross
@ 2025-03-14 9:47 ` tip-bot2 for Brian Gerst
2025-03-14 9:53 ` Jürgen Groß
1 sibling, 1 reply; 22+ messages in thread
From: tip-bot2 for Brian Gerst @ 2025-03-14 9:47 UTC (permalink / raw)
To: linux-tip-commits
Cc: Brian Gerst, Ingo Molnar, Sohil Mehta, Andy Lutomirski,
Juergen Gross, H. Peter Anvin, Linus Torvalds, Josh Poimboeuf,
x86, linux-kernel
The following commit has been merged into the x86/cpu branch of tip:
Commit-ID: 827dc2e36172e978d6b1c701b04bee56881f54bf
Gitweb: https://git.kernel.org/tip/827dc2e36172e978d6b1c701b04bee56881f54bf
Author: Brian Gerst <brgerst@gmail.com>
AuthorDate: Thu, 13 Mar 2025 14:22:32 -04:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00
x86/xen: Move Xen upcall handler to Xen specific code files
Move the upcall handler to Xen-specific files.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250313182236.655724-2-brgerst@gmail.com
---
arch/x86/entry/common.c | 72 +------------------------------------
arch/x86/xen/enlighten_pv.c | 46 +++++++++++++++++++++++-
include/xen/xen-ops.h | 19 ++++++++++-
3 files changed, 65 insertions(+), 72 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3514bf2..ce4d88e 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -21,11 +21,6 @@
#include <linux/uaccess.h>
#include <linux/init.h>
-#ifdef CONFIG_XEN_PV
-#include <xen/xen-ops.h>
-#include <xen/events.h>
-#endif
-
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -455,70 +450,3 @@ SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
}
-
-#ifdef CONFIG_XEN_PV
-#ifndef CONFIG_PREEMPTION
-/*
- * Some hypercalls issued by the toolstack can take many 10s of
- * seconds. Allow tasks running hypercalls via the privcmd driver to
- * be voluntarily preempted even if full kernel preemption is
- * disabled.
- *
- * Such preemptible hypercalls are bracketed by
- * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
- * calls.
- */
-DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
-EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
-
-/*
- * In case of scheduling the flag must be cleared and restored after
- * returning from schedule as the task might move to a different CPU.
- */
-static __always_inline bool get_and_clear_inhcall(void)
-{
- bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
-
- __this_cpu_write(xen_in_preemptible_hcall, false);
- return inhcall;
-}
-
-static __always_inline void restore_inhcall(bool inhcall)
-{
- __this_cpu_write(xen_in_preemptible_hcall, inhcall);
-}
-#else
-static __always_inline bool get_and_clear_inhcall(void) { return false; }
-static __always_inline void restore_inhcall(bool inhcall) { }
-#endif
-
-static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- inc_irq_stat(irq_hv_callback_count);
-
- xen_evtchn_do_upcall();
-
- set_irq_regs(old_regs);
-}
-
-__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
-{
- irqentry_state_t state = irqentry_enter(regs);
- bool inhcall;
-
- instrumentation_begin();
- run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
-
- inhcall = get_and_clear_inhcall();
- if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
- irqentry_exit_cond_resched();
- instrumentation_end();
- restore_inhcall(inhcall);
- } else {
- instrumentation_end();
- irqentry_exit(regs, state);
- }
-}
-#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5e57835..af9e43c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -73,6 +73,7 @@
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
+#include <asm/irq_stack.h>
#ifdef CONFIG_X86_IOPL_IOPERM
#include <asm/io_bitmap.h>
#endif
@@ -94,6 +95,21 @@ void *xen_initial_gdt;
static int xen_cpu_up_prepare_pv(unsigned int cpu);
static int xen_cpu_dead_pv(unsigned int cpu);
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+#endif
+
struct tls_descs {
struct desc_struct desc[3];
};
@@ -687,6 +703,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
}
#endif
+static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_evtchn_do_upcall();
+
+ set_irq_regs(old_regs);
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ bool inhcall;
+
+ instrumentation_begin();
+ run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
+
+ inhcall = get_and_clear_inhcall();
+ if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
+ irqentry_exit_cond_resched();
+ instrumentation_end();
+ restore_inhcall(inhcall);
+ } else {
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ }
+}
+
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 47f11be..174ef8e 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -208,10 +208,29 @@ static inline void xen_preemptible_hcall_end(void)
__this_cpu_write(xen_in_preemptible_hcall, false);
}
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+
#else
static inline void xen_preemptible_hcall_begin(void) { }
static inline void xen_preemptible_hcall_end(void) { }
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
#endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [tip: x86/cpu] x86/xen: Move Xen upcall handler to Xen specific code files
2025-03-14 9:47 ` [tip: x86/cpu] x86/xen: Move Xen upcall handler to Xen specific code files tip-bot2 for Brian Gerst
@ 2025-03-14 9:53 ` Jürgen Groß
2025-03-14 10:08 ` Ingo Molnar
0 siblings, 1 reply; 22+ messages in thread
From: Jürgen Groß @ 2025-03-14 9:53 UTC (permalink / raw)
To: linux-kernel, linux-tip-commits
Cc: Brian Gerst, Ingo Molnar, Sohil Mehta, Andy Lutomirski,
H. Peter Anvin, Linus Torvalds, Josh Poimboeuf, x86
[-- Attachment #1.1.1: Type: text/plain, Size: 1360 bytes --]
On 14.03.25 10:47, tip-bot2 for Brian Gerst wrote:
> The following commit has been merged into the x86/cpu branch of tip:
>
> Commit-ID: 827dc2e36172e978d6b1c701b04bee56881f54bf
> Gitweb: https://git.kernel.org/tip/827dc2e36172e978d6b1c701b04bee56881f54bf
> Author: Brian Gerst <brgerst@gmail.com>
> AuthorDate: Thu, 13 Mar 2025 14:22:32 -04:00
> Committer: Ingo Molnar <mingo@kernel.org>
> CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00
>
> x86/xen: Move Xen upcall handler to Xen specific code files
>
> Move the upcall handler to Xen-specific files.
>
> No functional changes.
>
> Signed-off-by: Brian Gerst <brgerst@gmail.com>
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
> Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Juergen Gross <jgross@suse.com>
> Cc: H. Peter Anvin <hpa@zytor.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Josh Poimboeuf <jpoimboe@redhat.com>
> Link: https://lore.kernel.org/r/20250313182236.655724-2-brgerst@gmail.com
Why do I even request changes if such a request is being ignored?
Please note that my request wasn't about something which should be handled
in a followup patch. I was asking to NOT move the code into multiple files,
but to keep it in one file as it was originally.
Juergen
[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [tip: x86/cpu] x86/xen: Move Xen upcall handler to Xen specific code files
2025-03-14 9:53 ` Jürgen Groß
@ 2025-03-14 10:08 ` Ingo Molnar
2025-03-14 10:14 ` Jürgen Groß
0 siblings, 1 reply; 22+ messages in thread
From: Ingo Molnar @ 2025-03-14 10:08 UTC (permalink / raw)
To: Jürgen Groß
Cc: linux-kernel, linux-tip-commits, Brian Gerst, Sohil Mehta,
Andy Lutomirski, H. Peter Anvin, Linus Torvalds, Josh Poimboeuf,
x86
* Jürgen Groß <jgross@suse.com> wrote:
> On 14.03.25 10:47, tip-bot2 for Brian Gerst wrote:
> > The following commit has been merged into the x86/cpu branch of tip:
> >
> > Commit-ID: 827dc2e36172e978d6b1c701b04bee56881f54bf
> > Gitweb: https://git.kernel.org/tip/827dc2e36172e978d6b1c701b04bee56881f54bf
> > Author: Brian Gerst <brgerst@gmail.com>
> > AuthorDate: Thu, 13 Mar 2025 14:22:32 -04:00
> > Committer: Ingo Molnar <mingo@kernel.org>
> > CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00
> >
> > x86/xen: Move Xen upcall handler to Xen specific code files
> >
> > Move the upcall handler to Xen-specific files.
> >
> > No functional changes.
> >
> > Signed-off-by: Brian Gerst <brgerst@gmail.com>
> > Signed-off-by: Ingo Molnar <mingo@kernel.org>
> > Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
> > Cc: Andy Lutomirski <luto@kernel.org>
> > Cc: Juergen Gross <jgross@suse.com>
> > Cc: H. Peter Anvin <hpa@zytor.com>
> > Cc: Linus Torvalds <torvalds@linux-foundation.org>
> > Cc: Josh Poimboeuf <jpoimboe@redhat.com>
> > Link: https://lore.kernel.org/r/20250313182236.655724-2-brgerst@gmail.com
>
> Why do I even request changes if such a request is being ignored?
I missed your mail, sorry.
> Please note that my request wasn't about something which should be
> handled in a followup patch. I was asking to NOT move the code into
> multiple files, but to keep it in one file as it was originally.
I agree with you that this code looks better in enlighten_pv.c, but
there's no reason to keep arch/x86/entry/common.c, agreed?
I've rolled back these changes and will wait for -v2.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [tip: x86/cpu] x86/xen: Move Xen upcall handler to Xen specific code files
2025-03-14 10:08 ` Ingo Molnar
@ 2025-03-14 10:14 ` Jürgen Groß
0 siblings, 0 replies; 22+ messages in thread
From: Jürgen Groß @ 2025-03-14 10:14 UTC (permalink / raw)
To: Ingo Molnar
Cc: linux-kernel, linux-tip-commits, Brian Gerst, Sohil Mehta,
Andy Lutomirski, H. Peter Anvin, Linus Torvalds, Josh Poimboeuf,
x86
[-- Attachment #1.1.1: Type: text/plain, Size: 1772 bytes --]
On 14.03.25 11:08, Ingo Molnar wrote:
>
> * Jürgen Groß <jgross@suse.com> wrote:
>
>> On 14.03.25 10:47, tip-bot2 for Brian Gerst wrote:
>>> The following commit has been merged into the x86/cpu branch of tip:
>>>
>>> Commit-ID: 827dc2e36172e978d6b1c701b04bee56881f54bf
>>> Gitweb: https://git.kernel.org/tip/827dc2e36172e978d6b1c701b04bee56881f54bf
>>> Author: Brian Gerst <brgerst@gmail.com>
>>> AuthorDate: Thu, 13 Mar 2025 14:22:32 -04:00
>>> Committer: Ingo Molnar <mingo@kernel.org>
>>> CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00
>>>
>>> x86/xen: Move Xen upcall handler to Xen specific code files
>>>
>>> Move the upcall handler to Xen-specific files.
>>>
>>> No functional changes.
>>>
>>> Signed-off-by: Brian Gerst <brgerst@gmail.com>
>>> Signed-off-by: Ingo Molnar <mingo@kernel.org>
>>> Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
>>> Cc: Andy Lutomirski <luto@kernel.org>
>>> Cc: Juergen Gross <jgross@suse.com>
>>> Cc: H. Peter Anvin <hpa@zytor.com>
>>> Cc: Linus Torvalds <torvalds@linux-foundation.org>
>>> Cc: Josh Poimboeuf <jpoimboe@redhat.com>
>>> Link: https://lore.kernel.org/r/20250313182236.655724-2-brgerst@gmail.com
>>
>> Why do I even request changes if such a request is being ignored?
>
> I missed your mail, sorry.
>
>> Please note that my request wasn't about something which should be
>> handled in a followup patch. I was asking to NOT move the code into
>> multiple files, but to keep it in one file as it was originally.
>
> I agree with you that this code looks better in enlighten_pv.c, but
> there's no reason to keep arch/x86/entry/common.c, agreed?
Absolutely.
>
> I've rolled back these changes and will wait for -v2.
Thanks
Juergen
[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 2/5] x86/syscall/32: Move 32-bit syscall dispatch code
2025-03-13 18:22 [PATCH 0/5] x86/entry: Break up common.c Brian Gerst
2025-03-13 18:22 ` [PATCH 1/5] x86/xen: Move Xen upcall handler Brian Gerst
@ 2025-03-13 18:22 ` Brian Gerst
2025-03-13 23:44 ` Sohil Mehta
2025-03-14 9:46 ` [tip: x86/cpu] x86/syscall/32: Move the 32-bit syscall dispatch code to arch/x86/entry/syscall_32.c tip-bot2 for Brian Gerst
2025-03-13 18:22 ` [PATCH 3/5] x86/syscall/64: Move 64-bit syscall dispatch code Brian Gerst
` (2 subsequent siblings)
4 siblings, 2 replies; 22+ messages in thread
From: Brian Gerst @ 2025-03-13 18:22 UTC (permalink / raw)
To: linux-kernel, x86
Cc: Ingo Molnar, H . Peter Anvin, Thomas Gleixner, Borislav Petkov,
Andy Lutomirski, Juergen Gross, Boris Ostrovsky, Brian Gerst
Move the 32-bit syscall dispatch code to syscall_32.c.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
arch/x86/entry/Makefile | 2 +
arch/x86/entry/common.c | 321 ----------------------------------
arch/x86/entry/syscall_32.c | 336 +++++++++++++++++++++++++++++++++++-
3 files changed, 336 insertions(+), 323 deletions(-)
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index ce1cc1622385..96a6b86e0a8b 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -8,8 +8,10 @@ UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
CFLAGS_common.o += -fno-stack-protector
+CFLAGS_syscall_32.o += -fno-stack-protector
obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index ce4d88eda693..183efabefe57 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -125,327 +125,6 @@ __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
}
#endif
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-static __always_inline int syscall_32_enter(struct pt_regs *regs)
-{
- if (IS_ENABLED(CONFIG_IA32_EMULATION))
- current_thread_info()->status |= TS_COMPAT;
-
- return (int)regs->orig_ax;
-}
-
-#ifdef CONFIG_IA32_EMULATION
-bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
-
-static int __init ia32_emulation_override_cmdline(char *arg)
-{
- return kstrtobool(arg, &__ia32_enabled);
-}
-early_param("ia32_emulation", ia32_emulation_override_cmdline);
-#endif
-
-/*
- * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
- */
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
-{
- /*
- * Convert negative numbers to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int unr = nr;
-
- if (likely(unr < IA32_NR_syscalls)) {
- unr = array_index_nospec(unr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call(regs, unr);
- } else if (nr != -1) {
- regs->ax = __ia32_sys_ni_syscall(regs);
- }
-}
-
-#ifdef CONFIG_IA32_EMULATION
-static __always_inline bool int80_is_external(void)
-{
- const unsigned int offs = (0x80 / 32) * 0x10;
- const u32 bit = BIT(0x80 % 32);
-
- /* The local APIC on XENPV guests is fake */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /*
- * If vector 0x80 is set in the APIC ISR then this is an external
- * interrupt. Either from broken hardware or injected by a VMM.
- *
- * Note: In guest mode this is only valid for secure guests where
- * the secure module fully controls the vAPIC exposed to the guest.
- */
- return apic_read(APIC_ISR + offs) & bit;
-}
-
-/**
- * do_int80_emulation - 32-bit legacy syscall C entry from asm
- * @regs: syscall arguments in struct pt_args on the stack.
- *
- * This entry point can be used by 32-bit and 64-bit programs to perform
- * 32-bit system calls. Instances of INT $0x80 can be found inline in
- * various programs and libraries. It is also used by the vDSO's
- * __kernel_vsyscall fallback for hardware that doesn't support a faster
- * entry method. Restarted 32-bit system calls also fall back to INT
- * $0x80 regardless of what instruction was originally used to do the
- * system call.
- *
- * This is considered a slow path. It is not used by most libc
- * implementations on modern hardware except during process startup.
- *
- * The arguments for the INT $0x80 based syscall are on stack in the
- * pt_regs structure:
- * eax: system call number
- * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
- */
-__visible noinstr void do_int80_emulation(struct pt_regs *regs)
-{
- int nr;
-
- /* Kernel does not use INT $0x80! */
- if (unlikely(!user_mode(regs))) {
- irqentry_enter(regs);
- instrumentation_begin();
- panic("Unexpected external interrupt 0x80\n");
- }
-
- /*
- * Establish kernel context for instrumentation, including for
- * int80_is_external() below which calls into the APIC driver.
- * Identical for soft and external interrupts.
- */
- enter_from_user_mode(regs);
-
- instrumentation_begin();
- add_random_kstack_offset();
-
- /* Validate that this is a soft interrupt to the extent possible */
- if (unlikely(int80_is_external()))
- panic("Unexpected external interrupt 0x80\n");
-
- /*
- * The low level idtentry code pushed -1 into regs::orig_ax
- * and regs::ax contains the syscall number.
- *
- * User tracing code (ptrace or signal handlers) might assume
- * that the regs::orig_ax contains a 32-bit number on invoking
- * a 32-bit syscall.
- *
- * Establish the syscall convention by saving the 32bit truncated
- * syscall number in regs::orig_ax and by invalidating regs::ax.
- */
- regs->orig_ax = regs->ax & GENMASK(31, 0);
- regs->ax = -ENOSYS;
-
- nr = syscall_32_enter(regs);
-
- local_irq_enable();
- nr = syscall_enter_from_user_mode_work(regs, nr);
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-
-#ifdef CONFIG_X86_FRED
-/*
- * A FRED-specific INT80 handler is warranted for the follwing reasons:
- *
- * 1) As INT instructions and hardware interrupts are separate event
- * types, FRED does not preclude the use of vector 0x80 for external
- * interrupts. As a result, the FRED setup code does not reserve
- * vector 0x80 and calling int80_is_external() is not merely
- * suboptimal but actively incorrect: it could cause a system call
- * to be incorrectly ignored.
- *
- * 2) It is called only for handling vector 0x80 of event type
- * EVENT_TYPE_SWINT and will never be called to handle any external
- * interrupt (event type EVENT_TYPE_EXTINT).
- *
- * 3) FRED has separate entry flows depending on if the event came from
- * user space or kernel space, and because the kernel does not use
- * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
- * falls through to fred_bad_type() if the event type is
- * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
- * an INT insn, it can only be from a user level.
- *
- * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
- * likely take a different approach if it is ever needed: it
- * probably belongs in either fred_intx()/ fred_other() or
- * asm_fred_entrypoint_user(), depending on if this ought to be done
- * for all entries from userspace or only system
- * calls.
- *
- * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
- */
-DEFINE_FREDENTRY_RAW(int80_emulation)
-{
- int nr;
-
- enter_from_user_mode(regs);
-
- instrumentation_begin();
- add_random_kstack_offset();
-
- /*
- * FRED pushed 0 into regs::orig_ax and regs::ax contains the
- * syscall number.
- *
- * User tracing code (ptrace or signal handlers) might assume
- * that the regs::orig_ax contains a 32-bit number on invoking
- * a 32-bit syscall.
- *
- * Establish the syscall convention by saving the 32bit truncated
- * syscall number in regs::orig_ax and by invalidating regs::ax.
- */
- regs->orig_ax = regs->ax & GENMASK(31, 0);
- regs->ax = -ENOSYS;
-
- nr = syscall_32_enter(regs);
-
- local_irq_enable();
- nr = syscall_enter_from_user_mode_work(regs, nr);
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-#endif
-#else /* CONFIG_IA32_EMULATION */
-
-/* Handles int $0x80 on a 32bit kernel */
-__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
-{
- int nr = syscall_32_enter(regs);
-
- add_random_kstack_offset();
- /*
- * Subtlety here: if ptrace pokes something larger than 2^31-1 into
- * orig_ax, the int return value truncates it. This matches
- * the semantics of syscall_get_nr().
- */
- nr = syscall_enter_from_user_mode(regs, nr);
- instrumentation_begin();
-
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-#endif /* !CONFIG_IA32_EMULATION */
-
-static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
-{
- int nr = syscall_32_enter(regs);
- int res;
-
- add_random_kstack_offset();
- /*
- * This cannot use syscall_enter_from_user_mode() as it has to
- * fetch EBP before invoking any of the syscall entry work
- * functions.
- */
- syscall_enter_from_user_mode_prepare(regs);
-
- instrumentation_begin();
- /* Fetch EBP from where the vDSO stashed it. */
- if (IS_ENABLED(CONFIG_X86_64)) {
- /*
- * Micro-optimization: the pointer we're following is
- * explicitly 32 bits, so it can't be out of range.
- */
- res = __get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp);
- } else {
- res = get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp);
- }
-
- if (res) {
- /* User code screwed up. */
- regs->ax = -EFAULT;
-
- local_irq_disable();
- instrumentation_end();
- irqentry_exit_to_user_mode(regs);
- return false;
- }
-
- nr = syscall_enter_from_user_mode_work(regs, nr);
-
- /* Now this is just like a normal syscall. */
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
- return true;
-}
-
-/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
-__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
-{
- /*
- * Called using the internal vDSO SYSENTER/SYSCALL32 calling
- * convention. Adjust regs so it looks like we entered using int80.
- */
- unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
- vdso_image_32.sym_int80_landing_pad;
-
- /*
- * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
- * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
- * Fix it up.
- */
- regs->ip = landing_pad;
-
- /* Invoke the syscall. If it failed, keep it simple: use IRET. */
- if (!__do_fast_syscall_32(regs))
- return false;
-
- /*
- * Check that the register state is valid for using SYSRETL/SYSEXIT
- * to exit to userspace. Otherwise use the slower but fully capable
- * IRET exit path.
- */
-
- /* XEN PV guests always use the IRET path */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /* EIP must point to the VDSO landing pad */
- if (unlikely(regs->ip != landing_pad))
- return false;
-
- /* CS and SS must match the values set in MSR_STAR */
- if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
- return false;
-
- /* If the TF, RF, or VM flags are set, use IRET */
- if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
- return false;
-
- /* Use SYSRETL/SYSEXIT to exit to userspace */
- return true;
-}
-
-/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
-__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
-{
- /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
- regs->sp = regs->bp;
-
- /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
- regs->flags |= X86_EFLAGS_IF;
-
- return do_fast_syscall_32(regs);
-}
-#endif
-
SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 8cc9950d7104..7c286e89fd04 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -1,10 +1,23 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for i386. */
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 32-bit system call dispatch
+ *
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * Based on asm and ptrace code by many authors. The code here originated
+ * in ptrace.c and signal.c.
+ */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
+#include <linux/uaccess.h>
+#include <asm/apic.h>
+#include <asm/traps.h>
+#include <asm/cpufeature.h>
#include <asm/syscall.h>
#ifdef CONFIG_IA32_EMULATION
@@ -42,3 +55,322 @@ long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
default: return __ia32_sys_ni_syscall(regs);
}
};
+
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ current_thread_info()->status |= TS_COMPAT;
+
+ return (int)regs->orig_ax;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
+
+static int __init ia32_emulation_override_cmdline(char *arg)
+{
+ return kstrtobool(arg, &__ia32_enabled);
+}
+early_param("ia32_emulation", ia32_emulation_override_cmdline);
+#endif
+
+/*
+ * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call(regs, unr);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
+ }
+}
+
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
+
+/**
+ * do_int80_emulation - 32-bit legacy syscall C entry from asm
+ * @regs: syscall arguments in struct pt_args on the stack.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+__visible noinstr void do_int80_emulation(struct pt_regs *regs)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * A FRED-specific INT80 handler is warranted for the follwing reasons:
+ *
+ * 1) As INT instructions and hardware interrupts are separate event
+ * types, FRED does not preclude the use of vector 0x80 for external
+ * interrupts. As a result, the FRED setup code does not reserve
+ * vector 0x80 and calling int80_is_external() is not merely
+ * suboptimal but actively incorrect: it could cause a system call
+ * to be incorrectly ignored.
+ *
+ * 2) It is called only for handling vector 0x80 of event type
+ * EVENT_TYPE_SWINT and will never be called to handle any external
+ * interrupt (event type EVENT_TYPE_EXTINT).
+ *
+ * 3) FRED has separate entry flows depending on if the event came from
+ * user space or kernel space, and because the kernel does not use
+ * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
+ * falls through to fred_bad_type() if the event type is
+ * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
+ * an INT insn, it can only be from a user level.
+ *
+ * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
+ * likely take a different approach if it is ever needed: it
+ * probably belongs in either fred_intx()/ fred_other() or
+ * asm_fred_entrypoint_user(), depending on if this ought to be done
+ * for all entries from userspace or only system
+ * calls.
+ *
+ * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
+ */
+DEFINE_FREDENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /*
+ * FRED pushed 0 into regs::orig_ax and regs::ax contains the
+ * syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+
+ add_random_kstack_offset();
+ /*
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
+ */
+ nr = syscall_enter_from_user_mode(regs, nr);
+ instrumentation_begin();
+
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* !CONFIG_IA32_EMULATION */
+
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+ int res;
+
+ add_random_kstack_offset();
+ /*
+ * This cannot use syscall_enter_from_user_mode() as it has to
+ * fetch EBP before invoking any of the syscall entry work
+ * functions.
+ */
+ syscall_enter_from_user_mode_prepare(regs);
+
+ instrumentation_begin();
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
+ */
+ res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+ res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ }
+
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+
+ local_irq_disable();
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ return false;
+ }
+
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
+ if (!__do_fast_syscall_32(regs))
+ return false;
+
+ /*
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
+{
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
+ regs->sp = regs->bp;
+
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
+ regs->flags |= X86_EFLAGS_IF;
+
+ return do_fast_syscall_32(regs);
+}
--
2.48.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH 2/5] x86/syscall/32: Move 32-bit syscall dispatch code
2025-03-13 18:22 ` [PATCH 2/5] x86/syscall/32: Move 32-bit syscall dispatch code Brian Gerst
@ 2025-03-13 23:44 ` Sohil Mehta
2025-03-14 1:25 ` Brian Gerst
2025-03-14 9:46 ` [tip: x86/cpu] x86/syscall/32: Move the 32-bit syscall dispatch code to arch/x86/entry/syscall_32.c tip-bot2 for Brian Gerst
1 sibling, 1 reply; 22+ messages in thread
From: Sohil Mehta @ 2025-03-13 23:44 UTC (permalink / raw)
To: Brian Gerst, linux-kernel, x86
Cc: Ingo Molnar, H . Peter Anvin, Thomas Gleixner, Borislav Petkov,
Andy Lutomirski, Juergen Gross, Boris Ostrovsky
On 3/13/2025 11:22 AM, Brian Gerst wrote:
> SYSCALL_DEFINE0(ni_syscall)
> {
> return -ENOSYS;
> diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
> index 8cc9950d7104..7c286e89fd04 100644
> --- a/arch/x86/entry/syscall_32.c
> +++ b/arch/x86/entry/syscall_32.c
> @@ -1,10 +1,23 @@
> -// SPDX-License-Identifier: GPL-2.0
> -/* System call table for i386. */
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * 32-bit system call dispatch
> + *
> + * Copyright (c) 2015 Andrew Lutomirski
> + *
> + * Based on asm and ptrace code by many authors. The code here originated
> + * in ptrace.c and signal.c.
Wondering if we can skip copying over some of these old comments? As the
file grows, it's hard to discern what specific code originated where.
For example, the FRED code is all new.
> +#ifdef CONFIG_IA32_EMULATION
> +static __always_inline bool int80_is_external(void)
> +{
> + const unsigned int offs = (0x80 / 32) * 0x10;
> + const u32 bit = BIT(0x80 % 32);
> +
> + /* The local APIC on XENPV guests is fake */
> + if (cpu_feature_enabled(X86_FEATURE_XENPV))
> + return false;
> +
> + /*
> + * If vector 0x80 is set in the APIC ISR then this is an external
> + * interrupt. Either from broken hardware or injected by a VMM.
> + *
> + * Note: In guest mode this is only valid for secure guests where
> + * the secure module fully controls the vAPIC exposed to the guest.
> + */
> + return apic_read(APIC_ISR + offs) & bit;
> +}
> +
> +/**
> + * do_int80_emulation - 32-bit legacy syscall C entry from asm
> + * @regs: syscall arguments in struct pt_args on the stack.
> + *
> + * This entry point can be used by 32-bit and 64-bit programs to perform
> + * 32-bit system calls. Instances of INT $0x80 can be found inline in
> + * various programs and libraries. It is also used by the vDSO's
> + * __kernel_vsyscall fallback for hardware that doesn't support a faster
> + * entry method. Restarted 32-bit system calls also fall back to INT
> + * $0x80 regardless of what instruction was originally used to do the
> + * system call.
> + *
> + * This is considered a slow path. It is not used by most libc
> + * implementations on modern hardware except during process startup.
> + *
> + * The arguments for the INT $0x80 based syscall are on stack in the
> + * pt_regs structure:
> + * eax: system call number
> + * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
> + */
> +__visible noinstr void do_int80_emulation(struct pt_regs *regs)
> +{
> + int nr;
> +
> + /* Kernel does not use INT $0x80! */
> + if (unlikely(!user_mode(regs))) {
> + irqentry_enter(regs);
> + instrumentation_begin();
> + panic("Unexpected external interrupt 0x80\n");
> + }
> +
> + /*
> + * Establish kernel context for instrumentation, including for
> + * int80_is_external() below which calls into the APIC driver.
> + * Identical for soft and external interrupts.
> + */
> + enter_from_user_mode(regs);
> +
> + instrumentation_begin();
> + add_random_kstack_offset();
> +
> + /* Validate that this is a soft interrupt to the extent possible */
> + if (unlikely(int80_is_external()))
> + panic("Unexpected external interrupt 0x80\n");
> +
> + /*
> + * The low level idtentry code pushed -1 into regs::orig_ax
> + * and regs::ax contains the syscall number.
> + *
> + * User tracing code (ptrace or signal handlers) might assume
> + * that the regs::orig_ax contains a 32-bit number on invoking
> + * a 32-bit syscall.
> + *
> + * Establish the syscall convention by saving the 32bit truncated
> + * syscall number in regs::orig_ax and by invalidating regs::ax.
> + */
> + regs->orig_ax = regs->ax & GENMASK(31, 0);
> + regs->ax = -ENOSYS;
> +
> + nr = syscall_32_enter(regs);
> +
> + local_irq_enable();
> + nr = syscall_enter_from_user_mode_work(regs, nr);
> + do_syscall_32_irqs_on(regs, nr);
> +
> + instrumentation_end();
> + syscall_exit_to_user_mode(regs);
> +}
> +
> +#ifdef CONFIG_X86_FRED
> +/*
> + * A FRED-specific INT80 handler is warranted for the follwing reasons:
> + *
> + * 1) As INT instructions and hardware interrupts are separate event
> + * types, FRED does not preclude the use of vector 0x80 for external
> + * interrupts. As a result, the FRED setup code does not reserve
> + * vector 0x80 and calling int80_is_external() is not merely
> + * suboptimal but actively incorrect: it could cause a system call
> + * to be incorrectly ignored.
> + *
> + * 2) It is called only for handling vector 0x80 of event type
> + * EVENT_TYPE_SWINT and will never be called to handle any external
> + * interrupt (event type EVENT_TYPE_EXTINT).
> + *
> + * 3) FRED has separate entry flows depending on if the event came from
> + * user space or kernel space, and because the kernel does not use
> + * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
> + * falls through to fred_bad_type() if the event type is
> + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
> + * an INT insn, it can only be from a user level.
> + *
> + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
> + * likely take a different approach if it is ever needed: it
> + * probably belongs in either fred_intx()/ fred_other() or
> + * asm_fred_entrypoint_user(), depending on if this ought to be done
> + * for all entries from userspace or only system
> + * calls.
> + *
> + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
> + */
> +DEFINE_FREDENTRY_RAW(int80_emulation)
> +{
> + int nr;
> +
> + enter_from_user_mode(regs);
> +
> + instrumentation_begin();
> + add_random_kstack_offset();
> +
> + /*
> + * FRED pushed 0 into regs::orig_ax and regs::ax contains the
> + * syscall number.
> + *
> + * User tracing code (ptrace or signal handlers) might assume
> + * that the regs::orig_ax contains a 32-bit number on invoking
> + * a 32-bit syscall.
> + *
> + * Establish the syscall convention by saving the 32bit truncated
> + * syscall number in regs::orig_ax and by invalidating regs::ax.
> + */
> + regs->orig_ax = regs->ax & GENMASK(31, 0);
> + regs->ax = -ENOSYS;
> +
> + nr = syscall_32_enter(regs);
> +
> + local_irq_enable();
> + nr = syscall_enter_from_user_mode_work(regs, nr);
> + do_syscall_32_irqs_on(regs, nr);
> +
> + instrumentation_end();
> + syscall_exit_to_user_mode(regs);
> +}
> +#endif
Nit: Would it be useful to add /* CONFIG_X86_FRED */ here since there
are nested #ifdefs?
> +#else /* CONFIG_IA32_EMULATION */
> +
> +/* Handles int $0x80 on a 32bit kernel */
> +__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
> +{
> + int nr = syscall_32_enter(regs);
> +
> + add_random_kstack_offset();
> + /*
> + * Subtlety here: if ptrace pokes something larger than 2^31-1 into
> + * orig_ax, the int return value truncates it. This matches
> + * the semantics of syscall_get_nr().
> + */
> + nr = syscall_enter_from_user_mode(regs, nr);
> + instrumentation_begin();
> +
> + do_syscall_32_irqs_on(regs, nr);
> +
> + instrumentation_end();
> + syscall_exit_to_user_mode(regs);
> +}
> +#endif /* !CONFIG_IA32_EMULATION */
> +
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 2/5] x86/syscall/32: Move 32-bit syscall dispatch code
2025-03-13 23:44 ` Sohil Mehta
@ 2025-03-14 1:25 ` Brian Gerst
2025-03-14 9:27 ` Ingo Molnar
0 siblings, 1 reply; 22+ messages in thread
From: Brian Gerst @ 2025-03-14 1:25 UTC (permalink / raw)
To: Sohil Mehta
Cc: linux-kernel, x86, Ingo Molnar, H . Peter Anvin, Thomas Gleixner,
Borislav Petkov, Andy Lutomirski, Juergen Gross, Boris Ostrovsky
On Thu, Mar 13, 2025 at 7:45 PM Sohil Mehta <sohil.mehta@intel.com> wrote:
>
> On 3/13/2025 11:22 AM, Brian Gerst wrote:
>
> > SYSCALL_DEFINE0(ni_syscall)
> > {
> > return -ENOSYS;
> > diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
> > index 8cc9950d7104..7c286e89fd04 100644
> > --- a/arch/x86/entry/syscall_32.c
> > +++ b/arch/x86/entry/syscall_32.c
> > @@ -1,10 +1,23 @@
> > -// SPDX-License-Identifier: GPL-2.0
> > -/* System call table for i386. */
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * 32-bit system call dispatch
> > + *
> > + * Copyright (c) 2015 Andrew Lutomirski
> > + *
> > + * Based on asm and ptrace code by many authors. The code here originated
> > + * in ptrace.c and signal.c.
>
> Wondering if we can skip copying over some of these old comments? As the
> file grows, it's hard to discern what specific code originated where.
> For example, the FRED code is all new.
I wasn't sure what the appropriate thing would be so I just copied it
over. The git history is probably a better way to attribute this
though.
>
> > +#ifdef CONFIG_IA32_EMULATION
> > +static __always_inline bool int80_is_external(void)
> > +{
> > + const unsigned int offs = (0x80 / 32) * 0x10;
> > + const u32 bit = BIT(0x80 % 32);
> > +
> > + /* The local APIC on XENPV guests is fake */
> > + if (cpu_feature_enabled(X86_FEATURE_XENPV))
> > + return false;
> > +
> > + /*
> > + * If vector 0x80 is set in the APIC ISR then this is an external
> > + * interrupt. Either from broken hardware or injected by a VMM.
> > + *
> > + * Note: In guest mode this is only valid for secure guests where
> > + * the secure module fully controls the vAPIC exposed to the guest.
> > + */
> > + return apic_read(APIC_ISR + offs) & bit;
> > +}
> > +
> > +/**
> > + * do_int80_emulation - 32-bit legacy syscall C entry from asm
> > + * @regs: syscall arguments in struct pt_args on the stack.
> > + *
> > + * This entry point can be used by 32-bit and 64-bit programs to perform
> > + * 32-bit system calls. Instances of INT $0x80 can be found inline in
> > + * various programs and libraries. It is also used by the vDSO's
> > + * __kernel_vsyscall fallback for hardware that doesn't support a faster
> > + * entry method. Restarted 32-bit system calls also fall back to INT
> > + * $0x80 regardless of what instruction was originally used to do the
> > + * system call.
> > + *
> > + * This is considered a slow path. It is not used by most libc
> > + * implementations on modern hardware except during process startup.
> > + *
> > + * The arguments for the INT $0x80 based syscall are on stack in the
> > + * pt_regs structure:
> > + * eax: system call number
> > + * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
> > + */
> > +__visible noinstr void do_int80_emulation(struct pt_regs *regs)
> > +{
> > + int nr;
> > +
> > + /* Kernel does not use INT $0x80! */
> > + if (unlikely(!user_mode(regs))) {
> > + irqentry_enter(regs);
> > + instrumentation_begin();
> > + panic("Unexpected external interrupt 0x80\n");
> > + }
> > +
> > + /*
> > + * Establish kernel context for instrumentation, including for
> > + * int80_is_external() below which calls into the APIC driver.
> > + * Identical for soft and external interrupts.
> > + */
> > + enter_from_user_mode(regs);
> > +
> > + instrumentation_begin();
> > + add_random_kstack_offset();
> > +
> > + /* Validate that this is a soft interrupt to the extent possible */
> > + if (unlikely(int80_is_external()))
> > + panic("Unexpected external interrupt 0x80\n");
> > +
> > + /*
> > + * The low level idtentry code pushed -1 into regs::orig_ax
> > + * and regs::ax contains the syscall number.
> > + *
> > + * User tracing code (ptrace or signal handlers) might assume
> > + * that the regs::orig_ax contains a 32-bit number on invoking
> > + * a 32-bit syscall.
> > + *
> > + * Establish the syscall convention by saving the 32bit truncated
> > + * syscall number in regs::orig_ax and by invalidating regs::ax.
> > + */
> > + regs->orig_ax = regs->ax & GENMASK(31, 0);
> > + regs->ax = -ENOSYS;
> > +
> > + nr = syscall_32_enter(regs);
> > +
> > + local_irq_enable();
> > + nr = syscall_enter_from_user_mode_work(regs, nr);
> > + do_syscall_32_irqs_on(regs, nr);
> > +
> > + instrumentation_end();
> > + syscall_exit_to_user_mode(regs);
> > +}
> > +
> > +#ifdef CONFIG_X86_FRED
> > +/*
> > + * A FRED-specific INT80 handler is warranted for the follwing reasons:
> > + *
> > + * 1) As INT instructions and hardware interrupts are separate event
> > + * types, FRED does not preclude the use of vector 0x80 for external
> > + * interrupts. As a result, the FRED setup code does not reserve
> > + * vector 0x80 and calling int80_is_external() is not merely
> > + * suboptimal but actively incorrect: it could cause a system call
> > + * to be incorrectly ignored.
> > + *
> > + * 2) It is called only for handling vector 0x80 of event type
> > + * EVENT_TYPE_SWINT and will never be called to handle any external
> > + * interrupt (event type EVENT_TYPE_EXTINT).
> > + *
> > + * 3) FRED has separate entry flows depending on if the event came from
> > + * user space or kernel space, and because the kernel does not use
> > + * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
> > + * falls through to fred_bad_type() if the event type is
> > + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
> > + * an INT insn, it can only be from a user level.
> > + *
> > + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
> > + * likely take a different approach if it is ever needed: it
> > + * probably belongs in either fred_intx()/ fred_other() or
> > + * asm_fred_entrypoint_user(), depending on if this ought to be done
> > + * for all entries from userspace or only system
> > + * calls.
> > + *
> > + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
> > + */
> > +DEFINE_FREDENTRY_RAW(int80_emulation)
> > +{
> > + int nr;
> > +
> > + enter_from_user_mode(regs);
> > +
> > + instrumentation_begin();
> > + add_random_kstack_offset();
> > +
> > + /*
> > + * FRED pushed 0 into regs::orig_ax and regs::ax contains the
> > + * syscall number.
> > + *
> > + * User tracing code (ptrace or signal handlers) might assume
> > + * that the regs::orig_ax contains a 32-bit number on invoking
> > + * a 32-bit syscall.
> > + *
> > + * Establish the syscall convention by saving the 32bit truncated
> > + * syscall number in regs::orig_ax and by invalidating regs::ax.
> > + */
> > + regs->orig_ax = regs->ax & GENMASK(31, 0);
> > + regs->ax = -ENOSYS;
> > +
> > + nr = syscall_32_enter(regs);
> > +
> > + local_irq_enable();
> > + nr = syscall_enter_from_user_mode_work(regs, nr);
> > + do_syscall_32_irqs_on(regs, nr);
> > +
> > + instrumentation_end();
> > + syscall_exit_to_user_mode(regs);
> > +}
> > +#endif
>
> Nit: Would it be useful to add /* CONFIG_X86_FRED */ here since there
> are nested #ifdefs?
These patches should be as close to a copy and paste as possible, so
that it's easy to prove that nothing changes. This could be a
followup patch later though.
Brian Gerst
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 2/5] x86/syscall/32: Move 32-bit syscall dispatch code
2025-03-14 1:25 ` Brian Gerst
@ 2025-03-14 9:27 ` Ingo Molnar
0 siblings, 0 replies; 22+ messages in thread
From: Ingo Molnar @ 2025-03-14 9:27 UTC (permalink / raw)
To: Brian Gerst
Cc: Sohil Mehta, linux-kernel, x86, H . Peter Anvin, Thomas Gleixner,
Borislav Petkov, Andy Lutomirski, Juergen Gross, Boris Ostrovsky
* Brian Gerst <brgerst@gmail.com> wrote:
> On Thu, Mar 13, 2025 at 7:45 PM Sohil Mehta <sohil.mehta@intel.com> wrote:
> >
> > On 3/13/2025 11:22 AM, Brian Gerst wrote:
> >
> > > SYSCALL_DEFINE0(ni_syscall)
> > > {
> > > return -ENOSYS;
> > > diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
> > > index 8cc9950d7104..7c286e89fd04 100644
> > > --- a/arch/x86/entry/syscall_32.c
> > > +++ b/arch/x86/entry/syscall_32.c
> > > @@ -1,10 +1,23 @@
> > > -// SPDX-License-Identifier: GPL-2.0
> > > -/* System call table for i386. */
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * 32-bit system call dispatch
> > > + *
> > > + * Copyright (c) 2015 Andrew Lutomirski
> > > + *
> > > + * Based on asm and ptrace code by many authors. The code here originated
> > > + * in ptrace.c and signal.c.
> >
> > Wondering if we can skip copying over some of these old comments? As the
> > file grows, it's hard to discern what specific code originated where.
> > For example, the FRED code is all new.
>
> I wasn't sure what the appropriate thing would be so I just copied it
> over. The git history is probably a better way to attribute this
> though.
I suppose we could trim some of the old comments in followup patches,
as they'll be preserved in the Git log and often aren't really accurate
or particularly informative in the current code context.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 22+ messages in thread
* [tip: x86/cpu] x86/syscall/32: Move the 32-bit syscall dispatch code to arch/x86/entry/syscall_32.c
2025-03-13 18:22 ` [PATCH 2/5] x86/syscall/32: Move 32-bit syscall dispatch code Brian Gerst
2025-03-13 23:44 ` Sohil Mehta
@ 2025-03-14 9:46 ` tip-bot2 for Brian Gerst
1 sibling, 0 replies; 22+ messages in thread
From: tip-bot2 for Brian Gerst @ 2025-03-14 9:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: Brian Gerst, Ingo Molnar, Sohil Mehta, Andy Lutomirski,
Juergen Gross, H. Peter Anvin, Linus Torvalds, Josh Poimboeuf,
x86, linux-kernel
The following commit has been merged into the x86/cpu branch of tip:
Commit-ID: beea77e898b2a6a0d67b01ae24dcfa46a3dfc089
Gitweb: https://git.kernel.org/tip/beea77e898b2a6a0d67b01ae24dcfa46a3dfc089
Author: Brian Gerst <brgerst@gmail.com>
AuthorDate: Thu, 13 Mar 2025 14:22:33 -04:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00
x86/syscall/32: Move the 32-bit syscall dispatch code to arch/x86/entry/syscall_32.c
Move the 32-bit syscall dispatch code to syscall_32.c.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250313182236.655724-3-brgerst@gmail.com
---
arch/x86/entry/Makefile | 2 +-
arch/x86/entry/common.c | 321 +---------------------------------
arch/x86/entry/syscall_32.c | 336 ++++++++++++++++++++++++++++++++++-
3 files changed, 336 insertions(+), 323 deletions(-)
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index ce1cc16..96a6b86 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -8,8 +8,10 @@ UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
CFLAGS_common.o += -fno-stack-protector
+CFLAGS_syscall_32.o += -fno-stack-protector
obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index ce4d88e..183efab 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -125,327 +125,6 @@ __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
}
#endif
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-static __always_inline int syscall_32_enter(struct pt_regs *regs)
-{
- if (IS_ENABLED(CONFIG_IA32_EMULATION))
- current_thread_info()->status |= TS_COMPAT;
-
- return (int)regs->orig_ax;
-}
-
-#ifdef CONFIG_IA32_EMULATION
-bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
-
-static int __init ia32_emulation_override_cmdline(char *arg)
-{
- return kstrtobool(arg, &__ia32_enabled);
-}
-early_param("ia32_emulation", ia32_emulation_override_cmdline);
-#endif
-
-/*
- * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
- */
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
-{
- /*
- * Convert negative numbers to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int unr = nr;
-
- if (likely(unr < IA32_NR_syscalls)) {
- unr = array_index_nospec(unr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call(regs, unr);
- } else if (nr != -1) {
- regs->ax = __ia32_sys_ni_syscall(regs);
- }
-}
-
-#ifdef CONFIG_IA32_EMULATION
-static __always_inline bool int80_is_external(void)
-{
- const unsigned int offs = (0x80 / 32) * 0x10;
- const u32 bit = BIT(0x80 % 32);
-
- /* The local APIC on XENPV guests is fake */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /*
- * If vector 0x80 is set in the APIC ISR then this is an external
- * interrupt. Either from broken hardware or injected by a VMM.
- *
- * Note: In guest mode this is only valid for secure guests where
- * the secure module fully controls the vAPIC exposed to the guest.
- */
- return apic_read(APIC_ISR + offs) & bit;
-}
-
-/**
- * do_int80_emulation - 32-bit legacy syscall C entry from asm
- * @regs: syscall arguments in struct pt_args on the stack.
- *
- * This entry point can be used by 32-bit and 64-bit programs to perform
- * 32-bit system calls. Instances of INT $0x80 can be found inline in
- * various programs and libraries. It is also used by the vDSO's
- * __kernel_vsyscall fallback for hardware that doesn't support a faster
- * entry method. Restarted 32-bit system calls also fall back to INT
- * $0x80 regardless of what instruction was originally used to do the
- * system call.
- *
- * This is considered a slow path. It is not used by most libc
- * implementations on modern hardware except during process startup.
- *
- * The arguments for the INT $0x80 based syscall are on stack in the
- * pt_regs structure:
- * eax: system call number
- * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
- */
-__visible noinstr void do_int80_emulation(struct pt_regs *regs)
-{
- int nr;
-
- /* Kernel does not use INT $0x80! */
- if (unlikely(!user_mode(regs))) {
- irqentry_enter(regs);
- instrumentation_begin();
- panic("Unexpected external interrupt 0x80\n");
- }
-
- /*
- * Establish kernel context for instrumentation, including for
- * int80_is_external() below which calls into the APIC driver.
- * Identical for soft and external interrupts.
- */
- enter_from_user_mode(regs);
-
- instrumentation_begin();
- add_random_kstack_offset();
-
- /* Validate that this is a soft interrupt to the extent possible */
- if (unlikely(int80_is_external()))
- panic("Unexpected external interrupt 0x80\n");
-
- /*
- * The low level idtentry code pushed -1 into regs::orig_ax
- * and regs::ax contains the syscall number.
- *
- * User tracing code (ptrace or signal handlers) might assume
- * that the regs::orig_ax contains a 32-bit number on invoking
- * a 32-bit syscall.
- *
- * Establish the syscall convention by saving the 32bit truncated
- * syscall number in regs::orig_ax and by invalidating regs::ax.
- */
- regs->orig_ax = regs->ax & GENMASK(31, 0);
- regs->ax = -ENOSYS;
-
- nr = syscall_32_enter(regs);
-
- local_irq_enable();
- nr = syscall_enter_from_user_mode_work(regs, nr);
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-
-#ifdef CONFIG_X86_FRED
-/*
- * A FRED-specific INT80 handler is warranted for the follwing reasons:
- *
- * 1) As INT instructions and hardware interrupts are separate event
- * types, FRED does not preclude the use of vector 0x80 for external
- * interrupts. As a result, the FRED setup code does not reserve
- * vector 0x80 and calling int80_is_external() is not merely
- * suboptimal but actively incorrect: it could cause a system call
- * to be incorrectly ignored.
- *
- * 2) It is called only for handling vector 0x80 of event type
- * EVENT_TYPE_SWINT and will never be called to handle any external
- * interrupt (event type EVENT_TYPE_EXTINT).
- *
- * 3) FRED has separate entry flows depending on if the event came from
- * user space or kernel space, and because the kernel does not use
- * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
- * falls through to fred_bad_type() if the event type is
- * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
- * an INT insn, it can only be from a user level.
- *
- * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
- * likely take a different approach if it is ever needed: it
- * probably belongs in either fred_intx()/ fred_other() or
- * asm_fred_entrypoint_user(), depending on if this ought to be done
- * for all entries from userspace or only system
- * calls.
- *
- * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
- */
-DEFINE_FREDENTRY_RAW(int80_emulation)
-{
- int nr;
-
- enter_from_user_mode(regs);
-
- instrumentation_begin();
- add_random_kstack_offset();
-
- /*
- * FRED pushed 0 into regs::orig_ax and regs::ax contains the
- * syscall number.
- *
- * User tracing code (ptrace or signal handlers) might assume
- * that the regs::orig_ax contains a 32-bit number on invoking
- * a 32-bit syscall.
- *
- * Establish the syscall convention by saving the 32bit truncated
- * syscall number in regs::orig_ax and by invalidating regs::ax.
- */
- regs->orig_ax = regs->ax & GENMASK(31, 0);
- regs->ax = -ENOSYS;
-
- nr = syscall_32_enter(regs);
-
- local_irq_enable();
- nr = syscall_enter_from_user_mode_work(regs, nr);
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-#endif
-#else /* CONFIG_IA32_EMULATION */
-
-/* Handles int $0x80 on a 32bit kernel */
-__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
-{
- int nr = syscall_32_enter(regs);
-
- add_random_kstack_offset();
- /*
- * Subtlety here: if ptrace pokes something larger than 2^31-1 into
- * orig_ax, the int return value truncates it. This matches
- * the semantics of syscall_get_nr().
- */
- nr = syscall_enter_from_user_mode(regs, nr);
- instrumentation_begin();
-
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-#endif /* !CONFIG_IA32_EMULATION */
-
-static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
-{
- int nr = syscall_32_enter(regs);
- int res;
-
- add_random_kstack_offset();
- /*
- * This cannot use syscall_enter_from_user_mode() as it has to
- * fetch EBP before invoking any of the syscall entry work
- * functions.
- */
- syscall_enter_from_user_mode_prepare(regs);
-
- instrumentation_begin();
- /* Fetch EBP from where the vDSO stashed it. */
- if (IS_ENABLED(CONFIG_X86_64)) {
- /*
- * Micro-optimization: the pointer we're following is
- * explicitly 32 bits, so it can't be out of range.
- */
- res = __get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp);
- } else {
- res = get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp);
- }
-
- if (res) {
- /* User code screwed up. */
- regs->ax = -EFAULT;
-
- local_irq_disable();
- instrumentation_end();
- irqentry_exit_to_user_mode(regs);
- return false;
- }
-
- nr = syscall_enter_from_user_mode_work(regs, nr);
-
- /* Now this is just like a normal syscall. */
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
- return true;
-}
-
-/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
-__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
-{
- /*
- * Called using the internal vDSO SYSENTER/SYSCALL32 calling
- * convention. Adjust regs so it looks like we entered using int80.
- */
- unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
- vdso_image_32.sym_int80_landing_pad;
-
- /*
- * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
- * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
- * Fix it up.
- */
- regs->ip = landing_pad;
-
- /* Invoke the syscall. If it failed, keep it simple: use IRET. */
- if (!__do_fast_syscall_32(regs))
- return false;
-
- /*
- * Check that the register state is valid for using SYSRETL/SYSEXIT
- * to exit to userspace. Otherwise use the slower but fully capable
- * IRET exit path.
- */
-
- /* XEN PV guests always use the IRET path */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /* EIP must point to the VDSO landing pad */
- if (unlikely(regs->ip != landing_pad))
- return false;
-
- /* CS and SS must match the values set in MSR_STAR */
- if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
- return false;
-
- /* If the TF, RF, or VM flags are set, use IRET */
- if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
- return false;
-
- /* Use SYSRETL/SYSEXIT to exit to userspace */
- return true;
-}
-
-/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
-__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
-{
- /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
- regs->sp = regs->bp;
-
- /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
- regs->flags |= X86_EFLAGS_IF;
-
- return do_fast_syscall_32(regs);
-}
-#endif
-
SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 8cc9950..7c286e8 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -1,10 +1,23 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for i386. */
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 32-bit system call dispatch
+ *
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * Based on asm and ptrace code by many authors. The code here originated
+ * in ptrace.c and signal.c.
+ */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
+#include <linux/uaccess.h>
+#include <asm/apic.h>
+#include <asm/traps.h>
+#include <asm/cpufeature.h>
#include <asm/syscall.h>
#ifdef CONFIG_IA32_EMULATION
@@ -42,3 +55,322 @@ long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
default: return __ia32_sys_ni_syscall(regs);
}
};
+
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ current_thread_info()->status |= TS_COMPAT;
+
+ return (int)regs->orig_ax;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
+
+static int __init ia32_emulation_override_cmdline(char *arg)
+{
+ return kstrtobool(arg, &__ia32_enabled);
+}
+early_param("ia32_emulation", ia32_emulation_override_cmdline);
+#endif
+
+/*
+ * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call(regs, unr);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
+ }
+}
+
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
+
+/**
+ * do_int80_emulation - 32-bit legacy syscall C entry from asm
+ * @regs: syscall arguments in struct pt_args on the stack.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+__visible noinstr void do_int80_emulation(struct pt_regs *regs)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * A FRED-specific INT80 handler is warranted for the follwing reasons:
+ *
+ * 1) As INT instructions and hardware interrupts are separate event
+ * types, FRED does not preclude the use of vector 0x80 for external
+ * interrupts. As a result, the FRED setup code does not reserve
+ * vector 0x80 and calling int80_is_external() is not merely
+ * suboptimal but actively incorrect: it could cause a system call
+ * to be incorrectly ignored.
+ *
+ * 2) It is called only for handling vector 0x80 of event type
+ * EVENT_TYPE_SWINT and will never be called to handle any external
+ * interrupt (event type EVENT_TYPE_EXTINT).
+ *
+ * 3) FRED has separate entry flows depending on if the event came from
+ * user space or kernel space, and because the kernel does not use
+ * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
+ * falls through to fred_bad_type() if the event type is
+ * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
+ * an INT insn, it can only be from a user level.
+ *
+ * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
+ * likely take a different approach if it is ever needed: it
+ * probably belongs in either fred_intx()/ fred_other() or
+ * asm_fred_entrypoint_user(), depending on if this ought to be done
+ * for all entries from userspace or only system
+ * calls.
+ *
+ * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
+ */
+DEFINE_FREDENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /*
+ * FRED pushed 0 into regs::orig_ax and regs::ax contains the
+ * syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+
+ add_random_kstack_offset();
+ /*
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
+ */
+ nr = syscall_enter_from_user_mode(regs, nr);
+ instrumentation_begin();
+
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* !CONFIG_IA32_EMULATION */
+
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+ int res;
+
+ add_random_kstack_offset();
+ /*
+ * This cannot use syscall_enter_from_user_mode() as it has to
+ * fetch EBP before invoking any of the syscall entry work
+ * functions.
+ */
+ syscall_enter_from_user_mode_prepare(regs);
+
+ instrumentation_begin();
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
+ */
+ res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+ res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ }
+
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+
+ local_irq_disable();
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ return false;
+ }
+
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
+ if (!__do_fast_syscall_32(regs))
+ return false;
+
+ /*
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
+{
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
+ regs->sp = regs->bp;
+
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
+ regs->flags |= X86_EFLAGS_IF;
+
+ return do_fast_syscall_32(regs);
+}
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [PATCH 3/5] x86/syscall/64: Move 64-bit syscall dispatch code
2025-03-13 18:22 [PATCH 0/5] x86/entry: Break up common.c Brian Gerst
2025-03-13 18:22 ` [PATCH 1/5] x86/xen: Move Xen upcall handler Brian Gerst
2025-03-13 18:22 ` [PATCH 2/5] x86/syscall/32: Move 32-bit syscall dispatch code Brian Gerst
@ 2025-03-13 18:22 ` Brian Gerst
2025-03-14 9:46 ` [tip: x86/cpu] x86/syscall/64: Move the 64-bit syscall dispatch code to arch/x86/entry/syscall_64.c tip-bot2 for Brian Gerst
2025-03-13 18:22 ` [PATCH 4/5] x86/syscall/x32: Move x32 syscall table Brian Gerst
2025-03-13 18:22 ` [PATCH 5/5] x86/syscall: Move sys_ni_syscall() Brian Gerst
4 siblings, 1 reply; 22+ messages in thread
From: Brian Gerst @ 2025-03-13 18:22 UTC (permalink / raw)
To: linux-kernel, x86
Cc: Ingo Molnar, H . Peter Anvin, Thomas Gleixner, Borislav Petkov,
Andy Lutomirski, Juergen Gross, Boris Ostrovsky, Brian Gerst
Move the 64-bit syscall dispatch code to syscall_64.c.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
arch/x86/entry/Makefile | 2 +
arch/x86/entry/common.c | 93 --------------------------------
arch/x86/entry/syscall_64.c | 103 +++++++++++++++++++++++++++++++++++-
3 files changed, 103 insertions(+), 95 deletions(-)
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 96a6b86e0a8b..5fd28abfd5a0 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -9,9 +9,11 @@ KCOV_INSTRUMENT := n
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
CFLAGS_common.o += -fno-stack-protector
CFLAGS_syscall_32.o += -fno-stack-protector
+CFLAGS_syscall_64.o += -fno-stack-protector
obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 183efabefe57..5bd448c0664f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -32,99 +32,6 @@
#include <asm/syscall.h>
#include <asm/irq_stack.h>
-#ifdef CONFIG_X86_64
-
-static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
-{
- /*
- * Convert negative numbers to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int unr = nr;
-
- if (likely(unr < NR_syscalls)) {
- unr = array_index_nospec(unr, NR_syscalls);
- regs->ax = x64_sys_call(regs, unr);
- return true;
- }
- return false;
-}
-
-static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
-{
- /*
- * Adjust the starting offset of the table, and convert numbers
- * < __X32_SYSCALL_BIT to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int xnr = nr - __X32_SYSCALL_BIT;
-
- if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
- xnr = array_index_nospec(xnr, X32_NR_syscalls);
- regs->ax = x32_sys_call(regs, xnr);
- return true;
- }
- return false;
-}
-
-/* Returns true to return using SYSRET, or false to use IRET */
-__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
-{
- add_random_kstack_offset();
- nr = syscall_enter_from_user_mode(regs, nr);
-
- instrumentation_begin();
-
- if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
- /* Invalid system call, but still a system call. */
- regs->ax = __x64_sys_ni_syscall(regs);
- }
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-
- /*
- * Check that the register state is valid for using SYSRET to exit
- * to userspace. Otherwise use the slower but fully capable IRET
- * exit path.
- */
-
- /* XEN PV guests always use the IRET path */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /* SYSRET requires RCX == RIP and R11 == EFLAGS */
- if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
- return false;
-
- /* CS and SS must match the values set in MSR_STAR */
- if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
- return false;
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * TASK_SIZE_MAX covers all user-accessible addresses other than
- * the deprecated vsyscall page.
- */
- if (unlikely(regs->ip >= TASK_SIZE_MAX))
- return false;
-
- /*
- * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET.
- */
- if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
- return false;
-
- /* Use SYSRET to exit to userspace */
- return true;
-}
-#endif
-
SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index ba8354424860..9e0ba339013c 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -1,10 +1,19 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for x86-64. */
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 64-bit system call dispatch
+ *
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * Based on asm and ptrace code by many authors. The code here originated
+ * in ptrace.c and signal.c.
+ */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
#include <asm/syscall.h>
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
@@ -34,3 +43,93 @@ long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
default: return __x64_sys_ni_syscall(regs);
}
};
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = x64_sys_call(regs, unr);
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ /*
+ * Adjust the starting offset of the table, and convert numbers
+ * < __X32_SYSCALL_BIT to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int xnr = nr - __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call(regs, xnr);
+ return true;
+ }
+ return false;
+}
+
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
+{
+ add_random_kstack_offset();
+ nr = syscall_enter_from_user_mode(regs, nr);
+
+ instrumentation_begin();
+
+ if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
+ /* Invalid system call, but still a system call. */
+ regs->ax = __x64_sys_ni_syscall(regs);
+ }
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
+ */
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
+}
--
2.48.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [tip: x86/cpu] x86/syscall/64: Move the 64-bit syscall dispatch code to arch/x86/entry/syscall_64.c
2025-03-13 18:22 ` [PATCH 3/5] x86/syscall/64: Move 64-bit syscall dispatch code Brian Gerst
@ 2025-03-14 9:46 ` tip-bot2 for Brian Gerst
0 siblings, 0 replies; 22+ messages in thread
From: tip-bot2 for Brian Gerst @ 2025-03-14 9:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: Brian Gerst, Ingo Molnar, Sohil Mehta, Andy Lutomirski,
Juergen Gross, H. Peter Anvin, Linus Torvalds, Josh Poimboeuf,
x86, linux-kernel
The following commit has been merged into the x86/cpu branch of tip:
Commit-ID: daffd8a21847b2a37da6b4d23753a4581543c575
Gitweb: https://git.kernel.org/tip/daffd8a21847b2a37da6b4d23753a4581543c575
Author: Brian Gerst <brgerst@gmail.com>
AuthorDate: Thu, 13 Mar 2025 14:22:34 -04:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00
x86/syscall/64: Move the 64-bit syscall dispatch code to arch/x86/entry/syscall_64.c
Move the 64-bit syscall dispatch code to syscall_64.c.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250313182236.655724-4-brgerst@gmail.com
---
arch/x86/entry/Makefile | 2 +-
arch/x86/entry/common.c | 93 +--------------------------------
arch/x86/entry/syscall_64.c | 103 ++++++++++++++++++++++++++++++++++-
3 files changed, 103 insertions(+), 95 deletions(-)
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 96a6b86..5fd28ab 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -9,9 +9,11 @@ KCOV_INSTRUMENT := n
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
CFLAGS_common.o += -fno-stack-protector
CFLAGS_syscall_32.o += -fno-stack-protector
+CFLAGS_syscall_64.o += -fno-stack-protector
obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 183efab..5bd448c 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -32,99 +32,6 @@
#include <asm/syscall.h>
#include <asm/irq_stack.h>
-#ifdef CONFIG_X86_64
-
-static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
-{
- /*
- * Convert negative numbers to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int unr = nr;
-
- if (likely(unr < NR_syscalls)) {
- unr = array_index_nospec(unr, NR_syscalls);
- regs->ax = x64_sys_call(regs, unr);
- return true;
- }
- return false;
-}
-
-static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
-{
- /*
- * Adjust the starting offset of the table, and convert numbers
- * < __X32_SYSCALL_BIT to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int xnr = nr - __X32_SYSCALL_BIT;
-
- if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
- xnr = array_index_nospec(xnr, X32_NR_syscalls);
- regs->ax = x32_sys_call(regs, xnr);
- return true;
- }
- return false;
-}
-
-/* Returns true to return using SYSRET, or false to use IRET */
-__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
-{
- add_random_kstack_offset();
- nr = syscall_enter_from_user_mode(regs, nr);
-
- instrumentation_begin();
-
- if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
- /* Invalid system call, but still a system call. */
- regs->ax = __x64_sys_ni_syscall(regs);
- }
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-
- /*
- * Check that the register state is valid for using SYSRET to exit
- * to userspace. Otherwise use the slower but fully capable IRET
- * exit path.
- */
-
- /* XEN PV guests always use the IRET path */
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return false;
-
- /* SYSRET requires RCX == RIP and R11 == EFLAGS */
- if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
- return false;
-
- /* CS and SS must match the values set in MSR_STAR */
- if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
- return false;
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * TASK_SIZE_MAX covers all user-accessible addresses other than
- * the deprecated vsyscall page.
- */
- if (unlikely(regs->ip >= TASK_SIZE_MAX))
- return false;
-
- /*
- * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET.
- */
- if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
- return false;
-
- /* Use SYSRET to exit to userspace */
- return true;
-}
-#endif
-
SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index ba83544..9e0ba33 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -1,10 +1,19 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for x86-64. */
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 64-bit system call dispatch
+ *
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * Based on asm and ptrace code by many authors. The code here originated
+ * in ptrace.c and signal.c.
+ */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
#include <asm/syscall.h>
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
@@ -34,3 +43,93 @@ long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
default: return __x64_sys_ni_syscall(regs);
}
};
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = x64_sys_call(regs, unr);
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ /*
+ * Adjust the starting offset of the table, and convert numbers
+ * < __X32_SYSCALL_BIT to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int xnr = nr - __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call(regs, xnr);
+ return true;
+ }
+ return false;
+}
+
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
+{
+ add_random_kstack_offset();
+ nr = syscall_enter_from_user_mode(regs, nr);
+
+ instrumentation_begin();
+
+ if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
+ /* Invalid system call, but still a system call. */
+ regs->ax = __x64_sys_ni_syscall(regs);
+ }
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
+ */
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
+}
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [PATCH 4/5] x86/syscall/x32: Move x32 syscall table
2025-03-13 18:22 [PATCH 0/5] x86/entry: Break up common.c Brian Gerst
` (2 preceding siblings ...)
2025-03-13 18:22 ` [PATCH 3/5] x86/syscall/64: Move 64-bit syscall dispatch code Brian Gerst
@ 2025-03-13 18:22 ` Brian Gerst
2025-03-13 23:47 ` Sohil Mehta
2025-03-14 9:46 ` [tip: x86/cpu] x86/syscall/x32: Move the x32 syscall table to arch/x86/entry/syscall_64.c tip-bot2 for Brian Gerst
2025-03-13 18:22 ` [PATCH 5/5] x86/syscall: Move sys_ni_syscall() Brian Gerst
4 siblings, 2 replies; 22+ messages in thread
From: Brian Gerst @ 2025-03-13 18:22 UTC (permalink / raw)
To: linux-kernel, x86
Cc: Ingo Molnar, H . Peter Anvin, Thomas Gleixner, Borislav Petkov,
Andy Lutomirski, Juergen Gross, Boris Ostrovsky, Brian Gerst
Since commit:
2e958a8a510d ("x86/entry/x32: Rename __x32_compat_sys_* to
__x64_compat_sys_*"),
the ABI prefix for x32 syscalls is the same as native 64-bit
syscalls. Move the x32 syscall table to syscall_64.c.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
arch/x86/entry/Makefile | 1 -
arch/x86/entry/syscall_64.c | 13 +++++++++++++
arch/x86/entry/syscall_x32.c | 25 -------------------------
3 files changed, 13 insertions(+), 26 deletions(-)
delete mode 100644 arch/x86/entry/syscall_x32.c
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 5fd28abfd5a0..e870f8aa936c 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -27,4 +27,3 @@ CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
-obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 9e0ba339013c..b96f5621a2aa 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -19,6 +19,9 @@
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
+#ifdef CONFIG_X86_X32_ABI
+#include <asm/syscalls_x32.h>
+#endif
#undef __SYSCALL
#undef __SYSCALL_NORETURN
@@ -44,6 +47,16 @@ long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
}
};
+#ifdef CONFIG_X86_X32_ABI
+long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_x32.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+};
+#endif
+
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
/*
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
deleted file mode 100644
index fb77908f44f3..000000000000
--- a/arch/x86/entry/syscall_x32.c
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for x32 ABI. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <linux/syscalls.h>
-#include <asm/syscall.h>
-
-#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
-#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
-#include <asm/syscalls_x32.h>
-#undef __SYSCALL
-
-#undef __SYSCALL_NORETURN
-#define __SYSCALL_NORETURN __SYSCALL
-
-#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
-{
- switch (nr) {
- #include <asm/syscalls_x32.h>
- default: return __x64_sys_ni_syscall(regs);
- }
-};
--
2.48.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH 4/5] x86/syscall/x32: Move x32 syscall table
2025-03-13 18:22 ` [PATCH 4/5] x86/syscall/x32: Move x32 syscall table Brian Gerst
@ 2025-03-13 23:47 ` Sohil Mehta
2025-03-14 9:25 ` Ingo Molnar
2025-03-14 9:34 ` Ingo Molnar
2025-03-14 9:46 ` [tip: x86/cpu] x86/syscall/x32: Move the x32 syscall table to arch/x86/entry/syscall_64.c tip-bot2 for Brian Gerst
1 sibling, 2 replies; 22+ messages in thread
From: Sohil Mehta @ 2025-03-13 23:47 UTC (permalink / raw)
To: Brian Gerst, linux-kernel, x86
Cc: Ingo Molnar, H . Peter Anvin, Thomas Gleixner, Borislav Petkov,
Andy Lutomirski, Juergen Gross, Boris Ostrovsky
On 3/13/2025 11:22 AM, Brian Gerst wrote:
> Since commit:
>
> 2e958a8a510d ("x86/entry/x32: Rename __x32_compat_sys_* to
> __x64_compat_sys_*"),
>
> the ABI prefix for x32 syscalls is the same as native 64-bit
> syscalls. Move the x32 syscall table to syscall_64.c
>
> No functional changes.
>
> Signed-off-by: Brian Gerst <brgerst@gmail.com>
> ---
> arch/x86/entry/Makefile | 1 -
> arch/x86/entry/syscall_64.c | 13 +++++++++++++
> arch/x86/entry/syscall_x32.c | 25 -------------------------
> 3 files changed, 13 insertions(+), 26 deletions(-)
> delete mode 100644 arch/x86/entry/syscall_x32.c
>
> diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
> index 5fd28abfd5a0..e870f8aa936c 100644
> --- a/arch/x86/entry/Makefile
> +++ b/arch/x86/entry/Makefile
> @@ -27,4 +27,3 @@ CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
> obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
>
> obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
> -obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
> diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
> index 9e0ba339013c..b96f5621a2aa 100644
> --- a/arch/x86/entry/syscall_64.c
> +++ b/arch/x86/entry/syscall_64.c
> @@ -19,6 +19,9 @@
> #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
> #define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
> #include <asm/syscalls_64.h>
> +#ifdef CONFIG_X86_X32_ABI
> +#include <asm/syscalls_x32.h>
> +#endif
> #undef __SYSCALL
>
> #undef __SYSCALL_NORETURN
> @@ -44,6 +47,16 @@ long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
> }
> };
>
> +#ifdef CONFIG_X86_X32_ABI
> +long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
> +{
> + switch (nr) {
> + #include <asm/syscalls_x32.h>
> + default: return __x64_sys_ni_syscall(regs);
> + }
> +};
There seems to be a stray semicolon here. The original code also has it
but it doesn't seem necessary.
> +#endif
> +
> static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
> {
> /*
> diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
> deleted file mode 100644
> index fb77908f44f3..000000000000
> --- a/arch/x86/entry/syscall_x32.c
> +++ /dev/null
> @@ -1,25 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0
> -/* System call table for x32 ABI. */
> -
> -#include <linux/linkage.h>
> -#include <linux/sys.h>
> -#include <linux/cache.h>
> -#include <linux/syscalls.h>
> -#include <asm/syscall.h>
> -
> -#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
> -#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
> -#include <asm/syscalls_x32.h>
> -#undef __SYSCALL
> -
> -#undef __SYSCALL_NORETURN
> -#define __SYSCALL_NORETURN __SYSCALL
> -
> -#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
> -long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
> -{
> - switch (nr) {
> - #include <asm/syscalls_x32.h>
> - default: return __x64_sys_ni_syscall(regs);
> - }
> -};
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 4/5] x86/syscall/x32: Move x32 syscall table
2025-03-13 23:47 ` Sohil Mehta
@ 2025-03-14 9:25 ` Ingo Molnar
2025-03-14 9:34 ` Ingo Molnar
1 sibling, 0 replies; 22+ messages in thread
From: Ingo Molnar @ 2025-03-14 9:25 UTC (permalink / raw)
To: Sohil Mehta
Cc: Brian Gerst, linux-kernel, x86, H . Peter Anvin, Thomas Gleixner,
Borislav Petkov, Andy Lutomirski, Juergen Gross, Boris Ostrovsky
* Sohil Mehta <sohil.mehta@intel.com> wrote:
> > +#ifdef CONFIG_X86_X32_ABI
> > +long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
> > +{
> > + switch (nr) {
> > + #include <asm/syscalls_x32.h>
> > + default: return __x64_sys_ni_syscall(regs);
> > + }
> > +};
>
> There seems to be a stray semicolon here. The original code also has it
> but it doesn't seem necessary.
Yeah, and this should be done in a followup patch.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 4/5] x86/syscall/x32: Move x32 syscall table
2025-03-13 23:47 ` Sohil Mehta
2025-03-14 9:25 ` Ingo Molnar
@ 2025-03-14 9:34 ` Ingo Molnar
2025-03-14 16:02 ` Sohil Mehta
1 sibling, 1 reply; 22+ messages in thread
From: Ingo Molnar @ 2025-03-14 9:34 UTC (permalink / raw)
To: Sohil Mehta
Cc: Brian Gerst, linux-kernel, x86, H . Peter Anvin, Thomas Gleixner,
Borislav Petkov, Andy Lutomirski, Juergen Gross, Boris Ostrovsky
* Sohil Mehta <sohil.mehta@intel.com> wrote:
> > +#ifdef CONFIG_X86_X32_ABI
> > +long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
> > +{
> > + switch (nr) {
> > + #include <asm/syscalls_x32.h>
> > + default: return __x64_sys_ni_syscall(regs);
> > + }
> > +};
>
> There seems to be a stray semicolon here. The original code also has it
> but it doesn't seem necessary.
BTW., seeing that you've gone through this series with a fine comb I've
added your Reviewed-by tag to the series (with the caveat that the
details you've pointed out will be addressed in followup patches).
Let me know if that's not OK.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 4/5] x86/syscall/x32: Move x32 syscall table
2025-03-14 9:34 ` Ingo Molnar
@ 2025-03-14 16:02 ` Sohil Mehta
0 siblings, 0 replies; 22+ messages in thread
From: Sohil Mehta @ 2025-03-14 16:02 UTC (permalink / raw)
To: Ingo Molnar
Cc: Brian Gerst, linux-kernel, x86, H . Peter Anvin, Thomas Gleixner,
Borislav Petkov, Andy Lutomirski, Juergen Gross, Boris Ostrovsky
On 3/14/2025 2:34 AM, Ingo Molnar wrote:
>
> * Sohil Mehta <sohil.mehta@intel.com> wrote:
>
>>> +#ifdef CONFIG_X86_X32_ABI
>>> +long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
>>> +{
>>> + switch (nr) {
>>> + #include <asm/syscalls_x32.h>
>>> + default: return __x64_sys_ni_syscall(regs);
>>> + }
>>> +};
>>
>> There seems to be a stray semicolon here. The original code also has it
>> but it doesn't seem necessary.
>
> BTW., seeing that you've gone through this series with a fine comb I've
> added your Reviewed-by tag to the series (with the caveat that the
> details you've pointed out will be addressed in followup patches).
> Let me know if that's not OK.
>
Thanks, that would have been fine with me. Reviewing the v2 series now.
> Thanks,
>
> Ingo
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [tip: x86/cpu] x86/syscall/x32: Move the x32 syscall table to arch/x86/entry/syscall_64.c
2025-03-13 18:22 ` [PATCH 4/5] x86/syscall/x32: Move x32 syscall table Brian Gerst
2025-03-13 23:47 ` Sohil Mehta
@ 2025-03-14 9:46 ` tip-bot2 for Brian Gerst
1 sibling, 0 replies; 22+ messages in thread
From: tip-bot2 for Brian Gerst @ 2025-03-14 9:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: Brian Gerst, Ingo Molnar, Sohil Mehta, Andy Lutomirski,
Juergen Gross, H. Peter Anvin, Linus Torvalds, Josh Poimboeuf,
x86, linux-kernel
The following commit has been merged into the x86/cpu branch of tip:
Commit-ID: 8b24877200a8d3c01d67b968dfbe58228909cc1b
Gitweb: https://git.kernel.org/tip/8b24877200a8d3c01d67b968dfbe58228909cc1b
Author: Brian Gerst <brgerst@gmail.com>
AuthorDate: Thu, 13 Mar 2025 14:22:35 -04:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00
x86/syscall/x32: Move the x32 syscall table to arch/x86/entry/syscall_64.c
Since commit:
2e958a8a510d ("x86/entry/x32: Rename __x32_compat_sys_* to __x64_compat_sys_*")
the ABI prefix for x32 syscalls is the same as native 64-bit
syscalls. Move the x32 syscall table to syscall_64.c.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250313182236.655724-5-brgerst@gmail.com
---
arch/x86/entry/Makefile | 1 -
arch/x86/entry/syscall_64.c | 13 +++++++++++++
arch/x86/entry/syscall_x32.c | 25 -------------------------
3 files changed, 13 insertions(+), 26 deletions(-)
delete mode 100644 arch/x86/entry/syscall_x32.c
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 5fd28ab..e870f8a 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -27,4 +27,3 @@ CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
-obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 9e0ba33..b96f562 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -19,6 +19,9 @@
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
+#ifdef CONFIG_X86_X32_ABI
+#include <asm/syscalls_x32.h>
+#endif
#undef __SYSCALL
#undef __SYSCALL_NORETURN
@@ -44,6 +47,16 @@ long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
}
};
+#ifdef CONFIG_X86_X32_ABI
+long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_x32.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+};
+#endif
+
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
/*
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
deleted file mode 100644
index fb77908..0000000
--- a/arch/x86/entry/syscall_x32.c
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for x32 ABI. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <linux/syscalls.h>
-#include <asm/syscall.h>
-
-#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
-#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
-#include <asm/syscalls_x32.h>
-#undef __SYSCALL
-
-#undef __SYSCALL_NORETURN
-#define __SYSCALL_NORETURN __SYSCALL
-
-#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
-{
- switch (nr) {
- #include <asm/syscalls_x32.h>
- default: return __x64_sys_ni_syscall(regs);
- }
-};
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [PATCH 5/5] x86/syscall: Move sys_ni_syscall()
2025-03-13 18:22 [PATCH 0/5] x86/entry: Break up common.c Brian Gerst
` (3 preceding siblings ...)
2025-03-13 18:22 ` [PATCH 4/5] x86/syscall/x32: Move x32 syscall table Brian Gerst
@ 2025-03-13 18:22 ` Brian Gerst
2025-03-14 9:46 ` [tip: x86/cpu] x86/syscall: Move sys_ni_syscall() to arch/x86/kernel/process.c tip-bot2 for Brian Gerst
4 siblings, 1 reply; 22+ messages in thread
From: Brian Gerst @ 2025-03-13 18:22 UTC (permalink / raw)
To: linux-kernel, x86
Cc: Ingo Molnar, H . Peter Anvin, Thomas Gleixner, Borislav Petkov,
Andy Lutomirski, Juergen Gross, Boris Ostrovsky, Brian Gerst
Move sys_ni_syscall() to kernel/process.c, and remove the now-empty
entry/common.c.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
arch/x86/entry/Makefile | 3 ---
arch/x86/entry/common.c | 38 --------------------------------------
arch/x86/kernel/process.c | 5 +++++
3 files changed, 5 insertions(+), 41 deletions(-)
delete mode 100644 arch/x86/entry/common.c
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index e870f8aa936c..72cae8e0ce85 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -7,16 +7,13 @@ KASAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
-CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
-CFLAGS_common.o += -fno-stack-protector
CFLAGS_syscall_32.o += -fno-stack-protector
CFLAGS_syscall_64.o += -fno-stack-protector
obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
-obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
deleted file mode 100644
index 5bd448c0664f..000000000000
--- a/arch/x86/entry/common.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * common.c - C code for kernel entry and exit
- * Copyright (c) 2015 Andrew Lutomirski
- *
- * Based on asm and ptrace code by many authors. The code here originated
- * in ptrace.c and signal.c.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/entry-common.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/export.h>
-#include <linux/nospec.h>
-#include <linux/syscalls.h>
-#include <linux/uaccess.h>
-#include <linux/init.h>
-
-#include <asm/apic.h>
-#include <asm/desc.h>
-#include <asm/traps.h>
-#include <asm/vdso.h>
-#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
-#include <asm/nospec-branch.h>
-#include <asm/io_bitmap.h>
-#include <asm/syscall.h>
-#include <asm/irq_stack.h>
-
-SYSCALL_DEFINE0(ni_syscall)
-{
- return -ENOSYS;
-}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9c75d701011f..91f6ff618852 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1068,3 +1068,8 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
return -EINVAL;
}
+
+SYSCALL_DEFINE0(ni_syscall)
+{
+ return -ENOSYS;
+}
--
2.48.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [tip: x86/cpu] x86/syscall: Move sys_ni_syscall() to arch/x86/kernel/process.c
2025-03-13 18:22 ` [PATCH 5/5] x86/syscall: Move sys_ni_syscall() Brian Gerst
@ 2025-03-14 9:46 ` tip-bot2 for Brian Gerst
0 siblings, 0 replies; 22+ messages in thread
From: tip-bot2 for Brian Gerst @ 2025-03-14 9:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: Brian Gerst, Ingo Molnar, Sohil Mehta, Andy Lutomirski,
Juergen Gross, H. Peter Anvin, Linus Torvalds, Josh Poimboeuf,
x86, linux-kernel
The following commit has been merged into the x86/cpu branch of tip:
Commit-ID: 83d5563952e6634826f831a9b91c62c7cab4ca4d
Gitweb: https://git.kernel.org/tip/83d5563952e6634826f831a9b91c62c7cab4ca4d
Author: Brian Gerst <brgerst@gmail.com>
AuthorDate: Thu, 13 Mar 2025 14:22:36 -04:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00
x86/syscall: Move sys_ni_syscall() to arch/x86/kernel/process.c
Move sys_ni_syscall() to arch/x86/kernel/process.c, and remove the
now-empty arch/x86/entry/common.c.
No functional changes.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250313182236.655724-6-brgerst@gmail.com
---
arch/x86/entry/Makefile | 3 ---
arch/x86/entry/common.c | 38 --------------------------------------
arch/x86/kernel/process.c | 5 +++++
3 files changed, 5 insertions(+), 41 deletions(-)
delete mode 100644 arch/x86/entry/common.c
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index e870f8a..72cae8e 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -7,16 +7,13 @@ KASAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
-CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
-CFLAGS_common.o += -fno-stack-protector
CFLAGS_syscall_32.o += -fno-stack-protector
CFLAGS_syscall_64.o += -fno-stack-protector
obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
-obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
deleted file mode 100644
index 5bd448c..0000000
--- a/arch/x86/entry/common.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * common.c - C code for kernel entry and exit
- * Copyright (c) 2015 Andrew Lutomirski
- *
- * Based on asm and ptrace code by many authors. The code here originated
- * in ptrace.c and signal.c.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/entry-common.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/export.h>
-#include <linux/nospec.h>
-#include <linux/syscalls.h>
-#include <linux/uaccess.h>
-#include <linux/init.h>
-
-#include <asm/apic.h>
-#include <asm/desc.h>
-#include <asm/traps.h>
-#include <asm/vdso.h>
-#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
-#include <asm/nospec-branch.h>
-#include <asm/io_bitmap.h>
-#include <asm/syscall.h>
-#include <asm/irq_stack.h>
-
-SYSCALL_DEFINE0(ni_syscall)
-{
- return -ENOSYS;
-}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6da6769..b5fb108 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1060,3 +1060,8 @@ long do_arch_prctl_common(int option, unsigned long arg2)
return -EINVAL;
}
+
+SYSCALL_DEFINE0(ni_syscall)
+{
+ return -ENOSYS;
+}
^ permalink raw reply related [flat|nested] 22+ messages in thread