xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Xen-devel <xen-devel@lists.xenproject.org>
Cc: "Andrew Cooper" <andrew.cooper3@citrix.com>,
	"Jan Beulich" <JBeulich@suse.com>,
	"Roger Pau Monné" <roger.pau@citrix.com>
Subject: [PATCH v3 21/22] x86/pv: System call handling in FRED mode
Date: Fri,  3 Oct 2025 23:53:33 +0100	[thread overview]
Message-ID: <20251003225334.2123667-22-andrew.cooper3@citrix.com> (raw)
In-Reply-To: <20251003225334.2123667-1-andrew.cooper3@citrix.com>

Under FRED, entry_from_pv() handles everything, even system calls.  This means
more of our logic is written in C now, rather than assembly.

In order to facilitate this, introduce pv_inject_callback(), which reuses
struct trap_bounce infrastructure to inject the syscall/sysenter callbacks.
This in turn requires some !PV compatibility for pv_inject_callback() and
pv_hypercall() which can both be ASSERT_UNREACHABLE().

For each of INT $N, SYSCALL and SYSENTER, FRED gives us interrupted context
which was previously lost.  As the guest can't see FRED, Xen has to lose state
in the same way to maintain the prior behaviour.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v3:
 * Simplify DCE handling.
 * Add ASSERT_UNREACHABLE() to pv_inject_callback().
 * Adjust comment for X86_ET_SW_INT

v2:
 * New
---
 xen/arch/x86/include/asm/domain.h    |   2 +
 xen/arch/x86/include/asm/hypercall.h |   2 -
 xen/arch/x86/pv/traps.c              |  39 ++++++++++
 xen/arch/x86/traps.c                 | 110 +++++++++++++++++++++++++++
 4 files changed, 151 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 5df8c7825333..828f42c3e448 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -710,6 +710,8 @@ void arch_vcpu_regs_init(struct vcpu *v);
 struct vcpu_hvm_context;
 int arch_set_info_hvm_guest(struct vcpu *v, const struct vcpu_hvm_context *ctx);
 
+void pv_inject_callback(unsigned int type);
+
 #ifdef CONFIG_PV
 void pv_inject_event(const struct x86_event *event);
 #else
diff --git a/xen/arch/x86/include/asm/hypercall.h b/xen/arch/x86/include/asm/hypercall.h
index f6e9e2313b3c..ded3c24d40e2 100644
--- a/xen/arch/x86/include/asm/hypercall.h
+++ b/xen/arch/x86/include/asm/hypercall.h
@@ -18,9 +18,7 @@
 
 #define __HYPERVISOR_paging_domctl_cont __HYPERVISOR_arch_1
 
-#ifdef CONFIG_PV
 void pv_hypercall(struct cpu_user_regs *regs);
-#endif
 
 void pv_ring1_init_hypercall_page(void *ptr);
 void pv_ring3_init_hypercall_page(void *ptr);
diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
index c3c0976c440f..00de03412639 100644
--- a/xen/arch/x86/pv/traps.c
+++ b/xen/arch/x86/pv/traps.c
@@ -19,6 +19,8 @@
 #include <asm/shared.h>
 #include <asm/traps.h>
 
+#include <public/callback.h>
+
 void pv_inject_event(const struct x86_event *event)
 {
     struct vcpu *curr = current;
@@ -95,6 +97,43 @@ void pv_inject_event(const struct x86_event *event)
     }
 }
 
+void pv_inject_callback(unsigned int type)
+{
+    struct vcpu *curr = current;
+    struct trap_bounce *tb = &curr->arch.pv.trap_bounce;
+    unsigned long rip;
+    bool irq;
+
+    ASSERT(is_pv_64bit_vcpu(curr));
+
+    switch ( type )
+    {
+    case CALLBACKTYPE_syscall:
+        rip = curr->arch.pv.syscall_callback_eip;
+        irq = curr->arch.pv.vgc_flags & VGCF_syscall_disables_events;
+        break;
+
+    case CALLBACKTYPE_syscall32:
+        rip = curr->arch.pv.syscall32_callback_eip;
+        irq = curr->arch.pv.syscall32_disables_events;
+        break;
+
+    case CALLBACKTYPE_sysenter:
+        rip = curr->arch.pv.sysenter_callback_eip;
+        irq = curr->arch.pv.sysenter_disables_events;
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        rip = 0;
+        irq = false;
+        break;
+    }
+
+    tb->flags = TBF_EXCEPTION | (irq ? TBF_INTERRUPT : 0);
+    tb->eip = rip;
+}
+
 /*
  * Called from asm to set up the MCE trapbounce info.
  * Returns false no callback is set up, else true.
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 955cff32d75f..5f89928d8128 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -18,6 +18,7 @@
 #include <xen/delay.h>
 #include <xen/domain_page.h>
 #include <xen/guest_access.h>
+#include <xen/hypercall.h>
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/paging.h>
@@ -52,6 +53,8 @@
 #include <asm/uaccess.h>
 #include <asm/xenoprof.h>
 
+#include <public/callback.h>
+
 /*
  * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
  *  fatal:  Xen prints diagnostic message and then hangs.
@@ -2267,6 +2270,7 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
 void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
 {
     struct fred_info *fi = cpu_regs_fred_info(regs);
+    struct vcpu *curr = current;
     uint8_t type = regs->fred_ss.type;
     uint8_t vec = regs->fred_ss.vector;
 
@@ -2306,6 +2310,30 @@ void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
 
     switch ( type )
     {
+    case X86_ET_SW_INT:
+        /*
+         * For better or worse, Xen writes IDT vectors 3 and 4 with DPL3 (so
+         * INT3/INTO work), making INT $3/4 indistinguishable, and the guest
+         * choice of DPL for these vectors is ignored.
+         *
+         * Have them fall through into X86_ET_HW_EXC, as #BP in particular
+         * needs handling by do_int3() in case an external debugger is
+         * attached.
+         */
+        if ( vec != X86_EXC_BP && vec != X86_EXC_OF )
+        {
+            const struct trap_info *ti = &curr->arch.pv.trap_ctxt[vec];
+
+            if ( permit_softint(TI_GET_DPL(ti), curr, regs) )
+                pv_inject_sw_interrupt(vec);
+            else
+            {
+                regs->rip -= 2;
+                pv_inject_hw_exception(X86_EXC_GP, (vec << 3) | X86_XEC_IDT);
+            }
+            break;
+        }
+        fallthrough;
     case X86_ET_HW_EXC:
     case X86_ET_PRIV_SW_EXC:
     case X86_ET_SW_EXC:
@@ -2335,6 +2363,88 @@ void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
         }
         break;
 
+    case X86_ET_OTHER:
+        switch ( regs->fred_ss.vector )
+        {
+        case 1: /* SYSCALL */
+        {
+            /*
+             * FRED delivery preserves the interrupted %cs/%ss, but previously
+             * SYSCALL lost the interrupted selectors, and SYSRET forced the
+             * use of the ones in MSR_STAR.
+             *
+             * The guest isn't aware of FRED, so recreate the legacy
+             * behaviour, including the guess of instruction length for
+             * faults.
+             *
+             * The non-FRED SYSCALL path sets TRAP_syscall in entry_vector to
+             * signal that SYSRET can be used, but this isn't relevant in FRED
+             * mode.
+             *
+             * When setting the selectors, clear all upper metadata again for
+             * backwards compatibility.  In particular fred_ss.swint becomes
+             * pend_DB on ERETx, and nothing else in the pv_hypercall() path
+             * would clean it up.
+             */
+            bool l = regs->fred_ss.l;
+
+            regs->ssx = l ? FLAT_KERNEL_SS   : FLAT_USER_SS32;
+            regs->csx = l ? FLAT_KERNEL_CS64 : FLAT_USER_CS32;
+
+            if ( guest_kernel_mode(curr, regs) )
+                pv_hypercall(regs);
+            else if ( (l ? curr->arch.pv.syscall_callback_eip
+                         : curr->arch.pv.syscall32_callback_eip) == 0 )
+            {
+                regs->rip -= 2;
+                pv_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
+            }
+            else
+            {
+                /*
+                 * The PV ABI, given no virtual SYSCALL_MASK, hardcodes that
+                 * DF is cleared.  Other flags are handled in the same way as
+                 * interrupts and exceptions in create_bounce_frame().
+                 */
+                regs->eflags &= ~X86_EFLAGS_DF;
+                pv_inject_callback(l ? CALLBACKTYPE_syscall
+                                     : CALLBACKTYPE_syscall32);
+            }
+            break;
+        }
+
+        case 2: /* SYSENTER */
+            /*
+             * FRED delivery preserves the interrupted state, but previously
+             * SYSENTER discarded almost everything.
+             *
+             * The guest isn't aware of FRED, so recreate the legacy
+             * behaviour, including the guess of instruction length for
+             * faults.
+             *
+             * When setting the selectors, clear all upper metadata.  In
+             * particular fred_ss.swint becomes pend_DB on ERETx.
+             */
+            regs->ssx = FLAT_USER_SS;
+            regs->rsp = 0;
+            regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+            regs->csx = 3;
+            regs->rip = 0;
+
+            if ( !curr->arch.pv.sysenter_callback_eip )
+            {
+                regs->rip -= 2;
+                pv_inject_hw_exception(X86_EXC_GP, 0);
+            }
+            else
+                pv_inject_callback(CALLBACKTYPE_sysenter);
+            break;
+
+        default:
+            goto fatal;
+        }
+        break;
+
     default:
         goto fatal;
     }
-- 
2.39.5



  parent reply	other threads:[~2025-10-03 22:56 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-03 22:53 [PATCH v3 for-4.21 00/22] x86: FRED support Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 01/22] x86/msr: Change rdmsr() to have normal API Andrew Cooper
2025-10-07 15:47   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 02/22] x86/msr: Change wrmsr() to take a single parameter Andrew Cooper
2025-10-04  0:11   ` Demi Marie Obenour
2025-10-04  0:14     ` Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 03/22] x86/fsgsbase: Split out __{rd,wr}gs_shadow() helpers Andrew Cooper
2025-10-07 15:49   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 04/22] x86/fsgsbase: Update fs/gs helpers to use wrmsrns() Andrew Cooper
2025-10-07 15:53   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 05/22] x86/fsgsbase: Improve code generation in read_registers() Andrew Cooper
2025-10-04  0:13   ` Demi Marie Obenour
2025-10-07 15:54   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 06/22] x86/boot: Use RSTORSSP to establish SSP Andrew Cooper
2025-10-07 15:57   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 07/22] x86/traps: Alter switch_stack_and_jump() for FRED mode Andrew Cooper
2025-10-07 15:58   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 08/22] x86/traps: Skip Supervisor Shadow Stack tokens in " Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 09/22] x86/traps: Make an IDT-specific #DB helper Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 10/22] x86/traps: Make an IDT-specific #PF helper Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 11/22] x86/fsgsbase: Make gskern accesses safe under FRED Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 12/22] x86/traps: Introduce FRED entrypoints Andrew Cooper
2025-10-08  8:50   ` Jan Beulich
2025-10-16 14:54   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 13/22] x86/traps: Enable FRED when requested Andrew Cooper
2025-10-08  8:54   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 14/22] x86/pv: Deduplicate is_canonical_address() in do_set_segment_base() Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 15/22] x86/entry: Alter how IRET faults are recognised Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 16/22] x86/entry: Drop the pre exception table infrastructure Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 17/22] x86/entry: Rework the comment about SYSCALL and DF Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 18/22] x86/pv: Adjust GS handling for FRED mode Andrew Cooper
2025-10-03 22:53 ` [PATCH v3 19/22] x86/pv: Guest exception handling in " Andrew Cooper
2025-10-08 12:28   ` Jan Beulich
2025-10-03 22:53 ` [PATCH v3 20/22] x86/pv: ERETU error handling Andrew Cooper
2025-10-08 12:36   ` Jan Beulich
2025-10-03 22:53 ` Andrew Cooper [this message]
2025-10-08 13:45   ` [PATCH v3 21/22] x86/pv: System call handling in FRED mode Jan Beulich
2025-10-03 22:53 ` [PATCH v3 22/22] x86: Clamp reserved bits in eflags more aggressively Andrew Cooper
2025-10-08 13:50   ` Jan Beulich
2025-10-17 13:24 ` [PATCH v3 for-4.21 00/22] x86: FRED support Oleksii Kurochko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251003225334.2123667-22-andrew.cooper3@citrix.com \
    --to=andrew.cooper3@citrix.com \
    --cc=JBeulich@suse.com \
    --cc=roger.pau@citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).