From: Thomas Gleixner <tglx@linutronix.de>
To: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
mingo@redhat.com, bp@alien8.de, dave.hansen@intel.com,
luto@kernel.org, peterz@infradead.org
Cc: sathyanarayanan.kuppuswamy@linux.intel.com, aarcange@redhat.com,
ak@linux.intel.com, dan.j.williams@intel.com, david@redhat.com,
hpa@zytor.com, jgross@suse.com, jmattson@google.com,
joro@8bytes.org, jpoimboe@redhat.com, knsathya@kernel.org,
pbonzini@redhat.com, sdeep@vmware.com, seanjc@google.com,
tony.luck@intel.com, vkuznets@redhat.com, wanpengli@tencent.com,
x86@kernel.org, linux-kernel@vger.kernel.org,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
Sean Christopherson <sean.j.christopherson@intel.com>
Subject: Re: [PATCHv2 04/29] x86/traps: Add #VE support for TDX guest
Date: Tue, 01 Feb 2022 22:02:41 +0100 [thread overview]
Message-ID: <877daez4em.ffs@tglx> (raw)
In-Reply-To: <20220124150215.36893-5-kirill.shutemov@linux.intel.com>
On Mon, Jan 24 2022 at 18:01, Kirill A. Shutemov wrote:
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index df0fa695bb09..1da074123c16 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -68,6 +68,9 @@ static const __initconst struct idt_data early_idts[] = {
> */
> INTG(X86_TRAP_PF, asm_exc_page_fault),
> #endif
> +#ifdef CONFIG_INTEL_TDX_GUEST
> + INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
> +#endif
>
> +bool tdx_get_ve_info(struct ve_info *ve)
> +{
> + struct tdx_module_output out;
> +
> + /*
> + * NMIs and machine checks are suppressed. Before this point any
> + * #VE is fatal. After this point (TDGETVEINFO call), NMIs and
> + * additional #VEs are permitted (but it is expected not to
> + * happen unless kernel panics).
I really do not understand that comment. #NMI and #MC are suppressed
according to the above. How long are they suppressed and what's the
mechanism? Are they unblocked on return from __tdx_module_call() ?
What prevents a nested #VE? If it happens what makes it fatal? Is it
converted to a #DF or detected by software?
Also I do not understand that the last sentence tries to tell me. If the
suppression of #NMI and #MC is lifted on return from tdcall then both
can be delivered immediately afterwards, right?
I assume the additional #VE is triggered by software or a bug in the
kernel.
Confused.
> + */
> + if (__tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out))
> + return false;
> +
> + ve->exit_reason = out.rcx;
> + ve->exit_qual = out.rdx;
> + ve->gla = out.r8;
> + ve->gpa = out.r9;
> + ve->instr_len = lower_32_bits(out.r10);
> + ve->instr_info = upper_32_bits(out.r10);
> +
> + return true;
> +}
> +
> +/*
> + * Handle the user initiated #VE.
> + *
> + * For example, executing the CPUID instruction from user space
> + * is a valid case and hence the resulting #VE has to be handled.
> + *
> + * For dis-allowed or invalid #VE just return failure.
> + */
> +static bool tdx_virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
> +{
> + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
> + return false;
> +}
> +
> +/* Handle the kernel #VE */
> +static bool tdx_virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
> +{
> + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
> + return false;
> +}
> +
> +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
> +{
> + bool ret;
> +
> + if (user_mode(regs))
> + ret = tdx_virt_exception_user(regs, ve);
> + else
> + ret = tdx_virt_exception_kernel(regs, ve);
> +
> + /* After successful #VE handling, move the IP */
> + if (ret)
> + regs->ip += ve->instr_len;
> +
> + return ret;
> +}
> +
> bool is_tdx_guest(void)
> {
> return tdx_guest_detected;
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index c9d566dcf89a..428504535912 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -61,6 +61,7 @@
> #include <asm/insn.h>
> #include <asm/insn-eval.h>
> #include <asm/vdso.h>
> +#include <asm/tdx.h>
>
> #ifdef CONFIG_X86_64
> #include <asm/x86_init.h>
> @@ -1212,6 +1213,115 @@ DEFINE_IDTENTRY(exc_device_not_available)
> }
> }
>
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +
> +#define VE_FAULT_STR "VE fault"
> +
> +static void ve_raise_fault(struct pt_regs *regs, long error_code)
> +{
> + struct task_struct *tsk = current;
> +
> + if (user_mode(regs)) {
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_VE;
> + show_signal(tsk, SIGSEGV, "", VE_FAULT_STR, regs, error_code);
> + force_sig(SIGSEGV);
> + return;
> + }
> +
> + /*
> + * Attempt to recover from #VE exception failure without
> + * triggering OOPS (useful for MSR read/write failures)
> + */
> + if (fixup_exception(regs, X86_TRAP_VE, error_code, 0))
> + return;
> +
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_VE;
> +
> + /*
> + * To be potentially processing a kprobe fault and to trust the result
> + * from kprobe_running(), it should be non-preemptible.
> + */
> + if (!preemptible() && kprobe_running() &&
> + kprobe_fault_handler(regs, X86_TRAP_VE))
> + return;
> +
> + /* Notify about #VE handling failure, useful for debugger hooks */
> + if (notify_die(DIE_GPF, VE_FAULT_STR, regs, error_code,
> + X86_TRAP_VE, SIGSEGV) == NOTIFY_STOP)
> + return;
> +
> + /* Trigger OOPS and panic */
> + die_addr(VE_FAULT_STR, regs, error_code, 0);
This is pretty much a copy of the #GP handling. So why not consolidating
this properly?
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -559,6 +559,36 @@ static bool fixup_iopl_exception(struct
return true;
}
+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, long error_code,
+ const char *str)
+{
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return true;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
+
+ ret = notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV);
+ return ret == NOTIFY_STOP;
+}
+
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, long error_code,
+ const char *str)
+{
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
@@ -587,34 +617,14 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_pr
if (fixup_iopl_exception(regs))
goto exit;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
goto exit;
- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
- force_sig(SIGSEGV);
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
goto exit;
}
- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- goto exit;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
- /*
- * To be potentially processing a kprobe fault and to trust the result
- * from kprobe_running(), we have to be non-preemptible.
- */
- if (!preemptible() &&
- kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- goto exit;
-
- ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
- if (ret == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc))
goto exit;
if (error_code)
which makes this:
static void ve_raise_fault(struct pt_regs *regs, long error_code)
{
if (user_mode(regs)) {
gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
return;
}
if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)
return;
die_addr(VE_FAULT_STR, regs, error_code, 0);
}
Hmm?
> +/*
> + * Virtualization Exceptions (#VE) are delivered to TDX guests due to
> + * specific guest actions which may happen in either user space or the
> + * kernel:
> + *
> + * * Specific instructions (WBINVD, for example)
> + * * Specific MSR accesses
> + * * Specific CPUID leaf accesses
> + * * Access to unmapped pages (EPT violation)
> + *
> + * In the settings that Linux will run in, virtualization exceptions are
> + * never generated on accesses to normal, TD-private memory that has been
> + * accepted.
> + *
> + * Syscall entry code has a critical window where the kernel stack is not
> + * yet set up. Any exception in this window leads to hard to debug issues
> + * and can be exploited for privilege escalation. Exceptions in the NMI
> + * entry code also cause issues. Returning from the exception handler with
> + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
> + *
> + * For these reasons, the kernel avoids #VEs during the syscall gap and
> + * the NMI entry code. Entry code paths do not access TD-shared memory,
> + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
> + * that might generate #VE.
How is that enforced or validated? What checks for a violation of that
assumption?
Thanks,
tglx
next prev parent reply other threads:[~2022-02-01 21:03 UTC|newest]
Thread overview: 154+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-01-24 15:01 [PATCHv2 00/29] TDX Guest: TDX core support Kirill A. Shutemov
2022-01-24 15:01 ` [PATCHv2 01/29] x86/tdx: Detect running as a TDX guest in early boot Kirill A. Shutemov
2022-02-01 19:29 ` Thomas Gleixner
2022-02-01 23:14 ` Kirill A. Shutemov
2022-02-03 0:32 ` Josh Poimboeuf
2022-02-03 14:09 ` Kirill A. Shutemov
2022-02-03 15:13 ` Dave Hansen
2022-01-24 15:01 ` [PATCHv2 02/29] x86/tdx: Extend the cc_platform_has() API to support TDX guests Kirill A. Shutemov
2022-02-01 19:31 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 03/29] x86/tdx: Add __tdx_module_call() and __tdx_hypercall() helper functions Kirill A. Shutemov
2022-02-01 19:58 ` Thomas Gleixner
2022-02-02 2:55 ` Kirill A. Shutemov
2022-02-02 10:59 ` Kai Huang
2022-02-03 14:44 ` Kirill A. Shutemov
2022-02-03 23:47 ` Kai Huang
2022-02-04 3:43 ` Kirill A. Shutemov
2022-02-04 9:51 ` Kai Huang
2022-02-04 13:20 ` Kirill A. Shutemov
2022-02-04 10:12 ` Kai Huang
2022-02-04 13:18 ` Kirill A. Shutemov
2022-02-05 0:06 ` Kai Huang
2022-02-02 17:08 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 04/29] x86/traps: Add #VE support for TDX guest Kirill A. Shutemov
2022-02-01 21:02 ` Thomas Gleixner [this message]
2022-02-01 21:26 ` Sean Christopherson
2022-02-12 1:42 ` Kirill A. Shutemov
2022-01-24 15:01 ` [PATCHv2 05/29] x86/tdx: Add HLT support for TDX guests Kirill A. Shutemov
2022-01-29 14:53 ` Borislav Petkov
2022-01-29 22:30 ` [PATCHv2.1 " Kirill A. Shutemov
2022-02-01 21:21 ` Thomas Gleixner
2022-02-02 12:48 ` Kirill A. Shutemov
2022-02-02 17:17 ` Thomas Gleixner
2022-02-04 16:55 ` Kirill A. Shutemov
2022-02-07 22:52 ` Sean Christopherson
2022-02-09 14:34 ` Kirill A. Shutemov
2022-02-09 18:05 ` Sean Christopherson
2022-02-09 22:23 ` Kirill A. Shutemov
2022-02-10 1:21 ` Sean Christopherson
2022-01-24 15:01 ` [PATCHv2 06/29] x86/tdx: Add MSR " Kirill A. Shutemov
2022-02-01 21:38 ` Thomas Gleixner
2022-02-02 13:06 ` Kirill A. Shutemov
2022-02-02 17:18 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 07/29] x86/tdx: Handle CPUID via #VE Kirill A. Shutemov
2022-02-01 21:39 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 08/29] x86/tdx: Handle in-kernel MMIO Kirill A. Shutemov
2022-01-24 19:30 ` Josh Poimboeuf
2022-01-24 22:08 ` Kirill A. Shutemov
2022-01-24 23:04 ` Josh Poimboeuf
2022-01-24 22:40 ` Dave Hansen
2022-01-24 23:04 ` [PATCHv2.1 " Kirill A. Shutemov
2022-02-01 16:14 ` Borislav Petkov
2022-02-01 22:30 ` [PATCHv2 " Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 09/29] x86/tdx: Detect TDX at early kernel decompression time Kirill A. Shutemov
2022-02-01 18:30 ` Borislav Petkov
2022-02-01 22:33 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 10/29] x86: Consolidate port I/O helpers Kirill A. Shutemov
2022-02-01 22:36 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 11/29] x86/boot: Allow to hook up alternative " Kirill A. Shutemov
2022-02-01 19:02 ` Borislav Petkov
2022-02-01 22:39 ` Thomas Gleixner
2022-02-01 22:53 ` Thomas Gleixner
2022-02-02 17:20 ` Kirill A. Shutemov
2022-02-02 19:05 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 12/29] x86/boot/compressed: Support TDX guest port I/O at decompression time Kirill A. Shutemov
2022-02-01 22:55 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 13/29] x86/tdx: Add port I/O emulation Kirill A. Shutemov
2022-02-01 23:01 ` Thomas Gleixner
2022-02-02 6:22 ` Borislav Petkov
2022-01-24 15:02 ` [PATCHv2 14/29] x86/tdx: Early boot handling of port I/O Kirill A. Shutemov
2022-02-01 23:02 ` Thomas Gleixner
2022-02-02 10:09 ` Borislav Petkov
2022-01-24 15:02 ` [PATCHv2 15/29] x86/tdx: Wire up KVM hypercalls Kirill A. Shutemov
2022-02-01 23:05 ` Thomas Gleixner
2022-01-24 15:02 ` [PATCHv2 16/29] x86/boot: Add a trampoline for booting APs via firmware handoff Kirill A. Shutemov
2022-02-01 23:06 ` Thomas Gleixner
2022-02-02 11:27 ` Borislav Petkov
2022-02-04 11:27 ` Kuppuswamy, Sathyanarayanan
2022-02-04 13:49 ` Borislav Petkov
2022-02-15 21:36 ` Kirill A. Shutemov
2022-02-16 10:07 ` Borislav Petkov
2022-02-16 14:10 ` Kirill A. Shutemov
2022-02-10 0:25 ` Kai Huang
2022-01-24 15:02 ` [PATCHv2 17/29] x86/acpi, x86/boot: Add multiprocessor wake-up support Kirill A. Shutemov
2022-02-01 23:27 ` Thomas Gleixner
2022-02-05 12:37 ` Kuppuswamy, Sathyanarayanan
2022-01-24 15:02 ` [PATCHv2 18/29] x86/boot: Avoid #VE during boot for TDX platforms Kirill A. Shutemov
2022-02-02 0:04 ` Thomas Gleixner
2022-02-11 16:13 ` Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 19/29] x86/topology: Disable CPU online/offline control for TDX guests Kirill A. Shutemov
2022-02-02 0:09 ` Thomas Gleixner
2022-02-02 0:11 ` Thomas Gleixner
2022-02-03 15:00 ` Borislav Petkov
2022-02-03 21:26 ` Thomas Gleixner
2022-01-24 15:02 ` [PATCHv2 20/29] x86/tdx: Get page shared bit info from the TDX module Kirill A. Shutemov
2022-02-02 0:14 ` Thomas Gleixner
2022-02-07 22:27 ` Sean Christopherson
2022-02-07 10:44 ` Borislav Petkov
2022-01-24 15:02 ` [PATCHv2 21/29] x86/tdx: Exclude shared bit from __PHYSICAL_MASK Kirill A. Shutemov
2022-02-02 0:18 ` Thomas Gleixner
2022-01-24 15:02 ` [PATCHv2 22/29] x86/tdx: Make pages shared in ioremap() Kirill A. Shutemov
2022-02-02 0:25 ` Thomas Gleixner
2022-02-02 19:27 ` Kirill A. Shutemov
2022-02-02 19:47 ` Thomas Gleixner
2022-02-07 16:27 ` Borislav Petkov
2022-02-07 16:57 ` Dave Hansen
2022-02-07 17:28 ` Borislav Petkov
2022-02-14 22:09 ` Kirill A. Shutemov
2022-02-15 10:50 ` Borislav Petkov
2022-02-15 14:49 ` Tom Lendacky
2022-02-15 15:41 ` Kirill A. Shutemov
2022-02-15 15:55 ` Tom Lendacky
2022-02-15 16:27 ` Kirill A. Shutemov
2022-02-15 16:34 ` Dave Hansen
2022-02-15 17:33 ` Kirill A. Shutemov
2022-02-16 9:58 ` Borislav Petkov
2022-02-16 15:37 ` Kirill A. Shutemov
2022-02-17 15:24 ` Borislav Petkov
2022-01-24 15:02 ` [PATCHv2 23/29] x86/tdx: Add helper to convert memory between shared and private Kirill A. Shutemov
2022-02-02 0:35 ` Thomas Gleixner
2022-02-08 12:12 ` Borislav Petkov
2022-02-09 23:21 ` Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 24/29] x86/mm/cpa: Add support for TDX shared memory Kirill A. Shutemov
2022-02-02 1:27 ` Thomas Gleixner
2022-01-24 15:02 ` [PATCHv2 25/29] x86/kvm: Use bounce buffers for TD guest Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 26/29] x86/tdx: ioapic: Add shared bit for IOAPIC base address Kirill A. Shutemov
2022-02-02 1:33 ` Thomas Gleixner
2022-02-04 22:09 ` Yamahata, Isaku
2022-02-04 22:31 ` Kirill A. Shutemov
2022-02-07 14:08 ` Tom Lendacky
2022-01-24 15:02 ` [PATCHv2 27/29] ACPICA: Avoid cache flush on TDX guest Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 28/29] x86/tdx: Warn about unexpected WBINVD Kirill A. Shutemov
2022-02-02 1:46 ` Thomas Gleixner
2022-02-04 21:35 ` Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 29/29] Documentation/x86: Document TDX kernel architecture Kirill A. Shutemov
2022-02-24 9:08 ` Xiaoyao Li
2022-02-09 10:56 ` [PATCHv2 00/29] TDX Guest: TDX core support Kai Huang
2022-02-09 11:08 ` Borislav Petkov
2022-02-09 11:30 ` Kai Huang
2022-02-09 11:40 ` Borislav Petkov
2022-02-09 11:48 ` Kai Huang
2022-02-09 11:56 ` Borislav Petkov
2022-02-09 11:58 ` Kai Huang
2022-02-09 16:50 ` Sean Christopherson
2022-02-09 19:11 ` Borislav Petkov
2022-02-09 20:07 ` Sean Christopherson
2022-02-09 20:36 ` Borislav Petkov
2022-02-10 0:05 ` Kai Huang
2022-02-16 16:08 ` Sean Christopherson
2022-02-16 15:48 ` Kirill A. Shutemov
2022-02-17 15:19 ` Borislav Petkov
2022-02-17 15:26 ` Kirill A. Shutemov
2022-02-17 15:34 ` Borislav Petkov
2022-02-17 15:29 ` Sean Christopherson
2022-02-17 15:31 ` Borislav Petkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=877daez4em.ffs@tglx \
--to=tglx@linutronix.de \
--cc=aarcange@redhat.com \
--cc=ak@linux.intel.com \
--cc=bp@alien8.de \
--cc=dan.j.williams@intel.com \
--cc=dave.hansen@intel.com \
--cc=david@redhat.com \
--cc=hpa@zytor.com \
--cc=jgross@suse.com \
--cc=jmattson@google.com \
--cc=joro@8bytes.org \
--cc=jpoimboe@redhat.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=knsathya@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=luto@kernel.org \
--cc=mingo@redhat.com \
--cc=pbonzini@redhat.com \
--cc=peterz@infradead.org \
--cc=sathyanarayanan.kuppuswamy@linux.intel.com \
--cc=sdeep@vmware.com \
--cc=sean.j.christopherson@intel.com \
--cc=seanjc@google.com \
--cc=tony.luck@intel.com \
--cc=vkuznets@redhat.com \
--cc=wanpengli@tencent.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.