From: Thomas Gleixner <tglx@linutronix.de>
To: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
mingo@redhat.com, bp@alien8.de, dave.hansen@intel.com,
luto@kernel.org, peterz@infradead.org
Cc: sathyanarayanan.kuppuswamy@linux.intel.com, aarcange@redhat.com,
ak@linux.intel.com, dan.j.williams@intel.com, david@redhat.com,
hpa@zytor.com, jgross@suse.com, jmattson@google.com,
joro@8bytes.org, jpoimboe@redhat.com, knsathya@kernel.org,
pbonzini@redhat.com, sdeep@vmware.com, seanjc@google.com,
tony.luck@intel.com, vkuznets@redhat.com, wanpengli@tencent.com,
x86@kernel.org, linux-kernel@vger.kernel.org,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
Sean Christopherson <sean.j.christopherson@intel.com>
Subject: Re: [PATCHv2 04/29] x86/traps: Add #VE support for TDX guest
Date: Tue, 01 Feb 2022 22:02:41 +0100 [thread overview]
Message-ID: <877daez4em.ffs@tglx> (raw)
In-Reply-To: <20220124150215.36893-5-kirill.shutemov@linux.intel.com>
On Mon, Jan 24 2022 at 18:01, Kirill A. Shutemov wrote:
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index df0fa695bb09..1da074123c16 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -68,6 +68,9 @@ static const __initconst struct idt_data early_idts[] = {
> */
> INTG(X86_TRAP_PF, asm_exc_page_fault),
> #endif
> +#ifdef CONFIG_INTEL_TDX_GUEST
> + INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
> +#endif
>
> +bool tdx_get_ve_info(struct ve_info *ve)
> +{
> + struct tdx_module_output out;
> +
> + /*
> + * NMIs and machine checks are suppressed. Before this point any
> + * #VE is fatal. After this point (TDGETVEINFO call), NMIs and
> + * additional #VEs are permitted (but it is expected not to
> + * happen unless kernel panics).
I really do not understand that comment. #NMI and #MC are suppressed
according to the above. How long are they suppressed and what's the
mechanism? Are they unblocked on return from __tdx_module_call() ?
What prevents a nested #VE? If it happens what makes it fatal? Is it
converted to a #DF or detected by software?
Also I do not understand what the last sentence tries to tell me. If the
suppression of #NMI and #MC is lifted on return from tdcall then both
can be delivered immediately afterwards, right?
I assume the additional #VE is triggered by software or a bug in the
kernel.
Confused.
> + */
> + if (__tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out))
> + return false;
> +
> + ve->exit_reason = out.rcx;
> + ve->exit_qual = out.rdx;
> + ve->gla = out.r8;
> + ve->gpa = out.r9;
> + ve->instr_len = lower_32_bits(out.r10);
> + ve->instr_info = upper_32_bits(out.r10);
> +
> + return true;
> +}
> +
> +/*
> + * Handle the user initiated #VE.
> + *
> + * For example, executing the CPUID instruction from user space
> + * is a valid case and hence the resulting #VE has to be handled.
> + *
> + * For dis-allowed or invalid #VE just return failure.
> + */
> +static bool tdx_virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
> +{
> + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
> + return false;
> +}
> +
> +/* Handle the kernel #VE */
> +static bool tdx_virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
> +{
> + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
> + return false;
> +}
> +
> +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
> +{
> + bool ret;
> +
> + if (user_mode(regs))
> + ret = tdx_virt_exception_user(regs, ve);
> + else
> + ret = tdx_virt_exception_kernel(regs, ve);
> +
> + /* After successful #VE handling, move the IP */
> + if (ret)
> + regs->ip += ve->instr_len;
> +
> + return ret;
> +}
> +
> bool is_tdx_guest(void)
> {
> return tdx_guest_detected;
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index c9d566dcf89a..428504535912 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -61,6 +61,7 @@
> #include <asm/insn.h>
> #include <asm/insn-eval.h>
> #include <asm/vdso.h>
> +#include <asm/tdx.h>
>
> #ifdef CONFIG_X86_64
> #include <asm/x86_init.h>
> @@ -1212,6 +1213,115 @@ DEFINE_IDTENTRY(exc_device_not_available)
> }
> }
>
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +
> +#define VE_FAULT_STR "VE fault"
> +
> +static void ve_raise_fault(struct pt_regs *regs, long error_code)
> +{
> + struct task_struct *tsk = current;
> +
> + if (user_mode(regs)) {
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_VE;
> + show_signal(tsk, SIGSEGV, "", VE_FAULT_STR, regs, error_code);
> + force_sig(SIGSEGV);
> + return;
> + }
> +
> + /*
> + * Attempt to recover from #VE exception failure without
> + * triggering OOPS (useful for MSR read/write failures)
> + */
> + if (fixup_exception(regs, X86_TRAP_VE, error_code, 0))
> + return;
> +
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_VE;
> +
> + /*
> + * To be potentially processing a kprobe fault and to trust the result
> + * from kprobe_running(), it should be non-preemptible.
> + */
> + if (!preemptible() && kprobe_running() &&
> + kprobe_fault_handler(regs, X86_TRAP_VE))
> + return;
> +
> + /* Notify about #VE handling failure, useful for debugger hooks */
> + if (notify_die(DIE_GPF, VE_FAULT_STR, regs, error_code,
> + X86_TRAP_VE, SIGSEGV) == NOTIFY_STOP)
> + return;
> +
> + /* Trigger OOPS and panic */
> + die_addr(VE_FAULT_STR, regs, error_code, 0);
This is pretty much a copy of the #GP handling. So why not consolidating
this properly?
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -559,6 +559,36 @@ static bool fixup_iopl_exception(struct
return true;
}
+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, long error_code,
+ const char *str)
+{
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return true;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
+
+ return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP;
+}
+
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, long error_code,
+ const char *str)
+{
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
@@ -587,34 +617,14 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_pr
if (fixup_iopl_exception(regs))
goto exit;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
goto exit;
- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
- force_sig(SIGSEGV);
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
goto exit;
}
- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- goto exit;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
- /*
- * To be potentially processing a kprobe fault and to trust the result
- * from kprobe_running(), we have to be non-preemptible.
- */
- if (!preemptible() &&
- kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- goto exit;
-
- ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
- if (ret == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc))
goto exit;
if (error_code)
which makes this:
static void ve_raise_fault(struct pt_regs *regs, long error_code)
{
if (user_mode(regs)) {
gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
return;
}
if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR))
return;
die_addr(VE_FAULT_STR, regs, error_code, 0);
}
Hmm?
> +/*
> + * Virtualization Exceptions (#VE) are delivered to TDX guests due to
> + * specific guest actions which may happen in either user space or the
> + * kernel:
> + *
> + * * Specific instructions (WBINVD, for example)
> + * * Specific MSR accesses
> + * * Specific CPUID leaf accesses
> + * * Access to unmapped pages (EPT violation)
> + *
> + * In the settings that Linux will run in, virtualization exceptions are
> + * never generated on accesses to normal, TD-private memory that has been
> + * accepted.
> + *
> + * Syscall entry code has a critical window where the kernel stack is not
> + * yet set up. Any exception in this window leads to hard to debug issues
> + * and can be exploited for privilege escalation. Exceptions in the NMI
> + * entry code also cause issues. Returning from the exception handler with
> + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
> + *
> + * For these reasons, the kernel avoids #VEs during the syscall gap and
> + * the NMI entry code. Entry code paths do not access TD-shared memory,
> + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
> + * that might generate #VE.
How is that enforced or validated? What checks for a violation of that
assumption?
Thanks,
tglx
next prev parent reply other threads:[~2022-02-01 21:03 UTC|newest]
Thread overview: 154+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-01-24 15:01 [PATCHv2 00/29] TDX Guest: TDX core support Kirill A. Shutemov
2022-01-24 15:01 ` [PATCHv2 01/29] x86/tdx: Detect running as a TDX guest in early boot Kirill A. Shutemov
2022-02-01 19:29 ` Thomas Gleixner
2022-02-01 23:14 ` Kirill A. Shutemov
2022-02-03 0:32 ` Josh Poimboeuf
2022-02-03 14:09 ` Kirill A. Shutemov
2022-02-03 15:13 ` Dave Hansen
2022-01-24 15:01 ` [PATCHv2 02/29] x86/tdx: Extend the cc_platform_has() API to support TDX guests Kirill A. Shutemov
2022-02-01 19:31 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 03/29] x86/tdx: Add __tdx_module_call() and __tdx_hypercall() helper functions Kirill A. Shutemov
2022-02-01 19:58 ` Thomas Gleixner
2022-02-02 2:55 ` Kirill A. Shutemov
2022-02-02 10:59 ` Kai Huang
2022-02-03 14:44 ` Kirill A. Shutemov
2022-02-03 23:47 ` Kai Huang
2022-02-04 3:43 ` Kirill A. Shutemov
2022-02-04 9:51 ` Kai Huang
2022-02-04 13:20 ` Kirill A. Shutemov
2022-02-04 10:12 ` Kai Huang
2022-02-04 13:18 ` Kirill A. Shutemov
2022-02-05 0:06 ` Kai Huang
2022-02-02 17:08 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 04/29] x86/traps: Add #VE support for TDX guest Kirill A. Shutemov
2022-02-01 21:02 ` Thomas Gleixner [this message]
2022-02-01 21:26 ` Sean Christopherson
2022-02-12 1:42 ` Kirill A. Shutemov
2022-01-24 15:01 ` [PATCHv2 05/29] x86/tdx: Add HLT support for TDX guests Kirill A. Shutemov
2022-01-29 14:53 ` Borislav Petkov
2022-01-29 22:30 ` [PATCHv2.1 " Kirill A. Shutemov
2022-02-01 21:21 ` Thomas Gleixner
2022-02-02 12:48 ` Kirill A. Shutemov
2022-02-02 17:17 ` Thomas Gleixner
2022-02-04 16:55 ` Kirill A. Shutemov
2022-02-07 22:52 ` Sean Christopherson
2022-02-09 14:34 ` Kirill A. Shutemov
2022-02-09 18:05 ` Sean Christopherson
2022-02-09 22:23 ` Kirill A. Shutemov
2022-02-10 1:21 ` Sean Christopherson
2022-01-24 15:01 ` [PATCHv2 06/29] x86/tdx: Add MSR " Kirill A. Shutemov
2022-02-01 21:38 ` Thomas Gleixner
2022-02-02 13:06 ` Kirill A. Shutemov
2022-02-02 17:18 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 07/29] x86/tdx: Handle CPUID via #VE Kirill A. Shutemov
2022-02-01 21:39 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 08/29] x86/tdx: Handle in-kernel MMIO Kirill A. Shutemov
2022-01-24 19:30 ` Josh Poimboeuf
2022-01-24 22:08 ` Kirill A. Shutemov
2022-01-24 23:04 ` Josh Poimboeuf
2022-01-24 22:40 ` Dave Hansen
2022-01-24 23:04 ` [PATCHv2.1 " Kirill A. Shutemov
2022-02-01 16:14 ` Borislav Petkov
2022-02-01 22:30 ` [PATCHv2 " Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 09/29] x86/tdx: Detect TDX at early kernel decompression time Kirill A. Shutemov
2022-02-01 18:30 ` Borislav Petkov
2022-02-01 22:33 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 10/29] x86: Consolidate port I/O helpers Kirill A. Shutemov
2022-02-01 22:36 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 11/29] x86/boot: Allow to hook up alternative " Kirill A. Shutemov
2022-02-01 19:02 ` Borislav Petkov
2022-02-01 22:39 ` Thomas Gleixner
2022-02-01 22:53 ` Thomas Gleixner
2022-02-02 17:20 ` Kirill A. Shutemov
2022-02-02 19:05 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 12/29] x86/boot/compressed: Support TDX guest port I/O at decompression time Kirill A. Shutemov
2022-02-01 22:55 ` Thomas Gleixner
2022-01-24 15:01 ` [PATCHv2 13/29] x86/tdx: Add port I/O emulation Kirill A. Shutemov
2022-02-01 23:01 ` Thomas Gleixner
2022-02-02 6:22 ` Borislav Petkov
2022-01-24 15:02 ` [PATCHv2 14/29] x86/tdx: Early boot handling of port I/O Kirill A. Shutemov
2022-02-01 23:02 ` Thomas Gleixner
2022-02-02 10:09 ` Borislav Petkov
2022-01-24 15:02 ` [PATCHv2 15/29] x86/tdx: Wire up KVM hypercalls Kirill A. Shutemov
2022-02-01 23:05 ` Thomas Gleixner
2022-01-24 15:02 ` [PATCHv2 16/29] x86/boot: Add a trampoline for booting APs via firmware handoff Kirill A. Shutemov
2022-02-01 23:06 ` Thomas Gleixner
2022-02-02 11:27 ` Borislav Petkov
2022-02-04 11:27 ` Kuppuswamy, Sathyanarayanan
2022-02-04 13:49 ` Borislav Petkov
2022-02-15 21:36 ` Kirill A. Shutemov
2022-02-16 10:07 ` Borislav Petkov
2022-02-16 14:10 ` Kirill A. Shutemov
2022-02-10 0:25 ` Kai Huang
2022-01-24 15:02 ` [PATCHv2 17/29] x86/acpi, x86/boot: Add multiprocessor wake-up support Kirill A. Shutemov
2022-02-01 23:27 ` Thomas Gleixner
2022-02-05 12:37 ` Kuppuswamy, Sathyanarayanan
2022-01-24 15:02 ` [PATCHv2 18/29] x86/boot: Avoid #VE during boot for TDX platforms Kirill A. Shutemov
2022-02-02 0:04 ` Thomas Gleixner
2022-02-11 16:13 ` Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 19/29] x86/topology: Disable CPU online/offline control for TDX guests Kirill A. Shutemov
2022-02-02 0:09 ` Thomas Gleixner
2022-02-02 0:11 ` Thomas Gleixner
2022-02-03 15:00 ` Borislav Petkov
2022-02-03 21:26 ` Thomas Gleixner
2022-01-24 15:02 ` [PATCHv2 20/29] x86/tdx: Get page shared bit info from the TDX module Kirill A. Shutemov
2022-02-02 0:14 ` Thomas Gleixner
2022-02-07 22:27 ` Sean Christopherson
2022-02-07 10:44 ` Borislav Petkov
2022-01-24 15:02 ` [PATCHv2 21/29] x86/tdx: Exclude shared bit from __PHYSICAL_MASK Kirill A. Shutemov
2022-02-02 0:18 ` Thomas Gleixner
2022-01-24 15:02 ` [PATCHv2 22/29] x86/tdx: Make pages shared in ioremap() Kirill A. Shutemov
2022-02-02 0:25 ` Thomas Gleixner
2022-02-02 19:27 ` Kirill A. Shutemov
2022-02-02 19:47 ` Thomas Gleixner
2022-02-07 16:27 ` Borislav Petkov
2022-02-07 16:57 ` Dave Hansen
2022-02-07 17:28 ` Borislav Petkov
2022-02-14 22:09 ` Kirill A. Shutemov
2022-02-15 10:50 ` Borislav Petkov
2022-02-15 14:49 ` Tom Lendacky
2022-02-15 15:41 ` Kirill A. Shutemov
2022-02-15 15:55 ` Tom Lendacky
2022-02-15 16:27 ` Kirill A. Shutemov
2022-02-15 16:34 ` Dave Hansen
2022-02-15 17:33 ` Kirill A. Shutemov
2022-02-16 9:58 ` Borislav Petkov
2022-02-16 15:37 ` Kirill A. Shutemov
2022-02-17 15:24 ` Borislav Petkov
2022-01-24 15:02 ` [PATCHv2 23/29] x86/tdx: Add helper to convert memory between shared and private Kirill A. Shutemov
2022-02-02 0:35 ` Thomas Gleixner
2022-02-08 12:12 ` Borislav Petkov
2022-02-09 23:21 ` Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 24/29] x86/mm/cpa: Add support for TDX shared memory Kirill A. Shutemov
2022-02-02 1:27 ` Thomas Gleixner
2022-01-24 15:02 ` [PATCHv2 25/29] x86/kvm: Use bounce buffers for TD guest Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 26/29] x86/tdx: ioapic: Add shared bit for IOAPIC base address Kirill A. Shutemov
2022-02-02 1:33 ` Thomas Gleixner
2022-02-04 22:09 ` Yamahata, Isaku
2022-02-04 22:31 ` Kirill A. Shutemov
2022-02-07 14:08 ` Tom Lendacky
2022-01-24 15:02 ` [PATCHv2 27/29] ACPICA: Avoid cache flush on TDX guest Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 28/29] x86/tdx: Warn about unexpected WBINVD Kirill A. Shutemov
2022-02-02 1:46 ` Thomas Gleixner
2022-02-04 21:35 ` Kirill A. Shutemov
2022-01-24 15:02 ` [PATCHv2 29/29] Documentation/x86: Document TDX kernel architecture Kirill A. Shutemov
2022-02-24 9:08 ` Xiaoyao Li
2022-02-09 10:56 ` [PATCHv2 00/29] TDX Guest: TDX core support Kai Huang
2022-02-09 11:08 ` Borislav Petkov
2022-02-09 11:30 ` Kai Huang
2022-02-09 11:40 ` Borislav Petkov
2022-02-09 11:48 ` Kai Huang
2022-02-09 11:56 ` Borislav Petkov
2022-02-09 11:58 ` Kai Huang
2022-02-09 16:50 ` Sean Christopherson
2022-02-09 19:11 ` Borislav Petkov
2022-02-09 20:07 ` Sean Christopherson
2022-02-09 20:36 ` Borislav Petkov
2022-02-10 0:05 ` Kai Huang
2022-02-16 16:08 ` Sean Christopherson
2022-02-16 15:48 ` Kirill A. Shutemov
2022-02-17 15:19 ` Borislav Petkov
2022-02-17 15:26 ` Kirill A. Shutemov
2022-02-17 15:34 ` Borislav Petkov
2022-02-17 15:29 ` Sean Christopherson
2022-02-17 15:31 ` Borislav Petkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=877daez4em.ffs@tglx \
--to=tglx@linutronix.de \
--cc=aarcange@redhat.com \
--cc=ak@linux.intel.com \
--cc=bp@alien8.de \
--cc=dan.j.williams@intel.com \
--cc=dave.hansen@intel.com \
--cc=david@redhat.com \
--cc=hpa@zytor.com \
--cc=jgross@suse.com \
--cc=jmattson@google.com \
--cc=joro@8bytes.org \
--cc=jpoimboe@redhat.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=knsathya@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=luto@kernel.org \
--cc=mingo@redhat.com \
--cc=pbonzini@redhat.com \
--cc=peterz@infradead.org \
--cc=sathyanarayanan.kuppuswamy@linux.intel.com \
--cc=sdeep@vmware.com \
--cc=sean.j.christopherson@intel.com \
--cc=seanjc@google.com \
--cc=tony.luck@intel.com \
--cc=vkuznets@redhat.com \
--cc=wanpengli@tencent.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).