* [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c
@ 2024-07-10  6:29 Paolo Bonzini
  2024-07-10  6:29 ` [PATCH 01/10] target/i386/tcg: Remove SEG_ADDL Paolo Bonzini
                   ` (10 more replies)
  0 siblings, 11 replies; 27+ messages in thread
From: Paolo Bonzini @ 2024-07-10  6:29 UTC (permalink / raw)
  To: qemu-devel; +Cc: rrh.henry, richard.henderson
This includes bugfixes:
- allowing IRET from user mode to user mode with SMAP (do not use implicit
  kernel accesses, which break if the stack is in userspace)
- use DPL-level accesses for interrupts and call gates
- various fixes for task switching
And two related cleanups: computing MMU index once for far calls and returns
(including task switches), and using X86Access for TSS access.
Tested with a really ugly patch to kvm-unit-tests, included after signature.
Paolo Bonzini (7):
  target/i386/tcg: Allow IRET from user mode to user mode with SMAP
  target/i386/tcg: use PUSHL/PUSHW for error code
  target/i386/tcg: Compute MMU index once
  target/i386/tcg: Use DPL-level accesses for interrupts and call gates
  target/i386/tcg: check for correct busy state before switching to a
    new task
  target/i386/tcg: use X86Access for TSS access
  target/i386/tcg: save current task state before loading new one
Richard Henderson (3):
  target/i386/tcg: Remove SEG_ADDL
  target/i386/tcg: Reorg push/pop within seg_helper.c
  target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl
 target/i386/cpu.h            |  11 +-
 target/i386/cpu.c            |  27 +-
 target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++----------------
 3 files changed, 354 insertions(+), 290 deletions(-)
-- 
2.45.2
diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c
index c3ec0ad7..0bf40c6d 100644
--- a/lib/x86/usermode.c
+++ b/lib/x86/usermode.c
@@ -5,13 +5,15 @@
 #include "x86/desc.h"
 #include "x86/isr.h"
 #include "alloc.h"
+#include "alloc_page.h"
 #include "setjmp.h"
 #include "usermode.h"
 
 #include "libcflat.h"
 #include <stdint.h>
 
-#define USERMODE_STACK_SIZE	0x2000
+#define USERMODE_STACK_ORDER	1 /* 8k */
+#define USERMODE_STACK_SIZE	(1 << (12 + USERMODE_STACK_ORDER))
 #define RET_TO_KERNEL_IRQ	0x20
 
 static jmp_buf jmpbuf;
@@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 {
 	extern char ret_to_kernel;
 	volatile uint64_t rax = 0;
-	static unsigned char user_stack[USERMODE_STACK_SIZE];
+	static unsigned char *user_stack;
 	handler old_ex;
 
+	if (!user_stack) {
+		user_stack = alloc_pages(USERMODE_STACK_ORDER);
+		printf("%p\n", user_stack);
+	}
+
 	*raised_vector = 0;
 	set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3);
 	old_ex = handle_exception(fault_vector,
@@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 		return 0;
 	}
 
+	memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8);
+
 	asm volatile (
 			/* Prepare kernel SP for exception handlers */
 			"mov %%rsp, %[rsp0]\n\t"
@@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			"pushq %[user_stack_top]\n\t"
 			"pushfq\n\t"
 			"pushq %[user_cs]\n\t"
-			"lea user_mode(%%rip), %%rax\n\t"
+			"lea user_mode+0x800000(%%rip), %%rax\n\t" // smap.flat places usermode addresses at 8MB-16MB
 			"pushq %%rax\n\t"
 			"iretq\n"
 
 			"user_mode:\n\t"
 			/* Back up volatile registers before invoking func */
+			"pop %%rax\n\t"
 			"push %%rcx\n\t"
 			"push %%rdx\n\t"
 			"push %%rdi\n\t"
@@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			"push %%r10\n\t"
 			"push %%r11\n\t"
 			/* Call user mode function */
+			"add $0x800000,%%rbp\n\t"
 			"mov %[arg1], %%rdi\n\t"
 			"mov %[arg2], %%rsi\n\t"
 			"mov %[arg3], %%rdx\n\t"
 			"mov %[arg4], %%rcx\n\t"
-			"call *%[func]\n\t"
+			"call *%%rax\n\t"
 			/* Restore registers */
 			"pop %%r11\n\t"
 			"pop %%r10\n\t"
@@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			[arg2]"m"(arg2),
 			[arg3]"m"(arg3),
 			[arg4]"m"(arg4),
-			[func]"m"(func),
 			[user_ds]"i"(USER_DS),
 			[user_cs]"i"(USER_CS),
 			[kernel_ds]"rm"(KERNEL_DS),
 			[user_stack_top]"r"(user_stack +
-					sizeof(user_stack)),
+					USERMODE_STACK_SIZE - 8),
 			[kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ));
 
 	handle_exception(fault_vector, old_ex);
diff --git a/x86/smap.c b/x86/smap.c
index 9a823a55..65119442 100644
--- a/x86/smap.c
+++ b/x86/smap.c
@@ -2,6 +2,7 @@
 #include <alloc_page.h>
 #include "x86/desc.h"
 #include "x86/processor.h"
+#include "x86/usermode.h"
 #include "x86/vm.h"
 
 volatile int pf_count = 0;
@@ -89,6 +90,31 @@ static void check_smap_nowp(void)
 	write_cr3(read_cr3());
 }
 
+#ifdef __x86_64__
+static void iret(void)
+{
+	asm volatile(
+	    "mov %%rsp, %%rcx;"
+	    "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;"
+	    "pushf;"
+	    "movl %%cs, %%ebx; pushq %%rbx; "
+	    "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:"
+
+		: : : "ebx", "ecx", "cc"); /* RPL=0 */
+}
+
+static void test_user_iret(void)
+{
+	bool raised_vector;
+	uintptr_t user_iret = (uintptr_t)iret + USER_BASE;
+
+	run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0,
+		    &raised_vector);
+
+	report(!raised_vector, "No #PF on CPL=3 DPL=3 iret");
+}
+#endif
+
 int main(int ac, char **av)
 {
 	unsigned long i;
@@ -196,7 +222,9 @@ int main(int ac, char **av)
 
 	check_smap_nowp();
 
-	// TODO: implicit kernel access from ring 3 (e.g. int)
+#ifdef __x86_64__
+	test_user_iret();
+#endif
 
 	return report_summary();
 }
^ permalink raw reply related	[flat|nested] 27+ messages in thread* [PATCH 01/10] target/i386/tcg: Remove SEG_ADDL 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 6:29 ` [PATCH 02/10] target/i386/tcg: Allow IRET from user mode to user mode with SMAP Paolo Bonzini ` (9 subsequent siblings) 10 siblings, 0 replies; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson From: Richard Henderson <richard.henderson@linaro.org> This truncation is now handled by MMU_*32_IDX. The introduction of MMU_*32_IDX in fact applied correct 32-bit wraparound to 16-bit accesses with a high segment base (e.g. big real mode or vm86 mode), which did not use SEG_ADDL. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Link: https://lore.kernel.org/r/20240617161210.4639-3-richard.henderson@linaro.org Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index aee3d19f29b..19d6b41a589 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -579,10 +579,6 @@ int exception_has_error_code(int intno) } while (0) #endif -/* in 64-bit machines, this can overflow. So this segment addition macro - * can be used to trim the value to 32-bit whenever needed */ -#define SEG_ADDL(ssp, sp, sp_mask) ((uint32_t)((ssp) + (sp & (sp_mask)))) - /* XXX: add a is_user flag to have proper security support */ #define PUSHW_RA(ssp, sp, sp_mask, val, ra) \ { \ @@ -593,7 +589,7 @@ int exception_has_error_code(int intno) #define PUSHL_RA(ssp, sp, sp_mask, val, ra) \ { \ sp -= 4; \ - cpu_stl_kernel_ra(env, SEG_ADDL(ssp, sp, sp_mask), (uint32_t)(val), ra); \ + cpu_stl_kernel_ra(env, (ssp) + (sp & (sp_mask)), (val), ra); \ } #define POPW_RA(ssp, sp, sp_mask, val, ra) \ @@ -604,7 +600,7 @@ int exception_has_error_code(int intno) #define POPL_RA(ssp, sp, sp_mask, val, ra) \ { \ - val = (uint32_t)cpu_ldl_kernel_ra(env, SEG_ADDL(ssp, sp, sp_mask), ra); \ + val = (uint32_t)cpu_ldl_kernel_ra(env, (ssp) + (sp & (sp_mask)), ra); \ sp += 4; \ } -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 02/10] target/i386/tcg: Allow IRET from user mode to user mode with SMAP 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini 2024-07-10 6:29 ` [PATCH 01/10] target/i386/tcg: Remove SEG_ADDL Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 15:22 ` Richard Henderson 2024-07-10 6:29 ` [PATCH 03/10] target/i386/tcg: use PUSHL/PUSHW for error code Paolo Bonzini ` (8 subsequent siblings) 10 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson This fixes a bug wherein i386/tcg assumed an interrupt return using the IRET instruction was always returning from kernel mode to either kernel mode or user mode. This assumption is violated when IRET is used as a clever way to restore thread state, as for example in the dotnet runtime. There, IRET returns from user mode to user mode. This bug is that stack accesses from IRET and RETF, as well as accesses to the parameters in a call gate, are normal data accesses using the current CPL. This manifested itself as a page fault in the guest Linux kernel due to SMAP preventing the access. This bug appears to have been in QEMU since the beginning. Analyzed-by: Robert R. Henry <rrh.henry@gmail.com> Co-developed-by: Robert R. Henry <rrh.henry@gmail.com> Signed-off-by: Robert R. Henry <rrh.henry@gmail.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 19d6b41a589..4977a5f5b3a 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -594,13 +594,13 @@ int exception_has_error_code(int intno) #define POPW_RA(ssp, sp, sp_mask, val, ra) \ { \ - val = cpu_lduw_kernel_ra(env, (ssp) + (sp & (sp_mask)), ra); \ + val = cpu_lduw_data_ra(env, (ssp) + (sp & (sp_mask)), ra); \ sp += 2; \ } #define POPL_RA(ssp, sp, sp_mask, val, ra) \ { \ - val = (uint32_t)cpu_ldl_kernel_ra(env, (ssp) + (sp & (sp_mask)), ra); \ + val = (uint32_t)cpu_ldl_data_ra(env, (ssp) + (sp & (sp_mask)), ra); \ sp += 4; \ } @@ -847,7 +847,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, #define POPQ_RA(sp, val, ra) \ { \ - val = cpu_ldq_kernel_ra(env, sp, ra); \ + val = cpu_ldq_data_ra(env, sp, ra); \ sp += 8; \ } @@ -1797,18 +1797,18 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, PUSHL_RA(ssp, sp, sp_mask, env->segs[R_SS].selector, GETPC()); PUSHL_RA(ssp, sp, sp_mask, env->regs[R_ESP], GETPC()); for (i = param_count - 1; i >= 0; i--) { - val = cpu_ldl_kernel_ra(env, old_ssp + - ((env->regs[R_ESP] + i * 4) & - old_sp_mask), GETPC()); + val = cpu_ldl_data_ra(env, + old_ssp + ((env->regs[R_ESP] + i * 4) & old_sp_mask), + GETPC()); PUSHL_RA(ssp, sp, sp_mask, val, GETPC()); } } else { PUSHW_RA(ssp, sp, sp_mask, env->segs[R_SS].selector, GETPC()); PUSHW_RA(ssp, sp, sp_mask, env->regs[R_ESP], GETPC()); for (i = param_count - 1; i >= 0; i--) { - val = cpu_lduw_kernel_ra(env, old_ssp + - ((env->regs[R_ESP] + i * 2) & - old_sp_mask), GETPC()); + val = cpu_lduw_data_ra(env, + old_ssp + ((env->regs[R_ESP] + i * 2) & old_sp_mask), + GETPC()); PUSHW_RA(ssp, sp, sp_mask, val, GETPC()); } } -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 02/10] target/i386/tcg: Allow IRET from user mode to user mode with SMAP 2024-07-10 6:29 ` [PATCH 02/10] target/i386/tcg: Allow IRET from user mode to user mode with SMAP Paolo Bonzini @ 2024-07-10 15:22 ` Richard Henderson 0 siblings, 0 replies; 27+ messages in thread From: Richard Henderson @ 2024-07-10 15:22 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry On 7/9/24 23:29, Paolo Bonzini wrote: > This fixes a bug wherein i386/tcg assumed an interrupt return using > the IRET instruction was always returning from kernel mode to either > kernel mode or user mode. This assumption is violated when IRET is used > as a clever way to restore thread state, as for example in the dotnet > runtime. There, IRET returns from user mode to user mode. > > This bug is that stack accesses from IRET and RETF, as well as accesses > to the parameters in a call gate, are normal data accesses using the > current CPL. This manifested itself as a page fault in the guest Linux > kernel due to SMAP preventing the access. > > This bug appears to have been in QEMU since the beginning. > > Analyzed-by: Robert R. Henry<rrh.henry@gmail.com> > Co-developed-by: Robert R. Henry<rrh.henry@gmail.com> > Signed-off-by: Robert R. Henry<rrh.henry@gmail.com> > Signed-off-by: Paolo Bonzini<pbonzini@redhat.com> > --- > target/i386/tcg/seg_helper.c | 18 +++++++++--------- > 1 file changed, 9 insertions(+), 9 deletions(-) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r~ ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 03/10] target/i386/tcg: use PUSHL/PUSHW for error code 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini 2024-07-10 6:29 ` [PATCH 01/10] target/i386/tcg: Remove SEG_ADDL Paolo Bonzini 2024-07-10 6:29 ` [PATCH 02/10] target/i386/tcg: Allow IRET from user mode to user mode with SMAP Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 15:24 ` Richard Henderson 2024-07-10 6:29 ` [PATCH 04/10] target/i386/tcg: Reorg push/pop within seg_helper.c Paolo Bonzini ` (7 subsequent siblings) 10 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson Do not pre-decrement esp, let the macros subtract the appropriate operand size. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 4977a5f5b3a..0653bc10936 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -670,22 +670,20 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, } shift = switch_tss(env, intno * 8, e1, e2, SWITCH_TSS_CALL, old_eip); if (has_error_code) { - uint32_t mask; - /* push the error code */ if (env->segs[R_SS].flags & DESC_B_MASK) { - mask = 0xffffffff; + sp_mask = 0xffffffff; } else { - mask = 0xffff; + sp_mask = 0xffff; } - esp = (env->regs[R_ESP] - (2 << shift)) & mask; - ssp = env->segs[R_SS].base + esp; + esp = env->regs[R_ESP]; + ssp = env->segs[R_SS].base; if (shift) { - cpu_stl_kernel(env, ssp, error_code); + PUSHL(ssp, esp, sp_mask, error_code); } else { - cpu_stw_kernel(env, ssp, error_code); + PUSHW(ssp, esp, sp_mask, error_code); } - SET_ESP(esp, mask); + SET_ESP(esp, sp_mask); } return; } -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 03/10] target/i386/tcg: use PUSHL/PUSHW for error code 2024-07-10 6:29 ` [PATCH 03/10] target/i386/tcg: use PUSHL/PUSHW for error code Paolo Bonzini @ 2024-07-10 15:24 ` Richard Henderson 0 siblings, 0 replies; 27+ messages in thread From: Richard Henderson @ 2024-07-10 15:24 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry On 7/9/24 23:29, Paolo Bonzini wrote: > Do not pre-decrement esp, let the macros subtract the appropriate > operand size. > > Signed-off-by: Paolo Bonzini<pbonzini@redhat.com> > --- > target/i386/tcg/seg_helper.c | 16 +++++++--------- > 1 file changed, 7 insertions(+), 9 deletions(-) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r~ ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 04/10] target/i386/tcg: Reorg push/pop within seg_helper.c 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini ` (2 preceding siblings ...) 2024-07-10 6:29 ` [PATCH 03/10] target/i386/tcg: use PUSHL/PUSHW for error code Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 6:29 ` [PATCH 05/10] target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl Paolo Bonzini ` (6 subsequent siblings) 10 siblings, 0 replies; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson From: Richard Henderson <richard.henderson@linaro.org> Interrupts and call gates should use accesses with the DPL as the privilege level. While computing the applicable MMU index is easy, the harder thing is how to plumb it in the code. One possibility could be to add a single argument to the PUSH* macros for the privilege level, but this is repetitive and risks confusion between the involved privilege levels. Another possibility is to pass both CPL and DPL, and adjusting both PUSH* and POP* to use specific privilege levels (instead of using cpu_{ld,st}*_data). This makes the code more symmetric. However, a more complicated but much nicer approach is to use a structure to contain the stack parameters, env, unwind return address, and rewrite the macros into functions. The struct provides an easy home for the MMU index as well. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Link: https://lore.kernel.org/r/20240617161210.4639-4-richard.henderson@linaro.org Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 439 +++++++++++++++++++---------------- 1 file changed, 238 insertions(+), 201 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 0653bc10936..6b3de7a2be4 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -579,35 +579,47 @@ int exception_has_error_code(int intno) } while (0) #endif -/* XXX: add a is_user flag to have proper security support */ -#define PUSHW_RA(ssp, sp, sp_mask, val, ra) \ - { \ - sp -= 2; \ - cpu_stw_kernel_ra(env, (ssp) + (sp & (sp_mask)), (val), ra); \ - } +/* XXX: use mmu_index to have proper DPL support */ +typedef struct StackAccess +{ + CPUX86State *env; + uintptr_t ra; + target_ulong ss_base; + target_ulong sp; + target_ulong sp_mask; +} StackAccess; -#define PUSHL_RA(ssp, sp, sp_mask, val, ra) \ - { \ - sp -= 4; \ - cpu_stl_kernel_ra(env, (ssp) + (sp & (sp_mask)), (val), ra); \ - } +static void pushw(StackAccess *sa, uint16_t val) +{ + sa->sp -= 2; + cpu_stw_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask), + val, sa->ra); +} -#define POPW_RA(ssp, sp, sp_mask, val, ra) \ - { \ - val = cpu_lduw_data_ra(env, (ssp) + (sp & (sp_mask)), ra); \ - sp += 2; \ - } +static void pushl(StackAccess *sa, uint32_t val) +{ + sa->sp -= 4; + cpu_stl_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask), + val, sa->ra); +} -#define POPL_RA(ssp, sp, sp_mask, val, ra) \ - { \ - val = (uint32_t)cpu_ldl_data_ra(env, (ssp) + (sp & (sp_mask)), ra); \ - sp += 4; \ - } +static uint16_t popw(StackAccess *sa) +{ + uint16_t ret = cpu_lduw_data_ra(sa->env, + sa->ss_base + (sa->sp & sa->sp_mask), + sa->ra); + sa->sp += 2; + return ret; +} -#define PUSHW(ssp, sp, sp_mask, val) PUSHW_RA(ssp, sp, sp_mask, val, 0) -#define PUSHL(ssp, sp, sp_mask, val) PUSHL_RA(ssp, sp, sp_mask, val, 0) -#define POPW(ssp, sp, sp_mask, val) POPW_RA(ssp, sp, sp_mask, val, 0) -#define POPL(ssp, sp, sp_mask, val) POPL_RA(ssp, sp, sp_mask, val, 0) +static uint32_t popl(StackAccess *sa) +{ + uint32_t ret = cpu_ldl_data_ra(sa->env, + sa->ss_base + (sa->sp & sa->sp_mask), + sa->ra); + sa->sp += 4; + return ret; +} /* protected mode interrupt */ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, @@ -615,12 +627,13 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, int is_hw) { SegmentCache *dt; - target_ulong ptr, ssp; + target_ulong ptr; int type, dpl, selector, ss_dpl, cpl; int has_error_code, new_stack, shift; - uint32_t e1, e2, offset, ss = 0, esp, ss_e1 = 0, ss_e2 = 0; - uint32_t old_eip, sp_mask, eflags; + uint32_t e1, e2, offset, ss = 0, ss_e1 = 0, ss_e2 = 0; + uint32_t old_eip, eflags; int vm86 = env->eflags & VM_MASK; + StackAccess sa; bool set_rf; has_error_code = 0; @@ -662,6 +675,9 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, raise_exception_err(env, EXCP0D_GPF, intno * 8 + 2); } + sa.env = env; + sa.ra = 0; + if (type == 5) { /* task gate */ /* must do that check here to return the correct error code */ @@ -672,18 +688,18 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, if (has_error_code) { /* push the error code */ if (env->segs[R_SS].flags & DESC_B_MASK) { - sp_mask = 0xffffffff; + sa.sp_mask = 0xffffffff; } else { - sp_mask = 0xffff; + sa.sp_mask = 0xffff; } - esp = env->regs[R_ESP]; - ssp = env->segs[R_SS].base; + sa.sp = env->regs[R_ESP]; + sa.ss_base = env->segs[R_SS].base; if (shift) { - PUSHL(ssp, esp, sp_mask, error_code); + pushl(&sa, error_code); } else { - PUSHW(ssp, esp, sp_mask, error_code); + pushw(&sa, error_code); } - SET_ESP(esp, sp_mask); + SET_ESP(sa.sp, sa.sp_mask); } return; } @@ -717,6 +733,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, } if (dpl < cpl) { /* to inner privilege */ + uint32_t esp; get_ss_esp_from_tss(env, &ss, &esp, dpl, 0); if ((ss & 0xfffc) == 0) { raise_exception_err(env, EXCP0A_TSS, ss & 0xfffc); @@ -740,17 +757,18 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, raise_exception_err(env, EXCP0A_TSS, ss & 0xfffc); } new_stack = 1; - sp_mask = get_sp_mask(ss_e2); - ssp = get_seg_base(ss_e1, ss_e2); + sa.sp = esp; + sa.sp_mask = get_sp_mask(ss_e2); + sa.ss_base = get_seg_base(ss_e1, ss_e2); } else { /* to same privilege */ if (vm86) { raise_exception_err(env, EXCP0D_GPF, selector & 0xfffc); } new_stack = 0; - sp_mask = get_sp_mask(env->segs[R_SS].flags); - ssp = env->segs[R_SS].base; - esp = env->regs[R_ESP]; + sa.sp = env->regs[R_ESP]; + sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); + sa.ss_base = env->segs[R_SS].base; } shift = type >> 3; @@ -775,36 +793,36 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, if (shift == 1) { if (new_stack) { if (vm86) { - PUSHL(ssp, esp, sp_mask, env->segs[R_GS].selector); - PUSHL(ssp, esp, sp_mask, env->segs[R_FS].selector); - PUSHL(ssp, esp, sp_mask, env->segs[R_DS].selector); - PUSHL(ssp, esp, sp_mask, env->segs[R_ES].selector); + pushl(&sa, env->segs[R_GS].selector); + pushl(&sa, env->segs[R_FS].selector); + pushl(&sa, env->segs[R_DS].selector); + pushl(&sa, env->segs[R_ES].selector); } - PUSHL(ssp, esp, sp_mask, env->segs[R_SS].selector); - PUSHL(ssp, esp, sp_mask, env->regs[R_ESP]); + pushl(&sa, env->segs[R_SS].selector); + pushl(&sa, env->regs[R_ESP]); } - PUSHL(ssp, esp, sp_mask, eflags); - PUSHL(ssp, esp, sp_mask, env->segs[R_CS].selector); - PUSHL(ssp, esp, sp_mask, old_eip); + pushl(&sa, eflags); + pushl(&sa, env->segs[R_CS].selector); + pushl(&sa, old_eip); if (has_error_code) { - PUSHL(ssp, esp, sp_mask, error_code); + pushl(&sa, error_code); } } else { if (new_stack) { if (vm86) { - PUSHW(ssp, esp, sp_mask, env->segs[R_GS].selector); - PUSHW(ssp, esp, sp_mask, env->segs[R_FS].selector); - PUSHW(ssp, esp, sp_mask, env->segs[R_DS].selector); - PUSHW(ssp, esp, sp_mask, env->segs[R_ES].selector); + pushw(&sa, env->segs[R_GS].selector); + pushw(&sa, env->segs[R_FS].selector); + pushw(&sa, env->segs[R_DS].selector); + pushw(&sa, env->segs[R_ES].selector); } - PUSHW(ssp, esp, sp_mask, env->segs[R_SS].selector); - PUSHW(ssp, esp, sp_mask, env->regs[R_ESP]); + pushw(&sa, env->segs[R_SS].selector); + pushw(&sa, env->regs[R_ESP]); } - PUSHW(ssp, esp, sp_mask, eflags); - PUSHW(ssp, esp, sp_mask, env->segs[R_CS].selector); - PUSHW(ssp, esp, sp_mask, old_eip); + pushw(&sa, eflags); + pushw(&sa, env->segs[R_CS].selector); + pushw(&sa, old_eip); if (has_error_code) { - PUSHW(ssp, esp, sp_mask, error_code); + pushw(&sa, error_code); } } @@ -822,10 +840,10 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0, 0); } ss = (ss & ~3) | dpl; - cpu_x86_load_seg_cache(env, R_SS, ss, - ssp, get_seg_limit(ss_e1, ss_e2), ss_e2); + cpu_x86_load_seg_cache(env, R_SS, ss, sa.ss_base, + get_seg_limit(ss_e1, ss_e2), ss_e2); } - SET_ESP(esp, sp_mask); + SET_ESP(sa.sp, sa.sp_mask); selector = (selector & ~3) | dpl; cpu_x86_load_seg_cache(env, R_CS, selector, @@ -837,20 +855,18 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, #ifdef TARGET_X86_64 -#define PUSHQ_RA(sp, val, ra) \ - { \ - sp -= 8; \ - cpu_stq_kernel_ra(env, sp, (val), ra); \ - } +static void pushq(StackAccess *sa, uint64_t val) +{ + sa->sp -= 8; + cpu_stq_kernel_ra(sa->env, sa->sp, val, sa->ra); +} -#define POPQ_RA(sp, val, ra) \ - { \ - val = cpu_ldq_data_ra(env, sp, ra); \ - sp += 8; \ - } - -#define PUSHQ(sp, val) PUSHQ_RA(sp, val, 0) -#define POPQ(sp, val) POPQ_RA(sp, val, 0) +static uint64_t popq(StackAccess *sa) +{ + uint64_t ret = cpu_ldq_data_ra(sa->env, sa->sp, sa->ra); + sa->sp += 8; + return ret; +} static inline target_ulong get_rsp_from_tss(CPUX86State *env, int level) { @@ -893,8 +909,9 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int, int type, dpl, selector, cpl, ist; int has_error_code, new_stack; uint32_t e1, e2, e3, ss, eflags; - target_ulong old_eip, esp, offset; + target_ulong old_eip, offset; bool set_rf; + StackAccess sa; has_error_code = 0; if (!is_int && !is_hw) { @@ -962,10 +979,15 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int, if (e2 & DESC_C_MASK) { dpl = cpl; } + + sa.env = env; + sa.ra = 0; + sa.sp_mask = -1; + sa.ss_base = 0; if (dpl < cpl || ist != 0) { /* to inner privilege */ new_stack = 1; - esp = get_rsp_from_tss(env, ist != 0 ? ist + 3 : dpl); + sa.sp = get_rsp_from_tss(env, ist != 0 ? ist + 3 : dpl); ss = 0; } else { /* to same privilege */ @@ -973,9 +995,9 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int, raise_exception_err(env, EXCP0D_GPF, selector & 0xfffc); } new_stack = 0; - esp = env->regs[R_ESP]; + sa.sp = env->regs[R_ESP]; } - esp &= ~0xfLL; /* align stack */ + sa.sp &= ~0xfLL; /* align stack */ /* See do_interrupt_protected. */ eflags = cpu_compute_eflags(env); @@ -983,13 +1005,13 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int, eflags |= RF_MASK; } - PUSHQ(esp, env->segs[R_SS].selector); - PUSHQ(esp, env->regs[R_ESP]); - PUSHQ(esp, eflags); - PUSHQ(esp, env->segs[R_CS].selector); - PUSHQ(esp, old_eip); + pushq(&sa, env->segs[R_SS].selector); + pushq(&sa, env->regs[R_ESP]); + pushq(&sa, eflags); + pushq(&sa, env->segs[R_CS].selector); + pushq(&sa, old_eip); if (has_error_code) { - PUSHQ(esp, error_code); + pushq(&sa, error_code); } /* interrupt gate clear IF mask */ @@ -1002,7 +1024,7 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int, ss = 0 | dpl; cpu_x86_load_seg_cache(env, R_SS, ss, 0, 0, dpl << DESC_DPL_SHIFT); } - env->regs[R_ESP] = esp; + env->regs[R_ESP] = sa.sp; selector = (selector & ~3) | dpl; cpu_x86_load_seg_cache(env, R_CS, selector, @@ -1074,10 +1096,11 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int, int error_code, unsigned int next_eip) { SegmentCache *dt; - target_ulong ptr, ssp; + target_ulong ptr; int selector; - uint32_t offset, esp; + uint32_t offset; uint32_t old_cs, old_eip; + StackAccess sa; /* real mode (simpler!) */ dt = &env->idt; @@ -1087,8 +1110,13 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int, ptr = dt->base + intno * 4; offset = cpu_lduw_kernel(env, ptr); selector = cpu_lduw_kernel(env, ptr + 2); - esp = env->regs[R_ESP]; - ssp = env->segs[R_SS].base; + + sa.env = env; + sa.ra = 0; + sa.sp = env->regs[R_ESP]; + sa.sp_mask = 0xffff; + sa.ss_base = env->segs[R_SS].base; + if (is_int) { old_eip = next_eip; } else { @@ -1096,12 +1124,12 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int, } old_cs = env->segs[R_CS].selector; /* XXX: use SS segment size? */ - PUSHW(ssp, esp, 0xffff, cpu_compute_eflags(env)); - PUSHW(ssp, esp, 0xffff, old_cs); - PUSHW(ssp, esp, 0xffff, old_eip); + pushw(&sa, cpu_compute_eflags(env)); + pushw(&sa, old_cs); + pushw(&sa, old_eip); /* update processor state */ - env->regs[R_ESP] = (env->regs[R_ESP] & ~0xffff) | (esp & 0xffff); + SET_ESP(sa.sp, sa.sp_mask); env->eip = offset; env->segs[R_CS].selector = selector; env->segs[R_CS].base = (selector << 4); @@ -1544,21 +1572,23 @@ void helper_ljmp_protected(CPUX86State *env, int new_cs, target_ulong new_eip, void helper_lcall_real(CPUX86State *env, uint32_t new_cs, uint32_t new_eip, int shift, uint32_t next_eip) { - uint32_t esp, esp_mask; - target_ulong ssp; + StackAccess sa; + + sa.env = env; + sa.ra = GETPC(); + sa.sp = env->regs[R_ESP]; + sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); + sa.ss_base = env->segs[R_SS].base; - esp = env->regs[R_ESP]; - esp_mask = get_sp_mask(env->segs[R_SS].flags); - ssp = env->segs[R_SS].base; if (shift) { - PUSHL_RA(ssp, esp, esp_mask, env->segs[R_CS].selector, GETPC()); - PUSHL_RA(ssp, esp, esp_mask, next_eip, GETPC()); + pushl(&sa, env->segs[R_CS].selector); + pushl(&sa, next_eip); } else { - PUSHW_RA(ssp, esp, esp_mask, env->segs[R_CS].selector, GETPC()); - PUSHW_RA(ssp, esp, esp_mask, next_eip, GETPC()); + pushw(&sa, env->segs[R_CS].selector); + pushw(&sa, next_eip); } - SET_ESP(esp, esp_mask); + SET_ESP(sa.sp, sa.sp_mask); env->eip = new_eip; env->segs[R_CS].selector = new_cs; env->segs[R_CS].base = (new_cs << 4); @@ -1570,9 +1600,10 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, { int new_stack, i; uint32_t e1, e2, cpl, dpl, rpl, selector, param_count; - uint32_t ss = 0, ss_e1 = 0, ss_e2 = 0, type, ss_dpl, sp_mask; + uint32_t ss = 0, ss_e1 = 0, ss_e2 = 0, type, ss_dpl; uint32_t val, limit, old_sp_mask; - target_ulong ssp, old_ssp, offset, sp; + target_ulong old_ssp, offset; + StackAccess sa; LOG_PCALL("lcall %04x:" TARGET_FMT_lx " s=%d\n", new_cs, new_eip, shift); LOG_PCALL_STATE(env_cpu(env)); @@ -1584,6 +1615,10 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, } cpl = env->hflags & HF_CPL_MASK; LOG_PCALL("desc=%08x:%08x\n", e1, e2); + + sa.env = env; + sa.ra = GETPC(); + if (e2 & DESC_S_MASK) { if (!(e2 & DESC_CS_MASK)) { raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC()); @@ -1611,14 +1646,14 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, #ifdef TARGET_X86_64 /* XXX: check 16/32 bit cases in long mode */ if (shift == 2) { - target_ulong rsp; - /* 64 bit case */ - rsp = env->regs[R_ESP]; - PUSHQ_RA(rsp, env->segs[R_CS].selector, GETPC()); - PUSHQ_RA(rsp, next_eip, GETPC()); + sa.sp = env->regs[R_ESP]; + sa.sp_mask = -1; + sa.ss_base = 0; + pushq(&sa, env->segs[R_CS].selector); + pushq(&sa, next_eip); /* from this point, not restartable */ - env->regs[R_ESP] = rsp; + env->regs[R_ESP] = sa.sp; cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl, get_seg_base(e1, e2), get_seg_limit(e1, e2), e2); @@ -1626,15 +1661,15 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, } else #endif { - sp = env->regs[R_ESP]; - sp_mask = get_sp_mask(env->segs[R_SS].flags); - ssp = env->segs[R_SS].base; + sa.sp = env->regs[R_ESP]; + sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); + sa.ss_base = env->segs[R_SS].base; if (shift) { - PUSHL_RA(ssp, sp, sp_mask, env->segs[R_CS].selector, GETPC()); - PUSHL_RA(ssp, sp, sp_mask, next_eip, GETPC()); + pushl(&sa, env->segs[R_CS].selector); + pushl(&sa, next_eip); } else { - PUSHW_RA(ssp, sp, sp_mask, env->segs[R_CS].selector, GETPC()); - PUSHW_RA(ssp, sp, sp_mask, next_eip, GETPC()); + pushw(&sa, env->segs[R_CS].selector); + pushw(&sa, next_eip); } limit = get_seg_limit(e1, e2); @@ -1642,7 +1677,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC()); } /* from this point, not restartable */ - SET_ESP(sp, sp_mask); + SET_ESP(sa.sp, sa.sp_mask); cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl, get_seg_base(e1, e2), limit, e2); env->eip = new_eip; @@ -1737,13 +1772,13 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, /* to inner privilege */ #ifdef TARGET_X86_64 if (shift == 2) { - sp = get_rsp_from_tss(env, dpl); ss = dpl; /* SS = NULL selector with RPL = new CPL */ new_stack = 1; - sp_mask = 0; - ssp = 0; /* SS base is always zero in IA-32e mode */ + sa.sp = get_rsp_from_tss(env, dpl); + sa.sp_mask = -1; + sa.ss_base = 0; /* SS base is always zero in IA-32e mode */ LOG_PCALL("new ss:rsp=%04x:%016llx env->regs[R_ESP]=" - TARGET_FMT_lx "\n", ss, sp, env->regs[R_ESP]); + TARGET_FMT_lx "\n", ss, sa.sp, env->regs[R_ESP]); } else #endif { @@ -1752,7 +1787,6 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, LOG_PCALL("new ss:esp=%04x:%08x param_count=%d env->regs[R_ESP]=" TARGET_FMT_lx "\n", ss, sp32, param_count, env->regs[R_ESP]); - sp = sp32; if ((ss & 0xfffc) == 0) { raise_exception_err_ra(env, EXCP0A_TSS, ss & 0xfffc, GETPC()); } @@ -1775,63 +1809,64 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, raise_exception_err_ra(env, EXCP0A_TSS, ss & 0xfffc, GETPC()); } - sp_mask = get_sp_mask(ss_e2); - ssp = get_seg_base(ss_e1, ss_e2); + sa.sp = sp32; + sa.sp_mask = get_sp_mask(ss_e2); + sa.ss_base = get_seg_base(ss_e1, ss_e2); } /* push_size = ((param_count * 2) + 8) << shift; */ - old_sp_mask = get_sp_mask(env->segs[R_SS].flags); old_ssp = env->segs[R_SS].base; + #ifdef TARGET_X86_64 if (shift == 2) { /* XXX: verify if new stack address is canonical */ - PUSHQ_RA(sp, env->segs[R_SS].selector, GETPC()); - PUSHQ_RA(sp, env->regs[R_ESP], GETPC()); + pushq(&sa, env->segs[R_SS].selector); + pushq(&sa, env->regs[R_ESP]); /* parameters aren't supported for 64-bit call gates */ } else #endif if (shift == 1) { - PUSHL_RA(ssp, sp, sp_mask, env->segs[R_SS].selector, GETPC()); - PUSHL_RA(ssp, sp, sp_mask, env->regs[R_ESP], GETPC()); + pushl(&sa, env->segs[R_SS].selector); + pushl(&sa, env->regs[R_ESP]); for (i = param_count - 1; i >= 0; i--) { val = cpu_ldl_data_ra(env, - old_ssp + ((env->regs[R_ESP] + i * 4) & old_sp_mask), - GETPC()); - PUSHL_RA(ssp, sp, sp_mask, val, GETPC()); + old_ssp + ((env->regs[R_ESP] + i * 4) & old_sp_mask), + GETPC()); + pushl(&sa, val); } } else { - PUSHW_RA(ssp, sp, sp_mask, env->segs[R_SS].selector, GETPC()); - PUSHW_RA(ssp, sp, sp_mask, env->regs[R_ESP], GETPC()); + pushw(&sa, env->segs[R_SS].selector); + pushw(&sa, env->regs[R_ESP]); for (i = param_count - 1; i >= 0; i--) { val = cpu_lduw_data_ra(env, - old_ssp + ((env->regs[R_ESP] + i * 2) & old_sp_mask), - GETPC()); - PUSHW_RA(ssp, sp, sp_mask, val, GETPC()); + old_ssp + ((env->regs[R_ESP] + i * 2) & old_sp_mask), + GETPC()); + pushw(&sa, val); } } new_stack = 1; } else { /* to same privilege */ - sp = env->regs[R_ESP]; - sp_mask = get_sp_mask(env->segs[R_SS].flags); - ssp = env->segs[R_SS].base; + sa.sp = env->regs[R_ESP]; + sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); + sa.ss_base = env->segs[R_SS].base; /* push_size = (4 << shift); */ new_stack = 0; } #ifdef TARGET_X86_64 if (shift == 2) { - PUSHQ_RA(sp, env->segs[R_CS].selector, GETPC()); - PUSHQ_RA(sp, next_eip, GETPC()); + pushq(&sa, env->segs[R_CS].selector); + pushq(&sa, next_eip); } else #endif if (shift == 1) { - PUSHL_RA(ssp, sp, sp_mask, env->segs[R_CS].selector, GETPC()); - PUSHL_RA(ssp, sp, sp_mask, next_eip, GETPC()); + pushl(&sa, env->segs[R_CS].selector); + pushl(&sa, next_eip); } else { - PUSHW_RA(ssp, sp, sp_mask, env->segs[R_CS].selector, GETPC()); - PUSHW_RA(ssp, sp, sp_mask, next_eip, GETPC()); + pushw(&sa, env->segs[R_CS].selector); + pushw(&sa, next_eip); } /* from this point, not restartable */ @@ -1845,7 +1880,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, { ss = (ss & ~3) | dpl; cpu_x86_load_seg_cache(env, R_SS, ss, - ssp, + sa.ss_base, get_seg_limit(ss_e1, ss_e2), ss_e2); } @@ -1856,7 +1891,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, get_seg_base(e1, e2), get_seg_limit(e1, e2), e2); - SET_ESP(sp, sp_mask); + SET_ESP(sa.sp, sa.sp_mask); env->eip = offset; } } @@ -1864,26 +1899,28 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, /* real and vm86 mode iret */ void helper_iret_real(CPUX86State *env, int shift) { - uint32_t sp, new_cs, new_eip, new_eflags, sp_mask; - target_ulong ssp; + uint32_t new_cs, new_eip, new_eflags; int eflags_mask; + StackAccess sa; + + sa.env = env; + sa.ra = GETPC(); + sa.sp_mask = 0xffff; /* XXXX: use SS segment size? */ + sa.sp = env->regs[R_ESP]; + sa.ss_base = env->segs[R_SS].base; - sp_mask = 0xffff; /* XXXX: use SS segment size? */ - sp = env->regs[R_ESP]; - ssp = env->segs[R_SS].base; if (shift == 1) { /* 32 bits */ - POPL_RA(ssp, sp, sp_mask, new_eip, GETPC()); - POPL_RA(ssp, sp, sp_mask, new_cs, GETPC()); - new_cs &= 0xffff; - POPL_RA(ssp, sp, sp_mask, new_eflags, GETPC()); + new_eip = popl(&sa); + new_cs = popl(&sa) & 0xffff; + new_eflags = popl(&sa); } else { /* 16 bits */ - POPW_RA(ssp, sp, sp_mask, new_eip, GETPC()); - POPW_RA(ssp, sp, sp_mask, new_cs, GETPC()); - POPW_RA(ssp, sp, sp_mask, new_eflags, GETPC()); + new_eip = popw(&sa); + new_cs = popw(&sa); + new_eflags = popw(&sa); } - env->regs[R_ESP] = (env->regs[R_ESP] & ~sp_mask) | (sp & sp_mask); + SET_ESP(sa.sp, sa.sp_mask); env->segs[R_CS].selector = new_cs; env->segs[R_CS].base = (new_cs << 4); env->eip = new_eip; @@ -1936,47 +1973,49 @@ static inline void helper_ret_protected(CPUX86State *env, int shift, uint32_t new_es, new_ds, new_fs, new_gs; uint32_t e1, e2, ss_e1, ss_e2; int cpl, dpl, rpl, eflags_mask, iopl; - target_ulong ssp, sp, new_eip, new_esp, sp_mask; + target_ulong new_eip, new_esp; + StackAccess sa; + + sa.env = env; + sa.ra = retaddr; #ifdef TARGET_X86_64 if (shift == 2) { - sp_mask = -1; + sa.sp_mask = -1; } else #endif { - sp_mask = get_sp_mask(env->segs[R_SS].flags); + sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); } - sp = env->regs[R_ESP]; - ssp = env->segs[R_SS].base; + sa.sp = env->regs[R_ESP]; + sa.ss_base = env->segs[R_SS].base; new_eflags = 0; /* avoid warning */ #ifdef TARGET_X86_64 if (shift == 2) { - POPQ_RA(sp, new_eip, retaddr); - POPQ_RA(sp, new_cs, retaddr); - new_cs &= 0xffff; + new_eip = popq(&sa); + new_cs = popq(&sa) & 0xffff; if (is_iret) { - POPQ_RA(sp, new_eflags, retaddr); + new_eflags = popq(&sa); } } else #endif { if (shift == 1) { /* 32 bits */ - POPL_RA(ssp, sp, sp_mask, new_eip, retaddr); - POPL_RA(ssp, sp, sp_mask, new_cs, retaddr); - new_cs &= 0xffff; + new_eip = popl(&sa); + new_cs = popl(&sa) & 0xffff; if (is_iret) { - POPL_RA(ssp, sp, sp_mask, new_eflags, retaddr); + new_eflags = popl(&sa); if (new_eflags & VM_MASK) { goto return_to_vm86; } } } else { /* 16 bits */ - POPW_RA(ssp, sp, sp_mask, new_eip, retaddr); - POPW_RA(ssp, sp, sp_mask, new_cs, retaddr); + new_eip = popw(&sa); + new_cs = popw(&sa); if (is_iret) { - POPW_RA(ssp, sp, sp_mask, new_eflags, retaddr); + new_eflags = popw(&sa); } } } @@ -2012,7 +2051,7 @@ static inline void helper_ret_protected(CPUX86State *env, int shift, raise_exception_err_ra(env, EXCP0B_NOSEG, new_cs & 0xfffc, retaddr); } - sp += addend; + sa.sp += addend; if (rpl == cpl && (!(env->hflags & HF_CS64_MASK) || ((env->hflags & HF_CS64_MASK) && !is_iret))) { /* return to same privilege level */ @@ -2024,21 +2063,19 @@ static inline void helper_ret_protected(CPUX86State *env, int shift, /* return to different privilege level */ #ifdef TARGET_X86_64 if (shift == 2) { - POPQ_RA(sp, new_esp, retaddr); - POPQ_RA(sp, new_ss, retaddr); - new_ss &= 0xffff; + new_esp = popq(&sa); + new_ss = popq(&sa) & 0xffff; } else #endif { if (shift == 1) { /* 32 bits */ - POPL_RA(ssp, sp, sp_mask, new_esp, retaddr); - POPL_RA(ssp, sp, sp_mask, new_ss, retaddr); - new_ss &= 0xffff; + new_esp = popl(&sa); + new_ss = popl(&sa) & 0xffff; } else { /* 16 bits */ - POPW_RA(ssp, sp, sp_mask, new_esp, retaddr); - POPW_RA(ssp, sp, sp_mask, new_ss, retaddr); + new_esp = popw(&sa); + new_ss = popw(&sa); } } LOG_PCALL("new ss:esp=%04x:" TARGET_FMT_lx "\n", @@ -2088,14 +2125,14 @@ static inline void helper_ret_protected(CPUX86State *env, int shift, get_seg_base(e1, e2), get_seg_limit(e1, e2), e2); - sp = new_esp; + sa.sp = new_esp; #ifdef TARGET_X86_64 if (env->hflags & HF_CS64_MASK) { - sp_mask = -1; + sa.sp_mask = -1; } else #endif { - sp_mask = get_sp_mask(ss_e2); + sa.sp_mask = get_sp_mask(ss_e2); } /* validate data segments */ @@ -2104,9 +2141,9 @@ static inline void helper_ret_protected(CPUX86State *env, int shift, validate_seg(env, R_FS, rpl); validate_seg(env, R_GS, rpl); - sp += addend; + sa.sp += addend; } - SET_ESP(sp, sp_mask); + SET_ESP(sa.sp, sa.sp_mask); env->eip = new_eip; if (is_iret) { /* NOTE: 'cpl' is the _old_ CPL */ @@ -2126,12 +2163,12 @@ static inline void helper_ret_protected(CPUX86State *env, int shift, return; return_to_vm86: - POPL_RA(ssp, sp, sp_mask, new_esp, retaddr); - POPL_RA(ssp, sp, sp_mask, new_ss, retaddr); - POPL_RA(ssp, sp, sp_mask, new_es, retaddr); - POPL_RA(ssp, sp, sp_mask, new_ds, retaddr); - POPL_RA(ssp, sp, sp_mask, new_fs, retaddr); - POPL_RA(ssp, sp, sp_mask, new_gs, retaddr); + new_esp = popl(&sa); + new_ss = popl(&sa); + new_es = popl(&sa); + new_ds = popl(&sa); + new_fs = popl(&sa); + new_gs = popl(&sa); /* modify processor state */ cpu_load_eflags(env, new_eflags, TF_MASK | AC_MASK | ID_MASK | -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 05/10] target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini ` (3 preceding siblings ...) 2024-07-10 6:29 ` [PATCH 04/10] target/i386/tcg: Reorg push/pop within seg_helper.c Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 6:29 ` [PATCH 06/10] target/i386/tcg: Compute MMU index once Paolo Bonzini ` (5 subsequent siblings) 10 siblings, 0 replies; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson From: Richard Henderson <richard.henderson@linaro.org> Disconnect mmu index computation from the current pl as stored in env->hflags. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Link: https://lore.kernel.org/r/20240617161210.4639-2-richard.henderson@linaro.org Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/cpu.h | 11 ++--------- target/i386/cpu.c | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index c43ac01c794..1e121acef54 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2445,15 +2445,8 @@ static inline bool is_mmu_index_32(int mmu_index) return mmu_index & 1; } -static inline int cpu_mmu_index_kernel(CPUX86State *env) -{ - int mmu_index_32 = (env->hflags & HF_LMA_MASK) ? 0 : 1; - int mmu_index_base = - !(env->hflags & HF_SMAP_MASK) ? MMU_KNOSMAP64_IDX : - ((env->hflags & HF_CPL_MASK) < 3 && (env->eflags & AC_MASK)) ? MMU_KNOSMAP64_IDX : MMU_KSMAP64_IDX; - - return mmu_index_base + mmu_index_32; -} +int x86_mmu_index_pl(CPUX86State *env, unsigned pl); +int cpu_mmu_index_kernel(CPUX86State *env); #define CC_DST (env->cc_dst) #define CC_SRC (env->cc_src) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index c05765eeafc..4688d140c2d 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -8122,18 +8122,39 @@ static bool x86_cpu_has_work(CPUState *cs) return x86_cpu_pending_interrupt(cs, cs->interrupt_request) != 0; } -static int x86_cpu_mmu_index(CPUState *cs, bool ifetch) +int x86_mmu_index_pl(CPUX86State *env, unsigned pl) { - CPUX86State *env = cpu_env(cs); int mmu_index_32 = (env->hflags & HF_CS64_MASK) ? 0 : 1; int mmu_index_base = - (env->hflags & HF_CPL_MASK) == 3 ? MMU_USER64_IDX : + pl == 3 ? MMU_USER64_IDX : !(env->hflags & HF_SMAP_MASK) ? MMU_KNOSMAP64_IDX : (env->eflags & AC_MASK) ? MMU_KNOSMAP64_IDX : MMU_KSMAP64_IDX; return mmu_index_base + mmu_index_32; } +static int x86_cpu_mmu_index(CPUState *cs, bool ifetch) +{ + CPUX86State *env = cpu_env(cs); + return x86_mmu_index_pl(env, env->hflags & HF_CPL_MASK); +} + +static int x86_mmu_index_kernel_pl(CPUX86State *env, unsigned pl) +{ + int mmu_index_32 = (env->hflags & HF_LMA_MASK) ? 0 : 1; + int mmu_index_base = + !(env->hflags & HF_SMAP_MASK) ? MMU_KNOSMAP64_IDX : + (pl < 3 && (env->eflags & AC_MASK) + ? MMU_KNOSMAP64_IDX : MMU_KSMAP64_IDX); + + return mmu_index_base + mmu_index_32; +} + +int cpu_mmu_index_kernel(CPUX86State *env) +{ + return x86_mmu_index_kernel_pl(env, env->hflags & HF_CPL_MASK); +} + static void x86_disas_set_info(CPUState *cs, disassemble_info *info) { X86CPU *cpu = X86_CPU(cs); -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 06/10] target/i386/tcg: Compute MMU index once 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini ` (4 preceding siblings ...) 2024-07-10 6:29 ` [PATCH 05/10] target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 15:55 ` Richard Henderson 2024-07-10 6:29 ` [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates Paolo Bonzini ` (4 subsequent siblings) 10 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson Add the MMU index to the StackAccess struct, so that it can be cached or (in the next patch) computed from information that is not in CPUX86State. Co-developed-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 6b3de7a2be4..07e3667639a 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -587,36 +587,37 @@ typedef struct StackAccess target_ulong ss_base; target_ulong sp; target_ulong sp_mask; + int mmu_index; } StackAccess; static void pushw(StackAccess *sa, uint16_t val) { sa->sp -= 2; - cpu_stw_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask), - val, sa->ra); + cpu_stw_mmuidx_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask), + val, sa->mmu_index, sa->ra); } static void pushl(StackAccess *sa, uint32_t val) { sa->sp -= 4; - cpu_stl_kernel_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask), - val, sa->ra); + cpu_stl_mmuidx_ra(sa->env, sa->ss_base + (sa->sp & sa->sp_mask), + val, sa->mmu_index, sa->ra); } static uint16_t popw(StackAccess *sa) { - uint16_t ret = cpu_lduw_data_ra(sa->env, - sa->ss_base + (sa->sp & sa->sp_mask), - sa->ra); + uint16_t ret = cpu_lduw_mmuidx_ra(sa->env, + sa->ss_base + (sa->sp & sa->sp_mask), + sa->mmu_index, sa->ra); sa->sp += 2; return ret; } static uint32_t popl(StackAccess *sa) { - uint32_t ret = cpu_ldl_data_ra(sa->env, - sa->ss_base + (sa->sp & sa->sp_mask), - sa->ra); + uint32_t ret = cpu_ldl_mmuidx_ra(sa->env, + sa->ss_base + (sa->sp & sa->sp_mask), + sa->mmu_index, sa->ra); sa->sp += 4; return ret; } @@ -677,6 +678,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, sa.env = env; sa.ra = 0; + sa.mmu_index = cpu_mmu_index_kernel(env); if (type == 5) { /* task gate */ @@ -858,12 +860,12 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, static void pushq(StackAccess *sa, uint64_t val) { sa->sp -= 8; - cpu_stq_kernel_ra(sa->env, sa->sp, val, sa->ra); + cpu_stq_mmuidx_ra(sa->env, sa->sp, val, sa->mmu_index, sa->ra); } static uint64_t popq(StackAccess *sa) { - uint64_t ret = cpu_ldq_data_ra(sa->env, sa->sp, sa->ra); + uint64_t ret = cpu_ldq_mmuidx_ra(sa->env, sa->sp, sa->mmu_index, sa->ra); sa->sp += 8; return ret; } @@ -982,6 +984,7 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int, sa.env = env; sa.ra = 0; + sa.mmu_index = cpu_mmu_index_kernel(env); sa.sp_mask = -1; sa.ss_base = 0; if (dpl < cpl || ist != 0) { @@ -1116,6 +1119,7 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int, sa.sp = env->regs[R_ESP]; sa.sp_mask = 0xffff; sa.ss_base = env->segs[R_SS].base; + sa.mmu_index = cpu_mmu_index_kernel(env); if (is_int) { old_eip = next_eip; @@ -1579,6 +1583,7 @@ void helper_lcall_real(CPUX86State *env, uint32_t new_cs, uint32_t new_eip, sa.sp = env->regs[R_ESP]; sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); sa.ss_base = env->segs[R_SS].base; + sa.mmu_index = cpu_mmu_index_kernel(env); if (shift) { pushl(&sa, env->segs[R_CS].selector); @@ -1618,6 +1623,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, sa.env = env; sa.ra = GETPC(); + sa.mmu_index = cpu_mmu_index_kernel(env); if (e2 & DESC_S_MASK) { if (!(e2 & DESC_CS_MASK)) { @@ -1905,6 +1911,7 @@ void helper_iret_real(CPUX86State *env, int shift) sa.env = env; sa.ra = GETPC(); + sa.mmu_index = x86_mmu_index_pl(env, 0); sa.sp_mask = 0xffff; /* XXXX: use SS segment size? */ sa.sp = env->regs[R_ESP]; sa.ss_base = env->segs[R_SS].base; @@ -1976,8 +1983,11 @@ static inline void helper_ret_protected(CPUX86State *env, int shift, target_ulong new_eip, new_esp; StackAccess sa; + cpl = env->hflags & HF_CPL_MASK; + sa.env = env; sa.ra = retaddr; + sa.mmu_index = x86_mmu_index_pl(env, cpl); #ifdef TARGET_X86_64 if (shift == 2) { @@ -2032,7 +2042,6 @@ static inline void helper_ret_protected(CPUX86State *env, int shift, !(e2 & DESC_CS_MASK)) { raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, retaddr); } - cpl = env->hflags & HF_CPL_MASK; rpl = new_cs & 3; if (rpl < cpl) { raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, retaddr); -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 06/10] target/i386/tcg: Compute MMU index once 2024-07-10 6:29 ` [PATCH 06/10] target/i386/tcg: Compute MMU index once Paolo Bonzini @ 2024-07-10 15:55 ` Richard Henderson 0 siblings, 0 replies; 27+ messages in thread From: Richard Henderson @ 2024-07-10 15:55 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry On 7/9/24 23:29, Paolo Bonzini wrote: > Add the MMU index to the StackAccess struct, so that it can be cached > or (in the next patch) computed from information that is not in > CPUX86State. > > Co-developed-by: Richard Henderson <richard.henderson@linaro.org> > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r~ ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini ` (5 preceding siblings ...) 2024-07-10 6:29 ` [PATCH 06/10] target/i386/tcg: Compute MMU index once Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 15:57 ` Richard Henderson 2024-10-18 16:02 ` Michael Tokarev 2024-07-10 6:29 ` [PATCH 08/10] target/i386/tcg: check for correct busy state before switching to a new task Paolo Bonzini ` (3 subsequent siblings) 10 siblings, 2 replies; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson This fixes a bug wherein i386/tcg assumed an interrupt return using the CALL or JMP instructions were always going from kernel or user mode to kernel mode, when using a call gate. This assumption is violated if the call gate has a DPL that is greater than 0. In addition, the stack accesses should count as explicit, not implicit ("kernel" in QEMU code), so that SMAP is not applied if DPL=3. Analyzed-by: Robert R. Henry <rrh.henry@gmail.com> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/249 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 07e3667639a..1430f477c43 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -678,7 +678,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, sa.env = env; sa.ra = 0; - sa.mmu_index = cpu_mmu_index_kernel(env); + sa.mmu_index = x86_mmu_index_pl(env, dpl); if (type == 5) { /* task gate */ @@ -984,7 +984,7 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int, sa.env = env; sa.ra = 0; - sa.mmu_index = cpu_mmu_index_kernel(env); + sa.mmu_index = x86_mmu_index_pl(env, dpl); sa.sp_mask = -1; sa.ss_base = 0; if (dpl < cpl || ist != 0) { @@ -1119,7 +1119,7 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int, sa.sp = env->regs[R_ESP]; sa.sp_mask = 0xffff; sa.ss_base = env->segs[R_SS].base; - sa.mmu_index = cpu_mmu_index_kernel(env); + sa.mmu_index = x86_mmu_index_pl(env, 0); if (is_int) { old_eip = next_eip; @@ -1583,7 +1583,7 @@ void helper_lcall_real(CPUX86State *env, uint32_t new_cs, uint32_t new_eip, sa.sp = env->regs[R_ESP]; sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); sa.ss_base = env->segs[R_SS].base; - sa.mmu_index = cpu_mmu_index_kernel(env); + sa.mmu_index = x86_mmu_index_pl(env, 0); if (shift) { pushl(&sa, env->segs[R_CS].selector); @@ -1619,17 +1619,17 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC()); } cpl = env->hflags & HF_CPL_MASK; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; LOG_PCALL("desc=%08x:%08x\n", e1, e2); sa.env = env; sa.ra = GETPC(); - sa.mmu_index = cpu_mmu_index_kernel(env); + sa.mmu_index = x86_mmu_index_pl(env, dpl); if (e2 & DESC_S_MASK) { if (!(e2 & DESC_CS_MASK)) { raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC()); } - dpl = (e2 >> DESC_DPL_SHIFT) & 3; if (e2 & DESC_C_MASK) { /* conforming code segment */ if (dpl > cpl) { @@ -1691,7 +1691,6 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, } else { /* check gate type */ type = (e2 >> DESC_TYPE_SHIFT) & 0x1f; - dpl = (e2 >> DESC_DPL_SHIFT) & 3; rpl = new_cs & 3; #ifdef TARGET_X86_64 -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates 2024-07-10 6:29 ` [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates Paolo Bonzini @ 2024-07-10 15:57 ` Richard Henderson 2024-10-18 16:02 ` Michael Tokarev 1 sibling, 0 replies; 27+ messages in thread From: Richard Henderson @ 2024-07-10 15:57 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry On 7/9/24 23:29, Paolo Bonzini wrote: > This fixes a bug wherein i386/tcg assumed an interrupt return using > the CALL or JMP instructions were always going from kernel or user mode to > kernel mode, when using a call gate. This assumption is violated if > the call gate has a DPL that is greater than 0. > > In addition, the stack accesses should count as explicit, not implicit > ("kernel" in QEMU code), so that SMAP is not applied if DPL=3. > > Analyzed-by: Robert R. Henry<rrh.henry@gmail.com> > Resolves:https://gitlab.com/qemu-project/qemu/-/issues/249 > Signed-off-by: Paolo Bonzini<pbonzini@redhat.com> > --- > target/i386/tcg/seg_helper.c | 13 ++++++------- > 1 file changed, 6 insertions(+), 7 deletions(-) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r~ ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates 2024-07-10 6:29 ` [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates Paolo Bonzini 2024-07-10 15:57 ` Richard Henderson @ 2024-10-18 16:02 ` Michael Tokarev 2024-10-25 15:26 ` Michael Tokarev 1 sibling, 1 reply; 27+ messages in thread From: Michael Tokarev @ 2024-10-18 16:02 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry, richard.henderson 10.07.2024 09:29, Paolo Bonzini wrote: > This fixes a bug wherein i386/tcg assumed an interrupt return using > the CALL or JMP instructions were always going from kernel or user mode to > kernel mode, when using a call gate. This assumption is violated if > the call gate has a DPL that is greater than 0. > > In addition, the stack accesses should count as explicit, not implicit > ("kernel" in QEMU code), so that SMAP is not applied if DPL=3. > > Analyzed-by: Robert R. Henry <rrh.henry@gmail.com> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/249 > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> This sounds like qemu-stable material, is it not? It can be picked up for 9.1.x, but for 9.0 and before it needs a few other changes in this area, like v9.0.0-2238-g8053862af9 "target/i386/tcg: Compute MMU index once" and v9.0.0-2236-g059368bcf5 "target/i386/tcg: Reorg push/pop within seg_helper.c", or it needs a proper backport. What do you think? Thanks, /mjt ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates 2024-10-18 16:02 ` Michael Tokarev @ 2024-10-25 15:26 ` Michael Tokarev 2024-10-25 15:28 ` Paolo Bonzini 0 siblings, 1 reply; 27+ messages in thread From: Michael Tokarev @ 2024-10-25 15:26 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry, richard.henderson 18.10.2024 19:02, Michael Tokarev wrote: > 10.07.2024 09:29, Paolo Bonzini wrote: >> This fixes a bug wherein i386/tcg assumed an interrupt return using >> the CALL or JMP instructions were always going from kernel or user mode to >> kernel mode, when using a call gate. This assumption is violated if >> the call gate has a DPL that is greater than 0. >> >> In addition, the stack accesses should count as explicit, not implicit >> ("kernel" in QEMU code), so that SMAP is not applied if DPL=3. >> >> Analyzed-by: Robert R. Henry <rrh.henry@gmail.com> >> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/249 >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > > This sounds like qemu-stable material, is it not? > > It can be picked up for 9.1.x, but for 9.0 and before it needs a few > other changes in this area, like v9.0.0-2238-g8053862af9 "target/i386/tcg: > Compute MMU index once" and v9.0.0-2236-g059368bcf5 "target/i386/tcg: > Reorg push/pop within seg_helper.c", or it needs a proper backport. > > What do you think? A friendly ping/help? :) Or should I drop this from 9.1.x too? Thanks, /mjt ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates 2024-10-25 15:26 ` Michael Tokarev @ 2024-10-25 15:28 ` Paolo Bonzini 2024-10-25 15:31 ` Michael Tokarev 0 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-10-25 15:28 UTC (permalink / raw) To: Michael Tokarev; +Cc: qemu-devel, rrh.henry, richard.henderson On Fri, Oct 25, 2024 at 5:26 PM Michael Tokarev <mjt@tls.msk.ru> wrote: > > It can be picked up for 9.1.x, but for 9.0 and before it needs a few > > other changes in this area, like v9.0.0-2238-g8053862af9 "target/i386/tcg: > > Compute MMU index once" and v9.0.0-2236-g059368bcf5 "target/i386/tcg: > > Reorg push/pop within seg_helper.c", or it needs a proper backport. > > > > What do you think? > > A friendly ping/help? :) Hi! No, this is totally ok for 9.1.x; it missed 9.1 but it was already submitted back then and it's okay to apply it there. On the other hand, Richard wrote some large cleanup patches to enable this relatively small patch. The bug has been there for many years, we can keep it in 9.0.x and earlier. Thanks, Paolo ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates 2024-10-25 15:28 ` Paolo Bonzini @ 2024-10-25 15:31 ` Michael Tokarev 0 siblings, 0 replies; 27+ messages in thread From: Michael Tokarev @ 2024-10-25 15:31 UTC (permalink / raw) To: Paolo Bonzini; +Cc: qemu-devel, rrh.henry, richard.henderson, qemu-stable 25.10.2024 18:28, Paolo Bonzini wrote: > Hi! No, this is totally ok for 9.1.x; it missed 9.1 but it was already > submitted back then and it's okay to apply it there. > > On the other hand, Richard wrote some large cleanup patches to enable > this relatively small patch. The bug has been there for many years, we > can keep it in 9.0.x and earlier. Aha. That makes good sense. Thank you for the follow-up and for clearing my confusion :) /mjt ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 08/10] target/i386/tcg: check for correct busy state before switching to a new task 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini ` (6 preceding siblings ...) 2024-07-10 6:29 ` [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 15:58 ` Richard Henderson 2024-07-10 6:29 ` [PATCH 09/10] target/i386/tcg: use X86Access for TSS access Paolo Bonzini ` (2 subsequent siblings) 10 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson This step is listed in the Intel manual: "Checks that the new task is available (call, jump, exception, or interrupt) or busy (IRET return)". The AMD manual lists the same operation under the "Preventing recursion" paragraph of "12.3.4 Nesting Tasks", though it is not clear if the processor checks the busy bit in the IRET case. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 1430f477c43..25af9d4a4ec 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -306,6 +306,11 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, old_tss_limit_max = 43; } + /* new TSS must be busy iff the source is an IRET instruction */ + if (!!(e2 & DESC_TSS_BUSY_MASK) != (source == SWITCH_TSS_IRET)) { + raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, retaddr); + } + /* read all the registers from the new TSS */ if (type & 8) { /* 32 bit */ -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 08/10] target/i386/tcg: check for correct busy state before switching to a new task 2024-07-10 6:29 ` [PATCH 08/10] target/i386/tcg: check for correct busy state before switching to a new task Paolo Bonzini @ 2024-07-10 15:58 ` Richard Henderson 0 siblings, 0 replies; 27+ messages in thread From: Richard Henderson @ 2024-07-10 15:58 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry On 7/9/24 23:29, Paolo Bonzini wrote: > This step is listed in the Intel manual: "Checks that the new task is available > (call, jump, exception, or interrupt) or busy (IRET return)". > > The AMD manual lists the same operation under the "Preventing recursion" > paragraph of "12.3.4 Nesting Tasks", though it is not clear if the processor > checks the busy bit in the IRET case. > > Signed-off-by: Paolo Bonzini<pbonzini@redhat.com> > --- > target/i386/tcg/seg_helper.c | 5 +++++ > 1 file changed, 5 insertions(+) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r! ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 09/10] target/i386/tcg: use X86Access for TSS access 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini ` (7 preceding siblings ...) 2024-07-10 6:29 ` [PATCH 08/10] target/i386/tcg: check for correct busy state before switching to a new task Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 16:45 ` Richard Henderson 2024-07-10 6:29 ` [PATCH 10/10] target/i386/tcg: save current task state before loading new one Paolo Bonzini 2024-07-10 21:00 ` [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Robert Henry 10 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson This takes care of probing the vaddr range in advance, and is also faster because it avoids repeated TLB lookups. It also matches the Intel manual better, as it says "Checks that the current (old) TSS, new TSS, and all segment descriptors used in the task switch are paged into system memory"; note however that it's not clear how the processor checks for segment descriptors, and this check is not included in the AMD manual. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 101 ++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 50 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 25af9d4a4ec..77f2c65c3cf 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -27,6 +27,7 @@ #include "exec/log.h" #include "helper-tcg.h" #include "seg_helper.h" +#include "access.h" int get_pg_mode(CPUX86State *env) { @@ -250,7 +251,7 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, uint32_t e1, uint32_t e2, int source, uint32_t next_eip, uintptr_t retaddr) { - int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, v1, v2, i; + int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, i; target_ulong tss_base; uint32_t new_regs[8], new_segs[6]; uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap; @@ -258,6 +259,7 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, SegmentCache *dt; int index; target_ulong ptr; + X86Access old, new; type = (e2 >> DESC_TYPE_SHIFT) & 0xf; LOG_PCALL("switch_tss: sel=0x%04x type=%d src=%d\n", tss_selector, type, @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, retaddr); } + /* X86Access avoids memory exceptions during the task switch */ + access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max, + MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr); + + if (source == SWITCH_TSS_CALL) { + /* Probe for future write of parent task */ + probe_access(env, tss_base, 2, MMU_DATA_STORE, + cpu_mmu_index_kernel(env), retaddr); + } + access_prepare_mmu(&new, env, tss_base, tss_limit, + MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr); + /* read all the registers from the new TSS */ if (type & 8) { /* 32 bit */ - new_cr3 = cpu_ldl_kernel_ra(env, tss_base + 0x1c, retaddr); - new_eip = cpu_ldl_kernel_ra(env, tss_base + 0x20, retaddr); - new_eflags = cpu_ldl_kernel_ra(env, tss_base + 0x24, retaddr); + new_cr3 = access_ldl(&new, tss_base + 0x1c); + new_eip = access_ldl(&new, tss_base + 0x20); + new_eflags = access_ldl(&new, tss_base + 0x24); for (i = 0; i < 8; i++) { - new_regs[i] = cpu_ldl_kernel_ra(env, tss_base + (0x28 + i * 4), - retaddr); + new_regs[i] = access_ldl(&new, tss_base + (0x28 + i * 4)); } for (i = 0; i < 6; i++) { - new_segs[i] = cpu_lduw_kernel_ra(env, tss_base + (0x48 + i * 4), - retaddr); + new_segs[i] = access_ldw(&new, tss_base + (0x48 + i * 4)); } - new_ldt = cpu_lduw_kernel_ra(env, tss_base + 0x60, retaddr); - new_trap = cpu_ldl_kernel_ra(env, tss_base + 0x64, retaddr); + new_ldt = access_ldw(&new, tss_base + 0x60); + new_trap = access_ldl(&new, tss_base + 0x64); } else { /* 16 bit */ new_cr3 = 0; - new_eip = cpu_lduw_kernel_ra(env, tss_base + 0x0e, retaddr); - new_eflags = cpu_lduw_kernel_ra(env, tss_base + 0x10, retaddr); + new_eip = access_ldw(&new, tss_base + 0x0e); + new_eflags = access_ldw(&new, tss_base + 0x10); for (i = 0; i < 8; i++) { - new_regs[i] = cpu_lduw_kernel_ra(env, tss_base + (0x12 + i * 2), retaddr); + new_regs[i] = access_ldw(&new, tss_base + (0x12 + i * 2)); } for (i = 0; i < 4; i++) { - new_segs[i] = cpu_lduw_kernel_ra(env, tss_base + (0x22 + i * 2), - retaddr); + new_segs[i] = access_ldw(&new, tss_base + (0x22 + i * 2)); } - new_ldt = cpu_lduw_kernel_ra(env, tss_base + 0x2a, retaddr); + new_ldt = access_ldw(&new, tss_base + 0x2a); new_segs[R_FS] = 0; new_segs[R_GS] = 0; new_trap = 0; @@ -349,16 +360,6 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, chapters 12.2.5 and 13.2.4 on how to implement TSS Trap bit */ (void)new_trap; - /* NOTE: we must avoid memory exceptions during the task switch, - so we make dummy accesses before */ - /* XXX: it can still fail in some cases, so a bigger hack is - necessary to valid the TLB after having done the accesses */ - - v1 = cpu_ldub_kernel_ra(env, env->tr.base, retaddr); - v2 = cpu_ldub_kernel_ra(env, env->tr.base + old_tss_limit_max, retaddr); - cpu_stb_kernel_ra(env, env->tr.base, v1, retaddr); - cpu_stb_kernel_ra(env, env->tr.base + old_tss_limit_max, v2, retaddr); - /* clear busy bit (it is restartable) */ if (source == SWITCH_TSS_JMP || source == SWITCH_TSS_IRET) { tss_set_busy(env, env->tr.selector, 0, retaddr); @@ -371,35 +372,35 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, /* save the current state in the old TSS */ if (old_type & 8) { /* 32 bit */ - cpu_stl_kernel_ra(env, env->tr.base + 0x20, next_eip, retaddr); - cpu_stl_kernel_ra(env, env->tr.base + 0x24, old_eflags, retaddr); - cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 0 * 4), env->regs[R_EAX], retaddr); - cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 1 * 4), env->regs[R_ECX], retaddr); - cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 2 * 4), env->regs[R_EDX], retaddr); - cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 3 * 4), env->regs[R_EBX], retaddr); - cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 4 * 4), env->regs[R_ESP], retaddr); - cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 5 * 4), env->regs[R_EBP], retaddr); - cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 6 * 4), env->regs[R_ESI], retaddr); - cpu_stl_kernel_ra(env, env->tr.base + (0x28 + 7 * 4), env->regs[R_EDI], retaddr); + access_stl(&old, env->tr.base + 0x20, next_eip); + access_stl(&old, env->tr.base + 0x24, old_eflags); + access_stl(&old, env->tr.base + (0x28 + 0 * 4), env->regs[R_EAX]); + access_stl(&old, env->tr.base + (0x28 + 1 * 4), env->regs[R_ECX]); + access_stl(&old, env->tr.base + (0x28 + 2 * 4), env->regs[R_EDX]); + access_stl(&old, env->tr.base + (0x28 + 3 * 4), env->regs[R_EBX]); + access_stl(&old, env->tr.base + (0x28 + 4 * 4), env->regs[R_ESP]); + access_stl(&old, env->tr.base + (0x28 + 5 * 4), env->regs[R_EBP]); + access_stl(&old, env->tr.base + (0x28 + 6 * 4), env->regs[R_ESI]); + access_stl(&old, env->tr.base + (0x28 + 7 * 4), env->regs[R_EDI]); for (i = 0; i < 6; i++) { - cpu_stw_kernel_ra(env, env->tr.base + (0x48 + i * 4), - env->segs[i].selector, retaddr); + access_stw(&old, env->tr.base + (0x48 + i * 4), + env->segs[i].selector); } } else { /* 16 bit */ - cpu_stw_kernel_ra(env, env->tr.base + 0x0e, next_eip, retaddr); - cpu_stw_kernel_ra(env, env->tr.base + 0x10, old_eflags, retaddr); - cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 0 * 2), env->regs[R_EAX], retaddr); - cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 1 * 2), env->regs[R_ECX], retaddr); - cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 2 * 2), env->regs[R_EDX], retaddr); - cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 3 * 2), env->regs[R_EBX], retaddr); - cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 4 * 2), env->regs[R_ESP], retaddr); - cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 5 * 2), env->regs[R_EBP], retaddr); - cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 6 * 2), env->regs[R_ESI], retaddr); - cpu_stw_kernel_ra(env, env->tr.base + (0x12 + 7 * 2), env->regs[R_EDI], retaddr); + access_stw(&old, env->tr.base + 0x0e, next_eip); + access_stw(&old, env->tr.base + 0x10, old_eflags); + access_stw(&old, env->tr.base + (0x12 + 0 * 2), env->regs[R_EAX]); + access_stw(&old, env->tr.base + (0x12 + 1 * 2), env->regs[R_ECX]); + access_stw(&old, env->tr.base + (0x12 + 2 * 2), env->regs[R_EDX]); + access_stw(&old, env->tr.base + (0x12 + 3 * 2), env->regs[R_EBX]); + access_stw(&old, env->tr.base + (0x12 + 4 * 2), env->regs[R_ESP]); + access_stw(&old, env->tr.base + (0x12 + 5 * 2), env->regs[R_EBP]); + access_stw(&old, env->tr.base + (0x12 + 6 * 2), env->regs[R_ESI]); + access_stw(&old, env->tr.base + (0x12 + 7 * 2), env->regs[R_EDI]); for (i = 0; i < 4; i++) { - cpu_stw_kernel_ra(env, env->tr.base + (0x22 + i * 2), - env->segs[i].selector, retaddr); + access_stw(&old, env->tr.base + (0x22 + i * 2), + env->segs[i].selector); } } -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 09/10] target/i386/tcg: use X86Access for TSS access 2024-07-10 6:29 ` [PATCH 09/10] target/i386/tcg: use X86Access for TSS access Paolo Bonzini @ 2024-07-10 16:45 ` Richard Henderson 2024-07-10 18:40 ` Paolo Bonzini 0 siblings, 1 reply; 27+ messages in thread From: Richard Henderson @ 2024-07-10 16:45 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry On 7/9/24 23:29, Paolo Bonzini wrote: > This takes care of probing the vaddr range in advance, and is also faster > because it avoids repeated TLB lookups. It also matches the Intel manual > better, as it says "Checks that the current (old) TSS, new TSS, and all > segment descriptors used in the task switch are paged into system memory"; > note however that it's not clear how the processor checks for segment > descriptors, and this check is not included in the AMD manual. > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > --- > target/i386/tcg/seg_helper.c | 101 ++++++++++++++++++----------------- > 1 file changed, 51 insertions(+), 50 deletions(-) > > diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c > index 25af9d4a4ec..77f2c65c3cf 100644 > --- a/target/i386/tcg/seg_helper.c > +++ b/target/i386/tcg/seg_helper.c > @@ -27,6 +27,7 @@ > #include "exec/log.h" > #include "helper-tcg.h" > #include "seg_helper.h" > +#include "access.h" > > int get_pg_mode(CPUX86State *env) > { > @@ -250,7 +251,7 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, > uint32_t e1, uint32_t e2, int source, > uint32_t next_eip, uintptr_t retaddr) > { > - int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, v1, v2, i; > + int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, i; > target_ulong tss_base; > uint32_t new_regs[8], new_segs[6]; > uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap; > @@ -258,6 +259,7 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, > SegmentCache *dt; > int index; > target_ulong ptr; > + X86Access old, new; > > type = (e2 >> DESC_TYPE_SHIFT) & 0xf; > LOG_PCALL("switch_tss: sel=0x%04x type=%d src=%d\n", tss_selector, type, > @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, > raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, retaddr); > } > > + /* X86Access avoids memory exceptions during the task switch */ > + access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max, > + MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr); > + > + if (source == SWITCH_TSS_CALL) { > + /* Probe for future write of parent task */ > + probe_access(env, tss_base, 2, MMU_DATA_STORE, > + cpu_mmu_index_kernel(env), retaddr); > + } > + access_prepare_mmu(&new, env, tss_base, tss_limit, > + MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr); You're computing cpu_mmu_index_kernel 3 times. This appears to be conservative in that you're requiring only 2 bytes (a minimum) of 0x68 to be writable. Is it legal to place the TSS at offset 0xffe of page 0, with the balance on page 1, with page 0 writable and page 1 read-only? Otherwise I would think you could just check the entire TSS for writability. Anyway, after the MMU_DATA_STORE probe, you have proved that 'X86Access new' contains an address range that may be stored. So you can change the SWITCH_TSS_CALL store below to access_stw() too. > @@ -349,16 +360,6 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, > chapters 12.2.5 and 13.2.4 on how to implement TSS Trap bit */ > (void)new_trap; > > - /* NOTE: we must avoid memory exceptions during the task switch, > - so we make dummy accesses before */ > - /* XXX: it can still fail in some cases, so a bigger hack is > - necessary to valid the TLB after having done the accesses */ > - > - v1 = cpu_ldub_kernel_ra(env, env->tr.base, retaddr); > - v2 = cpu_ldub_kernel_ra(env, env->tr.base + old_tss_limit_max, retaddr); > - cpu_stb_kernel_ra(env, env->tr.base, v1, retaddr); > - cpu_stb_kernel_ra(env, env->tr.base + old_tss_limit_max, v2, retaddr); OMG. Looks like a fantastic cleanup overall. r~ ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 09/10] target/i386/tcg: use X86Access for TSS access 2024-07-10 16:45 ` Richard Henderson @ 2024-07-10 18:40 ` Paolo Bonzini 2024-07-11 6:28 ` Paolo Bonzini 0 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 18:40 UTC (permalink / raw) To: Richard Henderson; +Cc: qemu-devel, Robert Henry [-- Attachment #1: Type: text/plain, Size: 3150 bytes --] Il mer 10 lug 2024, 18:47 Richard Henderson <richard.henderson@linaro.org> ha scritto: > On 7/9/24 23:29, Paolo Bonzini wrote: > > This takes care of probing the vaddr range in advance, and is also faster > > because it avoids repeated TLB lookups. It also matches the Intel manual > > better, as it says "Checks that the current (old) TSS, new TSS, and all > > segment descriptors used in the task switch are paged into system > memory"; > > note however that it's not clear how the processor checks for segment > > descriptors, and this check is not included in the AMD manual. > > > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > > --- > > target/i386/tcg/seg_helper.c | 101 ++++++++++++++++++----------------- > > 1 file changed, 51 insertions(+), 50 deletions(-) > > > > diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c > > index 25af9d4a4ec..77f2c65c3cf 100644 > > --- a/target/i386/tcg/seg_helper.c > > +++ b/target/i386/tcg/seg_helper.c > > @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env, int > tss_selector, > > raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, > retaddr); > > } > > > > + /* X86Access avoids memory exceptions during the task switch */ > > + access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max, > > + MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr); > > + > > + if (source == SWITCH_TSS_CALL) { > > + /* Probe for future write of parent task */ > > + probe_access(env, tss_base, 2, MMU_DATA_STORE, > > + cpu_mmu_index_kernel(env), retaddr); > > + } > > + access_prepare_mmu(&new, env, tss_base, tss_limit, > > + MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr); > > You're computing cpu_mmu_index_kernel 3 times. > Oh, indeed. Better than 30. :) This appears to be conservative in that you're requiring only 2 bytes (a > minimum) of 0x68 > to be writable. Is it legal to place the TSS at offset 0xffe of page 0, > with the balance > on page 1, with page 0 writable and page 1 read-only? Yes, paging is totally optional here. The only field that is written is the link. Otherwise I would think you could > just check the entire TSS for writability. > > Anyway, after the MMU_DATA_STORE probe, you have proved that 'X86Access > new' contains an > address range that may be stored. So you can change the SWITCH_TSS_CALL > store below to > access_stw() too. > Nice. > - /* NOTE: we must avoid memory exceptions during the task switch, > > - so we make dummy accesses before */ > > - /* XXX: it can still fail in some cases, so a bigger hack is > > - necessary to valid the TLB after having done the accesses */ > > - > > - v1 = cpu_ldub_kernel_ra(env, env->tr.base, retaddr); > > - v2 = cpu_ldub_kernel_ra(env, env->tr.base + old_tss_limit_max, > retaddr); > > - cpu_stb_kernel_ra(env, env->tr.base, v1, retaddr); > > - cpu_stb_kernel_ra(env, env->tr.base + old_tss_limit_max, v2, > retaddr); > > OMG. > Haha, yeah X86Access is perfect here. Paolo Looks like a fantastic cleanup overall. > > > r~ > > [-- Attachment #2: Type: text/html, Size: 5043 bytes --] ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 09/10] target/i386/tcg: use X86Access for TSS access 2024-07-10 18:40 ` Paolo Bonzini @ 2024-07-11 6:28 ` Paolo Bonzini 2024-07-11 15:30 ` Richard Henderson 0 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-07-11 6:28 UTC (permalink / raw) To: Richard Henderson; +Cc: qemu-devel, Robert Henry On 7/10/24 20:40, Paolo Bonzini wrote: > > > Il mer 10 lug 2024, 18:47 Richard Henderson > <richard.henderson@linaro.org <mailto:richard.henderson@linaro.org>> ha > scritto: > > On 7/9/24 23:29, Paolo Bonzini wrote: > > This takes care of probing the vaddr range in advance, and is > also faster > > because it avoids repeated TLB lookups. It also matches the > Intel manual > > better, as it says "Checks that the current (old) TSS, new TSS, > and all > > segment descriptors used in the task switch are paged into system > memory"; > > note however that it's not clear how the processor checks for segment > > descriptors, and this check is not included in the AMD manual. > > > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com > <mailto:pbonzini@redhat.com>> > > --- > > target/i386/tcg/seg_helper.c | 101 > ++++++++++++++++++----------------- > > 1 file changed, 51 insertions(+), 50 deletions(-) > > > > diff --git a/target/i386/tcg/seg_helper.c > b/target/i386/tcg/seg_helper.c > > index 25af9d4a4ec..77f2c65c3cf 100644 > > --- a/target/i386/tcg/seg_helper.c > > +++ b/target/i386/tcg/seg_helper.c > > @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env, > int tss_selector, > > raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & > 0xfffc, retaddr); > > } > > > > + /* X86Access avoids memory exceptions during the task switch */ > > + access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max, > > + MMU_DATA_STORE, cpu_mmu_index_kernel(env), > retaddr); > > + > > + if (source == SWITCH_TSS_CALL) { > > + /* Probe for future write of parent task */ > > + probe_access(env, tss_base, 2, MMU_DATA_STORE, > > + cpu_mmu_index_kernel(env), retaddr); > > + } > > + access_prepare_mmu(&new, env, tss_base, tss_limit, > > + MMU_DATA_LOAD, cpu_mmu_index_kernel(env), > retaddr); > > You're computing cpu_mmu_index_kernel 3 times. Squashing this in (easier to review than the whole thing): diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 4123ff1245e..4edfd26135f 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -321,7 +321,7 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector, uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap; uint32_t old_eflags, eflags_mask; SegmentCache *dt; - int index; + int mmu_index, index; target_ulong ptr; X86Access old, new; @@ -378,16 +378,17 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector, } /* X86Access avoids memory exceptions during the task switch */ + mmu_index = cpu_mmu_index_kernel(env); access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max, - MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr); + MMU_DATA_STORE, mmu_index, retaddr); if (source == SWITCH_TSS_CALL) { /* Probe for future write of parent task */ probe_access(env, tss_base, 2, MMU_DATA_STORE, - cpu_mmu_index_kernel(env), retaddr); + mmu_index, retaddr); } access_prepare_mmu(&new, env, tss_base, tss_limit, - MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr); + MMU_DATA_LOAD, mmu_index, retaddr); /* read all the registers from the new TSS */ if (type & 8) { @@ -468,7 +469,11 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector, context */ if (source == SWITCH_TSS_CALL) { - cpu_stw_kernel_ra(env, tss_base, env->tr.selector, retaddr); + /* + * Thanks to the probe_access above, we know the first two + * bytes addressed by &new are writable too. + */ + access_stw(&new, tss_base, env->tr.selector); new_eflags |= NT_MASK; } Paolo ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 09/10] target/i386/tcg: use X86Access for TSS access 2024-07-11 6:28 ` Paolo Bonzini @ 2024-07-11 15:30 ` Richard Henderson 0 siblings, 0 replies; 27+ messages in thread From: Richard Henderson @ 2024-07-11 15:30 UTC (permalink / raw) To: Paolo Bonzini; +Cc: qemu-devel, Robert Henry On 7/10/24 23:28, Paolo Bonzini wrote: > On 7/10/24 20:40, Paolo Bonzini wrote: >> >> >> Il mer 10 lug 2024, 18:47 Richard Henderson <richard.henderson@linaro.org >> <mailto:richard.henderson@linaro.org>> ha scritto: >> >> On 7/9/24 23:29, Paolo Bonzini wrote: >> > This takes care of probing the vaddr range in advance, and is >> also faster >> > because it avoids repeated TLB lookups. It also matches the >> Intel manual >> > better, as it says "Checks that the current (old) TSS, new TSS, >> and all >> > segment descriptors used in the task switch are paged into system >> memory"; >> > note however that it's not clear how the processor checks for segment >> > descriptors, and this check is not included in the AMD manual. >> > >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com >> <mailto:pbonzini@redhat.com>> >> > --- >> > target/i386/tcg/seg_helper.c | 101 >> ++++++++++++++++++----------------- >> > 1 file changed, 51 insertions(+), 50 deletions(-) >> > >> > diff --git a/target/i386/tcg/seg_helper.c >> b/target/i386/tcg/seg_helper.c >> > index 25af9d4a4ec..77f2c65c3cf 100644 >> > --- a/target/i386/tcg/seg_helper.c >> > +++ b/target/i386/tcg/seg_helper.c >> > @@ -311,35 +313,44 @@ static int switch_tss_ra(CPUX86State *env, >> int tss_selector, >> > raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & >> 0xfffc, retaddr); >> > } >> > >> > + /* X86Access avoids memory exceptions during the task switch */ >> > + access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max, >> > + MMU_DATA_STORE, cpu_mmu_index_kernel(env), >> retaddr); >> > + >> > + if (source == SWITCH_TSS_CALL) { >> > + /* Probe for future write of parent task */ >> > + probe_access(env, tss_base, 2, MMU_DATA_STORE, >> > + cpu_mmu_index_kernel(env), retaddr); >> > + } >> > + access_prepare_mmu(&new, env, tss_base, tss_limit, >> > + MMU_DATA_LOAD, cpu_mmu_index_kernel(env), >> retaddr); >> >> You're computing cpu_mmu_index_kernel 3 times. > > Squashing this in (easier to review than the whole thing): Excellent, thanks! r~ > > diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c > index 4123ff1245e..4edfd26135f 100644 > --- a/target/i386/tcg/seg_helper.c > +++ b/target/i386/tcg/seg_helper.c > @@ -321,7 +321,7 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector, > uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap; > uint32_t old_eflags, eflags_mask; > SegmentCache *dt; > - int index; > + int mmu_index, index; > target_ulong ptr; > X86Access old, new; > > @@ -378,16 +378,17 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector, > } > > /* X86Access avoids memory exceptions during the task switch */ > + mmu_index = cpu_mmu_index_kernel(env); > access_prepare_mmu(&old, env, env->tr.base, old_tss_limit_max, > - MMU_DATA_STORE, cpu_mmu_index_kernel(env), retaddr); > + MMU_DATA_STORE, mmu_index, retaddr); > > if (source == SWITCH_TSS_CALL) { > /* Probe for future write of parent task */ > probe_access(env, tss_base, 2, MMU_DATA_STORE, > - cpu_mmu_index_kernel(env), retaddr); > + mmu_index, retaddr); > } > access_prepare_mmu(&new, env, tss_base, tss_limit, > - MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr); > + MMU_DATA_LOAD, mmu_index, retaddr); > > /* read all the registers from the new TSS */ > if (type & 8) { > @@ -468,7 +469,11 @@ static void switch_tss_ra(CPUX86State *env, int tss_selector, > context */ > > if (source == SWITCH_TSS_CALL) { > - cpu_stw_kernel_ra(env, tss_base, env->tr.selector, retaddr); > + /* > + * Thanks to the probe_access above, we know the first two > + * bytes addressed by &new are writable too. > + */ > + access_stw(&new, tss_base, env->tr.selector); > new_eflags |= NT_MASK; > } > > Paolo > ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 10/10] target/i386/tcg: save current task state before loading new one 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini ` (8 preceding siblings ...) 2024-07-10 6:29 ` [PATCH 09/10] target/i386/tcg: use X86Access for TSS access Paolo Bonzini @ 2024-07-10 6:29 ` Paolo Bonzini 2024-07-10 16:50 ` Richard Henderson 2024-07-10 21:00 ` [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Robert Henry 10 siblings, 1 reply; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 6:29 UTC (permalink / raw) To: qemu-devel; +Cc: rrh.henry, richard.henderson This is how the steps are ordered in the manual. EFLAGS.NT is overwritten after the fact in the saved image. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- target/i386/tcg/seg_helper.c | 85 +++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 77f2c65c3cf..7663e653e84 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -325,6 +325,42 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, access_prepare_mmu(&new, env, tss_base, tss_limit, MMU_DATA_LOAD, cpu_mmu_index_kernel(env), retaddr); + /* save the current state in the old TSS */ + old_eflags = cpu_compute_eflags(env); + if (old_type & 8) { + /* 32 bit */ + access_stl(&old, env->tr.base + 0x20, next_eip); + access_stl(&old, env->tr.base + 0x24, old_eflags); + access_stl(&old, env->tr.base + (0x28 + 0 * 4), env->regs[R_EAX]); + access_stl(&old, env->tr.base + (0x28 + 1 * 4), env->regs[R_ECX]); + access_stl(&old, env->tr.base + (0x28 + 2 * 4), env->regs[R_EDX]); + access_stl(&old, env->tr.base + (0x28 + 3 * 4), env->regs[R_EBX]); + access_stl(&old, env->tr.base + (0x28 + 4 * 4), env->regs[R_ESP]); + access_stl(&old, env->tr.base + (0x28 + 5 * 4), env->regs[R_EBP]); + access_stl(&old, env->tr.base + (0x28 + 6 * 4), env->regs[R_ESI]); + access_stl(&old, env->tr.base + (0x28 + 7 * 4), env->regs[R_EDI]); + for (i = 0; i < 6; i++) { + access_stw(&old, env->tr.base + (0x48 + i * 4), + env->segs[i].selector); + } + } else { + /* 16 bit */ + access_stw(&old, env->tr.base + 0x0e, next_eip); + access_stw(&old, env->tr.base + 0x10, old_eflags); + access_stw(&old, env->tr.base + (0x12 + 0 * 2), env->regs[R_EAX]); + access_stw(&old, env->tr.base + (0x12 + 1 * 2), env->regs[R_ECX]); + access_stw(&old, env->tr.base + (0x12 + 2 * 2), env->regs[R_EDX]); + access_stw(&old, env->tr.base + (0x12 + 3 * 2), env->regs[R_EBX]); + access_stw(&old, env->tr.base + (0x12 + 4 * 2), env->regs[R_ESP]); + access_stw(&old, env->tr.base + (0x12 + 5 * 2), env->regs[R_EBP]); + access_stw(&old, env->tr.base + (0x12 + 6 * 2), env->regs[R_ESI]); + access_stw(&old, env->tr.base + (0x12 + 7 * 2), env->regs[R_EDI]); + for (i = 0; i < 4; i++) { + access_stw(&old, env->tr.base + (0x22 + i * 2), + env->segs[i].selector); + } + } + /* read all the registers from the new TSS */ if (type & 8) { /* 32 bit */ @@ -364,49 +400,16 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, if (source == SWITCH_TSS_JMP || source == SWITCH_TSS_IRET) { tss_set_busy(env, env->tr.selector, 0, retaddr); } - old_eflags = cpu_compute_eflags(env); + if (source == SWITCH_TSS_IRET) { old_eflags &= ~NT_MASK; + if (old_type & 8) { + access_stl(&old, env->tr.base + 0x24, old_eflags); + } else { + access_stw(&old, env->tr.base + 0x10, old_eflags); + } } - /* save the current state in the old TSS */ - if (old_type & 8) { - /* 32 bit */ - access_stl(&old, env->tr.base + 0x20, next_eip); - access_stl(&old, env->tr.base + 0x24, old_eflags); - access_stl(&old, env->tr.base + (0x28 + 0 * 4), env->regs[R_EAX]); - access_stl(&old, env->tr.base + (0x28 + 1 * 4), env->regs[R_ECX]); - access_stl(&old, env->tr.base + (0x28 + 2 * 4), env->regs[R_EDX]); - access_stl(&old, env->tr.base + (0x28 + 3 * 4), env->regs[R_EBX]); - access_stl(&old, env->tr.base + (0x28 + 4 * 4), env->regs[R_ESP]); - access_stl(&old, env->tr.base + (0x28 + 5 * 4), env->regs[R_EBP]); - access_stl(&old, env->tr.base + (0x28 + 6 * 4), env->regs[R_ESI]); - access_stl(&old, env->tr.base + (0x28 + 7 * 4), env->regs[R_EDI]); - for (i = 0; i < 6; i++) { - access_stw(&old, env->tr.base + (0x48 + i * 4), - env->segs[i].selector); - } - } else { - /* 16 bit */ - access_stw(&old, env->tr.base + 0x0e, next_eip); - access_stw(&old, env->tr.base + 0x10, old_eflags); - access_stw(&old, env->tr.base + (0x12 + 0 * 2), env->regs[R_EAX]); - access_stw(&old, env->tr.base + (0x12 + 1 * 2), env->regs[R_ECX]); - access_stw(&old, env->tr.base + (0x12 + 2 * 2), env->regs[R_EDX]); - access_stw(&old, env->tr.base + (0x12 + 3 * 2), env->regs[R_EBX]); - access_stw(&old, env->tr.base + (0x12 + 4 * 2), env->regs[R_ESP]); - access_stw(&old, env->tr.base + (0x12 + 5 * 2), env->regs[R_EBP]); - access_stw(&old, env->tr.base + (0x12 + 6 * 2), env->regs[R_ESI]); - access_stw(&old, env->tr.base + (0x12 + 7 * 2), env->regs[R_EDI]); - for (i = 0; i < 4; i++) { - access_stw(&old, env->tr.base + (0x22 + i * 2), - env->segs[i].selector); - } - } - - /* now if an exception occurs, it will occurs in the next task - context */ - if (source == SWITCH_TSS_CALL) { cpu_stw_kernel_ra(env, tss_base, env->tr.selector, retaddr); new_eflags |= NT_MASK; @@ -418,7 +421,9 @@ static int switch_tss_ra(CPUX86State *env, int tss_selector, } /* set the new CPU state */ - /* from this point, any exception which occurs can give problems */ + + /* now if an exception occurs, it will occur in the next task context */ + env->cr[0] |= CR0_TS_MASK; env->hflags |= HF_TS_MASK; env->tr.selector = tss_selector; -- 2.45.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 10/10] target/i386/tcg: save current task state before loading new one 2024-07-10 6:29 ` [PATCH 10/10] target/i386/tcg: save current task state before loading new one Paolo Bonzini @ 2024-07-10 16:50 ` Richard Henderson 0 siblings, 0 replies; 27+ messages in thread From: Richard Henderson @ 2024-07-10 16:50 UTC (permalink / raw) To: Paolo Bonzini, qemu-devel; +Cc: rrh.henry On 7/9/24 23:29, Paolo Bonzini wrote: > This is how the steps are ordered in the manual. EFLAGS.NT is > overwritten after the fact in the saved image. > > Signed-off-by: Paolo Bonzini<pbonzini@redhat.com> > --- > target/i386/tcg/seg_helper.c | 85 +++++++++++++++++++----------------- > 1 file changed, 45 insertions(+), 40 deletions(-) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r~ ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c 2024-07-10 6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini ` (9 preceding siblings ...) 2024-07-10 6:29 ` [PATCH 10/10] target/i386/tcg: save current task state before loading new one Paolo Bonzini @ 2024-07-10 21:00 ` Robert Henry 2024-07-10 21:08 ` Paolo Bonzini 10 siblings, 1 reply; 27+ messages in thread From: Robert Henry @ 2024-07-10 21:00 UTC (permalink / raw) To: Paolo Bonzini; +Cc: qemu-devel, richard.henderson [-- Attachment #1: Type: text/plain, Size: 7505 bytes --] I have only skimmed the diffs. Your knowledge of the deep semantics, gained by close differential reading of intel and amd docs, is truly amazing. Many thanks for pushing this through! I have 2 nits, perhaps stylistic only. For code like "sp -= 2" or "sp += 2" followed or preceded by a write to the stack pointer of a uint16_t variable 'x', would it be better/more robust to rewrite as: "sp -= sizeof(x)" ? There are a lot of masks constructed using -1. I think it would be clearer to use 0xffffffff (for 32-bit masks) as that reminds the reader that this is a bit mask. But it seems that using -1 is how the original code was written. On Tue, Jul 9, 2024 at 11:29 PM Paolo Bonzini <pbonzini@redhat.com> wrote: > This includes bugfixes: > - allowing IRET from user mode to user mode with SMAP (do not use implicit > kernel accesses, which break if the stack is in userspace) > > - use DPL-level accesses for interrupts and call gates > > - various fixes for task switching > > And two related cleanups: computing MMU index once for far calls and > returns > (including task switches), and using X86Access for TSS access. > > Tested with a really ugly patch to kvm-unit-tests, included after > signature. > > Paolo Bonzini (7): > target/i386/tcg: Allow IRET from user mode to user mode with SMAP > target/i386/tcg: use PUSHL/PUSHW for error code > target/i386/tcg: Compute MMU index once > target/i386/tcg: Use DPL-level accesses for interrupts and call gates > target/i386/tcg: check for correct busy state before switching to a > new task > target/i386/tcg: use X86Access for TSS access > target/i386/tcg: save current task state before loading new one > > Richard Henderson (3): > target/i386/tcg: Remove SEG_ADDL > target/i386/tcg: Reorg push/pop within seg_helper.c > target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl > > target/i386/cpu.h | 11 +- > target/i386/cpu.c | 27 +- > target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++---------------- > 3 files changed, 354 insertions(+), 290 deletions(-) > > -- > 2.45.2 > > diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c > index c3ec0ad7..0bf40c6d 100644 > --- a/lib/x86/usermode.c > +++ b/lib/x86/usermode.c > @@ -5,13 +5,15 @@ > #include "x86/desc.h" > #include "x86/isr.h" > #include "alloc.h" > +#include "alloc_page.h" > #include "setjmp.h" > #include "usermode.h" > > #include "libcflat.h" > #include <stdint.h> > > -#define USERMODE_STACK_SIZE 0x2000 > +#define USERMODE_STACK_ORDER 1 /* 8k */ > +#define USERMODE_STACK_SIZE (1 << (12 + USERMODE_STACK_ORDER)) > #define RET_TO_KERNEL_IRQ 0x20 > > static jmp_buf jmpbuf; > @@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int > fault_vector, > { > extern char ret_to_kernel; > volatile uint64_t rax = 0; > - static unsigned char user_stack[USERMODE_STACK_SIZE]; > + static unsigned char *user_stack; > handler old_ex; > > + if (!user_stack) { > + user_stack = alloc_pages(USERMODE_STACK_ORDER); > + printf("%p\n", user_stack); > + } > + > *raised_vector = 0; > set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3); > old_ex = handle_exception(fault_vector, > @@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int > fault_vector, > return 0; > } > > + memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8); > + > asm volatile ( > /* Prepare kernel SP for exception handlers */ > "mov %%rsp, %[rsp0]\n\t" > @@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int > fault_vector, > "pushq %[user_stack_top]\n\t" > "pushfq\n\t" > "pushq %[user_cs]\n\t" > - "lea user_mode(%%rip), %%rax\n\t" > + "lea user_mode+0x800000(%%rip), %%rax\n\t" // > smap.flat places usermode addresses at 8MB-16MB > "pushq %%rax\n\t" > "iretq\n" > > "user_mode:\n\t" > /* Back up volatile registers before invoking func > */ > + "pop %%rax\n\t" > "push %%rcx\n\t" > "push %%rdx\n\t" > "push %%rdi\n\t" > @@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int > fault_vector, > "push %%r10\n\t" > "push %%r11\n\t" > /* Call user mode function */ > + "add $0x800000,%%rbp\n\t" > "mov %[arg1], %%rdi\n\t" > "mov %[arg2], %%rsi\n\t" > "mov %[arg3], %%rdx\n\t" > "mov %[arg4], %%rcx\n\t" > - "call *%[func]\n\t" > + "call *%%rax\n\t" > /* Restore registers */ > "pop %%r11\n\t" > "pop %%r10\n\t" > @@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned > int fault_vector, > [arg2]"m"(arg2), > [arg3]"m"(arg3), > [arg4]"m"(arg4), > - [func]"m"(func), > [user_ds]"i"(USER_DS), > [user_cs]"i"(USER_CS), > [kernel_ds]"rm"(KERNEL_DS), > [user_stack_top]"r"(user_stack + > - sizeof(user_stack)), > + USERMODE_STACK_SIZE - 8), > [kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ)); > > handle_exception(fault_vector, old_ex); > diff --git a/x86/smap.c b/x86/smap.c > index 9a823a55..65119442 100644 > --- a/x86/smap.c > +++ b/x86/smap.c > @@ -2,6 +2,7 @@ > #include <alloc_page.h> > #include "x86/desc.h" > #include "x86/processor.h" > +#include "x86/usermode.h" > #include "x86/vm.h" > > volatile int pf_count = 0; > @@ -89,6 +90,31 @@ static void check_smap_nowp(void) > write_cr3(read_cr3()); > } > > +#ifdef __x86_64__ > +static void iret(void) > +{ > + asm volatile( > + "mov %%rsp, %%rcx;" > + "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;" > + "pushf;" > + "movl %%cs, %%ebx; pushq %%rbx; " > + "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:" > + > + : : : "ebx", "ecx", "cc"); /* RPL=0 */ > +} > + > +static void test_user_iret(void) > +{ > + bool raised_vector; > + uintptr_t user_iret = (uintptr_t)iret + USER_BASE; > + > + run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0, > + &raised_vector); > + > + report(!raised_vector, "No #PF on CPL=3 DPL=3 iret"); > +} > +#endif > + > int main(int ac, char **av) > { > unsigned long i; > @@ -196,7 +222,9 @@ int main(int ac, char **av) > > check_smap_nowp(); > > - // TODO: implicit kernel access from ring 3 (e.g. int) > +#ifdef __x86_64__ > + test_user_iret(); > +#endif > > return report_summary(); > } > > > > [-- Attachment #2: Type: text/html, Size: 9530 bytes --] ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c 2024-07-10 21:00 ` [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Robert Henry @ 2024-07-10 21:08 ` Paolo Bonzini 0 siblings, 0 replies; 27+ messages in thread From: Paolo Bonzini @ 2024-07-10 21:08 UTC (permalink / raw) To: Robert Henry; +Cc: qemu-devel, Richard Henderson [-- Attachment #1: Type: text/plain, Size: 8284 bytes --] Il mer 10 lug 2024, 23:01 Robert Henry <rrh.henry@gmail.com> ha scritto: > I have only skimmed the diffs. Your knowledge of the deep semantics, > gained by close differential reading of intel and amd docs, is truly > amazing. Many thanks for pushing this through! > Thanks for bringing this to our attention too, apart from the practical bug hopefully it will help future readers to have a more precise implementation. I tried to acknowledge your contribution in the commit messages. I have 2 nits, perhaps stylistic only. > > For code like "sp -= 2" or "sp += 2" followed or preceded by a write to > the stack pointer of a uint16_t variable 'x', would it be better/more > robust to rewrite as: "sp -= sizeof(x)" ? > I think that's intentional because the value subtracted is related to the "stw" or "stl" in the store (likewise for incrementing after a load) more than to the size of x. There are a lot of masks constructed using -1. I think it would be clearer > to use 0xffffffff (for 32-bit masks) as that reminds the reader that this > is a bit mask. But it seems that using -1 is how the original code was > written. > -1 is used for 64-bit masks only. They get unwieldy quickly. :) Paolo > On Tue, Jul 9, 2024 at 11:29 PM Paolo Bonzini <pbonzini@redhat.com> wrote: > >> This includes bugfixes: >> - allowing IRET from user mode to user mode with SMAP (do not use implicit >> kernel accesses, which break if the stack is in userspace) >> >> - use DPL-level accesses for interrupts and call gates >> >> - various fixes for task switching >> >> And two related cleanups: computing MMU index once for far calls and >> returns >> (including task switches), and using X86Access for TSS access. >> >> Tested with a really ugly patch to kvm-unit-tests, included after >> signature. >> >> Paolo Bonzini (7): >> target/i386/tcg: Allow IRET from user mode to user mode with SMAP >> target/i386/tcg: use PUSHL/PUSHW for error code >> target/i386/tcg: Compute MMU index once >> target/i386/tcg: Use DPL-level accesses for interrupts and call gates >> target/i386/tcg: check for correct busy state before switching to a >> new task >> target/i386/tcg: use X86Access for TSS access >> target/i386/tcg: save current task state before loading new one >> >> Richard Henderson (3): >> target/i386/tcg: Remove SEG_ADDL >> target/i386/tcg: Reorg push/pop within seg_helper.c >> target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl >> >> target/i386/cpu.h | 11 +- >> target/i386/cpu.c | 27 +- >> target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++---------------- >> 3 files changed, 354 insertions(+), 290 deletions(-) >> >> -- >> 2.45.2 >> >> diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c >> index c3ec0ad7..0bf40c6d 100644 >> --- a/lib/x86/usermode.c >> +++ b/lib/x86/usermode.c >> @@ -5,13 +5,15 @@ >> #include "x86/desc.h" >> #include "x86/isr.h" >> #include "alloc.h" >> +#include "alloc_page.h" >> #include "setjmp.h" >> #include "usermode.h" >> >> #include "libcflat.h" >> #include <stdint.h> >> >> -#define USERMODE_STACK_SIZE 0x2000 >> +#define USERMODE_STACK_ORDER 1 /* 8k */ >> +#define USERMODE_STACK_SIZE (1 << (12 + USERMODE_STACK_ORDER)) >> #define RET_TO_KERNEL_IRQ 0x20 >> >> static jmp_buf jmpbuf; >> @@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int >> fault_vector, >> { >> extern char ret_to_kernel; >> volatile uint64_t rax = 0; >> - static unsigned char user_stack[USERMODE_STACK_SIZE]; >> + static unsigned char *user_stack; >> handler old_ex; >> >> + if (!user_stack) { >> + user_stack = alloc_pages(USERMODE_STACK_ORDER); >> + printf("%p\n", user_stack); >> + } >> + >> *raised_vector = 0; >> set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3); >> old_ex = handle_exception(fault_vector, >> @@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int >> fault_vector, >> return 0; >> } >> >> + memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8); >> + >> asm volatile ( >> /* Prepare kernel SP for exception handlers */ >> "mov %%rsp, %[rsp0]\n\t" >> @@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int >> fault_vector, >> "pushq %[user_stack_top]\n\t" >> "pushfq\n\t" >> "pushq %[user_cs]\n\t" >> - "lea user_mode(%%rip), %%rax\n\t" >> + "lea user_mode+0x800000(%%rip), %%rax\n\t" // >> smap.flat places usermode addresses at 8MB-16MB >> "pushq %%rax\n\t" >> "iretq\n" >> >> "user_mode:\n\t" >> /* Back up volatile registers before invoking >> func */ >> + "pop %%rax\n\t" >> "push %%rcx\n\t" >> "push %%rdx\n\t" >> "push %%rdi\n\t" >> @@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int >> fault_vector, >> "push %%r10\n\t" >> "push %%r11\n\t" >> /* Call user mode function */ >> + "add $0x800000,%%rbp\n\t" >> "mov %[arg1], %%rdi\n\t" >> "mov %[arg2], %%rsi\n\t" >> "mov %[arg3], %%rdx\n\t" >> "mov %[arg4], %%rcx\n\t" >> - "call *%[func]\n\t" >> + "call *%%rax\n\t" >> /* Restore registers */ >> "pop %%r11\n\t" >> "pop %%r10\n\t" >> @@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned >> int fault_vector, >> [arg2]"m"(arg2), >> [arg3]"m"(arg3), >> [arg4]"m"(arg4), >> - [func]"m"(func), >> [user_ds]"i"(USER_DS), >> [user_cs]"i"(USER_CS), >> [kernel_ds]"rm"(KERNEL_DS), >> [user_stack_top]"r"(user_stack + >> - sizeof(user_stack)), >> + USERMODE_STACK_SIZE - 8), >> [kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ)); >> >> handle_exception(fault_vector, old_ex); >> diff --git a/x86/smap.c b/x86/smap.c >> index 9a823a55..65119442 100644 >> --- a/x86/smap.c >> +++ b/x86/smap.c >> @@ -2,6 +2,7 @@ >> #include <alloc_page.h> >> #include "x86/desc.h" >> #include "x86/processor.h" >> +#include "x86/usermode.h" >> #include "x86/vm.h" >> >> volatile int pf_count = 0; >> @@ -89,6 +90,31 @@ static void check_smap_nowp(void) >> write_cr3(read_cr3()); >> } >> >> +#ifdef __x86_64__ >> +static void iret(void) >> +{ >> + asm volatile( >> + "mov %%rsp, %%rcx;" >> + "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;" >> + "pushf;" >> + "movl %%cs, %%ebx; pushq %%rbx; " >> + "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:" >> + >> + : : : "ebx", "ecx", "cc"); /* RPL=0 */ >> +} >> + >> +static void test_user_iret(void) >> +{ >> + bool raised_vector; >> + uintptr_t user_iret = (uintptr_t)iret + USER_BASE; >> + >> + run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0, >> + &raised_vector); >> + >> + report(!raised_vector, "No #PF on CPL=3 DPL=3 iret"); >> +} >> +#endif >> + >> int main(int ac, char **av) >> { >> unsigned long i; >> @@ -196,7 +222,9 @@ int main(int ac, char **av) >> >> check_smap_nowp(); >> >> - // TODO: implicit kernel access from ring 3 (e.g. int) >> +#ifdef __x86_64__ >> + test_user_iret(); >> +#endif >> >> return report_summary(); >> } >> >> >> >> [-- Attachment #2: Type: text/html, Size: 11341 bytes --] ^ permalink raw reply [flat|nested] 27+ messages in thread
end of thread, other threads:[~2024-10-25 15:32 UTC | newest]
Thread overview: 27+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-07-10  6:29 [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Paolo Bonzini
2024-07-10  6:29 ` [PATCH 01/10] target/i386/tcg: Remove SEG_ADDL Paolo Bonzini
2024-07-10  6:29 ` [PATCH 02/10] target/i386/tcg: Allow IRET from user mode to user mode with SMAP Paolo Bonzini
2024-07-10 15:22   ` Richard Henderson
2024-07-10  6:29 ` [PATCH 03/10] target/i386/tcg: use PUSHL/PUSHW for error code Paolo Bonzini
2024-07-10 15:24   ` Richard Henderson
2024-07-10  6:29 ` [PATCH 04/10] target/i386/tcg: Reorg push/pop within seg_helper.c Paolo Bonzini
2024-07-10  6:29 ` [PATCH 05/10] target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl Paolo Bonzini
2024-07-10  6:29 ` [PATCH 06/10] target/i386/tcg: Compute MMU index once Paolo Bonzini
2024-07-10 15:55   ` Richard Henderson
2024-07-10  6:29 ` [PATCH 07/10] target/i386/tcg: Use DPL-level accesses for interrupts and call gates Paolo Bonzini
2024-07-10 15:57   ` Richard Henderson
2024-10-18 16:02   ` Michael Tokarev
2024-10-25 15:26     ` Michael Tokarev
2024-10-25 15:28       ` Paolo Bonzini
2024-10-25 15:31         ` Michael Tokarev
2024-07-10  6:29 ` [PATCH 08/10] target/i386/tcg: check for correct busy state before switching to a new task Paolo Bonzini
2024-07-10 15:58   ` Richard Henderson
2024-07-10  6:29 ` [PATCH 09/10] target/i386/tcg: use X86Access for TSS access Paolo Bonzini
2024-07-10 16:45   ` Richard Henderson
2024-07-10 18:40     ` Paolo Bonzini
2024-07-11  6:28       ` Paolo Bonzini
2024-07-11 15:30         ` Richard Henderson
2024-07-10  6:29 ` [PATCH 10/10] target/i386/tcg: save current task state before loading new one Paolo Bonzini
2024-07-10 16:50   ` Richard Henderson
2024-07-10 21:00 ` [PATCH 00/10] target/i386/tcg: fixes for seg_helper.c Robert Henry
2024-07-10 21:08   ` Paolo Bonzini
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).