public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC] Patch - Big real mode emulation
@ 2008-05-21  9:34 Guillaume Thouvenin
  2008-05-21 13:59 ` Avi Kivity
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Guillaume Thouvenin @ 2008-05-21  9:34 UTC (permalink / raw)
  To: kvm
  Cc: Avi Kivity, Anthony Liguori, Mohammed Gamal, Kamble, Nitin A,
	Alexander Graf, Marcelo Tosatti

Hello,

 Here is a patch that allows booting OpenSuse 10.3. The problem with
OpenSuse 10.3 is that it uses a version of gfxboot that reads SS after
switching from real to protected mode, at a point where SS contains an
invalid value, which VMX does not allow. So this patch

 1) removes the code that writes a sane value into SS, in order to detect
VM entry failures due to CS.RPL != SS.RPL
 2) adds a handler to catch the VM entry failure

 The handler calls the instruction emulator; to boot OpenSuse we need
to emulate the following instructions:

	ljmp   $0x18,$0x6e18
	mov    $0x20,%ax
	mov    %eax,%ds
	mov    %ss,%eax
	and    $0xffff,%esp
	shl    $0x4,%eax
	add    %eax,%esp
	mov    $0x8,%ax
	mov    %eax,%ss	
	-> At this point CS.RPL == SS.RPL

 There is an issue with the patch. When removing the SS patching we see
other problems. So, to still be able to boot distributions that booted
before, we added a hack that modifies SS_SELECTOR (as was done
previously) when emulation fails. The future solution will be to
emulate every instruction that needs to be emulated.

 On my Intel Xeon I am able to boot:
  - OpenSuse 10.3 x86_64
  - Ubuntu 7.10 liveCD amd64
  - Ubuntu 8.04 desktop-i386
  - Fedora 9 i386
  - NetBSD AMD 64
  - Dragon Fly 1.8.0
  - WinXP Professional 32 bits
  - WinXP Professional 64 bits 
  - SunOS Release 5.10 32 bits
  - SunOS Release 5.11 64 bits
  - Plan9
  - FreeDOS: there is an issue with the HIMEM XMS-memory driver as
reported by Marcelo on KVM in the thread entitled "Protected mode
transitions and big real mode... still an issue".


Regards,
Guillaume


 arch/x86/kvm/vmx.c         |   72 ++++++++++++++++++++
 arch/x86/kvm/vmx.h         |    3 
 arch/x86/kvm/x86.c         |   12 +--
 arch/x86/kvm/x86_emulate.c |  158 +++++++++++++++++++++++++++++++++++++++++++--
 include/asm-x86/kvm_host.h |    5 +
 5 files changed, 238 insertions(+), 12 deletions(-)

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Laurent Vivier <laurent.vivier@bull.net>

---

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aaa99ed..e32bf6b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1292,7 +1292,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
 	fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
-	vmcs_write16(GUEST_SS_SELECTOR, 0);
+	if (vcpu->arch.rmode_failed)
+		vmcs_write16(GUEST_SS_SELECTOR, 0);
 	vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
 
 	vmcs_write16(GUEST_CS_SELECTOR,
@@ -2673,6 +2674,69 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int invalid_guest_state(struct kvm_vcpu *vcpu,
+		struct kvm_run *kvm_run, u32 failure_reason)
+{
+	u16 ss, cs;
+	u8 opcodes[4];
+	unsigned long rip = vcpu->arch.rip;
+	unsigned long rip_linear;
+
+	ss = vmcs_read16(GUEST_SS_SELECTOR);
+	cs = vmcs_read16(GUEST_CS_SELECTOR);
+
+	if ((ss & 0x03) != (cs & 0x03)) {
+		int err;
+		rip_linear = rip + vmx_get_segment_base(vcpu, VCPU_SREG_CS);
+		emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+		err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+		switch (err) {
+			case EMULATE_DONE:
+				return 1;
+			case EMULATE_DO_MMIO:
+				printk(KERN_INFO "mmio?\n");
+				return 0;
+			default:
+				/* HACK: If we can not emulate the instruction
+				 * we write a sane value on SS to pass sanity 
+				 * checks. The good thing to do is to emulate the 
+				 * instruction */
+				kvm_report_emulation_failure(vcpu, "vmentry failure");
+				printk(KERN_INFO "   => Quit real mode emulation\n");
+				vcpu->arch.rmode_failed = 1;
+				vmcs_write16(GUEST_SS_SELECTOR, 0);
+				return 1;
+		}
+	}
+
+	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+	kvm_run->hw.hardware_exit_reason = failure_reason;
+	return 0;
+}
+
+static int handle_vmentry_failure(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run,
+				  u32 failure_reason)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	switch (failure_reason) {
+		case EXIT_REASON_INVALID_GUEST_STATE:
+			return invalid_guest_state(vcpu, kvm_run, failure_reason);
+		case EXIT_REASON_MSR_LOADING:
+			printk("VMentry failure caused by MSR entry %ld loading.\n", 
+					exit_qualification);
+			printk("  ... Not handled\n");
+			break;
+		case EXIT_REASON_MACHINE_CHECK:
+			printk("VMentry failure caused by machine check.\n");
+			printk("  ... Not handled\n");
+			break;
+		default:
+			printk("reason not known yet!\n");
+			break;
+	}
+	return 0;
+}
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2735,6 +2799,12 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			exit_reason != EXIT_REASON_EPT_VIOLATION))
 		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
 		       "exit reason is 0x%x\n", __func__, exit_reason);
+
+	if ((exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+		exit_reason &= ~VMX_EXIT_REASONS_FAILED_VMENTRY;
+		return handle_vmentry_failure(vcpu, kvm_run, exit_reason);
+	}
+
 	if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason])
 		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 425a134..966460e 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -239,7 +239,10 @@ enum vmcs_field {
 #define EXIT_REASON_IO_INSTRUCTION      30
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_INVALID_GUEST_STATE 33
+#define EXIT_REASON_MSR_LOADING         34
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_MACHINE_CHECK       41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
 #define EXIT_REASON_EPT_VIOLATION       48
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e537005..e4d6f3b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3016,8 +3016,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }
 
-static void get_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
+void get_segment(struct kvm_vcpu *vcpu,
+		 struct kvm_segment *var, int seg)
 {
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
@@ -3100,8 +3100,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void set_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
+void set_segment(struct kvm_vcpu *vcpu,
+		 struct kvm_segment *var, int seg)
 {
 	kvm_x86_ops->set_segment(vcpu, var, seg);
 }
@@ -3259,8 +3259,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-				   int type_bits, int seg)
+int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+			    int type_bits, int seg)
 {
 	struct kvm_segment kvm_seg;
 
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 8a96320..d6ddd3f 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -138,9 +138,12 @@ static u16 opcode_table[256] = {
 	/* 0x88 - 0x8F */
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	0, ModRM | DstReg, 0, Group | Group1A,
-	/* 0x90 - 0x9F */
-	0, 0, 0, 0, 0, 0, 0, 0,
+	DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
+	DstReg | SrcMem | ModRM | Mov, Group | Group1A,
+	/* 0x90 - 0x97 */
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	/* 0x98 - 0x9F */
 	0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
 	/* 0xA0 - 0xA7 */
 	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -152,7 +155,8 @@ static u16 opcode_table[256] = {
 	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
 	ByteOp | ImplicitOps | String, ImplicitOps | String,
 	/* 0xB0 - 0xBF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xC0 - 0xC7 */
 	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
 	0, ImplicitOps | Stack, 0, 0,
@@ -168,7 +172,7 @@ static u16 opcode_table[256] = {
 	/* 0xE0 - 0xE7 */
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE8 - 0xEF */
-	ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
+	ImplicitOps | Stack, SrcImm | ImplicitOps, ImplicitOps, SrcImmByte | ImplicitOps,
 	0, 0, 0, 0,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
@@ -1342,6 +1346,10 @@ special_insn:
 	switch (c->b) {
 	case 0x00 ... 0x05:
 	      add:		/* add */
+		if ((c->d & ModRM) && c->modrm_mod == 3) {
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+			c->dst.ptr =  decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+		}
 		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0x08 ... 0x0d:
@@ -1514,14 +1522,120 @@ special_insn:
 		break;
 	case 0x88 ... 0x8b:	/* mov */
 		goto mov;
+	case 0x8c: { /* mov r/m, sreg */
+		struct kvm_segment segreg;
+
+		if (c->modrm_mod == 0x3)
+			c->src.val = c->modrm_val;
+
+		switch ( c->modrm_reg ) {
+		case 0:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_ES);
+			break;
+		case 1:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_CS);
+			break;
+		case 2:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_SS);
+			break;
+		case 3:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_DS);
+			break;
+		case 4:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_FS);
+			break;
+		case 5:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_GS);
+			break;
+		default:
+			printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
+					 c->modrm);
+			goto cannot_emulate;
+		}
+		c->dst.val = segreg.selector;
+		c->dst.bytes = 2;
+		c->dst.ptr = (unsigned long *)decode_register(c->modrm_rm, c->regs,
+							      c->d & ByteOp);
+		break;
+	}
 	case 0x8d: /* lea r16/r32, m */
 		c->dst.val = c->modrm_ea;
 		break;
+	case 0x8e: { /* mov seg, r/m16 */
+		uint16_t sel;
+
+		sel = c->src.val;
+		switch ( c->modrm_reg ) {
+		case 0:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_ES) < 0)
+				goto cannot_emulate;
+			break;
+		case 1:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0)
+				goto cannot_emulate;
+			break;
+		case 2:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_SS) < 0)
+				goto cannot_emulate;
+			break;
+		case 3:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_DS) < 0)
+				goto cannot_emulate;
+			break;
+		case 4:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_FS) < 0)
+				goto cannot_emulate;
+			break;
+		case 5:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_GS) < 0)
+				goto cannot_emulate;
+			break;
+		default:
+			printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
+					  c->modrm);
+			goto cannot_emulate;
+		}
+
+		c->dst.type = OP_NONE;  /* Disable writeback. */
+		break;
+	}
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = emulate_grp1a(ctxt, ops);
 		if (rc != 0)
 			goto done;
 		break;
+	case 0x90 ... 0x97: /* xchg ax, r16 / xchg eax, r32 / xchg rax, r64 */
+		c->src.ptr = & c->regs[c->b & 0x7];
+		c->dst.ptr = & c->regs[VCPU_REGS_RAX];
+
+		switch (c->op_bytes) {
+                case 2:
+			c->dst.val = *(u16*) c->dst.ptr;
+			c->src.val = *(u16*) c->src.ptr;
+			*(u16 *) c->dst.ptr = (u16) c->src.val;
+			*(u16 *) c->src.ptr = (u16) c->dst.val;
+			break;
+		case 4:
+			c->dst.val = *(u32*) c->dst.ptr;
+			c->src.val = *(u32*) c->src.ptr;
+			*(u32 *) c->dst.ptr = (u32) c->src.val;
+			*(u32 *) c->src.ptr = (u32) c->dst.val;
+			break;
+		case 8:
+			c->dst.val = *(u64*) c->dst.ptr;
+			c->src.val = *(u64*) c->src.ptr;
+			*(u64 *) c->dst.ptr = (u64) c->src.val;
+			*(u64 *) c->src.ptr = (u64) c->dst.val;
+			break;
+		default:
+			printk("xchg: op_bytes=%d is not supported.\n", c->op_bytes);
+			goto cannot_emulate;
+		}
+		c->dst.type = OP_NONE;  /* Disable writeback. */
+		break;
+	case 0xb8: /* mov r, imm */
+		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX + (c->b & 0x7)];
+		goto mov;
 	case 0x9c: /* pushf */
 		c->src.val =  (unsigned long) ctxt->eflags;
 		emulate_push(ctxt);
@@ -1623,6 +1737,10 @@ special_insn:
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
 	case 0xc0 ... 0xc1:
+		if ((c->d & ModRM) && c->modrm_mod == 3) {
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+			c->dst.ptr =  decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+		}
 		emulate_grp2(ctxt);
 		break;
 	case 0xc3: /* ret */
@@ -1660,6 +1778,36 @@ special_insn:
 		break;
 	}
 	case 0xe9: /* jmp rel */
+		jmp_rel(c, c->src.val);
+		c->dst.type = OP_NONE; /* Disable writeback. */
+		break;
+	case 0xea: /* jmp far */ {
+		uint32_t eip;
+		uint16_t sel;
+
+		switch (c->op_bytes) {
+		case 2:
+			eip = insn_fetch(u16, 2, c->eip);
+			eip = eip & 0x0000FFFF; /* clear upper 16 bits */
+			break;
+		case 4:
+			eip = insn_fetch(u32, 4, c->eip);
+			break;
+		default:
+			DPRINTF("jmp far: Invalid op_bytes\n");
+			goto cannot_emulate;
+		}
+		sel = insn_fetch(u16, 2, c->eip);
+		if (ctxt->mode == X86EMUL_MODE_REAL)
+			eip |= (sel << 4);
+		else if (load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
+			DPRINTF("jmp far: Failed to load CS descriptor\n");
+			goto cannot_emulate;
+		}
+
+		c->eip = eip;
+		break;
+	}
 	case 0xeb: /* jmp rel short */
 		jmp_rel(c, c->src.val);
 		c->dst.type = OP_NONE; /* Disable writeback. */
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index a71f3aa..fb85da4 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -275,6 +275,7 @@ struct kvm_vcpu_arch {
 		} tr, es, ds, fs, gs;
 	} rmode;
 	int halt_request; /* real mode on Intel only */
+	int rmode_failed;
 
 	int cpuid_nent;
 	struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
@@ -498,6 +499,10 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
 		    unsigned long value);
 
+void set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+			    int type_bits, int seg);
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
 
 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2008-05-22 22:52 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-21  9:34 [RFC] Patch - Big real mode emulation Guillaume Thouvenin
2008-05-21 13:59 ` Avi Kivity
2008-05-21 14:10   ` Avi Kivity
2008-05-22  8:55     ` Guillaume Thouvenin
2008-05-21 17:19   ` Marcelo Tosatti
2008-05-21 15:32 ` Mohammed Gamal
2008-05-21 16:18 ` Marcelo Tosatti
2008-05-22  9:02   ` Guillaume Thouvenin
2008-05-21 23:18 ` Kamble, Nitin A
2008-05-22 22:52   ` Mohammed Gamal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox