All of lore.kernel.org
 help / color / mirror / Atom feed
From: Avi Kivity <avi@qumranet.com>
To: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Cc: kvm@vger.kernel.org, Anthony Liguori <anthony@codemonkey.ws>,
	Mohammed Gamal <m.gamal005@gmail.com>,
	"Kamble, Nitin A" <nitin.a.kamble@intel.com>,
	Alexander Graf <alex@csgraf.de>,
	Marcelo Tosatti <mtosatti@redhat.com>
Subject: Re: [RFC] Patch - Big real mode emulation
Date: Wed, 21 May 2008 16:59:46 +0300	[thread overview]
Message-ID: <48342AD2.40406@qumranet.com> (raw)
In-Reply-To: <20080521113410.43ec182f@frecb000711.frec.bull.fr>

Guillaume Thouvenin wrote:
> Hello,
>
>  Here is a patch that allows to boot OpenSuse-10.3. The problem with
> Opensuse 10.3 is it uses a version of gfxboot that reads SS after
> switching from real to protected mode, where SS contains an invalid
> value, which VMX does not allow. 

Good to see progress on this issue.

> So this patch 
>
>  1) removes the code that writes sane value in SS in order to detect VM
> entry failure due to CS.RPL != SS.RPL
>  2) adds an handler to catch the VMentry failure
>
>  The handler calls instruction's emulator and to boot opensuse we need
> to emulate the following instructions:
>
> 	ljmp   $0x18,$0x6e18
> 	mov    $0x20,%ax
> 	mov    %eax,%ds
> 	mov    %ss,%eax
> 	and    $0xffff,%esp
> 	shl    $0x4,%eax
> 	add    %eax,%esp
> 	mov    $0x8,%ax
> 	mov    %eax,%ss	
> 	-> At this point CS.RPL == SS.RPL
>
>  There is an issue with the patch. When removing the SS patching we see
> other problems. So to be able to still boot distribution that was
> already able to boot we added a hack that allows to modify SS_SELECTOR
> (as it was done previously) when emulation failed. The future solution
> will be to emulate instruction that need to be emulated.
>   

Which instructions are still problematic?
> index e537005..e4d6f3b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3016,8 +3016,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
>  	return 0;
>  }
>  
> -static void get_segment(struct kvm_vcpu *vcpu,
> -			struct kvm_segment *var, int seg)
> +void get_segment(struct kvm_vcpu *vcpu,
> +		 struct kvm_segment *var, int seg)
>  {
>  	kvm_x86_ops->get_segment(vcpu, var, seg);
>  }
> @@ -3100,8 +3100,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>  
> -static void set_segment(struct kvm_vcpu *vcpu,
> -			struct kvm_segment *var, int seg)
> +void set_segment(struct kvm_vcpu *vcpu,
> +		 struct kvm_segment *var, int seg)
>  {
>  	kvm_x86_ops->set_segment(vcpu, var, seg);
>  }
> @@ -3259,8 +3259,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>  
> -static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
> -				   int type_bits, int seg)
> +int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
> +			    int type_bits, int seg)
>  {
>  	struct kvm_segment kvm_seg;
>   

These names need to be prefixed with kvm_, since they're much too generic.

> @@ -1342,6 +1346,10 @@ special_insn:
>  	switch (c->b) {
>  	case 0x00 ... 0x05:
>  	      add:		/* add */
> +		if ((c->d & ModRM) && c->modrm_mod == 3) {
> +			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
> +			c->dst.ptr =  decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
> +		}
>   

This is not specific to the add instruction. We really need to switch 
decode_modrm() to decode into a struct operand.

>  		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
>  		break;
>  	case 0x08 ... 0x0d:
> @@ -1514,14 +1522,120 @@ special_insn:
>  		break;
>  	case 0x88 ... 0x8b:	/* mov */
>  		goto mov;
> +	case 0x8c: { /* mov r/m, sreg */
> +		struct kvm_segment segreg;
> +
> +		if (c->modrm_mod == 0x3)
> +			c->src.val = c->modrm_val;
> +
> +		switch ( c->modrm_reg ) {
> +		case 0:
> +			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_ES);
> +			break;
> +		case 1:
> +			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_CS);
> +			break;
> +		case 2:
> +			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_SS);
> +			break;
> +		case 3:
> +			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_DS);
> +			break;
> +		case 4:
> +			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_FS);
> +			break;
> +		case 5:
> +			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_GS);
> +			break;
> +		default:
>   

A look up table would be nice. Or better, reorder VCPU_SREG_xx to 
reflect the natural order.

One can even imagine a SrcSreg and DstSreg decoding flags.

> +			printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
> +					 c->modrm);
> +			goto cannot_emulate;
> +		}
> +		c->dst.val = segreg.selector;
> +		c->dst.bytes = 2;
> +		c->dst.ptr = (unsigned long *)decode_register(c->modrm_rm, c->regs,
> +							      c->d & ByteOp);
> +		break;
> +	}
>  	case 0x8d: /* lea r16/r32, m */
>  		c->dst.val = c->modrm_ea;
>  		break;
> +	case 0x8e: { /* mov seg, r/m16 */
> +		uint16_t sel;
> +
> +		sel = c->src.val;
> +		switch ( c->modrm_reg ) {
> +		case 0:
> +			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_ES) < 0)
> +				goto cannot_emulate;
> +			break;
> +		case 1:
> +			if (load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0)
> +				goto cannot_emulate;
> +			break;
> +		case 2:
> +			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_SS) < 0)
> +				goto cannot_emulate;
> +			break;
> +		case 3:
> +			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_DS) < 0)
> +				goto cannot_emulate;
> +			break;
> +		case 4:
> +			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_FS) < 0)
> +				goto cannot_emulate;
> +			break;
> +		case 5:
> +			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_GS) < 0)
> +				goto cannot_emulate;
> +			break;
>   

Ditto.

> +		default:
> +			printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
> +					  c->modrm);
> +			goto cannot_emulate;
> +		}
> +
> +		c->dst.type = OP_NONE;  /* Disable writeback. */
> +		break;
> +	}
>  	case 0x8f:		/* pop (sole member of Grp1a) */
>  		rc = emulate_grp1a(ctxt, ops);
>  		if (rc != 0)
>  			goto done;
>  		break;
> +	case 0x90 ... 0x97: /* xchg ax, r16 / xchg eax, r32 / xchg rax, r64 */
>   

0x90 is a special case, nop (it's different from xchg %eax, %eax!)

> +		c->src.ptr = & c->regs[c->b & 0x7];
> +		c->dst.ptr = & c->regs[VCPU_REGS_RAX];
> +
> +		switch (c->op_bytes) {
> +                case 2:
> +			c->dst.val = *(u16*) c->dst.ptr;
> +			c->src.val = *(u16*) c->src.ptr;
> +			*(u16 *) c->dst.ptr = (u16) c->src.val;
> +			*(u16 *) c->src.ptr = (u16) c->dst.val;
>   

You only need to write back the source pointer. The destination pointer 
will be written back by the writeback code.

> +			break;
> +		case 4:
> +			c->dst.val = *(u32*) c->dst.ptr;
> +			c->src.val = *(u32*) c->src.ptr;
> +			*(u32 *) c->dst.ptr = (u32) c->src.val;
> +			*(u32 *) c->src.ptr = (u32) c->dst.val;
>   

Needs to be (unsigned long) for the destination, since x86_64 sign extends.

> +			break;
> +		case 8:
> +			c->dst.val = *(u64*) c->dst.ptr;
> +			c->src.val = *(u64*) c->src.ptr;
> +			*(u64 *) c->dst.ptr = (u64) c->src.val;
> +			*(u64 *) c->src.ptr = (u64) c->dst.val;
>   

On 32-bit registers are not u64 (the code will never be executed, but 
still).

> +			break;
> +		default:
> +			printk("xchg: op_bytes=%d is not supported.\n", c->op_bytes);
> +			goto cannot_emulate;
> +		}
> +		c->dst.type = OP_NONE;  /* Disable writeback. */
> +		break;
> +	case 0xb8: /* mov r, imm */
>   

0xb8 .. 0xbf. Use DstReg instead of decoding yourself.

> +		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX + (c->b & 0x7)];
> +		goto mov;
>  	case 0x9c: /* pushf */
>  		c->src.val =  (unsigned long) ctxt->eflags;
>  		emulate_push(ctxt);
> @@ -1623,6 +1737,10 @@ special_insn:
>  		DPRINTF("Urk! I don't handle SCAS.\n");
>  		goto cannot_emulate;
>  	case 0xc0 ... 0xc1:
> +		if ((c->d & ModRM) && c->modrm_mod == 3) {
> +			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
> +			c->dst.ptr =  decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
> +		}
>   

Again, fix decode_modrm instead.

>  		emulate_grp2(ctxt);
>  		break;
>  	case 0xc3: /* ret */
> @@ -1660,6 +1778,36 @@ special_insn:
>  		break;
>  	}
>  	case 0xe9: /* jmp rel */
> +		jmp_rel(c, c->src.val);
> +		c->dst.type = OP_NONE; /* Disable writeback. */
> +		break;
> +	case 0xea: /* jmp far */ {
> +		uint32_t eip;
> +		uint16_t sel;
> +
> +		switch (c->op_bytes) {
> +		case 2:
> +			eip = insn_fetch(u16, 2, c->eip);
> +			eip = eip & 0x0000FFFF; /* clear upper 16 bits */
>   

No need, insn_fetch will not sign extend.

> +			break;
> +		case 4:
> +			eip = insn_fetch(u32, 4, c->eip);
> +			break;
> +		default:
> +			DPRINTF("jmp far: Invalid op_bytes\n");
> +			goto cannot_emulate;
> +		}
> +		sel = insn_fetch(u16, 2, c->eip);
> +		if (ctxt->mode == X86EMUL_MODE_REAL)
> +			eip |= (sel << 4);
>   

No, eip doesn't change.

> +		else if (load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
> +			DPRINTF("jmp far: Failed to load CS descriptor\n");
> +			goto cannot_emulate;
> +		}
> +
> +		c->eip = eip;
> +		break;
> +	}
>  	case 0xeb: /* jmp rel short */
>  		jmp_rel(c, c->src.val);
>  		c->dst.type = OP_NONE; /* Disable writeback. */
> diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
> index a71f3aa..fb85da4 100644

The patch should be split into many parts (enhancing emulator 
infrastructure, one patch per insn, vmx changes), but in essence it does 
the right things.


-- 
error compiling committee.c: too many arguments to function


  reply	other threads:[~2008-05-21 13:59 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-05-21  9:34 [RFC] Patch - Big real mode emulation Guillaume Thouvenin
2008-05-21 13:59 ` Avi Kivity [this message]
2008-05-21 14:10   ` Avi Kivity
2008-05-22  8:55     ` Guillaume Thouvenin
2008-05-21 17:19   ` Marcelo Tosatti
2008-05-21 15:32 ` Mohammed Gamal
2008-05-21 16:18 ` Marcelo Tosatti
2008-05-22  9:02   ` Guillaume Thouvenin
2008-05-21 23:18 ` Kamble, Nitin A
2008-05-22 22:52   ` Mohammed Gamal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=48342AD2.40406@qumranet.com \
    --to=avi@qumranet.com \
    --cc=alex@csgraf.de \
    --cc=anthony@codemonkey.ws \
    --cc=guillaume.thouvenin@ext.bull.net \
    --cc=kvm@vger.kernel.org \
    --cc=m.gamal005@gmail.com \
    --cc=mtosatti@redhat.com \
    --cc=nitin.a.kamble@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.