* [PATCH 1/3] x86: fix FS/GS base handling when using the fsgsbase feature
2014-02-05 14:48 [PATCH 0/3] guest context management adjustments Jan Beulich
@ 2014-02-05 14:52 ` Jan Beulich
2014-02-05 15:50 ` Andrew Cooper
2014-02-05 14:54 ` [PATCH 2/3] domctl: also pause domain for extended context updates Jan Beulich
` (3 subsequent siblings)
4 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2014-02-05 14:52 UTC (permalink / raw)
To: xen-devel; +Cc: George Dunlap, Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 5075 bytes --]
In that case, due to the respective instructions not being privileged,
we can't rely on our in-memory data to always be correct: While the
guest is running, it may change without us knowing about it. Therefore
we need to
- read the correct values from hardware during context switch out
(save_segments())
- read the correct values from hardware during RDMSR emulation
- update in-memory values during guest mode change
(toggle_guest_mode())
For completeness/consistency, WRMSR emulation is also being switched
to use wr[fg]sbase().
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1251,6 +1251,15 @@ static void save_segments(struct vcpu *v
regs->fs = read_segment_register(fs);
regs->gs = read_segment_register(gs);
+ if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) )
+ {
+ v->arch.pv_vcpu.fs_base = __rdfsbase();
+ if ( v->arch.flags & TF_kernel_mode )
+ v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
+ else
+ v->arch.pv_vcpu.gs_base_user = __rdgsbase();
+ }
+
if ( regs->ds )
dirty_segment_mask |= DIRTY_DS;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2382,15 +2382,13 @@ static int emulate_privileged_op(struct
case MSR_FS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
- if ( wrmsr_safe(MSR_FS_BASE, msr_content) )
- goto fail;
+ wrfsbase(msr_content);
v->arch.pv_vcpu.fs_base = msr_content;
break;
case MSR_GS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
- if ( wrmsr_safe(MSR_GS_BASE, msr_content) )
- goto fail;
+ wrgsbase(msr_content);
v->arch.pv_vcpu.gs_base_kernel = msr_content;
break;
case MSR_SHADOW_GS_BASE:
@@ -2535,15 +2533,14 @@ static int emulate_privileged_op(struct
case MSR_FS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
- regs->eax = v->arch.pv_vcpu.fs_base & 0xFFFFFFFFUL;
- regs->edx = v->arch.pv_vcpu.fs_base >> 32;
- break;
+ val = cpu_has_fsgsbase ? __rdfsbase() : v->arch.pv_vcpu.fs_base;
+ goto rdmsr_writeback;
case MSR_GS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
- regs->eax = v->arch.pv_vcpu.gs_base_kernel & 0xFFFFFFFFUL;
- regs->edx = v->arch.pv_vcpu.gs_base_kernel >> 32;
- break;
+ val = cpu_has_fsgsbase ? __rdgsbase()
+ : v->arch.pv_vcpu.gs_base_kernel;
+ goto rdmsr_writeback;
case MSR_SHADOW_GS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -257,6 +257,13 @@ void toggle_guest_mode(struct vcpu *v)
{
if ( is_pv_32bit_vcpu(v) )
return;
+ if ( cpu_has_fsgsbase )
+ {
+ if ( v->arch.flags & TF_kernel_mode )
+ v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
+ else
+ v->arch.pv_vcpu.gs_base_user = __rdgsbase();
+ }
v->arch.flags ^= TF_kernel_mode;
asm volatile ( "swapgs" );
update_cr3(v);
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -98,34 +98,52 @@ static inline int wrmsr_safe(unsigned in
: "=a" (low), "=d" (high) \
: "c" (counter))
-static inline unsigned long rdfsbase(void)
+static inline unsigned long __rdfsbase(void)
{
unsigned long base;
- if ( cpu_has_fsgsbase )
#ifdef HAVE_GAS_FSGSBASE
- asm volatile ( "rdfsbase %0" : "=r" (base) );
+ asm volatile ( "rdfsbase %0" : "=r" (base) );
#else
- asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc0" : "=a" (base) );
+ asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc0" : "=a" (base) );
#endif
- else
- rdmsrl(MSR_FS_BASE, base);
return base;
}
-static inline unsigned long rdgsbase(void)
+static inline unsigned long __rdgsbase(void)
{
unsigned long base;
- if ( cpu_has_fsgsbase )
#ifdef HAVE_GAS_FSGSBASE
- asm volatile ( "rdgsbase %0" : "=r" (base) );
+ asm volatile ( "rdgsbase %0" : "=r" (base) );
#else
- asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8" : "=a" (base) );
+ asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8" : "=a" (base) );
#endif
- else
- rdmsrl(MSR_GS_BASE, base);
+
+ return base;
+}
+
+static inline unsigned long rdfsbase(void)
+{
+ unsigned long base;
+
+ if ( cpu_has_fsgsbase )
+ return __rdfsbase();
+
+ rdmsrl(MSR_FS_BASE, base);
+
+ return base;
+}
+
+static inline unsigned long rdgsbase(void)
+{
+ unsigned long base;
+
+ if ( cpu_has_fsgsbase )
+ return __rdgsbase();
+
+ rdmsrl(MSR_GS_BASE, base);
return base;
}
[-- Attachment #2: x86-pv-fsgsbase-rw.patch --]
[-- Type: text/plain, Size: 5135 bytes --]
x86: fix FS/GS base handling when using the fsgsbase feature
In that case, due to the respective instructions not being privileged,
we can't rely on our in-memory data to always be correct: While the
guest is running, it may change without us knowing about it. Therefore
we need to
- read the correct values from hardware during context switch out
(save_segments())
- read the correct values from hardware during RDMSR emulation
- update in-memory values during guest mode change
(toggle_guest_mode())
For completeness/consistency, WRMSR emulation is also being switched
to use wr[fg]sbase().
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1251,6 +1251,15 @@ static void save_segments(struct vcpu *v
regs->fs = read_segment_register(fs);
regs->gs = read_segment_register(gs);
+ if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) )
+ {
+ v->arch.pv_vcpu.fs_base = __rdfsbase();
+ if ( v->arch.flags & TF_kernel_mode )
+ v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
+ else
+ v->arch.pv_vcpu.gs_base_user = __rdgsbase();
+ }
+
if ( regs->ds )
dirty_segment_mask |= DIRTY_DS;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2382,15 +2382,13 @@ static int emulate_privileged_op(struct
case MSR_FS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
- if ( wrmsr_safe(MSR_FS_BASE, msr_content) )
- goto fail;
+ wrfsbase(msr_content);
v->arch.pv_vcpu.fs_base = msr_content;
break;
case MSR_GS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
- if ( wrmsr_safe(MSR_GS_BASE, msr_content) )
- goto fail;
+ wrgsbase(msr_content);
v->arch.pv_vcpu.gs_base_kernel = msr_content;
break;
case MSR_SHADOW_GS_BASE:
@@ -2535,15 +2533,14 @@ static int emulate_privileged_op(struct
case MSR_FS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
- regs->eax = v->arch.pv_vcpu.fs_base & 0xFFFFFFFFUL;
- regs->edx = v->arch.pv_vcpu.fs_base >> 32;
- break;
+ val = cpu_has_fsgsbase ? __rdfsbase() : v->arch.pv_vcpu.fs_base;
+ goto rdmsr_writeback;
case MSR_GS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
- regs->eax = v->arch.pv_vcpu.gs_base_kernel & 0xFFFFFFFFUL;
- regs->edx = v->arch.pv_vcpu.gs_base_kernel >> 32;
- break;
+ val = cpu_has_fsgsbase ? __rdgsbase()
+ : v->arch.pv_vcpu.gs_base_kernel;
+ goto rdmsr_writeback;
case MSR_SHADOW_GS_BASE:
if ( is_pv_32on64_vcpu(v) )
goto fail;
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -257,6 +257,13 @@ void toggle_guest_mode(struct vcpu *v)
{
if ( is_pv_32bit_vcpu(v) )
return;
+ if ( cpu_has_fsgsbase )
+ {
+ if ( v->arch.flags & TF_kernel_mode )
+ v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
+ else
+ v->arch.pv_vcpu.gs_base_user = __rdgsbase();
+ }
v->arch.flags ^= TF_kernel_mode;
asm volatile ( "swapgs" );
update_cr3(v);
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -98,34 +98,52 @@ static inline int wrmsr_safe(unsigned in
: "=a" (low), "=d" (high) \
: "c" (counter))
-static inline unsigned long rdfsbase(void)
+static inline unsigned long __rdfsbase(void)
{
unsigned long base;
- if ( cpu_has_fsgsbase )
#ifdef HAVE_GAS_FSGSBASE
- asm volatile ( "rdfsbase %0" : "=r" (base) );
+ asm volatile ( "rdfsbase %0" : "=r" (base) );
#else
- asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc0" : "=a" (base) );
+ asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc0" : "=a" (base) );
#endif
- else
- rdmsrl(MSR_FS_BASE, base);
return base;
}
-static inline unsigned long rdgsbase(void)
+static inline unsigned long __rdgsbase(void)
{
unsigned long base;
- if ( cpu_has_fsgsbase )
#ifdef HAVE_GAS_FSGSBASE
- asm volatile ( "rdgsbase %0" : "=r" (base) );
+ asm volatile ( "rdgsbase %0" : "=r" (base) );
#else
- asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8" : "=a" (base) );
+ asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8" : "=a" (base) );
#endif
- else
- rdmsrl(MSR_GS_BASE, base);
+
+ return base;
+}
+
+static inline unsigned long rdfsbase(void)
+{
+ unsigned long base;
+
+ if ( cpu_has_fsgsbase )
+ return __rdfsbase();
+
+ rdmsrl(MSR_FS_BASE, base);
+
+ return base;
+}
+
+static inline unsigned long rdgsbase(void)
+{
+ unsigned long base;
+
+ if ( cpu_has_fsgsbase )
+ return __rdgsbase();
+
+ rdmsrl(MSR_GS_BASE, base);
return base;
}
[-- Attachment #3: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 12+ messages in thread* [PATCH 2/3] domctl: also pause domain for extended context updates
2014-02-05 14:48 [PATCH 0/3] guest context management adjustments Jan Beulich
2014-02-05 14:52 ` [PATCH 1/3] x86: fix FS/GS base handling when using the fsgsbase feature Jan Beulich
@ 2014-02-05 14:54 ` Jan Beulich
2014-02-05 15:05 ` Andrew Cooper
2014-02-05 14:54 ` [PATCH 3/3] domctl: pause vCPU for context reads Jan Beulich
` (2 subsequent siblings)
4 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2014-02-05 14:54 UTC (permalink / raw)
To: xen-devel; +Cc: George Dunlap, Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 2216 bytes --]
This is not just for consistency with "base" context updates, but
actually needed so that guest side accesses can't race with control
domain side updates.
This would have been a security issue if XSA-77 hadn't waived them on
the affected domctl operation.
While looking at the code I also spotted a redundant NULL check in the
"base" context update handling code, which is being removed.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -853,6 +853,8 @@ long arch_do_domctl(
}
else
{
+ if ( d == current->domain ) /* no domain_pause() */
+ break;
ret = -EINVAL;
if ( evc->size < offsetof(typeof(*evc), vmce) )
break;
@@ -861,6 +863,7 @@ long arch_do_domctl(
if ( !is_canonical_address(evc->sysenter_callback_eip) ||
!is_canonical_address(evc->syscall32_callback_eip) )
break;
+ domain_pause(d);
fixup_guest_code_selector(d, evc->sysenter_callback_cs);
v->arch.pv_vcpu.sysenter_callback_cs =
evc->sysenter_callback_cs;
@@ -881,6 +884,8 @@ long arch_do_domctl(
(evc->syscall32_callback_cs & ~3) ||
evc->syscall32_callback_eip )
break;
+ else
+ domain_pause(d);
BUILD_BUG_ON(offsetof(struct xen_domctl_ext_vcpucontext,
mcg_cap) !=
@@ -899,6 +904,8 @@ long arch_do_domctl(
}
else
ret = 0;
+
+ domain_unpause(d);
}
}
break;
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -334,10 +334,6 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
unsigned int vcpu = op->u.vcpucontext.vcpu;
struct vcpu *v;
- ret = -ESRCH;
- if ( d == NULL )
- break;
-
ret = -EINVAL;
if ( (d == current->domain) || /* no domain_pause() */
(vcpu >= d->max_vcpus) || ((v = d->vcpu[vcpu]) == NULL) )
[-- Attachment #2: vcpu-set-context-pause.patch --]
[-- Type: text/plain, Size: 2270 bytes --]
domctl: also pause domain for "extended" context updates
This is not just for consistency with "base" context updates, but
actually needed so that guest side accesses can't race with control
domain side updates.
This would have been a security issue if XSA-77 hadn't waived them on
the affected domctl operation.
While looking at the code I also spotted a redundant NULL check in the
"base" context update handling code, which is being removed.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -853,6 +853,8 @@ long arch_do_domctl(
}
else
{
+ if ( d == current->domain ) /* no domain_pause() */
+ break;
ret = -EINVAL;
if ( evc->size < offsetof(typeof(*evc), vmce) )
break;
@@ -861,6 +863,7 @@ long arch_do_domctl(
if ( !is_canonical_address(evc->sysenter_callback_eip) ||
!is_canonical_address(evc->syscall32_callback_eip) )
break;
+ domain_pause(d);
fixup_guest_code_selector(d, evc->sysenter_callback_cs);
v->arch.pv_vcpu.sysenter_callback_cs =
evc->sysenter_callback_cs;
@@ -881,6 +884,8 @@ long arch_do_domctl(
(evc->syscall32_callback_cs & ~3) ||
evc->syscall32_callback_eip )
break;
+ else
+ domain_pause(d);
BUILD_BUG_ON(offsetof(struct xen_domctl_ext_vcpucontext,
mcg_cap) !=
@@ -899,6 +904,8 @@ long arch_do_domctl(
}
else
ret = 0;
+
+ domain_unpause(d);
}
}
break;
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -334,10 +334,6 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
unsigned int vcpu = op->u.vcpucontext.vcpu;
struct vcpu *v;
- ret = -ESRCH;
- if ( d == NULL )
- break;
-
ret = -EINVAL;
if ( (d == current->domain) || /* no domain_pause() */
(vcpu >= d->max_vcpus) || ((v = d->vcpu[vcpu]) == NULL) )
[-- Attachment #3: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 12+ messages in thread* [PATCH 3/3] domctl: pause vCPU for context reads
2014-02-05 14:48 [PATCH 0/3] guest context management adjustments Jan Beulich
2014-02-05 14:52 ` [PATCH 1/3] x86: fix FS/GS base handling when using the fsgsbase feature Jan Beulich
2014-02-05 14:54 ` [PATCH 2/3] domctl: also pause domain for extended context updates Jan Beulich
@ 2014-02-05 14:54 ` Jan Beulich
2014-02-05 15:29 ` Andrew Cooper
2014-02-05 14:55 ` [PATCH 0/3] guest context management adjustments Jan Beulich
2014-02-05 15:35 ` Keir Fraser
4 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2014-02-05 14:54 UTC (permalink / raw)
To: xen-devel; +Cc: George Dunlap, Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 2238 bytes --]
"Base" context reads already paused the subject vCPU when being the
current one, but that special case isn't being properly dealt with
anyway (at the very least when x86's fsgsbase feature is in use), so
just disallow it.
"Extended" context reads so far didn't do any pausing.
While we can't avoid the reported data being stale by the time it
arrives at the caller, this way we at least guarantee that it is
consistent.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -819,7 +819,13 @@ long arch_do_domctl(
if ( domctl->cmd == XEN_DOMCTL_get_ext_vcpucontext )
{
+ if ( v == current ) /* no vcpu_pause() */
+ break;
+
evc->size = sizeof(*evc);
+
+ vcpu_pause(v);
+
if ( is_pv_domain(d) )
{
evc->sysenter_callback_cs =
@@ -849,6 +855,7 @@ long arch_do_domctl(
evc->vmce.mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2;
ret = 0;
+ vcpu_unpause(v);
copyback = 1;
}
else
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -675,11 +675,9 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
struct vcpu *v;
ret = -EINVAL;
- if ( op->u.vcpucontext.vcpu >= d->max_vcpus )
- goto getvcpucontext_out;
-
- ret = -ESRCH;
- if ( (v = d->vcpu[op->u.vcpucontext.vcpu]) == NULL )
+ if ( op->u.vcpucontext.vcpu >= d->max_vcpus ||
+ (v = d->vcpu[op->u.vcpucontext.vcpu]) == NULL ||
+ v == current ) /* no vcpu_pause() */
goto getvcpucontext_out;
ret = -ENODATA;
@@ -694,14 +692,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
if ( (c.nat = xmalloc(struct vcpu_guest_context)) == NULL )
goto getvcpucontext_out;
- if ( v != current )
- vcpu_pause(v);
+ vcpu_pause(v);
arch_get_info_guest(v, c);
ret = 0;
- if ( v != current )
- vcpu_unpause(v);
+ vcpu_unpause(v);
#ifdef CONFIG_COMPAT
if ( !is_pv_32on64_vcpu(v) )
[-- Attachment #2: vcpu-get-context-pause.patch --]
[-- Type: text/plain, Size: 2272 bytes --]
domctl: pause vCPU for context reads
"Base" context reads already paused the subject vCPU when being the
current one, but that special case isn't being properly dealt with
anyway (at the very least when x86's fsgsbase feature is in use), so
just disallow it.
"Extended" context reads so far didn't do any pausing.
While we can't avoid the reported data being stale by the time it
arrives at the caller, this way we at least guarantee that it is
consistent.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -819,7 +819,13 @@ long arch_do_domctl(
if ( domctl->cmd == XEN_DOMCTL_get_ext_vcpucontext )
{
+ if ( v == current ) /* no vcpu_pause() */
+ break;
+
evc->size = sizeof(*evc);
+
+ vcpu_pause(v);
+
if ( is_pv_domain(d) )
{
evc->sysenter_callback_cs =
@@ -849,6 +855,7 @@ long arch_do_domctl(
evc->vmce.mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2;
ret = 0;
+ vcpu_unpause(v);
copyback = 1;
}
else
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -675,11 +675,9 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
struct vcpu *v;
ret = -EINVAL;
- if ( op->u.vcpucontext.vcpu >= d->max_vcpus )
- goto getvcpucontext_out;
-
- ret = -ESRCH;
- if ( (v = d->vcpu[op->u.vcpucontext.vcpu]) == NULL )
+ if ( op->u.vcpucontext.vcpu >= d->max_vcpus ||
+ (v = d->vcpu[op->u.vcpucontext.vcpu]) == NULL ||
+ v == current ) /* no vcpu_pause() */
goto getvcpucontext_out;
ret = -ENODATA;
@@ -694,14 +692,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
if ( (c.nat = xmalloc(struct vcpu_guest_context)) == NULL )
goto getvcpucontext_out;
- if ( v != current )
- vcpu_pause(v);
+ vcpu_pause(v);
arch_get_info_guest(v, c);
ret = 0;
- if ( v != current )
- vcpu_unpause(v);
+ vcpu_unpause(v);
#ifdef CONFIG_COMPAT
if ( !is_pv_32on64_vcpu(v) )
[-- Attachment #3: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 12+ messages in thread