* [PATCH] x86/xpti: avoid copying L4 page table contents when possible
@ 2018-02-15 12:52 Juergen Gross
2018-02-19 16:59 ` Jan Beulich
[not found] ` <5A8B107302000078001A9407@suse.com>
0 siblings, 2 replies; 7+ messages in thread
From: Juergen Gross @ 2018-02-15 12:52 UTC (permalink / raw)
To: xen-devel; +Cc: Juergen Gross, andrew.cooper3, jbeulich, dfaggioli
For mitigation of Meltdown the current L4 page table is copied to the
cpu local root page table each time a 64 bit pv guest is entered.
Copying can be avoided in cases where the guest L4 page table hasn't
been modified while running the hypervisor, e.g. when handling
interrupts or any hypercall not modifying the L4 page table or %cr3.
So add a per-cpu flag whether the copying should be performed and set
that flag only when loading a new %cr3 or modifying the L4 page table.
This includes synchronization of the cpu local root page table with
other cpus, so add a special synchronization flag for that case.
A simple performance check (compiling the hypervisor via "make -j 4")
in dom0 with 4 vcpus shows a significant improvement:
- real time drops from 112 seconds to 103 seconds
- system time drops from 142 seconds to 131 seconds
Signed-off-by: Juergen Gross <jgross@suse.com>
---
To be applied on top of Jan's "Meltdown band-aid overhead reduction"
series
---
xen/arch/x86/mm.c | 32 +++++++++++++++++++-------------
xen/arch/x86/pv/domain.c | 1 +
xen/arch/x86/smp.c | 2 ++
xen/arch/x86/x86_64/asm-offsets.c | 1 +
xen/arch/x86/x86_64/entry.S | 8 ++++++--
xen/include/asm-x86/current.h | 8 ++++++++
xen/include/asm-x86/flushtlb.h | 2 ++
7 files changed, 39 insertions(+), 15 deletions(-)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 4c9340ca30..5340a49f00 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -510,6 +510,8 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
void write_ptbase(struct vcpu *v)
{
write_cr3(v->arch.cr3);
+ /* Setting copy_l4 unconditionally does no harm. */
+ get_cpu_info()->copy_l4 = true;
}
/*
@@ -3704,18 +3706,22 @@ long do_mmu_update(
break;
rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
- /*
- * No need to sync if all uses of the page can be accounted
- * to the page lock we hold, its pinned status, and uses on
- * this (v)CPU.
- */
- if ( !rc && !cpu_has_no_xpti &&
- ((page->u.inuse.type_info & PGT_count_mask) >
- (1 + !!(page->u.inuse.type_info & PGT_pinned) +
- (pagetable_get_pfn(curr->arch.guest_table) == mfn) +
- (pagetable_get_pfn(curr->arch.guest_table_user) ==
- mfn))) )
- sync_guest = true;
+ if ( !rc && !cpu_has_no_xpti )
+ {
+ get_cpu_info()->copy_l4 = true;
+ /*
+ * No need to sync if all uses of the page can be
+ * accounted to the page lock we hold, its pinned
+ * status, and uses on this (v)CPU.
+ */
+ if ( (page->u.inuse.type_info & PGT_count_mask) >
+ (1 + !!(page->u.inuse.type_info & PGT_pinned) +
+ (pagetable_get_pfn(curr->arch.guest_table) ==
+ mfn) +
+ (pagetable_get_pfn(curr->arch.guest_table_user) ==
+ mfn)) )
+ sync_guest = true;
+ }
break;
case PGT_writable_page:
@@ -3830,7 +3836,7 @@ long do_mmu_update(
cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
if ( !cpumask_empty(mask) )
- flush_mask(mask, FLUSH_TLB_GLOBAL);
+ flush_mask(mask, FLUSH_TLB_GLOBAL | XPTI_L4_UPDATE);
}
perfc_add(num_page_updates, i);
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 868a23fd7e..4e6d29e76f 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -238,6 +238,7 @@ static void _toggle_guest_pt(struct vcpu *v, bool force_cr3)
/* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
+ get_cpu_info()->copy_l4 = true;
if ( !(v->arch.flags & TF_kernel_mode) )
return;
diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
index 033dd05958..6e3c8904fb 100644
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -207,6 +207,8 @@ void invalidate_interrupt(struct cpu_user_regs *regs)
unsigned int flags = flush_flags;
ack_APIC_irq();
perfc_incr(ipis);
+ if ( flags & XPTI_L4_UPDATE )
+ get_cpu_info()->copy_l4 = true;
if ( (flags & FLUSH_VCPU_STATE) && __sync_local_execstate() )
flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL);
if ( flags & ~(FLUSH_VCPU_STATE | FLUSH_ORDER_MASK) )
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index e925e6589c..362fd1cbd6 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -144,6 +144,7 @@ void __dummy__(void)
OFFSET(CPUINFO_shadow_spec_ctrl, struct cpu_info, shadow_spec_ctrl);
OFFSET(CPUINFO_use_shadow_spec_ctrl, struct cpu_info, use_shadow_spec_ctrl);
OFFSET(CPUINFO_bti_ist_info, struct cpu_info, bti_ist_info);
+ OFFSET(CPUINFO_copy_l4, struct cpu_info, copy_l4);
DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info));
BLANK();
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index b955cbd43d..764bf792cb 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -46,10 +46,13 @@ restore_all_guest:
.Lrag_cr3_start:
mov VCPU_cr3(%rbx), %r9
GET_STACK_END(dx)
- mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
+ mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax
+ cmpb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
+ je .Lrag_copyend
+ movb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
movabs $PADDR_MASK & PAGE_MASK, %rsi
movabs $DIRECTMAP_VIRT_START, %rcx
- mov %rdi, %rax
+ mov %rax, %rdi
and %rsi, %rdi
and %r9, %rsi
add %rcx, %rdi
@@ -65,6 +68,7 @@ restore_all_guest:
sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
rep movsq
+.Lrag_copyend:
mov STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi
mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
mov %rdi, %rsi
diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
index 4678a0fcf5..23957206c3 100644
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -59,6 +59,14 @@ struct cpu_info {
bool use_shadow_spec_ctrl;
uint8_t bti_ist_info;
+ /*
+ * The following field controls copying of the L4 page table of 64-bit
+ * PV guests to the per-cpu root page table on entering the guest context.
+ * If set the L4 page table is being copied to the root page table and
+ * the field will be reset.
+ */
+ bool copy_l4;
+
unsigned long __pad;
/* get_stack_bottom() must be 16-byte aligned */
};
diff --git a/xen/include/asm-x86/flushtlb.h b/xen/include/asm-x86/flushtlb.h
index 2cade9cbfb..bd0493fba0 100644
--- a/xen/include/asm-x86/flushtlb.h
+++ b/xen/include/asm-x86/flushtlb.h
@@ -103,6 +103,8 @@ void write_cr3(unsigned long cr3);
#define FLUSH_VA_VALID 0x800
/* Flush CPU state */
#define FLUSH_VCPU_STATE 0x1000
+ /* Update XPTI root page table */
+#define XPTI_L4_UPDATE 0x2000
/* Flush local TLBs/caches. */
unsigned int flush_area_local(const void *va, unsigned int flags);
--
2.13.6
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] x86/xpti: avoid copying L4 page table contents when possible
2018-02-15 12:52 [PATCH] x86/xpti: avoid copying L4 page table contents when possible Juergen Gross
@ 2018-02-19 16:59 ` Jan Beulich
[not found] ` <5A8B107302000078001A9407@suse.com>
1 sibling, 0 replies; 7+ messages in thread
From: Jan Beulich @ 2018-02-19 16:59 UTC (permalink / raw)
To: Juergen Gross; +Cc: andrew.cooper3, Dario Faggioli, xen-devel
>>> On 15.02.18 at 13:52, <jgross@suse.com> wrote:
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -510,6 +510,8 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
> void write_ptbase(struct vcpu *v)
> {
> write_cr3(v->arch.cr3);
> + /* Setting copy_l4 unconditionally does no harm. */
> + get_cpu_info()->copy_l4 = true;
To limit code churn when 5-level page tables get introduced, can
you please avoid using l4 when you really mean the top level page
table (irrespective of how many levels there are)? I'm also not
convinced of using "copy" in the field name - you want to indicate
that the page table changed, yet what action the consumer of the
flag takes doesn't really matter elsewhere. What about
"root_pgt_changed"?
Additionally - why would you set this for a 32-bit vCPU? Further,
what about clearing the flag when context switching out a PV
vCPU? I realize both are benign with just the single current
consumer plus the fact that the flag will always be set when
context switching in a (64-bit) PV vCPU, but the value of the
flag could be misleading during debugging, and could become an
actual problem if another consumer appeared.
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -46,10 +46,13 @@ restore_all_guest:
> .Lrag_cr3_start:
> mov VCPU_cr3(%rbx), %r9
> GET_STACK_END(dx)
> - mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
> + mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax
> + cmpb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
> + je .Lrag_copyend
> + movb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
Use %bl instead of $0 in both cases (with a comment)?
> @@ -65,6 +68,7 @@ restore_all_guest:
> sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
> ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
> rep movsq
> +.Lrag_copyend:
.Lrag_copy_end (or .Lrag_copy_done)
> --- a/xen/include/asm-x86/flushtlb.h
> +++ b/xen/include/asm-x86/flushtlb.h
> @@ -103,6 +103,8 @@ void write_cr3(unsigned long cr3);
> #define FLUSH_VA_VALID 0x800
> /* Flush CPU state */
> #define FLUSH_VCPU_STATE 0x1000
> + /* Update XPTI root page table */
> +#define XPTI_L4_UPDATE 0x2000
Please keep this in line with its siblings, i.e. have it have a FLUSH_
prefix.
Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] x86/xpti: avoid copying L4 page table contents when possible
[not found] ` <5A8B107302000078001A9407@suse.com>
@ 2018-02-19 17:19 ` Juergen Gross
2018-02-20 8:56 ` Jan Beulich
[not found] ` <5A8BF0BE02000078001A9781@suse.com>
0 siblings, 2 replies; 7+ messages in thread
From: Juergen Gross @ 2018-02-19 17:19 UTC (permalink / raw)
To: Jan Beulich; +Cc: andrew.cooper3, xen-devel, Dario Faggioli
On 19/02/18 17:59, Jan Beulich wrote:
>>>> On 15.02.18 at 13:52, <jgross@suse.com> wrote:
>> --- a/xen/arch/x86/mm.c
>> +++ b/xen/arch/x86/mm.c
>> @@ -510,6 +510,8 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
>> void write_ptbase(struct vcpu *v)
>> {
>> write_cr3(v->arch.cr3);
>> + /* Setting copy_l4 unconditionally does no harm. */
>> + get_cpu_info()->copy_l4 = true;
>
> To limit code churn when 5-level page tables get introduced, can
> you please avoid using l4 when you really mean the top level page
> table (irrespective of how many levels there are)? I'm also not
> convinced of using "copy" in the field name - you want to indicate
> that the page table changed, yet what action the consumer of the
> flag takes doesn't really matter elsewhere. What about
> "root_pgt_changed"?
Works for me.
> Additionally - why would you set this for a 32-bit vCPU? Further,
> what about clearing the flag when context switching out a PV
> vCPU? I realize both are benign with just the single current
> consumer plus the fact that the flag will always be set when
> context switching in a (64-bit) PV vCPU, but the value of the
> flag could be misleading during debugging, and could become an
> actual problem if another consumer appeared.
In fact I have another patch pending which requires to special
case the 64-bit PV domain here, resulting in setting the bit
for those doamins only. I can move that special casing into this
patch and add resetting the flag for other domains.
>
>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -46,10 +46,13 @@ restore_all_guest:
>> .Lrag_cr3_start:
>> mov VCPU_cr3(%rbx), %r9
>> GET_STACK_END(dx)
>> - mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
>> + mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax
>> + cmpb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>> + je .Lrag_copyend
>> + movb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>
> Use %bl instead of $0 in both cases (with a comment)?
Do you really think we should add such a micro optimization relying
on the vcpu pointer being aligned to page boundary? And are you sure
adding the register dependency isn't hurting more than we will gain
from saving an instruction byte?
>
>> @@ -65,6 +68,7 @@ restore_all_guest:
>> sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
>> ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
>> rep movsq
>> +.Lrag_copyend:
>
> .Lrag_copy_end (or .Lrag_copy_done)
Okay.
>
>> --- a/xen/include/asm-x86/flushtlb.h
>> +++ b/xen/include/asm-x86/flushtlb.h
>> @@ -103,6 +103,8 @@ void write_cr3(unsigned long cr3);
>> #define FLUSH_VA_VALID 0x800
>> /* Flush CPU state */
>> #define FLUSH_VCPU_STATE 0x1000
>> + /* Update XPTI root page table */
>> +#define XPTI_L4_UPDATE 0x2000
>
> Please keep this in line with its siblings, i.e. have it have a FLUSH_
> prefix.
Okay.
Juergen
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] x86/xpti: avoid copying L4 page table contents when possible
2018-02-19 17:19 ` Juergen Gross
@ 2018-02-20 8:56 ` Jan Beulich
[not found] ` <5A8BF0BE02000078001A9781@suse.com>
1 sibling, 0 replies; 7+ messages in thread
From: Jan Beulich @ 2018-02-20 8:56 UTC (permalink / raw)
To: Juergen Gross; +Cc: andrew.cooper3, Dario Faggioli, xen-devel
>>> On 19.02.18 at 18:19, <jgross@suse.com> wrote:
> On 19/02/18 17:59, Jan Beulich wrote:
>>>>> On 15.02.18 at 13:52, <jgross@suse.com> wrote:
>>> --- a/xen/arch/x86/x86_64/entry.S
>>> +++ b/xen/arch/x86/x86_64/entry.S
>>> @@ -46,10 +46,13 @@ restore_all_guest:
>>> .Lrag_cr3_start:
>>> mov VCPU_cr3(%rbx), %r9
>>> GET_STACK_END(dx)
>>> - mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
>>> + mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax
>>> + cmpb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>>> + je .Lrag_copyend
>>> + movb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>>
>> Use %bl instead of $0 in both cases (with a comment)?
>
> Do you really think we should add such a micro optimization relying
> on the vcpu pointer being aligned to page boundary? And are you sure
> adding the register dependency isn't hurting more than we will gain
> from saving an instruction byte?
Well, both of what you say are the precise reasons why I've
asked a question rather than asking for the change to be done
unconditionally. I'm not really worried about the register
dependency - %rbx is loaded sufficiently much earlier. And
the struct-vcpu-is-aligned aspect is why I did suggest adding
a comment; I assume you realize we can't easily break that
alignment, hence I see no significant risk here. But I'm not
going to insist, and I'm open for further counter arguments.
Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] x86/xpti: avoid copying L4 page table contents when possible
[not found] ` <5A8BF0BE02000078001A9781@suse.com>
@ 2018-02-20 9:03 ` Juergen Gross
2018-02-20 9:10 ` Jan Beulich
[not found] ` <5A8BF40302000078001A97C1@suse.com>
0 siblings, 2 replies; 7+ messages in thread
From: Juergen Gross @ 2018-02-20 9:03 UTC (permalink / raw)
To: Jan Beulich; +Cc: andrew.cooper3, xen-devel, Dario Faggioli
On 20/02/18 09:56, Jan Beulich wrote:
>>>> On 19.02.18 at 18:19, <jgross@suse.com> wrote:
>> On 19/02/18 17:59, Jan Beulich wrote:
>>>>>> On 15.02.18 at 13:52, <jgross@suse.com> wrote:
>>>> --- a/xen/arch/x86/x86_64/entry.S
>>>> +++ b/xen/arch/x86/x86_64/entry.S
>>>> @@ -46,10 +46,13 @@ restore_all_guest:
>>>> .Lrag_cr3_start:
>>>> mov VCPU_cr3(%rbx), %r9
>>>> GET_STACK_END(dx)
>>>> - mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
>>>> + mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax
>>>> + cmpb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>>>> + je .Lrag_copyend
>>>> + movb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>>>
>>> Use %bl instead of $0 in both cases (with a comment)?
>>
>> Do you really think we should add such a micro optimization relying
>> on the vcpu pointer being aligned to page boundary? And are you sure
>> adding the register dependency isn't hurting more than we will gain
>> from saving an instruction byte?
>
> Well, both of what you say are the precise reasons why I've
> asked a question rather than asking for the change to be done
> unconditionally. I'm not really worried about the register
> dependency - %rbx is loaded sufficiently much earlier. And
> the struct-vcpu-is-aligned aspect is why I did suggest adding
> a comment; I assume you realize we can't easily break that
> alignment, hence I see no significant risk here. But I'm not
> going to insist, and I'm open for further counter arguments.
What about the following idea: instead of special casing this single
instance of replacing $0 with a register we could e.g. replace all
%rbx uses by %r13 and set %rbx to 0 globally. This would enable us
to replace _all_ $0 uses by %rbx/%ebx/%bx/%bl.
Juergen
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] x86/xpti: avoid copying L4 page table contents when possible
2018-02-20 9:03 ` Juergen Gross
@ 2018-02-20 9:10 ` Jan Beulich
[not found] ` <5A8BF40302000078001A97C1@suse.com>
1 sibling, 0 replies; 7+ messages in thread
From: Jan Beulich @ 2018-02-20 9:10 UTC (permalink / raw)
To: Juergen Gross; +Cc: andrew.cooper3, Dario Faggioli, xen-devel
>>> On 20.02.18 at 10:03, <jgross@suse.com> wrote:
> On 20/02/18 09:56, Jan Beulich wrote:
>>>>> On 19.02.18 at 18:19, <jgross@suse.com> wrote:
>>> On 19/02/18 17:59, Jan Beulich wrote:
>>>>>>> On 15.02.18 at 13:52, <jgross@suse.com> wrote:
>>>>> --- a/xen/arch/x86/x86_64/entry.S
>>>>> +++ b/xen/arch/x86/x86_64/entry.S
>>>>> @@ -46,10 +46,13 @@ restore_all_guest:
>>>>> .Lrag_cr3_start:
>>>>> mov VCPU_cr3(%rbx), %r9
>>>>> GET_STACK_END(dx)
>>>>> - mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
>>>>> + mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax
>>>>> + cmpb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>>>>> + je .Lrag_copyend
>>>>> + movb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>>>>
>>>> Use %bl instead of $0 in both cases (with a comment)?
>>>
>>> Do you really think we should add such a micro optimization relying
>>> on the vcpu pointer being aligned to page boundary? And are you sure
>>> adding the register dependency isn't hurting more than we will gain
>>> from saving an instruction byte?
>>
>> Well, both of what you say are the precise reasons why I've
>> asked a question rather than asking for the change to be done
>> unconditionally. I'm not really worried about the register
>> dependency - %rbx is loaded sufficiently much earlier. And
>> the struct-vcpu-is-aligned aspect is why I did suggest adding
>> a comment; I assume you realize we can't easily break that
>> alignment, hence I see no significant risk here. But I'm not
>> going to insist, and I'm open for further counter arguments.
>
> What about the following idea: instead of special casing this single
> instance of replacing $0 with a register we could e.g. replace all
> %rbx uses by %r13 and set %rbx to 0 globally. This would enable us
> to replace _all_ $0 uses by %rbx/%ebx/%bx/%bl.
I dislike this, both because use of %r13 will require a REX prefix
even on insns that currently don't need one, and because I
dislike wasting a register just to hold zero. It's not like we have
that many callee-saved registers left, now that we had to start
using some of them beyond %rbx and %rbp.
Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] x86/xpti: avoid copying L4 page table contents when possible
[not found] ` <5A8BF40302000078001A97C1@suse.com>
@ 2018-02-20 9:22 ` Juergen Gross
0 siblings, 0 replies; 7+ messages in thread
From: Juergen Gross @ 2018-02-20 9:22 UTC (permalink / raw)
To: Jan Beulich; +Cc: andrew.cooper3, xen-devel, Dario Faggioli
On 20/02/18 10:10, Jan Beulich wrote:
>>>> On 20.02.18 at 10:03, <jgross@suse.com> wrote:
>> On 20/02/18 09:56, Jan Beulich wrote:
>>>>>> On 19.02.18 at 18:19, <jgross@suse.com> wrote:
>>>> On 19/02/18 17:59, Jan Beulich wrote:
>>>>>>>> On 15.02.18 at 13:52, <jgross@suse.com> wrote:
>>>>>> --- a/xen/arch/x86/x86_64/entry.S
>>>>>> +++ b/xen/arch/x86/x86_64/entry.S
>>>>>> @@ -46,10 +46,13 @@ restore_all_guest:
>>>>>> .Lrag_cr3_start:
>>>>>> mov VCPU_cr3(%rbx), %r9
>>>>>> GET_STACK_END(dx)
>>>>>> - mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
>>>>>> + mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax
>>>>>> + cmpb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>>>>>> + je .Lrag_copyend
>>>>>> + movb $0, STACK_CPUINFO_FIELD(copy_l4)(%rdx)
>>>>>
>>>>> Use %bl instead of $0 in both cases (with a comment)?
>>>>
>>>> Do you really think we should add such a micro optimization relying
>>>> on the vcpu pointer being aligned to page boundary? And are you sure
>>>> adding the register dependency isn't hurting more than we will gain
>>>> from saving an instruction byte?
>>>
>>> Well, both of what you say are the precise reasons why I've
>>> asked a question rather than asking for the change to be done
>>> unconditionally. I'm not really worried about the register
>>> dependency - %rbx is loaded sufficiently much earlier. And
>>> the struct-vcpu-is-aligned aspect is why I did suggest adding
>>> a comment; I assume you realize we can't easily break that
>>> alignment, hence I see no significant risk here. But I'm not
>>> going to insist, and I'm open for further counter arguments.
>>
>> What about the following idea: instead of special casing this single
>> instance of replacing $0 with a register we could e.g. replace all
>> %rbx uses by %r13 and set %rbx to 0 globally. This would enable us
>> to replace _all_ $0 uses by %rbx/%ebx/%bx/%bl.
>
> I dislike this, both because use of %r13 will require a REX prefix
> even on insns that currently don't need one, and because I
> dislike wasting a register just to hold zero. It's no like we have
> that many callee-saved registers left, now that we had to start
> using some of them beyond %rbx and %rbp.
Okay, fair enough.
Nevertheless I'd like to defer replacing $0 by %bl to another patch,
especially as there are multiple other places where this could be
done (e.g. in test_events).
Juergen
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2018-02-20 9:22 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-02-15 12:52 [PATCH] x86/xpti: avoid copying L4 page table contents when possible Juergen Gross
2018-02-19 16:59 ` Jan Beulich
[not found] ` <5A8B107302000078001A9407@suse.com>
2018-02-19 17:19 ` Juergen Gross
2018-02-20 8:56 ` Jan Beulich
[not found] ` <5A8BF0BE02000078001A9781@suse.com>
2018-02-20 9:03 ` Juergen Gross
2018-02-20 9:10 ` Jan Beulich
[not found] ` <5A8BF40302000078001A97C1@suse.com>
2018-02-20 9:22 ` Juergen Gross
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).