* [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
@ 2024-05-13 11:10 ` Elias El Yandouzi
2024-05-13 15:27 ` Roger Pau Monné
2024-05-13 11:11 ` [PATCH V3 02/19] x86/pv: Domheap pages should be mapped while relocating initrd Elias El Yandouzi
` (18 subsequent siblings)
19 siblings, 1 reply; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:10 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
Create a per-domain mapping of PV guest_root_pt as direct map is being
removed.
Note that we do not map and unmap root_pgt for now since it is still a
xenheap page.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V3:
* Rename SHADOW_ROOT
* Haven't addressed the potentially over-allocation issue as I don't get it
Changes in V2:
* Rework the shadow perdomain mapping solution in the follow-up patches
Changes since Hongyan's version:
* Remove the final dot in the commit title
diff --git a/xen/arch/x86/include/asm/config.h b/xen/arch/x86/include/asm/config.h
index ab7288cb36..5d710384df 100644
--- a/xen/arch/x86/include/asm/config.h
+++ b/xen/arch/x86/include/asm/config.h
@@ -203,7 +203,7 @@ extern unsigned char boot_edid_info[128];
/* Slot 260: per-domain mappings (including map cache). */
#define PERDOMAIN_VIRT_START (PML4_ADDR(260))
#define PERDOMAIN_SLOT_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
-#define PERDOMAIN_SLOTS 3
+#define PERDOMAIN_SLOTS 4
#define PERDOMAIN_VIRT_SLOT(s) (PERDOMAIN_VIRT_START + (s) * \
(PERDOMAIN_SLOT_MBYTES << 20))
/* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
@@ -317,6 +317,14 @@ extern unsigned long xen_phys_start;
#define ARG_XLAT_START(v) \
(ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
+/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
+#define PV_ROOT_PT_MAPPING_VIRT_START PERDOMAIN_VIRT_SLOT(3)
+#define PV_ROOT_PT_MAPPING_ENTRIES MAX_VIRT_CPUS
+
+/* The address of a particular VCPU's PV_ROOT_PT */
+#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
+ (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
+
#define ELFSIZE 64
#define ARCH_CRASH_SAVE_VMCOREINFO
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index f5daeb182b..8a97530607 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -272,6 +272,7 @@ struct time_scale {
struct pv_domain
{
l1_pgentry_t **gdt_ldt_l1tab;
+ l1_pgentry_t **root_pt_l1tab;
atomic_t nr_l4_pages;
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index d968bbbc73..efdf20f775 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -505,6 +505,13 @@ void share_xen_page_with_guest(struct page_info *page, struct domain *d,
nrspin_unlock(&d->page_alloc_lock);
}
+#define pv_root_pt_idx(v) \
+ ((v)->vcpu_id >> PAGETABLE_ORDER)
+
+#define pv_root_pt_pte(v) \
+ ((v)->domain->arch.pv.root_pt_l1tab[pv_root_pt_idx(v)] + \
+ ((v)->vcpu_id & (L1_PAGETABLE_ENTRIES - 1)))
+
void make_cr3(struct vcpu *v, mfn_t mfn)
{
struct domain *d = v->domain;
@@ -524,6 +531,13 @@ void write_ptbase(struct vcpu *v)
if ( is_pv_vcpu(v) && v->domain->arch.pv.xpti )
{
+ mfn_t guest_root_pt = _mfn(MASK_EXTR(v->arch.cr3, PAGE_MASK));
+ l1_pgentry_t *pte = pv_root_pt_pte(v);
+
+ ASSERT(v == current);
+
+ l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RO));
+
cpu_info->root_pgt_changed = true;
cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
if ( new_cr4 & X86_CR4_PCIDE )
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 2a445bb17b..1b025986f7 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -288,6 +288,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
1U << GDT_LDT_VCPU_SHIFT);
}
+static int pv_create_root_pt_l1tab(struct vcpu *v)
+{
+ return create_perdomain_mapping(v->domain,
+ PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
+ 1, v->domain->arch.pv.root_pt_l1tab,
+ NULL);
+}
+
+static void pv_destroy_root_pt_l1tab(struct vcpu *v)
+
+{
+ destroy_perdomain_mapping(v->domain,
+ PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v), 1);
+}
+
void pv_vcpu_destroy(struct vcpu *v)
{
if ( is_pv_32bit_vcpu(v) )
@@ -297,6 +312,7 @@ void pv_vcpu_destroy(struct vcpu *v)
}
pv_destroy_gdt_ldt_l1tab(v);
+ pv_destroy_root_pt_l1tab(v);
XFREE(v->arch.pv.trap_ctxt);
}
@@ -311,6 +327,13 @@ int pv_vcpu_initialise(struct vcpu *v)
if ( rc )
return rc;
+ if ( v->domain->arch.pv.xpti )
+ {
+ rc = pv_create_root_pt_l1tab(v);
+ if ( rc )
+ goto done;
+ }
+
BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
PAGE_SIZE);
v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
@@ -346,10 +369,12 @@ void pv_domain_destroy(struct domain *d)
destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
+ destroy_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START, PV_ROOT_PT_MAPPING_ENTRIES);
XFREE(d->arch.pv.cpuidmasks);
FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
+ FREE_XENHEAP_PAGE(d->arch.pv.root_pt_l1tab);
}
void noreturn cf_check continue_pv_domain(void);
@@ -371,6 +396,12 @@ int pv_domain_initialise(struct domain *d)
goto fail;
clear_page(d->arch.pv.gdt_ldt_l1tab);
+ d->arch.pv.root_pt_l1tab =
+ alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
+ if ( !d->arch.pv.root_pt_l1tab )
+ goto fail;
+ clear_page(d->arch.pv.root_pt_l1tab);
+
if ( levelling_caps & ~LCAP_faulting &&
(d->arch.pv.cpuidmasks = xmemdup(&cpuidmask_defaults)) == NULL )
goto fail;
@@ -381,6 +412,11 @@ int pv_domain_initialise(struct domain *d)
if ( rc )
goto fail;
+ rc = create_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START,
+ PV_ROOT_PT_MAPPING_ENTRIES, NULL, NULL);
+ if ( rc )
+ goto fail;
+
d->arch.ctxt_switch = &pv_csw;
d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index 630bdc3945..c1ae5013af 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -80,6 +80,7 @@ void __dummy__(void)
#undef OFFSET_EF
+ OFFSET(VCPU_id, struct vcpu, vcpu_id);
OFFSET(VCPU_processor, struct vcpu, processor);
OFFSET(VCPU_domain, struct vcpu, domain);
OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index df015589ce..c1377da7a5 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -162,7 +162,15 @@ FUNC_LOCAL(restore_all_guest)
and %rsi, %rdi
and %r9, %rsi
add %rcx, %rdi
+
+ /*
+ * The address in the vCPU cr3 is always mapped in the per-domain
+ * pv_root_pt virt area.
+ */
+ imul $PAGE_SIZE, VCPU_id(%rbx), %esi
+ movabs $PV_ROOT_PT_MAPPING_VIRT_START, %rcx
add %rcx, %rsi
+
mov $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx
mov root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8
mov %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi)
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt
2024-05-13 11:10 ` [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt Elias El Yandouzi
@ 2024-05-13 15:27 ` Roger Pau Monné
2024-05-14 8:03 ` Jan Beulich
2024-05-14 17:15 ` Elias El Yandouzi
0 siblings, 2 replies; 26+ messages in thread
From: Roger Pau Monné @ 2024-05-13 15:27 UTC (permalink / raw)
To: Elias El Yandouzi
Cc: xen-devel, julien, pdurrant, dwmw, Hongyan Xia, Julien Grall
On Mon, May 13, 2024 at 11:10:59AM +0000, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
>
> Create a per-domain mapping of PV guest_root_pt as direct map is being
> removed.
>
> Note that we do not map and unmap root_pgt for now since it is still a
> xenheap page.
I'm afraid this needs more context, at least for me to properly
understand.
I think I've figured out what create_perdomain_mapping() is supposed
to do, and I have to admit the interface is very awkward.
Anyway, attempted to provide some feedback.
>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
>
> ----
> Changes in V3:
> * Rename SHADOW_ROOT
> * Haven't addressed the potentially over-allocation issue as I don't get it
>
> Changes in V2:
> * Rework the shadow perdomain mapping solution in the follow-up patches
>
> Changes since Hongyan's version:
> * Remove the final dot in the commit title
>
> diff --git a/xen/arch/x86/include/asm/config.h b/xen/arch/x86/include/asm/config.h
> index ab7288cb36..5d710384df 100644
> --- a/xen/arch/x86/include/asm/config.h
> +++ b/xen/arch/x86/include/asm/config.h
> @@ -203,7 +203,7 @@ extern unsigned char boot_edid_info[128];
> /* Slot 260: per-domain mappings (including map cache). */
> #define PERDOMAIN_VIRT_START (PML4_ADDR(260))
> #define PERDOMAIN_SLOT_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
> -#define PERDOMAIN_SLOTS 3
> +#define PERDOMAIN_SLOTS 4
> #define PERDOMAIN_VIRT_SLOT(s) (PERDOMAIN_VIRT_START + (s) * \
> (PERDOMAIN_SLOT_MBYTES << 20))
> /* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
> @@ -317,6 +317,14 @@ extern unsigned long xen_phys_start;
> #define ARG_XLAT_START(v) \
> (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
>
> +/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
> +#define PV_ROOT_PT_MAPPING_VIRT_START PERDOMAIN_VIRT_SLOT(3)
> +#define PV_ROOT_PT_MAPPING_ENTRIES MAX_VIRT_CPUS
> +
> +/* The address of a particular VCPU's PV_ROOT_PT */
> +#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
> + (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
I know we are not there yet, but I wonder if we need to start having
some non-shared per-cpu mapping area in the page-tables. Right now
this is shared between all the vCPUs, as it's per-domain space
(instead of per-vCPU).
> +
> #define ELFSIZE 64
>
> #define ARCH_CRASH_SAVE_VMCOREINFO
> diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
> index f5daeb182b..8a97530607 100644
> --- a/xen/arch/x86/include/asm/domain.h
> +++ b/xen/arch/x86/include/asm/domain.h
> @@ -272,6 +272,7 @@ struct time_scale {
> struct pv_domain
> {
> l1_pgentry_t **gdt_ldt_l1tab;
> + l1_pgentry_t **root_pt_l1tab;
>
> atomic_t nr_l4_pages;
>
> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
> index d968bbbc73..efdf20f775 100644
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -505,6 +505,13 @@ void share_xen_page_with_guest(struct page_info *page, struct domain *d,
> nrspin_unlock(&d->page_alloc_lock);
> }
>
> +#define pv_root_pt_idx(v) \
> + ((v)->vcpu_id >> PAGETABLE_ORDER)
> +
> +#define pv_root_pt_pte(v) \
> + ((v)->domain->arch.pv.root_pt_l1tab[pv_root_pt_idx(v)] + \
> + ((v)->vcpu_id & (L1_PAGETABLE_ENTRIES - 1)))
> +
> void make_cr3(struct vcpu *v, mfn_t mfn)
> {
> struct domain *d = v->domain;
> @@ -524,6 +531,13 @@ void write_ptbase(struct vcpu *v)
>
> if ( is_pv_vcpu(v) && v->domain->arch.pv.xpti )
> {
> + mfn_t guest_root_pt = _mfn(MASK_EXTR(v->arch.cr3, PAGE_MASK));
> + l1_pgentry_t *pte = pv_root_pt_pte(v);
> +
> + ASSERT(v == current);
> +
> + l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RO));
> +
> cpu_info->root_pgt_changed = true;
> cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
> if ( new_cr4 & X86_CR4_PCIDE )
> diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
> index 2a445bb17b..1b025986f7 100644
> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -288,6 +288,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
> 1U << GDT_LDT_VCPU_SHIFT);
> }
>
> +static int pv_create_root_pt_l1tab(struct vcpu *v)
> +{
> + return create_perdomain_mapping(v->domain,
> + PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
> + 1, v->domain->arch.pv.root_pt_l1tab,
> + NULL);
> +}
> +
> +static void pv_destroy_root_pt_l1tab(struct vcpu *v)
The two 'v' parameters could be const here.
> +
> +{
> + destroy_perdomain_mapping(v->domain,
> + PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v), 1);
> +}
> +
> void pv_vcpu_destroy(struct vcpu *v)
> {
> if ( is_pv_32bit_vcpu(v) )
> @@ -297,6 +312,7 @@ void pv_vcpu_destroy(struct vcpu *v)
> }
>
> pv_destroy_gdt_ldt_l1tab(v);
> + pv_destroy_root_pt_l1tab(v);
> XFREE(v->arch.pv.trap_ctxt);
> }
>
> @@ -311,6 +327,13 @@ int pv_vcpu_initialise(struct vcpu *v)
> if ( rc )
> return rc;
>
> + if ( v->domain->arch.pv.xpti )
> + {
> + rc = pv_create_root_pt_l1tab(v);
> + if ( rc )
> + goto done;
> + }
> +
> BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
> PAGE_SIZE);
> v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
> @@ -346,10 +369,12 @@ void pv_domain_destroy(struct domain *d)
>
> destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
> GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
> + destroy_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START, PV_ROOT_PT_MAPPING_ENTRIES);
Line too long. Also see comment below about using d->max_vcpus
instead of MAX_VIRT_CPUS.
>
> XFREE(d->arch.pv.cpuidmasks);
>
> FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
> + FREE_XENHEAP_PAGE(d->arch.pv.root_pt_l1tab);
> }
>
> void noreturn cf_check continue_pv_domain(void);
> @@ -371,6 +396,12 @@ int pv_domain_initialise(struct domain *d)
> goto fail;
> clear_page(d->arch.pv.gdt_ldt_l1tab);
>
> + d->arch.pv.root_pt_l1tab =
> + alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
> + if ( !d->arch.pv.root_pt_l1tab )
> + goto fail;
> + clear_page(d->arch.pv.root_pt_l1tab);
> +
> if ( levelling_caps & ~LCAP_faulting &&
> (d->arch.pv.cpuidmasks = xmemdup(&cpuidmask_defaults)) == NULL )
> goto fail;
> @@ -381,6 +412,11 @@ int pv_domain_initialise(struct domain *d)
> if ( rc )
> goto fail;
>
> + rc = create_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START,
> + PV_ROOT_PT_MAPPING_ENTRIES, NULL, NULL);
Why not use d->max_vcpus here, instead of forcing up to MAX_VIRT_CPUS?
By the time pv_domain_initialise() is called max_vcpus should already
be initialized. AFAICT it doesn't make a difference, because for the
call here only the L3 table is created, as last two parameters are
NULL, but still is more accurate to use max_vcpus.
> + if ( rc )
> + goto fail;
> +
> d->arch.ctxt_switch = &pv_csw;
>
> d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;
> diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
> index 630bdc3945..c1ae5013af 100644
> --- a/xen/arch/x86/x86_64/asm-offsets.c
> +++ b/xen/arch/x86/x86_64/asm-offsets.c
> @@ -80,6 +80,7 @@ void __dummy__(void)
>
> #undef OFFSET_EF
>
> + OFFSET(VCPU_id, struct vcpu, vcpu_id);
> OFFSET(VCPU_processor, struct vcpu, processor);
> OFFSET(VCPU_domain, struct vcpu, domain);
> OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> index df015589ce..c1377da7a5 100644
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -162,7 +162,15 @@ FUNC_LOCAL(restore_all_guest)
> and %rsi, %rdi
> and %r9, %rsi
> add %rcx, %rdi
> +
> + /*
> + * The address in the vCPU cr3 is always mapped in the per-domain
> + * pv_root_pt virt area.
> + */
> + imul $PAGE_SIZE, VCPU_id(%rbx), %esi
Aren't some of the previous operations against %rsi now useless since
it gets unconditionally overwritten here?
and %r9, %rsi
[...]
add %rcx, %rsi
Thanks, Roger.
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt
2024-05-13 15:27 ` Roger Pau Monné
@ 2024-05-14 8:03 ` Jan Beulich
2024-05-14 15:46 ` Alejandro Vallejo
2024-05-14 17:15 ` Elias El Yandouzi
1 sibling, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2024-05-14 8:03 UTC (permalink / raw)
To: Roger Pau Monné
Cc: xen-devel, julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
On 13.05.2024 17:27, Roger Pau Monné wrote:
> On Mon, May 13, 2024 at 11:10:59AM +0000, Elias El Yandouzi wrote:
>> @@ -317,6 +317,14 @@ extern unsigned long xen_phys_start;
>> #define ARG_XLAT_START(v) \
>> (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
>>
>> +/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
>> +#define PV_ROOT_PT_MAPPING_VIRT_START PERDOMAIN_VIRT_SLOT(3)
>> +#define PV_ROOT_PT_MAPPING_ENTRIES MAX_VIRT_CPUS
>> +
>> +/* The address of a particular VCPU's PV_ROOT_PT */
>> +#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
>> + (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
>
> I know we are not there yet, but I wonder if we need to start having
> some non-shared per-cpu mapping area in the page-tables. Right now
> this is shared between all the vCPUs, as it's per-domain space
> (instead of per-vCPU).
In turn requiring per-vCPU page tables, posing a problem when a guest
uses the same page tables for multiple vCPU-s.
Jan
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt
2024-05-14 8:03 ` Jan Beulich
@ 2024-05-14 15:46 ` Alejandro Vallejo
0 siblings, 0 replies; 26+ messages in thread
From: Alejandro Vallejo @ 2024-05-14 15:46 UTC (permalink / raw)
To: Jan Beulich, Roger Pau Monné
Cc: xen-devel, julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
On 14/05/2024 09:03, Jan Beulich wrote:
> On 13.05.2024 17:27, Roger Pau Monné wrote:
>> On Mon, May 13, 2024 at 11:10:59AM +0000, Elias El Yandouzi wrote:
>>> @@ -317,6 +317,14 @@ extern unsigned long xen_phys_start;
>>> #define ARG_XLAT_START(v) \
>>> (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
>>>
>>> +/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
>>> +#define PV_ROOT_PT_MAPPING_VIRT_START PERDOMAIN_VIRT_SLOT(3)
>>> +#define PV_ROOT_PT_MAPPING_ENTRIES MAX_VIRT_CPUS
>>> +
>>> +/* The address of a particular VCPU's PV_ROOT_PT */
>>> +#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
>>> + (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
>>
>> I know we are not there yet, but I wonder if we need to start having
>> some non-shared per-cpu mapping area in the page-tables. Right now
>> this is shared between all the vCPUs, as it's per-domain space
>> (instead of per-vCPU).
>
> In turn requiring per-vCPU page tables, posing a problem when a guest
> uses the same page tables for multiple vCPU-s.
>
> Jan
>
True. Having separate page tables per CPU is an unavoidable end goal for
a hypervisor claiming to hold no secrets, however. Otherwise any CPU can
still speculatively read the stacks of other CPUs and take well-timed
glances over mappings set transiently by any other CPU.
Cheers,
Alejandro
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt
2024-05-13 15:27 ` Roger Pau Monné
2024-05-14 8:03 ` Jan Beulich
@ 2024-05-14 17:15 ` Elias El Yandouzi
2024-05-15 9:05 ` Roger Pau Monné
1 sibling, 1 reply; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-14 17:15 UTC (permalink / raw)
To: Roger Pau Monné; +Cc: xen-devel, julien, pdurrant, dwmw, Julien Grall
Hi Roger,
On 13/05/2024 16:27, Roger Pau Monné wrote:
>> diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
>> index 2a445bb17b..1b025986f7 100644
>> --- a/xen/arch/x86/pv/domain.c
>> +++ b/xen/arch/x86/pv/domain.c
>> @@ -288,6 +288,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
>> 1U << GDT_LDT_VCPU_SHIFT);
>> }
>>
>> +static int pv_create_root_pt_l1tab(struct vcpu *v)
>> +{
>> + return create_perdomain_mapping(v->domain,
>> + PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
>> + 1, v->domain->arch.pv.root_pt_l1tab,
>> + NULL);
>> +}
>> +
>> +static void pv_destroy_root_pt_l1tab(struct vcpu *v)
>
> The two 'v' parameters could be const here.
I could constify the parameters but the functions wouldn't be consistent
with the two above for gdt/ldt.
>> @@ -381,6 +412,11 @@ int pv_domain_initialise(struct domain *d)
>> if ( rc )
>> goto fail;
>>
>> + rc = create_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START,
>> + PV_ROOT_PT_MAPPING_ENTRIES, NULL, NULL);
>
> Why not use d->max_vcpus here, instead of forcing up to MAX_VIRT_CPUS?
>
> By the time pv_domain_initialise() is called max_vcpus should already
> be initialized. AFAICT it doesn't make a difference, because for the
> call here only the L3 table is created, as last two parameters are
> NULL, but still is more accurate to use max_vcpus.
There is no particular reason. I think we can safely use d->max_vcpus.
>> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
>> index df015589ce..c1377da7a5 100644
>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -162,7 +162,15 @@ FUNC_LOCAL(restore_all_guest)
>> and %rsi, %rdi
>> and %r9, %rsi
>> add %rcx, %rdi
>> +
>> + /*
>> + * The address in the vCPU cr3 is always mapped in the per-domain
>> + * pv_root_pt virt area.
>> + */
>> + imul $PAGE_SIZE, VCPU_id(%rbx), %esi
>
> Aren't some of the previous operations against %rsi now useless since
> it gets unconditionally overwritten here?
I think I can just get rid off of:
and %r9, %rsi
> and %r9, %rsi
> [...]
> add %rcx, %rsi
The second operation you suggested is actually used to retrieve the VA
of the PV_ROOT_PT.
Cheers,
Elias
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt
2024-05-14 17:15 ` Elias El Yandouzi
@ 2024-05-15 9:05 ` Roger Pau Monné
0 siblings, 0 replies; 26+ messages in thread
From: Roger Pau Monné @ 2024-05-15 9:05 UTC (permalink / raw)
To: Elias El Yandouzi; +Cc: xen-devel, julien, pdurrant, dwmw, Julien Grall
On Tue, May 14, 2024 at 06:15:57PM +0100, Elias El Yandouzi wrote:
> Hi Roger,
>
> On 13/05/2024 16:27, Roger Pau Monné wrote:
> > > diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
> > > index 2a445bb17b..1b025986f7 100644
> > > --- a/xen/arch/x86/pv/domain.c
> > > +++ b/xen/arch/x86/pv/domain.c
> > > @@ -288,6 +288,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
> > > 1U << GDT_LDT_VCPU_SHIFT);
> > > }
> > > +static int pv_create_root_pt_l1tab(struct vcpu *v)
> > > +{
> > > + return create_perdomain_mapping(v->domain,
> > > + PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
> > > + 1, v->domain->arch.pv.root_pt_l1tab,
> > > + NULL);
> > > +}
> > > +
> > > +static void pv_destroy_root_pt_l1tab(struct vcpu *v)
> >
> > The two 'v' parameters could be const here.
>
> I could constify the parameters but the functions wouldn't be consistent
> with the two above for gdt/ldt.
The fact they are not const for the other helpers would also need
fixing at some point IMO, it's best if those are already using the
correct type.
> > > diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> > > index df015589ce..c1377da7a5 100644
> > > --- a/xen/arch/x86/x86_64/entry.S
> > > +++ b/xen/arch/x86/x86_64/entry.S
> > > @@ -162,7 +162,15 @@ FUNC_LOCAL(restore_all_guest)
> > > and %rsi, %rdi
> > > and %r9, %rsi
> > > add %rcx, %rdi
> > > +
> > > + /*
> > > + * The address in the vCPU cr3 is always mapped in the per-domain
> > > + * pv_root_pt virt area.
> > > + */
> > > + imul $PAGE_SIZE, VCPU_id(%rbx), %esi
> >
> > Aren't some of the previous operations against %rsi now useless since
> > it gets unconditionally overwritten here?
>
> I think I can just get rid off of:
>
> and %r9, %rsi
>
> > and %r9, %rsi
> > [...]
> > add %rcx, %rsi
>
> The second operation you suggested is actually used to retrieve the VA of
> the PV_ROOT_PT.
Oh, yes, sorry, got confused when looking at the source file together
with the diff, it's only the `and` that can be removed.
Thanks, Roger.
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH V3 02/19] x86/pv: Domheap pages should be mapped while relocating initrd
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
2024-05-13 11:10 ` [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 03/19] x86/pv: Rewrite how building PV dom0 handles domheap mappings Elias El Yandouzi
` (17 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Wei Liu, Wei Wang, Julien Grall,
Elias El Yandouzi
From: Wei Liu <wei.liu2@citrix.com>
Xen shouldn't use domheap page as if they were xenheap pages. Map and
unmap pages accordingly.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Wei Wang <wawei@amazon.de>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V3:
* Rename commit title
* Rework the for loop copying the pages
Changes in V2:
* Get rid of mfn_to_virt
* Don't open code copy_domain_page()
Changes since Hongyan's version:
* Add missing newline after the variable declaration
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index d8043fa58a..807296c280 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -618,18 +618,24 @@ int __init dom0_construct_pv(struct domain *d,
if ( d->arch.physaddr_bitsize &&
((mfn + count - 1) >> (d->arch.physaddr_bitsize - PAGE_SHIFT)) )
{
+ unsigned int nr_pages = 1UL << order;
+
order = get_order_from_pages(count);
page = alloc_domheap_pages(d, order, MEMF_no_scrub);
if ( !page )
panic("Not enough RAM for domain 0 initrd\n");
+
for ( count = -count; order--; )
if ( count & (1UL << order) )
{
free_domheap_pages(page, order);
page += 1UL << order;
+ nr_pages -= 1UL << order;
}
- memcpy(page_to_virt(page), mfn_to_virt(initrd->mod_start),
- initrd_len);
+
+ for ( ; nr_pages-- ; page++, mfn++ )
+ copy_domain_page(page_to_mfn(page), _mfn(mfn));
+
mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT;
init_domheap_pages(mpt_alloc,
mpt_alloc + PAGE_ALIGN(initrd_len));
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 03/19] x86/pv: Rewrite how building PV dom0 handles domheap mappings
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
2024-05-13 11:10 ` [PATCH V3 01/19] x86: Create per-domain mapping of guest_root_pt Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 02/19] x86/pv: Domheap pages should be mapped while relocating initrd Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 04/19] x86: Lift mapcache variable to the arch level Elias El Yandouzi
` (16 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
Building a PV dom0 is allocating from the domheap but uses it like the
xenheap. Use the pages as they should be.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V3:
* Fold following patch 'x86/pv: Map L4 page table for shim domain'
Changes in V2:
* Clarify the commit message
* Break the patch in two parts
Changes since Hongyan's version:
* Rebase
* Remove spurious newline
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 807296c280..ac910b438a 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -382,6 +382,10 @@ int __init dom0_construct_pv(struct domain *d,
l3_pgentry_t *l3tab = NULL, *l3start = NULL;
l2_pgentry_t *l2tab = NULL, *l2start = NULL;
l1_pgentry_t *l1tab = NULL, *l1start = NULL;
+ mfn_t l4start_mfn = INVALID_MFN;
+ mfn_t l3start_mfn = INVALID_MFN;
+ mfn_t l2start_mfn = INVALID_MFN;
+ mfn_t l1start_mfn = INVALID_MFN;
/*
* This fully describes the memory layout of the initial domain. All
@@ -710,22 +714,32 @@ int __init dom0_construct_pv(struct domain *d,
v->arch.pv.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
}
+#define UNMAP_MAP_AND_ADVANCE(mfn_var, virt_var, maddr) \
+do { \
+ unmap_domain_page(virt_var); \
+ mfn_var = maddr_to_mfn(maddr); \
+ maddr += PAGE_SIZE; \
+ virt_var = map_domain_page(mfn_var); \
+} while ( false )
+
if ( !compat )
{
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
- l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ UNMAP_MAP_AND_ADVANCE(l4start_mfn, l4start, mpt_alloc);
+ l4tab = l4start;
clear_page(l4tab);
- init_xen_l4_slots(l4tab, _mfn(virt_to_mfn(l4start)),
- d, INVALID_MFN, true);
- v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+ init_xen_l4_slots(l4tab, l4start_mfn, d, INVALID_MFN, true);
+ v->arch.guest_table = pagetable_from_mfn(l4start_mfn);
}
else
{
/* Monitor table already created by switch_compat(). */
- l4start = l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
+ l4start_mfn = pagetable_get_mfn(v->arch.guest_table);
+ l4start = l4tab = map_domain_page(l4start_mfn);
/* See public/xen.h on why the following is needed. */
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ UNMAP_MAP_AND_ADVANCE(l3start_mfn, l3start, mpt_alloc);
}
l4tab += l4_table_offset(v_start);
@@ -735,14 +749,16 @@ int __init dom0_construct_pv(struct domain *d,
if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
{
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
- l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ UNMAP_MAP_AND_ADVANCE(l1start_mfn, l1start, mpt_alloc);
+ l1tab = l1start;
clear_page(l1tab);
if ( count == 0 )
l1tab += l1_table_offset(v_start);
if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
{
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
- l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
+ l2tab = l2start;
clear_page(l2tab);
if ( count == 0 )
l2tab += l2_table_offset(v_start);
@@ -752,19 +768,19 @@ int __init dom0_construct_pv(struct domain *d,
{
maddr_to_page(mpt_alloc)->u.inuse.type_info =
PGT_l3_page_table;
- l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ UNMAP_MAP_AND_ADVANCE(l3start_mfn, l3start, mpt_alloc);
}
l3tab = l3start;
clear_page(l3tab);
if ( count == 0 )
l3tab += l3_table_offset(v_start);
- *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+ *l4tab = l4e_from_mfn(l3start_mfn, L4_PROT);
l4tab++;
}
- *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+ *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
l3tab++;
}
- *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+ *l2tab = l2e_from_mfn(l1start_mfn, L2_PROT);
l2tab++;
}
if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
@@ -783,27 +799,31 @@ int __init dom0_construct_pv(struct domain *d,
if ( compat )
{
- l2_pgentry_t *l2t;
-
/* Ensure the first four L3 entries are all populated. */
for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
{
if ( !l3e_get_intpte(*l3tab) )
{
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
- l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
- clear_page(l2tab);
- *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
+ UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
+ clear_page(l2start);
+ *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
}
if ( i == 3 )
l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
}
- l2t = map_l2t_from_l3e(l3start[3]);
- init_xen_pae_l2_slots(l2t, d);
- unmap_domain_page(l2t);
+ UNMAP_DOMAIN_PAGE(l2start);
+ l2start = map_l2t_from_l3e(l3start[3]);
+ init_xen_pae_l2_slots(l2start, d);
}
+#undef UNMAP_MAP_AND_ADVANCE
+
+ UNMAP_DOMAIN_PAGE(l1start);
+ UNMAP_DOMAIN_PAGE(l2start);
+ UNMAP_DOMAIN_PAGE(l3start);
+
/* Pages that are part of page tables must be read only. */
mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages, &flush_flags);
@@ -976,6 +996,8 @@ int __init dom0_construct_pv(struct domain *d,
pv_shim_setup_dom(d, l4start, v_start, vxenstore_start, vconsole_start,
vphysmap_start, si);
+ UNMAP_DOMAIN_PAGE(l4start);
+
#ifdef CONFIG_COMPAT
if ( compat )
xlat_start_info(si, pv_shim ? XLAT_start_info_console_domU
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 04/19] x86: Lift mapcache variable to the arch level
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (2 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 03/19] x86/pv: Rewrite how building PV dom0 handles domheap mappings Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 05/19] x86/mapcache: Initialise the mapcache for the idle domain Elias El Yandouzi
` (15 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Wei Liu, Wei Wang, Hongyan Xia,
Julien Grall
From: Wei Liu <wei.liu2@citrix.com>
It is going to be needed by HVM and idle domain as well, because without
the direct map, both need a mapcache to map pages.
This commit lifts the mapcache variable up and initialise it a bit earlier
for PV and HVM domains.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Wei Wang <wawei@amazon.de>
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 20e83cf38b..507d704f16 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -851,6 +851,8 @@ int arch_domain_create(struct domain *d,
psr_domain_init(d);
+ mapcache_domain_init(d);
+
if ( is_hvm_domain(d) )
{
if ( (rc = hvm_domain_initialise(d, config)) != 0 )
@@ -858,8 +860,6 @@ int arch_domain_create(struct domain *d,
}
else if ( is_pv_domain(d) )
{
- mapcache_domain_init(d);
-
if ( (rc = pv_domain_initialise(d)) != 0 )
goto fail;
}
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index eac5e3304f..55e337aaf7 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -82,11 +82,11 @@ void *map_domain_page(mfn_t mfn)
#endif
v = mapcache_current_vcpu();
- if ( !v || !is_pv_vcpu(v) )
+ if ( !v )
return mfn_to_virt(mfn_x(mfn));
- dcache = &v->domain->arch.pv.mapcache;
- vcache = &v->arch.pv.mapcache;
+ dcache = &v->domain->arch.mapcache;
+ vcache = &v->arch.mapcache;
if ( !dcache->inuse )
return mfn_to_virt(mfn_x(mfn));
@@ -187,14 +187,14 @@ void unmap_domain_page(const void *ptr)
ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
v = mapcache_current_vcpu();
- ASSERT(v && is_pv_vcpu(v));
+ ASSERT(v);
- dcache = &v->domain->arch.pv.mapcache;
+ dcache = &v->domain->arch.mapcache;
ASSERT(dcache->inuse);
idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
- hashent = &v->arch.pv.mapcache.hash[MAPHASH_HASHFN(mfn)];
+ hashent = &v->arch.mapcache.hash[MAPHASH_HASHFN(mfn)];
local_irq_save(flags);
@@ -233,11 +233,9 @@ void unmap_domain_page(const void *ptr)
int mapcache_domain_init(struct domain *d)
{
- struct mapcache_domain *dcache = &d->arch.pv.mapcache;
+ struct mapcache_domain *dcache = &d->arch.mapcache;
unsigned int bitmap_pages;
- ASSERT(is_pv_domain(d));
-
#ifdef NDEBUG
if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
return 0;
@@ -261,12 +259,12 @@ int mapcache_domain_init(struct domain *d)
int mapcache_vcpu_init(struct vcpu *v)
{
struct domain *d = v->domain;
- struct mapcache_domain *dcache = &d->arch.pv.mapcache;
+ struct mapcache_domain *dcache = &d->arch.mapcache;
unsigned long i;
unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
- if ( !is_pv_vcpu(v) || !dcache->inuse )
+ if ( !dcache->inuse )
return 0;
if ( ents > dcache->entries )
@@ -293,7 +291,7 @@ int mapcache_vcpu_init(struct vcpu *v)
BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
for ( i = 0; i < MAPHASH_ENTRIES; i++ )
{
- struct vcpu_maphash_entry *hashent = &v->arch.pv.mapcache.hash[i];
+ struct vcpu_maphash_entry *hashent = &v->arch.mapcache.hash[i];
hashent->mfn = ~0UL; /* never valid to map */
hashent->idx = MAPHASHENT_NOTINUSE;
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 8a97530607..7f0480d7a7 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -285,9 +285,6 @@ struct pv_domain
/* Mitigate L1TF with shadow/crashing? */
bool check_l1tf;
- /* map_domain_page() mapping cache. */
- struct mapcache_domain mapcache;
-
struct cpuidmasks *cpuidmasks;
};
@@ -326,6 +323,9 @@ struct arch_domain
uint8_t scf; /* See SCF_DOM_MASK */
+ /* map_domain_page() mapping cache. */
+ struct mapcache_domain mapcache;
+
union {
struct pv_domain pv;
struct hvm_domain hvm;
@@ -516,9 +516,6 @@ struct arch_domain
struct pv_vcpu
{
- /* map_domain_page() mapping cache. */
- struct mapcache_vcpu mapcache;
-
unsigned int vgc_flags;
struct trap_info *trap_ctxt;
@@ -618,6 +615,9 @@ struct arch_vcpu
#define async_exception_state(t) async_exception_state[(t)-1]
uint8_t async_exception_mask;
+ /* map_domain_page() mapping cache. */
+ struct mapcache_vcpu mapcache;
+
/* Virtual Machine Extensions */
union {
struct pv_vcpu pv;
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 05/19] x86/mapcache: Initialise the mapcache for the idle domain
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (3 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 04/19] x86: Lift mapcache variable to the arch level Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 06/19] x86: Add a boot option to enable and disable the direct map Elias El Yandouzi
` (14 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Wei Wang, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
In order to use the mapcache in the idle domain, we also have to
populate its page tables in the PERDOMAIN region, and we need to move
mapcache_domain_init() earlier in arch_domain_create().
Note, commit 'x86: lift mapcache variable to the arch level' has
initialised the mapcache for HVM domains. With this patch, PV, HVM,
idle domains now all initialise the mapcache.
Signed-off-by: Wei Wang <wawei@amazon.de>
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V2:
* Free resources if mapcache initialisation fails
* Remove `is_idle_domain()` check from `create_perdomain_mappings()`
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 507d704f16..3303bdb53e 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -758,9 +758,16 @@ int arch_domain_create(struct domain *d,
spin_lock_init(&d->arch.e820_lock);
+ if ( (rc = mapcache_domain_init(d)) != 0)
+ {
+ free_perdomain_mappings(d);
+ return rc;
+ }
+
/* Minimal initialisation for the idle domain. */
if ( unlikely(is_idle_domain(d)) )
{
+ struct page_info *pg = d->arch.perdomain_l3_pg;
static const struct arch_csw idle_csw = {
.from = paravirt_ctxt_switch_from,
.to = paravirt_ctxt_switch_to,
@@ -771,6 +778,9 @@ int arch_domain_create(struct domain *d,
d->arch.cpu_policy = ZERO_BLOCK_PTR; /* Catch stray misuses. */
+ idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+ l4e_from_page(pg, __PAGE_HYPERVISOR_RW);
+
return 0;
}
@@ -851,8 +861,6 @@ int arch_domain_create(struct domain *d,
psr_domain_init(d);
- mapcache_domain_init(d);
-
if ( is_hvm_domain(d) )
{
if ( (rc = hvm_domain_initialise(d, config)) != 0 )
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 06/19] x86: Add a boot option to enable and disable the direct map
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (4 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 05/19] x86/mapcache: Initialise the mapcache for the idle domain Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 07/19] xen/x86: Add support for the PMAP Elias El Yandouzi
` (13 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
Also add a helper function to retrieve it. Change arch_mfns_in_direct_map
to check this option before returning.
This is added as a Kconfig option as well as a boot command line option.
While being generic, the Kconfig option is only usable for x86 at the moment.
Note that there remains some users of the directmap at this point. The option
is introduced now as it will be needed in follow-up patches.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V2:
* Introduce a Kconfig option
* Reword the commit message
* Make opt_directmap and helper generic
Changes since Hongyan's version:
* Reword the commit message
* opt_directmap is only modified during boot so mark it as
__ro_after_init
diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index e760f3266e..743d343ffa 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -799,6 +799,18 @@ that enabling this option cannot guarantee anything beyond what underlying
hardware guarantees (with, where available and known to Xen, respective
tweaks applied).
+### directmap (x86)
+> `= <boolean>`
+
+> Default: `true`
+
+Enable or disable the directmap region in Xen.
+
+By default, Xen creates the directmap region which maps physical memory
+in that region. Setting this to no will sparsely populate the directmap,
+blocking exploits that leak secrets via speculative memory access in the
+directmap.
+
### dma_bits
> `= <integer>`
diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index 7e03e4bc55..b4ec0e582e 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
select HAS_PCI_MSI
select HAS_PIRQ
select HAS_SCHED_GRANULARITY
+ select HAS_SECRET_HIDING
select HAS_UBSAN
select HAS_VPCI if HVM
select NEEDS_LIBELF
diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h
index 98b66edaca..54d835f156 100644
--- a/xen/arch/x86/include/asm/mm.h
+++ b/xen/arch/x86/include/asm/mm.h
@@ -622,11 +622,17 @@ void write_32bit_pse_identmap(uint32_t *l2);
/*
* x86 maps part of physical memory via the directmap region.
* Return whether the range of MFN falls in the directmap region.
+ *
+ * When boot command line sets directmap=no, the directmap will mostly be empty
+ * so this will always return false.
*/
static inline bool arch_mfns_in_directmap(unsigned long mfn, unsigned long nr)
{
unsigned long eva = min(DIRECTMAP_VIRT_END, HYPERVISOR_VIRT_END);
+ if ( !has_directmap() )
+ return false;
+
return (mfn + nr) <= (virt_to_mfn(eva - 1) + 1);
}
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index f84e1cd79c..bd6b1184f5 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1517,6 +1517,8 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
if ( highmem_start )
xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
+ printk("Booting with directmap %s\n", has_directmap() ? "on" : "off");
+
/*
* Walk every RAM region and map it in its entirety (on x86/64, at least)
* and notify it to the boot allocator.
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 565ceda741..856604068c 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -80,12 +80,29 @@ config HAS_PMAP
config HAS_SCHED_GRANULARITY
bool
+config HAS_SECRET_HIDING
+ bool
+
config HAS_UBSAN
bool
config MEM_ACCESS_ALWAYS_ON
bool
+config SECRET_HIDING
+ bool "Secret hiding"
+ depends on HAS_SECRET_HIDING
+ help
+ The directmap contains mapping for most of the RAM which makes domain
+ memory easily accessible. While making the performance better, it also makes
+ the hypervisor more vulnerable to speculation attacks.
+
+ Enabling this feature will allow the user to decide whether the memory
+ is always mapped at boot or mapped only on demand (see the command line
+ option "directmap").
+
+ If unsure, say N.
+
config MEM_ACCESS
def_bool MEM_ACCESS_ALWAYS_ON
prompt "Memory Access and VM events" if !MEM_ACCESS_ALWAYS_ON
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 7c1bdfc046..9b7e4721cd 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -174,6 +174,11 @@ paddr_t __ro_after_init mem_hotplug;
static char __initdata opt_badpage[100] = "";
string_param("badpage", opt_badpage);
+bool __ro_after_init opt_directmap = true;
+#ifdef CONFIG_HAS_SECRET_HIDING
+boolean_param("directmap", opt_directmap);
+#endif
+
/*
* no-bootscrub -> Free pages are not zeroed during boot.
*/
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 7561297a75..9d4f1f2d0d 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -167,6 +167,13 @@ extern unsigned long max_page;
extern unsigned long total_pages;
extern paddr_t mem_hotplug;
+extern bool opt_directmap;
+
+static inline bool has_directmap(void)
+{
+ return opt_directmap;
+}
+
/*
* Extra fault info types which are used to further describe
* the source of an access violation.
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 07/19] xen/x86: Add support for the PMAP
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (5 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 06/19] x86: Add a boot option to enable and disable the direct map Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 08/19] xen/x86: Add build assertion for fixmap entries Elias El Yandouzi
` (12 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
PMAP will be used in a follow-up patch to bootstrap map domain
page infrastructure -- we need some way to map pages to setup the
mapcache without a direct map.
The functions pmap_{map, unmap} open code {set, clear}_fixmap to break
the loop.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
The PMAP infrastructure was upstream separately for Arm since
Hongyan sent the secret-free hypervisor series. So this is a new
patch to plumb the feature on x86.
Changes in v2:
* Declare PMAP entries earlier in fixed_addresses
* Reword the commit message
diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index b4ec0e582e..56feb0c564 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86
select HAS_PCI
select HAS_PCI_MSI
select HAS_PIRQ
+ select HAS_PMAP
select HAS_SCHED_GRANULARITY
select HAS_SECRET_HIDING
select HAS_UBSAN
diff --git a/xen/arch/x86/include/asm/fixmap.h b/xen/arch/x86/include/asm/fixmap.h
index 516ec3fa6c..a7ac365fc6 100644
--- a/xen/arch/x86/include/asm/fixmap.h
+++ b/xen/arch/x86/include/asm/fixmap.h
@@ -21,6 +21,8 @@
#include <xen/acpi.h>
#include <xen/pfn.h>
+#include <xen/pmap.h>
+
#include <asm/apicdef.h>
#include <asm/msi.h>
#include <acpi/apei.h>
@@ -53,6 +55,8 @@ enum fixed_addresses {
FIX_PV_CONSOLE,
FIX_XEN_SHARED_INFO,
#endif /* CONFIG_XEN_GUEST */
+ FIX_PMAP_BEGIN,
+ FIX_PMAP_END = FIX_PMAP_BEGIN + NUM_FIX_PMAP,
/* Everything else should go further down. */
FIX_APIC_BASE,
FIX_IO_APIC_BASE_0,
diff --git a/xen/arch/x86/include/asm/pmap.h b/xen/arch/x86/include/asm/pmap.h
new file mode 100644
index 0000000000..62746e191d
--- /dev/null
+++ b/xen/arch/x86/include/asm/pmap.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_PMAP_H__
+#define __ASM_PMAP_H__
+
+#include <asm/fixmap.h>
+
+static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
+{
+ unsigned long linear = (unsigned long)fix_to_virt(slot);
+ l1_pgentry_t *pl1e = &l1_fixmap[l1_table_offset(linear)];
+
+ ASSERT(!(l1e_get_flags(*pl1e) & _PAGE_PRESENT));
+
+ l1e_write_atomic(pl1e, l1e_from_mfn(mfn, PAGE_HYPERVISOR));
+}
+
+static inline void arch_pmap_unmap(unsigned int slot)
+{
+ unsigned long linear = (unsigned long)fix_to_virt(slot);
+ l1_pgentry_t *pl1e = &l1_fixmap[l1_table_offset(linear)];
+
+ l1e_write_atomic(pl1e, l1e_empty());
+ flush_tlb_one_local(linear);
+}
+
+#endif /* __ASM_PMAP_H__ */
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 08/19] xen/x86: Add build assertion for fixmap entries
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (6 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 07/19] xen/x86: Add support for the PMAP Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 09/19] x86/domain_page: Remove the fast paths when mfn is not in the directmap Elias El Yandouzi
` (11 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Elias El Yandouzi
The early fixed addresses must all fit into the static L1 table.
Introduce a build assertion to this end.
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in v2:
* New patch
diff --git a/xen/arch/x86/include/asm/fixmap.h b/xen/arch/x86/include/asm/fixmap.h
index a7ac365fc6..904bee0480 100644
--- a/xen/arch/x86/include/asm/fixmap.h
+++ b/xen/arch/x86/include/asm/fixmap.h
@@ -77,6 +77,11 @@ enum fixed_addresses {
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+static inline void fixaddr_build_assertion(void)
+{
+ BUILD_BUG_ON(FIX_PMAP_END > L1_PAGETABLE_ENTRIES - 1);
+}
+
extern void __set_fixmap(
enum fixed_addresses idx, unsigned long mfn, unsigned long flags);
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 09/19] x86/domain_page: Remove the fast paths when mfn is not in the directmap
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (7 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 08/19] xen/x86: Add build assertion for fixmap entries Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 10/19] xen/page_alloc: Add a path for xenheap when there is no direct map Elias El Yandouzi
` (10 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall
From: Hongyan Xia <hongyxia@amazon.com>
When mfn is not in direct map, never use mfn_to_virt for any mappings.
We replace mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) with
arch_mfns_in_direct_map(mfn, 1) because these two are equivalent. The
extra comparison in arch_mfns_in_direct_map() looks different but because
DIRECTMAP_VIRT_END is always higher, it does not make any difference.
Lastly, domain_page_map_to_mfn() needs to gain to a special case for
the PMAP.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
----
Changes since Hongyan's version:
* arch_mfn_in_direct_map() was renamed to arch_mfns_in_directmap()
* add a special case for the PMAP in domain_page_map_to_mfn()
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 55e337aaf7..89caefc8a2 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -14,8 +14,10 @@
#include <xen/sched.h>
#include <xen/vmap.h>
#include <asm/current.h>
+#include <asm/fixmap.h>
#include <asm/flushtlb.h>
#include <asm/hardirq.h>
+#include <asm/pmap.h>
#include <asm/setup.h>
static DEFINE_PER_CPU(struct vcpu *, override);
@@ -35,10 +37,11 @@ static inline struct vcpu *mapcache_current_vcpu(void)
/*
* When using efi runtime page tables, we have the equivalent of the idle
* domain's page tables but current may point at another domain's VCPU.
- * Return NULL as though current is not properly set up yet.
+ * Return the idle domains's vcpu on that core because the efi per-domain
+ * region (where the mapcache is) is in-sync with the idle domain.
*/
if ( efi_rs_using_pgtables() )
- return NULL;
+ return idle_vcpu[smp_processor_id()];
/*
* If guest_table is NULL, and we are running a paravirtualised guest,
@@ -77,18 +80,24 @@ void *map_domain_page(mfn_t mfn)
struct vcpu_maphash_entry *hashent;
#ifdef NDEBUG
- if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+ if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
return mfn_to_virt(mfn_x(mfn));
#endif
v = mapcache_current_vcpu();
- if ( !v )
- return mfn_to_virt(mfn_x(mfn));
+ if ( !v || !v->domain->arch.mapcache.inuse )
+ {
+ if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
+ return mfn_to_virt(mfn_x(mfn));
+ else
+ {
+ BUG_ON(system_state >= SYS_STATE_smp_boot);
+ return pmap_map(mfn);
+ }
+ }
dcache = &v->domain->arch.mapcache;
vcache = &v->arch.mapcache;
- if ( !dcache->inuse )
- return mfn_to_virt(mfn_x(mfn));
perfc_incr(map_domain_page_count);
@@ -184,6 +193,12 @@ void unmap_domain_page(const void *ptr)
if ( !va || va >= DIRECTMAP_VIRT_START )
return;
+ if ( va >= FIXADDR_START && va < FIXADDR_TOP )
+ {
+ pmap_unmap((void *)ptr);
+ return;
+ }
+
ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
v = mapcache_current_vcpu();
@@ -237,7 +252,7 @@ int mapcache_domain_init(struct domain *d)
unsigned int bitmap_pages;
#ifdef NDEBUG
- if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+ if ( !mem_hotplug && arch_mfn_in_directmap(0, max_page) )
return 0;
#endif
@@ -308,7 +323,7 @@ void *map_domain_page_global(mfn_t mfn)
local_irq_is_enabled()));
#ifdef NDEBUG
- if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+ if ( arch_mfn_in_directmap(mfn_x(mfn, 1)) )
return mfn_to_virt(mfn_x(mfn));
#endif
@@ -335,6 +350,23 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
if ( va >= DIRECTMAP_VIRT_START )
return _mfn(virt_to_mfn(ptr));
+ /*
+ * The fixmap is stealing the top-end of the VMAP. So the check for
+ * the PMAP *must* happen first.
+ *
+ * Also, the fixmap translate a slot to an address backwards. The
+ * logic will rely on it to avoid any complexity. So check at
+ * compile time this will always hold.
+ */
+ BUILD_BUG_ON(fix_to_virt(FIX_PMAP_BEGIN) < fix_to_virt(FIX_PMAP_END));
+
+ if ( ((unsigned long)fix_to_virt(FIX_PMAP_END) <= va) &&
+ ((va & PAGE_MASK) <= (unsigned long)fix_to_virt(FIX_PMAP_BEGIN)) )
+ {
+ BUG_ON(system_state >= SYS_STATE_smp_boot);
+ return l1e_get_mfn(l1_fixmap[l1_table_offset(va)]);
+ }
+
if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END )
return vmap_to_mfn(va);
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 10/19] xen/page_alloc: Add a path for xenheap when there is no direct map
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (8 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 09/19] x86/domain_page: Remove the fast paths when mfn is not in the directmap Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 11/19] x86/setup: Leave early boot slightly earlier Elias El Yandouzi
` (9 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
When there is not an always-mapped direct map, xenheap allocations need
to be mapped and unmapped on-demand.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
I have left the call to map_pages_to_xen() and destroy_xen_mappings()
in the split heap for now. I am not entirely convinced this is necessary
because in that setup only the xenheap would be always mapped and
this doesn't contain any guest memory (aside the grant-table).
So map/unmapping for every allocation seems unnecessary.
Changes in v2:
* Fix remaining wrong indentation in alloc_xenheap_pages()
Changes since Hongyan's version:
* Rebase
* Fix indentation in alloc_xenheap_pages()
* Fix build for arm32
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 9b7e4721cd..dfb2c05322 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -2242,6 +2242,7 @@ void init_xenheap_pages(paddr_t ps, paddr_t pe)
void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
struct page_info *pg;
+ void *ret;
ASSERT_ALLOC_CONTEXT();
@@ -2250,17 +2251,36 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
if ( unlikely(pg == NULL) )
return NULL;
+ ret = page_to_virt(pg);
+
+ if ( !has_directmap() &&
+ map_pages_to_xen((unsigned long)ret, page_to_mfn(pg), 1UL << order,
+ PAGE_HYPERVISOR) )
+ {
+ /* Failed to map xenheap pages. */
+ free_heap_pages(pg, order, false);
+ return NULL;
+ }
+
return page_to_virt(pg);
}
void free_xenheap_pages(void *v, unsigned int order)
{
+ unsigned long va = (unsigned long)v & PAGE_MASK;
+
ASSERT_ALLOC_CONTEXT();
if ( v == NULL )
return;
+ if ( !has_directmap() &&
+ destroy_xen_mappings(va, va + (1UL << (order + PAGE_SHIFT))) )
+ dprintk(XENLOG_WARNING,
+ "Error while destroying xenheap mappings at %p, order %u\n",
+ v, order);
+
free_heap_pages(virt_to_page(v), order, false);
}
@@ -2284,6 +2304,7 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
struct page_info *pg;
unsigned int i;
+ void *ret;
ASSERT_ALLOC_CONTEXT();
@@ -2296,16 +2317,28 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
if ( unlikely(pg == NULL) )
return NULL;
+ ret = page_to_virt(pg);
+
+ if ( !has_directmap() &&
+ map_pages_to_xen((unsigned long)ret, page_to_mfn(pg), 1UL << order,
+ PAGE_HYPERVISOR) )
+ {
+ /* Failed to map xenheap pages. */
+ free_domheap_pages(pg, order);
+ return NULL;
+ }
+
for ( i = 0; i < (1u << order); i++ )
pg[i].count_info |= PGC_xen_heap;
- return page_to_virt(pg);
+ return ret;
}
void free_xenheap_pages(void *v, unsigned int order)
{
struct page_info *pg;
unsigned int i;
+ unsigned long va = (unsigned long)v & PAGE_MASK;
ASSERT_ALLOC_CONTEXT();
@@ -2317,6 +2350,12 @@ void free_xenheap_pages(void *v, unsigned int order)
for ( i = 0; i < (1u << order); i++ )
pg[i].count_info &= ~PGC_xen_heap;
+ if ( !has_directmap() &&
+ destroy_xen_mappings(va, va + (1UL << (order + PAGE_SHIFT))) )
+ dprintk(XENLOG_WARNING,
+ "Error while destroying xenheap mappings at %p, order %u\n",
+ v, order);
+
free_heap_pages(pg, order, true);
}
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 11/19] x86/setup: Leave early boot slightly earlier
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (9 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 10/19] xen/page_alloc: Add a path for xenheap when there is no direct map Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 12/19] x86/setup: vmap heap nodes when they are outside the direct map Elias El Yandouzi
` (8 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
When we do not have a direct map, memory for metadata of heap nodes in
init_node_heap() is allocated from xenheap, which needs to be mapped and
unmapped on demand. However, we cannot just take memory from the boot
allocator to create the PTEs while we are passing memory to the heap
allocator.
To solve this race, we leave early boot slightly sooner so that Xen PTE
pages are allocated from the heap instead of the boot allocator. We can
do this because the metadata for the 1st node is statically allocated,
and by the time we need memory to create mappings for the 2nd node, we
already have enough memory in the heap allocator in the 1st node.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index bd6b1184f5..f26c9799e4 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1751,6 +1751,22 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
numa_initmem_init(0, raw_max_page);
+ /*
+ * When we do not have a direct map, memory for metadata of heap nodes in
+ * init_node_heap() is allocated from xenheap, which needs to be mapped and
+ * unmapped on demand. However, we cannot just take memory from the boot
+ * allocator to create the PTEs while we are passing memory to the heap
+ * allocator during end_boot_allocator().
+ *
+ * To solve this race, we need to leave early boot before
+ * end_boot_allocator() so that Xen PTE pages are allocated from the heap
+ * instead of the boot allocator. We can do this because the metadata for
+ * the 1st node is statically allocated, and by the time we need memory to
+ * create mappings for the 2nd node, we already have enough memory in the
+ * heap allocator in the 1st node.
+ */
+ system_state = SYS_STATE_boot;
+
if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
{
unsigned long lo = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
@@ -1782,8 +1798,6 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
else
end_boot_allocator();
- system_state = SYS_STATE_boot;
-
bsp_stack = cpu_alloc_stack(0);
if ( !bsp_stack )
panic("No memory for BSP stack\n");
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 12/19] x86/setup: vmap heap nodes when they are outside the direct map
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (10 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 11/19] x86/setup: Leave early boot slightly earlier Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 13/19] x86/setup: Do not create valid mappings when directmap=no Elias El Yandouzi
` (7 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
When we do not have a direct map, archs_mfn_in_direct_map() will always
return false, thus init_node_heap() will allocate xenheap pages from an
existing node for the metadata of a new node. This means that the
metadata of a new node is in a different node, slowing down heap
allocation.
Since we now have early vmap, vmap the metadata locally in the new node.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in v2:
* vmap_contig_pages() was renamed to vmap_contig()
* Fix indentation and coding style
Changes from Hongyan's version:
* arch_mfn_in_direct_map() was renamed to
arch_mfns_in_direct_map()
* Use vmap_contig_pages() rather than __vmap(...).
* Add missing include (xen/vmap.h) so it compiles on Arm
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index dfb2c05322..3c0909f333 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -136,6 +136,7 @@
#include <xen/sched.h>
#include <xen/softirq.h>
#include <xen/spinlock.h>
+#include <xen/vmap.h>
#include <asm/flushtlb.h>
#include <asm/page.h>
@@ -605,22 +606,44 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
needed = 0;
}
else if ( *use_tail && nr >= needed &&
- arch_mfns_in_directmap(mfn + nr - needed, needed) &&
(!xenheap_bits ||
- !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
+ !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
- _heap[node] = mfn_to_virt(mfn + nr - needed);
- avail[node] = mfn_to_virt(mfn + nr - 1) +
- PAGE_SIZE - sizeof(**avail) * NR_ZONES;
+ if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
+ {
+ _heap[node] = mfn_to_virt(mfn + nr - needed);
+ avail[node] = mfn_to_virt(mfn + nr - 1) +
+ PAGE_SIZE - sizeof(**avail) * NR_ZONES;
+ }
+ else
+ {
+ mfn_t needed_start = _mfn(mfn + nr - needed);
+
+ _heap[node] = vmap_contig(needed_start, needed);
+ BUG_ON(!_heap[node]);
+ avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
+ sizeof(**avail) * NR_ZONES;
+ }
}
else if ( nr >= needed &&
- arch_mfns_in_directmap(mfn, needed) &&
(!xenheap_bits ||
- !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
+ !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
- _heap[node] = mfn_to_virt(mfn);
- avail[node] = mfn_to_virt(mfn + needed - 1) +
- PAGE_SIZE - sizeof(**avail) * NR_ZONES;
+ if ( arch_mfns_in_directmap(mfn, needed) )
+ {
+ _heap[node] = mfn_to_virt(mfn);
+ avail[node] = mfn_to_virt(mfn + needed - 1) +
+ PAGE_SIZE - sizeof(**avail) * NR_ZONES;
+ }
+ else
+ {
+ mfn_t needed_start = _mfn(mfn);
+
+ _heap[node] = vmap_contig(needed_start, needed);
+ BUG_ON(!_heap[node]);
+ avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
+ sizeof(**avail) * NR_ZONES;
+ }
*use_tail = false;
}
else if ( get_order_from_bytes(sizeof(**_heap)) ==
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 13/19] x86/setup: Do not create valid mappings when directmap=no
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (11 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 12/19] x86/setup: vmap heap nodes when they are outside the direct map Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 14/19] Rename mfn_to_virt() calls Elias El Yandouzi
` (6 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
Create empty mappings in the second e820 pass. Also, destroy existing
direct map mappings created in the first pass.
To make xenheap pages visible in guests, it is necessary to create empty
L3 tables in the direct map even when directmap=no, since guest cr3s
copy idle domain's L4 entries, which means they will share mappings in
the direct map if we pre-populate idle domain's L4 entries and L3
tables. A helper is introduced for this.
Also, after the direct map is actually gone, we need to stop updating
the direct map in update_xen_mappings().
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index f26c9799e4..919347d8c2 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -978,6 +978,57 @@ static struct domain *__init create_dom0(const module_t *image,
/* How much of the directmap is prebuilt at compile time. */
#define PREBUILT_MAP_LIMIT (1 << L2_PAGETABLE_SHIFT)
+/*
+ * This either populates a valid direct map, or allocates empty L3 tables and
+ * creates the L4 entries for virtual address between [start, end) in the
+ * direct map depending on has_directmap();
+ *
+ * When directmap=no, we still need to populate empty L3 tables in the
+ * direct map region. The reason is that on-demand xenheap mappings are
+ * created in the idle domain's page table but must be seen by
+ * everyone. Since all domains share the direct map L4 entries, they
+ * will share xenheap mappings if we pre-populate the L4 entries and L3
+ * tables in the direct map region for all RAM. We also rely on the fact
+ * that L3 tables are never freed.
+ */
+static void __init populate_directmap(uint64_t pstart, uint64_t pend,
+ unsigned int flags)
+{
+ unsigned long vstart = (unsigned long)__va(pstart);
+ unsigned long vend = (unsigned long)__va(pend);
+
+ if ( pstart >= pend )
+ return;
+
+ BUG_ON(vstart < DIRECTMAP_VIRT_START);
+ BUG_ON(vend > DIRECTMAP_VIRT_END);
+
+ if ( has_directmap() )
+ /* Populate valid direct map. */
+ BUG_ON(map_pages_to_xen(vstart, maddr_to_mfn(pstart),
+ PFN_DOWN(pend - pstart), flags));
+ else
+ {
+ /* Create empty L3 tables. */
+ unsigned long vaddr = vstart & ~((1UL << L4_PAGETABLE_SHIFT) - 1);
+
+ for ( ; vaddr < vend; vaddr += (1UL << L4_PAGETABLE_SHIFT) )
+ {
+ l4_pgentry_t *pl4e = &idle_pg_table[l4_table_offset(vaddr)];
+
+ if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
+ {
+ mfn_t mfn = alloc_boot_pages(1, 1);
+ void *v = map_domain_page(mfn);
+
+ clear_page(v);
+ UNMAP_DOMAIN_PAGE(v);
+ l4e_write(pl4e, l4e_from_mfn(mfn, __PAGE_HYPERVISOR));
+ }
+ }
+ }
+}
+
void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
{
const char *memmap_type = NULL, *loader, *cmdline = "";
@@ -1601,8 +1652,17 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
map_e = min_t(uint64_t, e,
ARRAY_SIZE(l2_directmap) << L2_PAGETABLE_SHIFT);
- /* Pass mapped memory to allocator /before/ creating new mappings. */
+ /*
+ * Pass mapped memory to allocator /before/ creating new mappings.
+ * The direct map for the bottom 4GiB has been populated in the first
+ * e820 pass. In the second pass, we make sure those existing mappings
+ * are destroyed when directmap=no.
+ */
init_boot_pages(s, min(map_s, e));
+ if ( !has_directmap() )
+ destroy_xen_mappings((unsigned long)__va(s),
+ (unsigned long)__va(min(map_s, e)));
+
s = map_s;
if ( s < map_e )
{
@@ -1610,6 +1670,9 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
map_s = (s + mask) & ~mask;
map_e &= ~mask;
init_boot_pages(map_s, map_e);
+ if ( !has_directmap() )
+ destroy_xen_mappings((unsigned long)__va(map_s),
+ (unsigned long)__va(map_e));
}
if ( map_s > map_e )
@@ -1623,8 +1686,7 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
if ( map_e < end )
{
- map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
- PFN_DOWN(end - map_e), PAGE_HYPERVISOR);
+ populate_directmap(map_e, end, PAGE_HYPERVISOR);
init_boot_pages(map_e, end);
map_e = end;
}
@@ -1633,13 +1695,11 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
{
/* This range must not be passed to the boot allocator and
* must also not be mapped with _PAGE_GLOBAL. */
- map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
- PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
+ populate_directmap(map_e, e, __PAGE_HYPERVISOR_RW);
}
if ( s < map_s )
{
- map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
- PFN_DOWN(map_s - s), PAGE_HYPERVISOR);
+ populate_directmap(s, map_s, PAGE_HYPERVISOR);
init_boot_pages(s, map_s);
}
}
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 14/19] Rename mfn_to_virt() calls
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (12 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 13/19] x86/setup: Do not create valid mappings when directmap=no Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 15/19] Rename maddr_to_virt() calls Elias El Yandouzi
` (5 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Elias El Yandouzi
Until directmap gets completely removed, we'd still need to
keep some calls to mfn_to_virt() for xenheap pages or when the
directmap is enabled.
Rename the macro to mfn_to_directmap_virt() to flag them and
prevent further use of mfn_to_virt().
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
index 48538b5337..2bca3f9e87 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -336,6 +336,7 @@ static inline uint64_t gvirt_to_maddr(vaddr_t va, paddr_t *pa,
*/
#define virt_to_mfn(va) __virt_to_mfn(va)
#define mfn_to_virt(mfn) __mfn_to_virt(mfn)
+#define mfn_to_directmap_virt(mfn) mfn_to_virt(mfn)
/* Convert between Xen-heap virtual addresses and page-info structures. */
static inline struct page_info *virt_to_page(const void *v)
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 89caefc8a2..62d6fee0f4 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -81,14 +81,14 @@ void *map_domain_page(mfn_t mfn)
#ifdef NDEBUG
if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
- return mfn_to_virt(mfn_x(mfn));
+ return mfn_to_directmap_virt(mfn_x(mfn));
#endif
v = mapcache_current_vcpu();
if ( !v || !v->domain->arch.mapcache.inuse )
{
if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
- return mfn_to_virt(mfn_x(mfn));
+ return mfn_to_directmap_virt(mfn_x(mfn));
else
{
BUG_ON(system_state >= SYS_STATE_smp_boot);
@@ -324,7 +324,7 @@ void *map_domain_page_global(mfn_t mfn)
#ifdef NDEBUG
if ( arch_mfn_in_directmap(mfn_x(mfn, 1)) )
- return mfn_to_virt(mfn_x(mfn));
+ return mfn_to_directmap_virt(mfn_x(mfn));
#endif
return vmap(&mfn, 1);
diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index b0cb96c3bc..d1482ae2f7 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -439,7 +439,7 @@ static int __init pvh_populate_p2m(struct domain *d)
d->arch.e820[i].addr + d->arch.e820[i].size);
enum hvm_translation_result res =
hvm_copy_to_guest_phys(mfn_to_maddr(_mfn(addr)),
- mfn_to_virt(addr),
+ mfn_to_directmap_virt(addr),
end - d->arch.e820[i].addr,
v);
@@ -725,7 +725,7 @@ static int __init pvh_load_kernel(struct domain *d, const module_t *image,
if ( initrd != NULL )
{
- rc = hvm_copy_to_guest_phys(last_addr, mfn_to_virt(initrd->mod_start),
+ rc = hvm_copy_to_guest_phys(last_addr, mfn_to_directmap_virt(initrd->mod_start),
initrd_len, v);
if ( rc )
{
diff --git a/xen/arch/x86/include/asm/page.h b/xen/arch/x86/include/asm/page.h
index 350d1fb110..c6891b52d4 100644
--- a/xen/arch/x86/include/asm/page.h
+++ b/xen/arch/x86/include/asm/page.h
@@ -268,7 +268,7 @@ void copy_page_sse2(void *to, const void *from);
*/
#define mfn_valid(mfn) __mfn_valid(mfn_x(mfn))
#define virt_to_mfn(va) __virt_to_mfn(va)
-#define mfn_to_virt(mfn) __mfn_to_virt(mfn)
+#define mfn_to_directmap_virt(mfn) __mfn_to_virt(mfn)
#define virt_to_maddr(va) __virt_to_maddr((unsigned long)(va))
#define maddr_to_virt(ma) __maddr_to_virt((unsigned long)(ma))
#define maddr_to_page(ma) __maddr_to_page(ma)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index efdf20f775..337363cf17 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -318,8 +318,8 @@ void __init arch_init_memory(void)
iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT));
if ( iostart_pfn < ioend_pfn )
- destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
- (unsigned long)mfn_to_virt(ioend_pfn));
+ destroy_xen_mappings((unsigned long)mfn_to_directmap_virt(iostart_pfn),
+ (unsigned long)mfn_to_directmap_virt(ioend_pfn));
/* Mark as I/O up to next RAM region. */
for ( ; pfn < rstart_pfn; pfn++ )
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 919347d8c2..e0671ab3c3 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -399,7 +399,7 @@ void *__init bootstrap_map(const module_t *mod)
void *ret;
if ( system_state != SYS_STATE_early_boot )
- return mod ? mfn_to_virt(mod->mod_start) : NULL;
+ return mod ? mfn_to_directmap_virt(mod->mod_start) : NULL;
if ( !mod )
{
@@ -1708,7 +1708,7 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
{
set_pdx_range(mod[i].mod_start,
mod[i].mod_start + PFN_UP(mod[i].mod_end));
- map_pages_to_xen((unsigned long)mfn_to_virt(mod[i].mod_start),
+ map_pages_to_xen((unsigned long)mfn_to_directmap_virt(mod[i].mod_start),
_mfn(mod[i].mod_start),
PFN_UP(mod[i].mod_end), PAGE_HYPERVISOR);
}
diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c
index ba0700d2d5..58df01dfb9 100644
--- a/xen/arch/x86/tboot.c
+++ b/xen/arch/x86/tboot.c
@@ -253,7 +253,7 @@ static void tboot_gen_xenheap_integrity(const uint8_t key[TB_KEY_SIZE],
continue; /* skip tboot and its page tables */
if ( is_page_in_use(page) && is_special_page(page) )
- vmac_update(mfn_to_virt(mfn), PAGE_SIZE, &ctx);
+ vmac_update(mfn_to_directmap_virt(mfn), PAGE_SIZE, &ctx);
}
*mac = vmac(NULL, 0, nonce, NULL, &ctx);
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index b2a280fba3..1697760d82 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -1247,19 +1247,25 @@ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
i = virt_to_mfn(HYPERVISOR_VIRT_END - 1) + 1;
if ( spfn < i )
{
- ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), _mfn(spfn),
- min(epfn, i) - spfn, PAGE_HYPERVISOR);
- if ( ret )
- goto destroy_directmap;
+ if ( arch_mfns_in_directmap(spfn, min(epfn, i) - spfn ) )
+ {
+ ret = map_pages_to_xen((unsigned long)mfn_to_directmap_virt(spfn), _mfn(spfn),
+ min(epfn, i) - spfn, PAGE_HYPERVISOR);
+ if ( ret )
+ goto destroy_directmap;
+ }
}
if ( i < epfn )
{
if ( i < spfn )
i = spfn;
- ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), _mfn(i),
- epfn - i, __PAGE_HYPERVISOR_RW);
- if ( ret )
- goto destroy_directmap;
+ if ( arch_mfns_in_directmap(spfn, epfn - i) )
+ {
+ ret = map_pages_to_xen((unsigned long)mfn_to_directmap_virt(i), _mfn(i),
+ epfn - i, __PAGE_HYPERVISOR_RW);
+ if ( ret )
+ goto destroy_directmap;
+ }
}
old_node_start = node_start_pfn(node);
@@ -1348,8 +1354,8 @@ destroy_frametable:
NODE_DATA(node)->node_start_pfn = old_node_start;
NODE_DATA(node)->node_spanned_pages = old_node_span;
destroy_directmap:
- destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
- (unsigned long)mfn_to_virt(epfn));
+ destroy_xen_mappings((unsigned long)mfn_to_directmap_virt(spfn),
+ (unsigned long)mfn_to_directmap_virt(epfn));
return ret;
}
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index efbec00af9..39aed5845d 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -1599,7 +1599,7 @@ static __init void copy_mapping(unsigned long mfn, unsigned long end,
for ( ; mfn < end; mfn = next )
{
l4_pgentry_t l4e = efi_l4t[l4_table_offset(mfn << PAGE_SHIFT)];
- unsigned long va = (unsigned long)mfn_to_virt(mfn);
+ unsigned long va = (unsigned long)mfn_to_directmap_virt(mfn);
if ( !(mfn & ((1UL << (L4_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) )
UNMAP_DOMAIN_PAGE(l3dst);
@@ -1757,15 +1757,18 @@ void __init efi_init_memory(void)
if ( pfn_to_pdx(emfn - 1) < (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
pdx_is_region_compressible(mem_base, mem_npages) )
{
- if ( (unsigned long)mfn_to_virt(emfn - 1) >= HYPERVISOR_VIRT_END )
- prot &= ~_PAGE_GLOBAL;
- if ( map_pages_to_xen((unsigned long)mfn_to_virt(smfn),
- _mfn(smfn), emfn - smfn, prot) == 0 )
- desc->VirtualStart =
- (unsigned long)maddr_to_virt(desc->PhysicalStart);
- else
- printk(XENLOG_ERR "Could not map MFNs %#lx-%#lx\n",
- smfn, emfn - 1);
+ if ( arch_mfns_in_directmap(smfn, emfn - smfn) )
+ {
+ if ( (unsigned long)mfn_to_directmap_virt(emfn - 1) >= HYPERVISOR_VIRT_END )
+ prot &= ~_PAGE_GLOBAL;
+ if ( map_pages_to_xen((unsigned long)mfn_to_directmap_virt(smfn),
+ _mfn(smfn), emfn - smfn, prot) == 0 )
+ desc->VirtualStart =
+ (unsigned long)maddr_to_virt(desc->PhysicalStart);
+ else
+ printk(XENLOG_ERR "Could not map MFNs %#lx-%#lx\n",
+ smfn, emfn - 1);
+ }
}
else if ( !((desc->PhysicalStart + len - 1) >> (VADDR_BITS - 1)) &&
(extra = xmalloc(struct rt_extra)) != NULL )
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 3c0909f333..e1e98d4d59 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -611,8 +611,8 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
{
if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
{
- _heap[node] = mfn_to_virt(mfn + nr - needed);
- avail[node] = mfn_to_virt(mfn + nr - 1) +
+ _heap[node] = mfn_to_directmap_virt(mfn + nr - needed);
+ avail[node] = mfn_to_directmap_virt(mfn + nr - 1) +
PAGE_SIZE - sizeof(**avail) * NR_ZONES;
}
else
@@ -631,8 +631,8 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
{
if ( arch_mfns_in_directmap(mfn, needed) )
{
- _heap[node] = mfn_to_virt(mfn);
- avail[node] = mfn_to_virt(mfn + needed - 1) +
+ _heap[node] = mfn_to_directmap_virt(mfn);
+ avail[node] = mfn_to_directmap_virt(mfn + needed - 1) +
PAGE_SIZE - sizeof(**avail) * NR_ZONES;
}
else
diff --git a/xen/common/trace.c b/xen/common/trace.c
index 4e7b080e61..955509a0d8 100644
--- a/xen/common/trace.c
+++ b/xen/common/trace.c
@@ -229,7 +229,7 @@ static int alloc_trace_bufs(unsigned int pages)
offset = t_info->mfn_offset[cpu];
/* Initialize the buffer metadata */
- per_cpu(t_bufs, cpu) = buf = mfn_to_virt(t_info_mfn_list[offset]);
+ per_cpu(t_bufs, cpu) = buf = mfn_to_directmap_virt(t_info_mfn_list[offset]);
buf->cons = buf->prod = 0;
printk(XENLOG_INFO "xentrace: p%d mfn %x offset %u\n",
@@ -268,7 +268,7 @@ out_dealloc:
if ( !mfn )
break;
ASSERT(!(mfn_to_page(_mfn(mfn))->count_info & PGC_allocated));
- free_xenheap_pages(mfn_to_virt(mfn), 0);
+ free_xenheap_pages(mfn_to_directmap_virt(mfn), 0);
}
}
free_xenheap_pages(t_info, get_order_from_pages(t_info_pages));
@@ -529,7 +529,7 @@ static unsigned char *next_record(const struct t_buf *buf, uint32_t *next,
per_cpu_mfn_offset = t_info->mfn_offset[smp_processor_id()];
mfn_list = (uint32_t *)t_info;
mfn = mfn_list[per_cpu_mfn_offset + per_cpu_mfn_nr];
- this_page = mfn_to_virt(mfn);
+ this_page = (unsigned char *)mfn_to_directmap_virt(mfn);
if (per_cpu_mfn_nr + 1 >= opt_tbuf_size)
{
/* reached end of buffer? */
@@ -538,7 +538,7 @@ static unsigned char *next_record(const struct t_buf *buf, uint32_t *next,
else
{
mfn = mfn_list[per_cpu_mfn_offset + per_cpu_mfn_nr + 1];
- *next_page = mfn_to_virt(mfn);
+ *next_page = mfn_to_directmap_virt(mfn);
}
return this_page;
}
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 15/19] Rename maddr_to_virt() calls
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (13 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 14/19] Rename mfn_to_virt() calls Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 16/19] xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables() Elias El Yandouzi
` (4 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Elias El Yandouzi
Until directmap gets completely removed, we'd still need to
keep some calls to mmaddr_to_virt() for xenheap pages or when the
directmap is enabled.
Rename the macro to maddr_to_directmap_virt() to flag them and
prevent further use of maddr_to_virt().
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
diff --git a/xen/arch/x86/dmi_scan.c b/xen/arch/x86/dmi_scan.c
index 81f80c053a..ac016f3a04 100644
--- a/xen/arch/x86/dmi_scan.c
+++ b/xen/arch/x86/dmi_scan.c
@@ -277,7 +277,7 @@ const char *__init dmi_get_table(paddr_t *base, u32 *len)
return "SMBIOS";
}
} else {
- char __iomem *p = maddr_to_virt(0xF0000), *q;
+ char __iomem *p = maddr_to_directmap_virt(0xF0000), *q;
union {
struct dmi_eps dmi;
struct smbios3_eps smbios3;
@@ -364,7 +364,7 @@ static int __init dmi_iterate(void (*decode)(const struct dmi_header *))
dmi.size = 0;
smbios3.length = 0;
- p = maddr_to_virt(0xF0000);
+ p = maddr_to_directmap_virt(0xF0000);
for (q = p; q < p + 0x10000; q += 16) {
if (!dmi.size) {
memcpy_fromio(&dmi, q, sizeof(dmi));
diff --git a/xen/arch/x86/include/asm/mach-default/bios_ebda.h b/xen/arch/x86/include/asm/mach-default/bios_ebda.h
index 42de6b2a5b..8cfe53d1f2 100644
--- a/xen/arch/x86/include/asm/mach-default/bios_ebda.h
+++ b/xen/arch/x86/include/asm/mach-default/bios_ebda.h
@@ -7,7 +7,7 @@
*/
static inline unsigned int get_bios_ebda(void)
{
- unsigned int address = *(unsigned short *)maddr_to_virt(0x40E);
+ unsigned int address = *(unsigned short *)maddr_to_directmap_virt(0x40E);
address <<= 4;
return address; /* 0 means none */
}
diff --git a/xen/arch/x86/include/asm/page.h b/xen/arch/x86/include/asm/page.h
index c6891b52d4..bf7bf08ba4 100644
--- a/xen/arch/x86/include/asm/page.h
+++ b/xen/arch/x86/include/asm/page.h
@@ -240,11 +240,11 @@ void copy_page_sse2(void *to, const void *from);
/* Convert between Xen-heap virtual addresses and machine addresses. */
#define __pa(x) (virt_to_maddr(x))
-#define __va(x) (maddr_to_virt(x))
+#define __va(x) (maddr_to_directmap_virt(x))
/* Convert between Xen-heap virtual addresses and machine frame numbers. */
#define __virt_to_mfn(va) (virt_to_maddr(va) >> PAGE_SHIFT)
-#define __mfn_to_virt(mfn) (maddr_to_virt((paddr_t)(mfn) << PAGE_SHIFT))
+#define __mfn_to_virt(mfn) (maddr_to_directmap_virt((paddr_t)(mfn) << PAGE_SHIFT))
/* Convert between machine frame numbers and page-info structures. */
#define mfn_to_page(mfn) (frame_table + mfn_to_pdx(mfn))
@@ -270,7 +270,7 @@ void copy_page_sse2(void *to, const void *from);
#define virt_to_mfn(va) __virt_to_mfn(va)
#define mfn_to_directmap_virt(mfn) __mfn_to_virt(mfn)
#define virt_to_maddr(va) __virt_to_maddr((unsigned long)(va))
-#define maddr_to_virt(ma) __maddr_to_virt((unsigned long)(ma))
+#define maddr_to_directmap_virt(ma) __maddr_to_directmap_virt((unsigned long)(ma))
#define maddr_to_page(ma) __maddr_to_page(ma)
#define page_to_maddr(pg) __page_to_maddr(pg)
#define virt_to_page(va) __virt_to_page(va)
diff --git a/xen/arch/x86/include/asm/x86_64/page.h b/xen/arch/x86/include/asm/x86_64/page.h
index 19ca64d792..a95ebc088f 100644
--- a/xen/arch/x86/include/asm/x86_64/page.h
+++ b/xen/arch/x86/include/asm/x86_64/page.h
@@ -48,7 +48,7 @@ static inline unsigned long __virt_to_maddr(unsigned long va)
return xen_phys_start + va - XEN_VIRT_START;
}
-static inline void *__maddr_to_virt(unsigned long ma)
+static inline void *__maddr_to_directmap_virt(unsigned long ma)
{
/* Offset in the direct map, accounting for pdx compression */
unsigned long va_offset = maddr_to_directmapoff(ma);
diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
index d8ccab2449..69181b0abe 100644
--- a/xen/arch/x86/mpparse.c
+++ b/xen/arch/x86/mpparse.c
@@ -664,7 +664,7 @@ void __init get_smp_config (void)
static int __init smp_scan_config (unsigned long base, unsigned long length)
{
- unsigned int *bp = maddr_to_virt(base);
+ unsigned int *bp = maddr_to_directmap_virt(base);
struct intel_mp_floating *mpf;
Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index 39aed5845d..1b02e2b6d5 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -1764,7 +1764,7 @@ void __init efi_init_memory(void)
if ( map_pages_to_xen((unsigned long)mfn_to_directmap_virt(smfn),
_mfn(smfn), emfn - smfn, prot) == 0 )
desc->VirtualStart =
- (unsigned long)maddr_to_virt(desc->PhysicalStart);
+ (unsigned long)maddr_to_directmap_virt(desc->PhysicalStart);
else
printk(XENLOG_ERR "Could not map MFNs %#lx-%#lx\n",
smfn, emfn - 1);
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 16/19] xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables()
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (14 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 15/19] Rename maddr_to_virt() calls Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 17/19] xen/arm64: mm: Use per-pCPU page-tables Elias El Yandouzi
` (3 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
The arm32 version of init_secondary_pagetables() will soon be re-used
for arm64 as well where the root table starts at level 0 rather than level 1.
So rename 'first' to 'root'.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changelog in v2:
* Rebase
* Fix typo
diff --git a/xen/arch/arm/mmu/smpboot.c b/xen/arch/arm/mmu/smpboot.c
index 4ffc8254a4..e29b6f34f2 100644
--- a/xen/arch/arm/mmu/smpboot.c
+++ b/xen/arch/arm/mmu/smpboot.c
@@ -109,32 +109,32 @@ int prepare_secondary_mm(int cpu)
#else
int prepare_secondary_mm(int cpu)
{
- lpae_t *first;
+ lpae_t *root = alloc_xenheap_page();
first = alloc_xenheap_page(); /* root == first level on 32-bit 3-level trie */
- if ( !first )
+ if ( !root )
{
- printk("CPU%u: Unable to allocate the first page-table\n", cpu);
+ printk("CPU%u: Unable to allocate the root page-table\n", cpu);
return -ENOMEM;
}
/* Initialise root pagetable from root of boot tables */
- memcpy(first, per_cpu(xen_pgtable, 0), PAGE_SIZE);
- per_cpu(xen_pgtable, cpu) = first;
+ memcpy(root, per_cpu(xen_pgtable, 0), PAGE_SIZE);
+ per_cpu(xen_pgtable, cpu) = root;
if ( !init_domheap_mappings(cpu) )
{
printk("CPU%u: Unable to prepare the domheap page-tables\n", cpu);
per_cpu(xen_pgtable, cpu) = NULL;
- free_xenheap_page(first);
+ free_xenheap_page(root);
return -ENOMEM;
}
clear_boot_pagetables();
/* Set init_ttbr for this CPU coming up */
- set_init_ttbr(first);
+ set_init_ttbr(root);
return 0;
}
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 17/19] xen/arm64: mm: Use per-pCPU page-tables
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (15 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 16/19] xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables() Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 18/19] xen/arm64: Implement a mapcache for arm64 Elias El Yandouzi
` (2 subsequent siblings)
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
At the moment, on Arm64, every pCPU is sharing the same page-tables.
In a follow-up patch, we will allow the possibility to remove the
direct map and therefore it will be necessary to have a mapcache.
While we have plenty of spare virtual address space to reserve part
for each pCPU, it means that temporary mappings (e.g. guest memory)
could be accessible by every pCPU.
In order to increase our security posture, it would be better if
those mappings are only accessible by the pCPU doing the temporary
mapping.
In addition to that, a per-pCPU page-tables opens the way to have
per-domain mapping area.
Arm32 is already using per-pCPU page-tables so most of the code
can be re-used. Arm64 doesn't yet have support for the mapcache,
so a stub is provided (moved to its own header asm/domain_page.h).
Take the opportunity to fix a typo in a comment that is modified.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changelog since v1:
* Rebase
* Fix typoes
diff --git a/xen/arch/arm/arm64/mmu/mm.c b/xen/arch/arm/arm64/mmu/mm.c
index 293acb67e0..2ec1ffe1dc 100644
--- a/xen/arch/arm/arm64/mmu/mm.c
+++ b/xen/arch/arm/arm64/mmu/mm.c
@@ -76,6 +76,7 @@ static void __init prepare_runtime_identity_mapping(void)
paddr_t id_addr = virt_to_maddr(_start);
lpae_t pte;
DECLARE_OFFSETS(id_offsets, id_addr);
+ lpae_t *root = this_cpu(xen_pgtable);
if ( id_offsets[0] >= IDENTITY_MAPPING_AREA_NR_L0 )
panic("Cannot handle ID mapping above %uTB\n",
@@ -86,7 +87,7 @@ static void __init prepare_runtime_identity_mapping(void)
pte.pt.table = 1;
pte.pt.xn = 0;
- write_pte(&xen_pgtable[id_offsets[0]], pte);
+ write_pte(&root[id_offsets[0]], pte);
/* Link second ID table */
pte = pte_of_xenaddr((vaddr_t)xen_second_id);
diff --git a/xen/arch/arm/domain_page.c b/xen/arch/arm/domain_page.c
index 3a43601623..ac2a6d0332 100644
--- a/xen/arch/arm/domain_page.c
+++ b/xen/arch/arm/domain_page.c
@@ -3,6 +3,8 @@
#include <xen/pmap.h>
#include <xen/vmap.h>
+#include <asm/domain_page.h>
+
/* Override macros from asm/page.h to make them work with mfn_t */
#undef virt_to_mfn
#define virt_to_mfn(va) _mfn(__virt_to_mfn(va))
diff --git a/xen/arch/arm/include/asm/arm32/mm.h b/xen/arch/arm/include/asm/arm32/mm.h
index 856f2dbec4..87a315db01 100644
--- a/xen/arch/arm/include/asm/arm32/mm.h
+++ b/xen/arch/arm/include/asm/arm32/mm.h
@@ -1,12 +1,6 @@
#ifndef __ARM_ARM32_MM_H__
#define __ARM_ARM32_MM_H__
-#include <xen/percpu.h>
-
-#include <asm/lpae.h>
-
-DECLARE_PER_CPU(lpae_t *, xen_pgtable);
-
/*
* Only a limited amount of RAM, called xenheap, is always mapped on ARM32.
* For convenience always return false.
@@ -16,8 +10,6 @@ static inline bool arch_mfns_in_directmap(unsigned long mfn, unsigned long nr)
return false;
}
-bool init_domheap_mappings(unsigned int cpu);
-
static inline void arch_setup_page_tables(void)
{
}
diff --git a/xen/arch/arm/include/asm/domain_page.h b/xen/arch/arm/include/asm/domain_page.h
new file mode 100644
index 0000000000..e9f52685e2
--- /dev/null
+++ b/xen/arch/arm/include/asm/domain_page.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_ARM_DOMAIN_PAGE_H__
+#define __ASM_ARM_DOMAIN_PAGE_H__
+
+#ifdef CONFIG_ARCH_MAP_DOMAIN_PAGE
+bool init_domheap_mappings(unsigned int cpu);
+#else
+static inline bool init_domheap_mappings(unsigned int cpu)
+{
+ return true;
+}
+#endif
+
+#endif /* __ASM_ARM_DOMAIN_PAGE_H__ */
diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
index 2bca3f9e87..60e0122cba 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -2,6 +2,9 @@
#define __ARCH_ARM_MM__
#include <xen/kernel.h>
+#include <xen/percpu.h>
+
+#include <asm/lpae.h>
#include <asm/page.h>
#include <public/xen.h>
#include <xen/pdx.h>
diff --git a/xen/arch/arm/include/asm/mmu/mm.h b/xen/arch/arm/include/asm/mmu/mm.h
index c5e03a66bf..c03c3a51e4 100644
--- a/xen/arch/arm/include/asm/mmu/mm.h
+++ b/xen/arch/arm/include/asm/mmu/mm.h
@@ -2,6 +2,8 @@
#ifndef __ARM_MMU_MM_H__
#define __ARM_MMU_MM_H__
+DECLARE_PER_CPU(lpae_t *, xen_pgtable);
+
/* Non-boot CPUs use this to find the correct pagetables. */
extern uint64_t init_ttbr;
diff --git a/xen/arch/arm/mmu/pt.c b/xen/arch/arm/mmu/pt.c
index da28d669e7..1ed1a53ab1 100644
--- a/xen/arch/arm/mmu/pt.c
+++ b/xen/arch/arm/mmu/pt.c
@@ -607,9 +607,9 @@ static int xen_pt_update(unsigned long virt,
unsigned long left = nr_mfns;
/*
- * For arm32, page-tables are different on each CPUs. Yet, they share
- * some common mappings. It is assumed that only common mappings
- * will be modified with this function.
+ * Page-tables are different on each CPU. Yet, they share some common
+ * mappings. It is assumed that only common mappings will be modified
+ * with this function.
*
* XXX: Add a check.
*/
diff --git a/xen/arch/arm/mmu/setup.c b/xen/arch/arm/mmu/setup.c
index f4bb424c3c..7b981456e6 100644
--- a/xen/arch/arm/mmu/setup.c
+++ b/xen/arch/arm/mmu/setup.c
@@ -28,17 +28,15 @@
* PCPUs.
*/
-#ifdef CONFIG_ARM_64
-DEFINE_PAGE_TABLE(xen_pgtable);
-static DEFINE_PAGE_TABLE(xen_first);
-#define THIS_CPU_PGTABLE xen_pgtable
-#else
/* Per-CPU pagetable pages */
/* xen_pgtable == root of the trie (zeroeth level on 64-bit, first on 32-bit) */
DEFINE_PER_CPU(lpae_t *, xen_pgtable);
#define THIS_CPU_PGTABLE this_cpu(xen_pgtable)
/* Root of the trie for cpu0, other CPU's PTs are dynamically allocated */
static DEFINE_PAGE_TABLE(cpu0_pgtable);
+
+#ifdef CONFIG_ARM_64
+static DEFINE_PAGE_TABLE(xen_first);
#endif
/* Common pagetable leaves */
@@ -228,19 +226,22 @@ void __init setup_pagetables(unsigned long boot_phys_offset)
lpae_t pte, *p;
int i;
+ p = cpu0_pgtable;
+
phys_offset = boot_phys_offset;
+ /* arch_setup_page_tables() may need to access the root page-tables. */
+ per_cpu(xen_pgtable, 0) = cpu0_pgtable;
+
arch_setup_page_tables();
#ifdef CONFIG_ARM_64
pte = pte_of_xenaddr((uintptr_t)xen_first);
pte.pt.table = 1;
pte.pt.xn = 0;
- xen_pgtable[zeroeth_table_offset(XEN_VIRT_START)] = pte;
+ p[zeroeth_table_offset(XEN_VIRT_START)] = pte;
- p = (void *) xen_first;
-#else
- p = (void *) cpu0_pgtable;
+ p = xen_first;
#endif
/* Map xen second level page-table */
@@ -283,19 +284,11 @@ void __init setup_pagetables(unsigned long boot_phys_offset)
pte.pt.table = 1;
xen_second[second_table_offset(FIXMAP_ADDR(0))] = pte;
-#ifdef CONFIG_ARM_64
- ttbr = (uintptr_t) xen_pgtable + phys_offset;
-#else
ttbr = (uintptr_t) cpu0_pgtable + phys_offset;
-#endif
switch_ttbr(ttbr);
xen_pt_enforce_wnx();
-
-#ifdef CONFIG_ARM_32
- per_cpu(xen_pgtable, 0) = cpu0_pgtable;
-#endif
}
void *__init arch_vmap_virt_end(void)
diff --git a/xen/arch/arm/mmu/smpboot.c b/xen/arch/arm/mmu/smpboot.c
index e29b6f34f2..eb51d6aae9 100644
--- a/xen/arch/arm/mmu/smpboot.c
+++ b/xen/arch/arm/mmu/smpboot.c
@@ -7,6 +7,7 @@
#include <xen/domain_page.h>
+#include <asm/domain_page.h>
#include <asm/setup.h>
/* Override macros from asm/page.h to make them work with mfn_t */
@@ -93,20 +94,6 @@ static void set_init_ttbr(lpae_t *root)
unmap_domain_page(ptr);
}
-#ifdef CONFIG_ARM_64
-int prepare_secondary_mm(int cpu)
-{
- clear_boot_pagetables();
-
- /*
- * Set init_ttbr for this CPU coming up. All CPUs share a single setof
- * pagetables, but rewrite it each time for consistency with 32 bit.
- */
- set_init_ttbr(xen_pgtable);
-
- return 0;
-}
-#else
int prepare_secondary_mm(int cpu)
{
lpae_t *root = alloc_xenheap_page();
@@ -138,7 +125,6 @@ int prepare_secondary_mm(int cpu)
return 0;
}
-#endif
/*
* Local variables:
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index d242674381..d15987d6ea 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -42,6 +42,7 @@
#include <asm/gic.h>
#include <asm/cpuerrata.h>
#include <asm/cpufeature.h>
+#include <asm/domain_page.h>
#include <asm/platform.h>
#include <asm/procinfo.h>
#include <asm/setup.h>
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 18/19] xen/arm64: Implement a mapcache for arm64
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (16 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 17/19] xen/arm64: mm: Use per-pCPU page-tables Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 11:11 ` [PATCH V3 19/19] xen/arm64: Allow the admin to enable/disable the directmap Elias El Yandouzi
2024-05-13 12:52 ` [PATCH V3 00/19] Remove " Roger Pau Monné
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
At the moment, on arm64, map_domain_page() is implemented using
virt_to_mfn(). Therefore it is relying on the directmap.
In a follow-up patch, we will allow the admin to remove the directmap.
Therefore we want to implement a mapcache.
Thanksfully there is already one for arm32. So select ARCH_ARM_DOMAIN_PAGE
and add the necessary boiler plate to support 64-bit:
- The page-table start at level 0, so we need to allocate the level
1 page-table
- map_domain_page() should check if the page is in the directmap. If
yes, then use virt_to_mfn() to limit the performance impact
when the directmap is still enabled (this will be selectable
on the command line).
Take the opportunity to replace first_table_offset(...) with offsets[...].
Note that, so far, arch_mfns_in_directmap() always return true on
arm64. So the mapcache is not yet used. This will change in a
follow-up patch.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
There are a few TODOs:
- It is becoming more critical to fix the mapcache
implementation (this is not compliant with the Arm Arm)
- Evaluate the performance
diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
index 21d03d9f44..0462960fc7 100644
--- a/xen/arch/arm/Kconfig
+++ b/xen/arch/arm/Kconfig
@@ -1,7 +1,6 @@
config ARM_32
def_bool y
depends on "$(ARCH)" = "arm32"
- select ARCH_MAP_DOMAIN_PAGE
config ARM_64
def_bool y
diff --git a/xen/arch/arm/arm64/mmu/mm.c b/xen/arch/arm/arm64/mmu/mm.c
index 2ec1ffe1dc..826864d25d 100644
--- a/xen/arch/arm/arm64/mmu/mm.c
+++ b/xen/arch/arm/arm64/mmu/mm.c
@@ -4,6 +4,7 @@
#include <xen/mm.h>
#include <xen/pfn.h>
+#include <asm/domain_page.h>
#include <asm/setup.h>
#include <asm/static-memory.h>
#include <asm/static-shmem.h>
@@ -237,6 +238,14 @@ void __init setup_mm(void)
setup_frametable_mappings(ram_start, ram_end);
max_page = PFN_DOWN(ram_end);
+ /*
+ * The allocators may need to use map_domain_page() (such as for
+ * scrubbing pages). So we need to prepare the domheap area first.
+ */
+ if ( !init_domheap_mappings(smp_processor_id()) )
+ panic("CPU%u: Unable to prepare the domheap page-tables\n",
+ smp_processor_id());
+
init_staticmem_pages();
init_sharedmem_pages();
}
diff --git a/xen/arch/arm/domain_page.c b/xen/arch/arm/domain_page.c
index ac2a6d0332..0f6ba48892 100644
--- a/xen/arch/arm/domain_page.c
+++ b/xen/arch/arm/domain_page.c
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include <xen/domain_page.h>
#include <xen/mm.h>
#include <xen/pmap.h>
#include <xen/vmap.h>
@@ -8,6 +9,8 @@
/* Override macros from asm/page.h to make them work with mfn_t */
#undef virt_to_mfn
#define virt_to_mfn(va) _mfn(__virt_to_mfn(va))
+#undef mfn_to_virt
+#define mfn_to_virt(va) __mfn_to_virt(mfn_x(mfn))
/* cpu0's domheap page tables */
static DEFINE_PAGE_TABLES(cpu0_dommap, DOMHEAP_SECOND_PAGES);
@@ -31,13 +34,30 @@ bool init_domheap_mappings(unsigned int cpu)
{
unsigned int order = get_order_from_pages(DOMHEAP_SECOND_PAGES);
lpae_t *root = per_cpu(xen_pgtable, cpu);
+ lpae_t *first;
unsigned int i, first_idx;
lpae_t *domheap;
mfn_t mfn;
+ /* Convenience aliases */
+ DECLARE_OFFSETS(offsets, DOMHEAP_VIRT_START);
+
ASSERT(root);
ASSERT(!per_cpu(xen_dommap, cpu));
+ /*
+ * On Arm64, the root is at level 0. Therefore we need an extra step
+ * to allocate the first level page-table.
+ */
+#ifdef CONFIG_ARM_64
+ if ( create_xen_table(&root[offsets[0]]) )
+ return false;
+
+ first = xen_map_table(lpae_get_mfn(root[offsets[0]]));
+#else
+ first = root;
+#endif
+
/*
* The domheap for cpu0 is initialized before the heap is initialized.
* So we need to use pre-allocated pages.
@@ -58,16 +78,20 @@ bool init_domheap_mappings(unsigned int cpu)
* domheap mapping pages.
*/
mfn = virt_to_mfn(domheap);
- first_idx = first_table_offset(DOMHEAP_VIRT_START);
+ first_idx = offsets[1];
for ( i = 0; i < DOMHEAP_SECOND_PAGES; i++ )
{
lpae_t pte = mfn_to_xen_entry(mfn_add(mfn, i), MT_NORMAL);
pte.pt.table = 1;
- write_pte(&root[first_idx + i], pte);
+ write_pte(&first[first_idx + i], pte);
}
per_cpu(xen_dommap, cpu) = domheap;
+#ifdef CONFIG_ARM_64
+ xen_unmap_table(first);
+#endif
+
return true;
}
@@ -91,6 +115,10 @@ void *map_domain_page(mfn_t mfn)
lpae_t pte;
int i, slot;
+ /* Bypass the mapcache if the page is in the directmap */
+ if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
+ return mfn_to_virt(mfn);
+
local_irq_save(flags);
/* The map is laid out as an open-addressed hash table where each
@@ -153,13 +181,25 @@ void *map_domain_page(mfn_t mfn)
/* Release a mapping taken with map_domain_page() */
void unmap_domain_page(const void *ptr)
{
+ unsigned long va = (unsigned long)ptr;
unsigned long flags;
lpae_t *map = this_cpu(xen_dommap);
- int slot = ((unsigned long)ptr - DOMHEAP_VIRT_START) >> SECOND_SHIFT;
+ unsigned int slot;
+
+ /* Below we assume that the domheap area doesn't start at 0 */
+ BUILD_BUG_ON(DOMHEAP_VIRT_START == 0);
- if ( !ptr )
+ /*
+ * map_domain_page() may not have mapped anything if the address
+ * is part of the directmap. So ignore anything outside of the
+ * domheap.
+ */
+ if ( (va < DOMHEAP_VIRT_START) ||
+ ((va - DOMHEAP_VIRT_START) >= DOMHEAP_VIRT_SIZE) )
return;
+ slot = (va - DOMHEAP_VIRT_START) >> SECOND_SHIFT;
+
local_irq_save(flags);
ASSERT(slot >= 0 && slot < DOMHEAP_ENTRIES);
diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
index 60e0122cba..610dfa0466 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -432,6 +432,11 @@ static inline void page_set_xenheap_gfn(struct page_info *p, gfn_t gfn)
} while ( (y = cmpxchg(&p->u.inuse.type_info, x, nx)) != x );
}
+/* Helpers to allocate, map and unmap a Xen page-table */
+int create_xen_table(lpae_t *entry);
+lpae_t *xen_map_table(mfn_t mfn);
+void xen_unmap_table(const lpae_t *table);
+
#endif /* __ARCH_ARM_MM__ */
/*
* Local variables:
diff --git a/xen/arch/arm/include/asm/mmu/layout.h b/xen/arch/arm/include/asm/mmu/layout.h
index a3b546465b..c549420e8b 100644
--- a/xen/arch/arm/include/asm/mmu/layout.h
+++ b/xen/arch/arm/include/asm/mmu/layout.h
@@ -35,9 +35,13 @@
*
* 32G - 64G Frametable: 56 bytes per page for 2TB of RAM
*
- * 0x00000a8000000000 - 0x00007fffffffffff (512GB+117TB, L0 slots [21..255])
+ * 0x00000a8000000000 - 0x00007f7fffffffff (117TB, L0 slots [21..254])
* Unused
*
+ * 0x00007f8000000000 - 0x00007fffffffffff (512GB, L0 slot [255])
+ * (Relative offsets)
+ * 0 - 2G Domheap: on-demand-mapped
+ *
* 0x0000800000000000 - 0x000084ffffffffff (5TB, L0 slots [256..265])
* 1:1 mapping of RAM
*
@@ -130,6 +134,13 @@
#define FRAMETABLE_SIZE GB(32)
#define FRAMETABLE_NR (FRAMETABLE_SIZE / sizeof(*frame_table))
+#define DOMHEAP_VIRT_START SLOT0(255)
+#define DOMHEAP_VIRT_SIZE GB(2)
+
+#define DOMHEAP_ENTRIES 1024 /* 1024 2MB mapping slots */
+/* Number of domheap pagetable pages required at the second level (2MB mappings) */
+#define DOMHEAP_SECOND_PAGES (DOMHEAP_VIRT_SIZE >> FIRST_SHIFT)
+
#define DIRECTMAP_VIRT_START SLOT0(256)
#define DIRECTMAP_SIZE (SLOT0_ENTRY_SIZE * (266 - 256))
#define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + DIRECTMAP_SIZE - 1)
diff --git a/xen/arch/arm/mmu/pt.c b/xen/arch/arm/mmu/pt.c
index 1ed1a53ab1..da33c6c52e 100644
--- a/xen/arch/arm/mmu/pt.c
+++ b/xen/arch/arm/mmu/pt.c
@@ -33,7 +33,7 @@ mm_printk(const char *fmt, ...) {}
#define HYP_PT_ROOT_LEVEL 1
#endif
-static lpae_t *xen_map_table(mfn_t mfn)
+lpae_t *xen_map_table(mfn_t mfn)
{
/*
* During early boot, map_domain_page() may be unusable. Use the
@@ -45,7 +45,7 @@ static lpae_t *xen_map_table(mfn_t mfn)
return map_domain_page(mfn);
}
-static void xen_unmap_table(const lpae_t *table)
+void xen_unmap_table(const lpae_t *table)
{
/*
* During early boot, xen_map_table() will not use map_domain_page()
@@ -228,7 +228,7 @@ void *ioremap(paddr_t pa, size_t len)
return ioremap_attr(pa, len, PAGE_HYPERVISOR_NOCACHE);
}
-static int create_xen_table(lpae_t *entry)
+int create_xen_table(lpae_t *entry)
{
mfn_t mfn;
void *p;
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH V3 19/19] xen/arm64: Allow the admin to enable/disable the directmap
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (17 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 18/19] xen/arm64: Implement a mapcache for arm64 Elias El Yandouzi
@ 2024-05-13 11:11 ` Elias El Yandouzi
2024-05-13 12:52 ` [PATCH V3 00/19] Remove " Roger Pau Monné
19 siblings, 0 replies; 26+ messages in thread
From: Elias El Yandouzi @ 2024-05-13 11:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
Implement the same command line option as x86 to enable/disable the
directmap. By default this is kept enabled.
Also modify setup_directmap_mappings() to populate the L0 entries
related to the directmap area.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in v2:
* Rely on the Kconfig option to enable Secret Hiding on Arm64
* Use generic helper instead of arch_has_directmap()
diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index 743d343ffa..cccd5e4282 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -799,7 +799,7 @@ that enabling this option cannot guarantee anything beyond what underlying
hardware guarantees (with, where available and known to Xen, respective
tweaks applied).
-### directmap (x86)
+### directmap (arm64, x86)
> `= <boolean>`
> Default: `true`
diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
index 0462960fc7..1cb495e334 100644
--- a/xen/arch/arm/Kconfig
+++ b/xen/arch/arm/Kconfig
@@ -7,6 +7,7 @@ config ARM_64
depends on !ARM_32
select 64BIT
select HAS_FAST_MULTIPLY
+ select HAS_SECRET_HIDING
config ARM
def_bool y
diff --git a/xen/arch/arm/arm64/mmu/mm.c b/xen/arch/arm/arm64/mmu/mm.c
index 826864d25d..81115cce51 100644
--- a/xen/arch/arm/arm64/mmu/mm.c
+++ b/xen/arch/arm/arm64/mmu/mm.c
@@ -158,16 +158,27 @@ void __init switch_ttbr(uint64_t ttbr)
update_identity_mapping(false);
}
-/* Map the region in the directmap area. */
+/*
+ * This either populate a valid fdirect map, or allocates empty L1 tables
+ * and creates the L0 entries for the given region in the direct map
+ * depending on has_directmap().
+ *
+ * When directmap=no, we still need to populate empty L1 tables in the
+ * directmap region. The reason is that the root page-table (i.e. L0)
+ * is per-CPU and secondary CPUs will initialize their root page-table
+ * based on the pCPU0 one. So L0 entries will be shared if they are
+ * pre-populated. We also rely on the fact that L1 tables are never
+ * freed.
+ */
static void __init setup_directmap_mappings(unsigned long base_mfn,
unsigned long nr_mfns)
{
+ unsigned long mfn_gb = base_mfn & ~((FIRST_SIZE >> PAGE_SHIFT) - 1);
int rc;
/* First call sets the directmap physical and virtual offset. */
if ( mfn_eq(directmap_mfn_start, INVALID_MFN) )
{
- unsigned long mfn_gb = base_mfn & ~((FIRST_SIZE >> PAGE_SHIFT) - 1);
directmap_mfn_start = _mfn(base_mfn);
directmap_base_pdx = mfn_to_pdx(_mfn(base_mfn));
@@ -188,6 +199,24 @@ static void __init setup_directmap_mappings(unsigned long base_mfn,
panic("cannot add directmap mapping at %lx below heap start %lx\n",
base_mfn, mfn_x(directmap_mfn_start));
+ if ( !has_directmap() )
+ {
+ vaddr_t vaddr = (vaddr_t)__mfn_to_virt(base_mfn);
+ lpae_t *root = this_cpu(xen_pgtable);
+ unsigned int i, slot;
+
+ slot = first_table_offset(vaddr);
+ nr_mfns += base_mfn - mfn_gb;
+ for ( i = 0; i < nr_mfns; i += BIT(XEN_PT_LEVEL_ORDER(0), UL), slot++ )
+ {
+ lpae_t *entry = &root[slot];
+
+ if ( !lpae_is_valid(*entry) && !create_xen_table(entry) )
+ panic("Unable to populate zeroeth slot %u\n", slot);
+ }
+ return;
+ }
+
rc = map_pages_to_xen((vaddr_t)__mfn_to_virt(base_mfn),
_mfn(base_mfn), nr_mfns,
PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
diff --git a/xen/arch/arm/include/asm/arm64/mm.h b/xen/arch/arm/include/asm/arm64/mm.h
index e0bd23a6ed..5888f29159 100644
--- a/xen/arch/arm/include/asm/arm64/mm.h
+++ b/xen/arch/arm/include/asm/arm64/mm.h
@@ -3,13 +3,10 @@
extern DEFINE_PAGE_TABLE(xen_pgtable);
-/*
- * On ARM64, all the RAM is currently direct mapped in Xen.
- * Hence return always true.
- */
+/* On Arm64, the user can chose whether all the RAM is directmap. */
static inline bool arch_mfns_in_directmap(unsigned long mfn, unsigned long nr)
{
- return true;
+ return has_directmap();
}
void arch_setup_page_tables(void);
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index def939172c..0f3ffab6ba 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -12,6 +12,7 @@
#include <xen/grant_table.h>
#include <xen/guest_access.h>
#include <xen/mm.h>
+#include <xen/param.h>
#include <xsm/xsm.h>
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index d15987d6ea..6b06e2f4f5 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -778,6 +778,7 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
cmdline_parse(cmdline);
setup_mm();
+ printk("Booting with directmap %s\n", has_directmap() ? "on" : "off");
vm_init();
--
2.40.1
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH V3 00/19] Remove the directmap
2024-05-13 11:10 [PATCH V3 00/19] Remove the directmap Elias El Yandouzi
` (18 preceding siblings ...)
2024-05-13 11:11 ` [PATCH V3 19/19] xen/arm64: Allow the admin to enable/disable the directmap Elias El Yandouzi
@ 2024-05-13 12:52 ` Roger Pau Monné
19 siblings, 0 replies; 26+ messages in thread
From: Roger Pau Monné @ 2024-05-13 12:52 UTC (permalink / raw)
To: Elias El Yandouzi
Cc: xen-devel, julien, pdurrant, dwmw, Andrew Cooper, George Dunlap,
Stefano Stabellini, Wei Liu, Bertrand Marquis, Michal Orzel,
Volodymyr Babchuk, Roger Pau Monné, Lukasz Hawrylko,
Daniel P. Smith, Mateusz Mówka
You seem to have forgotten to add the maintainers on Cc for the
patches. Adding them here for reference.
Regards, Roger.
On Mon, May 13, 2024 at 11:10:58AM +0000, Elias El Yandouzi wrote:
> Hi all,
>
> A few years ago, Wei Liu implemented a PoC to remove the directmap
> from Xen. The last version was sent by Hongyan Xia [1].
>
> I will start with thanking both Wei and Hongyan for the initial work
> to upstream the feature. A lot of patches already went in and this is
> the last few patches missing to effectively enable the feature.
>
> === What is the directmap? ===
>
> At the moment, on both arm64 and x86, most of the RAM is mapped
> in Xen address space. This means that domain memory is easily
> accessible in Xen.
>
> === Why do we want to remove the directmap? ===
>
> (Summarizing my understanding of the previous discussion)
>
> Speculation attacks (like Spectre SP1) rely on loading piece of memory
> in the cache. If the region is not mapped then it can't be loaded.
>
> So removing reducing the amount of memory mapped in Xen will also
> reduce the surface attack.
>
> === What's the performance impact? ===
>
> As the guest memory is not always mapped, then the cost of mapping
> will increase. I haven't done the numbers with this new version, but
> some measurement were provided in the previous version for x86.
>
> === Improvement possible ===
>
> The known area to improve on x86 are:
> * Mapcache: There was a patch sent by Hongyan:
> https://lore.kernel.org/xen-devel/4058e92ce21627731c49b588a95809dc0affd83a.1581015491.git.hongyxia@amazon.com/
> * EPT: At the moment an guest page-tabel walk requires about 20 map/unmap.
> This will have an very high impact on the performance. We need to decide
> whether keep the EPT always mapped is a problem
>
> The original series didn't have support for Arm64. But as there were
> some interest, I have provided a PoC.
>
> There are more extra work for Arm64:
> * The mapcache is quite simple. We would investigate the performance
> * The mapcache should be made compliant to the Arm Arm (this is now
> more critical).
> * We will likely have the same problem as for the EPT.
> * We have no support for merging table to a superpage, neither
> free empty page-tables. (See more below)
>
> === Implementation ===
>
> The subject is probably a misnomer. The directmap is still present but
> the RAM is not mapped by default. Instead, the region will still be used
> to map pages allocate via alloc_xenheap_pages().
>
> The advantage is the solution is simple (so IHMO good enough for been
> merged as a tech preview). The disadvantage is the page allocator is not
> trying to keep all the xenheap pages together. So we may end up to have
> an increase of page table usage.
>
> In the longer term, we should consider to remove the direct map
> completely and switch to vmap(). The main problem with this approach
> is it is frequent to use mfn_to_virt() in the code. So we would need
> to cache the mapping (maybe in the struct page_info).
>
> === Why arm32 is not covered? ===
>
> On Arm32, the domheap and xenheap is always separated. So by design
> the guest memory is not mapped by default.
>
> At this stage, it seems unnecessary to have to map/unmap xenheap pages
> every time they are allocated.
>
> === Why not using a separate domheap and xenheap? ===
>
> While a separate xenheap/domheap reduce the page-table usage (all
> xenheap pages are contiguous and could be always mapped), it is also
> currently less scalable because the split is fixed at boot time (XXX:
> Can this be dynamic?).
>
> === Future of secret-free hypervisor ===
>
> There are some information in an e-mail from Andrew a few years ago:
>
> https://lore.kernel.org/xen-devel/e3219697-0759-39fc-2486-715cdec1ca9e@citrix.com/
>
> Cheers,
>
> [1] https://lore.kernel.org/xen-devel/cover.1588278317.git.hongyxia@amazon.com/
>
> *** BLURB HERE ***
>
> Elias El Yandouzi (3):
> xen/x86: Add build assertion for fixmap entries
> Rename mfn_to_virt() calls
> Rename maddr_to_virt() calls
>
> Hongyan Xia (9):
> x86: Create per-domain mapping of guest_root_pt
> x86/pv: Rewrite how building PV dom0 handles domheap mappings
> x86/mapcache: Initialise the mapcache for the idle domain
> x86: Add a boot option to enable and disable the direct map
> x86/domain_page: Remove the fast paths when mfn is not in the
> directmap
> xen/page_alloc: Add a path for xenheap when there is no direct map
> x86/setup: Leave early boot slightly earlier
> x86/setup: vmap heap nodes when they are outside the direct map
> x86/setup: Do not create valid mappings when directmap=no
>
> Julien Grall (5):
> xen/x86: Add support for the PMAP
> xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables()
> xen/arm64: mm: Use per-pCPU page-tables
> xen/arm64: Implement a mapcache for arm64
> xen/arm64: Allow the admin to enable/disable the directmap
>
> Wei Liu (2):
> x86/pv: Domheap pages should be mapped while relocating initrd
> x86: Lift mapcache variable to the arch level
>
> docs/misc/xen-command-line.pandoc | 12 +++
> xen/arch/arm/Kconfig | 2 +-
> xen/arch/arm/arm64/mmu/mm.c | 45 ++++++++-
> xen/arch/arm/domain_page.c | 50 +++++++++-
> xen/arch/arm/include/asm/arm32/mm.h | 8 --
> xen/arch/arm/include/asm/arm64/mm.h | 7 +-
> xen/arch/arm/include/asm/domain_page.h | 13 +++
> xen/arch/arm/include/asm/mm.h | 9 ++
> xen/arch/arm/include/asm/mmu/layout.h | 13 ++-
> xen/arch/arm/include/asm/mmu/mm.h | 2 +
> xen/arch/arm/mm.c | 1 +
> xen/arch/arm/mmu/pt.c | 12 +--
> xen/arch/arm/mmu/setup.c | 27 ++---
> xen/arch/arm/mmu/smpboot.c | 30 ++----
> xen/arch/arm/setup.c | 2 +
> xen/arch/x86/Kconfig | 2 +
> xen/arch/x86/dmi_scan.c | 4 +-
> xen/arch/x86/domain.c | 12 ++-
> xen/arch/x86/domain_page.c | 74 +++++++++-----
> xen/arch/x86/hvm/dom0_build.c | 4 +-
> xen/arch/x86/include/asm/config.h | 10 +-
> xen/arch/x86/include/asm/domain.h | 13 +--
> xen/arch/x86/include/asm/fixmap.h | 9 ++
> .../x86/include/asm/mach-default/bios_ebda.h | 2 +-
> xen/arch/x86/include/asm/mm.h | 6 ++
> xen/arch/x86/include/asm/page.h | 8 +-
> xen/arch/x86/include/asm/pmap.h | 25 +++++
> xen/arch/x86/include/asm/x86_64/page.h | 2 +-
> xen/arch/x86/mm.c | 18 +++-
> xen/arch/x86/mpparse.c | 2 +-
> xen/arch/x86/pv/dom0_build.c | 70 +++++++++----
> xen/arch/x86/pv/domain.c | 36 +++++++
> xen/arch/x86/setup.c | 98 ++++++++++++++++---
> xen/arch/x86/tboot.c | 2 +-
> xen/arch/x86/x86_64/asm-offsets.c | 1 +
> xen/arch/x86/x86_64/entry.S | 8 ++
> xen/arch/x86/x86_64/mm.c | 26 +++--
> xen/common/Kconfig | 17 ++++
> xen/common/efi/boot.c | 23 +++--
> xen/common/page_alloc.c | 89 ++++++++++++++---
> xen/common/trace.c | 8 +-
> xen/include/xen/mm.h | 7 ++
> 42 files changed, 630 insertions(+), 179 deletions(-)
> create mode 100644 xen/arch/arm/include/asm/domain_page.h
> create mode 100644 xen/arch/x86/include/asm/pmap.h
>
> --
> 2.40.1
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread