* [PATCH V4 01/15] x86: Create per-domain mapping for guest_root_pt
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-12-16 16:01 ` Alejandro Vallejo
2024-11-11 13:11 ` [PATCH V4 02/15] x86/pv: Use copy_domain_page() to manage domheap pages during initrd relocation Elias El Yandouzi
` (14 subsequent siblings)
15 siblings, 1 reply; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
This patch introduces a per-domain mapping for the `guest_root_pt` in PV
guests as part of the effort to remove the direct map in Xen.
For the time being, the `root_pgt` is not mapped or unmapped, as it remains
a Xenheap page. This will be addressed in subsequent patches.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V4:
* Fix over-allocation issue
* Update the mappings when switching from kernel to user-mode
Changes in V3:
* Rename SHADOW_ROOT
* Haven't addressed the potentially over-allocation issue as I don't get it
Changes in V2:
* Rework the shadow perdomain mapping solution in the follow-up patches
Changes since Hongyan's version:
* Remove the final dot in the commit title
diff --git a/xen/arch/x86/include/asm/config.h b/xen/arch/x86/include/asm/config.h
index f8a5a4913b07..bd360ec4141e 100644
--- a/xen/arch/x86/include/asm/config.h
+++ b/xen/arch/x86/include/asm/config.h
@@ -174,7 +174,7 @@
/* Slot 260: per-domain mappings (including map cache). */
#define PERDOMAIN_VIRT_START (PML4_ADDR(260))
#define PERDOMAIN_SLOT_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
-#define PERDOMAIN_SLOTS 3
+#define PERDOMAIN_SLOTS 4
#define PERDOMAIN_VIRT_SLOT(s) (PERDOMAIN_VIRT_START + (s) * \
(PERDOMAIN_SLOT_MBYTES << 20))
/* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
@@ -288,6 +288,14 @@ extern unsigned long xen_phys_start;
#define ARG_XLAT_START(v) \
(ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
+/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
+#define PV_ROOT_PT_MAPPING_VIRT_START PERDOMAIN_VIRT_SLOT(3)
+#define PV_ROOT_PT_MAPPING_ENTRIES MAX_VIRT_CPUS
+
+/* The address of a particular VCPU's PV_ROOT_PT */
+#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
+ (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
+
#define ELFSIZE 64
#define ARCH_CRASH_SAVE_VMCOREINFO
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index b79d6badd71c..478ce41ad8ca 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -272,6 +272,7 @@ struct time_scale {
struct pv_domain
{
l1_pgentry_t **gdt_ldt_l1tab;
+ l1_pgentry_t **root_pt_l1tab;
atomic_t nr_l4_pages;
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index d537a799bced..a152e21bb086 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -516,6 +516,13 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
v->arch.cr3 |= get_pcid_bits(v, false);
}
+#define pv_root_pt_idx(v) \
+ ((v)->vcpu_id >> PAGETABLE_ORDER)
+
+#define pv_root_pt_pte(v) \
+ ((v)->domain->arch.pv.root_pt_l1tab[pv_root_pt_idx(v)] + \
+ ((v)->vcpu_id & (L1_PAGETABLE_ENTRIES - 1)))
+
void write_ptbase(struct vcpu *v)
{
const struct domain *d = v->domain;
@@ -527,11 +534,16 @@ void write_ptbase(struct vcpu *v)
if ( is_pv_domain(d) && d->arch.pv.xpti )
{
+ mfn_t guest_root_pt = _mfn(MASK_EXTR(v->arch.cr3, X86_CR3_ADDR_MASK));
+ l1_pgentry_t *pte = pv_root_pt_pte(v);
+
cpu_info->root_pgt_changed = true;
cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
if ( new_cr4 & X86_CR4_PCIDE )
cpu_info->pv_cr3 |= get_pcid_bits(v, true);
switch_cr3_cr4(v->arch.cr3, new_cr4);
+
+ l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RO));
}
else
{
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index d5a8564c1cbe..1a1c999743ac 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -289,6 +289,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
1U << GDT_LDT_VCPU_SHIFT);
}
+static int pv_create_root_pt_l1tab(const struct vcpu *v)
+{
+ return create_perdomain_mapping(v->domain,
+ PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
+ 1, v->domain->arch.pv.root_pt_l1tab,
+ NULL);
+}
+
+static void pv_destroy_root_pt_l1tab(const struct vcpu *v)
+
+{
+ destroy_perdomain_mapping(v->domain,
+ PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v), 1);
+}
+
void pv_vcpu_destroy(struct vcpu *v)
{
if ( is_pv_32bit_vcpu(v) )
@@ -298,6 +313,7 @@ void pv_vcpu_destroy(struct vcpu *v)
}
pv_destroy_gdt_ldt_l1tab(v);
+ pv_destroy_root_pt_l1tab(v);
XFREE(v->arch.pv.trap_ctxt);
}
@@ -312,6 +328,13 @@ int pv_vcpu_initialise(struct vcpu *v)
if ( rc )
return rc;
+ if ( v->domain->arch.pv.xpti )
+ {
+ rc = pv_create_root_pt_l1tab(v);
+ if ( rc )
+ goto done;
+ }
+
BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
PAGE_SIZE);
v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
@@ -347,10 +370,12 @@ void pv_domain_destroy(struct domain *d)
destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
+ destroy_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START, d->max_vcpus);
XFREE(d->arch.pv.cpuidmasks);
FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
+ FREE_XENHEAP_PAGE(d->arch.pv.root_pt_l1tab);
}
void noreturn cf_check continue_pv_domain(void);
@@ -382,8 +407,22 @@ int pv_domain_initialise(struct domain *d)
if ( rc )
goto fail;
+ rc = create_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START,
+ d->max_vcpus, NULL, NULL);
+ if ( rc )
+ goto fail;
+
d->arch.ctxt_switch = &pv_csw;
+ if ( d->arch.pv.xpti )
+ {
+ d->arch.pv.root_pt_l1tab =
+ xzalloc_array(l1_pgentry_t *,
+ DIV_ROUND_UP(d->max_vcpus, L1_PAGETABLE_ENTRIES));
+ if ( !d->arch.pv.root_pt_l1tab )
+ goto fail;
+ }
+
if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
switch ( ACCESS_ONCE(opt_pcid) )
{
@@ -457,7 +496,8 @@ static void _toggle_guest_pt(struct vcpu *v)
guest_update = false;
}
}
- write_cr3(cr3);
+
+ write_ptbase(v);
if ( !pagetable_is_null(old_shadow) )
shadow_put_top_level(v->domain, old_shadow);
@@ -497,9 +537,6 @@ void toggle_guest_mode(struct vcpu *v)
{
struct cpu_info *cpu_info = get_cpu_info();
- cpu_info->root_pgt_changed = true;
- cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
- (d->arch.pv.pcid ? get_pcid_bits(v, true) : 0);
/*
* As in _toggle_guest_pt() the XPTI CR3 write needs to be a TLB-
* flushing one too for shadow mode guests.
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index 630bdc39451d..c1ae5013af96 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -80,6 +80,7 @@ void __dummy__(void)
#undef OFFSET_EF
+ OFFSET(VCPU_id, struct vcpu, vcpu_id);
OFFSET(VCPU_processor, struct vcpu, processor);
OFFSET(VCPU_domain, struct vcpu, domain);
OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index c5c723b5f4d4..91413b905768 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -170,9 +170,16 @@ FUNC_LOCAL(restore_all_guest)
movabs $PADDR_MASK & PAGE_MASK, %rsi
movabs $DIRECTMAP_VIRT_START, %rcx
and %rsi, %rdi
- and %r9, %rsi
add %rcx, %rdi
+
+ /*
+ * The address in the vCPU cr3 is always mapped in the per-domain
+ * pv_root_pt virt area.
+ */
+ imul $PAGE_SIZE, VCPU_id(%rbx), %esi
+ movabs $PV_ROOT_PT_MAPPING_VIRT_START, %rcx
add %rcx, %rsi
+
mov $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx
mov root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8
mov %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi)
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH V4 01/15] x86: Create per-domain mapping for guest_root_pt
2024-11-11 13:11 ` [PATCH V4 01/15] x86: Create per-domain mapping for guest_root_pt Elias El Yandouzi
@ 2024-12-16 16:01 ` Alejandro Vallejo
0 siblings, 0 replies; 25+ messages in thread
From: Alejandro Vallejo @ 2024-12-16 16:01 UTC (permalink / raw)
To: Elias El Yandouzi, xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall
On Mon Nov 11, 2024 at 1:11 PM GMT, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
>
> This patch introduces a per-domain mapping for the `guest_root_pt` in PV
> guests as part of the effort to remove the direct map in Xen.
>
> For the time being, the `root_pgt` is not mapped or unmapped, as it remains
> a Xenheap page. This will be addressed in subsequent patches.
>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
>
> ----
> Changes in V4:
> * Fix over-allocation issue
> * Update the mappings when switching from kernel to user-mode
>
> Changes in V3:
> * Rename SHADOW_ROOT
> * Haven't addressed the potentially over-allocation issue as I don't get it
>
> Changes in V2:
> * Rework the shadow perdomain mapping solution in the follow-up patches
>
> Changes since Hongyan's version:
> * Remove the final dot in the commit title
>
> diff --git a/xen/arch/x86/include/asm/config.h b/xen/arch/x86/include/asm/config.h
> index f8a5a4913b07..bd360ec4141e 100644
> --- a/xen/arch/x86/include/asm/config.h
> +++ b/xen/arch/x86/include/asm/config.h
> @@ -174,7 +174,7 @@
> /* Slot 260: per-domain mappings (including map cache). */
> #define PERDOMAIN_VIRT_START (PML4_ADDR(260))
> #define PERDOMAIN_SLOT_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
> -#define PERDOMAIN_SLOTS 3
> +#define PERDOMAIN_SLOTS 4
> #define PERDOMAIN_VIRT_SLOT(s) (PERDOMAIN_VIRT_START + (s) * \
> (PERDOMAIN_SLOT_MBYTES << 20))
> /* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
> @@ -288,6 +288,14 @@ extern unsigned long xen_phys_start;
> #define ARG_XLAT_START(v) \
> (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
>
> +/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
> +#define PV_ROOT_PT_MAPPING_VIRT_START PERDOMAIN_VIRT_SLOT(3)
> +#define PV_ROOT_PT_MAPPING_ENTRIES MAX_VIRT_CPUS
> +
> +/* The address of a particular VCPU's PV_ROOT_PT */
> +#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
> + (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
> +
> #define ELFSIZE 64
>
> #define ARCH_CRASH_SAVE_VMCOREINFO
> diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
> index b79d6badd71c..478ce41ad8ca 100644
> --- a/xen/arch/x86/include/asm/domain.h
> +++ b/xen/arch/x86/include/asm/domain.h
> @@ -272,6 +272,7 @@ struct time_scale {
> struct pv_domain
> {
> l1_pgentry_t **gdt_ldt_l1tab;
> + l1_pgentry_t **root_pt_l1tab;
>
> atomic_t nr_l4_pages;
>
> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
> index d537a799bced..a152e21bb086 100644
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -516,6 +516,13 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
> v->arch.cr3 |= get_pcid_bits(v, false);
> }
>
> +#define pv_root_pt_idx(v) \
> + ((v)->vcpu_id >> PAGETABLE_ORDER)
> +
> +#define pv_root_pt_pte(v) \
> + ((v)->domain->arch.pv.root_pt_l1tab[pv_root_pt_idx(v)] + \
> + ((v)->vcpu_id & (L1_PAGETABLE_ENTRIES - 1)))
> +
> void write_ptbase(struct vcpu *v)
> {
> const struct domain *d = v->domain;
> @@ -527,11 +534,16 @@ void write_ptbase(struct vcpu *v)
>
> if ( is_pv_domain(d) && d->arch.pv.xpti )
> {
> + mfn_t guest_root_pt = _mfn(MASK_EXTR(v->arch.cr3, X86_CR3_ADDR_MASK));
> + l1_pgentry_t *pte = pv_root_pt_pte(v);
> +
> cpu_info->root_pgt_changed = true;
> cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
> if ( new_cr4 & X86_CR4_PCIDE )
> cpu_info->pv_cr3 |= get_pcid_bits(v, true);
> switch_cr3_cr4(v->arch.cr3, new_cr4);
> +
> + l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RO));
> }
> else
> {
> diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
> index d5a8564c1cbe..1a1c999743ac 100644
> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -289,6 +289,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
> 1U << GDT_LDT_VCPU_SHIFT);
> }
>
> +static int pv_create_root_pt_l1tab(const struct vcpu *v)
> +{
> + return create_perdomain_mapping(v->domain,
> + PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
> + 1, v->domain->arch.pv.root_pt_l1tab,
> + NULL);
> +}
> +
> +static void pv_destroy_root_pt_l1tab(const struct vcpu *v)
> +
> +{
> + destroy_perdomain_mapping(v->domain,
> + PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v), 1);
> +}
> +
> void pv_vcpu_destroy(struct vcpu *v)
> {
> if ( is_pv_32bit_vcpu(v) )
> @@ -298,6 +313,7 @@ void pv_vcpu_destroy(struct vcpu *v)
> }
>
> pv_destroy_gdt_ldt_l1tab(v);
> + pv_destroy_root_pt_l1tab(v);
> XFREE(v->arch.pv.trap_ctxt);
> }
>
> @@ -312,6 +328,13 @@ int pv_vcpu_initialise(struct vcpu *v)
> if ( rc )
> return rc;
>
> + if ( v->domain->arch.pv.xpti )
> + {
> + rc = pv_create_root_pt_l1tab(v);
> + if ( rc )
> + goto done;
> + }
> +
> BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
> PAGE_SIZE);
> v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
> @@ -347,10 +370,12 @@ void pv_domain_destroy(struct domain *d)
>
> destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
> GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
> + destroy_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START, d->max_vcpus);
>
> XFREE(d->arch.pv.cpuidmasks);
>
> FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
> + FREE_XENHEAP_PAGE(d->arch.pv.root_pt_l1tab);
With root_pt_l1tab coming from xzalloc_array(), this must use XFREE() instead.
XFREE(v->arch.pv.root_pt_l1tab);
> }
>
> void noreturn cf_check continue_pv_domain(void);
> @@ -382,8 +407,22 @@ int pv_domain_initialise(struct domain *d)
> if ( rc )
> goto fail;
>
> + rc = create_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START,
> + d->max_vcpus, NULL, NULL);
> + if ( rc )
> + goto fail;
> +
> d->arch.ctxt_switch = &pv_csw;
>
> + if ( d->arch.pv.xpti )
> + {
> + d->arch.pv.root_pt_l1tab =
> + xzalloc_array(l1_pgentry_t *,
> + DIV_ROUND_UP(d->max_vcpus, L1_PAGETABLE_ENTRIES));
> + if ( !d->arch.pv.root_pt_l1tab )
> + goto fail;
> + }
> +
> if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
> switch ( ACCESS_ONCE(opt_pcid) )
> {
> @@ -457,7 +496,8 @@ static void _toggle_guest_pt(struct vcpu *v)
> guest_update = false;
> }
> }
> - write_cr3(cr3);
> +
> + write_ptbase(v);
>
> if ( !pagetable_is_null(old_shadow) )
> shadow_put_top_level(v->domain, old_shadow);
> @@ -497,9 +537,6 @@ void toggle_guest_mode(struct vcpu *v)
> {
> struct cpu_info *cpu_info = get_cpu_info();
>
> - cpu_info->root_pgt_changed = true;
> - cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
> - (d->arch.pv.pcid ? get_pcid_bits(v, true) : 0);
> /*
> * As in _toggle_guest_pt() the XPTI CR3 write needs to be a TLB-
> * flushing one too for shadow mode guests.
> diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
> index 630bdc39451d..c1ae5013af96 100644
> --- a/xen/arch/x86/x86_64/asm-offsets.c
> +++ b/xen/arch/x86/x86_64/asm-offsets.c
> @@ -80,6 +80,7 @@ void __dummy__(void)
>
> #undef OFFSET_EF
>
> + OFFSET(VCPU_id, struct vcpu, vcpu_id);
> OFFSET(VCPU_processor, struct vcpu, processor);
> OFFSET(VCPU_domain, struct vcpu, domain);
> OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> index c5c723b5f4d4..91413b905768 100644
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -170,9 +170,16 @@ FUNC_LOCAL(restore_all_guest)
> movabs $PADDR_MASK & PAGE_MASK, %rsi
> movabs $DIRECTMAP_VIRT_START, %rcx
> and %rsi, %rdi
> - and %r9, %rsi
> add %rcx, %rdi
> +
> + /*
> + * The address in the vCPU cr3 is always mapped in the per-domain
> + * pv_root_pt virt area.
> + */
> + imul $PAGE_SIZE, VCPU_id(%rbx), %esi
> + movabs $PV_ROOT_PT_MAPPING_VIRT_START, %rcx
> add %rcx, %rsi
> +
> mov $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx
> mov root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8
> mov %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi)
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH V4 02/15] x86/pv: Use copy_domain_page() to manage domheap pages during initrd relocation
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 01/15] x86: Create per-domain mapping for guest_root_pt Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 03/15] x86/pv: Rewrite how building PV dom0 handles domheap mappings Elias El Yandouzi
` (13 subsequent siblings)
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Wei Liu, Wei Wang, Julien Grall,
Elias El Yandouzi
From: Wei Liu <wei.liu2@citrix.com>
Replace the manual copying logic with a call to `copy_domain_page()`
while relocating intird which map and unmap pages accordingly.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Wei Wang <wawei@amazon.de>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V4:
* Use the count of pages instead of calculating from order for nr_pages initialization
Changes in V3:
* Rename commit title
* Rework the for loop copying the pages
Changes in V2:
* Get rid of mfn_to_virt
* Don't open code copy_domain_page()
Changes since Hongyan's version:
* Add missing newline after the variable declaration
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index cc882bee61c3..18b7a3e4e025 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -618,18 +618,24 @@ static int __init dom0_construct(struct domain *d,
if ( d->arch.physaddr_bitsize &&
((mfn + count - 1) >> (d->arch.physaddr_bitsize - PAGE_SHIFT)) )
{
+ unsigned int nr_pages = count;
+
order = get_order_from_pages(count);
page = alloc_domheap_pages(d, order, MEMF_no_scrub);
if ( !page )
panic("Not enough RAM for domain 0 initrd\n");
+
for ( count = -count; order--; )
if ( count & (1UL << order) )
{
free_domheap_pages(page, order);
page += 1UL << order;
+ nr_pages -= 1UL << order;
}
- memcpy(page_to_virt(page), mfn_to_virt(initrd->mod_start),
- initrd_len);
+
+ for ( ; nr_pages-- ; page++, mfn++ )
+ copy_domain_page(page_to_mfn(page), _mfn(mfn));
+
mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT;
init_domheap_pages(mpt_alloc,
mpt_alloc + PAGE_ALIGN(initrd_len));
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH V4 03/15] x86/pv: Rewrite how building PV dom0 handles domheap mappings
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 01/15] x86: Create per-domain mapping for guest_root_pt Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 02/15] x86/pv: Use copy_domain_page() to manage domheap pages during initrd relocation Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-12-09 17:42 ` Alejandro Vallejo
2024-11-11 13:11 ` [PATCH V4 04/15] x86: Initialize mapcache for PV, HVM, and idle domains Elias El Yandouzi
` (12 subsequent siblings)
15 siblings, 1 reply; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
Building a PV dom0 is allocating from the domheap but uses it like the
xenheap. Use the pages as they should be.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V4:
* Reduce the scope of l{1,2,4}start_mfn variables
* Make the macro `UNMAP_MAP_AND_ADVANCE` return the new virtual address
Changes in V3:
* Fold following patch 'x86/pv: Map L4 page table for shim domain'
Changes in V2:
* Clarify the commit message
* Break the patch in two parts
Changes since Hongyan's version:
* Rebase
* Remove spurious newline
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 18b7a3e4e025..b03df609cadb 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -382,6 +382,7 @@ static int __init dom0_construct(struct domain *d,
l3_pgentry_t *l3tab = NULL, *l3start = NULL;
l2_pgentry_t *l2tab = NULL, *l2start = NULL;
l1_pgentry_t *l1tab = NULL, *l1start = NULL;
+ mfn_t l3start_mfn = INVALID_MFN;
/*
* This fully describes the memory layout of the initial domain. All
@@ -719,22 +720,34 @@ static int __init dom0_construct(struct domain *d,
v->arch.pv.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
}
+#define UNMAP_MAP_AND_ADVANCE(mfn_var, virt_var, maddr) ({ \
+ do { \
+ unmap_domain_page(virt_var); \
+ mfn_var = maddr_to_mfn(maddr); \
+ maddr += PAGE_SIZE; \
+ virt_var = map_domain_page(mfn_var); \
+ } while ( false ); \
+ virt_var; \
+})
+
if ( !compat )
{
+ mfn_t l4start_mfn;
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
- l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ l4tab = UNMAP_MAP_AND_ADVANCE(l4start_mfn, l4start, mpt_alloc);
clear_page(l4tab);
- init_xen_l4_slots(l4tab, _mfn(virt_to_mfn(l4start)),
- d, INVALID_MFN, true);
- v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+ init_xen_l4_slots(l4tab, l4start_mfn, d, INVALID_MFN, true);
+ v->arch.guest_table = pagetable_from_mfn(l4start_mfn);
}
else
{
/* Monitor table already created by switch_compat(). */
- l4start = l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
+ mfn_t l4start_mfn = pagetable_get_mfn(v->arch.guest_table);
+ l4start = l4tab = map_domain_page(l4start_mfn);
/* See public/xen.h on why the following is needed. */
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ UNMAP_MAP_AND_ADVANCE(l3start_mfn, l3start, mpt_alloc);
}
l4tab += l4_table_offset(v_start);
@@ -743,15 +756,17 @@ static int __init dom0_construct(struct domain *d,
{
if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
{
+ mfn_t l1start_mfn;
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
- l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ l1tab = UNMAP_MAP_AND_ADVANCE(l1start_mfn, l1start, mpt_alloc);
clear_page(l1tab);
if ( count == 0 )
l1tab += l1_table_offset(v_start);
if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
{
+ mfn_t l2start_mfn;
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
- l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ l2tab = UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
clear_page(l2tab);
if ( count == 0 )
l2tab += l2_table_offset(v_start);
@@ -761,19 +776,19 @@ static int __init dom0_construct(struct domain *d,
{
maddr_to_page(mpt_alloc)->u.inuse.type_info =
PGT_l3_page_table;
- l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+ UNMAP_MAP_AND_ADVANCE(l3start_mfn, l3start, mpt_alloc);
}
l3tab = l3start;
clear_page(l3tab);
if ( count == 0 )
l3tab += l3_table_offset(v_start);
- *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+ *l4tab = l4e_from_mfn(l3start_mfn, L4_PROT);
l4tab++;
}
- *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+ *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
l3tab++;
}
- *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+ *l2tab = l2e_from_mfn(l1start_mfn, L2_PROT);
l2tab++;
}
if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
@@ -792,27 +807,32 @@ static int __init dom0_construct(struct domain *d,
if ( compat )
{
- l2_pgentry_t *l2t;
-
/* Ensure the first four L3 entries are all populated. */
for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
{
if ( !l3e_get_intpte(*l3tab) )
{
+ mfn_t l2start_mfn;
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
- l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
- clear_page(l2tab);
- *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
+ UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
+ clear_page(l2start);
+ *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
}
if ( i == 3 )
l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
}
- l2t = map_l2t_from_l3e(l3start[3]);
- init_xen_pae_l2_slots(l2t, d);
- unmap_domain_page(l2t);
+ UNMAP_DOMAIN_PAGE(l2start);
+ l2start = map_l2t_from_l3e(l3start[3]);
+ init_xen_pae_l2_slots(l2start, d);
}
+#undef UNMAP_MAP_AND_ADVANCE
+
+ UNMAP_DOMAIN_PAGE(l1start);
+ UNMAP_DOMAIN_PAGE(l2start);
+ UNMAP_DOMAIN_PAGE(l3start);
+
/* Pages that are part of page tables must be read only. */
mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages, &flush_flags);
@@ -987,6 +1007,8 @@ static int __init dom0_construct(struct domain *d,
pv_shim_setup_dom(d, l4start, v_start, vxenstore_start, vconsole_start,
vphysmap_start, si);
+ UNMAP_DOMAIN_PAGE(l4start);
+
#ifdef CONFIG_COMPAT
if ( compat )
xlat_start_info(si, pv_shim ? XLAT_start_info_console_domU
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH V4 03/15] x86/pv: Rewrite how building PV dom0 handles domheap mappings
2024-11-11 13:11 ` [PATCH V4 03/15] x86/pv: Rewrite how building PV dom0 handles domheap mappings Elias El Yandouzi
@ 2024-12-09 17:42 ` Alejandro Vallejo
0 siblings, 0 replies; 25+ messages in thread
From: Alejandro Vallejo @ 2024-12-09 17:42 UTC (permalink / raw)
To: Elias El Yandouzi, xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall
Hi,
I've been trying to run this series for a while, but it crashes very
frequentyly starting from the patch that generalizes the mapcache. I think I've
tracked it down to this patch.
On Mon Nov 11, 2024 at 1:11 PM GMT, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
>
> Building a PV dom0 is allocating from the domheap but uses it like the
> xenheap. Use the pages as they should be.
>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
>
> ----
> Changes in V4:
> * Reduce the scope of l{1,2,4}start_mfn variables
> * Make the macro `UNMAP_MAP_AND_ADVANCE` return the new virtual address
>
> Changes in V3:
> * Fold following patch 'x86/pv: Map L4 page table for shim domain'
>
> Changes in V2:
> * Clarify the commit message
> * Break the patch in two parts
>
> Changes since Hongyan's version:
> * Rebase
> * Remove spurious newline
>
> diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
> index 18b7a3e4e025..b03df609cadb 100644
> --- a/xen/arch/x86/pv/dom0_build.c
> +++ b/xen/arch/x86/pv/dom0_build.c
> @@ -382,6 +382,7 @@ static int __init dom0_construct(struct domain *d,
> l3_pgentry_t *l3tab = NULL, *l3start = NULL;
> l2_pgentry_t *l2tab = NULL, *l2start = NULL;
> l1_pgentry_t *l1tab = NULL, *l1start = NULL;
> + mfn_t l3start_mfn = INVALID_MFN;
>
> /*
> * This fully describes the memory layout of the initial domain. All
> @@ -719,22 +720,34 @@ static int __init dom0_construct(struct domain *d,
> v->arch.pv.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
> }
>
> +#define UNMAP_MAP_AND_ADVANCE(mfn_var, virt_var, maddr) ({ \
> + do { \
> + unmap_domain_page(virt_var); \
> + mfn_var = maddr_to_mfn(maddr); \
> + maddr += PAGE_SIZE; \
> + virt_var = map_domain_page(mfn_var); \
> + } while ( false ); \
> + virt_var; \
> +})
> +
> if ( !compat )
> {
> + mfn_t l4start_mfn;
> maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
> - l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
> + l4tab = UNMAP_MAP_AND_ADVANCE(l4start_mfn, l4start, mpt_alloc);
In here l4start is mapped on the idle domain perdomain area, but...
> clear_page(l4tab);
> - init_xen_l4_slots(l4tab, _mfn(virt_to_mfn(l4start)),
> - d, INVALID_MFN, true);
> - v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> + init_xen_l4_slots(l4tab, l4start_mfn, d, INVALID_MFN, true);
> + v->arch.guest_table = pagetable_from_mfn(l4start_mfn);
> }
> else
> {
> /* Monitor table already created by switch_compat(). */
> - l4start = l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
> + mfn_t l4start_mfn = pagetable_get_mfn(v->arch.guest_table);
> + l4start = l4tab = map_domain_page(l4start_mfn);
> /* See public/xen.h on why the following is needed. */
> maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
> l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
> + UNMAP_MAP_AND_ADVANCE(l3start_mfn, l3start, mpt_alloc);
> }
>
> l4tab += l4_table_offset(v_start);
> @@ -743,15 +756,17 @@ static int __init dom0_construct(struct domain *d,
> {
> if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
> {
> + mfn_t l1start_mfn;
> maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
> - l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
> + l1tab = UNMAP_MAP_AND_ADVANCE(l1start_mfn, l1start, mpt_alloc);
> clear_page(l1tab);
> if ( count == 0 )
> l1tab += l1_table_offset(v_start);
> if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
> {
> + mfn_t l2start_mfn;
> maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
> - l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
> + l2tab = UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
> clear_page(l2tab);
> if ( count == 0 )
> l2tab += l2_table_offset(v_start);
> @@ -761,19 +776,19 @@ static int __init dom0_construct(struct domain *d,
> {
> maddr_to_page(mpt_alloc)->u.inuse.type_info =
> PGT_l3_page_table;
> - l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
> + UNMAP_MAP_AND_ADVANCE(l3start_mfn, l3start, mpt_alloc);
> }
> l3tab = l3start;
> clear_page(l3tab);
> if ( count == 0 )
> l3tab += l3_table_offset(v_start);
> - *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
> + *l4tab = l4e_from_mfn(l3start_mfn, L4_PROT);
> l4tab++;
> }
> - *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
> + *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
> l3tab++;
> }
> - *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
> + *l2tab = l2e_from_mfn(l1start_mfn, L2_PROT);
> l2tab++;
> }
> if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
> @@ -792,27 +807,32 @@ static int __init dom0_construct(struct domain *d,
>
> if ( compat )
> {
> - l2_pgentry_t *l2t;
> -
> /* Ensure the first four L3 entries are all populated. */
> for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
> {
> if ( !l3e_get_intpte(*l3tab) )
> {
> + mfn_t l2start_mfn;
> maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
> - l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
> - clear_page(l2tab);
> - *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
> + UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
> + clear_page(l2start);
> + *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
> }
> if ( i == 3 )
> l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
> }
>
> - l2t = map_l2t_from_l3e(l3start[3]);
> - init_xen_pae_l2_slots(l2t, d);
> - unmap_domain_page(l2t);
> + UNMAP_DOMAIN_PAGE(l2start);
> + l2start = map_l2t_from_l3e(l3start[3]);
> + init_xen_pae_l2_slots(l2start, d);
> }
>
> +#undef UNMAP_MAP_AND_ADVANCE
> +
> + UNMAP_DOMAIN_PAGE(l1start);
> + UNMAP_DOMAIN_PAGE(l2start);
> + UNMAP_DOMAIN_PAGE(l3start);
... l4start is not unmapped here. This is a problem, because we're about to
change the page tables into dom0's and start using its mapcache.
IMO, we should be unmapping here, and remapping in dom0's context. Otherwise
l4start becomes a transiently stale pointer. Any remaining pointer obtained via
map_domain_page() is a dangling pointer after the mapcache+pagetable switch.
> +
> /* Pages that are part of page tables must be read only. */
> mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages, &flush_flags);
>
> @@ -987,6 +1007,8 @@ static int __init dom0_construct(struct domain *d,
> pv_shim_setup_dom(d, l4start, v_start, vxenstore_start, vconsole_start,
> vphysmap_start, si);
>
> + UNMAP_DOMAIN_PAGE(l4start);
As it is, this unmap is operating on the wrong mapcache, I think. I don't quite
understand why I see intermittent boot crashes and not constant ones, but this
seems like a bug.
What we want, I think, is:
1. Increase the scope of l4start_mfn to be function-level.
2. Do UNMAP_DOMAIN_PAGE(l4start) along with l1start, l2start and l3start.
3. Include a pair of map_domain_page() and UNMAP_DOMAIN_PAGE() within the
conditional, surrounding pv_shim_setup_dom.
> +
> #ifdef CONFIG_COMPAT
> if ( compat )
> xlat_start_info(si, pv_shim ? XLAT_start_info_console_domU
I'll keep testing it in case I missed something, but this seems to work.
Cheers,
Alejandro
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH V4 04/15] x86: Initialize mapcache for PV, HVM, and idle domains
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (2 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 03/15] x86/pv: Rewrite how building PV dom0 handles domheap mappings Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 18:46 ` Andrew Cooper
2024-11-11 13:11 ` [PATCH V4 05/15] x86: Add a boot option to enable and disable the direct map Elias El Yandouzi
` (11 subsequent siblings)
15 siblings, 1 reply; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Wei Liu, Wei Wang, Hongyan Xia,
Julien Grall, Elias El Yandouzi
From: Wei Liu <wei.liu2@citrix.com>
To support the transition away from the direct map, the mapcache will now
be used by HVM and idle domains as well. This patch lifts the `mapcache`
to the arch level and moves its initialization earlier in
`arch_domain_create()` to cover PV, HVM, and idle domains.
For the idle domain to utilize the mapcache, this patch also populates the
mapcache page tables within the `PERDOMAIN` region and adjusts the
initialization sequence.
With this change, mapcache initialization is now unified across all domain
types—PV, HVM, and idle.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Wei Wang <wawei@amazon.de>
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V4:
* Reword the commit message
* Rebase it on top of staging
* The logic for the creation of the domain has been reworked
so introduced #ifdef CONFIG_X86 in the common code to
initialise the mapcache
Changes in V2:
* Free resources if mapcache initialisation fails
* Remove `is_idle_domain()` check from `create_perdomain_mappings()`
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 78a13e6812c9..38338f519dea 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -777,6 +777,10 @@ void __init arch_init_idle_domain(struct domain *d)
};
d->arch.ctxt_switch = &idle_csw;
+
+ /* Slot 260: Per-domain mappings. */
+ idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+ l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW);
}
int arch_domain_create(struct domain *d,
@@ -870,9 +874,6 @@ int arch_domain_create(struct domain *d,
}
else if ( is_pv_domain(d) )
{
- if ( (rc = mapcache_domain_init(d)) != 0 )
- goto fail;
-
if ( (rc = pv_domain_initialise(d)) != 0 )
goto fail;
}
@@ -909,7 +910,6 @@ int arch_domain_create(struct domain *d,
XFREE(d->arch.cpu_policy);
if ( paging_initialised )
paging_final_teardown(d);
- free_perdomain_mappings(d);
return rc;
}
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index eac5e3304fb8..55e337aaf703 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -82,11 +82,11 @@ void *map_domain_page(mfn_t mfn)
#endif
v = mapcache_current_vcpu();
- if ( !v || !is_pv_vcpu(v) )
+ if ( !v )
return mfn_to_virt(mfn_x(mfn));
- dcache = &v->domain->arch.pv.mapcache;
- vcache = &v->arch.pv.mapcache;
+ dcache = &v->domain->arch.mapcache;
+ vcache = &v->arch.mapcache;
if ( !dcache->inuse )
return mfn_to_virt(mfn_x(mfn));
@@ -187,14 +187,14 @@ void unmap_domain_page(const void *ptr)
ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
v = mapcache_current_vcpu();
- ASSERT(v && is_pv_vcpu(v));
+ ASSERT(v);
- dcache = &v->domain->arch.pv.mapcache;
+ dcache = &v->domain->arch.mapcache;
ASSERT(dcache->inuse);
idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
- hashent = &v->arch.pv.mapcache.hash[MAPHASH_HASHFN(mfn)];
+ hashent = &v->arch.mapcache.hash[MAPHASH_HASHFN(mfn)];
local_irq_save(flags);
@@ -233,11 +233,9 @@ void unmap_domain_page(const void *ptr)
int mapcache_domain_init(struct domain *d)
{
- struct mapcache_domain *dcache = &d->arch.pv.mapcache;
+ struct mapcache_domain *dcache = &d->arch.mapcache;
unsigned int bitmap_pages;
- ASSERT(is_pv_domain(d));
-
#ifdef NDEBUG
if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
return 0;
@@ -261,12 +259,12 @@ int mapcache_domain_init(struct domain *d)
int mapcache_vcpu_init(struct vcpu *v)
{
struct domain *d = v->domain;
- struct mapcache_domain *dcache = &d->arch.pv.mapcache;
+ struct mapcache_domain *dcache = &d->arch.mapcache;
unsigned long i;
unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
- if ( !is_pv_vcpu(v) || !dcache->inuse )
+ if ( !dcache->inuse )
return 0;
if ( ents > dcache->entries )
@@ -293,7 +291,7 @@ int mapcache_vcpu_init(struct vcpu *v)
BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
for ( i = 0; i < MAPHASH_ENTRIES; i++ )
{
- struct vcpu_maphash_entry *hashent = &v->arch.pv.mapcache.hash[i];
+ struct vcpu_maphash_entry *hashent = &v->arch.mapcache.hash[i];
hashent->mfn = ~0UL; /* never valid to map */
hashent->idx = MAPHASHENT_NOTINUSE;
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 478ce41ad8ca..b0fd477c62e7 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -285,9 +285,6 @@ struct pv_domain
/* Mitigate L1TF with shadow/crashing? */
bool check_l1tf;
- /* map_domain_page() mapping cache. */
- struct mapcache_domain mapcache;
-
struct cpuidmasks *cpuidmasks;
};
@@ -326,6 +323,9 @@ struct arch_domain
uint8_t scf; /* See SCF_DOM_MASK */
+ /* map_domain_page() mapping cache. */
+ struct mapcache_domain mapcache;
+
union {
struct pv_domain pv;
struct hvm_domain hvm;
@@ -516,9 +516,6 @@ struct arch_domain
struct pv_vcpu
{
- /* map_domain_page() mapping cache. */
- struct mapcache_vcpu mapcache;
-
unsigned int vgc_flags;
struct trap_info *trap_ctxt;
@@ -612,6 +609,9 @@ struct arch_vcpu
#define async_exception_state(t) async_exception_state[(t)-1]
uint8_t async_exception_mask;
+ /* map_domain_page() mapping cache. */
+ struct mapcache_vcpu mapcache;
+
/* Virtual Machine Extensions */
union {
struct pv_vcpu pv;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 92263a4fbdc5..a7f4929b5893 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -738,6 +738,11 @@ struct domain *domain_create(domid_t domid,
rangeset_domain_initialise(d);
+#ifdef CONFIG_X86
+ if ( (err = mapcache_domain_init(d)) != 0)
+ goto fail;
+#endif
+
if ( is_idle_domain(d) )
arch_init_idle_domain(d);
@@ -820,6 +825,10 @@ struct domain *domain_create(domid_t domid,
ASSERT(err < 0); /* Sanity check paths leading here. */
err = err ?: -EILSEQ; /* Release build safety. */
+#ifdef CONFIG_X86
+ free_perdomain_mappings(d);
+#endif
+
d->is_dying = DOMDYING_dead;
if ( hardware_domain == d )
hardware_domain = old_hwdom;
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH V4 04/15] x86: Initialize mapcache for PV, HVM, and idle domains
2024-11-11 13:11 ` [PATCH V4 04/15] x86: Initialize mapcache for PV, HVM, and idle domains Elias El Yandouzi
@ 2024-11-11 18:46 ` Andrew Cooper
0 siblings, 0 replies; 25+ messages in thread
From: Andrew Cooper @ 2024-11-11 18:46 UTC (permalink / raw)
To: Elias El Yandouzi, xen-devel
Cc: julien, pdurrant, dwmw, Wei Wang, Hongyan Xia, Julien Grall
On 11/11/2024 1:11 pm, Elias El Yandouzi wrote:
> From: Wei Liu <wei.liu2@citrix.com>
>
> To support the transition away from the direct map, the mapcache will now
> be used by HVM and idle domains as well. This patch lifts the `mapcache`
> to the arch level and moves its initialization earlier in
> `arch_domain_create()` to cover PV, HVM, and idle domains.
This part of the commit message is a correct description of how the
logic wants to end up, but ...
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 92263a4fbdc5..a7f4929b5893 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -738,6 +738,11 @@ struct domain *domain_create(domid_t domid,
>
> rangeset_domain_initialise(d);
>
> +#ifdef CONFIG_X86
> + if ( (err = mapcache_domain_init(d)) != 0)
> + goto fail;
> +#endif
> +
> if ( is_idle_domain(d) )
> arch_init_idle_domain(d);
>
> @@ -820,6 +825,10 @@ struct domain *domain_create(domid_t domid,
> ASSERT(err < 0); /* Sanity check paths leading here. */
> err = err ?: -EILSEQ; /* Release build safety. */
>
> +#ifdef CONFIG_X86
> + free_perdomain_mappings(d);
> +#endif
> +
> d->is_dying = DOMDYING_dead;
> if ( hardware_domain == d )
> hardware_domain = old_hwdom;
... this is not what the commit message says.
These should be implemented as per the commit message. i.e. living in
arch_*() functions, not ifdef'd in common code.
If there are not arch functions in the right places, we can make the
appear. It looks like arch_init_idle_domain() needs to grow a failure
case, but that seems to be the extent of the changes needed in order to
move these calls into arch-specific code.
~Andrew
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH V4 05/15] x86: Add a boot option to enable and disable the direct map
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (3 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 04/15] x86: Initialize mapcache for PV, HVM, and idle domains Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 06/15] xen/x86: Add support for the PMAP Elias El Yandouzi
` (10 subsequent siblings)
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
This is added as a Kconfig option as well as a boot command line option.
While being generic, the Kconfig option is only usable for x86 at the moment.
Note that there remains some users of the directmap at this point. The option
is introduced now as it will be needed in follow-up patches.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in V4:
* Rename the Kconfig options
* Set opt_directmap to true if CONFIG_HAS_ONDEMAND_DIRECTMAP is not enabled
Changes in V2:
* Introduce a Kconfig option
* Reword the commit message
* Make opt_directmap and helper generic
Changes since Hongyan's version:
* Reword the commit message
* opt_directmap is only modified during boot so mark it as
__ro_after_init
diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index 293dbc1a957b..10b0b2714661 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -799,6 +799,18 @@ that enabling this option cannot guarantee anything beyond what underlying
hardware guarantees (with, where available and known to Xen, respective
tweaks applied).
+### directmap (x86)
+> `= <boolean>`
+
+> Default: `true`
+
+Enable or disable fully populating the directmap region in Xen.
+
+By default, Xen creates the directmap region which maps all physical memory
+in that region. Disabling this option will sparsely populate the directmap,
+blocking exploits that leak secrets via speculative memory access in the
+directmap.
+
### dma_bits
> `= <integer>`
diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index 9cdd04721afa..55f1e8702ab9 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
select HAS_IOPORTS
select HAS_KEXEC
select HAS_NS16550
+ select HAS_ONDEMAND_DIRECTMAP
select HAS_PASSTHROUGH
select HAS_PCI
select HAS_PCI_MSI
diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h
index 71a29b2cb3af..d3936bd173fa 100644
--- a/xen/arch/x86/include/asm/mm.h
+++ b/xen/arch/x86/include/asm/mm.h
@@ -620,11 +620,17 @@ void write_32bit_pse_identmap(uint32_t *l2);
/*
* x86 maps part of physical memory via the directmap region.
* Return whether the range of MFN falls in the directmap region.
+ *
+ * When boot command line sets directmap=no, the directmap will mostly be empty
+ * so this will always return false.
*/
static inline bool arch_mfns_in_directmap(unsigned long mfn, unsigned long nr)
{
unsigned long eva = min(DIRECTMAP_VIRT_END, HYPERVISOR_VIRT_END);
+ if ( !has_directmap() )
+ return false;
+
return (mfn + nr) <= (virt_to_mfn(eva - 1) + 1);
}
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 177f4024abca..a3c21ca05099 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1586,6 +1586,8 @@ void asmlinkage __init noreturn __start_xen(void)
if ( highmem_start )
xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
+ printk("Booting with directmap %s\n", has_directmap() ? "full" : "on demand");
+
/*
* Walk every RAM region and map it in its entirety (on x86/64, at least)
* and notify it to the boot allocator.
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 90268d92499a..72094c491756 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -71,6 +71,9 @@ config HAS_IOPORTS
config HAS_KEXEC
bool
+config HAS_ONDEMAND_DIRECTMAP
+ bool
+
config HAS_PIRQ
bool
@@ -415,6 +418,20 @@ source "common/sched/Kconfig"
config CRYPTO
bool
+config ONDEMAND_DIRECTMAP
+ bool "On-Demand Directmap"
+ depends on HAS_ONDEMAND_DIRECTMAP
+ help
+ The directmap contains mapping for most of the RAM, making domain
+ memory easily accessible. While this can improve performance, it also
+ increases the vulnerability to speculation attacks.
+
+ Enabling this feature allows the user to control whether the memory
+ is always mapped at boot or mapped only on demand (see the command line
+ option "directmap").
+
+ If unsure, say N.
+
config LIVEPATCH
bool "Live patching support"
default X86
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 33c8c917d984..b0be246780b7 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -175,6 +175,11 @@ paddr_t __ro_after_init mem_hotplug;
static char __initdata opt_badpage[100] = "";
string_param("badpage", opt_badpage);
+#ifdef CONFIG_HAS_ONDEMAND_DIRECTMAP
+bool __ro_after_init opt_directmap = true;
+boolean_param("directmap", opt_directmap);
+#endif
+
/*
* no-bootscrub -> Free pages are not zeroed during boot.
*/
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 7561297a7553..9e3c9f3b6dfa 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -167,6 +167,17 @@ extern unsigned long max_page;
extern unsigned long total_pages;
extern paddr_t mem_hotplug;
+#ifdef CONFIG_HAS_ONDEMAND_DIRECTMAP
+ extern bool opt_directmap;
+#else
+ #define opt_directmap true
+#endif
+
+static inline bool has_directmap(void)
+{
+ return !IS_ENABLED(CONFIG_HAS_ONDEMAND_DIRECTMAP) || opt_directmap;
+}
+
/*
* Extra fault info types which are used to further describe
* the source of an access violation.
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH V4 06/15] xen/x86: Add support for the PMAP
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (4 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 05/15] x86: Add a boot option to enable and disable the direct map Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap Elias El Yandouzi
` (9 subsequent siblings)
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
PMAP will be used in a follow-up patch to bootstrap map domain
page infrastructure -- we need some way to map pages to setup the
mapcache without a direct map.
The functions pmap_{map, unmap} open code {set, clear}_fixmap to break
the loop.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
The PMAP infrastructure was upstream separately for Arm since
Hongyan sent the secret-free hypervisor series. So this is a new
patch to plumb the feature on x86.
Changes in v4:
* Select PMAP KConfig option iff ONDEMAND_DIRECTMAP is used
Changes in v2:
* Declare PMAP entries earlier in fixed_addresses
* Reword the commit message
diff --git a/xen/arch/x86/include/asm/fixmap.h b/xen/arch/x86/include/asm/fixmap.h
index 516ec3fa6c95..80b7b74fd816 100644
--- a/xen/arch/x86/include/asm/fixmap.h
+++ b/xen/arch/x86/include/asm/fixmap.h
@@ -21,6 +21,8 @@
#include <xen/acpi.h>
#include <xen/pfn.h>
+#include <xen/pmap.h>
+
#include <asm/apicdef.h>
#include <asm/msi.h>
#include <acpi/apei.h>
@@ -53,6 +55,10 @@ enum fixed_addresses {
FIX_PV_CONSOLE,
FIX_XEN_SHARED_INFO,
#endif /* CONFIG_XEN_GUEST */
+#ifdef CONFIG_HAS_PMAP
+ FIX_PMAP_BEGIN,
+ FIX_PMAP_END = FIX_PMAP_BEGIN + NUM_FIX_PMAP,
+#endif
/* Everything else should go further down. */
FIX_APIC_BASE,
FIX_IO_APIC_BASE_0,
diff --git a/xen/arch/x86/include/asm/pmap.h b/xen/arch/x86/include/asm/pmap.h
new file mode 100644
index 000000000000..1b3b729b90b2
--- /dev/null
+++ b/xen/arch/x86/include/asm/pmap.h
@@ -0,0 +1,35 @@
+#ifndef __ASM_PMAP_H__
+#define __ASM_PMAP_H__
+
+#include <asm/fixmap.h>
+
+static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
+{
+ unsigned long linear = (unsigned long)fix_to_virt(slot);
+ l1_pgentry_t *pl1e = &l1_fixmap[l1_table_offset(linear)];
+
+ BUILD_BUG_ON(FIX_APIC_BASE - 1 > L1_PAGETABLE_ENTRIES - 1);
+ ASSERT(!(l1e_get_flags(*pl1e) & _PAGE_PRESENT));
+
+ l1e_write(pl1e, l1e_from_mfn(mfn, PAGE_HYPERVISOR));
+}
+
+static inline void arch_pmap_unmap(unsigned int slot)
+{
+ unsigned long linear = (unsigned long)fix_to_virt(slot);
+ l1_pgentry_t *pl1e = &l1_fixmap[l1_table_offset(linear)];
+
+ l1e_write(pl1e, l1e_empty());
+ flush_tlb_one_local(linear);
+}
+
+#endif /* __ASM_PMAP_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 72094c491756..2a6c33493927 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -421,6 +421,7 @@ config CRYPTO
config ONDEMAND_DIRECTMAP
bool "On-Demand Directmap"
depends on HAS_ONDEMAND_DIRECTMAP
+ select HAS_PMAP
help
The directmap contains mapping for most of the RAM, making domain
memory easily accessible. While this can improve performance, it also
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (5 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 06/15] xen/x86: Add support for the PMAP Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-18 18:08 ` Alejandro Vallejo
2024-11-19 7:55 ` Jan Beulich
2024-11-11 13:11 ` [PATCH V4 08/15] xen/page_alloc: Add a path for xenheap when there is no direct map Elias El Yandouzi
` (8 subsequent siblings)
15 siblings, 2 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall
From: Hongyan Xia <hongyxia@amazon.com>
When mfn is not in direct map, never use mfn_to_virt for any mappings.
We replace mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) with
arch_mfns_in_direct_map(mfn, 1) because these two are equivalent. The
extra comparison in arch_mfns_in_direct_map() looks different but because
DIRECTMAP_VIRT_END is always higher, it does not make any difference.
Lastly, domain_page_map_to_mfn() needs to gain to a special case for
the PMAP.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
----
Changes in v4:
* Introduce helper functions virt_is_fixmap and virt_in_fixmap_range
Changes since Hongyan's version:
* arch_mfn_in_direct_map() was renamed to arch_mfns_in_directmap()
* add a special case for the PMAP in domain_page_map_to_mfn()
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 55e337aaf703..df7d4750ef05 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -14,8 +14,10 @@
#include <xen/sched.h>
#include <xen/vmap.h>
#include <asm/current.h>
+#include <asm/fixmap.h>
#include <asm/flushtlb.h>
#include <asm/hardirq.h>
+#include <asm/pmap.h>
#include <asm/setup.h>
static DEFINE_PER_CPU(struct vcpu *, override);
@@ -24,6 +26,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
{
/* In the common case we use the mapcache of the running VCPU. */
struct vcpu *v = this_cpu(override) ?: current;
+ struct vcpu *idle_v = idle_vcpu[smp_processor_id()];
/*
* When current isn't properly set up yet, this is equivalent to
@@ -35,10 +38,11 @@ static inline struct vcpu *mapcache_current_vcpu(void)
/*
* When using efi runtime page tables, we have the equivalent of the idle
* domain's page tables but current may point at another domain's VCPU.
- * Return NULL as though current is not properly set up yet.
+ * Return the idle domains's vcpu on that core because the efi per-domain
+ * region (where the mapcache is) is in-sync with the idle domain.
*/
if ( efi_rs_using_pgtables() )
- return NULL;
+ return idle_v;
/*
* If guest_table is NULL, and we are running a paravirtualised guest,
@@ -48,7 +52,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) )
{
/* If we really are idling, perform lazy context switch now. */
- if ( (v = idle_vcpu[smp_processor_id()]) == current )
+ if ( (v = idle_v) == current )
sync_local_execstate();
/* We must now be running on the idle page table. */
ASSERT(cr3_pa(read_cr3()) == __pa(idle_pg_table));
@@ -77,18 +81,24 @@ void *map_domain_page(mfn_t mfn)
struct vcpu_maphash_entry *hashent;
#ifdef NDEBUG
- if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+ if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
return mfn_to_virt(mfn_x(mfn));
#endif
v = mapcache_current_vcpu();
- if ( !v )
- return mfn_to_virt(mfn_x(mfn));
+ if ( !v || !v->domain->arch.mapcache.inuse )
+ {
+ if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
+ return mfn_to_virt(mfn_x(mfn));
+ else
+ {
+ BUG_ON(system_state >= SYS_STATE_smp_boot);
+ return pmap_map(mfn);
+ }
+ }
dcache = &v->domain->arch.mapcache;
vcache = &v->arch.mapcache;
- if ( !dcache->inuse )
- return mfn_to_virt(mfn_x(mfn));
perfc_incr(map_domain_page_count);
@@ -184,6 +194,12 @@ void unmap_domain_page(const void *ptr)
if ( !va || va >= DIRECTMAP_VIRT_START )
return;
+ if ( virt_is_fixmap(va) )
+ {
+ pmap_unmap(ptr);
+ return;
+ }
+
ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
v = mapcache_current_vcpu();
@@ -237,7 +253,7 @@ int mapcache_domain_init(struct domain *d)
unsigned int bitmap_pages;
#ifdef NDEBUG
- if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+ if ( !mem_hotplug && arch_mfn_in_directmap(0, max_page) )
return 0;
#endif
@@ -308,7 +324,7 @@ void *map_domain_page_global(mfn_t mfn)
local_irq_is_enabled()));
#ifdef NDEBUG
- if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+ if ( arch_mfn_in_directmap(mfn_x(mfn, 1)) )
return mfn_to_virt(mfn_x(mfn));
#endif
@@ -335,6 +351,22 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
if ( va >= DIRECTMAP_VIRT_START )
return _mfn(virt_to_mfn(ptr));
+ /*
+ * The fixmap is stealing the top-end of the VMAP. So the check for
+ * the PMAP *must* happen first.
+ *
+ * Also, the fixmap translate a slot to an address backwards. The
+ * logic will rely on it to avoid any complexity. So check at
+ * compile time this will always hold.
+ */
+ BUILD_BUG_ON(fix_to_virt(FIX_PMAP_BEGIN) < fix_to_virt(FIX_PMAP_END));
+
+ if ( virt_in_fixmap_range(va, FIX_PMAP_BEGIN, FIX_PMAP_END) )
+ {
+ BUG_ON(system_state >= SYS_STATE_smp_boot);
+ return l1e_get_mfn(l1_fixmap[l1_table_offset(va)]);
+ }
+
if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END )
return vmap_to_mfn(va);
diff --git a/xen/arch/x86/include/asm/fixmap.h b/xen/arch/x86/include/asm/fixmap.h
index 80b7b74fd816..381c95a8b11f 100644
--- a/xen/arch/x86/include/asm/fixmap.h
+++ b/xen/arch/x86/include/asm/fixmap.h
@@ -101,6 +101,31 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
return __virt_to_fix(vaddr);
}
+static inline bool virt_is_fixmap(const unsigned long vaddr)
+{
+ return vaddr >= FIXADDR_START && vaddr < FIXADDR_TOP;
+}
+
+static inline bool virt_in_fixmap_range(
+ const unsigned long vaddr,
+ const unsigned int start_idx,
+ const unsigned int end_idx
+)
+{
+ unsigned long start_addr = (unsigned long)fix_to_virt(start_idx);
+ unsigned long end_addr = (unsigned long)fix_to_virt(end_idx);
+
+ /*
+ * The check ensures that the virtual address (vaddr) is within the
+ * fixmap range. The addresses are allocated backwards, meaning the
+ * start address is higher than the end address. As a result, the
+ * check ensures that the virtual address is greater than or equal to
+ * the end address, and less than or equal to the start address, which
+ * may appear counterintuitive due to the reverse allocation order.
+ */
+ return ((vaddr & PAGE_MASK) <= start_addr) && (vaddr >= end_addr);
+}
+
enum fixed_addresses_x {
/* Index 0 is reserved since fix_x_to_virt(0) == FIXADDR_X_TOP. */
FIX_X_RESERVED,
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap
2024-11-11 13:11 ` [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap Elias El Yandouzi
@ 2024-11-18 18:08 ` Alejandro Vallejo
2024-11-19 7:55 ` Jan Beulich
1 sibling, 0 replies; 25+ messages in thread
From: Alejandro Vallejo @ 2024-11-18 18:08 UTC (permalink / raw)
To: Elias El Yandouzi, xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall
I'm still headscratching about various things, but the build errors are on
release builds without pmap enabled. I'm highlighted them here.
On Mon Nov 11, 2024 at 1:11 PM GMT, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
>
> When mfn is not in direct map, never use mfn_to_virt for any mappings.
>
> We replace mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) with
> arch_mfns_in_direct_map(mfn, 1) because these two are equivalent. The
> extra comparison in arch_mfns_in_direct_map() looks different but because
> DIRECTMAP_VIRT_END is always higher, it does not make any difference.
>
> Lastly, domain_page_map_to_mfn() needs to gain to a special case for
> the PMAP.
>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
>
> ----
>
> Changes in v4:
> * Introduce helper functions virt_is_fixmap and virt_in_fixmap_range
>
> Changes since Hongyan's version:
> * arch_mfn_in_direct_map() was renamed to arch_mfns_in_directmap()
> * add a special case for the PMAP in domain_page_map_to_mfn()
>
> diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
> index 55e337aaf703..df7d4750ef05 100644
> --- a/xen/arch/x86/domain_page.c
> +++ b/xen/arch/x86/domain_page.c
> @@ -14,8 +14,10 @@
> #include <xen/sched.h>
> #include <xen/vmap.h>
> #include <asm/current.h>
> +#include <asm/fixmap.h>
> #include <asm/flushtlb.h>
> #include <asm/hardirq.h>
> +#include <asm/pmap.h>
> #include <asm/setup.h>
>
> static DEFINE_PER_CPU(struct vcpu *, override);
> @@ -24,6 +26,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
> {
> /* In the common case we use the mapcache of the running VCPU. */
> struct vcpu *v = this_cpu(override) ?: current;
> + struct vcpu *idle_v = idle_vcpu[smp_processor_id()];
>
> /*
> * When current isn't properly set up yet, this is equivalent to
> @@ -35,10 +38,11 @@ static inline struct vcpu *mapcache_current_vcpu(void)
> /*
> * When using efi runtime page tables, we have the equivalent of the idle
> * domain's page tables but current may point at another domain's VCPU.
> - * Return NULL as though current is not properly set up yet.
> + * Return the idle domains's vcpu on that core because the efi per-domain
> + * region (where the mapcache is) is in-sync with the idle domain.
> */
> if ( efi_rs_using_pgtables() )
> - return NULL;
> + return idle_v;
>
> /*
> * If guest_table is NULL, and we are running a paravirtualised guest,
> @@ -48,7 +52,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
> if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) )
> {
> /* If we really are idling, perform lazy context switch now. */
> - if ( (v = idle_vcpu[smp_processor_id()]) == current )
> + if ( (v = idle_v) == current )
> sync_local_execstate();
> /* We must now be running on the idle page table. */
> ASSERT(cr3_pa(read_cr3()) == __pa(idle_pg_table));
> @@ -77,18 +81,24 @@ void *map_domain_page(mfn_t mfn)
> struct vcpu_maphash_entry *hashent;
>
> #ifdef NDEBUG
> - if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> + if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
> return mfn_to_virt(mfn_x(mfn));
> #endif
>
> v = mapcache_current_vcpu();
> - if ( !v )
> - return mfn_to_virt(mfn_x(mfn));
> + if ( !v || !v->domain->arch.mapcache.inuse )
> + {
> + if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
> + return mfn_to_virt(mfn_x(mfn));
> + else
> + {
> + BUG_ON(system_state >= SYS_STATE_smp_boot);
Missing CONFIG_HAS_PMAP guards around this return. Without it this wants to
BUG(), I think. I'm not entirely convinced the current logic takes into account
the extended directmap present in HVM and idle vCPUs though.
arch_mfns_in_directmap() merely checks they fit in DIRECTMAP_SIZE, doesn't it?
> + return pmap_map(mfn);
> + }
> + }
>
> dcache = &v->domain->arch.mapcache;
> vcache = &v->arch.mapcache;
> - if ( !dcache->inuse )
> - return mfn_to_virt(mfn_x(mfn));
>
> perfc_incr(map_domain_page_count);
>
> @@ -184,6 +194,12 @@ void unmap_domain_page(const void *ptr)
> if ( !va || va >= DIRECTMAP_VIRT_START )
> return;
>
> + if ( virt_is_fixmap(va) )
> + {
> + pmap_unmap(ptr);
> + return;
> + }
> +
This hunk is also missing CONFIG_HAS_PMAP guards.
> ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
>
> v = mapcache_current_vcpu();
> @@ -237,7 +253,7 @@ int mapcache_domain_init(struct domain *d)
> unsigned int bitmap_pages;
>
> #ifdef NDEBUG
> - if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> + if ( !mem_hotplug && arch_mfn_in_directmap(0, max_page) )
I suspect you wanted arch_mfns_in_directmap() rather than _mfn_
> return 0;
> #endif
>
> @@ -308,7 +324,7 @@ void *map_domain_page_global(mfn_t mfn)
> local_irq_is_enabled()));
>
> #ifdef NDEBUG
> - if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> + if ( arch_mfn_in_directmap(mfn_x(mfn, 1)) )
I suspect you wanted 's/mfn_x(mfn, 1)/mfn_x(mfn), 1/' instead?
> return mfn_to_virt(mfn_x(mfn));
> #endif
>
> @@ -335,6 +351,22 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
> if ( va >= DIRECTMAP_VIRT_START )
> return _mfn(virt_to_mfn(ptr));
>
> + /*
> + * The fixmap is stealing the top-end of the VMAP. So the check for
> + * the PMAP *must* happen first.
> + *
> + * Also, the fixmap translate a slot to an address backwards. The
> + * logic will rely on it to avoid any complexity. So check at
> + * compile time this will always hold.
> + */
> + BUILD_BUG_ON(fix_to_virt(FIX_PMAP_BEGIN) < fix_to_virt(FIX_PMAP_END));
> +
> + if ( virt_in_fixmap_range(va, FIX_PMAP_BEGIN, FIX_PMAP_END) )
> + {
> + BUG_ON(system_state >= SYS_STATE_smp_boot);
> + return l1e_get_mfn(l1_fixmap[l1_table_offset(va)]);
> + }
> +
This hunk should be surrounded by CONFIG_HAS_PMAP guards or it'll fail to
compile.
> if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END )
> return vmap_to_mfn(va);
>
> diff --git a/xen/arch/x86/include/asm/fixmap.h b/xen/arch/x86/include/asm/fixmap.h
> index 80b7b74fd816..381c95a8b11f 100644
> --- a/xen/arch/x86/include/asm/fixmap.h
> +++ b/xen/arch/x86/include/asm/fixmap.h
> @@ -101,6 +101,31 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
> return __virt_to_fix(vaddr);
> }
>
> +static inline bool virt_is_fixmap(const unsigned long vaddr)
> +{
> + return vaddr >= FIXADDR_START && vaddr < FIXADDR_TOP;
> +}
> +
> +static inline bool virt_in_fixmap_range(
> + const unsigned long vaddr,
> + const unsigned int start_idx,
> + const unsigned int end_idx
> +)
> +{
> + unsigned long start_addr = (unsigned long)fix_to_virt(start_idx);
> + unsigned long end_addr = (unsigned long)fix_to_virt(end_idx);
> +
> + /*
> + * The check ensures that the virtual address (vaddr) is within the
> + * fixmap range. The addresses are allocated backwards, meaning the
> + * start address is higher than the end address. As a result, the
> + * check ensures that the virtual address is greater than or equal to
> + * the end address, and less than or equal to the start address, which
> + * may appear counterintuitive due to the reverse allocation order.
> + */
> + return ((vaddr & PAGE_MASK) <= start_addr) && (vaddr >= end_addr);
> +}
> +
> enum fixed_addresses_x {
> /* Index 0 is reserved since fix_x_to_virt(0) == FIXADDR_X_TOP. */
> FIX_X_RESERVED,
Cheers,
Alejandro
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap
2024-11-11 13:11 ` [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap Elias El Yandouzi
2024-11-18 18:08 ` Alejandro Vallejo
@ 2024-11-19 7:55 ` Jan Beulich
1 sibling, 0 replies; 25+ messages in thread
From: Jan Beulich @ 2024-11-19 7:55 UTC (permalink / raw)
To: Elias El Yandouzi
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall, xen-devel
On 11.11.2024 14:11, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
>
> When mfn is not in direct map, never use mfn_to_virt for any mappings.
>
> We replace mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) with
> arch_mfns_in_direct_map(mfn, 1) because these two are equivalent. The
> extra comparison in arch_mfns_in_direct_map() looks different but because
> DIRECTMAP_VIRT_END is always higher, it does not make any difference.
>
> Lastly, domain_page_map_to_mfn() needs to gain to a special case for
> the PMAP.
>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
>
> ----
Just to mention it (noticed while reading Alejandro's reply, and I didn't
check the rest of the series): This is lacking your S-o-b.
Jan
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH V4 08/15] xen/page_alloc: Add a path for xenheap when there is no direct map
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (6 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 09/15] x86/setup: Leave early boot slightly earlier Elias El Yandouzi
` (7 subsequent siblings)
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
When there is not an always-mapped direct map, xenheap allocations need
to be mapped and unmapped on-demand.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
I have left the call to map_pages_to_xen() and destroy_xen_mappings()
in the split heap for now. I am not entirely convinced this is necessary
because in that setup only the xenheap would be always mapped and
this doesn't contain any guest memory (aside the grant-table).
So map/unmapping for every allocation seems unnecessary.
Changes in v4:
* Call printk instead of dprintk()
Changes in v2:
* Fix remaining wrong indentation in alloc_xenheap_pages()
Changes since Hongyan's version:
* Rebase
* Fix indentation in alloc_xenheap_pages()
* Fix build for arm32
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index b0be246780b7..2cef521ad85a 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -2243,6 +2243,7 @@ void init_xenheap_pages(paddr_t ps, paddr_t pe)
void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
struct page_info *pg;
+ void *virt_addr;
ASSERT_ALLOC_CONTEXT();
@@ -2251,17 +2252,36 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
if ( unlikely(pg == NULL) )
return NULL;
- return page_to_virt(pg);
+ virt_addr = page_to_virt(pg);
+
+ if ( !has_directmap() &&
+ map_pages_to_xen((unsigned long)virt_addr, page_to_mfn(pg), 1UL << order,
+ PAGE_HYPERVISOR) )
+ {
+ /* Failed to map xenheap pages. */
+ free_heap_pages(pg, order, false);
+ return NULL;
+ }
+
+ return virt_addr;
}
void free_xenheap_pages(void *v, unsigned int order)
{
+ unsigned long va = (unsigned long)v & PAGE_MASK;
+
ASSERT_ALLOC_CONTEXT();
if ( v == NULL )
return;
+ if ( !has_directmap() &&
+ destroy_xen_mappings(va, va + (PAGE_SIZE << order)) )
+ printk(XENLOG_WARNING,
+ "Error while destroying xenheap mappings at %p, order %u\n",
+ v, order);
+
free_heap_pages(virt_to_page(v), order, false);
}
@@ -2285,6 +2305,7 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
struct page_info *pg;
unsigned int i;
+ void *virt_addr;
ASSERT_ALLOC_CONTEXT();
@@ -2297,16 +2318,28 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
if ( unlikely(pg == NULL) )
return NULL;
+ virt_addr = page_to_virt(pg);
+
+ if ( !has_directmap() &&
+ map_pages_to_xen((unsigned long)virt_addr, page_to_mfn(pg), 1UL << order,
+ PAGE_HYPERVISOR) )
+ {
+ /* Failed to map xenheap pages. */
+ free_domheap_pages(pg, order);
+ return NULL;
+ }
+
for ( i = 0; i < (1u << order); i++ )
pg[i].count_info |= PGC_xen_heap;
- return page_to_virt(pg);
+ return virt_addr;
}
void free_xenheap_pages(void *v, unsigned int order)
{
struct page_info *pg;
unsigned int i;
+ unsigned long va = (unsigned long)v & PAGE_MASK;
ASSERT_ALLOC_CONTEXT();
@@ -2318,6 +2351,12 @@ void free_xenheap_pages(void *v, unsigned int order)
for ( i = 0; i < (1u << order); i++ )
pg[i].count_info &= ~PGC_xen_heap;
+ if ( !has_directmap() &&
+ destroy_xen_mappings(va, va + (PAGE_SIZE << order)) )
+ printk(XENLOG_WARNING
+ "Error while destroying xenheap mappings at %p, order %u\n",
+ v, order);
+
free_heap_pages(pg, order, true);
}
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH V4 09/15] x86/setup: Leave early boot slightly earlier
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (7 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 08/15] xen/page_alloc: Add a path for xenheap when there is no direct map Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 10/15] xen/page_alloc: vmap heap nodes when they are outside the direct map Elias El Yandouzi
` (6 subsequent siblings)
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
When we do not have a direct map, memory for metadata of heap nodes in
init_node_heap() is allocated from xenheap, which needs to be mapped and
unmapped on demand. However, we cannot just take memory from the boot
allocator to create the PTEs while we are passing memory to the heap
allocator.
To solve this race, we leave early boot slightly sooner so that Xen PTE
pages are allocated from the heap instead of the boot allocator. We can
do this because the metadata for the 1st node is statically allocated,
and by the time we need memory to create mappings for the 2nd node, we
already have enough memory in the heap allocator in the 1st node.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in v4:
* Fix indentation
* Refactor the code to reduce code duplication
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index a3c21ca05099..4e258419ac34 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1822,6 +1822,22 @@ void asmlinkage __init noreturn __start_xen(void)
numa_initmem_init(0, raw_max_page);
+ /*
+ * When we do not have a direct map, memory for metadata of heap nodes in
+ * init_node_heap() is allocated from xenheap, which needs to be mapped and
+ * unmapped on demand. However, we cannot just take memory from the boot
+ * allocator to create the PTEs while we are passing memory to the heap
+ * allocator during end_boot_allocator().
+ *
+ * To solve this race, we need to leave early boot before
+ * end_boot_allocator() so that Xen PTE pages are allocated from the heap
+ * instead of the boot allocator. We can do this because the metadata for
+ * the 1st node is statically allocated, and by the time we need memory to
+ * create mappings for the 2nd node, we already have enough memory in the
+ * heap allocator in the 1st node.
+ */
+ system_state = SYS_STATE_boot;
+
if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
{
unsigned long lo = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
@@ -1853,8 +1869,6 @@ void asmlinkage __init noreturn __start_xen(void)
else
end_boot_allocator();
- system_state = SYS_STATE_boot;
-
bsp_stack = cpu_alloc_stack(0);
if ( !bsp_stack )
panic("No memory for BSP stack\n");
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH V4 10/15] xen/page_alloc: vmap heap nodes when they are outside the direct map
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (8 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 09/15] x86/setup: Leave early boot slightly earlier Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-12-13 13:46 ` Alejandro Vallejo
2024-12-13 14:59 ` Alejandro Vallejo
2024-11-11 13:11 ` [PATCH V4 11/15] x86/setup: Do not create valid mappings when directmap=no Elias El Yandouzi
` (5 subsequent siblings)
15 siblings, 2 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
When we do not have a direct map, archs_mfn_in_direct_map() will always
return false, thus init_node_heap() will allocate xenheap pages from an
existing node for the metadata of a new node. This means that the
metadata of a new node is in a different node, slowing down heap
allocation.
Since we now have early vmap, vmap the metadata locally in the new node.
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in v4:
* Change type of the parameters to paddr_t
* Use clear_domain_page() instead of open-coding it
Changes in v2:
* vmap_contig_pages() was renamed to vmap_contig()
* Fix indentation and coding style
Changes from Hongyan's version:
* arch_mfn_in_direct_map() was renamed to
arch_mfns_in_direct_map()
* Use vmap_contig_pages() rather than __vmap(...).
* Add missing include (xen/vmap.h) so it compiles on Arm
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 2cef521ad85a..62cdeb5013a3 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -137,6 +137,7 @@
#include <xen/sections.h>
#include <xen/softirq.h>
#include <xen/spinlock.h>
+#include <xen/vmap.h>
#include <asm/flushtlb.h>
#include <asm/page.h>
@@ -606,22 +607,32 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
needed = 0;
}
else if ( *use_tail && nr >= needed &&
- arch_mfns_in_directmap(mfn + nr - needed, needed) &&
(!xenheap_bits ||
!((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
- _heap[node] = mfn_to_virt(mfn + nr - needed);
- avail[node] = mfn_to_virt(mfn + nr - 1) +
- PAGE_SIZE - sizeof(**avail) * NR_ZONES;
+ if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
+ _heap[node] = mfn_to_virt(mfn + nr - needed);
+ else
+ _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
+
+ BUG_ON(!_heap[node]);
+ avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
+ sizeof(**avail) * NR_ZONES;
+
}
else if ( nr >= needed &&
- arch_mfns_in_directmap(mfn, needed) &&
(!xenheap_bits ||
- !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
+ !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
- _heap[node] = mfn_to_virt(mfn);
- avail[node] = mfn_to_virt(mfn + needed - 1) +
- PAGE_SIZE - sizeof(**avail) * NR_ZONES;
+ if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
+ _heap[node] = mfn_to_virt(mfn + nr - needed);
+ else
+ _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
+
+ BUG_ON(!_heap[node]);
+ avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
+ sizeof(**avail) * NR_ZONES;
+
*use_tail = false;
}
else if ( get_order_from_bytes(sizeof(**_heap)) ==
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH V4 10/15] xen/page_alloc: vmap heap nodes when they are outside the direct map
2024-11-11 13:11 ` [PATCH V4 10/15] xen/page_alloc: vmap heap nodes when they are outside the direct map Elias El Yandouzi
@ 2024-12-13 13:46 ` Alejandro Vallejo
2024-12-13 14:59 ` Alejandro Vallejo
1 sibling, 0 replies; 25+ messages in thread
From: Alejandro Vallejo @ 2024-12-13 13:46 UTC (permalink / raw)
To: Elias El Yandouzi, xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall
Hi,
I'm seeing crashes on NUMA machines, which can be attributed to a bug below:
On Mon Nov 11, 2024 at 1:11 PM GMT, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
>
> When we do not have a direct map, archs_mfn_in_direct_map() will always
> return false, thus init_node_heap() will allocate xenheap pages from an
> existing node for the metadata of a new node. This means that the
> metadata of a new node is in a different node, slowing down heap
> allocation.
>
> Since we now have early vmap, vmap the metadata locally in the new node.
>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
>
> ----
>
> Changes in v4:
> * Change type of the parameters to paddr_t
> * Use clear_domain_page() instead of open-coding it
>
> Changes in v2:
> * vmap_contig_pages() was renamed to vmap_contig()
> * Fix indentation and coding style
>
> Changes from Hongyan's version:
> * arch_mfn_in_direct_map() was renamed to
> arch_mfns_in_direct_map()
> * Use vmap_contig_pages() rather than __vmap(...).
> * Add missing include (xen/vmap.h) so it compiles on Arm
>
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 2cef521ad85a..62cdeb5013a3 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -137,6 +137,7 @@
> #include <xen/sections.h>
> #include <xen/softirq.h>
> #include <xen/spinlock.h>
> +#include <xen/vmap.h>
>
> #include <asm/flushtlb.h>
> #include <asm/page.h>
> @@ -606,22 +607,32 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
> needed = 0;
> }
> else if ( *use_tail && nr >= needed &&
> - arch_mfns_in_directmap(mfn + nr - needed, needed) &&
> (!xenheap_bits ||
> !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> {
> - _heap[node] = mfn_to_virt(mfn + nr - needed);
> - avail[node] = mfn_to_virt(mfn + nr - 1) +
> - PAGE_SIZE - sizeof(**avail) * NR_ZONES;
> + if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
> + _heap[node] = mfn_to_virt(mfn + nr - needed);
> + else
> + _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
> +
> + BUG_ON(!_heap[node]);
> + avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
> + sizeof(**avail) * NR_ZONES;
> +
> }
> else if ( nr >= needed &&
> - arch_mfns_in_directmap(mfn, needed) &&
> (!xenheap_bits ||
> - !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> + !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> {
> - _heap[node] = mfn_to_virt(mfn);
> - avail[node] = mfn_to_virt(mfn + needed - 1) +
> - PAGE_SIZE - sizeof(**avail) * NR_ZONES;
> + if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
> + _heap[node] = mfn_to_virt(mfn + nr - needed);
> + else
> + _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
This isn't quite the same thing, I think it regressed in v4 when acting on
Roger's feedback. It should be:
if ( arch_mfns_in_directmap(mfn, needed) )
_heap[node] = mfn_to_virt(mfn);
else
_heap[node] = vmap_contig(_mfn(mfn), needed);
Otherwise `use_tail` serves is unconditionally considered as set. With this
change in place, I can boot on NUMA machines.
> +
> + BUG_ON(!_heap[node]);
> + avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
> + sizeof(**avail) * NR_ZONES;
> +
> *use_tail = false;
> }
> else if ( get_order_from_bytes(sizeof(**_heap)) ==
Cheers,
Alejandro
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH V4 10/15] xen/page_alloc: vmap heap nodes when they are outside the direct map
2024-11-11 13:11 ` [PATCH V4 10/15] xen/page_alloc: vmap heap nodes when they are outside the direct map Elias El Yandouzi
2024-12-13 13:46 ` Alejandro Vallejo
@ 2024-12-13 14:59 ` Alejandro Vallejo
1 sibling, 0 replies; 25+ messages in thread
From: Alejandro Vallejo @ 2024-12-13 14:59 UTC (permalink / raw)
To: Elias El Yandouzi, xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall
On Mon Nov 11, 2024 at 1:11 PM GMT, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
>
> When we do not have a direct map, archs_mfn_in_direct_map() will always
> return false, thus init_node_heap() will allocate xenheap pages from an
> existing node for the metadata of a new node. This means that the
> metadata of a new node is in a different node, slowing down heap
> allocation.
>
> Since we now have early vmap, vmap the metadata locally in the new node.
>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
>
> ----
>
> Changes in v4:
> * Change type of the parameters to paddr_t
> * Use clear_domain_page() instead of open-coding it
>
> Changes in v2:
> * vmap_contig_pages() was renamed to vmap_contig()
> * Fix indentation and coding style
>
> Changes from Hongyan's version:
> * arch_mfn_in_direct_map() was renamed to
> arch_mfns_in_direct_map()
> * Use vmap_contig_pages() rather than __vmap(...).
> * Add missing include (xen/vmap.h) so it compiles on Arm
>
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 2cef521ad85a..62cdeb5013a3 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -137,6 +137,7 @@
> #include <xen/sections.h>
> #include <xen/softirq.h>
> #include <xen/spinlock.h>
> +#include <xen/vmap.h>
>
> #include <asm/flushtlb.h>
> #include <asm/page.h>
> @@ -606,22 +607,32 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
> needed = 0;
> }
> else if ( *use_tail && nr >= needed &&
> - arch_mfns_in_directmap(mfn + nr - needed, needed) &&
> (!xenheap_bits ||
> !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> {
> - _heap[node] = mfn_to_virt(mfn + nr - needed);
> - avail[node] = mfn_to_virt(mfn + nr - 1) +
> - PAGE_SIZE - sizeof(**avail) * NR_ZONES;
> + if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
> + _heap[node] = mfn_to_virt(mfn + nr - needed);
> + else
> + _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
... and looking more carefully, couldn't we simply map_pages_to_xen() on the
directmap using mfn_to_virt() as the target? It's not like the NUMA information
is a secret, and even if it was the vmap is no less exposed.
I _GUESS_ this was done with the intent of eventually removing the directmap
altogether, but it's probably a lot better to keep it around for things like
the p2m structures and other global data (like these per-node structures).
> +
> + BUG_ON(!_heap[node]);
> + avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
> + sizeof(**avail) * NR_ZONES;
> +
> }
> else if ( nr >= needed &&
> - arch_mfns_in_directmap(mfn, needed) &&
> (!xenheap_bits ||
> - !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> + !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
> {
> - _heap[node] = mfn_to_virt(mfn);
> - avail[node] = mfn_to_virt(mfn + needed - 1) +
> - PAGE_SIZE - sizeof(**avail) * NR_ZONES;
> + if ( arch_mfns_in_directmap(mfn + nr - needed, needed) )
> + _heap[node] = mfn_to_virt(mfn + nr - needed);
> + else
> + _heap[node] = vmap_contig(_mfn(mfn + nr - needed), needed);
> +
> + BUG_ON(!_heap[node]);
> + avail[node] = (void *)(_heap[node]) + (needed << PAGE_SHIFT) -
> + sizeof(**avail) * NR_ZONES;
> +
> *use_tail = false;
> }
> else if ( get_order_from_bytes(sizeof(**_heap)) ==
I'm compiling all these fixes/enhancements into a separate branch while testing
the whole thing.
Cheers,
Alejandro
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH V4 11/15] x86/setup: Do not create valid mappings when directmap=no
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (9 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 10/15] xen/page_alloc: vmap heap nodes when they are outside the direct map Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 12/15] xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables() Elias El Yandouzi
` (4 subsequent siblings)
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel
Cc: julien, pdurrant, dwmw, Hongyan Xia, Julien Grall,
Elias El Yandouzi
From: Hongyan Xia <hongyxia@amazon.com>
Create empty mappings in the second e820 pass. Also, destroy existing
direct map mappings created in the first pass.
To make xenheap pages visible in guests, it is necessary to create empty
L3 tables in the direct map even when directmap=no, since guest cr3s
copy idle domain's L4 entries, which means they will share mappings in
the direct map if we pre-populate idle domain's L4 entries and L3
tables. A helper is introduced for this.
Also, after the direct map is actually gone, we need to stop updating
the direct map in update_xen_mappings().
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 4e258419ac34..1633ed0302b1 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1022,6 +1022,56 @@ static struct domain *__init create_dom0(const module_t *image,
return d;
}
+/*
+ * This either populates a valid direct map, or allocates empty L3 tables and
+ * creates the L4 entries for virtual address between [start, end) in the
+ * direct map depending on has_directmap();
+ *
+ * When directmap=no, we still need to populate empty L3 tables in the
+ * direct map region. The reason is that on-demand xenheap mappings are
+ * created in the idle domain's page table but must be seen by
+ * everyone. Since all domains share the direct map L4 entries, they
+ * will share xenheap mappings if we pre-populate the L4 entries and L3
+ * tables in the direct map region for all RAM. We also rely on the fact
+ * that L3 tables are never freed.
+ */
+static void __init populate_directmap(paddr_t pstart, paddr_t pend,
+ unsigned int flags)
+{
+ unsigned long vstart = (unsigned long)__va(pstart);
+ unsigned long vend = (unsigned long)__va(pend);
+
+ if ( pstart >= pend )
+ return;
+
+ BUG_ON(vstart < DIRECTMAP_VIRT_START);
+ BUG_ON(vend > DIRECTMAP_VIRT_END);
+
+ if ( has_directmap() )
+ /* Populate valid direct map. */
+ BUG_ON(map_pages_to_xen(vstart, maddr_to_mfn(pstart),
+ PFN_DOWN(pend - pstart), flags));
+ else
+ {
+ /* Create empty L3 tables. */
+ unsigned long vaddr = vstart & ~((1UL << L4_PAGETABLE_SHIFT) - 1);
+
+ for ( unsigned long idx = l4_table_offset(vaddr);
+ idx <= l4_table_offset(vend); idx++ )
+ {
+ l4_pgentry_t *pl4e = &idle_pg_table[l4_table_offset(idx)];
+
+ if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
+ {
+ mfn_t mfn = alloc_boot_pages(1, 1);
+
+ clear_domain_page(mfn);
+ l4e_write(pl4e, l4e_from_mfn(mfn, __PAGE_HYPERVISOR));
+ }
+ }
+ }
+}
+
/* How much of the directmap is prebuilt at compile time. */
#define PREBUILT_MAP_LIMIT (1 << L2_PAGETABLE_SHIFT)
@@ -1670,8 +1720,17 @@ void asmlinkage __init noreturn __start_xen(void)
map_e = min_t(uint64_t, e,
ARRAY_SIZE(l2_directmap) << L2_PAGETABLE_SHIFT);
- /* Pass mapped memory to allocator /before/ creating new mappings. */
+ /*
+ * Pass mapped memory to allocator /before/ creating new mappings.
+ * The direct map for the bottom 4GiB has been populated in the first
+ * e820 pass. In the second pass, we make sure those existing mappings
+ * are destroyed when directmap=no.
+ */
init_boot_pages(s, min(map_s, e));
+ if ( !has_directmap() )
+ destroy_xen_mappings((unsigned long)__va(s),
+ (unsigned long)__va(min(map_s, e)));
+
s = map_s;
if ( s < map_e )
{
@@ -1679,6 +1738,9 @@ void asmlinkage __init noreturn __start_xen(void)
map_s = (s + mask) & ~mask;
map_e &= ~mask;
init_boot_pages(map_s, map_e);
+ if ( !has_directmap() )
+ destroy_xen_mappings((unsigned long)__va(map_s),
+ (unsigned long)__va(map_e));
}
if ( map_s > map_e )
@@ -1692,8 +1754,7 @@ void asmlinkage __init noreturn __start_xen(void)
if ( map_e < end )
{
- map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
- PFN_DOWN(end - map_e), PAGE_HYPERVISOR);
+ populate_directmap(map_e, end, PAGE_HYPERVISOR);
init_boot_pages(map_e, end);
map_e = end;
}
@@ -1702,13 +1763,11 @@ void asmlinkage __init noreturn __start_xen(void)
{
/* This range must not be passed to the boot allocator and
* must also not be mapped with _PAGE_GLOBAL. */
- map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
- PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
+ populate_directmap(map_e, e, __PAGE_HYPERVISOR_RW);
}
if ( s < map_s )
{
- map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
- PFN_DOWN(map_s - s), PAGE_HYPERVISOR);
+ populate_directmap(s, map_s, PAGE_HYPERVISOR);
init_boot_pages(s, map_s);
}
}
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH V4 12/15] xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables()
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (10 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 11/15] x86/setup: Do not create valid mappings when directmap=no Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-21 10:34 ` Michal Orzel
2024-11-11 13:11 ` [PATCH V4 13/15] xen/arm64: mm: Use per-pCPU page-tables Elias El Yandouzi
` (3 subsequent siblings)
15 siblings, 1 reply; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
The arm32 version of init_secondary_pagetables() will soon be re-used
for arm64 as well where the root table starts at level 0 rather than level 1.
So rename 'first' to 'root'.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changelog in v2:
* Rebase
* Fix typo
diff --git a/xen/arch/arm/mmu/smpboot.c b/xen/arch/arm/mmu/smpboot.c
index 4ffc8254a44b..37e91d72b785 100644
--- a/xen/arch/arm/mmu/smpboot.c
+++ b/xen/arch/arm/mmu/smpboot.c
@@ -109,32 +109,30 @@ int prepare_secondary_mm(int cpu)
#else
int prepare_secondary_mm(int cpu)
{
- lpae_t *first;
+ lpae_t *root = alloc_xenheap_page();
- first = alloc_xenheap_page(); /* root == first level on 32-bit 3-level trie */
-
- if ( !first )
+ if ( !root )
{
- printk("CPU%u: Unable to allocate the first page-table\n", cpu);
+ printk("CPU%u: Unable to allocate the root page-table\n", cpu);
return -ENOMEM;
}
/* Initialise root pagetable from root of boot tables */
- memcpy(first, per_cpu(xen_pgtable, 0), PAGE_SIZE);
- per_cpu(xen_pgtable, cpu) = first;
+ memcpy(root, per_cpu(xen_pgtable, 0), PAGE_SIZE);
+ per_cpu(xen_pgtable, cpu) = root;
if ( !init_domheap_mappings(cpu) )
{
printk("CPU%u: Unable to prepare the domheap page-tables\n", cpu);
per_cpu(xen_pgtable, cpu) = NULL;
- free_xenheap_page(first);
+ free_xenheap_page(root);
return -ENOMEM;
}
clear_boot_pagetables();
/* Set init_ttbr for this CPU coming up */
- set_init_ttbr(first);
+ set_init_ttbr(root);
return 0;
}
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH V4 12/15] xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables()
2024-11-11 13:11 ` [PATCH V4 12/15] xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables() Elias El Yandouzi
@ 2024-11-21 10:34 ` Michal Orzel
0 siblings, 0 replies; 25+ messages in thread
From: Michal Orzel @ 2024-11-21 10:34 UTC (permalink / raw)
To: Elias El Yandouzi, xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall
On 11/11/2024 14:11, Elias El Yandouzi wrote:
>
>
> From: Julien Grall <jgrall@amazon.com>
>
> The arm32 version of init_secondary_pagetables() will soon be re-used
> for arm64 as well where the root table starts at level 0 rather than level 1.
>
> So rename 'first' to 'root'.
>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
>
> ----
NIT: 3 dashes instead of 4. Otherwise this will end up in a commit msg.
This patch can be merged right away as it's not dependent on other patches.
Reviewed-by: Michal Orzel <michal.orzel@amd.com>
~Michal
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH V4 13/15] xen/arm64: mm: Use per-pCPU page-tables
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (11 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 12/15] xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables() Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 14/15] xen/arm64: Implement a mapcache for arm64 Elias El Yandouzi
` (2 subsequent siblings)
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
At the moment, on Arm64, every pCPU is sharing the same page-tables.
In a follow-up patch, we will allow the possibility to remove the
direct map and therefore it will be necessary to have a mapcache.
While we have plenty of spare virtual address space to reserve part
for each pCPU, it means that temporary mappings (e.g. guest memory)
could be accessible by every pCPU.
In order to increase our security posture, it would be better if
those mappings are only accessible by the pCPU doing the temporary
mapping.
In addition to that, a per-pCPU page-tables opens the way to have
per-domain mapping area.
Arm32 is already using per-pCPU page-tables so most of the code
can be re-used. Arm64 doesn't yet have support for the mapcache,
so a stub is provided (moved to its own header asm/domain_page.h).
Take the opportunity to fix a typo in a comment that is modified.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changelog since v1:
* Rebase
* Fix typoes
diff --git a/xen/arch/arm/arm64/mmu/mm.c b/xen/arch/arm/arm64/mmu/mm.c
index 671eaadbc1d5..c1c6450ca2e3 100644
--- a/xen/arch/arm/arm64/mmu/mm.c
+++ b/xen/arch/arm/arm64/mmu/mm.c
@@ -76,6 +76,7 @@ static void __init prepare_runtime_identity_mapping(void)
paddr_t id_addr = virt_to_maddr(_start);
lpae_t pte;
DECLARE_OFFSETS(id_offsets, id_addr);
+ lpae_t *root = this_cpu(xen_pgtable);
if ( id_offsets[0] >= IDENTITY_MAPPING_AREA_NR_L0 )
panic("Cannot handle ID mapping above %uTB\n",
@@ -86,7 +87,7 @@ static void __init prepare_runtime_identity_mapping(void)
pte.pt.table = 1;
pte.pt.xn = 0;
- write_pte(&xen_pgtable[id_offsets[0]], pte);
+ write_pte(&root[id_offsets[0]], pte);
/* Link second ID table */
pte = pte_of_xenaddr((vaddr_t)xen_second_id);
diff --git a/xen/arch/arm/include/asm/arm32/mm.h b/xen/arch/arm/include/asm/arm32/mm.h
index 856f2dbec4ad..87a315db013d 100644
--- a/xen/arch/arm/include/asm/arm32/mm.h
+++ b/xen/arch/arm/include/asm/arm32/mm.h
@@ -1,12 +1,6 @@
#ifndef __ARM_ARM32_MM_H__
#define __ARM_ARM32_MM_H__
-#include <xen/percpu.h>
-
-#include <asm/lpae.h>
-
-DECLARE_PER_CPU(lpae_t *, xen_pgtable);
-
/*
* Only a limited amount of RAM, called xenheap, is always mapped on ARM32.
* For convenience always return false.
@@ -16,8 +10,6 @@ static inline bool arch_mfns_in_directmap(unsigned long mfn, unsigned long nr)
return false;
}
-bool init_domheap_mappings(unsigned int cpu);
-
static inline void arch_setup_page_tables(void)
{
}
diff --git a/xen/arch/arm/include/asm/domain_page.h b/xen/arch/arm/include/asm/domain_page.h
new file mode 100644
index 000000000000..e9f52685e2ec
--- /dev/null
+++ b/xen/arch/arm/include/asm/domain_page.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_ARM_DOMAIN_PAGE_H__
+#define __ASM_ARM_DOMAIN_PAGE_H__
+
+#ifdef CONFIG_ARCH_MAP_DOMAIN_PAGE
+bool init_domheap_mappings(unsigned int cpu);
+#else
+static inline bool init_domheap_mappings(unsigned int cpu)
+{
+ return true;
+}
+#endif
+
+#endif /* __ASM_ARM_DOMAIN_PAGE_H__ */
diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
index 5abd4b0d1c73..cbfaeb2c4da1 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -2,6 +2,9 @@
#define __ARCH_ARM_MM__
#include <xen/kernel.h>
+#include <xen/percpu.h>
+
+#include <asm/lpae.h>
#include <asm/page.h>
#include <public/xen.h>
#include <xen/pdx.h>
diff --git a/xen/arch/arm/include/asm/mmu/mm.h b/xen/arch/arm/include/asm/mmu/mm.h
index c5e03a66bf9e..c03c3a51e46b 100644
--- a/xen/arch/arm/include/asm/mmu/mm.h
+++ b/xen/arch/arm/include/asm/mmu/mm.h
@@ -2,6 +2,8 @@
#ifndef __ARM_MMU_MM_H__
#define __ARM_MMU_MM_H__
+DECLARE_PER_CPU(lpae_t *, xen_pgtable);
+
/* Non-boot CPUs use this to find the correct pagetables. */
extern uint64_t init_ttbr;
diff --git a/xen/arch/arm/mmu/pt.c b/xen/arch/arm/mmu/pt.c
index da28d669e796..1ed1a53ab1f2 100644
--- a/xen/arch/arm/mmu/pt.c
+++ b/xen/arch/arm/mmu/pt.c
@@ -607,9 +607,9 @@ static int xen_pt_update(unsigned long virt,
unsigned long left = nr_mfns;
/*
- * For arm32, page-tables are different on each CPUs. Yet, they share
- * some common mappings. It is assumed that only common mappings
- * will be modified with this function.
+ * Page-tables are different on each CPU. Yet, they share some common
+ * mappings. It is assumed that only common mappings will be modified
+ * with this function.
*
* XXX: Add a check.
*/
diff --git a/xen/arch/arm/mmu/setup.c b/xen/arch/arm/mmu/setup.c
index 9664e85ee6c0..850a961ae5ef 100644
--- a/xen/arch/arm/mmu/setup.c
+++ b/xen/arch/arm/mmu/setup.c
@@ -30,17 +30,15 @@
* PCPUs.
*/
-#ifdef CONFIG_ARM_64
-DEFINE_PAGE_TABLE(xen_pgtable);
-static DEFINE_PAGE_TABLE(xen_first);
-#define THIS_CPU_PGTABLE xen_pgtable
-#else
/* Per-CPU pagetable pages */
/* xen_pgtable == root of the trie (zeroeth level on 64-bit, first on 32-bit) */
DEFINE_PER_CPU(lpae_t *, xen_pgtable);
#define THIS_CPU_PGTABLE this_cpu(xen_pgtable)
/* Root of the trie for cpu0, other CPU's PTs are dynamically allocated */
static DEFINE_PAGE_TABLE(cpu0_pgtable);
+
+#ifdef CONFIG_ARM_64
+static DEFINE_PAGE_TABLE(xen_first);
#endif
/* Common pagetable leaves */
@@ -232,17 +230,20 @@ void __init setup_pagetables(void)
lpae_t pte, *p;
int i;
+ p = cpu0_pgtable;
+
+ /* arch_setup_page_tables() may need to access the root page-tables. */
+ per_cpu(xen_pgtable, 0) = cpu0_pgtable;
+
arch_setup_page_tables();
#ifdef CONFIG_ARM_64
pte = pte_of_xenaddr((uintptr_t)xen_first);
pte.pt.table = 1;
pte.pt.xn = 0;
- xen_pgtable[zeroeth_table_offset(XEN_VIRT_START)] = pte;
+ p[zeroeth_table_offset(XEN_VIRT_START)] = pte;
- p = (void *) xen_first;
-#else
- p = (void *) cpu0_pgtable;
+ p = xen_first;
#endif
/* Map xen second level page-table */
@@ -285,19 +286,12 @@ void __init setup_pagetables(void)
pte.pt.table = 1;
xen_second[second_table_offset(FIXMAP_ADDR(0))] = pte;
-#ifdef CONFIG_ARM_64
- ttbr = virt_to_maddr(xen_pgtable);
-#else
ttbr = virt_to_maddr(cpu0_pgtable);
-#endif
+
switch_ttbr(ttbr);
xen_pt_enforce_wnx();
-
-#ifdef CONFIG_ARM_32
- per_cpu(xen_pgtable, 0) = cpu0_pgtable;
-#endif
}
void *__init arch_vmap_virt_end(void)
diff --git a/xen/arch/arm/mmu/smpboot.c b/xen/arch/arm/mmu/smpboot.c
index 37e91d72b785..e4bde31605bd 100644
--- a/xen/arch/arm/mmu/smpboot.c
+++ b/xen/arch/arm/mmu/smpboot.c
@@ -7,6 +7,7 @@
#include <xen/domain_page.h>
+#include <asm/domain_page.h>
#include <asm/setup.h>
/* Override macros from asm/page.h to make them work with mfn_t */
@@ -93,20 +94,6 @@ static void set_init_ttbr(lpae_t *root)
unmap_domain_page(ptr);
}
-#ifdef CONFIG_ARM_64
-int prepare_secondary_mm(int cpu)
-{
- clear_boot_pagetables();
-
- /*
- * Set init_ttbr for this CPU coming up. All CPUs share a single setof
- * pagetables, but rewrite it each time for consistency with 32 bit.
- */
- set_init_ttbr(xen_pgtable);
-
- return 0;
-}
-#else
int prepare_secondary_mm(int cpu)
{
lpae_t *root = alloc_xenheap_page();
@@ -136,7 +123,6 @@ int prepare_secondary_mm(int cpu)
return 0;
}
-#endif
/*
* Local variables:
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index 71ebaa77ca94..b33483b8eacf 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -42,6 +42,7 @@
#include <asm/gic.h>
#include <asm/cpuerrata.h>
#include <asm/cpufeature.h>
+#include <asm/domain_page.h>
#include <asm/platform.h>
#include <asm/procinfo.h>
#include <asm/setup.h>
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH V4 14/15] xen/arm64: Implement a mapcache for arm64
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (12 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 13/15] xen/arm64: mm: Use per-pCPU page-tables Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 13:11 ` [PATCH V4 15/15] xen/arm64: Allow the admin to enable/disable the directmap Elias El Yandouzi
2024-11-11 19:03 ` [PATCH V4 00/15] Remove " Andrew Cooper
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
At the moment, on arm64, map_domain_page() is implemented using
virt_to_mfn(). Therefore it is relying on the directmap.
In a follow-up patch, we will allow the admin to remove the directmap.
Therefore we want to implement a mapcache.
Thanksfully there is already one for arm32. So select ARCH_ARM_DOMAIN_PAGE
and add the necessary boiler plate to support 64-bit:
- The page-table start at level 0, so we need to allocate the level
1 page-table
- map_domain_page() should check if the page is in the directmap. If
yes, then use virt_to_mfn() to limit the performance impact
when the directmap is still enabled (this will be selectable
on the command line).
Take the opportunity to replace first_table_offset(...) with offsets[...].
Note that, so far, arch_mfns_in_directmap() always return true on
arm64. So the mapcache is not yet used. This will change in a
follow-up patch.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
There are a few TODOs:
- It is becoming more critical to fix the mapcache
implementation (this is not compliant with the Arm Arm)
- Evaluate the performance
diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
index 323c96736185..aa9e4c381c55 100644
--- a/xen/arch/arm/Kconfig
+++ b/xen/arch/arm/Kconfig
@@ -1,7 +1,6 @@
config ARM_32
def_bool y
depends on "$(ARCH)" = "arm32"
- select ARCH_MAP_DOMAIN_PAGE
config ARM_64
def_bool y
diff --git a/xen/arch/arm/arm64/mmu/mm.c b/xen/arch/arm/arm64/mmu/mm.c
index c1c6450ca2e3..d7cb3ad2da80 100644
--- a/xen/arch/arm/arm64/mmu/mm.c
+++ b/xen/arch/arm/arm64/mmu/mm.c
@@ -4,6 +4,7 @@
#include <xen/mm.h>
#include <xen/pfn.h>
+#include <asm/domain_page.h>
#include <asm/setup.h>
#include <asm/static-memory.h>
#include <asm/static-shmem.h>
@@ -249,6 +250,14 @@ void __init setup_mm(void)
setup_frametable_mappings(ram_start, ram_end);
max_page = PFN_DOWN(ram_end);
+ /*
+ * The allocators may need to use map_domain_page() (such as for
+ * scrubbing pages). So we need to prepare the domheap area first.
+ */
+ if ( !init_domheap_mappings(smp_processor_id()) )
+ panic("CPU%u: Unable to prepare the domheap page-tables\n",
+ smp_processor_id());
+
init_staticmem_pages();
init_sharedmem_pages();
}
diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
index cbfaeb2c4da1..6545aeda3b13 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -429,6 +429,11 @@ static inline void page_set_xenheap_gfn(struct page_info *p, gfn_t gfn)
} while ( (y = cmpxchg(&p->u.inuse.type_info, x, nx)) != x );
}
+/* Helpers to allocate, map and unmap a Xen page-table */
+int create_xen_table(lpae_t *entry);
+lpae_t *xen_map_table(mfn_t mfn);
+void xen_unmap_table(const lpae_t *table);
+
#endif /* __ARCH_ARM_MM__ */
/*
* Local variables:
diff --git a/xen/arch/arm/include/asm/mmu/layout.h b/xen/arch/arm/include/asm/mmu/layout.h
index a3b546465b5a..c549420e8b25 100644
--- a/xen/arch/arm/include/asm/mmu/layout.h
+++ b/xen/arch/arm/include/asm/mmu/layout.h
@@ -35,9 +35,13 @@
*
* 32G - 64G Frametable: 56 bytes per page for 2TB of RAM
*
- * 0x00000a8000000000 - 0x00007fffffffffff (512GB+117TB, L0 slots [21..255])
+ * 0x00000a8000000000 - 0x00007f7fffffffff (117TB, L0 slots [21..254])
* Unused
*
+ * 0x00007f8000000000 - 0x00007fffffffffff (512GB, L0 slot [255])
+ * (Relative offsets)
+ * 0 - 2G Domheap: on-demand-mapped
+ *
* 0x0000800000000000 - 0x000084ffffffffff (5TB, L0 slots [256..265])
* 1:1 mapping of RAM
*
@@ -130,6 +134,13 @@
#define FRAMETABLE_SIZE GB(32)
#define FRAMETABLE_NR (FRAMETABLE_SIZE / sizeof(*frame_table))
+#define DOMHEAP_VIRT_START SLOT0(255)
+#define DOMHEAP_VIRT_SIZE GB(2)
+
+#define DOMHEAP_ENTRIES 1024 /* 1024 2MB mapping slots */
+/* Number of domheap pagetable pages required at the second level (2MB mappings) */
+#define DOMHEAP_SECOND_PAGES (DOMHEAP_VIRT_SIZE >> FIRST_SHIFT)
+
#define DIRECTMAP_VIRT_START SLOT0(256)
#define DIRECTMAP_SIZE (SLOT0_ENTRY_SIZE * (266 - 256))
#define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + DIRECTMAP_SIZE - 1)
diff --git a/xen/arch/arm/mmu/domain_page.c b/xen/arch/arm/mmu/domain_page.c
index 3a43601623f0..72f200145d64 100644
--- a/xen/arch/arm/mmu/domain_page.c
+++ b/xen/arch/arm/mmu/domain_page.c
@@ -29,13 +29,30 @@ bool init_domheap_mappings(unsigned int cpu)
{
unsigned int order = get_order_from_pages(DOMHEAP_SECOND_PAGES);
lpae_t *root = per_cpu(xen_pgtable, cpu);
+ lpae_t *first;
unsigned int i, first_idx;
lpae_t *domheap;
mfn_t mfn;
+ /* Convenience aliases */
+ DECLARE_OFFSETS(offsets, DOMHEAP_VIRT_START);
+
ASSERT(root);
ASSERT(!per_cpu(xen_dommap, cpu));
+ /*
+ * On Arm64, the root is at level 0. Therefore we need an extra step
+ * to allocate the first level page-table.
+ */
+#ifdef CONFIG_ARM_64
+ if ( create_xen_table(&root[offsets[0]]) )
+ return false;
+
+ first = xen_map_table(lpae_get_mfn(root[offsets[0]]));
+#else
+ first = root;
+#endif
+
/*
* The domheap for cpu0 is initialized before the heap is initialized.
* So we need to use pre-allocated pages.
@@ -56,16 +73,20 @@ bool init_domheap_mappings(unsigned int cpu)
* domheap mapping pages.
*/
mfn = virt_to_mfn(domheap);
- first_idx = first_table_offset(DOMHEAP_VIRT_START);
+ first_idx = offsets[1];
for ( i = 0; i < DOMHEAP_SECOND_PAGES; i++ )
{
lpae_t pte = mfn_to_xen_entry(mfn_add(mfn, i), MT_NORMAL);
pte.pt.table = 1;
- write_pte(&root[first_idx + i], pte);
+ write_pte(&first[first_idx + i], pte);
}
per_cpu(xen_dommap, cpu) = domheap;
+#ifdef CONFIG_ARM_64
+ xen_unmap_table(first);
+#endif
+
return true;
}
@@ -89,6 +110,10 @@ void *map_domain_page(mfn_t mfn)
lpae_t pte;
int i, slot;
+ /* Bypass the mapcache if the page is in the directmap */
+ if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
+ return mfn_to_virt(mfn);
+
local_irq_save(flags);
/* The map is laid out as an open-addressed hash table where each
@@ -151,13 +176,25 @@ void *map_domain_page(mfn_t mfn)
/* Release a mapping taken with map_domain_page() */
void unmap_domain_page(const void *ptr)
{
+ unsigned long va = (unsigned long)ptr;
unsigned long flags;
lpae_t *map = this_cpu(xen_dommap);
- int slot = ((unsigned long)ptr - DOMHEAP_VIRT_START) >> SECOND_SHIFT;
+ unsigned int slot;
+
+ /* Below we assume that the domheap area doesn't start at 0 */
+ BUILD_BUG_ON(DOMHEAP_VIRT_START == 0);
- if ( !ptr )
+ /*
+ * map_domain_page() may not have mapped anything if the address
+ * is part of the directmap. So ignore anything outside of the
+ * domheap.
+ */
+ if ( (va < DOMHEAP_VIRT_START) ||
+ ((va - DOMHEAP_VIRT_START) >= DOMHEAP_VIRT_SIZE) )
return;
+ slot = (va - DOMHEAP_VIRT_START) >> SECOND_SHIFT;
+
local_irq_save(flags);
ASSERT(slot >= 0 && slot < DOMHEAP_ENTRIES);
diff --git a/xen/arch/arm/mmu/pt.c b/xen/arch/arm/mmu/pt.c
index 1ed1a53ab1f2..da33c6c52e39 100644
--- a/xen/arch/arm/mmu/pt.c
+++ b/xen/arch/arm/mmu/pt.c
@@ -33,7 +33,7 @@ mm_printk(const char *fmt, ...) {}
#define HYP_PT_ROOT_LEVEL 1
#endif
-static lpae_t *xen_map_table(mfn_t mfn)
+lpae_t *xen_map_table(mfn_t mfn)
{
/*
* During early boot, map_domain_page() may be unusable. Use the
@@ -45,7 +45,7 @@ static lpae_t *xen_map_table(mfn_t mfn)
return map_domain_page(mfn);
}
-static void xen_unmap_table(const lpae_t *table)
+void xen_unmap_table(const lpae_t *table)
{
/*
* During early boot, xen_map_table() will not use map_domain_page()
@@ -228,7 +228,7 @@ void *ioremap(paddr_t pa, size_t len)
return ioremap_attr(pa, len, PAGE_HYPERVISOR_NOCACHE);
}
-static int create_xen_table(lpae_t *entry)
+int create_xen_table(lpae_t *entry)
{
mfn_t mfn;
void *p;
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH V4 15/15] xen/arm64: Allow the admin to enable/disable the directmap
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (13 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 14/15] xen/arm64: Implement a mapcache for arm64 Elias El Yandouzi
@ 2024-11-11 13:11 ` Elias El Yandouzi
2024-11-11 19:03 ` [PATCH V4 00/15] Remove " Andrew Cooper
15 siblings, 0 replies; 25+ messages in thread
From: Elias El Yandouzi @ 2024-11-11 13:11 UTC (permalink / raw)
To: xen-devel; +Cc: julien, pdurrant, dwmw, Julien Grall, Elias El Yandouzi
From: Julien Grall <jgrall@amazon.com>
Implement the same command line option as x86 to enable/disable the
directmap. By default this is kept enabled.
Also modify setup_directmap_mappings() to populate the L0 entries
related to the directmap area.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
----
Changes in v2:
* Rely on the Kconfig option to enable Secret Hiding on Arm64
* Use generic helper instead of arch_has_directmap()
diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index 10b0b2714661..c238b866061f 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -799,7 +799,7 @@ that enabling this option cannot guarantee anything beyond what underlying
hardware guarantees (with, where available and known to Xen, respective
tweaks applied).
-### directmap (x86)
+### directmap (arm64, x86)
> `= <boolean>`
> Default: `true`
diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
index aa9e4c381c55..390fe3dd9ebf 100644
--- a/xen/arch/arm/Kconfig
+++ b/xen/arch/arm/Kconfig
@@ -7,6 +7,7 @@ config ARM_64
depends on !ARM_32
select 64BIT
select HAS_FAST_MULTIPLY
+ select HAS_SECRET_HIDING
config ARM
def_bool y
diff --git a/xen/arch/arm/arm64/mmu/mm.c b/xen/arch/arm/arm64/mmu/mm.c
index d7cb3ad2da80..320e9efde93e 100644
--- a/xen/arch/arm/arm64/mmu/mm.c
+++ b/xen/arch/arm/arm64/mmu/mm.c
@@ -170,16 +170,27 @@ void __init switch_ttbr(uint64_t ttbr)
update_identity_mapping(false);
}
-/* Map the region in the directmap area. */
+/*
+ * This either populate a valid fdirect map, or allocates empty L1 tables
+ * and creates the L0 entries for the given region in the direct map
+ * depending on has_directmap().
+ *
+ * When directmap=no, we still need to populate empty L1 tables in the
+ * directmap region. The reason is that the root page-table (i.e. L0)
+ * is per-CPU and secondary CPUs will initialize their root page-table
+ * based on the pCPU0 one. So L0 entries will be shared if they are
+ * pre-populated. We also rely on the fact that L1 tables are never
+ * freed.
+ */
static void __init setup_directmap_mappings(unsigned long base_mfn,
unsigned long nr_mfns)
{
+ unsigned long mfn_gb = base_mfn & ~((FIRST_SIZE >> PAGE_SHIFT) - 1);
int rc;
/* First call sets the directmap physical and virtual offset. */
if ( mfn_eq(directmap_mfn_start, INVALID_MFN) )
{
- unsigned long mfn_gb = base_mfn & ~((FIRST_SIZE >> PAGE_SHIFT) - 1);
directmap_mfn_start = _mfn(base_mfn);
directmap_base_pdx = mfn_to_pdx(_mfn(base_mfn));
@@ -200,6 +211,24 @@ static void __init setup_directmap_mappings(unsigned long base_mfn,
panic("cannot add directmap mapping at %lx below heap start %lx\n",
base_mfn, mfn_x(directmap_mfn_start));
+ if ( !has_directmap() )
+ {
+ vaddr_t vaddr = (vaddr_t)__mfn_to_virt(base_mfn);
+ lpae_t *root = this_cpu(xen_pgtable);
+ unsigned int i, slot;
+
+ slot = first_table_offset(vaddr);
+ nr_mfns += base_mfn - mfn_gb;
+ for ( i = 0; i < nr_mfns; i += BIT(XEN_PT_LEVEL_ORDER(0), UL), slot++ )
+ {
+ lpae_t *entry = &root[slot];
+
+ if ( !lpae_is_valid(*entry) && !create_xen_table(entry) )
+ panic("Unable to populate zeroeth slot %u\n", slot);
+ }
+ return;
+ }
+
rc = map_pages_to_xen((vaddr_t)__mfn_to_virt(base_mfn),
_mfn(base_mfn), nr_mfns,
PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
diff --git a/xen/arch/arm/include/asm/arm64/mm.h b/xen/arch/arm/include/asm/arm64/mm.h
index b4f7545d2c87..2b1140a6b994 100644
--- a/xen/arch/arm/include/asm/arm64/mm.h
+++ b/xen/arch/arm/include/asm/arm64/mm.h
@@ -3,13 +3,10 @@
extern DEFINE_PAGE_TABLE(xen_pgtable);
-/*
- * On ARM64, all the RAM is currently direct mapped in Xen.
- * Hence return always true.
- */
+/* On Arm64, the user can chose whether all the RAM is directmap. */
static inline bool arch_mfns_in_directmap(unsigned long mfn, unsigned long nr)
{
- return true;
+ return has_directmap();
}
void arch_setup_page_tables(void);
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index def939172cc5..0f3ffab6bab6 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -12,6 +12,7 @@
#include <xen/grant_table.h>
#include <xen/guest_access.h>
#include <xen/mm.h>
+#include <xen/param.h>
#include <xsm/xsm.h>
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index b33483b8eacf..2e0870dc8af6 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -328,6 +328,7 @@ void asmlinkage __init start_xen(unsigned long fdt_paddr)
cmdline_parse(cmdline);
setup_mm();
+ printk("Booting with directmap %s\n", has_directmap() ? "on" : "off");
vm_init();
--
2.40.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH V4 00/15] Remove the directmap
2024-11-11 13:11 [PATCH V4 00/15] Remove the directmap Elias El Yandouzi
` (14 preceding siblings ...)
2024-11-11 13:11 ` [PATCH V4 15/15] xen/arm64: Allow the admin to enable/disable the directmap Elias El Yandouzi
@ 2024-11-11 19:03 ` Andrew Cooper
15 siblings, 0 replies; 25+ messages in thread
From: Andrew Cooper @ 2024-11-11 19:03 UTC (permalink / raw)
To: Elias El Yandouzi, xen-devel; +Cc: julien, pdurrant, dwmw
On 11/11/2024 1:11 pm, Elias El Yandouzi wrote:
> Hongyan Xia (8):
> x86: Create per-domain mapping for guest_root_pt
> x86/pv: Rewrite how building PV dom0 handles domheap mappings
> x86: Add a boot option to enable and disable the direct map
> x86/domain_page: Remove the fast paths when mfn is not in the
> directmap
> xen/page_alloc: Add a path for xenheap when there is no direct map
> x86/setup: Leave early boot slightly earlier
> xen/page_alloc: vmap heap nodes when they are outside the direct map
> x86/setup: Do not create valid mappings when directmap=no
>
> Julien Grall (5):
> xen/x86: Add support for the PMAP
> xen/arm32: mm: Rename 'first' to 'root' in init_secondary_pagetables()
> xen/arm64: mm: Use per-pCPU page-tables
> xen/arm64: Implement a mapcache for arm64
> xen/arm64: Allow the admin to enable/disable the directmap
>
> Wei Liu (2):
> x86/pv: Use copy_domain_page() to manage domheap pages during initrd
> relocation
> x86: Initialize mapcache for PV, HVM, and idle domains
There are rather a lot of build failures somewhere in this series:
https://gitlab.com/xen-project/people/andyhhp/xen/-/pipelines/1536969743
~Andrew
^ permalink raw reply [flat|nested] 25+ messages in thread