* [PATCH v6 1/6] system/physmem: handle hugetlb correctly in qemu_ram_remap()
From: William Roche @ 2025-01-27 21:31 UTC
To: david, kvm, qemu-devel, qemu-arm
Cc: william.roche, peterx, pbonzini, richard.henderson, philmd,
peter.maydell, mtosatti, imammedo, eduardo, marcel.apfelbaum,
wangyanan55, zhao1.liu, joao.m.martins
From: William Roche <william.roche@oracle.com>
The list of hwpoison pages used to remap the memory on reset
is based on the backend's real page size.
To correctly handle hugetlb, we must mmap(MAP_FIXED) a complete
hugetlb page; hugetlb pages cannot be partially mapped.
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: William Roche <william.roche@oracle.com>
---
accel/kvm/kvm-all.c | 2 +-
include/exec/cpu-common.h | 2 +-
system/physmem.c | 38 +++++++++++++++++++++++++++++---------
3 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index c65b790433..f89568bfa3 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1288,7 +1288,7 @@ static void kvm_unpoison_all(void *param)
QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
QLIST_REMOVE(page, list);
- qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
+ qemu_ram_remap(page->ram_addr);
g_free(page);
}
}
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index b1d76d6985..3771b2130c 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -67,7 +67,7 @@ typedef uintptr_t ram_addr_t;
/* memory API */
-void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
+void qemu_ram_remap(ram_addr_t addr);
/* This should not be used by devices. */
ram_addr_t qemu_ram_addr_from_host(void *ptr);
ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
diff --git a/system/physmem.c b/system/physmem.c
index c76503aea8..3dd2adde73 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2167,17 +2167,35 @@ void qemu_ram_free(RAMBlock *block)
}
#ifndef _WIN32
-void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
+/*
+ * qemu_ram_remap - remap a single RAM page
+ *
+ * @addr: address in ram_addr_t address space.
+ *
+ * This function will try remapping a single page of guest RAM identified by
+ * @addr, essentially discarding memory to recover from previously poisoned
+ * memory (MCE). The page size depends on the RAMBlock (i.e., hugetlb). @addr
+ * does not have to point at the start of the page.
+ *
+ * This function is only to be used during system resets; it will kill the
+ * VM if remapping failed.
+ */
+void qemu_ram_remap(ram_addr_t addr)
{
RAMBlock *block;
- ram_addr_t offset;
+ uint64_t offset;
int flags;
void *area, *vaddr;
int prot;
+ size_t page_size;
RAMBLOCK_FOREACH(block) {
offset = addr - block->offset;
if (offset < block->max_length) {
+ /* Respect the pagesize of our RAMBlock */
+ page_size = qemu_ram_pagesize(block);
+ offset = QEMU_ALIGN_DOWN(offset, page_size);
+
vaddr = ramblock_ptr(block, offset);
if (block->flags & RAM_PREALLOC) {
;
@@ -2191,21 +2209,23 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
prot = PROT_READ;
prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
if (block->fd >= 0) {
- area = mmap(vaddr, length, prot, flags, block->fd,
+ area = mmap(vaddr, page_size, prot, flags, block->fd,
offset + block->fd_offset);
} else {
flags |= MAP_ANONYMOUS;
- area = mmap(vaddr, length, prot, flags, -1, 0);
+ area = mmap(vaddr, page_size, prot, flags, -1, 0);
}
if (area != vaddr) {
- error_report("Could not remap addr: "
- RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
- length, addr);
+ error_report("Could not remap RAM %s:%" PRIx64 "+%" PRIx64
+ " +%zx", block->idstr, offset,
+ block->fd_offset, page_size);
exit(1);
}
- memory_try_enable_merging(vaddr, length);
- qemu_ram_setup_dump(vaddr, length);
+ memory_try_enable_merging(vaddr, page_size);
+ qemu_ram_setup_dump(vaddr, page_size);
}
+
+ break;
}
}
}
--
2.43.5
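
To illustrate the constraint this patch addresses, here is a small standalone
sketch (not part of the patch): a MAP_FIXED remap over hugetlb-backed memory
only succeeds when the address, length and file offset cover a complete huge
page. The hugetlbfs path and the 2 MiB page size are assumptions made for the
example.

/*
 * Standalone illustration (not QEMU code): remap one complete hugetlb
 * page in place with mmap(MAP_FIXED), as qemu_ram_remap() now does for
 * hugetlb-backed RAMBlocks.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>

int main(void)
{
    size_t page_size = 2 * 1024 * 1024;            /* assumed hugetlb size */
    int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);

    if (fd < 0 || ftruncate(fd, 4 * page_size) < 0) {
        perror("setup");
        return 1;
    }
    void *base = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE,
                      MAP_SHARED, fd, 0);
    if (base == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    /* Pretend a poison was reported somewhere inside the third huge page. */
    uintptr_t poisoned = (uintptr_t)base + 2 * page_size + 0x1234;
    /*
     * Align down to the start of the huge page, as the patch does with
     * QEMU_ALIGN_DOWN(offset, page_size); a partial remap would fail.
     */
    void *vaddr = (void *)(poisoned & ~((uintptr_t)page_size - 1));
    off_t offset = (char *)vaddr - (char *)base;
    void *area = mmap(vaddr, page_size, PROT_READ | PROT_WRITE,
                      MAP_FIXED | MAP_SHARED, fd, offset);

    if (area != vaddr) {
        perror("remap");
        return 1;
    }
    printf("remapped one %zu-byte huge page at %p\n", page_size, vaddr);
    return 0;
}
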
* Re: [PATCH v6 1/6] system/physmem: handle hugetlb correctly in qemu_ram_remap()
From: David Hildenbrand @ 2025-01-30 10:05 UTC
To: William Roche, kvm, qemu-devel, qemu-arm
Cc: peterx, pbonzini, richard.henderson, philmd, peter.maydell,
mtosatti, imammedo, eduardo, marcel.apfelbaum, wangyanan55,
zhao1.liu, joao.m.martins
On 27.01.25 22:31, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
>
> The list of hwpoison pages used to remap the memory on reset
> is based on the backend real page size.
> To correctly handle hugetlb, we must mmap(MAP_FIXED) a complete
> hugetlb page; hugetlb pages cannot be partially mapped.
>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
Acked-by: David Hildenbrand <david@redhat.com>
--
Cheers,
David / dhildenb
* [PATCH v6 2/6] system/physmem: poisoned memory discard on reboot
From: William Roche @ 2025-01-27 21:31 UTC
To: david, kvm, qemu-devel, qemu-arm
Cc: william.roche, peterx, pbonzini, richard.henderson, philmd,
peter.maydell, mtosatti, imammedo, eduardo, marcel.apfelbaum,
wangyanan55, zhao1.liu, joao.m.martins
From: William Roche <william.roche@oracle.com>
Repair poisoned memory location(s) by calling ram_block_discard_range():
punch a hole in the backend file when necessary and regenerate
usable memory.
If the kernel doesn't support the madvise calls used by this function
and we are dealing with anonymous memory, fall back to remapping the
location(s).
Signed-off-by: William Roche <william.roche@oracle.com>
---
system/physmem.c | 54 ++++++++++++++++++++++++++++--------------------
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/system/physmem.c b/system/physmem.c
index 3dd2adde73..3dc10ae27b 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2167,6 +2167,23 @@ void qemu_ram_free(RAMBlock *block)
}
#ifndef _WIN32
+/* Simply remap the given VM memory location from start to start+length */
+static int qemu_ram_remap_mmap(RAMBlock *block, uint64_t start, size_t length)
+{
+ int flags, prot;
+ void *area;
+ void *host_startaddr = block->host + start;
+
+ assert(block->fd < 0);
+ flags = MAP_FIXED | MAP_ANONYMOUS;
+ flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE;
+ flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
+ prot = PROT_READ;
+ prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
+ area = mmap(host_startaddr, length, prot, flags, -1, 0);
+ return area != host_startaddr ? -errno : 0;
+}
+
/*
* qemu_ram_remap - remap a single RAM page
*
@@ -2184,9 +2201,7 @@ void qemu_ram_remap(ram_addr_t addr)
{
RAMBlock *block;
uint64_t offset;
- int flags;
- void *area, *vaddr;
- int prot;
+ void *vaddr;
size_t page_size;
RAMBLOCK_FOREACH(block) {
@@ -2201,25 +2216,20 @@ void qemu_ram_remap(ram_addr_t addr)
;
} else if (xen_enabled()) {
abort();
- } else {
- flags = MAP_FIXED;
- flags |= block->flags & RAM_SHARED ?
- MAP_SHARED : MAP_PRIVATE;
- flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
- prot = PROT_READ;
- prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
- if (block->fd >= 0) {
- area = mmap(vaddr, page_size, prot, flags, block->fd,
- offset + block->fd_offset);
- } else {
- flags |= MAP_ANONYMOUS;
- area = mmap(vaddr, page_size, prot, flags, -1, 0);
- }
- if (area != vaddr) {
- error_report("Could not remap RAM %s:%" PRIx64 "+%" PRIx64
- " +%zx", block->idstr, offset,
- block->fd_offset, page_size);
- exit(1);
+ if (ram_block_discard_range(block, offset, page_size) != 0) {
+ /*
+ * Fall back to using mmap() only for anonymous mapping,
+ * as if a backing file is associated we may not be able
+ * to recover the memory in all cases.
+ * So don't take the risk of using only mmap and fail now.
+ */
+ if (block->fd >= 0 ||
+ qemu_ram_remap_mmap(block, offset, page_size) != 0) {
+ error_report("Could not remap RAM %s:%" PRIx64 "+%"
+ PRIx64 " +%zx", block->idstr, offset,
+ block->fd_offset, page_size);
+ exit(1);
+ }
}
memory_try_enable_merging(vaddr, page_size);
qemu_ram_setup_dump(vaddr, page_size);
--
2.43.5
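
At the system-call level, the discard performed by ram_block_discard_range()
roughly boils down to the sketch below. This is an illustration only, with a
hypothetical helper name; the real QEMU function handles additional cases and
performs further checks.

/*
 * Illustration only (hypothetical helper, not QEMU's
 * ram_block_discard_range()).
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>

static int discard_range_sketch(int fd, void *host, off_t file_offset,
                                size_t length)
{
    if (fd >= 0) {
        /* File-backed RAM: punch a hole so that clean, newly allocated
         * pages back the range on the next access. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      file_offset, length) < 0) {
            return -errno;
        }
        return 0;
    }
    /* Anonymous RAM: discard the pages; they are refaulted as zero-fill. */
    return madvise(host, length, MADV_DONTNEED) < 0 ? -errno : 0;
}
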
* Re: [PATCH v6 2/6] system/physmem: poisoned memory discard on reboot
From: David Hildenbrand @ 2025-01-30 10:08 UTC
To: William Roche, kvm, qemu-devel, qemu-arm
Cc: peterx, pbonzini, richard.henderson, philmd, peter.maydell,
mtosatti, imammedo, eduardo, marcel.apfelbaum, wangyanan55,
zhao1.liu, joao.m.martins
On 27.01.25 22:31, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
>
> Repair poisoned memory location(s), calling ram_block_discard_range():
> punching a hole in the backend file when necessary and regenerating
> a usable memory.
> If the kernel doesn't support the madvise calls used by this function
> and we are dealing with anonymous memory, fall back to remapping the
> location(s).
>
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
> system/physmem.c | 54 ++++++++++++++++++++++++++++--------------------
> 1 file changed, 32 insertions(+), 22 deletions(-)
>
> diff --git a/system/physmem.c b/system/physmem.c
> index 3dd2adde73..3dc10ae27b 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -2167,6 +2167,23 @@ void qemu_ram_free(RAMBlock *block)
> }
>
> #ifndef _WIN32
> +/* Simply remap the given VM memory location from start to start+length */
> +static int qemu_ram_remap_mmap(RAMBlock *block, uint64_t start, size_t length)
> +{
> + int flags, prot;
> + void *area;
> + void *host_startaddr = block->host + start;
> +
> + assert(block->fd < 0);
> + flags = MAP_FIXED | MAP_ANONYMOUS;
> + flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE;
> + flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
> + prot = PROT_READ;
> + prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
> + area = mmap(host_startaddr, length, prot, flags, -1, 0);
> + return area != host_startaddr ? -errno : 0;
> +}
> +
> /*
> * qemu_ram_remap - remap a single RAM page
> *
> @@ -2184,9 +2201,7 @@ void qemu_ram_remap(ram_addr_t addr)
> {
> RAMBlock *block;
> uint64_t offset;
> - int flags;
> - void *area, *vaddr;
> - int prot;
> + void *vaddr;
> size_t page_size;
>
> RAMBLOCK_FOREACH(block) {
> @@ -2201,25 +2216,20 @@ void qemu_ram_remap(ram_addr_t addr)
> ;
> } else if (xen_enabled()) {
> abort();
> - } else {
> - flags = MAP_FIXED;
> - flags |= block->flags & RAM_SHARED ?
> - MAP_SHARED : MAP_PRIVATE;
> - flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
> - prot = PROT_READ;
> - prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
> - if (block->fd >= 0) {
> - area = mmap(vaddr, page_size, prot, flags, block->fd,
> - offset + block->fd_offset);
> - } else {
> - flags |= MAP_ANONYMOUS;
> - area = mmap(vaddr, page_size, prot, flags, -1, 0);
> - }
> - if (area != vaddr) {
> - error_report("Could not remap RAM %s:%" PRIx64 "+%" PRIx64
> - " +%zx", block->idstr, offset,
> - block->fd_offset, page_size);
> - exit(1);
> + if (ram_block_discard_range(block, offset, page_size) != 0) {
> + /*
> + * Fall back to using mmap() only for anonymous mapping,
> + * as if a backing file is associated we may not be able
> + * to recover the memory in all cases.
> + * So don't take the risk of using only mmap and fail now.
> + */
> + if (block->fd >= 0 ||
> + qemu_ram_remap_mmap(block, offset, page_size) != 0) {
> + error_report("Could not remap RAM %s:%" PRIx64 "+%"
> + PRIx64 " +%zx", block->idstr, offset,
> + block->fd_offset, page_size);
> + exit(1);
> + }
> }
> memory_try_enable_merging(vaddr, page_size);
> qemu_ram_setup_dump(vaddr, page_size);
Acked-by: David Hildenbrand <david@redhat.com>
--
Cheers,
David / dhildenb
* [PATCH v6 3/6] accel/kvm: Report the loss of a large memory page
From: William Roche @ 2025-01-27 21:31 UTC
To: david, kvm, qemu-devel, qemu-arm
Cc: william.roche, peterx, pbonzini, richard.henderson, philmd,
peter.maydell, mtosatti, imammedo, eduardo, marcel.apfelbaum,
wangyanan55, zhao1.liu, joao.m.martins
From: William Roche <william.roche@oracle.com>
In case a large page is impacted by a memory error, provide
information about the impacted large page before the memory
error injection message.
This message also appears on RAS-enabled ARM platforms, as we
introduce an error injection message similar to the x86 one.
When a large page is impacted, we now report:
Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>
Signed-off-by: William Roche <william.roche@oracle.com>
---
accel/kvm/kvm-all.c | 11 +++++++++++
include/exec/cpu-common.h | 9 +++++++++
system/physmem.c | 21 +++++++++++++++++++++
target/arm/kvm.c | 3 +++
4 files changed, 44 insertions(+)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f89568bfa3..08e14f8960 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1296,6 +1296,17 @@ static void kvm_unpoison_all(void *param)
void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
HWPoisonPage *page;
+ struct RAMBlockInfo rb_info;
+
+ if (qemu_ram_block_location_info_from_addr(ram_addr, &rb_info)) {
+ size_t ps = rb_info.page_size;
+ if (ps > TARGET_PAGE_SIZE) {
+ uint64_t offset = ram_addr - rb_info.offset;
+ error_report("Memory Error on large page from %s:%" PRIx64
+ "+%" PRIx64 " +%zx", rb_info.idstr,
+ QEMU_ALIGN_DOWN(offset, ps), rb_info.fd_offset, ps);
+ }
+ }
QLIST_FOREACH(page, &hwpoison_page_list, list) {
if (page->ram_addr == ram_addr) {
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 3771b2130c..176537fd80 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -110,6 +110,15 @@ int qemu_ram_get_fd(RAMBlock *rb);
size_t qemu_ram_pagesize(RAMBlock *block);
size_t qemu_ram_pagesize_largest(void);
+struct RAMBlockInfo {
+ char idstr[256];
+ ram_addr_t offset;
+ uint64_t fd_offset;
+ size_t page_size;
+};
+bool qemu_ram_block_location_info_from_addr(ram_addr_t ram_addr,
+ struct RAMBlockInfo *block);
+
/**
* cpu_address_space_init:
* @cpu: CPU to add this address space to
diff --git a/system/physmem.c b/system/physmem.c
index 3dc10ae27b..c835fef26b 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1678,6 +1678,27 @@ size_t qemu_ram_pagesize_largest(void)
return largest;
}
+/* Copy RAMBlock information associated to the given ram_addr location */
+bool qemu_ram_block_location_info_from_addr(ram_addr_t ram_addr,
+ struct RAMBlockInfo *b_info)
+{
+ RAMBlock *rb;
+
+ assert(b_info);
+
+ RCU_READ_LOCK_GUARD();
+ rb = qemu_get_ram_block(ram_addr);
+ if (!rb) {
+ return false;
+ }
+
+ pstrcat(b_info->idstr, sizeof(b_info->idstr), rb->idstr);
+ b_info->offset = rb->offset;
+ b_info->fd_offset = rb->fd_offset;
+ b_info->page_size = rb->page_size;
+ return true;
+}
+
static int memory_try_enable_merging(void *addr, size_t len)
{
if (!machine_mem_merge(current_machine)) {
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index da30bdbb23..d9dedc6d74 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -2389,6 +2389,9 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
kvm_cpu_synchronize_state(c);
if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
kvm_inject_arm_sea(c);
+ error_report("Guest Memory Error at QEMU addr %p and "
+ "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
+ addr, paddr, "BUS_MCEERR_AR");
} else {
error_report("failed to record the error");
abort();
--
2.43.5
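
As an example, for a hypothetical RAMBlock named pc.ram backed by 2 MiB huge
pages, with an error hitting its second huge page (all values invented for
illustration, printed in hex without a 0x prefix per the format above), the
message would look like:

    Memory Error on large page from pc.ram:200000+0 +200000
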
* Re: [PATCH v6 3/6] accel/kvm: Report the loss of a large memory page
From: David Hildenbrand @ 2025-01-30 12:02 UTC
To: William Roche, kvm, qemu-devel, qemu-arm
Cc: peterx, pbonzini, richard.henderson, philmd, peter.maydell,
mtosatti, imammedo, eduardo, marcel.apfelbaum, wangyanan55,
zhao1.liu, joao.m.martins
On 27.01.25 22:31, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
>
> In case of a large page impacted by a memory error, provide an
> information about the impacted large page before the memory
> error injection message.
>
> This message would also appear on ras enabled ARM platforms, with
> the introduction of an x86 similar error injection message.
>
> In the case of a large page impacted, we now report:
> Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>
>
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
> accel/kvm/kvm-all.c | 11 +++++++++++
> include/exec/cpu-common.h | 9 +++++++++
> system/physmem.c | 21 +++++++++++++++++++++
> target/arm/kvm.c | 3 +++
> 4 files changed, 44 insertions(+)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f89568bfa3..08e14f8960 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1296,6 +1296,17 @@ static void kvm_unpoison_all(void *param)
> void kvm_hwpoison_page_add(ram_addr_t ram_addr)
> {
> HWPoisonPage *page;
> + struct RAMBlockInfo rb_info;
> +
> + if (qemu_ram_block_location_info_from_addr(ram_addr, &rb_info)) {
> + size_t ps = rb_info.page_size;
Empty line missing.
> + if (ps > TARGET_PAGE_SIZE) {
> + uint64_t offset = ram_addr - rb_info.offset;
Empty line missing.
Don't you have to align the fd_offset also down?
I suggest doing the alignment already when calculating "uint64_t offset"
--
Cheers,
David / dhildenb
* Re: [PATCH v6 3/6] accel/kvm: Report the loss of a large memory page
From: David Hildenbrand @ 2025-01-30 17:02 UTC
To: William Roche, kvm, qemu-devel, qemu-arm
Cc: peterx, pbonzini, richard.henderson, philmd, peter.maydell,
mtosatti, imammedo, eduardo, marcel.apfelbaum, wangyanan55,
zhao1.liu, joao.m.martins
On 27.01.25 22:31, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
>
> In case of a large page impacted by a memory error, provide an
> information about the impacted large page before the memory
> error injection message.
>
> This message would also appear on ras enabled ARM platforms, with
> the introduction of an x86 similar error injection message.
>
> In the case of a large page impacted, we now report:
> Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>
>
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
> accel/kvm/kvm-all.c | 11 +++++++++++
> include/exec/cpu-common.h | 9 +++++++++
> system/physmem.c | 21 +++++++++++++++++++++
> target/arm/kvm.c | 3 +++
> 4 files changed, 44 insertions(+)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f89568bfa3..08e14f8960 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1296,6 +1296,17 @@ static void kvm_unpoison_all(void *param)
> void kvm_hwpoison_page_add(ram_addr_t ram_addr)
> {
> HWPoisonPage *page;
> + struct RAMBlockInfo rb_info;
> +
> + if (qemu_ram_block_location_info_from_addr(ram_addr, &rb_info)) {
> + size_t ps = rb_info.page_size;
> + if (ps > TARGET_PAGE_SIZE) {
> + uint64_t offset = ram_addr - rb_info.offset;
> + error_report("Memory Error on large page from %s:%" PRIx64
> + "+%" PRIx64 " +%zx", rb_info.idstr,
> + QEMU_ALIGN_DOWN(offset, ps), rb_info.fd_offset, ps);
> + }
> + }
Some smaller nits:
1) I'd call it qemu_ram_block_info_from_addr() -- drop the "_location"
2) Printing the fd_offset only makes sense if there is an fd, right?
You'd have to communicate that information as well.
Apart from that, this series LGTM, thanks!
--
Cheers,
David / dhildenb
* Re: [PATCH v6 3/6] accel/kvm: Report the loss of a large memory page
From: William Roche @ 2025-02-01 9:57 UTC
To: David Hildenbrand, kvm, qemu-devel, qemu-arm
Cc: peterx, pbonzini, richard.henderson, philmd, peter.maydell,
mtosatti, imammedo, eduardo, marcel.apfelbaum, wangyanan55,
zhao1.liu, joao.m.martins
On 1/30/25 18:02, David Hildenbrand wrote:
> On 27.01.25 22:31, William Roche wrote:
>> From: William Roche <william.roche@oracle.com>
>>
>> In case of a large page impacted by a memory error, provide an
>> information about the impacted large page before the memory
>> error injection message.
>>
>> This message would also appear on ras enabled ARM platforms, with
>> the introduction of an x86 similar error injection message.
>>
>> In the case of a large page impacted, we now report:
>> Memory Error on large page from <backend>:<address>+<fd_offset>
>> +<page_size>
>>
>> Signed-off-by: William Roche <william.roche@oracle.com>
>> ---
>> accel/kvm/kvm-all.c | 11 +++++++++++
>> include/exec/cpu-common.h | 9 +++++++++
>> system/physmem.c | 21 +++++++++++++++++++++
>> target/arm/kvm.c | 3 +++
>> 4 files changed, 44 insertions(+)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index f89568bfa3..08e14f8960 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -1296,6 +1296,17 @@ static void kvm_unpoison_all(void *param)
>> void kvm_hwpoison_page_add(ram_addr_t ram_addr)
>> {
>> HWPoisonPage *page;
>> + struct RAMBlockInfo rb_info;
>> +
>> + if (qemu_ram_block_location_info_from_addr(ram_addr, &rb_info)) {
>> + size_t ps = rb_info.page_size;
>> + if (ps > TARGET_PAGE_SIZE) {
>> + uint64_t offset = ram_addr - rb_info.offset;
>> + error_report("Memory Error on large page from %s:%" PRIx64
>> + "+%" PRIx64 " +%zx", rb_info.idstr,
>> + QEMU_ALIGN_DOWN(offset, ps),
>> rb_info.fd_offset, ps);
>> + }
>> + }
>
> Some smaller nits:
>
> 1) I'd call it qemu_ram_block_info_from_addr() -- drop the "_location"
>
> 2) Printing the fd_offset only makes sense if there is an fd, right?
> You'd have to communicate that information as well.
>
>
>
> Apart from that, this series LGTM, thanks!
Thank you David for your feedback.
I'll change the two nits above and add the two missing empty lines (in
patch 3/6).
I also removed the fd_offset information in the message from the
qemu_ram_remap() function when we don't have an associated fd (patch 2/6).
You also asked me:
> Don't you have to align the fd_offset also down?
>
fd_offset doesn't need to be aligned, as it is used with the value given,
which should already match the backend's requirements (when provided by
the administrator, for example).
> I suggest doing the alignment already when calculating "uint64_t offset"
>
But yes, for the offset value itself it's much better to align it as soon
as it is given its initial value. Thanks, I've made that change as well.
I'm sending a v7 now including all these changes.
I'll also send an update about the kdump behavior on ARM later next week.
Thanks again,
William.
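
For readers following the thread, the direction agreed on above could look
roughly like the sketch below. This is not the actual v7 code; in particular,
the fd field in RAMBlockInfo is only an assumed way of communicating whether
the block has a backing file.

/* Hypothetical sketch of the discussed adjustment, not the v7 patch. */
uint64_t offset = QEMU_ALIGN_DOWN(ram_addr - rb_info.offset, ps);

if (rb_info.fd >= 0) {      /* assumed extra RAMBlockInfo field */
    error_report("Memory Error on large page from %s:%" PRIx64
                 "+%" PRIx64 " +%zx", rb_info.idstr, offset,
                 rb_info.fd_offset, ps);
} else {
    error_report("Memory Error on large page from %s:%" PRIx64
                 " +%zx", rb_info.idstr, offset, ps);
}
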
* [PATCH v6 4/6] numa: Introduce and use ram_block_notify_remap()
From: William Roche @ 2025-01-27 21:31 UTC
To: david, kvm, qemu-devel, qemu-arm
Cc: william.roche, peterx, pbonzini, richard.henderson, philmd,
peter.maydell, mtosatti, imammedo, eduardo, marcel.apfelbaum,
wangyanan55, zhao1.liu, joao.m.martins
From: David Hildenbrand <david@redhat.com>
Notify registered listeners about the remap at the end of
qemu_ram_remap() so that, e.g., a memory backend can re-apply its
settings correctly.
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: William Roche <william.roche@oracle.com>
---
hw/core/numa.c | 11 +++++++++++
include/exec/ramlist.h | 3 +++
system/physmem.c | 1 +
3 files changed, 15 insertions(+)
diff --git a/hw/core/numa.c b/hw/core/numa.c
index 218576f745..003bcd8a66 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -895,3 +895,14 @@ void ram_block_notify_resize(void *host, size_t old_size, size_t new_size)
}
}
}
+
+void ram_block_notify_remap(void *host, size_t offset, size_t size)
+{
+ RAMBlockNotifier *notifier;
+
+ QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
+ if (notifier->ram_block_remapped) {
+ notifier->ram_block_remapped(notifier, host, offset, size);
+ }
+ }
+}
diff --git a/include/exec/ramlist.h b/include/exec/ramlist.h
index d9cfe530be..c1dc785a57 100644
--- a/include/exec/ramlist.h
+++ b/include/exec/ramlist.h
@@ -72,6 +72,8 @@ struct RAMBlockNotifier {
size_t max_size);
void (*ram_block_resized)(RAMBlockNotifier *n, void *host, size_t old_size,
size_t new_size);
+ void (*ram_block_remapped)(RAMBlockNotifier *n, void *host, size_t offset,
+ size_t size);
QLIST_ENTRY(RAMBlockNotifier) next;
};
@@ -80,6 +82,7 @@ void ram_block_notifier_remove(RAMBlockNotifier *n);
void ram_block_notify_add(void *host, size_t size, size_t max_size);
void ram_block_notify_remove(void *host, size_t size, size_t max_size);
void ram_block_notify_resize(void *host, size_t old_size, size_t new_size);
+void ram_block_notify_remap(void *host, size_t offset, size_t size);
GString *ram_block_format(void);
diff --git a/system/physmem.c b/system/physmem.c
index c835fef26b..146a78cce7 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2254,6 +2254,7 @@ void qemu_ram_remap(ram_addr_t addr)
}
memory_try_enable_merging(vaddr, page_size);
qemu_ram_setup_dump(vaddr, page_size);
+ ram_block_notify_remap(block->host, offset, page_size);
}
break;
--
2.43.5
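
A hypothetical consumer of the new hook would register a notifier along these
lines (names are made up for the example; patch 6/6 below does this for real
in the host memory backend code):

/* Illustrative listener only. */
static void my_ram_block_remapped(RAMBlockNotifier *n, void *host,
                                  size_t offset, size_t size)
{
    /* Re-apply any per-range settings (madvise, mbind, preallocation, ...)
     * to the remapped range [host + offset, host + offset + size). */
}

static RAMBlockNotifier my_notifier = {
    .ram_block_remapped = my_ram_block_remapped,
};

static void my_subsystem_init(void)
{
    ram_block_notifier_add(&my_notifier);
}
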
* [PATCH v6 5/6] hostmem: Factor out applying settings
From: William Roche @ 2025-01-27 21:31 UTC
To: david, kvm, qemu-devel, qemu-arm
Cc: william.roche, peterx, pbonzini, richard.henderson, philmd,
peter.maydell, mtosatti, imammedo, eduardo, marcel.apfelbaum,
wangyanan55, zhao1.liu, joao.m.martins
From: David Hildenbrand <david@redhat.com>
We want to reuse the functionality when remapping RAM.
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: William Roche <william.roche@oracle.com>
---
backends/hostmem.c | 155 ++++++++++++++++++++++++---------------------
1 file changed, 82 insertions(+), 73 deletions(-)
diff --git a/backends/hostmem.c b/backends/hostmem.c
index bceca1a8d9..46d80f98b4 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -36,6 +36,87 @@ QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
#endif
+static void host_memory_backend_apply_settings(HostMemoryBackend *backend,
+ void *ptr, uint64_t size,
+ Error **errp)
+{
+ bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
+
+ if (backend->merge) {
+ qemu_madvise(ptr, size, QEMU_MADV_MERGEABLE);
+ }
+ if (!backend->dump) {
+ qemu_madvise(ptr, size, QEMU_MADV_DONTDUMP);
+ }
+#ifdef CONFIG_NUMA
+ unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
+ /* lastbit == MAX_NODES means maxnode = 0 */
+ unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1);
+ /*
+ * Ensure policy won't be ignored in case memory is preallocated
+ * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
+ * this doesn't catch hugepage case.
+ */
+ unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
+ int mode = backend->policy;
+
+ /*
+ * Check for invalid host-nodes and policies and give more verbose
+ * error messages than mbind().
+ */
+ if (maxnode && backend->policy == MPOL_DEFAULT) {
+ error_setg(errp, "host-nodes must be empty for policy default,"
+ " or you should explicitly specify a policy other"
+ " than default");
+ return;
+ } else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) {
+ error_setg(errp, "host-nodes must be set for policy %s",
+ HostMemPolicy_str(backend->policy));
+ return;
+ }
+
+ /*
+ * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
+ * as argument to mbind() due to an old Linux bug (feature?) which
+ * cuts off the last specified node. This means backend->host_nodes
+ * must have MAX_NODES+1 bits available.
+ */
+ assert(sizeof(backend->host_nodes) >=
+ BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
+ assert(maxnode <= MAX_NODES);
+
+#ifdef HAVE_NUMA_HAS_PREFERRED_MANY
+ if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) {
+ /*
+ * Replace with MPOL_PREFERRED_MANY otherwise the mbind() below
+ * silently picks the first node.
+ */
+ mode = MPOL_PREFERRED_MANY;
+ }
+#endif
+
+ if (maxnode &&
+ mbind(ptr, size, mode, backend->host_nodes, maxnode + 1, flags)) {
+ if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
+ error_setg_errno(errp, errno,
+ "cannot bind memory to host NUMA nodes");
+ return;
+ }
+ }
+#endif
+ /*
+ * Preallocate memory after the NUMA policy has been instantiated.
+ * This is necessary to guarantee memory is allocated with
+ * specified NUMA policy in place.
+ */
+ if (backend->prealloc &&
+ !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
+ ptr, size, backend->prealloc_threads,
+ backend->prealloc_context, async, errp)) {
+ return;
+ }
+}
+
char *
host_memory_backend_get_name(HostMemoryBackend *backend)
{
@@ -337,7 +418,6 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
void *ptr;
uint64_t sz;
size_t pagesize;
- bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
if (!bc->alloc) {
return;
@@ -357,78 +437,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
return;
}
- if (backend->merge) {
- qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE);
- }
- if (!backend->dump) {
- qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP);
- }
-#ifdef CONFIG_NUMA
- unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
- /* lastbit == MAX_NODES means maxnode = 0 */
- unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1);
- /*
- * Ensure policy won't be ignored in case memory is preallocated
- * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
- * this doesn't catch hugepage case.
- */
- unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
- int mode = backend->policy;
-
- /* check for invalid host-nodes and policies and give more verbose
- * error messages than mbind(). */
- if (maxnode && backend->policy == MPOL_DEFAULT) {
- error_setg(errp, "host-nodes must be empty for policy default,"
- " or you should explicitly specify a policy other"
- " than default");
- return;
- } else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) {
- error_setg(errp, "host-nodes must be set for policy %s",
- HostMemPolicy_str(backend->policy));
- return;
- }
-
- /*
- * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
- * as argument to mbind() due to an old Linux bug (feature?) which
- * cuts off the last specified node. This means backend->host_nodes
- * must have MAX_NODES+1 bits available.
- */
- assert(sizeof(backend->host_nodes) >=
- BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
- assert(maxnode <= MAX_NODES);
-
-#ifdef HAVE_NUMA_HAS_PREFERRED_MANY
- if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) {
- /*
- * Replace with MPOL_PREFERRED_MANY otherwise the mbind() below
- * silently picks the first node.
- */
- mode = MPOL_PREFERRED_MANY;
- }
-#endif
-
- if (maxnode &&
- mbind(ptr, sz, mode, backend->host_nodes, maxnode + 1, flags)) {
- if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
- error_setg_errno(errp, errno,
- "cannot bind memory to host NUMA nodes");
- return;
- }
- }
-#endif
- /*
- * Preallocate memory after the NUMA policy has been instantiated.
- * This is necessary to guarantee memory is allocated with
- * specified NUMA policy in place.
- */
- if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
- ptr, sz,
- backend->prealloc_threads,
- backend->prealloc_context,
- async, errp)) {
- return;
- }
+ host_memory_backend_apply_settings(backend, ptr, sz, errp);
}
static bool
--
2.43.5
* [PATCH v6 6/6] hostmem: Handle remapping of RAM
From: William Roche @ 2025-01-27 21:31 UTC
To: david, kvm, qemu-devel, qemu-arm
Cc: william.roche, peterx, pbonzini, richard.henderson, philmd,
peter.maydell, mtosatti, imammedo, eduardo, marcel.apfelbaum,
wangyanan55, zhao1.liu, joao.m.martins
From: William Roche <william.roche@oracle.com>
Let's register a RAM block notifier and react to remap notifications.
Simply re-apply the settings. Exit if something goes wrong.
Merging and dump settings are handled by the remap notification,
in addition to memory policy and preallocation.
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: William Roche <william.roche@oracle.com>
---
backends/hostmem.c | 34 ++++++++++++++++++++++++++++++++++
include/system/hostmem.h | 1 +
system/physmem.c | 4 ----
3 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/backends/hostmem.c b/backends/hostmem.c
index 46d80f98b4..4589467c77 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -361,11 +361,37 @@ static void host_memory_backend_set_prealloc_threads(Object *obj, Visitor *v,
backend->prealloc_threads = value;
}
+static void host_memory_backend_ram_remapped(RAMBlockNotifier *n, void *host,
+ size_t offset, size_t size)
+{
+ HostMemoryBackend *backend = container_of(n, HostMemoryBackend,
+ ram_notifier);
+ Error *err = NULL;
+
+ if (!host_memory_backend_mr_inited(backend) ||
+ memory_region_get_ram_ptr(&backend->mr) != host) {
+ return;
+ }
+
+ host_memory_backend_apply_settings(backend, host + offset, size, &err);
+ if (err) {
+ /*
+ * If memory settings can't be successfully applied on remap,
+ * don't take the risk to continue without them.
+ */
+ error_report_err(err);
+ exit(1);
+ }
+}
+
static void host_memory_backend_init(Object *obj)
{
HostMemoryBackend *backend = MEMORY_BACKEND(obj);
MachineState *machine = MACHINE(qdev_get_machine());
+ backend->ram_notifier.ram_block_remapped = host_memory_backend_ram_remapped;
+ ram_block_notifier_add(&backend->ram_notifier);
+
/* TODO: convert access to globals to compat properties */
backend->merge = machine_mem_merge(machine);
backend->dump = machine_dump_guest_core(machine);
@@ -379,6 +405,13 @@ static void host_memory_backend_post_init(Object *obj)
object_apply_compat_props(obj);
}
+static void host_memory_backend_finalize(Object *obj)
+{
+ HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+
+ ram_block_notifier_remove(&backend->ram_notifier);
+}
+
bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
{
/*
@@ -595,6 +628,7 @@ static const TypeInfo host_memory_backend_info = {
.instance_size = sizeof(HostMemoryBackend),
.instance_init = host_memory_backend_init,
.instance_post_init = host_memory_backend_post_init,
+ .instance_finalize = host_memory_backend_finalize,
.interfaces = (InterfaceInfo[]) {
{ TYPE_USER_CREATABLE },
{ }
diff --git a/include/system/hostmem.h b/include/system/hostmem.h
index 5c21ca55c0..170849e8a4 100644
--- a/include/system/hostmem.h
+++ b/include/system/hostmem.h
@@ -83,6 +83,7 @@ struct HostMemoryBackend {
HostMemPolicy policy;
MemoryRegion mr;
+ RAMBlockNotifier ram_notifier;
};
bool host_memory_backend_mr_inited(HostMemoryBackend *backend);
diff --git a/system/physmem.c b/system/physmem.c
index 146a78cce7..52cfdeabe7 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2222,7 +2222,6 @@ void qemu_ram_remap(ram_addr_t addr)
{
RAMBlock *block;
uint64_t offset;
- void *vaddr;
size_t page_size;
RAMBLOCK_FOREACH(block) {
@@ -2232,7 +2231,6 @@ void qemu_ram_remap(ram_addr_t addr)
page_size = qemu_ram_pagesize(block);
offset = QEMU_ALIGN_DOWN(offset, page_size);
- vaddr = ramblock_ptr(block, offset);
if (block->flags & RAM_PREALLOC) {
;
} else if (xen_enabled()) {
@@ -2252,8 +2250,6 @@ void qemu_ram_remap(ram_addr_t addr)
exit(1);
}
}
- memory_try_enable_merging(vaddr, page_size);
- qemu_ram_setup_dump(vaddr, page_size);
ram_block_notify_remap(block->host, offset, page_size);
}
--
2.43.5