From: "William Roche" <william.roche@oracle.com>
To: pbonzini@redhat.com, peterx@redhat.com, david@redhat.com,
philmd@linaro.org, marcandre.lureau@redhat.com,
berrange@redhat.com, thuth@redhat.com,
richard.henderson@linaro.org, peter.maydell@linaro.org,
mtosatti@redhat.com, qemu-devel@nongnu.org
Cc: kvm@vger.kernel.org, qemu-arm@nongnu.org,
william.roche@oracle.com, joao.m.martins@oracle.com
Subject: [RFC RESEND 4/6] system: Introducing hugetlbfs largepage RAS feature
Date: Tue, 10 Sep 2024 10:02:14 +0000
Message-ID: <20240910100216.2744078-5-william.roche@oracle.com>
In-Reply-To: <20240910100216.2744078-1-william.roche@oracle.com>
From: William Roche <william.roche@oracle.com>
We need to handle HW errors impacting hugetlbfs large pages, to
increase the probability of surviving a memory poisoning.
When an error is detected, the platform kernel marks the entire
hugetlbfs large page as "poisoned" and reports the event to all
potential users using SIGBUS.
This change introduces two aspects:
. recovering the non-HWPOISON data from the error-impacted large page
. splitting this large page into standard-sized pages
When QEMU receives this SIGBUS, it tries to recover as much data
as possible from the hugepage backend file (which has to be a MAP_SHARED
mapping), with the help of the following kernel feature:
linux commit 38c1ddbde6c6 ("hugetlbfs: improve read HWPOISON hugepage")
The impacted hugetlbfs large page is replaced by a set of standard-sized
pages populated, by reading the backend file, with the content of the
recovered area, or with a poison for the unrecoverable locations.
Any error reading this file results in the corresponding standard-sized
page being poisoned. And if the file mapping is not set up with
"share=on", the entire replacing set of pages is poisoned.
We pause the VM to perform the memory replacement. To do so, we have
to get out of the SIGBUS handler(s), but the signal will be
re-raised on VM resume.
The platform kernel may report the beginning of the error-impacted
large page in the SIGBUS siginfo data, so we may have to adjust this
poison address information to point to the finer-grained actual
poison location, based on the replacing standard-sized pages.
Reporting more precise poison information to the VM gives it the
possibility to react better to this situation, improving
the overall RAS of hugetlbfs-backed VMs.
Signed-off-by: William Roche <william.roche@oracle.com>
---
accel/kvm/kvm-all.c | 7 +
meson.build | 2 +
system/hugetlbfs_ras.c | 546 +++++++++++++++++++++++++++++++++++++++++
system/hugetlbfs_ras.h | 3 +
system/meson.build | 1 +
system/physmem.c | 17 ++
target/arm/kvm.c | 10 +
target/i386/kvm/kvm.c | 10 +
8 files changed, 596 insertions(+)
create mode 100644 system/hugetlbfs_ras.c
create mode 100644 system/hugetlbfs_ras.h
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index bcccf80bd7..6c6841f935 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -57,6 +57,10 @@
#include <sys/eventfd.h>
#endif
+#ifdef CONFIG_HUGETLBFS_RAS
+#include "system/hugetlbfs_ras.h"
+#endif
+
/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
* need to use the real host PAGE_SIZE, as that's what KVM will use.
*/
@@ -1211,6 +1215,9 @@ static void kvm_unpoison_all(void *param)
{
HWPoisonPage *page, *next_page;
+#ifdef CONFIG_HUGETLBFS_RAS
+ hugetlbfs_ras_empty();
+#endif
QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
QLIST_REMOVE(page, list);
qemu_ram_remap(page->ram_addr, page->page_size);
diff --git a/meson.build b/meson.build
index fbda17c987..03214586c4 100644
--- a/meson.build
+++ b/meson.build
@@ -3029,6 +3029,8 @@ if host_os == 'windows'
endif
endif
+config_host_data.set('CONFIG_HUGETLBFS_RAS', host_os == 'linux')
+
########################
# Target configuration #
########################
diff --git a/system/hugetlbfs_ras.c b/system/hugetlbfs_ras.c
new file mode 100644
index 0000000000..2f7e550f56
--- /dev/null
+++ b/system/hugetlbfs_ras.c
@@ -0,0 +1,546 @@
+/*
+ * Deal with memory hugetlbfs largepage errors in userland.
+ *
+ * Copyright (c) 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
+#include <errno.h>
+#include <string.h>
+
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "exec/ramblock.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "qemu/error-report.h"
+#include "block/thread-pool.h"
+#include "sysemu/runstate.h"
+#include "sysemu/kvm.h"
+#include "qemu/main-loop.h"
+#include "block/aio-wait.h"
+
+#include "hugetlbfs_ras.h"
+
+/* #define DEBUG_HUGETLBFS_RAS */
+
+#ifdef DEBUG_HUGETLBFS_RAS
+#define DPRINTF(fmt, ...) \
+ do { fprintf(stderr, "lpg_ras[%s]: " fmt, __func__, ## __VA_ARGS__); \
+ } while (0)
+#else
+#define DPRINTF(fmt, ...) do {} while (0)
+#endif
+
+/*
+ * We track the Large Poisoned Pages to be able to:
+ * - Identify if a faulting page is already under replacement
+ * - Trigger a replacement for new pages
+ * - Inform the suspended signal handlers that they can continue
+ */
+typedef enum LPP_state {
+ LPP_SUBMITTED = 1,
+ LPP_PREPARING,
+ LPP_DONE,
+ LPP_FAILED,
+} LPP_state;
+
+typedef struct LargeHWPoisonPage {
+ void *page_addr; /* hva of the poisoned large page */
+ size_t page_size;
+ LPP_state page_state;
+ void *first_poison; /* location of the first poison found */
+ struct timespec creation_time;
+ QLIST_ENTRY(LargeHWPoisonPage) list;
+} LargeHWPoisonPage;
+
+static QLIST_HEAD(, LargeHWPoisonPage) large_hwpoison_page_list =
+ QLIST_HEAD_INITIALIZER(large_hwpoison_page_list);
+static int large_hwpoison_vm_stop; /* indicate that VM is stopping */
+static QemuCond large_hwpoison_cv;
+static QemuCond large_hwpoison_new;
+static QemuCond large_hwpoison_vm_running;
+static QemuMutex large_hwpoison_mtx;
+static QemuThread thread;
+static void *hugetlbfs_ras_listener(void *arg);
+static int vm_running;
+static bool hugetlbfs_ras_initialized;
+static int _PAGE_SIZE = 4096;
+static int _PAGE_SHIFT = 12;
+
+/* size should be a power of 2 */
+static int
+shift(int sz)
+{
+ int e, s = 0;
+
+ for (e = 0; (s < sz) && (e < 31); e++) {
+ s = (1 << e);
+ if (s == sz) {
+ return e;
+ }
+ }
+ return -1;
+}
+
+static int
+hugetlbfs_ras_init(void)
+{
+ _PAGE_SIZE = qemu_real_host_page_size();
+ _PAGE_SHIFT = shift(_PAGE_SIZE);
+ if (_PAGE_SHIFT < 0) {
+ warn_report("No support for hugetlbfs largepage errors: "
+ "Bad page size (%d)", _PAGE_SIZE);
+ return -EIO;
+ }
+ qemu_cond_init(&large_hwpoison_cv);
+ qemu_cond_init(&large_hwpoison_new);
+ qemu_cond_init(&large_hwpoison_vm_running);
+ qemu_mutex_init(&large_hwpoison_mtx);
+
+ qemu_thread_create(&thread, "hugetlbfs_error", hugetlbfs_ras_listener,
+ NULL, QEMU_THREAD_DETACHED);
+
+ hugetlbfs_ras_initialized = true;
+ return 0;
+}
+
+bool
+hugetlbfs_ras_use(void)
+{
+ static bool answered;
+
+ if (answered) {
+ return hugetlbfs_ras_initialized;
+ }
+
+ /* XXX we could verify if ras feature should be used or not (on ARM) */
+
+ /* CAP_SYS_ADMIN capability needed for madvise(MADV_HWPOISON) */
+ if (getuid() != 0) {
+ warn_report("Privileges needed to deal with hugetlbfs memory poison");
+ } else {
+ hugetlbfs_ras_init();
+ }
+
+ answered = true;
+ return hugetlbfs_ras_initialized;
+}
+
+/* return the backend real page size used for the given address */
+static size_t
+hugetlbfs_ras_backend_sz(void *addr)
+{
+ ram_addr_t offset;
+ RAMBlock *rb;
+
+ rb = qemu_ram_block_from_host(addr, false, &offset);
+ if (!rb) {
+ warn_report("No associated RAMBlock to addr: %p", addr);
+ return _PAGE_SIZE;
+ }
+ return rb->page_size;
+}
+
+/*
+ * Report if this std page address of the given faulted large page should be
+ * retried or if the current signal handler should continue to deal with it.
+ * Once the mapping is replaced, we retry the errors that appeared before
+ * the 'page struct' creation, to deal with previous errors that haven't
+ * been taken into account yet.
+ * But the 4k pages of the mapping can also experience HW errors in their
+ * lifetime.
+ */
+static int
+hugetlbfs_ras_retry(void *addr, LargeHWPoisonPage *page,
+ struct timespec *entry_time)
+{
+ if (addr == page->first_poison) {
+ return 0;
+ }
+ if (entry_time->tv_sec < page->creation_time.tv_sec) {
+ return 1;
+ }
+ if ((entry_time->tv_sec == page->creation_time.tv_sec) &&
+ (entry_time->tv_nsec <= page->creation_time.tv_nsec)) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * If the given address is a large page, we try to replace it
+ * with a set of standard sized pages where we copy what remains
+ * valid from the failed large page.
+ * We reset the two values pointed by paddr and psz to point
+ * to the first poisoned page in the new set, and the size
+ * of this poisoned page.
+ * Return True when it's done. The handler continues with the
+ * possibly corrected values.
+ * Returning False means that there is no signal handler further
+ * action to be taken, the handler should exit.
+ */
+bool
+hugetlbfs_ras_correct(void **paddr, size_t *psz, int code)
+{
+ void *p, *reported_addr;
+ size_t reported_sz, real_sz;
+ LargeHWPoisonPage *page;
+ int found = 0;
+ struct timespec et;
+
+ assert(psz != NULL && paddr != NULL);
+
+ DPRINTF("SIGBUS (%s) at %p (size: %lu)\n",
+ (code == BUS_MCEERR_AR ? "AR" : "AO"), *paddr, *psz);
+
+ if (!hugetlbfs_ras_initialized) {
+ return true;
+ }
+
+ /*
+ * XXX kernel provided size is not reliable...
+ * As kvm_send_hwpoison_signal() uses a hard-coded PAGE_SHIFT
+ * signal value on hwpoison signal.
+ * So in the case of a std page size, we must identify the actual
+ * size to consider (from the mapping block size, or if we
+ * already reduced the page to 4k chunks)
+ */
+ reported_sz = *psz;
+
+ p = *paddr;
+ reported_addr = p;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &et) != 0) {
+ et.tv_sec = 0;
+ et.tv_nsec = 1;
+ }
+ qemu_mutex_lock(&large_hwpoison_mtx);
+
+ if (large_hwpoison_vm_stop) {
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+ return false;
+ }
+
+ QLIST_FOREACH(page, &large_hwpoison_page_list, list) {
+ if (page->page_addr <= p &&
+ page->page_addr + page->page_size > p) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (reported_sz > _PAGE_SIZE) {
+ /* we trust the kernel in this case */
+ real_sz = reported_sz;
+ } else {
+ real_sz = hugetlbfs_ras_backend_sz(p);
+ if (real_sz <= _PAGE_SIZE) {
+ /* not part of a large page */
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+ return true;
+ }
+ }
+ page = g_new0(LargeHWPoisonPage, 1);
+ p = (void *)ROUND_DOWN((unsigned long long)p, real_sz);
+ page->page_addr = p;
+ page->page_size = real_sz;
+ page->page_state = LPP_SUBMITTED;
+ QLIST_INSERT_HEAD(&large_hwpoison_page_list, page, list);
+ qemu_cond_signal(&large_hwpoison_new);
+ } else {
+ if ((code == BUS_MCEERR_AR) && (reported_sz <= _PAGE_SIZE) &&
+ hugetlbfs_ras_retry(p, page, &et)) {
+ *paddr = NULL;
+ }
+ }
+
+ while (page->page_state < LPP_DONE && !large_hwpoison_vm_stop) {
+ qemu_cond_wait(&large_hwpoison_cv, &large_hwpoison_mtx);
+ }
+
+ if (large_hwpoison_vm_stop) {
+ DPRINTF("Handler exit requested on page %p\n", page->page_addr);
+ *paddr = NULL;
+ }
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+
+ if (page->page_state == LPP_FAILED) {
+ warn_report("Failed recovery for page: %p (error at %p)",
+ page->page_addr, reported_addr);
+ return (*paddr == NULL ? false : true);
+ }
+
+ *psz = (size_t)_PAGE_SIZE;
+
+ DPRINTF("SIGBUS (%s) corrected from %p to %p (size %ld to %ld)\n",
+ (code == BUS_MCEERR_AR ? "AR" : "AO"),
+ reported_addr, *paddr, reported_sz, *psz);
+
+ return (*paddr == NULL ? false : true);
+}
+
+/*
+ * Sequentially read the valid data from the failed large page (shared) backend
+ * file and copy that into our set of standard sized pages.
+ * Any error reading this file (not only EIO) means that we don't retrieve
+ * valid data for the read location, so it results in the corresponding
+ * standard page to be marked as poisoned.
+ * And if this file mapping is not set with "share=on", we can't rely on the
+ * content on the backend file, so the entire replacing set of pages
+ * is poisoned in this case.
+ */
+static int take_valid_data_lpg(LargeHWPoisonPage *page, const char **err)
+{
+ int fd, i, ps = _PAGE_SIZE, slot_num, poison_count = 0;
+ ram_addr_t offset;
+ RAMBlock *rb;
+ uint64_t fd_offset;
+ ssize_t count, retrieved;
+
+ /* find the backend to get the associated fd and offset */
+ rb = qemu_ram_block_from_host(page->page_addr, false, &offset);
+ if (!rb) {
+ if (err) {
+ *err = "No associated RAMBlock";
+ }
+ return -1;
+ }
+ fd = qemu_ram_get_fd(rb);
+ fd_offset = rb->fd_offset;
+ offset += fd_offset;
+ assert(page->page_size == qemu_ram_pagesize(rb));
+ slot_num = page->page_size / ps;
+
+ if (!qemu_ram_is_shared(rb)) { /* we can't use the backend file */
+ if (madvise(page->page_addr, page->page_size, MADV_HWPOISON) == 0) {
+ page->first_poison = page->page_addr;
+ warn_report("Large memory error, unrecoverable section "
+ "(unshared hugetlbfs): start:%p length: %ld",
+ page->page_addr, page->page_size);
+ return 0;
+ } else {
+ if (err) {
+ *err = "large poison injection failed";
+ }
+ return -1;
+ }
+ }
+
+ for (i = 0; i < slot_num; i++) {
+ retrieved = 0;
+ while (retrieved < ps) {
+ count = pread(fd, page->page_addr + i * ps + retrieved,
+ ps - retrieved, offset + i * ps + retrieved);
+ if (count == 0) {
+ DPRINTF("read reached end of file\n");
+ break;
+ } else if (count < 0) {
+ DPRINTF("read backend failed: %s\n", strerror(errno));
+ break;
+ }
+ retrieved += count;
+ }
+ if (retrieved < ps) { /* consider this page as poisoned */
+ if (madvise(page->page_addr + i * ps, ps, MADV_HWPOISON)) {
+ if (err) {
+ *err = "poison injection failed";
+ }
+ return -1;
+ }
+ if (page->first_poison == NULL) {
+ page->first_poison = page->page_addr + i * ps;
+ }
+ poison_count++;
+ DPRINTF("Found a poison at index %d = addr %p\n",
+ i, page->page_addr + i * ps);
+ }
+ }
+
+ /*
+ * A large page without at least a 4k poison will not have an
+ * entry in hwpoison_page_list, so it won't be correctly replaced
+ * with a new large page on VM reset with qemu_ram_remap().
+ * Any new error on this area will fail to be handled.
+ */
+ if (poison_count == 0) {
+ if (err) {
+ *err = "No Poison found";
+ }
+ return -1;
+ }
+
+ DPRINTF("Num poison for page %p : %d / %d\n",
+ page->page_addr, poison_count, i);
+ return 0;
+}
+
+/*
+ * Empty the large_hwpoison_page_list -- to be called on address space
+ * poison cleanup outside of concurrent memory access.
+ */
+void hugetlbfs_ras_empty(void)
+{
+ LargeHWPoisonPage *page, *next_page;
+
+ if (!hugetlbfs_ras_initialized) {
+ return;
+ }
+ qemu_mutex_lock(&large_hwpoison_mtx);
+ QLIST_FOREACH_SAFE(page, &large_hwpoison_page_list, list, next_page) {
+ QLIST_REMOVE(page, list);
+ g_free(page);
+ }
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+}
+
+/*
+ * Deal with the given page, initializing its data.
+ */
+static void
+hugetlbfs_ras_transform_page(LargeHWPoisonPage *page, const char **err_info)
+{
+ const char *err_msg;
+ int fd;
+ ram_addr_t offset;
+ RAMBlock *rb;
+
+ /* find the backend to get the associated fd and offset */
+ rb = qemu_ram_block_from_host(page->page_addr, false, &offset);
+ if (!rb) {
+ DPRINTF("No associated RAMBlock to %p\n", page->page_addr);
+ err_msg = "qemu_ram_block_from_host error";
+ goto err;
+ }
+ fd = qemu_ram_get_fd(rb);
+
+ if (sync_file_range(fd, offset, page->page_size,
+ SYNC_FILE_RANGE_WAIT_AFTER) != 0) {
+ err_msg = "sync_file_range error on the backend";
+ perror("sync_file_range");
+ goto err;
+ }
+ if (fsync(fd) != 0) {
+ err_msg = "fsync error on the backend";
+ perror("fsync");
+ goto err;
+ }
+ if (msync(page->page_addr, page->page_size, MS_SYNC) != 0) {
+ err_msg = "msync error on the backend";
+ perror("msync");
+ goto err;
+ }
+ page->page_state = LPP_PREPARING;
+
+ if (munmap(page->page_addr, page->page_size) != 0) {
+ err_msg = "Failed to unmap";
+ perror("munmap");
+ goto err;
+ }
+
+ /* replace the large page with standard pages */
+ if (mmap(page->page_addr, page->page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0)
+ == MAP_FAILED) {
+ err_msg = "Failed to map std page";
+ perror("mmap");
+ goto err;
+ }
+
+ /* take a copy of still valid data and mark the failed pages as poisoned */
+ if (take_valid_data_lpg(page, &err_msg) != 0) {
+ goto err;
+ }
+
+ if (clock_gettime(CLOCK_MONOTONIC, &page->creation_time) != 0) {
+ err_msg = "Failed to set creation time";
+ perror("clock_gettime");
+ goto err;
+ }
+
+ page->page_state = LPP_DONE;
+ return;
+
+err:
+ if (err_info) {
+ *err_info = err_msg;
+ }
+ page->page_state = LPP_FAILED;
+}
+
+/* attempt to vm_stop the entire VM in the IOthread */
+static void coroutine_hugetlbfs_ras_vmstop_bh(void *opaque)
+{
+ vm_stop(RUN_STATE_PAUSED);
+ DPRINTF("VM STOPPED\n");
+ qemu_mutex_lock(&large_hwpoison_mtx);
+ vm_running = 0;
+ qemu_cond_signal(&large_hwpoison_vm_running);
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+}
+
+static void coroutine_hugetlbfs_ras_vmstart_bh(void *opaque)
+{
+ vm_start();
+}
+
+static void *
+hugetlbfs_ras_listener(void *arg)
+{
+ LargeHWPoisonPage *page;
+ int new;
+ const char *err;
+
+ /* monitor any newly submitted element in the list */
+ qemu_mutex_lock(&large_hwpoison_mtx);
+ while (1) {
+ new = 0;
+ QLIST_FOREACH(page, &large_hwpoison_page_list, list) {
+ if (page->page_state == LPP_SUBMITTED) {
+ new++;
+ vm_running = 1;
+ DPRINTF("Stopping the VM\n");
+ aio_bh_schedule_oneshot(qemu_get_aio_context(),
+ coroutine_hugetlbfs_ras_vmstop_bh, NULL);
+ /* inform all SIGBUS threads that they have to return */
+ large_hwpoison_vm_stop++;
+ qemu_cond_broadcast(&large_hwpoison_cv);
+
+ /* wait until VM is stopped */
+ while (vm_running) {
+ DPRINTF("waiting for vm to stop\n");
+ qemu_cond_wait(&large_hwpoison_vm_running,
+ &large_hwpoison_mtx);
+ }
+
+ hugetlbfs_ras_transform_page(page, &err);
+ if (page->page_state == LPP_FAILED) {
+ error_report("fatal: unrecoverable hugepage memory error"
+ " at %p (%s)", page->page_addr, err);
+ exit(1);
+ }
+
+ large_hwpoison_vm_stop--;
+
+ DPRINTF("Restarting the VM\n");
+ aio_bh_schedule_oneshot(qemu_get_aio_context(),
+ coroutine_hugetlbfs_ras_vmstart_bh, NULL);
+ }
+ }
+ if (new) {
+ qemu_cond_broadcast(&large_hwpoison_cv);
+ }
+
+ qemu_cond_wait(&large_hwpoison_new, &large_hwpoison_mtx);
+ }
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+ return NULL;
+}
diff --git a/system/hugetlbfs_ras.h b/system/hugetlbfs_ras.h
new file mode 100644
index 0000000000..324228bda3
--- /dev/null
+++ b/system/hugetlbfs_ras.h
@@ -0,0 +1,3 @@
+bool hugetlbfs_ras_use(void);
+bool hugetlbfs_ras_correct(void **paddr, size_t *psz, int code);
+void hugetlbfs_ras_empty(void);
diff --git a/system/meson.build b/system/meson.build
index a296270cb0..eda92f55a9 100644
--- a/system/meson.build
+++ b/system/meson.build
@@ -37,4 +37,5 @@ system_ss.add(when: 'CONFIG_DEVICE_TREE',
if_false: files('device_tree-stub.c'))
if host_os == 'linux'
system_ss.add(files('async-teardown.c'))
+ system_ss.add(files('hugetlbfs_ras.c'))
endif
diff --git a/system/physmem.c b/system/physmem.c
index 5c176146c0..78de507bd0 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -82,6 +82,10 @@
#include <daxctl/libdaxctl.h>
#endif
+#ifdef CONFIG_HUGETLBFS_RAS
+#include "system/hugetlbfs_ras.h"
+#endif
+
//#define DEBUG_SUBPAGE
/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
@@ -2061,6 +2065,19 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
return NULL;
}
+#ifdef CONFIG_HUGETLBFS_RAS
+ {
+ QemuFsType ftyp = qemu_fd_getfs(fd);
+
+ if (ftyp == QEMU_FS_TYPE_HUGETLBFS) {
+ if (hugetlbfs_ras_use() && !(ram_flags & RAM_SHARED)) {
+ warn_report("'share=on' option must be set to support "
+ "hugetlbfs memory error handling");
+ }
+ }
+ }
+#endif
+
block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, errp);
if (!block) {
if (created) {
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index f62e53e423..6215d1acb5 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -40,6 +40,10 @@
#include "hw/acpi/ghes.h"
#include "target/arm/gtimer.h"
+#ifdef CONFIG_HUGETLBFS_RAS
+#include "system/hugetlbfs_ras.h"
+#endif
+
const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
KVM_CAP_LAST_INFO
};
@@ -2356,6 +2360,12 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr, short addr_lsb)
assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
+#ifdef CONFIG_HUGETLBFS_RAS
+ if (!hugetlbfs_ras_correct(&addr, &sz, code)) {
+ return;
+ }
+#endif
+
if (acpi_ghes_present() && addr) {
ram_addr = qemu_ram_addr_from_host(addr);
if (ram_addr != RAM_ADDR_INVALID &&
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 99b87140cc..c99095cb1f 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -69,6 +69,10 @@
#include "exec/memattrs.h"
#include "trace.h"
+#ifdef CONFIG_HUGETLBFS_RAS
+#include "system/hugetlbfs_ras.h"
+#endif
+
#include CONFIG_DEVICES
//#define DEBUG_KVM
@@ -729,6 +733,12 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr, short addr_lsb)
*/
assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
+#ifdef CONFIG_HUGETLBFS_RAS
+ if (!hugetlbfs_ras_correct(&addr, &sz, code)) {
+ return;
+ }
+#endif
+
if ((env->mcg_cap & MCG_SER_P) && addr) {
ram_addr = qemu_ram_addr_from_host(addr);
if (ram_addr != RAM_ADDR_INVALID &&
--
2.43.5