qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: "Cédric Le Goater" <clg@kaod.org>
To: qemu-ppc@nongnu.org, qemu-devel@nongnu.org
Cc: "Peter Maydell" <peter.maydell@linaro.org>,
	"Daniel Henrique Barboza" <danielhb413@gmail.com>,
	"Cédric Le Goater" <clg@kaod.org>,
	"Shivaprasad G Bhat" <sbhat@linux.ibm.com>
Subject: [PULL 02/39] spapr: nvdimm: Implement H_SCM_FLUSH hcall
Date: Fri, 18 Feb 2022 11:37:50 +0100	[thread overview]
Message-ID: <20220218103827.682032-3-clg@kaod.org> (raw)
In-Reply-To: <20220218103827.682032-1-clg@kaod.org>

From: Shivaprasad G Bhat <sbhat@linux.ibm.com>

This patch adds support for the SCM flush hcall for nvdimm devices.
It becomes available for use by the guest with the next patch. The
hcall is applicable only to the new SPAPR-specific device class, which is
also introduced in this patch.

The hcall semantics are that the flush returns
H_LONG_BUSY_ORDER_10_MSEC, along with a continue_token, when the operation
is expected to take a long time. The hcall is then called again with
the continue_token to get the status. So, all fresh requests are put into
a 'pending' list and a flush worker is submitted to the thread pool. The
thread pool completion callbacks move the requests to the 'completed' list,
and they are cleaned up once their return status has been collected for
the guest in a subsequent hcall.

These semantics make it necessary to preserve the continue_tokens and
their return status across migrations. So, the completed flush states
are forwarded to the destination and the pending ones are restarted
at the destination in post_load. The necessary nvdimm-flush-specific
vmstate structures are also introduced in this patch; they are to be
saved in the new SPAPR-specific nvdimm device introduced in the
following patch.

Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <164396254862.109112.16675611182159105748.stgit@ltczzess4.aus.stglabs.ibm.com>
Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr.h        |   4 +-
 include/hw/ppc/spapr_nvdimm.h |   1 +
 hw/ppc/spapr.c                |   2 +
 hw/ppc/spapr_nvdimm.c         | 260 ++++++++++++++++++++++++++++++++++
 4 files changed, 266 insertions(+), 1 deletion(-)

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index ee7504b9760b..727b2a0e7fc9 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -341,6 +341,7 @@ struct SpaprMachineState {
 #define H_P7              -60
 #define H_P8              -61
 #define H_P9              -62
+#define H_UNSUPPORTED     -67
 #define H_OVERLAP         -68
 #define H_UNSUPPORTED_FLAG -256
 #define H_MULTI_THREADS_ACTIVE -9005
@@ -559,8 +560,9 @@ struct SpaprMachineState {
 #define H_SCM_UNBIND_ALL        0x3FC
 #define H_SCM_HEALTH            0x400
 #define H_RPT_INVALIDATE        0x448
+#define H_SCM_FLUSH             0x44C
 
-#define MAX_HCALL_OPCODE        H_RPT_INVALIDATE
+#define MAX_HCALL_OPCODE        H_SCM_FLUSH
 
 /* The hcalls above are standardized in PAPR and implemented by pHyp
  * as well.
diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h
index 764f999f5471..e9436cb6ef61 100644
--- a/include/hw/ppc/spapr_nvdimm.h
+++ b/include/hw/ppc/spapr_nvdimm.h
@@ -21,5 +21,6 @@ void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt);
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
                            uint64_t size, Error **errp);
 void spapr_add_nvdimm(DeviceState *dev, uint64_t slot);
+void spapr_nvdimm_finish_flushes(void);
 
 #endif
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3d6ec309dd21..92639856635a 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1634,6 +1634,8 @@ static void spapr_machine_reset(MachineState *machine)
         spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
     }
 
+    spapr_nvdimm_finish_flushes();
+
     /* DRC reset may cause a device to be unplugged. This will cause troubles
      * if this device is used by another device (eg, a running vhost backend
      * will crash QEMU if the DIMM holding the vring goes away). To avoid such
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 91de1052f233..ac44e0015317 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -30,6 +31,9 @@
 #include "hw/ppc/fdt.h"
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
+#include "migration/vmstate.h"
+#include "qemu/pmem.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -47,6 +51,14 @@
 /* Have an explicit check for alignment */
 QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
 
+#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
+OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)
+
+struct SPAPRNVDIMMClass {
+    /* private: parent class; no spapr-specific class members are added */
+    NVDIMMClass parent_class;
+};
+
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
                            uint64_t size, Error **errp)
 {
@@ -375,6 +387,253 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
     return H_SUCCESS;
 }
 
+typedef struct SpaprNVDIMMDeviceFlushState {
+    uint64_t continue_token; /* token handed to the guest for this flush */
+    int64_t hcall_ret;       /* worker result, filled in on completion */
+    uint32_t drcidx;         /* DRC index of the device being flushed */
+    /* Links the state into the device's pending or completed list. */
+    QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
+} SpaprNVDIMMDeviceFlushState;
+
+typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
+struct SpaprNVDIMMDevice {
+    NVDIMMDevice parent_obj;
+    /* Flush bookkeeping: last token issued, then in-flight and done lists. */
+    uint64_t nvdimm_flush_token;
+    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
+    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+};
+
+static int flush_worker_cb(void *opaque)
+{
+    SpaprNVDIMMDeviceFlushState *state = opaque;
+    SpaprDrc *drc = spapr_drc_by_index(state->drcidx); /* not NULL-checked */
+    PCDIMMDevice *dimm = PC_DIMM(drc->dev);
+    HostMemoryBackend *backend = MEMORY_BACKEND(dimm->hostmem);
+    int backend_fd = memory_region_get_fd(&backend->mr);
+    /* Flush job given to thread_pool_submit_aio(); returns an hcall status. */
+    if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) {
+        MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
+        void *ptr = memory_region_get_ram_ptr(mr);
+        size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
+                                               NULL);
+
+        /* "pmem" backend: persist the DIMM's mapped RAM range directly */
+        pmem_persist(ptr, size);
+    } else {
+        /* otherwise sync the backing file descriptor; failure is H_HARDWARE */
+        if (qemu_fdatasync(backend_fd) < 0) {
+            error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+                         strerror(errno));
+            return H_HARDWARE;
+        }
+    }
+
+    return H_SUCCESS;
+}
+
+static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
+{
+    SpaprNVDIMMDeviceFlushState *state = opaque;
+    SpaprDrc *drc = spapr_drc_by_index(state->drcidx); /* not NULL-checked */
+    SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(drc->dev);
+    /* Record the result, then move the state onto the completed list. */
+    state->hcall_ret = hcall_ret;
+    QLIST_REMOVE(state, node);
+    QLIST_INSERT_HEAD(&s_nvdimm->completed_nvdimm_flush_states, state, node);
+}
+
+static int spapr_nvdimm_flush_post_load(void *opaque, int version_id)
+{
+    SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque;
+    SpaprNVDIMMDeviceFlushState *state;
+    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+    /* Resubmit a flush job for every state loaded into the pending list. */
+    QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
+        thread_pool_submit_aio(pool, flush_worker_cb, state,
+                               spapr_nvdimm_flush_completion_cb, state);
+    }
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_spapr_nvdimm_flush_state = {
+     .name = "spapr_nvdimm_flush_state",
+     .version_id = 1,
+     .minimum_version_id = 1,
+     .fields = (VMStateField[]) { /* per request: token, result, DRC index */
+         VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
+         VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
+         VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState),
+         VMSTATE_END_OF_LIST()
+     },
+};
+
+const VMStateDescription vmstate_spapr_nvdimm_states = {
+    .name = "spapr_nvdimm_states",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .post_load = spapr_nvdimm_flush_post_load, /* resubmits pending flushes */
+    .fields = (VMStateField[]) { /* token counter, then both state lists */
+        VMSTATE_UINT64(nvdimm_flush_token, SpaprNVDIMMDevice),
+        VMSTATE_QLIST_V(completed_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
+                        vmstate_spapr_nvdimm_flush_state,
+                        SpaprNVDIMMDeviceFlushState, node),
+        VMSTATE_QLIST_V(pending_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
+                        vmstate_spapr_nvdimm_flush_state,
+                        SpaprNVDIMMDeviceFlushState, node),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+/*
+ * Allocate a flush state, give it the next token and queue it as pending.
+ */
+static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(
+                                                SpaprNVDIMMDevice *spapr_nvdimm)
+{
+    SpaprNVDIMMDeviceFlushState *state;
+
+    state = g_malloc0(sizeof(*state));
+    /* g_malloc0() aborts on allocation failure, so state is never NULL. */
+    spapr_nvdimm->nvdimm_flush_token++;
+    /* Token zero is presumed as no job pending. Assert on overflow to zero */
+    g_assert(spapr_nvdimm->nvdimm_flush_token != 0);
+
+    state->continue_token = spapr_nvdimm->nvdimm_flush_token;
+
+    QLIST_INSERT_HEAD(&spapr_nvdimm->pending_nvdimm_flush_states, state, node);
+
+    return state;
+}
+
+/*
+ * spapr_nvdimm_finish_flushes
+ *      Waits for all pending flush requests to complete their
+ *      execution, then frees every completed flush state.
+ */
+void spapr_nvdimm_finish_flushes(void)
+{
+    SpaprNVDIMMDeviceFlushState *state, *next;
+    GSList *list, *nvdimms;
+
+    /*
+     * Called on the reset path: the main loop thread, which normally
+     * runs the pending BHs, is itself executing the reset when it gets
+     * here, so completions are polled for explicitly below. The other
+     * path is the guest's h_client_architecture_support, early in boot.
+     */
+    nvdimms = nvdimm_get_device_list();
+    for (list = nvdimms; list; list = list->next) {
+        NVDIMMDevice *nvdimm = list->data;
+        if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
+            SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(nvdimm);
+            while (!QLIST_EMPTY(&s_nvdimm->pending_nvdimm_flush_states)) {
+                aio_poll(qemu_get_aio_context(), true);
+            }
+
+            QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
+                               node, next) {
+                QLIST_REMOVE(state, node);
+                g_free(state);
+            }
+        }
+    }
+    g_slist_free(nvdimms);
+}
+
+/*
+ * spapr_nvdimm_get_flush_status
+ *      Returns H_LONG_BUSY_ORDER_10_MSEC while the flush for @token is
+ *      pending; once completed, frees its state and returns its hcall_ret.
+ */
+static int spapr_nvdimm_get_flush_status(SpaprNVDIMMDevice *s_nvdimm,
+                                         uint64_t token)
+{
+    SpaprNVDIMMDeviceFlushState *state, *next;
+
+    QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
+        if (state->continue_token == token) {
+            return H_LONG_BUSY_ORDER_10_MSEC;
+        }
+    }
+
+    QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
+                       node, next) {
+        if (state->continue_token == token) {
+            int ret = state->hcall_ret;
+            QLIST_REMOVE(state, node);
+            g_free(state);
+            return ret;
+        }
+    }
+
+    /* Neither pending nor completed: the token is invalid */
+    return H_P2;
+}
+
+/*
+ * H_SCM_FLUSH
+ * Input: drc_index, continue-token
+ * Out: continue-token
+ * Return Value: H_SUCCESS, H_Parameter, H_P2, H_LONG_BUSY_ORDER_10_MSEC,
+ *               H_UNSUPPORTED, H_Hardware
+ *
+ * Given a DRC index, flush the data to the backend NVDIMM device. The hcall
+ * returns H_LONG_BUSY_ORDER_10_MSEC when the flush takes longer and the hcall
+ * needs to be issued multiple times in order to be completely serviced. The
+ * continue-token from the output is to be passed in the argument list of
+ * subsequent hcalls until the hcall is completely serviced, at which point
+ * H_SUCCESS or an error is returned.
+ */
+static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
+                                target_ulong opcode, target_ulong *args)
+{
+    int ret;
+    uint32_t drc_index = args[0];
+    uint64_t continue_token = args[1];
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    PCDIMMDevice *dimm;
+    HostMemoryBackend *backend;
+    SpaprNVDIMMDeviceFlushState *state;
+    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+    int fd;
+
+    if (!drc || !drc->dev ||
+        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    dimm = PC_DIMM(drc->dev);
+    /* Only spapr-nvdimm carries flush state; don't let the casts abort. */
+    if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) {
+        return H_PARAMETER;
+    }
+
+    if (continue_token == 0) {
+        backend = MEMORY_BACKEND(dimm->hostmem);
+        fd = memory_region_get_fd(&backend->mr);
+        if (fd < 0) {
+            return H_UNSUPPORTED;
+        }
+
+        /* Allocation aborts on failure, so state is never NULL here. */
+        state = spapr_nvdimm_init_new_flush_state(SPAPR_NVDIMM(dimm));
+        state->drcidx = drc_index;
+        thread_pool_submit_aio(pool, flush_worker_cb, state,
+                               spapr_nvdimm_flush_completion_cb, state);
+
+        continue_token = state->continue_token;
+    }
+
+    ret = spapr_nvdimm_get_flush_status(SPAPR_NVDIMM(dimm), continue_token);
+    if (H_IS_LONG_BUSY(ret)) {
+        args[0] = continue_token;
+    }
+
+    return ret;
+}
+
 static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                      target_ulong opcode, target_ulong *args)
 {
@@ -523,6 +782,7 @@ static void spapr_scm_register_types(void)
     spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
     spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
     spapr_register_hypercall(H_SCM_HEALTH, h_scm_health);
+    spapr_register_hypercall(H_SCM_FLUSH, h_scm_flush);
 }
 
 type_init(spapr_scm_register_types)
-- 
2.34.1



  parent reply	other threads:[~2022-02-18 10:57 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-02-18 10:37 [PULL 00/39] ppc queue Cédric Le Goater
2022-02-18 10:37 ` [PULL 01/39] nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class Cédric Le Goater
2022-02-18 10:37 ` Cédric Le Goater [this message]
2022-02-18 10:37 ` [PULL 03/39] spapr: nvdimm: Introduce spapr-nvdimm device Cédric Le Goater
2022-02-18 10:37 ` [PULL 04/39] target/ppc: raise HV interrupts for partition table entry problems Cédric Le Goater
2022-02-18 10:37 ` [PULL 05/39] spapr: prevent hdec timer being set up under virtual hypervisor Cédric Le Goater
2022-02-18 10:37 ` [PULL 06/39] ppc: allow the hdecr timer to be created/destroyed Cédric Le Goater
2022-02-18 10:37 ` [PULL 07/39] target/ppc: add vhyp addressing mode helper for radix MMU Cédric Le Goater
2022-02-18 10:37 ` [PULL 08/39] target/ppc: make vhyp get_pate method take lpid and return success Cédric Le Goater
2022-02-18 10:37 ` [PULL 09/39] target/ppc: add helper for books vhyp hypercall handler Cédric Le Goater
2022-02-18 10:37 ` [PULL 10/39] target/ppc: Add powerpc_reset_excp_state helper Cédric Le Goater
2022-02-18 10:37 ` [PULL 11/39] target/ppc: Introduce a vhyp framework for nested HV support Cédric Le Goater
2022-02-18 10:38 ` [PULL 12/39] spapr: implement nested-hv capability for the virtual hypervisor Cédric Le Goater
2022-02-18 10:38 ` [PULL 13/39] target/ppc: cpu_init: Remove not implemented comments Cédric Le Goater
2022-02-18 10:38 ` [PULL 14/39] target/ppc: cpu_init: Remove G2LE init code Cédric Le Goater
2022-02-18 10:38 ` [PULL 15/39] target/ppc: cpu_init: Group registration of generic SPRs Cédric Le Goater
2022-02-18 10:38 ` [PULL 16/39] target/ppc: cpu_init: Move Timebase registration into the common function Cédric Le Goater
2022-02-18 10:38 ` [PULL 17/39] target/ppc: cpu_init: Avoid nested SPR register functions Cédric Le Goater
2022-02-18 10:38 ` [PULL 18/39] target/ppc: cpu_init: Move 405 SPRs into register_405_sprs Cédric Le Goater
2022-02-18 10:38 ` [PULL 19/39] target/ppc: cpu_init: Move G2 SPRs into register_G2_sprs Cédric Le Goater
2022-02-18 10:38 ` [PULL 20/39] target/ppc: cpu_init: Decouple G2 SPR registration from 755 Cédric Le Goater
2022-02-18 10:38 ` [PULL 21/39] target/ppc: cpu_init: Decouple 74xx SPR registration from 7xx Cédric Le Goater
2022-02-18 10:38 ` [PULL 22/39] target/ppc: cpu_init: Deduplicate 440 SPR registration Cédric Le Goater
2022-02-18 10:38 ` [PULL 23/39] target/ppc: cpu_init: Deduplicate 603 " Cédric Le Goater
2022-02-18 10:38 ` [PULL 24/39] target/ppc: cpu_init: Deduplicate 604 " Cédric Le Goater
2022-02-18 10:38 ` [PULL 25/39] target/ppc: cpu_init: Deduplicate 745/755 " Cédric Le Goater
2022-02-18 10:38 ` [PULL 26/39] target/ppc: cpu_init: Deduplicate 7xx " Cédric Le Goater
2022-02-18 10:38 ` [PULL 27/39] target/ppc: cpu_init: Move 755 L2 cache SPRs into a function Cédric Le Goater
2022-02-18 10:38 ` [PULL 28/39] target/ppc: cpu_init: Move e300 SPR registration " Cédric Le Goater
2022-02-18 10:38 ` [PULL 29/39] target/ppc: cpu_init: Move 604e " Cédric Le Goater
2022-02-18 10:38 ` [PULL 30/39] target/ppc: cpu_init: Reuse init_proc_603 for the e300 Cédric Le Goater
2022-02-18 10:38 ` [PULL 31/39] target/ppc: cpu_init: Reuse init_proc_604 for the 604e Cédric Le Goater
2022-02-18 10:38 ` [PULL 32/39] target/ppc: cpu_init: Reuse init_proc_745 for the 755 Cédric Le Goater
2022-02-18 10:38 ` [PULL 33/39] target/ppc: cpu_init: Rename register_ne_601_sprs Cédric Le Goater
2022-02-18 10:38 ` [PULL 34/39] target/ppc: cpu_init: Remove register_usprg3_sprs Cédric Le Goater
2022-02-18 10:38 ` [PULL 35/39] target/ppc: Rename spr_tcg.h to spr_common.h Cédric Le Goater
2022-02-18 10:38 ` [PULL 36/39] target/ppc: cpu_init: Expose some SPR registration helpers Cédric Le Goater
2022-02-18 10:38 ` [PULL 37/39] target/ppc: cpu_init: Move SPR registration macros to a header Cédric Le Goater
2022-02-18 10:38 ` [PULL 38/39] target/ppc: cpu_init: Move check_pow and QOM " Cédric Le Goater
2022-02-18 10:38 ` [PULL 39/39] target/ppc: Move common SPR functions out of cpu_init Cédric Le Goater
2022-02-20 19:28 ` [PULL 00/39] ppc queue Peter Maydell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220218103827.682032-3-clg@kaod.org \
    --to=clg@kaod.org \
    --cc=danielhb413@gmail.com \
    --cc=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    --cc=qemu-ppc@nongnu.org \
    --cc=sbhat@linux.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).