From mboxrd@z Thu Jan 1 00:00:00 1970 From: Shivaprasad G Bhat Date: Tue, 01 Feb 2022 21:53:06 +0000 Subject: Re: [PATCH REBASED v5 1/2] spapr: nvdimm: Implement H_SCM_FLUSH hcall Message-Id: <33114b36-15ca-3f6f-aa8d-8be9e69e8476@linux.ibm.com> List-Id: References: <162571302321.1030381.15196355582642786915.stgit@lep8c.aus.stglabs.ibm.com> <162571303048.1030381.13893352223345979621.stgit@lep8c.aus.stglabs.ibm.com> In-Reply-To: MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit To: David Gibson Cc: groug@kaod.org, qemu-ppc@nongnu.org, qemu-devel@nongnu.org, aneesh.kumar@linux.ibm.com, nvdimm@lists.linux.dev, kvm-ppc@vger.kernel.org, bharata@linux.vnet.ibm.com Hi David, Thanks for comments. Sorry about the delay. Replies inline. On 9/21/21 11:53, David Gibson wrote: > On Wed, Jul 07, 2021 at 09:57:21PM -0500, Shivaprasad G Bhat wrote: >> The patch adds support for the SCM flush hcall for the nvdimm devices. >> To be available for exploitation by guest through the next patch. >> >> The hcall expects the semantics such that the flush to return >> with one of H_LONG_BUSY when the operation is expected to take longer >> time along with a continue_token. The hcall to be called again providing >> the continue_token to get the status. So, all fresh requests are put into >> a 'pending' list and flush worker is submitted to the thread pool. The >> thread pool completion callbacks move the requests to 'completed' list, >> which are cleaned up after reporting to guest in subsequent hcalls t >> @@ -30,6 +31,7 @@ >> #include "hw/ppc/fdt.h" >> #include "qemu/range.h" >> #include "hw/ppc/spapr_numa.h" >> +#include "block/thread-pool.h" >> >> /* DIMM health bitmap bitmap indicators. 
Taken from kernel's papr_scm.c */ >> /* SCM device is unable to persist memory contents */ >> @@ -375,6 +377,243 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, >> return H_SUCCESS; >> } >> >> +static uint64_t flush_token; > > Better to put this in the machine state structure than a global. Moved it to device state itself as suggested, the states list is per device now. > >> +static int flush_worker_cb(void *opaque) >> +{ >> + int ret = H_SUCCESS; >> + SpaprNVDIMMDeviceFlushState *state = opaque; >> + >> + /* flush raw backing image */ >> + >> + !QLIST_EMPTY(&spapr->completed_flush_states)); >> +} >> + >> +static int spapr_nvdimm_post_load(void *opaque, int version_id) >> +{ >> + SpaprMachineState *spapr = (SpaprMachineState *)opaque; >> + SpaprNVDIMMDeviceFlushState *state, *next; >> + PCDIMMDevice *dimm; >> + HostMemoryBackend *backend = NULL; >> + ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); >> + SpaprDrc *drc; >> + >> + QLIST_FOREACH_SAFE(state, &spapr->completed_flush_states, node, next) { > > I don't think you need FOREACH_SAFE here. You're not removing entries > from the loop body. If you're trying to protect against concurrent > removals, I don't think FOREACH_SAFE is sufficient, you'll need an > actual lock (but I think it's already protected by the BQL). Changing here, below and also at spapr_nvdimm_get_flush_status() while traversing the pending list. Verified all these invocations are called with BQL. > >> + if (flush_token < state->continue_token) { >> + flush_token = state->continue_token; >> + } >> + } >> + >> + QLIST_FOREACH_SAFE(state, &spapr->pending_flush_states, node, next) { > > Same comments here. 
> >> + if (flush_token < state->continue_token) { >> + flush_token = state->continue_token; >> + } >> + >> + drc = spapr_drc_by_index(state->drcidx); >> + dimm = PC_DIMM(drc->dev); >> + backend = MEMORY_BACKEND(dimm->hostmem); >> + state->backend_fd = memory_region_get_fd(&backend->mr); >> + >> + thread_pool_submit_aio(pool, flush_worker_cb, state, >> + spapr_nvdimm_flush_completion_cb, state); >> + } >> + >> + return 0; >> +} >> + >> +const VMStateDescription vmstate_spapr_nvdimm_states = { >> + .name = "spapr_nvdimm_states", >> + .version_id = 1, >> + .minimum_version_id = 1, >> + .needed = spapr_nvdimm_states_needed, >> + .post_load = spapr_nvdimm_post_load, >> + .fields = (VMStateField[]) { >> + VMSTATE_QLIST_V(completed_flush_states, SpaprMachineState, 1, >> + vmstate_spapr_nvdimm_flush_state, >> + SpaprNVDIMMDeviceFlushState, node), >> + VMSTATE_QLIST_V(pending_flush_states, SpaprMachineState, 1, >> + vmstate_spapr_nvdimm_flush_state, >> + SpaprNVDIMMDeviceFlushState, node), >> + VMSTATE_END_OF_LIST() >> + }, >> +}; >> + >> +/* >> + * Assign a token and reserve it for the new flush state. >> + */ >> +static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state( >> + SpaprMachineState *spapr) >> +{ >> + SpaprNVDIMMDeviceFlushState *state; >> + >> + state = g_malloc0(sizeof(*state)); >> + >> + flush_token++; >> + /* Token zero is presumed as no job pending. Handle the overflow to zero */ >> + if (flush_token == 0) { >> + flush_token++; > > Hmm... strictly speaking, this isn't safe. It's basically never going > to happen in practice, but in theory there's nothing preventing > continue_token 1 still being outstanding when the flush_token counter > overflows. > > Come to think of it, since it's a uint64_t, I think an actual overflow > is also never going to happen in practice. Maybe we should just > assert() on overflow, and fix it in the unlikely event that we ever > discover a case where it could happen. Have added the assert on overflow. 
> >> + } >> + state->continue_token = flush_token; >> + >> + QLIST_INSERT_HEAD(&spapr->pending_flush_states, state, node); >> + >> + return state; >> +} >> + >> +/* >> + * Thanks!