* [PATCH v1 2/2] osdep: support mempolicy for preallocation in os_mem_prealloc
2021-12-07 7:06 [PATCH v1 1/2] hostmem: use a static size for maxnode, validate policy everywhere Daniil Tatianin
@ 2021-12-07 7:06 ` Daniil Tatianin
2021-12-07 8:13 ` David Hildenbrand
2021-12-07 8:31 ` [PATCH v1 1/2] hostmem: use a static size for maxnode, validate policy everywhere David Hildenbrand
1 sibling, 1 reply; 5+ messages in thread
From: Daniil Tatianin @ 2021-12-07 7:06 UTC (permalink / raw)
To: qemu-devel; +Cc: imammedo, sw, pbonzini, yc-core, david
This is needed for cases where we want to make sure that a shared memory
region gets allocated from a specific NUMA node. This is impossible to do
with mbind(2) because it ignores the policy for memory mapped with
MAP_SHARED. We work around this by calling set_mempolicy from prealloc
threads instead.
Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru>
---
backends/hostmem.c | 6 ++++--
include/qemu/osdep.h | 3 ++-
util/meson.build | 2 ++
util/oslib-posix.c | 29 ++++++++++++++++++++++++++---
util/oslib-win32.c | 3 ++-
5 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/backends/hostmem.c b/backends/hostmem.c
index 392026efe6..0c508ed9df 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -269,7 +269,8 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
void *ptr = memory_region_get_ram_ptr(&backend->mr);
uint64_t sz = memory_region_size(&backend->mr);
- os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, &local_err);
+ os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, backend->policy,
+ backend->host_nodes, MAX_NODES + 1, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
@@ -415,7 +416,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
*/
if (backend->prealloc) {
os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
- backend->prealloc_threads, &local_err);
+ backend->prealloc_threads, backend->policy,
+ backend->host_nodes, MAX_NODES + 1, &local_err);
if (local_err) {
goto out;
}
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 60718fc342..abf88aeb0e 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -688,7 +688,8 @@ unsigned long qemu_getauxval(unsigned long type);
void qemu_set_tty_echo(int fd, bool echo);
void os_mem_prealloc(int fd, char *area, size_t sz, int smp_cpus,
- Error **errp);
+ int policy, unsigned long *node_bitmap,
+ unsigned long max_node, Error **errp);
/**
* qemu_get_pid_name:
diff --git a/util/meson.build b/util/meson.build
index 05b593055a..25f9fca379 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -87,3 +87,5 @@ if have_block
if_false: files('filemonitor-stub.c'))
util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
endif
+
+util_ss.add(when: 'CONFIG_NUMA', if_true: numa)
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e8bdb02e1d..bca25698c5 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -38,11 +38,13 @@
#include "qemu/sockets.h"
#include "qemu/thread.h"
#include <libgen.h>
+#include "qemu/bitmap.h"
#include "qemu/cutils.h"
#include "qemu/compiler.h"
#ifdef CONFIG_LINUX
#include <sys/syscall.h>
+#include <numaif.h>
#endif
#ifdef __FreeBSD__
@@ -79,6 +81,9 @@ struct MemsetThread {
size_t hpagesize;
QemuThread pgthread;
sigjmp_buf env;
+ int policy;
+ unsigned long *node_bitmap;
+ unsigned long max_node;
};
typedef struct MemsetThread MemsetThread;
@@ -464,6 +469,18 @@ static void *do_touch_pages(void *arg)
}
qemu_mutex_unlock(&page_mutex);
+#ifdef CONFIG_NUMA
+ if (memset_args->max_node &&
+ !bitmap_empty(memset_args->node_bitmap, memset_args->max_node)) {
+ long ret = set_mempolicy(memset_args->policy, memset_args->node_bitmap,
+ memset_args->max_node);
+ if (ret < 0) {
+ memset_thread_failed = true;
+ return NULL;
+ }
+ }
+#endif
+
/* unblock SIGBUS */
sigemptyset(&set);
sigaddset(&set, SIGBUS);
@@ -510,7 +527,8 @@ static inline int get_memset_num_threads(int smp_cpus)
}
static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
- int smp_cpus)
+ int smp_cpus, int policy,
+ unsigned long *node_bitmap, unsigned long max_node)
{
static gsize initialized = 0;
size_t numpages_per_thread, leftover;
@@ -533,6 +551,9 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
memset_thread[i].addr = addr;
memset_thread[i].numpages = numpages_per_thread + (i < leftover);
memset_thread[i].hpagesize = hpagesize;
+ memset_thread[i].policy = policy;
+ memset_thread[i].node_bitmap = node_bitmap;
+ memset_thread[i].max_node = max_node;
qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
do_touch_pages, &memset_thread[i],
QEMU_THREAD_JOINABLE);
@@ -554,7 +575,8 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
}
void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
- Error **errp)
+ int policy, unsigned long *node_bitmap,
+ unsigned long max_node, Error **errp)
{
int ret;
struct sigaction act, oldact;
@@ -573,7 +595,8 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
}
/* touch pages simultaneously */
- if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
+ if (touch_all_pages(area, hpagesize, numpages, smp_cpus, policy,
+ node_bitmap, max_node)) {
error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
"pages available to allocate guest RAM");
}
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index af559ef339..3e56bf9f09 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -371,7 +371,8 @@ int getpagesize(void)
}
void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
- Error **errp)
+ int policy, unsigned long *node_bitmap,
+ unsigned long max_node, Error **errp)
{
int i;
size_t pagesize = qemu_real_host_page_size;
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v1 1/2] hostmem: use a static size for maxnode, validate policy everywhere
2021-12-07 7:06 [PATCH v1 1/2] hostmem: use a static size for maxnode, validate policy everywhere Daniil Tatianin
2021-12-07 7:06 ` [PATCH v1 2/2] osdep: support mempolicy for preallocation in os_mem_prealloc Daniil Tatianin
@ 2021-12-07 8:31 ` David Hildenbrand
1 sibling, 0 replies; 5+ messages in thread
From: David Hildenbrand @ 2021-12-07 8:31 UTC (permalink / raw)
To: Daniil Tatianin, qemu-devel; +Cc: imammedo, pbonzini, yc-core, sw
On 07.12.21 08:06, Daniil Tatianin wrote:
> Previously we would calculate the last set bit in the mask, and add
> 2 to that value to get the maxnode value. This is unnecessary since
> the mbind syscall allows the bitmap to be any (reasonable) size as
> long as all the unused bits are clear. This also adds policy validation
> in multiple places so that it's guaranteed to be valid when we call
> mbind.
>
> Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru>
> ---
> backends/hostmem.c | 64 +++++++++++++++++++++++++++++++---------------
> 1 file changed, 43 insertions(+), 21 deletions(-)
>
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index 4c05862ed5..392026efe6 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -38,6 +38,29 @@ host_memory_backend_get_name(HostMemoryBackend *backend)
> return object_get_canonical_path(OBJECT(backend));
> }
>
> +static bool
> +validate_policy(HostMemPolicy policy, bool nodes_empty, Error **errp)
> +{
> + /*
> + * check for invalid host-nodes and policies and give more verbose
> + * error messages than mbind().
> + */
> + if (!nodes_empty && policy == MPOL_DEFAULT) {
> + error_setg(errp, "host-nodes must be empty for policy default,"
> + " or you should explicitly specify a policy other"
> + " than default");
> + return false;
> + }
> +
> + if (nodes_empty && policy != MPOL_DEFAULT) {
> + error_setg(errp, "host-nodes must be set for policy %s",
> + HostMemPolicy_str(policy));
> + return false;
> + }
> +
> + return true;
> +}
Hm, we set two properties individually but bail out when the current combination
is impossible, which is nasty. It means we have modify properties in the right order
(which will differ based on the policy) to make a change.
Do we have any sane use case of modifying the policy/host-nodes at runtime?
I mean, it's just completely wrong when we already have any memory
preallocated/touched inside the range, as we won't issue another mbind call.
I suggest instead to fix this hole:
diff --git a/backends/hostmem.c b/backends/hostmem.c
index 4c05862ed5..7edc3a075e 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -111,6 +111,11 @@ host_memory_backend_set_host_nodes(Object *obj, Visitor *v, const char *name,
HostMemoryBackend *backend = MEMORY_BACKEND(obj);
uint16List *l, *host_nodes = NULL;
+ if (host_memory_backend_mr_inited(backend)) {
+ error_setg(errp, "Property 'host-nodes' cannot be changed anymore.");
+ return;
+ }
+
visit_type_uint16List(v, name, &host_nodes, errp);
for (l = host_nodes; l; l = l->next) {
@@ -142,6 +147,12 @@ static void
host_memory_backend_set_policy(Object *obj, int policy, Error **errp)
{
HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+
+ if (host_memory_backend_mr_inited(backend)) {
+ error_setg(errp, "Property 'policy' cannot be changed anymore.");
+ return;
+ }
+
backend->policy = policy;
#ifndef CONFIG_NUMA
--
Thanks,
David / dhildenb
^ permalink raw reply related [flat|nested] 5+ messages in thread