* [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
2023-10-03 0:21 [RFC PATCH v2 0/4] mm/mempolicy: get/set_mempolicy2 syscalls Gregory Price
@ 2023-10-03 0:21 ` Gregory Price
0 siblings, 0 replies; 4+ messages in thread
From: Gregory Price @ 2023-10-03 0:21 UTC (permalink / raw)
To: linux-mm
Cc: linux-kernel, linux-arch, linux-api, linux-cxl, luto, tglx, mingo,
bp, dave.hansen, hpa, arnd, akpm, x86, Gregory Price
The weighted-interleave mempolicy implements weights per-node
which are used to distribute memory while interleaving.
For example:
nodes: 0,1,2
weights: 5,3,2
Over 10 consecutive allocations, the following nodes will be selected:
[0,0,0,0,0,1,1,1,2,2]
In this example there is a 50%/30%/20% distribution of memory across
the enabled nodes.
If a node is enabled, the minimum weight is expected to be 1. If an
enabled node ends up with a weight of 0 (as can happen if weights
are being recalculated due to a cgroup mask update), a minimum
of 1 is applied during the interleave mechanism.
Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
include/linux/mempolicy.h | 6 +
include/uapi/linux/mempolicy.h | 6 +
mm/mempolicy.c | 261 ++++++++++++++++++++++++++++++++-
3 files changed, 269 insertions(+), 4 deletions(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8f918488c61c..8763e536d4a2 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -54,6 +54,12 @@ struct mempolicy {
int weight;
int count;
} pil;
+ /* weighted interleave */
+ struct {
+ unsigned int il_weight;
+ unsigned char cur_weight;
+ unsigned char weights[MAX_NUMNODES];
+ } wil;
};
union {
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 41c35f404c5e..913ca9bf9af7 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -25,6 +25,7 @@ enum {
MPOL_PREFERRED_MANY,
MPOL_LEGACY, /* set_mempolicy limited to above modes */
MPOL_PREFERRED_INTERLEAVE,
+ MPOL_WEIGHTED_INTERLEAVE,
MPOL_MAX, /* always last member of enum */
};
@@ -58,6 +59,11 @@ struct mempolicy_args {
unsigned long weight; /* get and set */
unsigned long next_node; /* get only */
} pil;
+ /* Weighted interleave */
+ struct {
+ unsigned long next_node; /* get only */
+ unsigned char *weights; /* get and set */
+ } wil;
};
};
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6374312cef5f..92be74d4c431 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -195,11 +195,43 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
nodes_onto(*ret, tmp, *rel);
}
+static void mpol_recalculate_weights(struct mempolicy *pol)
+{
+ unsigned int il_weight = 0;
+ int node;
+
+ /* Recalculate weights to ensure minimum node weight */
+ for (node = 0; node < MAX_NUMNODES; node++) {
+ if (!node_isset(node, pol->nodes) && pol->wil.weights[node]) {
+ /* If node is not set, weight should be 0 */
+ pol->wil.weights[node] = 0;
+ } else if (!pol->wil.weights[node]) {
+ /* If node is set, weight should be minimum of 1 */
+ pol->wil.weights[node] = 1;
+ pol->wil.il_weight += 1;
+ il_weight += 1;
+ } else {
+ /* Otherwise, keep the existing weight */
+ il_weight += pol->wil.weights[node];
+ }
+ }
+ pol->wil.il_weight = il_weight;
+ /*
+ * It's possible an allocation has been occurring at this point
+ * force it to go to the next node, since we just changed weights
+ */
+ pol->wil.cur_weight = 0;
+}
+
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
pol->nodes = *nodes;
+
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ mpol_recalculate_weights(pol);
+
return 0;
}
@@ -334,6 +366,10 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
tmp = *nodes;
pol->nodes = tmp;
+
+ /* After a change to the nodemask, weights must be recalculated */
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ mpol_recalculate_weights(pol);
}
static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -403,6 +439,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
+ [MPOL_WEIGHTED_INTERLEAVE] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_nodemask,
+ },
[MPOL_PREFERRED] = {
.create = mpol_new_preferred,
.rebind = mpol_rebind_preferred,
@@ -878,8 +918,10 @@ static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes)
old = current->mempolicy;
current->mempolicy = new;
if (new && (new->mode == MPOL_INTERLEAVE ||
- new->mode == MPOL_PREFERRED_INTERLEAVE))
+ new->mode == MPOL_PREFERRED_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE))
current->il_prev = MAX_NUMNODES-1;
+
out:
task_unlock(current);
mpol_put(old);
@@ -921,6 +963,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
*nodes = p->nodes;
@@ -1632,6 +1675,56 @@ static long do_set_preferred_interleave(struct mempolicy_args *args,
return 0;
}
+static long do_set_weighted_interleave(struct mempolicy_args *args,
+ struct mempolicy *new,
+ nodemask_t *nodes)
+{
+ unsigned char weight;
+ unsigned char *weights;
+ int node;
+ int ret = 0;
+
+ /* Weighted interleave cannot be done with no nodemask */
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+
+ /* Weighted interleave requires a set of weights */
+ if (!args->wil.weights)
+ return -EINVAL;
+
+ weights = kmalloc(MAX_NUMNODES, GFP_KERNEL);
+ if (!weights)
+ return -ENOMEM;
+
+ ret = copy_from_user(weights, args->wil.weights, MAX_NUMNODES);
+ if (ret) {
+ ret = -EFAULT;
+ goto weights_out;
+ }
+
+ new->wil.cur_weight = 0;
+ new->wil.il_weight = 0;
+ memset(new->wil.weights, 0, sizeof(new->wil.weights));
+
+ /* Weights for set nodes cannot be 0 */
+ node = first_node(*nodes);
+ while (node != MAX_NUMNODES) {
+ weight = weights[node];
+ if (!weight) {
+ ret = -EINVAL;
+ goto weights_out;
+ }
+ /* policy creation initializes total to nr_nodes, adjust it */
+ new->wil.il_weight += weight;
+ new->wil.weights[node] = weight;
+ node = next_node(node, *nodes);
+ }
+
+weights_out:
+ kfree(weights);
+ return ret;
+}
+
static long do_set_mempolicy2(struct mempolicy_args *args)
{
struct mempolicy *new = NULL;
@@ -1656,6 +1749,9 @@ static long do_set_mempolicy2(struct mempolicy_args *args)
case MPOL_PREFERRED_INTERLEAVE:
err = do_set_preferred_interleave(args, new, &nodes);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ err = do_set_weighted_interleave(args, new, &nodes);
+ break;
default:
BUG();
}
@@ -1799,6 +1895,12 @@ static long do_get_mempolicy2(struct mempolicy_args *kargs)
kargs->pil.weight = pol->pil.weight;
rc = 0;
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ kargs->wil.next_node = next_node_in(current->il_prev,
+ pol->nodes);
+ rc = copy_to_user(kargs->wil.weights, pol->wil.weights,
+ MAX_NUMNODES);
+ break;
default:
BUG();
}
@@ -2160,6 +2262,27 @@ static unsigned int preferred_interleave_nodes(struct mempolicy *policy)
return next;
}
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+ unsigned int next;
+ unsigned char next_weight;
+ struct task_struct *me = current;
+
+ /* When weight reaches 0, we're on a new node, reset the weight */
+ next = next_node_in(me->il_prev, policy->nodes);
+ if (!policy->wil.cur_weight) {
+ /* If the node is set, at least 1 allocation is required */
+ next_weight = policy->wil.weights[next];
+ policy->wil.cur_weight = next_weight ? next_weight : 1;
+ }
+
+ policy->wil.cur_weight--;
+ if (next < MAX_NUMNODES && !policy->wil.cur_weight)
+ me->il_prev = next;
+
+ return next;
+}
+
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
@@ -2168,6 +2291,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
if (policy->mode == MPOL_PREFERRED_INTERLEAVE)
return preferred_interleave_nodes(policy);
+ else if (policy->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return weighted_interleave_nodes(policy);
next = next_node_in(me->il_prev, policy->nodes);
if (next < MAX_NUMNODES)
@@ -2197,6 +2322,7 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
return interleave_nodes(policy);
case MPOL_BIND:
@@ -2273,6 +2399,40 @@ static unsigned int offset_pil_node(struct mempolicy *pol, unsigned long n)
return nid;
}
+static unsigned int offset_wil_node(struct mempolicy *pol, unsigned long n)
+{
+ nodemask_t nodemask = pol->nodes;
+ unsigned int target, nnodes;
+ unsigned char weight;
+ int nid;
+
+ /*
+ * The barrier will stabilize the nodemask in a register or on
+ * the stack so that it will stop changing under the code.
+ *
+ * Between first_node() and next_node(), pol->nodes could be changed
+ * by other threads. So we put pol->nodes in a local stack.
+ */
+ barrier();
+
+ nnodes = nodes_weight(nodemask);
+ if (!nnodes)
+ return numa_node_id();
+ target = (unsigned int)n % pol->wil.il_weight;
+ nid = first_node(nodemask);
+ while (target) {
+ weight = pol->wil.weights[nid];
+ /* If weights are being recalculated, revert to interleave */
+ if (!weight)
+ weight = 1;
+ if (target < weight)
+ break;
+ target -= weight;
+ nid = next_node_in(nid, nodemask);
+ }
+ return nid;
+}
+
/*
* Do static interleaving for a VMA with known offset @n. Returns the n'th
* node in pol->nodes (starting from n=0), wrapping around if n exceeds the
@@ -2287,6 +2447,8 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
if (pol->mode == MPOL_PREFERRED_INTERLEAVE)
return offset_pil_node(pol, n);
+ else if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return offset_wil_node(pol, n);
nodemask = pol->nodes;
@@ -2358,7 +2520,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
mode = (*mpol)->mode;
if (unlikely(mode == MPOL_INTERLEAVE) ||
- unlikely(mode == MPOL_PREFERRED_INTERLEAVE)) {
+ unlikely(mode == MPOL_PREFERRED_INTERLEAVE) ||
+ unlikely(mode == MPOL_WEIGHTED_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
@@ -2400,6 +2563,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
*mask = mempolicy->nodes;
break;
@@ -2511,7 +2675,8 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
pol = get_vma_policy(vma, addr);
if (pol->mode == MPOL_INTERLEAVE ||
- pol->mode == MPOL_PREFERRED_INTERLEAVE) {
+ pol->mode == MPOL_PREFERRED_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
struct page *page;
unsigned nid;
@@ -2614,7 +2779,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE ||
- pol->mode == MPOL_PREFERRED_INTERLEAVE)
+ pol->mode == MPOL_PREFERRED_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else if (pol->mode == MPOL_PREFERRED_MANY)
page = alloc_pages_preferred_many(gfp, order,
@@ -2737,6 +2903,84 @@ static unsigned long alloc_pages_bulk_array_pil(gfp_t gfp,
return allocated;
}
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ struct task_struct *me = current;
+ unsigned long total_allocated = 0;
+ unsigned long nr_allocated;
+ unsigned long rounds;
+ unsigned long node_pages, delta;
+ unsigned char weight;
+ int nnodes, node, prev_node;
+ int i;
+
+ nnodes = nodes_weight(pol->nodes);
+ /* Continue allocating from most recent node and adjust the nr_pages */
+ if (pol->wil.cur_weight) {
+ node = next_node_in(me->il_prev, pol->nodes);
+ node_pages = pol->wil.cur_weight;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ /* if that's all the pages, no need to interleave */
+ if (nr_pages <= pol->wil.cur_weight) {
+ pol->wil.cur_weight -= nr_pages;
+ return total_allocated;
+ }
+ /* Otherwise we adjust nr_pages down, and continue from there */
+ nr_pages -= pol->wil.cur_weight;
+ pol->wil.cur_weight = 0;
+ prev_node = node;
+ }
+
+ /* Now we can continue allocating from this point */
+ rounds = nr_pages / pol->wil.il_weight;
+ delta = nr_pages % pol->wil.il_weight;
+ for (i = 0; i < nnodes; i++) {
+ node = next_node_in(prev_node, pol->nodes);
+ weight = pol->wil.weights[node];
+ node_pages = weight * rounds;
+ if (delta) {
+ if (delta > weight) {
+ node_pages += weight;
+ delta -= weight;
+ } else {
+ node_pages += delta;
+ delta = 0;
+ }
+ }
+ /* We may not make it all the way around */
+ if (!node_pages)
+ break;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ prev_node = node;
+ }
+
+ /*
+ * Finally, we need to update me->il_prev and pol->wil.cur_weight
+ * if there were overflow pages, but not equivalent to the node
+ * weight, set the cur_weight to node_weight - delta and the
+ * me->il_prev to the previous node. Otherwise if it was perfect
+ * we can simply set il_prev to node and cur_weight to 0
+ */
+ delta %= weight;
+ if (node_pages) {
+ me->il_prev = prev_node;
+ pol->wil.cur_weight = pol->wil.weights[node] - node_pages;
+ } else {
+ me->il_prev = node;
+ pol->wil.cur_weight = 0;
+ }
+
+ return total_allocated;
+}
+
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
@@ -2779,6 +3023,11 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
return alloc_pages_bulk_array_pil(gfp, pol, nr_pages,
page_array);
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return alloc_pages_bulk_array_weighted_interleave(gfp, pol,
+ nr_pages,
+ page_array);
+
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
@@ -2852,6 +3101,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
return !!nodes_equal(a->nodes, b->nodes);
@@ -2989,6 +3239,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
switch (pol->mode) {
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
polnid = offset_il_node(pol, pgoff);
@@ -3377,6 +3628,7 @@ static const char * const policy_modes[] =
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_PREFERRED_INTERLEAVE] = "preferred interleave",
+ [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -3548,6 +3800,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
nodes = pol->nodes;
break;
default:
--
2.39.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* RE: [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
[not found] <CGME20231206080944epcms2p76ebb230b9f4595f5cfcd2531d67ab3ce@epcms2p7>
@ 2023-12-06 8:09 ` Seungjun Ha
2023-12-06 17:02 ` Gregory Price
0 siblings, 1 reply; 4+ messages in thread
From: Seungjun Ha @ 2023-12-06 8:09 UTC (permalink / raw)
To: Gregory Price
Cc: linux-cxl@vger.kernel.org, Gregory Price, KyungSan Kim,
Wonjae Lee
> The weighted-interleave mempolicy implements weights per-node
> which are used to distribute memory while interleaving.
>
> For example:
> nodes: 0,1,2
> weights: 5,3,2
>
> Over 10 consecutive allocations, the following nodes will be selected:
> [0,0,0,0,0,1,1,1,2,2]
>
> In this example there is a 50%/30%/20% distribution of memory across
> the enabled nodes.
>
> If a node is enabled, the minimum weight is expected to be 0. If an
> enabled node ends up with a weight of 0 (as can happen if weights
> are being recalculated due to a cgroup mask update), a minimum
> of 1 is applied during the interleave mechanism.
I found an issue while using the RFCv2, and want to report it.
In my testbed, calling set_mempolicy2() causes pthread_create() failure or system hang, depending on weight combinations.
FYI please find my testbed where there are 3 memory-nodes.
Node 0 Node 1 Node 2 Result
Weights
6 >= 1 1 pthread_create error: 11(Cannot allocate memory)
1~5 1 1 Pass
1 8 >= 1 pthread_create error: 11(Cannot allocate memory)
1 1~7 1 Pass
1 1 8 >= pthread_create error: 11(Cannot allocate memory)
1 1 1~7 Pass
6 7 7 pthread_create error: 11(Cannot allocate memory)
5 8 7 Pass
5 7 8 Pass
40 30 20 Kernel Hang
Below is the test code to reproduce the issue.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <numa.h>
#include <errno.h>
#include <numaif.h>
#include <unistd.h>
#include <pthread.h>
#define MPOL_WEIGHTED_INTERLEAVE MPOL_DEFAULT + 8
#define SET_MEMPOLICY2(a, b) syscall(454, a, b)
struct mempolicy_args { on this RFC... }
struct mempolicy_args wil_args;
struct bitmask *wil_nodes;
unsigned char *weights;
int total_nodes = -1;
pthread_t tid;
void set_mempolicy_call()
{
weights = (unsigned char *)calloc(total_nodes, sizeof(unsigned char));
wil_nodes = numa_allocate_nodemask();
numa_bitmask_setbit(wil_nodes, 0); weights[0] = 40;
numa_bitmask_setbit(wil_nodes, 1); weights[1] = 30;
numa_bitmask_setbit(wil_nodes, 2); weights[2] = 20;
wil_args.maxnode = total_nodes + 1;
wil_args.wil.weights = weights;
wil_args.nodemask = wil_nodes->maskp;
wil_args.mode = MPOL_WEIGHTED_INTERLEAVE;
wil_args.flags = 0;
int ret = SET_MEMPOLICY2(&wil_args, sizeof(wil_args));
fprintf(stderr, "set_mempolicy2 result: %d(%s)\n", ret, strerror(errno));
}
int main()
{
total_nodes = numa_max_node() + 1;
set_mempolicy_call();
pthread_create(&tid, NULL, func, NULL);
return 0;
}
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
2023-12-06 8:09 ` [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave Seungjun Ha
@ 2023-12-06 17:02 ` Gregory Price
2023-12-07 8:52 ` Seungjun Ha
0 siblings, 1 reply; 4+ messages in thread
From: Gregory Price @ 2023-12-06 17:02 UTC (permalink / raw)
To: Seungjun Ha
Cc: Gregory Price, linux-cxl@vger.kernel.org, KyungSan Kim,
Wonjae Lee
On Wed, Dec 06, 2023 at 05:09:44PM +0900, Seungjun Ha wrote:
> > The weighted-interleave mempolicy implements weights per-node
> > which are used to distribute memory while interleaving.
> >
> > For example:
> > nodes: 0,1,2
> > weights: 5,3,2
> >
> > Over 10 consecutive allocations, the following nodes will be selected:
> > [0,0,0,0,0,1,1,1,2,2]
> >
> > In this example there is a 50%/30%/20% distribution of memory across
> > the enabled nodes.
> >
> > If a node is enabled, the minimum weight is expected to be 0. If an
> > enabled node ends up with a weight of 0 (as can happen if weights
> > are being recalculated due to a cgroup mask update), a minimum
> > of 1 is applied during the interleave mechanism.
>
> I found an issue while using the RFCv2, and want to report it.
first, thank you very much for testing! I'll run this on my latest fork.
> In my testbed, calling set_mempolicy2() causes pthread_create() failure or system hang, depending on weight combinations.
>
I think this is likely because i did not handle __mpol_dup correctly.
The newer fork changes the way weights are stored, so this should not
be an issue, but I will use your test to validate this.
New RFC should hopefully be out this or next week.
> FYI please find my testbed where there are 3 memory-nodes.
>
> Node 0 Node 1 Node 2 Result
> Weights
> 6 >= 1 1 pthread_create error: 11(Cannot allocate memory)
> 1~5 1 1 Pass
> 1 8 >= 1 pthread_create error: 11(Cannot allocate memory)
> 1 1~7 1 Pass
> 1 1 8 >= pthread_create error: 11(Cannot allocate memory)
> 1 1 1~7 Pass
>
> 6 7 7 pthread_create error: 11(Cannot allocate memory)
> 5 8 7 Pass
> 5 7 8 Pass
>
> 40 30 20 Kernel Hang
>
>
> Below is the test code to reproduce the issue.
>
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <numa.h>
> #include <errno.h>
> #include <numaif.h>
> #include <unistd.h>
> #include <pthread.h>
>
> #define MPOL_WEIGHTED_INTERLEAVE MPOL_DEFAULT + 8
> #define SET_MEMPOLICY2(a, b) syscall(454, a, b)
>
> struct mempolicy_args { on this RFC... }
>
> struct mempolicy_args wil_args;
> struct bitmask *wil_nodes;
> unsigned char *weights;
> int total_nodes = -1;
> pthread_t tid;
>
> void set_mempolicy_call()
> {
> weights = (unsigned char *)calloc(total_nodes, sizeof(unsigned char));
> wil_nodes = numa_allocate_nodemask();
>
> numa_bitmask_setbit(wil_nodes, 0); weights[0] = 40;
> numa_bitmask_setbit(wil_nodes, 1); weights[1] = 30;
> numa_bitmask_setbit(wil_nodes, 2); weights[2] = 20;
>
> wil_args.maxnode = total_nodes + 1;
> wil_args.wil.weights = weights;
> wil_args.nodemask = wil_nodes->maskp;
> wil_args.mode = MPOL_WEIGHTED_INTERLEAVE;
> wil_args.flags = 0;
>
> int ret = SET_MEMPOLICY2(&wil_args, sizeof(wil_args));
> fprintf(stderr, "set_mempolicy2 result: %d(%s)\n", ret, strerror(errno));
> }
>
>
> int main()
> {
> total_nodes = numa_max_node() + 1;
>
> set_mempolicy_call();
> pthread_create(&tid, NULL, func, NULL);
>
> return 0;
> }
^ permalink raw reply [flat|nested] 4+ messages in thread
* RE: Re: [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
2023-12-06 17:02 ` Gregory Price
@ 2023-12-07 8:52 ` Seungjun Ha
0 siblings, 0 replies; 4+ messages in thread
From: Seungjun Ha @ 2023-12-07 8:52 UTC (permalink / raw)
To: Gregory Price
Cc: Gregory Price, linux-cxl@vger.kernel.org, KyungSan Kim,
Wonjae Lee
> On Wed, Dec 06, 2023 at 05:09:44PM +0900, Seungjun Ha wrote:
> > > The weighted-interleave mempolicy implements weights per-node
> > > which are used to distribute memory while interleaving.
> > >
> > > For example:
> > > nodes: 0,1,2
> > > weights: 5,3,2
> > >
> > > Over 10 consecutive allocations, the following nodes will be selected:
> > > [0,0,0,0,0,1,1,1,2,2]
> > >
> > > In this example there is a 50%/30%/20% distribution of memory across
> > > the enabled nodes.
> > >
> > > If a node is enabled, the minimum weight is expected to be 0. If an
> > > enabled node ends up with a weight of 0 (as can happen if weights
> > > are being recalculated due to a cgroup mask update), a minimum
> > > of 1 is applied during the interleave mechanism.
> >
> > I found an issue while using the RFCv2, and want to report it.
>
> first, thank you very much for testing! I'll run this on my latest fork.
>
> > In my testbed, calling set_mempolicy2() causes pthread_create() failure or system hang, depending on weight combinations.
> >
>
> I think this is likely because i did not handle __mpol_dup correctly.
> The newer fork changes the way weights are stored, so this should not
> be an issue, but I will use your test to validate this.
>
> New RFC should hopefully be out this or next week.
>
FYI, I am sending the whole dmesg log that occurred in the problem situation.
[ 464.155511] app: vmalloc error: size 155648, failed to allocate pages, mode:0xdc2(GFP_KERNEL|__GFP_HIGHMEM|__GFP_ZERO), nodemask=(null),cpuset=/,mems_allowed=0-4
[ 464.155550] CPU: 93 PID: 5281 Comm: app Tainted: G D W E 6.6.0-rc4+ #2
[ 464.155561] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDREL1.SYS.0085.D15.2207241333 07/24/2022
[ 464.155565] Call Trace:
[ 464.155572] <TASK>
[ 464.155580] dump_stack_lvl+0x48/0x70
[ 464.155597] dump_stack+0x10/0x20
[ 464.155602] warn_alloc+0x119/0x190
[ 464.155613] ? __vmalloc_node_range+0x1d2/0x850
[ 464.155623] __vmalloc_node_range+0x7d4/0x850
[ 464.155631] ? kernel_clone+0x9d/0x3c0
[ 464.155650] copy_process+0xa0f/0x1d20
[ 464.155661] ? kernel_clone+0x9d/0x3c0
[ 464.155671] ? __handle_mm_fault+0x769/0xdb0
[ 464.155686] kernel_clone+0x9d/0x3c0
[ 464.155698] __do_sys_clone+0x66/0x90
[ 464.155712] __x64_sys_clone+0x25/0x30
[ 464.155722] do_syscall_64+0x59/0x90
[ 464.155730] ? exc_page_fault+0x8a/0x180
[ 464.155739] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[ 464.155750] RIP: 0033:0x7f50093f4125
[ 464.155762] Code: 48 85 ff 74 3d 48 85 f6 74 38 48 83 ee 10 48 89 4e 08 48 89 3e 48 89 d7 4c 89 c2 4d 89 c8 4c 8b 54 24 08 b8 38 00 00 00 0f 05 <48> 85 c0 7c 13 74 01 c3 31 ed 58 5f ff d0 48 89 c7 b8 3c 00 00 00
[ 464.155772] RSP: 002b:00007ffea8877418 EFLAGS: 00000202 ORIG_RAX: 0000000000000038
[ 464.155781] RAX: ffffffffffffffda RBX: 00007f50082cf700 RCX: 00007f50093f4125
[ 464.155786] RDX: 00007f50082cf9d0 RSI: 00007f50082cefb0 RDI: 00000000003d0f00
[ 464.155791] RBP: 00007ffea88774d0 R08: 00007f50082cf700 R09: 00007f50082cf700
[ 464.155795] R10: 00007f50082cf9d0 R11: 0000000000000202 R12: 00007ffea88774ce
[ 464.155799] R13: 00007ffea88774cf R14: 00007ffea88774d0 R15: 00007f50082cefc0
[ 464.155809] </TASK>
[ 464.155852] Mem-Info:
[ 464.155902] active_anon:1025 inactive_anon:174970 isolated_anon:0
active_file:154846 inactive_file:211215 isolated_file:0
unevictable:2029 dirty:237 writeback:0
slab_reclaimable:32736 slab_unreclaimable:106078
mapped:97039 shmem:6403 pagetables:3841
sec_pagetables:0 bounce:0
kernel_misc_reclaimable:0
free:164219875 free_pcp:36236 free_cma:0
[ 464.155921] Node 0 active_anon:2128kB inactive_anon:491864kB active_file:150184kB inactive_file:522756kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:245188kB dirty:68kB writeback:0kB shmem:11812kB shmem_thp:0kB shmem_pmdmapped:0kB anon_thp:0kB writeback_tmp:0kB kernel_stack:18552kB pagetables:9492kB sec_pagetables:0kB all_unreclaimable? no
[ 464.155935] Node 1 active_anon:1972kB inactive_anon:208016kB active_file:469200kB inactive_file:322104kB unevictable:8116kB isolated(anon):0kB isolated(file):0kB mapped:142968kB dirty:880kB writeback:0kB shmem:13800kB shmem_thp:0kB shmem_pmdmapped:0kB anon_thp:0kB writeback_tmp:0kB kernel_stack:18712kB pagetables:5872kB sec_pagetables:0kB all_unreclaimable? no
[ 464.155954] Node 0 DMA free:11264kB boost:0kB min:8kB low:20kB high:32kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15360kB mlocked:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
[ 464.155970] lowmem_reserve[]: 0 1765 64175 64175 64175
[ 464.155988] Node 0 DMA32 free:1803124kB boost:0kB min:1236kB low:3040kB high:4844kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1873332kB managed:1807640kB mlocked:0kB bounce:0kB free_pcp:420kB local_pcp:0kB free_cma:0kB
[ 464.156005] lowmem_reserve[]: 0 0 62409 62409 62409
[ 464.156023] Node 0 Normal free:55826092kB boost:0kB min:43704kB low:107608kB high:171512kB reserved_highatomic:0KB active_anon:2128kB inactive_anon:491864kB active_file:150184kB inactive_file:522756kB unevictable:0kB writepending:68kB present:65011712kB managed:63907776kB mlocked:0kB bounce:0kB free_pcp:59312kB local_pcp:592kB free_cma:0kB
[ 464.156040] lowmem_reserve[]: 0 0 0 0 0
[ 464.156053] Node 1 Normal free:62368616kB boost:0kB min:45156kB low:111184kB high:177212kB reserved_highatomic:0KB active_anon:1972kB inactive_anon:208016kB active_file:469200kB inactive_file:322104kB unevictable:8116kB writepending:880kB present:67108864kB managed:66029252kB mlocked:0kB bounce:0kB free_pcp:85212kB local_pcp:652kB free_cma:0kB
[ 464.156067] lowmem_reserve[]: 0 0 0 0 0
[ 464.156078] Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 1*1024kB (U) 1*2048kB (M) 2*4096kB (M) = 11264kB
[ 464.156106] Node 0 DMA32: 7*4kB (UM) 9*8kB (M) 7*16kB (M) 7*32kB (M) 7*64kB (M) 8*128kB (M) 4*256kB (M) 4*512kB (M) 6*1024kB (M) 5*2048kB (M) 435*4096kB (M) = 1803124kB
[ 464.156142] Node 0 Normal: 0*4kB 443*8kB (UME) 1063*16kB (UME) 2103*32kB (UME) 1484*64kB (UME) 859*128kB (UME) 416*256kB (UME) 204*512kB (UM) 64*1024kB (UME) 11*2048kB (M) 13485*4096kB (M) = 55826344kB
[ 464.156178] Node 1 Normal: 585*4kB (UME) 489*8kB (UME) 306*16kB (UME) 493*32kB (UME) 199*64kB (UME) 137*128kB (UME) 11*256kB (UME) 36*512kB (UM) 40*1024kB (UM) 7*2048kB (UME) 15194*4096kB (M) = 62368364kB
[ 464.156225] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[ 464.156232] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[ 464.156238] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[ 464.156243] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[ 464.156248] Node 2 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[ 464.156255] Node 2 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[ 464.156259] Node 3 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[ 464.156263] Node 3 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[ 464.156266] Node 4 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[ 464.156270] Node 4 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[ 464.156275] 369910 total pagecache pages
[ 464.156280] 0 pages in swap cache
[ 464.156283] Free swap = 10485756kB
[ 464.156286] Total swap = 10485756kB
[ 464.156290] 167720203 pages RAM
[ 464.156293] 0 pages HighMem/MovableOnly
[ 464.156296] 562468 pages reserved
[ 464.156299] 0 pages hwpoisoned
[ 464.156331] general protection fault, probably for non-canonical address 0x300b162afbcb6e0f: 0000 [#2] PREEMPT SMP NOPTI
[ 464.156341] CPU: 93 PID: 5281 Comm: app Tainted: G D W E 6.6.0-rc4+ #2
[ 464.156350] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDREL1.SYS.0085.D15.2207241333 07/24/2022
[ 464.156355] RIP: 0010:vfree+0x7e/0x2d0
[ 464.156364] Code: 48 85 c0 0f 84 42 02 00 00 8b 50 2c f6 40 19 01 0f 85 0c 01 00 00 85 d2 0f 85 85 00 00 00 e9 96 00 00 00 66 90 e8 42 6c e0 ff <49> 8b 44 24 08 a8 01 0f 85 e2 00 00 00 0f 1f 44 00 00 4c 89 e0 48
[ 464.156373] RSP: 0018:ffa000000cfebc00 EFLAGS: 00010202
[ 464.156379] RAX: 0000000000000001 RBX: 0000000000000006 RCX: 0000000000000000
[ 464.156384] RDX: ff11001086c2e000 RSI: 000000000000009a RDI: 00000000ffffffff
[ 464.156389] RBP: ffa000000cfebc30 R08: 0000000000000000 R09: ff1100203f1784c0
[ 464.156394] R10: ffd40000046248c8 R11: 0000000000000000 R12: 300b162afbcb6e07
[ 464.156397] R13: ff1100108ecd9340 R14: 00000000ffffffff R15: ff1100108ecd9340
[ 464.156401] FS: 00007f50092d2740(0000) GS:ff1100203f140000(0000) knlGS:0000000000000000
[ 464.156408] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 464.156415] CR2: 00007f50082cefb8 CR3: 00000011535fa006 CR4: 0000000000771ee0
[ 464.156422] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 464.156426] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[ 464.156430] PKRU: 55555554
[ 464.156434] Call Trace:
[ 464.156438] <TASK>
[ 464.156442] ? show_regs+0x68/0x70
[ 464.156457] ? __die_body+0x20/0x70
[ 464.156467] ? die_addr+0x3e/0x60
[ 464.156477] ? exc_general_protection+0x231/0x460
[ 464.156491] ? asm_exc_general_protection+0x27/0x30
[ 464.156503] ? vfree+0x7e/0x2d0
[ 464.156512] __vmalloc_node_range+0x7dd/0x850
[ 464.156523] ? kernel_clone+0x9d/0x3c0
[ 464.156538] copy_process+0xa0f/0x1d20
[ 464.156549] ? kernel_clone+0x9d/0x3c0
[ 464.156559] ? __handle_mm_fault+0x769/0xdb0
[ 464.156569] kernel_clone+0x9d/0x3c0
[ 464.156582] __do_sys_clone+0x66/0x90
[ 464.156594] __x64_sys_clone+0x25/0x30
[ 464.156606] do_syscall_64+0x59/0x90
[ 464.156613] ? exc_page_fault+0x8a/0x180
[ 464.156620] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[ 464.156627] RIP: 0033:0x7f50093f4125
[ 464.156631] Code: 48 85 ff 74 3d 48 85 f6 74 38 48 83 ee 10 48 89 4e 08 48 89 3e 48 89 d7 4c 89 c2 4d 89 c8 4c 8b 54 24 08 b8 38 00 00 00 0f 05 <48> 85 c0 7c 13 74 01 c3 31 ed 58 5f ff d0 48 89 c7 b8 3c 00 00 00
[ 464.156636] RSP: 002b:00007ffea8877418 EFLAGS: 00000202 ORIG_RAX: 0000000000000038
[ 464.156643] RAX: ffffffffffffffda RBX: 00007f50082cf700 RCX: 00007f50093f4125
[ 464.156648] RDX: 00007f50082cf9d0 RSI: 00007f50082cefb0 RDI: 00000000003d0f00
[ 464.156652] RBP: 00007ffea88774d0 R08: 00007f50082cf700 R09: 00007f50082cf700
[ 464.156657] R10: 00007f50082cf9d0 R11: 0000000000000202 R12: 00007ffea88774ce
[ 464.156661] R13: 00007ffea88774cf R14: 00007ffea88774d0 R15: 00007f50082cefc0
[ 464.156668] </TASK>
[ 464.156670] Modules linked in: xt_conntrack(E) xt_MASQUERADE(E) nf_conntrack_netlink(E) nfnetlink(E) xfrm_user(E) xfrm_algo(E) iptable_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) libcrc32c(E) xt_addrtype(E) iptable_filter(E) bpfilter(E) br_netfilter(E) bridge(E) stp(E) llc(E) overlay(E) nls_iso8859_1(E) intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency(E) intel_uncore_frequency_common(E) i10nm_edac(E) nfit(E) snd_hda_codec_realtek(E) snd_hda_codec_generic(E) ledtrig_audio(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) snd_hda_intel(E) kvm_intel(E) kmem(E) snd_intel_dspcfg(E) snd_intel_sdw_acpi(E) input_leds(E) device_dax(E) snd_hda_codec(E) kvm(E) cxl_mem(E) snd_hda_core(E) cxl_port(E) crct10dif_pclmul(E) cxl_pmu(E) snd_hwdep(E) ghash_clmulni_intel(E) snd_pcm(E) sha512_ssse3(E) binfmt_misc(E) snd_seq_midi(E) snd_seq_midi_event(E) aesni_intel(E) crypto_simd(E) snd_rawmidi(E) cryptd(E) snd_seq(E) rapl(E) ast(E) drm_shmem_helper(E) dax_hmem(E) snd_seq_device(E)
[ 464.156784] intel_cstate(E) snd_timer(E) drm_kms_helper(E) cxl_acpi(E) cxl_pci(E) isst_if_mmio(E) i2c_algo_bit(E) isst_if_mbox_pci(E) snd(E) mei_me(E) idxd(E) isst_if_common(E) ipmi_ssif(E) cxl_core(E) idxd_bus(E) soundcore(E) mei(E) acpi_ipmi(E) ipmi_si(E) ipmi_devintf(E) ipmi_msghandler(E) acpi_power_meter(E) acpi_pad(E) mac_hid(E) sch_fq_codel(E) msr(E) parport_pc(E) ppdev(E) lp(E) ramoops(E) parport(E) reed_solomon(E) drm(E) efi_pstore(E) ip_tables(E) x_tables(E) autofs4(E) hid_generic(E) usbhid(E) hid(E) i2c_i801(E) nvme(E) ahci(E) xhci_pci(E) nvme_core(E) libahci(E) i2c_smbus(E) crc32_pclmul(E) igc(E) i2c_ismt(E) xhci_pci_renesas(E) wmi(E) pinctrl_emmitsburg(E)
[ 464.156883] ---[ end trace 0000000000000000 ]---
[ 464.344754] RIP: 0010:vfree+0x7e/0x2d0
[ 464.344764] Code: 48 85 c0 0f 84 42 02 00 00 8b 50 2c f6 40 19 01 0f 85 0c 01 00 00 85 d2 0f 85 85 00 00 00 e9 96 00 00 00 66 90 e8 42 6c e0 ff <49> 8b 44 24 08 a8 01 0f 85 e2 00 00 00 0f 1f 44 00 00 4c 89 e0 48
[ 464.344772] RSP: 0018:ffa000000ce73bf0 EFLAGS: 00010202
[ 464.344788] RAX: 0000000000000001 RBX: 0000000000000006 RCX: 0000000000000000
[ 464.344799] RDX: ff110001260a2000 RSI: 00000000000000c5 RDI: 00000000ffffffff
[ 464.344806] RBP: ffa000000ce73c20 R08: 0000000000000000 R09: ff1100103f4784c0
[ 464.344813] R10: ffd40000042e6c88 R11: 0000000000000000 R12: d0ec0eba6bd389a7
[ 464.344820] R13: ff1100010fd51880 R14: 00000000ffffffff R15: ff1100010fd51880
[ 464.344826] FS: 00007f50092d2740(0000) GS:ff1100203f140000(0000) knlGS:0000000000000000
[ 464.344836] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 464.344843] CR2: 00007f50082cefb8 CR3: 00000011535fa006 CR4: 0000000000771ee0
[ 464.344850] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 464.344856] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[ 464.344863] PKRU: 55555554
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2023-12-07 8:52 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <CGME20231206080944epcms2p76ebb230b9f4595f5cfcd2531d67ab3ce@epcms2p7>
2023-12-06 8:09 ` [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave Seungjun Ha
2023-12-06 17:02 ` Gregory Price
2023-12-07 8:52 ` Seungjun Ha
2023-10-03 0:21 [RFC PATCH v2 0/4] mm/mempolicy: get/set_mempolicy2 syscalls Gregory Price
2023-10-03 0:21 ` [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave Gregory Price
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox