Linux CXL
 help / color / mirror / Atom feed
* [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
  2023-10-03  0:21 [RFC PATCH v2 0/4] mm/mempolicy: get/set_mempolicy2 syscalls Gregory Price
@ 2023-10-03  0:21 ` Gregory Price
  0 siblings, 0 replies; 4+ messages in thread
From: Gregory Price @ 2023-10-03  0:21 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, linux-arch, linux-api, linux-cxl, luto, tglx, mingo,
	bp, dave.hansen, hpa, arnd, akpm, x86, Gregory Price

The weighted-interleave mempolicy implements weights per-node
which are used to distribute memory while interleaving.

For example:
   nodes: 0,1,2
   weights: 5,3,2

Over 10 consecutive allocations, the following nodes will be selected:
[0,0,0,0,0,1,1,1,2,2]

In this example there is a 50%/30%/20% distribution of memory across
the enabled nodes.

If a node is enabled, the minimum weight is expected to be 1. If an
enabled node ends up with a weight of 0 (as can happen if weights
are being recalculated due to a cgroup mask update), a minimum
of 1 is applied during the interleave mechanism.

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 include/linux/mempolicy.h      |   6 +
 include/uapi/linux/mempolicy.h |   6 +
 mm/mempolicy.c                 | 261 ++++++++++++++++++++++++++++++++-
 3 files changed, 269 insertions(+), 4 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8f918488c61c..8763e536d4a2 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -54,6 +54,12 @@ struct mempolicy {
 			int weight;
 			int count;
 		} pil;
+		/* weighted interleave */
+		struct {
+			unsigned int il_weight;
+			unsigned char cur_weight;
+			unsigned char weights[MAX_NUMNODES];
+		} wil;
 	};
 
 	union {
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 41c35f404c5e..913ca9bf9af7 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -25,6 +25,7 @@ enum {
 	MPOL_PREFERRED_MANY,
 	MPOL_LEGACY,	/* set_mempolicy limited to above modes */
 	MPOL_PREFERRED_INTERLEAVE,
+	MPOL_WEIGHTED_INTERLEAVE,
 	MPOL_MAX,	/* always last member of enum */
 };
 
@@ -58,6 +59,11 @@ struct mempolicy_args {
 			unsigned long weight;  /* get and set */
 			unsigned long next_node; /* get only */
 		} pil;
+		/* Weighted interleave */
+		struct {
+			unsigned long next_node; /* get only */
+			unsigned char *weights; /* get and set */
+		} wil;
 	};
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6374312cef5f..92be74d4c431 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -195,11 +195,43 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 	nodes_onto(*ret, tmp, *rel);
 }
 
+/*
+ * Clamp per-node weights after a nodemask change: nodes outside the
+ * mask get weight 0, nodes inside the mask get a minimum weight of 1,
+ * and the cached total (il_weight) is recomputed to match.
+ */
+static void mpol_recalculate_weights(struct mempolicy *pol)
+{
+	unsigned int il_weight = 0;
+	int node;
+
+	for (node = 0; node < MAX_NUMNODES; node++) {
+		if (!node_isset(node, pol->nodes)) {
+			/* Node is not in the mask: its weight must be 0 */
+			pol->wil.weights[node] = 0;
+		} else if (!pol->wil.weights[node]) {
+			/* Node is in the mask: enforce a minimum weight of 1 */
+			pol->wil.weights[node] = 1;
+			il_weight += 1;
+		} else {
+			/* Otherwise, keep the existing weight */
+			il_weight += pol->wil.weights[node];
+		}
+	}
+	pol->wil.il_weight = il_weight;
+	/*
+	 * An allocation may be in flight; force the interleave logic to
+	 * re-read the (possibly changed) weight of the next node.
+	 */
+	pol->wil.cur_weight = 0;
+}
+
 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	if (nodes_empty(*nodes))
 		return -EINVAL;
 	pol->nodes = *nodes;
+
+	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+		mpol_recalculate_weights(pol);
+
 	return 0;
 }
 
@@ -334,6 +366,10 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 		tmp = *nodes;
 
 	pol->nodes = tmp;
+
+	/* After a change to the nodemask, weights must be recalculated */
+	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+		mpol_recalculate_weights(pol);
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -403,6 +439,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_nodemask,
 	},
+	[MPOL_WEIGHTED_INTERLEAVE] = {
+		.create = mpol_new_nodemask,
+		.rebind = mpol_rebind_nodemask,
+	},
 	[MPOL_PREFERRED] = {
 		.create = mpol_new_preferred,
 		.rebind = mpol_rebind_preferred,
@@ -878,8 +918,10 @@ static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes)
 	old = current->mempolicy;
 	current->mempolicy = new;
 	if (new && (new->mode == MPOL_INTERLEAVE ||
-		    new->mode == MPOL_PREFERRED_INTERLEAVE))
+		    new->mode == MPOL_PREFERRED_INTERLEAVE ||
+		    new->mode == MPOL_WEIGHTED_INTERLEAVE))
 		current->il_prev = MAX_NUMNODES-1;
+
 out:
 	task_unlock(current);
 	mpol_put(old);
@@ -921,6 +963,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
 		*nodes = p->nodes;
@@ -1632,6 +1675,56 @@ static long do_set_preferred_interleave(struct mempolicy_args *args,
 	return 0;
 }
 
+/*
+ * Parse and apply the user-supplied per-node weight array for the
+ * MPOL_WEIGHTED_INTERLEAVE policy being created in @new.
+ *
+ * Returns 0 on success; -EINVAL for an empty nodemask, a missing
+ * weight array, or a zero weight on an enabled node; -ENOMEM/-EFAULT
+ * on allocation or copy failure.  @new is only modified once the
+ * weights have been fully validated.
+ */
+static long do_set_weighted_interleave(struct mempolicy_args *args,
+				       struct mempolicy *new,
+				       nodemask_t *nodes)
+{
+	unsigned char *weights;
+	int node;
+	long ret = 0;
+
+	/* Weighted interleave cannot be done with no nodemask */
+	if (nodes_empty(*nodes))
+		return -EINVAL;
+
+	/* Weighted interleave requires a set of weights */
+	if (!args->wil.weights)
+		return -EINVAL;
+
+	weights = kmalloc(MAX_NUMNODES, GFP_KERNEL);
+	if (!weights)
+		return -ENOMEM;
+
+	if (copy_from_user(weights, args->wil.weights, MAX_NUMNODES)) {
+		ret = -EFAULT;
+		goto weights_out;
+	}
+
+	/* Validate before touching @new: enabled nodes need weight > 0 */
+	for_each_node_mask(node, *nodes) {
+		if (!weights[node]) {
+			ret = -EINVAL;
+			goto weights_out;
+		}
+	}
+
+	new->wil.cur_weight = 0;
+	new->wil.il_weight = 0;
+	memset(new->wil.weights, 0, sizeof(new->wil.weights));
+
+	/* Commit the weights for enabled nodes and accumulate the total */
+	for_each_node_mask(node, *nodes) {
+		new->wil.il_weight += weights[node];
+		new->wil.weights[node] = weights[node];
+	}
+
+weights_out:
+	kfree(weights);
+	return ret;
+}
+
 static long do_set_mempolicy2(struct mempolicy_args *args)
 {
 	struct mempolicy *new = NULL;
@@ -1656,6 +1749,9 @@ static long do_set_mempolicy2(struct mempolicy_args *args)
 	case MPOL_PREFERRED_INTERLEAVE:
 		err = do_set_preferred_interleave(args, new, &nodes);
 		break;
+	case MPOL_WEIGHTED_INTERLEAVE:
+		err = do_set_weighted_interleave(args, new, &nodes);
+		break;
 	default:
 		BUG();
 	}
@@ -1799,6 +1895,12 @@ static long do_get_mempolicy2(struct mempolicy_args *kargs)
 		kargs->pil.weight = pol->pil.weight;
 		rc = 0;
 		break;
+	case MPOL_WEIGHTED_INTERLEAVE:
+		kargs->wil.next_node = next_node_in(current->il_prev,
+						    pol->nodes);
+		rc = copy_to_user(kargs->wil.weights, pol->wil.weights,
+				  MAX_NUMNODES);
+		break;
 	default:
 		BUG();
 	}
@@ -2160,6 +2262,27 @@ static unsigned int preferred_interleave_nodes(struct mempolicy *policy)
 	return next;
 }
 
+/*
+ * Pick the next node for a weighted-interleave allocation.  Each node
+ * services weights[node] consecutive allocations before the position
+ * (current->il_prev) advances to the next node in the mask.
+ */
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+	unsigned int next;
+	unsigned char next_weight;
+	struct task_struct *me = current;
+
+	next = next_node_in(me->il_prev, policy->nodes);
+	/* Empty nodemask: do not index weights[] out of bounds */
+	if (next == MAX_NUMNODES)
+		return next;
+
+	/* When weight reaches 0, we're on a new node, reset the weight */
+	if (!policy->wil.cur_weight) {
+		next_weight = policy->wil.weights[next];
+		/* A recalculation may have zeroed it; allocate at least once */
+		policy->wil.cur_weight = next_weight ? next_weight : 1;
+	}
+
+	policy->wil.cur_weight--;
+	if (!policy->wil.cur_weight)
+		me->il_prev = next;
+
+	return next;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
@@ -2168,6 +2291,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 
 	if (policy->mode == MPOL_PREFERRED_INTERLEAVE)
 		return preferred_interleave_nodes(policy);
+	else if (policy->mode == MPOL_WEIGHTED_INTERLEAVE)
+		return weighted_interleave_nodes(policy);
 
 	next = next_node_in(me->il_prev, policy->nodes);
 	if (next < MAX_NUMNODES)
@@ -2197,6 +2322,7 @@ unsigned int mempolicy_slab_node(void)
 
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		return interleave_nodes(policy);
 
 	case MPOL_BIND:
@@ -2273,6 +2399,40 @@ static unsigned int offset_pil_node(struct mempolicy *pol, unsigned long n)
 	return nid;
 }
 
+/*
+ * Static weighted interleave for a known offset @n: map @n onto the
+ * weighted node sequence and return the node that owns that position.
+ */
+static unsigned int offset_wil_node(struct mempolicy *pol, unsigned long n)
+{
+	nodemask_t nodemask = pol->nodes;
+	unsigned int target, nnodes;
+	unsigned char weight;
+	int nid;
+
+	/*
+	 * The barrier will stabilize the nodemask in a register or on
+	 * the stack so that it will stop changing under the code.
+	 *
+	 * Between first_node() and next_node(), pol->nodes could be changed
+	 * by other threads. So we put pol->nodes in a local stack.
+	 */
+	barrier();
+
+	nnodes = nodes_weight(nodemask);
+	/* Guard the empty-mask and zero-total-weight (div-by-zero) cases */
+	if (!nnodes || !pol->wil.il_weight)
+		return numa_node_id();
+	target = (unsigned int)n % pol->wil.il_weight;
+	nid = first_node(nodemask);
+	while (target) {
+		weight = pol->wil.weights[nid];
+		/* If weights are being recalculated, revert to interleave */
+		if (!weight)
+			weight = 1;
+		if (target < weight)
+			break;
+		target -= weight;
+		nid = next_node_in(nid, nodemask);
+	}
+	return nid;
+}
+
 /*
  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
  * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
@@ -2287,6 +2447,8 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 
 	if (pol->mode == MPOL_PREFERRED_INTERLEAVE)
 		return offset_pil_node(pol, n);
+	else if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+		return offset_wil_node(pol, n);
 
 	nodemask = pol->nodes;
 
@@ -2358,7 +2520,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 	mode = (*mpol)->mode;
 
 	if (unlikely(mode == MPOL_INTERLEAVE) ||
-	    unlikely(mode == MPOL_PREFERRED_INTERLEAVE)) {
+	    unlikely(mode == MPOL_PREFERRED_INTERLEAVE) ||
+	    unlikely(mode == MPOL_WEIGHTED_INTERLEAVE)) {
 		nid = interleave_nid(*mpol, vma, addr,
 					huge_page_shift(hstate_vma(vma)));
 	} else {
@@ -2400,6 +2563,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		*mask = mempolicy->nodes;
 		break;
 
@@ -2511,7 +2675,8 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 	pol = get_vma_policy(vma, addr);
 
 	if (pol->mode == MPOL_INTERLEAVE ||
-	    pol->mode == MPOL_PREFERRED_INTERLEAVE) {
+	    pol->mode == MPOL_PREFERRED_INTERLEAVE ||
+	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
 		struct page *page;
 		unsigned nid;
 
@@ -2614,7 +2779,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
 	 * nor system default_policy
 	 */
 	if (pol->mode == MPOL_INTERLEAVE ||
-	    pol->mode == MPOL_PREFERRED_INTERLEAVE)
+	    pol->mode == MPOL_PREFERRED_INTERLEAVE ||
+	    pol->mode == MPOL_WEIGHTED_INTERLEAVE)
 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	else if (pol->mode == MPOL_PREFERRED_MANY)
 		page = alloc_pages_preferred_many(gfp, order,
@@ -2737,6 +2903,84 @@ static unsigned long alloc_pages_bulk_array_pil(gfp_t gfp,
 	return allocated;
 }
 
+/*
+ * Bulk allocation for MPOL_WEIGHTED_INTERLEAVE: distribute @nr_pages
+ * across the nodemask in proportion to each node's weight, resuming
+ * from (and then updating) the task's interleave position so that a
+ * subsequent single-page or bulk allocation continues correctly.
+ */
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+	struct mempolicy *pol, unsigned long nr_pages,
+	struct page **page_array)
+{
+	struct task_struct *me = current;
+	unsigned long total_allocated = 0;
+	unsigned long nr_allocated;
+	unsigned long rounds;
+	unsigned long node_pages, delta;
+	unsigned char weight;
+	unsigned char resume_weight = 0;
+	int resume_node = MAX_NUMNODES;	/* sentinel: no mid-node resume */
+	int nnodes, node, prev_node;
+	int i;
+
+	if (!nr_pages)
+		return 0;
+
+	nnodes = nodes_weight(pol->nodes);
+	/* Defend against an empty mask or a zeroed total (div-by-zero) */
+	if (!nnodes || !pol->wil.il_weight)
+		return __alloc_pages_bulk(gfp, numa_node_id(), NULL,
+					  nr_pages, NULL, page_array);
+
+	/* The loop below always starts from the node after prev_node */
+	prev_node = me->il_prev;
+
+	/* Continue allocating on the current node until its weight is spent */
+	if (pol->wil.cur_weight) {
+		node = next_node_in(me->il_prev, pol->nodes);
+		/* Never allocate more pages than the caller asked for */
+		node_pages = min(nr_pages,
+				 (unsigned long)pol->wil.cur_weight);
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array += nr_allocated;
+		total_allocated += nr_allocated;
+		/* if that's all the pages, no need to interleave */
+		if (nr_pages <= pol->wil.cur_weight) {
+			pol->wil.cur_weight -= nr_pages;
+			/* Advance if this node's weight is now exhausted */
+			if (!pol->wil.cur_weight)
+				me->il_prev = node;
+			return total_allocated;
+		}
+		/* Otherwise we adjust nr_pages down, and continue from there */
+		nr_pages -= pol->wil.cur_weight;
+		pol->wil.cur_weight = 0;
+		prev_node = node;
+	}
+
+	/* Distribute whole rounds plus a partial round (delta) of pages */
+	rounds = nr_pages / pol->wil.il_weight;
+	delta = nr_pages % pol->wil.il_weight;
+	for (i = 0; i < nnodes; i++) {
+		node = next_node_in(prev_node, pol->nodes);
+		weight = pol->wil.weights[node];
+		/* A recalculation may leave a transient 0; treat as 1 */
+		if (!weight)
+			weight = 1;
+		node_pages = weight * rounds;
+		if (delta >= weight) {
+			node_pages += weight;
+			delta -= weight;
+			if (!delta) {
+				/* Delta ended on a boundary: node fully used */
+				resume_node = node;
+				resume_weight = 0;
+			}
+		} else if (delta) {
+			/* Delta depleted mid-node: resume on this node */
+			node_pages += delta;
+			resume_node = prev_node;
+			resume_weight = weight - delta;
+			delta = 0;
+		}
+		/* We may not make it all the way around */
+		if (!node_pages)
+			break;
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array += nr_allocated;
+		total_allocated += nr_allocated;
+		prev_node = node;
+	}
+
+	/*
+	 * If no mid-node resume point was recorded, the distribution ended
+	 * exactly on a node boundary: start fresh after the last node used.
+	 */
+	if (resume_node == MAX_NUMNODES) {
+		me->il_prev = prev_node;
+		pol->wil.cur_weight = 0;
+	} else {
+		me->il_prev = resume_node;
+		pol->wil.cur_weight = resume_weight;
+	}
+
+	return total_allocated;
+}
+
 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
 		struct mempolicy *pol, unsigned long nr_pages,
 		struct page **page_array)
@@ -2779,6 +3023,11 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
 		return alloc_pages_bulk_array_pil(gfp, pol, nr_pages,
 						  page_array);
 
+	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+		return alloc_pages_bulk_array_weighted_interleave(gfp, pol,
+								  nr_pages,
+								  page_array);
+
 	if (pol->mode == MPOL_PREFERRED_MANY)
 		return alloc_pages_bulk_array_preferred_many(gfp,
 				numa_node_id(), pol, nr_pages, page_array);
@@ -2852,6 +3101,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
 		return !!nodes_equal(a->nodes, b->nodes);
@@ -2989,6 +3239,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 	switch (pol->mode) {
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		pgoff = vma->vm_pgoff;
 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
 		polnid = offset_il_node(pol, pgoff);
@@ -3377,6 +3628,7 @@ static const char * const policy_modes[] =
 	[MPOL_BIND]       = "bind",
 	[MPOL_INTERLEAVE] = "interleave",
 	[MPOL_PREFERRED_INTERLEAVE] = "preferred interleave",
+	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
 	[MPOL_LOCAL]      = "local",
 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
 };
@@ -3548,6 +3800,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		nodes = pol->nodes;
 		break;
 	default:
-- 
2.39.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* RE: [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
       [not found] <CGME20231206080944epcms2p76ebb230b9f4595f5cfcd2531d67ab3ce@epcms2p7>
@ 2023-12-06  8:09 ` Seungjun Ha
  2023-12-06 17:02   ` Gregory Price
  0 siblings, 1 reply; 4+ messages in thread
From: Seungjun Ha @ 2023-12-06  8:09 UTC (permalink / raw)
  To: Gregory Price
  Cc: linux-cxl@vger.kernel.org, Gregory Price, KyungSan Kim,
	Wonjae Lee

> The weighted-interleave mempolicy implements weights per-node
> which are used to distribute memory while interleaving.
>
> For example:
>    nodes: 0,1,2
>    weights: 5,3,2
>
> Over 10 consecutive allocations, the following nodes will be selected:
> [0,0,0,0,0,1,1,1,2,2]
>
> In this example there is a 50%/30%/20% distribution of memory across
> the enabled nodes.
>
> If a node is enabled, the minimum weight is expected to be 0. If an
> enabled node ends up with a weight of 0 (as can happen if weights
> are being recalculated due to a cgroup mask update), a minimum
> of 1 is applied during the interleave mechanism.

I found an issue while using the RFCv2, and want to report it. 
In my testbed, calling set_mempolicy2() causes pthread_create() failure or system hang, depending on weight combinations.

FYI please find my testbed where there are 3 memory-nodes. 

            Node 0  Node 1  Node 2  Result
Weights  
            6 >=    1       1       pthread_create error: 11(Cannot allocate memory)
            1~5     1       1       Pass
            1       8 >=    1       pthread_create error: 11(Cannot allocate memory)
            1       1~7     1       Pass
            1       1       8 >=    pthread_create error: 11(Cannot allocate memory)
            1       1       1~7     Pass

            6       7       7       pthread_create error: 11(Cannot allocate memory)
            5       8       7       Pass
            5       7       8       Pass

            40      30      20      Kernel Hang


Below is the test code to reproduce the issue.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <numa.h>
#include <errno.h>
#include <numaif.h>
#include <unistd.h>
#include <pthread.h>

#define MPOL_WEIGHTED_INTERLEAVE MPOL_DEFAULT + 8
#define SET_MEMPOLICY2(a, b) syscall(454, a, b)

struct mempolicy_args { on this RFC... }

struct mempolicy_args wil_args;
struct bitmask *wil_nodes;
unsigned char *weights;
int total_nodes = -1;
pthread_t tid;

void set_mempolicy_call()
{
        weights = (unsigned char *)calloc(total_nodes, sizeof(unsigned char));
        wil_nodes = numa_allocate_nodemask();

        numa_bitmask_setbit(wil_nodes, 0); weights[0] = 40;
        numa_bitmask_setbit(wil_nodes, 1); weights[1] = 30;
        numa_bitmask_setbit(wil_nodes, 2); weights[2] = 20;

        wil_args.maxnode = total_nodes + 1;
        wil_args.wil.weights = weights;
        wil_args.nodemask = wil_nodes->maskp;
        wil_args.mode = MPOL_WEIGHTED_INTERLEAVE;
        wil_args.flags = 0;

        int ret = SET_MEMPOLICY2(&wil_args, sizeof(wil_args));
        fprintf(stderr, "set_mempolicy2 result: %d(%s)\n", ret, strerror(errno));
}


int main()
{
        total_nodes = numa_max_node() + 1;

        set_mempolicy_call();
        pthread_create(&tid, NULL, func, NULL);

        return 0;
}

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
  2023-12-06  8:09 ` [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave Seungjun Ha
@ 2023-12-06 17:02   ` Gregory Price
  2023-12-07  8:52     ` Seungjun Ha
  0 siblings, 1 reply; 4+ messages in thread
From: Gregory Price @ 2023-12-06 17:02 UTC (permalink / raw)
  To: Seungjun Ha
  Cc: Gregory Price, linux-cxl@vger.kernel.org, KyungSan Kim,
	Wonjae Lee

On Wed, Dec 06, 2023 at 05:09:44PM +0900, Seungjun Ha wrote:
> > The weighted-interleave mempolicy implements weights per-node
> > which are used to distribute memory while interleaving.
> >
> > For example:
> >    nodes: 0,1,2
> >    weights: 5,3,2
> >
> > Over 10 consecutive allocations, the following nodes will be selected:
> > [0,0,0,0,0,1,1,1,2,2]
> >
> > In this example there is a 50%/30%/20% distribution of memory across
> > the enabled nodes.
> >
> > If a node is enabled, the minimum weight is expected to be 0. If an
> > enabled node ends up with a weight of 0 (as can happen if weights
> > are being recalculated due to a cgroup mask update), a minimum
> > of 1 is applied during the interleave mechanism.
> 
> I found an issue while using the RFCv2, and want to report it. 

first, thank you very much for testing! I'll run this on my latest fork.

> In my testbed, calling set_mempolicy2() causes pthread_create() failure or system hang, depending on weight combinations.
> 

I think this is likely because i did not handle __mpol_dup correctly.
The newer fork changes the way weights are stored, so this should not
but an issue, but I will use your test to validate this.

New RFC should hopefully be out this or next week.

> FYI please find my testbed where there are 3 memory-nodes. 
> 
>             Node 0  Node 1  Node 2  Result
> Weights  
>             6 >=    1       1       pthread_create error: 11(Cannot allocate memory)
>             1~5     1       1       Pass
>             1       8 >=    1       pthread_create error: 11(Cannot allocate memory)
>             1       1~7     1       Pass
>             1       1       8 >=    pthread_create error: 11(Cannot allocate memory)
>             1       1       1~7     Pass
> 
>             6       7       7       pthread_create error: 11(Cannot allocate memory)
>             5       8       7       Pass
>             5       7       8       Pass
> 
>             40      30      20      Kernel Hang
> 
> 
> Below is the test code to reproduce the issue.
> 
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <numa.h>
> #include <errno.h>
> #include <numaif.h>
> #include <unistd.h>
> #include <pthread.h>
> 
> #define MPOL_WEIGHTED_INTERLEAVE MPOL_DEFAULT + 8
> #define SET_MEMPOLICY2(a, b) syscall(454, a, b)
> 
> struct mempolicy_args { on this RFC... }
> 
> struct mempolicy_args wil_args;
> struct bitmask *wil_nodes;
> unsigned char *weights;
> int total_nodes = -1;
> pthread_t tid;
> 
> void set_mempolicy_call()
> {
>         weights = (unsigned char *)calloc(total_nodes, sizeof(unsigned char));
>         wil_nodes = numa_allocate_nodemask();
> 
>         numa_bitmask_setbit(wil_nodes, 0); weights[0] = 40;
>         numa_bitmask_setbit(wil_nodes, 1); weights[1] = 30;
>         numa_bitmask_setbit(wil_nodes, 2); weights[2] = 20;
> 
>         wil_args.maxnode = total_nodes + 1;
>         wil_args.wil.weights = weights;
>         wil_args.nodemask = wil_nodes->maskp;
>         wil_args.mode = MPOL_WEIGHTED_INTERLEAVE;
>         wil_args.flags = 0;
> 
>         int ret = SET_MEMPOLICY2(&wil_args, sizeof(wil_args));
>         fprintf(stderr, "set_mempolicy2 result: %d(%s)\n", ret, strerror(errno));
> }
> 
> 
> int main()
> {
>         total_nodes = numa_max_node() + 1;
> 
>         set_mempolicy_call();
>         pthread_create(&tid, NULL, func, NULL);
> 
>         return 0;
> }

^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: Re: [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
  2023-12-06 17:02   ` Gregory Price
@ 2023-12-07  8:52     ` Seungjun Ha
  0 siblings, 0 replies; 4+ messages in thread
From: Seungjun Ha @ 2023-12-07  8:52 UTC (permalink / raw)
  To: Gregory Price
  Cc: Gregory Price, linux-cxl@vger.kernel.org, KyungSan Kim,
	Wonjae Lee

> On Wed, Dec 06, 2023 at 05:09:44PM +0900, Seungjun Ha wrote:
> > > The weighted-interleave mempolicy implements weights per-node
> > > which are used to distribute memory while interleaving.
> > >
> > > For example:
> > >    nodes: 0,1,2
> > >    weights: 5,3,2
> > >
> > > Over 10 consecutive allocations, the following nodes will be selected:
> > > [0,0,0,0,0,1,1,1,2,2]
> > >
> > > In this example there is a 50%/30%/20% distribution of memory across
> > > the enabled nodes.
> > >
> > > If a node is enabled, the minimum weight is expected to be 0. If an
> > > enabled node ends up with a weight of 0 (as can happen if weights
> > > are being recalculated due to a cgroup mask update), a minimum
> > > of 1 is applied during the interleave mechanism.
> >
> > I found an issue while using the RFCv2, and want to report it. 
> 
> First, thank you very much for testing! I'll run this on my latest fork.
> 
> > In my testbed, calling set_mempolicy2() causes pthread_create() failure or system hang, depending on weight combinations.
> >
>
> I think this is likely because I did not handle __mpol_dup correctly.
> The newer fork changes the way weights are stored, so this should not
> be an issue, but I will use your test to validate this.
>
> New RFC should hopefully be out this or next week.
>

FYI, I am sending the whole dmesg log that occurred in the problem situation.

[  464.155511] app: vmalloc error: size 155648, failed to allocate pages, mode:0xdc2(GFP_KERNEL|__GFP_HIGHMEM|__GFP_ZERO), nodemask=(null),cpuset=/,mems_allowed=0-4
[  464.155550] CPU: 93 PID: 5281 Comm: app Tainted: G      D W   E      6.6.0-rc4+ #2
[  464.155561] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDREL1.SYS.0085.D15.2207241333 07/24/2022
[  464.155565] Call Trace:
[  464.155572]  <TASK>
[  464.155580]  dump_stack_lvl+0x48/0x70
[  464.155597]  dump_stack+0x10/0x20
[  464.155602]  warn_alloc+0x119/0x190
[  464.155613]  ? __vmalloc_node_range+0x1d2/0x850
[  464.155623]  __vmalloc_node_range+0x7d4/0x850
[  464.155631]  ? kernel_clone+0x9d/0x3c0
[  464.155650]  copy_process+0xa0f/0x1d20
[  464.155661]  ? kernel_clone+0x9d/0x3c0
[  464.155671]  ? __handle_mm_fault+0x769/0xdb0
[  464.155686]  kernel_clone+0x9d/0x3c0
[  464.155698]  __do_sys_clone+0x66/0x90
[  464.155712]  __x64_sys_clone+0x25/0x30
[  464.155722]  do_syscall_64+0x59/0x90
[  464.155730]  ? exc_page_fault+0x8a/0x180
[  464.155739]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[  464.155750] RIP: 0033:0x7f50093f4125
[  464.155762] Code: 48 85 ff 74 3d 48 85 f6 74 38 48 83 ee 10 48 89 4e 08 48 89 3e 48 89 d7 4c 89 c2 4d 89 c8 4c 8b 54 24 08 b8 38 00 00 00 0f 05 <48> 85 c0 7c 13 74 01 c3 31 ed 58 5f ff d0 48 89 c7 b8 3c 00 00 00
[  464.155772] RSP: 002b:00007ffea8877418 EFLAGS: 00000202 ORIG_RAX: 0000000000000038
[  464.155781] RAX: ffffffffffffffda RBX: 00007f50082cf700 RCX: 00007f50093f4125
[  464.155786] RDX: 00007f50082cf9d0 RSI: 00007f50082cefb0 RDI: 00000000003d0f00
[  464.155791] RBP: 00007ffea88774d0 R08: 00007f50082cf700 R09: 00007f50082cf700
[  464.155795] R10: 00007f50082cf9d0 R11: 0000000000000202 R12: 00007ffea88774ce
[  464.155799] R13: 00007ffea88774cf R14: 00007ffea88774d0 R15: 00007f50082cefc0
[  464.155809]  </TASK>
[  464.155852] Mem-Info:
[  464.155902] active_anon:1025 inactive_anon:174970 isolated_anon:0
                active_file:154846 inactive_file:211215 isolated_file:0
                unevictable:2029 dirty:237 writeback:0
                slab_reclaimable:32736 slab_unreclaimable:106078
                mapped:97039 shmem:6403 pagetables:3841
                sec_pagetables:0 bounce:0
                kernel_misc_reclaimable:0
                free:164219875 free_pcp:36236 free_cma:0
[  464.155921] Node 0 active_anon:2128kB inactive_anon:491864kB active_file:150184kB inactive_file:522756kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:245188kB dirty:68kB writeback:0kB shmem:11812kB shmem_thp:0kB shmem_pmdmapped:0kB anon_thp:0kB writeback_tmp:0kB kernel_stack:18552kB pagetables:9492kB sec_pagetables:0kB all_unreclaimable? no
[  464.155935] Node 1 active_anon:1972kB inactive_anon:208016kB active_file:469200kB inactive_file:322104kB unevictable:8116kB isolated(anon):0kB isolated(file):0kB mapped:142968kB dirty:880kB writeback:0kB shmem:13800kB shmem_thp:0kB shmem_pmdmapped:0kB anon_thp:0kB writeback_tmp:0kB kernel_stack:18712kB pagetables:5872kB sec_pagetables:0kB all_unreclaimable? no
[  464.155954] Node 0 DMA free:11264kB boost:0kB min:8kB low:20kB high:32kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15360kB mlocked:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
[  464.155970] lowmem_reserve[]: 0 1765 64175 64175 64175
[  464.155988] Node 0 DMA32 free:1803124kB boost:0kB min:1236kB low:3040kB high:4844kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1873332kB managed:1807640kB mlocked:0kB bounce:0kB free_pcp:420kB local_pcp:0kB free_cma:0kB
[  464.156005] lowmem_reserve[]: 0 0 62409 62409 62409
[  464.156023] Node 0 Normal free:55826092kB boost:0kB min:43704kB low:107608kB high:171512kB reserved_highatomic:0KB active_anon:2128kB inactive_anon:491864kB active_file:150184kB inactive_file:522756kB unevictable:0kB writepending:68kB present:65011712kB managed:63907776kB mlocked:0kB bounce:0kB free_pcp:59312kB local_pcp:592kB free_cma:0kB
[  464.156040] lowmem_reserve[]: 0 0 0 0 0
[  464.156053] Node 1 Normal free:62368616kB boost:0kB min:45156kB low:111184kB high:177212kB reserved_highatomic:0KB active_anon:1972kB inactive_anon:208016kB active_file:469200kB inactive_file:322104kB unevictable:8116kB writepending:880kB present:67108864kB managed:66029252kB mlocked:0kB bounce:0kB free_pcp:85212kB local_pcp:652kB free_cma:0kB
[  464.156067] lowmem_reserve[]: 0 0 0 0 0
[  464.156078] Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 1*1024kB (U) 1*2048kB (M) 2*4096kB (M) = 11264kB
[  464.156106] Node 0 DMA32: 7*4kB (UM) 9*8kB (M) 7*16kB (M) 7*32kB (M) 7*64kB (M) 8*128kB (M) 4*256kB (M) 4*512kB (M) 6*1024kB (M) 5*2048kB (M) 435*4096kB (M) = 1803124kB
[  464.156142] Node 0 Normal: 0*4kB 443*8kB (UME) 1063*16kB (UME) 2103*32kB (UME) 1484*64kB (UME) 859*128kB (UME) 416*256kB (UME) 204*512kB (UM) 64*1024kB (UME) 11*2048kB (M) 13485*4096kB (M) = 55826344kB
[  464.156178] Node 1 Normal: 585*4kB (UME) 489*8kB (UME) 306*16kB (UME) 493*32kB (UME) 199*64kB (UME) 137*128kB (UME) 11*256kB (UME) 36*512kB (UM) 40*1024kB (UM) 7*2048kB (UME) 15194*4096kB (M) = 62368364kB
[  464.156225] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[  464.156232] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  464.156238] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[  464.156243] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  464.156248] Node 2 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[  464.156255] Node 2 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  464.156259] Node 3 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[  464.156263] Node 3 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  464.156266] Node 4 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[  464.156270] Node 4 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  464.156275] 369910 total pagecache pages
[  464.156280] 0 pages in swap cache
[  464.156283] Free swap  = 10485756kB
[  464.156286] Total swap = 10485756kB
[  464.156290] 167720203 pages RAM
[  464.156293] 0 pages HighMem/MovableOnly
[  464.156296] 562468 pages reserved
[  464.156299] 0 pages hwpoisoned
[  464.156331] general protection fault, probably for non-canonical address 0x300b162afbcb6e0f: 0000 [#2] PREEMPT SMP NOPTI
[  464.156341] CPU: 93 PID: 5281 Comm: app Tainted: G      D W   E      6.6.0-rc4+ #2
[  464.156350] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDREL1.SYS.0085.D15.2207241333 07/24/2022
[  464.156355] RIP: 0010:vfree+0x7e/0x2d0
[  464.156364] Code: 48 85 c0 0f 84 42 02 00 00 8b 50 2c f6 40 19 01 0f 85 0c 01 00 00 85 d2 0f 85 85 00 00 00 e9 96 00 00 00 66 90 e8 42 6c e0 ff <49> 8b 44 24 08 a8 01 0f 85 e2 00 00 00 0f 1f 44 00 00 4c 89 e0 48
[  464.156373] RSP: 0018:ffa000000cfebc00 EFLAGS: 00010202
[  464.156379] RAX: 0000000000000001 RBX: 0000000000000006 RCX: 0000000000000000
[  464.156384] RDX: ff11001086c2e000 RSI: 000000000000009a RDI: 00000000ffffffff
[  464.156389] RBP: ffa000000cfebc30 R08: 0000000000000000 R09: ff1100203f1784c0
[  464.156394] R10: ffd40000046248c8 R11: 0000000000000000 R12: 300b162afbcb6e07
[  464.156397] R13: ff1100108ecd9340 R14: 00000000ffffffff R15: ff1100108ecd9340
[  464.156401] FS:  00007f50092d2740(0000) GS:ff1100203f140000(0000) knlGS:0000000000000000
[  464.156408] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  464.156415] CR2: 00007f50082cefb8 CR3: 00000011535fa006 CR4: 0000000000771ee0
[  464.156422] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  464.156426] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[  464.156430] PKRU: 55555554
[  464.156434] Call Trace:
[  464.156438]  <TASK>
[  464.156442]  ? show_regs+0x68/0x70
[  464.156457]  ? __die_body+0x20/0x70
[  464.156467]  ? die_addr+0x3e/0x60
[  464.156477]  ? exc_general_protection+0x231/0x460
[  464.156491]  ? asm_exc_general_protection+0x27/0x30
[  464.156503]  ? vfree+0x7e/0x2d0
[  464.156512]  __vmalloc_node_range+0x7dd/0x850
[  464.156523]  ? kernel_clone+0x9d/0x3c0
[  464.156538]  copy_process+0xa0f/0x1d20
[  464.156549]  ? kernel_clone+0x9d/0x3c0
[  464.156559]  ? __handle_mm_fault+0x769/0xdb0
[  464.156569]  kernel_clone+0x9d/0x3c0
[  464.156582]  __do_sys_clone+0x66/0x90
[  464.156594]  __x64_sys_clone+0x25/0x30
[  464.156606]  do_syscall_64+0x59/0x90
[  464.156613]  ? exc_page_fault+0x8a/0x180
[  464.156620]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[  464.156627] RIP: 0033:0x7f50093f4125
[  464.156631] Code: 48 85 ff 74 3d 48 85 f6 74 38 48 83 ee 10 48 89 4e 08 48 89 3e 48 89 d7 4c 89 c2 4d 89 c8 4c 8b 54 24 08 b8 38 00 00 00 0f 05 <48> 85 c0 7c 13 74 01 c3 31 ed 58 5f ff d0 48 89 c7 b8 3c 00 00 00
[  464.156636] RSP: 002b:00007ffea8877418 EFLAGS: 00000202 ORIG_RAX: 0000000000000038
[  464.156643] RAX: ffffffffffffffda RBX: 00007f50082cf700 RCX: 00007f50093f4125
[  464.156648] RDX: 00007f50082cf9d0 RSI: 00007f50082cefb0 RDI: 00000000003d0f00
[  464.156652] RBP: 00007ffea88774d0 R08: 00007f50082cf700 R09: 00007f50082cf700
[  464.156657] R10: 00007f50082cf9d0 R11: 0000000000000202 R12: 00007ffea88774ce
[  464.156661] R13: 00007ffea88774cf R14: 00007ffea88774d0 R15: 00007f50082cefc0
[  464.156668]  </TASK>
[  464.156670] Modules linked in: xt_conntrack(E) xt_MASQUERADE(E) nf_conntrack_netlink(E) nfnetlink(E) xfrm_user(E) xfrm_algo(E) iptable_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) libcrc32c(E) xt_addrtype(E) iptable_filter(E) bpfilter(E) br_netfilter(E) bridge(E) stp(E) llc(E) overlay(E) nls_iso8859_1(E) intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency(E) intel_uncore_frequency_common(E) i10nm_edac(E) nfit(E) snd_hda_codec_realtek(E) snd_hda_codec_generic(E) ledtrig_audio(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) snd_hda_intel(E) kvm_intel(E) kmem(E) snd_intel_dspcfg(E) snd_intel_sdw_acpi(E) input_leds(E) device_dax(E) snd_hda_codec(E) kvm(E) cxl_mem(E) snd_hda_core(E) cxl_port(E) crct10dif_pclmul(E) cxl_pmu(E) snd_hwdep(E) ghash_clmulni_intel(E) snd_pcm(E) sha512_ssse3(E) binfmt_misc(E) snd_seq_midi(E) snd_seq_midi_event(E) aesni_intel(E) crypto_simd(E) snd_rawmidi(E) cryptd(E) snd_seq(E) rapl(E) ast(E) drm_shmem_helper(E) dax_hmem(E) snd_seq_device(E)
[  464.156784]  intel_cstate(E) snd_timer(E) drm_kms_helper(E) cxl_acpi(E) cxl_pci(E) isst_if_mmio(E) i2c_algo_bit(E) isst_if_mbox_pci(E) snd(E) mei_me(E) idxd(E) isst_if_common(E) ipmi_ssif(E) cxl_core(E) idxd_bus(E) soundcore(E) mei(E) acpi_ipmi(E) ipmi_si(E) ipmi_devintf(E) ipmi_msghandler(E) acpi_power_meter(E) acpi_pad(E) mac_hid(E) sch_fq_codel(E) msr(E) parport_pc(E) ppdev(E) lp(E) ramoops(E) parport(E) reed_solomon(E) drm(E) efi_pstore(E) ip_tables(E) x_tables(E) autofs4(E) hid_generic(E) usbhid(E) hid(E) i2c_i801(E) nvme(E) ahci(E) xhci_pci(E) nvme_core(E) libahci(E) i2c_smbus(E) crc32_pclmul(E) igc(E) i2c_ismt(E) xhci_pci_renesas(E) wmi(E) pinctrl_emmitsburg(E)
[  464.156883] ---[ end trace 0000000000000000 ]---
[  464.344754] RIP: 0010:vfree+0x7e/0x2d0
[  464.344764] Code: 48 85 c0 0f 84 42 02 00 00 8b 50 2c f6 40 19 01 0f 85 0c 01 00 00 85 d2 0f 85 85 00 00 00 e9 96 00 00 00 66 90 e8 42 6c e0 ff <49> 8b 44 24 08 a8 01 0f 85 e2 00 00 00 0f 1f 44 00 00 4c 89 e0 48
[  464.344772] RSP: 0018:ffa000000ce73bf0 EFLAGS: 00010202
[  464.344788] RAX: 0000000000000001 RBX: 0000000000000006 RCX: 0000000000000000
[  464.344799] RDX: ff110001260a2000 RSI: 00000000000000c5 RDI: 00000000ffffffff
[  464.344806] RBP: ffa000000ce73c20 R08: 0000000000000000 R09: ff1100103f4784c0
[  464.344813] R10: ffd40000042e6c88 R11: 0000000000000000 R12: d0ec0eba6bd389a7
[  464.344820] R13: ff1100010fd51880 R14: 00000000ffffffff R15: ff1100010fd51880
[  464.344826] FS:  00007f50092d2740(0000) GS:ff1100203f140000(0000) knlGS:0000000000000000
[  464.344836] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  464.344843] CR2: 00007f50082cefb8 CR3: 00000011535fa006 CR4: 0000000000771ee0
[  464.344850] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  464.344856] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[  464.344863] PKRU: 55555554


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-12-07  8:52 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <CGME20231206080944epcms2p76ebb230b9f4595f5cfcd2531d67ab3ce@epcms2p7>
2023-12-06  8:09 ` [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave Seungjun Ha
2023-12-06 17:02   ` Gregory Price
2023-12-07  8:52     ` Seungjun Ha
2023-10-03  0:21 [RFC PATCH v2 0/4] mm/mempolicy: get/set_mempolicy2 syscalls Gregory Price
2023-10-03  0:21 ` [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave Gregory Price

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox