All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ravi Jonnalagadda <ravis.opensrc@micron.com>
To: <linux-mm@vger.kernel.org>, <linux-cxl@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <linux-arch@vger.kernel.org>,
	<linux-api@vger.kernel.org>, <luto@kernel.org>,
	<tglx@linutronix.de>, <mingo@redhat.com>, <bp@alien8.de>,
	<dietmar.eggemann@arm.com>, <vincent.guittot@linaro.org>,
	<dave.hansen@linux.intel.com>, <hpa@zytor.com>, <arnd@arndb.de>,
	<akpm@linux-foundation.org>, <x86@kernel.org>,
	<aneesh.kumar@linux.ibm.com>, <gregory.price@memverge.com>,
	<ying.huang@intel.com>, <jgroves@micron.com>,
	<ravis.opensrc@micron.com>, <sthanneeru@micron.com>,
	<emirakhur@micron.com>, <vtanna@micron.com>
Subject: [PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory nodes
Date: Wed, 27 Sep 2023 15:20:02 +0530	[thread overview]
Message-ID: <20230927095002.10245-3-ravis.opensrc@micron.com> (raw)
In-Reply-To: <20230927095002.10245-1-ravis.opensrc@micron.com>

From: Srinivasulu Thanneeru <sthanneeru@micron.com>

Existing interleave policy spreads out pages evenly across a set of
specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
have CPU-less memory nodes with different peak bandwidth and
latency-bandwidth characteristics. In such systems, we will want to
use the additional bandwidth provided by lowtier memory for
bandwidth-intensive applications. However, the default 1:1 interleave
can lead to suboptimal bandwidth distribution.

Introduce an interleave policy for multi-tiers that is based on
interleave weights, where pages are assigned from nodes of the tier
based on the tier weight.

For instance, 50:30:20 are the weights of tiers 0, 1, and 3, which
leads to a 50%/30%/20% traffic breakdown across the three tiers.

Signed-off-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
---
 include/linux/memory-tiers.h |  25 +++++++-
 include/linux/sched.h        |   2 +
 mm/memory-tiers.c            |  31 ++--------
 mm/mempolicy.c               | 107 +++++++++++++++++++++++++++++++++--
 4 files changed, 132 insertions(+), 33 deletions(-)

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index c62d286749d0..74be39cb56c4 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MEMORY_TIERS_H
 #define _LINUX_MEMORY_TIERS_H
 
+#include <linux/device.h>
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <linux/kref.h>
@@ -21,7 +22,27 @@
 
 #define MAX_TIER_INTERLEAVE_WEIGHT 100
 
-struct memory_tier;
+struct memory_tier {
+	/* hierarchy of memory tiers */
+	struct list_head list;
+	/* list of all memory types part of this tier */
+	struct list_head memory_types;
+	/*
+	 * By default all tiers will have weight as 1, which means they
+	 * follow default standard allocation.
+	 */
+	unsigned short interleave_weight;
+	/*
+	 * start value of abstract distance. memory tier maps
+	 * an abstract distance  range,
+	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
+	 */
+	int adistance_start;
+	struct device dev;
+	/* All the nodes that are part of all the lower memory tiers. */
+	nodemask_t lower_tier_mask;
+};
+
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
 	struct list_head tier_sibiling;
@@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+struct memory_tier *node_get_memory_tier(int node);
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..07ea837c3afb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1252,7 +1252,9 @@ struct task_struct {
 	/* Protected by alloc_lock: */
 	struct mempolicy		*mempolicy;
 	short				il_prev;
+	unsigned short			il_count;
 	short				pref_node_fork;
+	unsigned int			current_node;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	int				numa_scan_seq;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 7e06c9e0fa41..5e2ddc9f994a 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -8,27 +8,6 @@
 
 #include "internal.h"
 
-struct memory_tier {
-	/* hierarchy of memory tiers */
-	struct list_head list;
-	/* list of all memory types part of this tier */
-	struct list_head memory_types;
-	/*
-	 * By default all tiers will have weight as 1, which means they
-	 * follow default standard allocation.
-	 */
-	unsigned short interleave_weight;
-	/*
-	 * start value of abstract distance. memory tier maps
-	 * an abstract distance  range,
-	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
-	 */
-	int adistance_start;
-	struct device dev;
-	/* All the nodes that are part of all the lower memory tiers. */
-	nodemask_t lower_tier_mask;
-};
-
 struct demotion_nodes {
 	nodemask_t preferred;
 };
@@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
 	return container_of(device, struct memory_tier, dev);
 }
 
-static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
 {
 	nodemask_t nodes = NODE_MASK_NONE;
 	struct memory_dev_type *memtype;
@@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
 	return memtier;
 }
 
-static struct memory_tier *__node_get_memory_tier(int node)
+struct memory_tier *node_get_memory_tier(int node)
 {
 	pg_data_t *pgdat;
 
@@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
 		 * We are holding memory_tier_lock, it is safe
 		 * to access pgda->memtier.
 		 */
-		memtier = __node_get_memory_tier(node);
+		memtier = node_get_memory_tier(node);
 		if (memtier)
 			memtier->lower_tier_mask = NODE_MASK_NONE;
 	}
@@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
 		best_distance = -1;
 		nd = &node_demotion[node];
 
-		memtier = __node_get_memory_tier(node);
+		memtier = node_get_memory_tier(node);
 		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
 			continue;
 		/*
@@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
 	 * This also enables us to free the destroyed memory tier
 	 * with kfree instead of kfree_rcu
 	 */
-	memtier = __node_get_memory_tier(node);
+	memtier = node_get_memory_tier(node);
 	if (memtier) {
 		struct memory_dev_type *memtype;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 42b5567e3773..4f80c6ee1176 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -100,6 +100,8 @@
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
+#include <linux/memory-tiers.h>
+#include <linux/nodemask.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
 
@@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		current->il_count = 0;
+		current->current_node = MAX_NUMNODES;
+	}
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 	return nd;
 }
 
+/* Return interleave weight of node from tier's weight */
+static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
+{
+	struct memory_tier *memtier;
+	nodemask_t tier_nodes, tier_and_pol;
+	unsigned short avrg_weight = 0;
+	int node, nnodes, reminder;
+
+	memtier = node_get_memory_tier(nid);
+
+	if (!memtier)
+		return 0;
+
+	tier_nodes = get_memtier_nodemask(memtier);
+	nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
+	nnodes = nodes_weight(tier_and_pol);
+	if (!nnodes)
+		return 0;
+
+	avrg_weight = memtier->interleave_weight / nnodes;
+	/* Set minimum weight of node as 1 so that at least one page
+	 * is allocated.
+	 */
+	if (!avrg_weight)
+		return 1;
+
+	reminder = memtier->interleave_weight % nnodes;
+	if (reminder) {
+		for_each_node_mask(node, tier_and_pol) {
+			/* Increment target node's weight by 1, if it falls
+			 * within remaining weightage 'reminder'.
+			 */
+			if (node == nid) {
+				if (reminder > 0)
+					avrg_weight = avrg_weight + 1;
+				break;
+			}
+			reminder--;
+		}
+	}
+	return avrg_weight;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned next;
 	struct task_struct *me = current;
+	unsigned short node_weight = 0;
 
-	next = next_node_in(me->il_prev, policy->nodes);
+	/* select current node or next node from nodelist based on
+	 * available tier interleave weight.
+	 */
+	if (me->current_node == MAX_NUMNODES)
+		next = next_node_in(me->il_prev, policy->nodes);
+	else
+		next = me->current_node;
+	node_weight = node_interleave_weight(next, policy->nodes);
+	if (!node_weight)
+		goto set_il_prev;
+	if (me->il_count < node_weight) {
+		me->il_count++;
+		me->current_node = next;
+		if (me->il_count == node_weight) {
+			me->current_node = MAX_NUMNODES;
+			me->il_count = 0;
+		}
+	}
+
+set_il_prev:
 	if (next < MAX_NUMNODES)
 		me->il_prev = next;
 	return next;
@@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
 	nodemask_t nodemask = pol->nodes;
-	unsigned int target, nnodes;
-	int i;
-	int nid;
+	unsigned int target, nnodes, vnnodes = 0;
+	unsigned short node_weight = 0;
+	int nid, vtarget, i;
+
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
 	 * the stack so that it will stop changing under the code.
@@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+
+	/*
+	 * Calculate the virtual target for @n in a nodelist that is scaled
+	 * with interleave weights....
+	 */
+	for_each_node_mask(nid, nodemask) {
+		node_weight = node_interleave_weight(nid, nodemask);
+		if (!node_weight)
+			continue;
+		vnnodes += node_weight;
+	}
+	if (!vnnodes)
+		return numa_node_id();
+	vtarget = (int)((unsigned int)n % vnnodes);
+
+	/* ...then map it back to the physical nodelist */
+	target = 0;
+	for_each_node_mask(nid, nodemask) {
+		node_weight = node_interleave_weight(nid, nodemask);
+		if (!node_weight)
+			continue;
+		vtarget -= node_weight;
+		if (vtarget < 0)
+			break;
+		target++;
+	}
+
 	nid = first_node(nodemask);
 	for (i = 0; i < target; i++)
 		nid = next_node(nid, nodemask);
-- 
2.39.3


  parent reply	other threads:[~2023-09-27  9:51 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-27  9:50 [RFC PATCH 0/2] mm: mempolicy: Multi-tier interleaving Ravi Jonnalagadda
2023-09-27  9:50 ` [PATCH 1/2] memory tier: Introduce sysfs for tier interleave weights Ravi Jonnalagadda
2023-10-02 10:26   ` Jonathan Cameron
2023-09-27  9:50 ` Ravi Jonnalagadda [this message]
2023-09-28  6:56   ` [PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory nodes Huang, Ying
2023-09-28 18:25   ` Gregory Price
2023-10-02 10:48   ` Jonathan Cameron
2023-10-20 17:05   ` kernel test robot
2023-09-28  6:14 ` [RFC PATCH 0/2] mm: mempolicy: Multi-tier interleaving Huang, Ying
2023-10-03  5:07   ` [EXT] " Srinivasulu Thanneeru

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230927095002.10245-3-ravis.opensrc@micron.com \
    --to=ravis.opensrc@micron.com \
    --cc=akpm@linux-foundation.org \
    --cc=aneesh.kumar@linux.ibm.com \
    --cc=arnd@arndb.de \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=emirakhur@micron.com \
    --cc=gregory.price@memverge.com \
    --cc=hpa@zytor.com \
    --cc=jgroves@micron.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mingo@redhat.com \
    --cc=sthanneeru@micron.com \
    --cc=tglx@linutronix.de \
    --cc=vincent.guittot@linaro.org \
    --cc=vtanna@micron.com \
    --cc=x86@kernel.org \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.