From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
To: linux-mm@kvack.org, akpm@linux-foundation.org
Cc: Huang Ying <ying.huang@intel.com>,
Greg Thelen <gthelen@google.com>, Yang Shi <shy828301@gmail.com>,
Davidlohr Bueso <dave@stgolabs.net>,
Tim C Chen <tim.c.chen@intel.com>,
Brice Goglin <brice.goglin@gmail.com>,
Michal Hocko <mhocko@kernel.org>,
Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
Hesham Almatary <hesham.almatary@huawei.com>,
Dave Hansen <dave.hansen@intel.com>,
Jonathan Cameron <Jonathan.Cameron@huawei.com>,
Alistair Popple <apopple@nvidia.com>,
Dan Williams <dan.j.williams@intel.com>,
Feng Tang <feng.tang@intel.com>,
Jagdish Gediya <jvgediya@linux.ibm.com>,
Baolin Wang <baolin.wang@linux.alibaba.com>,
David Rientjes <rientjes@google.com>,
"Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Subject: [RFC PATCH v4 2/7] mm/demotion: Expose per node memory tier to sysfs
Date: Fri, 27 May 2022 17:55:23 +0530 [thread overview]
Message-ID: <20220527122528.129445-3-aneesh.kumar@linux.ibm.com> (raw)
In-Reply-To: <20220527122528.129445-1-aneesh.kumar@linux.ibm.com>
From: Jagdish Gediya <jvgediya@linux.ibm.com>
Add support to read/write the memory tier index for a NUMA node.
/sys/devices/system/node/nodeN/memtier
where N = node id
When read, it lists the memory tier that the node belongs to.
When written, the kernel moves the node into the specified
memory tier; the tier assignments of all other nodes are not
affected.
If the memory tier does not exist, writing to the above file
creates the tier and assigns the NUMA node to that tier.
The mutex memory_tier_lock is introduced to protect memory tier
related changes, as they can happen from sysfs as well as on
hotplug events.
Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
drivers/base/node.c | 35 ++++++++++++++
include/linux/migrate.h | 4 +-
mm/migrate.c | 103 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 141 insertions(+), 1 deletion(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index ec8bb24a5a22..cf4a58446d8c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -20,6 +20,7 @@
#include <linux/pm_runtime.h>
#include <linux/swap.h>
#include <linux/slab.h>
+#include <linux/migrate.h>
static struct bus_type node_subsys = {
.name = "node",
@@ -560,11 +561,45 @@ static ssize_t node_read_distance(struct device *dev,
}
static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
+#ifdef CONFIG_TIERED_MEMORY
+/*
+ * Show handler for the per-node "memtier" attribute.
+ *
+ * Emits the value returned by node_get_memory_tier() for this node as a
+ * decimal number followed by a newline.
+ */
+static ssize_t memtier_show(struct device *dev,
+			    struct device_attribute *attr,
+			    char *buf)
+{
+	int tier = node_get_memory_tier(dev->id);
+
+	return sysfs_emit(buf, "%d\n", tier);
+}
+
+/*
+ * Store handler for the per-node "memtier" attribute.
+ *
+ * Parses @buf as a base-10 integer and asks node_reset_memory_tier() to
+ * move this node into that tier.
+ *
+ * Returns @count on success, the kstrtoint() error for unparsable or
+ * out-of-range input, -EINVAL for a negative tier, or the error returned
+ * by node_reset_memory_tier().
+ */
+static ssize_t memtier_store(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	int node = dev->id;
+	int tier;
+	int ret;
+
+	/*
+	 * node_reset_memory_tier() takes an int. Parse directly into an int
+	 * so that values which do not fit are rejected instead of being
+	 * silently truncated on the way in.
+	 */
+	ret = kstrtoint(buf, 10, &tier);
+	if (ret)
+		return ret;
+
+	if (tier < 0)
+		return -EINVAL;
+
+	ret = node_reset_memory_tier(node, tier);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(memtier);
+#endif
+
static struct attribute *node_dev_attrs[] = {
&dev_attr_meminfo.attr,
&dev_attr_numastat.attr,
&dev_attr_distance.attr,
&dev_attr_vmstat.attr,
+#ifdef CONFIG_TIERED_MEMORY
+	/* "memtier" is listed only when CONFIG_TIERED_MEMORY is set. */
+ &dev_attr_memtier.attr,
+#endif
NULL
};
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0ec653623565..d37d1d5dee82 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -177,13 +177,15 @@ enum memory_tier_type {
};
int next_demotion_node(int node);
-
extern void migrate_on_reclaim_init(void);
#ifdef CONFIG_HOTPLUG_CPU
extern void set_migration_target_nodes(void);
#else
static inline void set_migration_target_nodes(void) {}
#endif
+/* Return the index of the memory tier containing @node, or -1 if none. */
+int node_get_memory_tier(int node);
+/* Add @node to @tier; a node already in a tier keeps its current tier. */
+int node_set_memory_tier(int node, int tier);
+/* Move @node out of its current tier (if any) and into @tier. */
+int node_reset_memory_tier(int node, int tier);
#else
#define numa_demotion_enabled false
static inline int next_demotion_node(int node)
diff --git a/mm/migrate.c b/mm/migrate.c
index f28ee93fb017..304559ba3372 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2132,6 +2132,7 @@ static struct bus_type memory_tier_subsys = {
.dev_name = "memtier",
};
+/*
+ * Held by node_get_memory_tier(), node_set_memory_tier() and
+ * node_reset_memory_tier() around every access to memory_tiers[] and the
+ * per-tier nodelists.
+ */
+DEFINE_MUTEX(memory_tier_lock);
static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS];
static ssize_t nodelist_show(struct device *dev,
@@ -2225,6 +2226,108 @@ static const struct attribute_group *memory_tier_attr_groups[] = {
NULL,
};
+/*
+ * Find the memory tier whose nodelist contains @node.
+ *
+ * Returns the tier index, or -1 when no entry of memory_tiers[] contains
+ * the node. Performs no locking of its own.
+ */
+static int __node_get_memory_tier(int node)
+{
+	int i;
+
+	for (i = 0; i < MAX_MEMORY_TIERS; i++) {
+		struct memory_tier *memtier = memory_tiers[i];
+
+		/* Skip empty slots. */
+		if (!memtier)
+			continue;
+
+		if (node_isset(node, memtier->nodelist))
+			return i;
+	}
+
+	return -1;
+}
+
+/*
+ * Locked wrapper around __node_get_memory_tier().
+ *
+ * memory_tier_lock is held across the lookup so that a memory tier cannot
+ * be unregistered while it is being read.
+ *
+ * Returns the tier index of @node, or -1 if it is in no tier.
+ */
+int node_get_memory_tier(int node)
+{
+	int ret;
+
+	mutex_lock(&memory_tier_lock);
+	ret = __node_get_memory_tier(node);
+	mutex_unlock(&memory_tier_lock);
+
+	return ret;
+}
+
+/*
+ * Add @node to the nodelist of memory tier @tier, registering the tier
+ * first if it does not exist yet.
+ *
+ * Performs no locking of its own; the callers in this file hold
+ * memory_tier_lock.
+ *
+ * Returns 0 on success, -EINVAL if @tier is not a valid index into
+ * memory_tiers[], or the error returned by register_memory_tier().
+ */
+static int __node_set_memory_tier(int node, int tier)
+{
+	int ret;
+
+	/* memory_tiers[] is indexed directly by @tier below. */
+	if (tier < 0 || tier >= MAX_MEMORY_TIERS)
+		return -EINVAL;
+
+	/*
+	 * As register_memory_tier() for new tier can fail,
+	 * try it before modifying existing tier. register
+	 * tier makes tier visible in sysfs.
+	 */
+	if (!memory_tiers[tier]) {
+		ret = register_memory_tier(tier);
+		if (ret)
+			return ret;
+	}
+
+	node_set(node, memory_tiers[tier]->nodelist);
+
+	return 0;
+}
+
+/*
+ * Move @node from whatever memory tier it is currently in (if any) into
+ * @tier, creating @tier if needed and unregistering the old tier when it
+ * becomes empty.
+ *
+ * Returns 0 on success (including when @node is already in @tier),
+ * -EINVAL if @tier is out of range, or the error from
+ * __node_set_memory_tier(). On failure the node is put back into the tier
+ * it came from.
+ */
+int node_reset_memory_tier(int node, int tier)
+{
+	int current_tier, ret = 0;
+
+	/* @tier indexes memory_tiers[] and can come straight from sysfs. */
+	if (tier < 0 || tier >= MAX_MEMORY_TIERS)
+		return -EINVAL;
+
+	mutex_lock(&memory_tier_lock);
+
+	current_tier = __node_get_memory_tier(node);
+	if (current_tier == tier)
+		goto out;
+
+	/* current_tier is -1 when the node is not in any tier yet. */
+	if (current_tier != -1)
+		node_clear(node, memory_tiers[current_tier]->nodelist);
+
+	ret = __node_set_memory_tier(node, tier);
+	if (ret) {
+		/*
+		 * Put the node back into its old tier. That tier has not
+		 * been unregistered, so its nodelist can be updated
+		 * directly, and the original error is kept in ret.
+		 */
+		if (current_tier != -1)
+			node_set(node, memory_tiers[current_tier]->nodelist);
+		goto out;
+	}
+
+	/* Drop the old tier once its last node has moved away. */
+	if (current_tier != -1 &&
+	    nodes_empty(memory_tiers[current_tier]->nodelist))
+		unregister_memory_tier(current_tier);
+out:
+	mutex_unlock(&memory_tier_lock);
+
+	return ret;
+}
+
+/*
+ * Add @node to memory tier @tier, unless the node already belongs to a
+ * tier, in which case its current tier is used instead of @tier.
+ *
+ * Returns 0 on success, -EINVAL if @tier is out of range, or the error
+ * from __node_set_memory_tier().
+ */
+int node_set_memory_tier(int node, int tier)
+{
+	int current_tier, ret = 0;
+
+	/* Reject both ends of the range: @tier indexes memory_tiers[]. */
+	if (tier < 0 || tier >= MAX_MEMORY_TIERS)
+		return -EINVAL;
+
+	mutex_lock(&memory_tier_lock);
+	current_tier = __node_get_memory_tier(node);
+	/*
+	 * if node is already part of the tier proceed with the
+	 * current tier value, because we might want to establish
+	 * new migration paths now. The node might be added to a tier
+	 * before it was made part of N_MEMORY, hence establish_migration_targets
+	 * will have skipped this node.
+	 */
+	if (current_tier != -1)
+		tier = current_tier;
+	ret = __node_set_memory_tier(node, tier);
+	mutex_unlock(&memory_tier_lock);
+
+	return ret;
+}
+
/*
* node_demotion[] example:
*
--
2.36.1
next prev parent reply other threads:[~2022-05-27 12:26 UTC|newest]
Thread overview: 72+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-05-26 21:22 RFC: Memory Tiering Kernel Interfaces (v3) Wei Xu
2022-05-27 2:58 ` Ying Huang
2022-05-27 14:05 ` Hesham Almatary
2022-05-27 16:25 ` Wei Xu
2022-05-27 12:25 ` [RFC PATCH v4 0/7] mm/demotion: Memory tiers and demotion Aneesh Kumar K.V
2022-05-27 12:25 ` [RFC PATCH v4 1/7] mm/demotion: Add support for explicit memory tiers Aneesh Kumar K.V
2022-05-27 13:59 ` Jonathan Cameron
2022-06-02 6:07 ` Ying Huang
2022-06-06 2:49 ` Ying Huang
2022-06-06 3:56 ` Aneesh Kumar K V
2022-06-06 5:33 ` Ying Huang
2022-06-06 6:01 ` Aneesh Kumar K V
2022-06-06 6:27 ` Aneesh Kumar K.V
2022-06-06 7:53 ` Ying Huang
2022-06-06 8:01 ` Aneesh Kumar K V
2022-06-06 8:52 ` Ying Huang
2022-06-06 9:02 ` Aneesh Kumar K V
2022-06-08 1:24 ` Ying Huang
2022-06-08 7:16 ` Ying Huang
2022-06-08 8:24 ` Aneesh Kumar K V
2022-06-08 8:27 ` Ying Huang
2022-05-27 12:25 ` Aneesh Kumar K.V [this message]
2022-05-27 14:15 ` [RFC PATCH v4 2/7] mm/demotion: Expose per node memory tier to sysfs Jonathan Cameron
2022-06-03 8:40 ` Aneesh Kumar K V
2022-06-06 14:59 ` Jonathan Cameron
2022-06-06 16:01 ` Aneesh Kumar K V
2022-06-06 16:16 ` Jonathan Cameron
2022-06-06 16:39 ` Aneesh Kumar K V
2022-06-06 17:46 ` Aneesh Kumar K.V
2022-06-07 14:32 ` Jonathan Cameron
2022-06-08 7:18 ` Ying Huang
2022-06-08 8:25 ` Aneesh Kumar K V
2022-06-08 8:29 ` Ying Huang
2022-05-27 12:25 ` [RFC PATCH v4 3/7] mm/demotion: Build demotion targets based on explicit memory tiers Aneesh Kumar K.V
2022-05-27 14:31 ` Jonathan Cameron
2022-05-30 3:35 ` [mm/demotion] 8ebccd60c2: BUG:sleeping_function_called_from_invalid_context_at_mm/compaction.c kernel test robot
2022-05-27 12:25 ` [RFC PATCH v4 4/7] mm/demotion/dax/kmem: Set node's memory tier to MEMORY_TIER_PMEM Aneesh Kumar K.V
2022-06-01 6:29 ` Bharata B Rao
2022-06-01 13:49 ` Aneesh Kumar K V
2022-06-02 6:36 ` Bharata B Rao
2022-06-03 9:04 ` Aneesh Kumar K V
2022-06-06 10:11 ` Bharata B Rao
2022-06-06 10:16 ` Aneesh Kumar K V
2022-06-06 11:54 ` Aneesh Kumar K.V
2022-06-06 12:09 ` Bharata B Rao
2022-06-06 13:00 ` Aneesh Kumar K V
2022-05-27 12:25 ` [RFC PATCH v4 5/7] mm/demotion: Add support to associate rank with memory tier Aneesh Kumar K.V
2022-05-27 14:45 ` Jonathan Cameron
2022-05-27 15:45 ` Aneesh Kumar K V
2022-05-30 12:36 ` Jonathan Cameron
2022-06-02 6:41 ` Ying Huang
2022-05-27 12:25 ` [RFC PATCH v4 6/7] mm/demotion: Add support for removing node from demotion memory tiers Aneesh Kumar K.V
2022-06-02 6:43 ` Ying Huang
2022-05-27 12:25 ` [RFC PATCH v4 7/7] mm/demotion: Demote pages according to allocation fallback order Aneesh Kumar K.V
2022-05-27 15:03 ` Jonathan Cameron
2022-06-02 7:35 ` Ying Huang
2022-06-03 15:09 ` Aneesh Kumar K V
2022-06-06 0:43 ` Ying Huang
2022-06-06 4:07 ` Aneesh Kumar K V
2022-06-06 5:26 ` Ying Huang
2022-06-06 6:21 ` Aneesh Kumar K.V
2022-06-06 7:42 ` Ying Huang
2022-06-06 8:02 ` Aneesh Kumar K V
2022-06-06 8:06 ` Ying Huang
2022-06-06 17:07 ` Yang Shi
2022-05-27 13:40 ` RFC: Memory Tiering Kernel Interfaces (v3) Aneesh Kumar K V
2022-05-27 16:30 ` Wei Xu
2022-05-29 4:31 ` Ying Huang
2022-05-30 12:50 ` Jonathan Cameron
2022-05-31 1:57 ` Ying Huang
2022-06-07 19:25 ` Tim Chen
2022-06-08 4:41 ` Aneesh Kumar K V
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220527122528.129445-3-aneesh.kumar@linux.ibm.com \
--to=aneesh.kumar@linux.ibm.com \
--cc=Jonathan.Cameron@huawei.com \
--cc=akpm@linux-foundation.org \
--cc=apopple@nvidia.com \
--cc=baolin.wang@linux.alibaba.com \
--cc=brice.goglin@gmail.com \
--cc=dan.j.williams@intel.com \
--cc=dave.hansen@intel.com \
--cc=dave@stgolabs.net \
--cc=feng.tang@intel.com \
--cc=gthelen@google.com \
--cc=hesham.almatary@huawei.com \
--cc=jvgediya@linux.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=rientjes@google.com \
--cc=shy828301@gmail.com \
--cc=tim.c.chen@intel.com \
--cc=ying.huang@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).