From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
To: linux-mm@kvack.org, akpm@linux-foundation.org
Cc: Wei Xu <weixugc@google.com>, Huang Ying <ying.huang@intel.com>,
Yang Shi <shy828301@gmail.com>,
Davidlohr Bueso <dave@stgolabs.net>,
Tim C Chen <tim.c.chen@intel.com>,
Michal Hocko <mhocko@kernel.org>,
Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
Hesham Almatary <hesham.almatary@huawei.com>,
Dave Hansen <dave.hansen@intel.com>,
Jonathan Cameron <Jonathan.Cameron@huawei.com>,
Alistair Popple <apopple@nvidia.com>,
Dan Williams <dan.j.williams@intel.com>,
Johannes Weiner <hannes@cmpxchg.org>,
jvgediya.oss@gmail.com, Bharata B Rao <bharata@amd.com>,
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Subject: [PATCH v3 updated] mm/demotion: Expose memory tier details via sysfs
Date: Tue, 30 Aug 2022 13:47:36 +0530 [thread overview]
Message-ID: <20220830081736.119281-1-aneesh.kumar@linux.ibm.com> (raw)
This patch adds /sys/devices/virtual/memory_tiering/ where all memory tier
related details can be found. All allocated memory tiers will be listed
there as /sys/devices/virtual/memory_tiering/memory_tierN/
The nodes which are part of a specific memory tier can be listed via
/sys/devices/virtual/memory_tiering/memory_tierN/nodes
A directory hierarchy looks like
:/sys/devices/virtual/memory_tiering$ tree memory_tier4/
memory_tier4/
├── nodes
├── subsystem -> ../../../../bus/memory_tiering
└── uevent
All toptier nodes are listed via
/sys/devices/virtual/memory_tiering/toptier_nodes
:/sys/devices/virtual/memory_tiering$ cat toptier_nodes
0,2
:/sys/devices/virtual/memory_tiering$ cat memory_tier4/nodes
0,2
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Changes from v2:
* update macro to static inline
* Fix build error with CONFIG_MIGRATION disabled
* drop abstract_distance
* update commit message
.../ABI/testing/sysfs-kernel-mm-memory-tiers | 35 ++++
mm/memory-tiers.c | 154 +++++++++++++++---
2 files changed, 167 insertions(+), 22 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
new file mode 100644
index 000000000000..55051fcf5502
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
@@ -0,0 +1,35 @@
+What: /sys/devices/virtual/memory_tiering/
+Date: August 2022
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: A collection of all the memory tiers allocated.
+
+ Individual memory tier details are contained in subdirectories
+ named memory_tierN, with N derived from the tier's abstract distance.
+
+ /sys/devices/virtual/memory_tiering/memory_tierN/
+
+
+What: /sys/devices/virtual/memory_tiering/memory_tierN/
+ /sys/devices/virtual/memory_tiering/memory_tierN/nodes
+Date: August 2022
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Directory with details of a specific memory tier
+
+ This is the directory containing information about a particular
+ memory tier, memory_tierN, where N is derived based on abstract distance.
+
+ A smaller value of N implies a higher (faster) memory tier in the
+ hierarchy.
+
+ nodes: NUMA nodes that are part of this memory tier.
+
+
+What: /sys/devices/virtual/memory_tiering/toptier_nodes
+Date: August 2022
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Toptier node mask
+
+ A toptier is defined as the memory tier from which memory promotion
+ is not done by the kernel.
+
+ toptier_nodes: Union of NUMA nodes that are part of each toptier.
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index c82eb0111383..33673ed9b3dc 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -19,6 +19,7 @@ struct memory_tier {
* adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
*/
int adistance_start;
+ struct device dev;
/* All the nodes that are part of all the lower memory tiers. */
nodemask_t lower_tier_mask;
};
@@ -36,6 +37,12 @@ static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;
+
+static struct bus_type memory_tier_subsys = {
+ .name = "memory_tiering",
+ .dev_name = "memory_tier",
+};
+
#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
@@ -98,8 +105,63 @@ static int top_tier_adistance;
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
+static inline struct memory_tier *to_memory_tier(struct device *device)
+{
+ return container_of(device, struct memory_tier, dev);
+}
+
+static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+{
+ nodemask_t nodes = NODE_MASK_NONE;
+ struct memory_dev_type *memtype;
+
+ list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+ nodes_or(nodes, nodes, memtype->nodes);
+
+ return nodes;
+}
+
+static void memory_tier_device_release(struct device *dev)
+{
+ struct memory_tier *tier = to_memory_tier(dev);
+ /*
+ * synchronize_rcu in clear_node_memory_tier makes sure
+ * we don't have rcu access to this memory tier.
+ */
+ kfree(tier);
+}
+
+static ssize_t nodes_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int ret;
+ nodemask_t nmask;
+
+ mutex_lock(&memory_tier_lock);
+ nmask = get_memtier_nodemask(to_memory_tier(dev));
+ ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
+ mutex_unlock(&memory_tier_lock);
+ return ret;
+}
+static DEVICE_ATTR_RO(nodes);
+
+static struct attribute *memtier_dev_attrs[] = {
+ &dev_attr_nodes.attr,
+ NULL
+};
+
+static const struct attribute_group memtier_dev_group = {
+ .attrs = memtier_dev_attrs,
+};
+
+static const struct attribute_group *memtier_dev_groups[] = {
+ &memtier_dev_group,
+ NULL
+};
+
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
+ int ret;
bool found_slot = false;
struct memory_tier *memtier, *new_memtier;
int adistance = memtype->adistance;
@@ -123,15 +185,14 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
list_for_each_entry(memtier, &memory_tiers, list) {
if (adistance == memtier->adistance_start) {
- list_add(&memtype->tier_sibiling, &memtier->memory_types);
- return memtier;
+ goto link_memtype;
} else if (adistance < memtier->adistance_start) {
found_slot = true;
break;
}
}
- new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
+ new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
if (!new_memtier)
return ERR_PTR(-ENOMEM);
@@ -142,8 +203,23 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
list_add_tail(&new_memtier->list, &memtier->list);
else
list_add_tail(&new_memtier->list, &memory_tiers);
- list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
- return new_memtier;
+
+ new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
+ new_memtier->dev.bus = &memory_tier_subsys;
+ new_memtier->dev.release = memory_tier_device_release;
+ new_memtier->dev.groups = memtier_dev_groups;
+
+ ret = device_register(&new_memtier->dev);
+ if (ret) {
+ list_del(&new_memtier->list);
+ put_device(&new_memtier->dev);
+ return ERR_PTR(ret);
+ }
+ memtier = new_memtier;
+
+link_memtype:
+ list_add(&memtype->tier_sibiling, &memtier->memory_types);
+ return memtier;
}
static struct memory_tier *__node_get_memory_tier(int node)
@@ -275,17 +351,6 @@ static void disable_all_demotion_targets(void)
synchronize_rcu();
}
-static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
-{
- nodemask_t nodes = NODE_MASK_NONE;
- struct memory_dev_type *memtype;
-
- list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
- nodes_or(nodes, nodes, memtype->nodes);
-
- return nodes;
-}
-
/*
* Find an automatic demotion target for all memory
* nodes. Failing here is OK. It might just indicate
@@ -433,11 +498,7 @@ static struct memory_tier *set_node_memory_tier(int node)
static void destroy_memory_tier(struct memory_tier *memtier)
{
list_del(&memtier->list);
- /*
- * synchronize_rcu in clear_node_memory_tier makes sure
- * we don't have rcu access to this memory tier.
- */
- kfree(memtier);
+ device_unregister(&memtier->dev);
}
static bool clear_node_memory_tier(int node)
@@ -564,11 +625,60 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
+#ifdef CONFIG_MIGRATION
+static ssize_t toptier_nodes_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int ret;
+ nodemask_t nmask, top_tier_mask = NODE_MASK_NONE;
+ struct memory_tier *memtier;
+
+ mutex_lock(&memory_tier_lock);
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ if (memtier->adistance_start > top_tier_adistance)
+ break;
+ nmask = get_memtier_nodemask(memtier);
+ nodes_or(top_tier_mask, top_tier_mask, nmask);
+ }
+
+ ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&top_tier_mask));
+ mutex_unlock(&memory_tier_lock);
+ return ret;
+}
+#else
+static ssize_t toptier_nodes_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ nodemask_t top_tier_mask = node_states[N_MEMORY];
+
+ return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&top_tier_mask));
+}
+#endif
+static DEVICE_ATTR_RO(toptier_nodes);
+
+static struct attribute *memtier_subsys_attrs[] = {
+ &dev_attr_toptier_nodes.attr,
+ NULL
+};
+
+static const struct attribute_group memtier_subsys_group = {
+ .attrs = memtier_subsys_attrs,
+};
+
+static const struct attribute_group *memtier_subsys_groups[] = {
+ &memtier_subsys_group,
+ NULL
+};
+
static int __init memory_tier_init(void)
{
- int node;
+ int ret, node;
struct memory_tier *memtier;
+ ret = subsys_virtual_register(&memory_tier_subsys, memtier_subsys_groups);
+ if (ret)
+ panic("%s() failed to register memory tier subsystem\n", __func__);
+
#ifdef CONFIG_MIGRATION
node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
GFP_KERNEL);
--
2.37.2
next reply other threads:[~2022-08-30 8:18 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-30 8:17 Aneesh Kumar K.V [this message]
2022-09-01 7:01 ` [PATCH v3 updated] mm/demotion: Expose memory tier details via sysfs Huang, Ying
2022-09-01 8:24 ` Aneesh Kumar K V
2022-09-02 0:29 ` Huang, Ying
2022-09-02 5:09 ` Wei Xu
2022-09-02 5:15 ` Huang, Ying
2022-09-02 5:23 ` Aneesh Kumar K V
2022-09-02 5:40 ` Huang, Ying
2022-09-02 5:46 ` Aneesh Kumar K V
2022-09-02 6:12 ` Huang, Ying
2022-09-02 6:31 ` Aneesh Kumar K V
2022-09-02 6:40 ` Huang, Ying
2022-09-02 6:44 ` Aneesh Kumar K V
2022-09-02 7:02 ` Wei Xu
2022-09-02 7:57 ` Huang, Ying
2022-09-02 8:48 ` Aneesh Kumar K V
2022-09-02 9:04 ` Huang, Ying
2022-09-02 9:44 ` Aneesh Kumar K V
2022-09-05 1:52 ` Huang, Ying
2022-09-05 3:50 ` Aneesh Kumar K V
2022-09-05 5:13 ` Huang, Ying
2022-09-05 5:27 ` Aneesh Kumar K V
2022-09-05 5:53 ` Huang, Ying
2022-09-05 6:14 ` Aneesh Kumar K V
2022-09-05 6:24 ` Huang, Ying
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220830081736.119281-1-aneesh.kumar@linux.ibm.com \
--to=aneesh.kumar@linux.ibm.com \
--cc=Jonathan.Cameron@huawei.com \
--cc=akpm@linux-foundation.org \
--cc=apopple@nvidia.com \
--cc=bharata@amd.com \
--cc=dan.j.williams@intel.com \
--cc=dave.hansen@intel.com \
--cc=dave@stgolabs.net \
--cc=hannes@cmpxchg.org \
--cc=hesham.almatary@huawei.com \
--cc=jvgediya.oss@gmail.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=shy828301@gmail.com \
--cc=tim.c.chen@intel.com \
--cc=weixugc@google.com \
--cc=ying.huang@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.