public inbox for linux-mm@kvack.org
 help / color / mirror / Atom feed
* [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
@ 2026-03-12  9:12 YeeLi
  2026-03-12  9:42 ` Huang, Ying
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: YeeLi @ 2026-03-12  9:12 UTC (permalink / raw)
  To: akpm, david, dan.j.williams, ying.huang, linux-mm, joshua.hahnjy
  Cc: linux-kernel, Jonathan.Cameron, linux-cxl, dave.jiang, yeeli

From: yeeli <seven.yi.lee@gmail.com>

Automatic tuning for weighted interleaving [1] provides real benefits on
systems with CXL support. However, platforms that lack HMAT or CDAT
information cannot make use of this feature.

If the bandwidth reported by firmware or the device deviates from the
actual measured bandwidth, administrators also lack a clear way to adjust
the per-node weight values.

Introduce an optional Kconfig option,
CONFIG_NUMA_BW_MANUAL_OVERRIDE (default n), which exposes read/write
per-node bandwidth sysfs attributes under:

  /sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN

The sysfs files are created and removed dynamically on node hotplug
events, in sync with the existing weighted_interleave/nodeN attributes.

Userspace can write a single bandwidth value (in MB/s) to override both
read_bandwidth and write_bandwidth for the corresponding NUMA node. The
value is then propagated to the internal node_bw_table via
mempolicy_set_node_perf().

This interface is intended for debugging and experimentation only.

[1] Link:
https://lkml.kernel.org/r/20250505182328.4148265-1-joshua.hahnjy@gmail.com

Signed-off-by: yeeli <seven.yi.lee@gmail.com>
---
 mm/Kconfig     |  20 +++++++
 mm/mempolicy.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index bd0ea5454af8..40554df18edc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1441,6 +1441,26 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
+config NUMA_BW_MANUAL_OVERRIDE
+	bool "Allow manual override of per-NUMA-node bandwidth for weighted interleave"
+	depends on NUMA && SYSFS
+	default n
+	help
+	  This option exposes writable sysfs attributes under
+	  /sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN. Each takes one
+	  value in MB/s that overrides both read and write bandwidth of node N.
+
+	  These values update the internal node_bw_table and can influence
+	  weighted interleave auto-tuning (if enabled).
+
+	  WARNING: This is intended for debugging, development, or platforms
+	  with incorrect HMAT/CDAT firmware data. Overriding hardware-reported
+	  bandwidth can lead to suboptimal performance, instability, or
+	  incorrect resource allocation decisions.
+
+	  Say N unless you are actively developing or debugging bandwidth-aware
+	  memory policies.
+
 config ARCH_HAS_USER_SHADOW_STACK
 	bool
 	help
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 68a98ba57882..0b7f42491748 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -226,6 +226,7 @@ int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
 
 	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
 	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+
 	if (!new_bw)
 		return -ENOMEM;
 
@@ -3614,6 +3615,9 @@ struct iw_node_attr {
 struct sysfs_wi_group {
 	struct kobject wi_kobj;
 	struct mutex kobj_lock;
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+	struct iw_node_attr *bw_attrs[MAX_NUMNODES]; /* bw_nodeN, by nid */
+#endif
 	struct iw_node_attr *nattrs[];
 };
 
@@ -3855,6 +3859,128 @@ static int sysfs_wi_node_add(int nid)
 	return ret;
 }
 
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+/*
+ * Show node_bw_table[nid]: "N/A" if no table exists, "0" for a zero entry,
+ * "<bw>(MB/s)" otherwise. NOTE(review): the table is read unlocked; confirm
+ * mempolicy_set_node_perf() cannot free it concurrently.
+ */
+static ssize_t bw_node_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buf)
+{
+	struct iw_node_attr *node_attr =
+		container_of(attr, struct iw_node_attr, kobj_attr);
+
+	if (!node_bw_table)
+		return sysfs_emit(buf, "N/A\n");
+	if (!node_bw_table[node_attr->nid])
+		return sysfs_emit(buf, "0\n");
+	return sysfs_emit(buf, "%u(MB/s)\n", node_bw_table[node_attr->nid]);
+}
+
+/*
+ * Override the bandwidth of this node with a value written by userspace.
+ *
+ * The single unsigned integer in @buf is used as both the read and the write
+ * bandwidth passed to mempolicy_set_node_perf().
+ *
+ * Return: @count on success; the negative errno from kstrtouint() for
+ * malformed or out-of-range input, or from mempolicy_set_node_perf().
+ */
+static ssize_t bw_node_store(struct kobject *kobj,
+			     struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct iw_node_attr *node_attr;
+	struct access_coordinate coords = {};
+	unsigned int bw;
+	int ret;
+
+	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+
+	/* Parse and range-check in one step; reject before touching coords. */
+	ret = kstrtouint(buf, 0, &bw);
+	if (ret)
+		return ret;
+
+	coords.read_bandwidth = bw;
+	coords.write_bandwidth = bw;
+
+	ret = mempolicy_set_node_perf(node_attr->nid, &coords);
+	return ret ? ret : count;
+}
+
+/*
+ * Create the bw_node<nid> attribute in the weighted_interleave directory.
+ *
+ * Return: 0 on success, -EINVAL for an out-of-range @nid, -ENOMEM on
+ * allocation failure, -EEXIST if the node already has an attribute, or the
+ * error returned by sysfs_create_file().
+ */
+static int sysfs_bw_node_add(int nid)
+{
+	struct iw_node_attr *bw_attr;
+	char *attr_name;
+	int err;
+
+	if (nid < 0 || nid >= nr_node_ids) {
+		pr_err("invalid node id: %d\n", nid);
+		return -EINVAL;
+	}
+
+	bw_attr = kzalloc(sizeof(*bw_attr), GFP_KERNEL);
+	if (!bw_attr)
+		return -ENOMEM;
+
+	attr_name = kasprintf(GFP_KERNEL, "bw_node%d", nid);
+	if (!attr_name) {
+		kfree(bw_attr);
+		return -ENOMEM;
+	}
+
+	sysfs_attr_init(&bw_attr->kobj_attr.attr);
+	bw_attr->nid = nid;
+	bw_attr->kobj_attr.show = bw_node_show;
+	bw_attr->kobj_attr.store = bw_node_store;
+	bw_attr->kobj_attr.attr.mode = 0644;
+	bw_attr->kobj_attr.attr.name = attr_name;
+
+	/* kobj_lock serialises updates of the bw_attrs[] slots. */
+	mutex_lock(&wi_group->kobj_lock);
+	err = wi_group->bw_attrs[nid] ? -EEXIST :
+	      sysfs_create_file(&wi_group->wi_kobj, &bw_attr->kobj_attr.attr);
+	if (!err)
+		wi_group->bw_attrs[nid] = bw_attr;
+	mutex_unlock(&wi_group->kobj_lock);
+
+	if (err) {
+		/* Not published: release the name and the attribute. */
+		kfree(attr_name);
+		kfree(bw_attr);
+	}
+	return err;
+}
+
+/* Remove and free the bw_node<nid> attribute and clear its bw_attrs[] slot. */
+static void sysfs_bw_node_delete(int nid)
+{
+	struct iw_node_attr *attr;
+
+	if (nid < 0 || nid >= nr_node_ids)
+		return;
+
+	mutex_lock(&wi_group->kobj_lock);
+	attr = wi_group->bw_attrs[nid];
+	if (attr) {
+		sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+		kfree(attr->kobj_attr.attr.name);
+		kfree(attr);
+		wi_group->bw_attrs[nid] = NULL; /* allow a later re-add */
+	}
+	mutex_unlock(&wi_group->kobj_lock);
+}
+#endif
+
 static int wi_node_notifier(struct notifier_block *nb,
 			       unsigned long action, void *data)
 {
@@ -3868,9 +3994,22 @@ static int wi_node_notifier(struct notifier_block *nb,
 		if (err)
 			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
 			       nid, err);
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+		err = sysfs_bw_node_add(nid);
+		if (err)
+			pr_err("failed to add sysfs bw_node%d: %d\n",
+			       nid, err);
+#endif
 		break;
+
 	case NODE_REMOVED_LAST_MEMORY:
 		sysfs_wi_node_delete(nid);
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+		sysfs_bw_node_delete(nid);
+#endif
+
 		break;
 	}
 
@@ -3906,6 +4045,15 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
 			       nid, err);
 			goto err_cleanup_kobj;
 		}
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+		err = sysfs_bw_node_add(nid);
+		if (err) {
+			pr_err("failed to add sysfs bw_node%d during init: %d\n", nid, err);
+			goto err_cleanup_kobj;
+		}
+#endif
+
 	}
 
 	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-03-13  3:51 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-12  9:12 [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth YeeLi
2026-03-12  9:42 ` Huang, Ying
2026-03-12 10:26   ` Yee Li
2026-03-12 11:58 ` Jonathan Cameron
2026-03-13  3:05   ` Yee Li
2026-03-12 15:00 ` Joshua Hahn
2026-03-12 15:05   ` Joshua Hahn
2026-03-13  3:39   ` Yee Li
2026-03-12 16:12 ` Gregory Price
2026-03-13  3:51   ` Yee Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox