* [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
@ 2026-03-12 9:12 YeeLi
2026-03-12 9:42 ` Huang, Ying
` (3 more replies)
0 siblings, 4 replies; 10+ messages in thread
From: YeeLi @ 2026-03-12 9:12 UTC (permalink / raw)
To: akpm, david, dan.j.williams, ying.huang, linux-mm, joshua.hahnjy
Cc: linux-kernel, Jonathan.Cameron, linux-cxl, dave.jiang, yeeli
From: yeeli <seven.yi.lee@gmail.com>
Automatic tuning for weighted interleaving [1] provides real benefits on
systems with CXL support. However, platforms that lack HMAT or CDAT
information cannot make use of this feature.
If the bandwidth reported by firmware or the device deviates from the
actual measured bandwidth, administrators also lack a clear way to adjust
the per-node weight values.
This patch introduces an optional Kconfig option,
CONFIG_NUMA_BW_MANUAL_OVERRIDE (default n), which exposes node bandwidth
R/W sysfs attributes under:
/sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN
The sysfs files are created and removed dynamically on node hotplug
events, in sync with the existing weighted_interleave/nodeN attributes.
Userspace can write a single bandwidth value (in MB/s) to override both
read_bandwidth and write_bandwidth for the corresponding NUMA node. The
value is then propagated to the internal node_bw_table via
mempolicy_set_node_perf().
This interface is intended for debugging and experimentation only.
[1] Link:
https://lkml.kernel.org/r/20250505182328.4148265-1-joshua.hahnjy@gmail.com
Signed-off-by: yeeli <seven.yi.lee@gmail.com>
---
mm/Kconfig | 20 +++++++
mm/mempolicy.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 168 insertions(+)
diff --git a/mm/Kconfig b/mm/Kconfig
index bd0ea5454af8..40554df18edc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1441,6 +1441,26 @@ config NUMA_EMU
into virtual nodes when booted with "numa=fake=N", where N is the
number of nodes. This is only useful for debugging.
+config NUMA_BW_MANUAL_OVERRIDE
+ bool "Allow manual override of per-NUMA-node bandwidth for weighted interleave"
+ depends on NUMA && SYSFS
+ default n
+ help
+ This option exposes writable sysfs attributes under
+ /sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN, allowing
+ userspace to manually set read/write bandwidth values for each NUMA node.
+
+ These values update the internal node_bw_table and can influence
+ weighted interleave auto-tuning (if enabled).
+
+ WARNING: This is intended for debugging, development, or platforms
+ with incorrect HMAT/CDAT firmware data. Overriding hardware-reported
+ bandwidth can lead to suboptimal performance, instability, or
+ incorrect resource allocation decisions.
+
+ Say N unless you are actively developing or debugging bandwidth-aware
+ memory policies.
+
config ARCH_HAS_USER_SHADOW_STACK
bool
help
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 68a98ba57882..0b7f42491748 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -226,6 +226,7 @@ int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+
if (!new_bw)
return -ENOMEM;
@@ -3614,6 +3615,9 @@ struct iw_node_attr {
struct sysfs_wi_group {
struct kobject wi_kobj;
struct mutex kobj_lock;
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+ struct iw_node_attr *bw_attrs[MAX_NUMNODES];
+#endif
struct iw_node_attr *nattrs[];
};
@@ -3855,6 +3859,128 @@ static int sysfs_wi_node_add(int nid)
return ret;
}
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+static ssize_t bw_node_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct iw_node_attr *node_attr;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+
+ /*A Node without CDAT or HMAT*/
+ if (!node_bw_table)
+ return sprintf(buf, "N/A\n");
+
+ if (!node_bw_table[node_attr->nid])
+ return sprintf(buf, "0\n");
+
+ return sprintf(buf, "%u(MB/s)\n", node_bw_table[node_attr->nid]);
+}
+
+static ssize_t bw_node_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct iw_node_attr *node_attr;
+ unsigned long val = 0;
+ int ret;
+ struct access_coordinate coords = {
+ .read_bandwidth = 0,
+ .write_bandwidth = 0,
+ };
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+
+ ret = kstrtoul(buf, 0, &val);
+
+ coords.read_bandwidth = val;
+ coords.write_bandwidth = val;
+
+ if (ret)
+ return ret;
+
+ if (val > UINT_MAX)
+ return -EINVAL;
+
+ ret = mempolicy_set_node_perf(node_attr->nid, &coords);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static int sysfs_bw_node_add(int nid)
+{
+ int ret;
+ char *name;
+ struct iw_node_attr *new_attr;
+
+ if (nid < 0 || nid >= nr_node_ids) {
+ pr_err("invalid node id: %d\n", nid);
+ return -EINVAL;
+ }
+
+ new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+ if (!new_attr)
+ return -ENOMEM;
+
+ name = kasprintf(GFP_KERNEL, "bw_node%d", nid);
+ if (!name) {
+ kfree(new_attr);
+ return -ENOMEM;
+ }
+
+ sysfs_attr_init(&new_attr->kobj_attr.attr);
+ new_attr->kobj_attr.attr.name = name;
+ new_attr->kobj_attr.attr.mode = 0644;
+ new_attr->kobj_attr.show = bw_node_show;
+ new_attr->kobj_attr.store = bw_node_store;
+ new_attr->nid = nid;
+
+ mutex_lock(&wi_group->kobj_lock);
+ if (wi_group->bw_attrs[nid]) {
+ mutex_unlock(&wi_group->kobj_lock);
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
+
+ if (ret) {
+ mutex_unlock(&wi_group->kobj_lock);
+ goto out;
+ }
+ wi_group->bw_attrs[nid] = new_attr;
+ mutex_unlock(&wi_group->kobj_lock);
+ return 0;
+
+out:
+ kfree(new_attr->kobj_attr.attr.name);
+ kfree(new_attr);
+ return ret;
+}
+
+static void sysfs_bw_node_delete(int nid)
+{
+ struct iw_node_attr *attr;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return;
+
+ mutex_lock(&wi_group->kobj_lock);
+ attr = wi_group->bw_attrs[nid];
+
+ if (attr) {
+ sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+ kfree(attr->kobj_attr.attr.name);
+ kfree(attr);
+ wi_group->nattrs[nid] = NULL;
+ }
+ mutex_unlock(&wi_group->kobj_lock);
+}
+#endif
+
static int wi_node_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -3868,9 +3994,22 @@ static int wi_node_notifier(struct notifier_block *nb,
if (err)
pr_err("failed to add sysfs for node%d during hotplug: %d\n",
nid, err);
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+ err = sysfs_bw_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs bw_node%d: %d\n",
+ nid, err);
+#endif
break;
+
case NODE_REMOVED_LAST_MEMORY:
sysfs_wi_node_delete(nid);
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+ sysfs_bw_node_delete(nid);
+#endif
+
break;
}
@@ -3906,6 +4045,15 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
nid, err);
goto err_cleanup_kobj;
}
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+ err = sysfs_bw_node_add(nid);
+ if (err) {
+ pr_err("failed to add sysfs bw_node%d during init: %d\n", nid, err);
+ goto err_cleanup_kobj;
+ }
+#endif
+
}
hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
--
2.34.1
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 9:12 [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth YeeLi
@ 2026-03-12 9:42 ` Huang, Ying
2026-03-12 10:26 ` Yee Li
2026-03-12 11:58 ` Jonathan Cameron
` (2 subsequent siblings)
3 siblings, 1 reply; 10+ messages in thread
From: Huang, Ying @ 2026-03-12 9:42 UTC (permalink / raw)
To: YeeLi
Cc: akpm, david, dan.j.williams, linux-mm, joshua.hahnjy,
linux-kernel, Jonathan.Cameron, linux-cxl, dave.jiang
YeeLi <seven.yi.lee@gmail.com> writes:
> From: yeeli <seven.yi.lee@gmail.com>
>
> Automatic tuning for weighted interleaving [1] provides real benefits on
> systems with CXL support. However, platforms that lack HMAT or CDAT
> information cannot make use of this feature.
>
> If the bandwidth reported by firmware or the device deviates from the
> actual measured bandwidth, administrators also lack a clear way to adjust
> the per-node weight values.
>
> This patch introduces an optional Kconfig option,
> CONFIG_NUMA_BW_MANUAL_OVERRIDE (default n), which exposes node bandwidth
> R/W sysfs attributes under:
>
> /sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN
>
> The sysfs files are created and removed dynamically on node hotplug
> events, in sync with the existing weighted_interleave/nodeN attributes.
>
> Userspace can write a single bandwidth value (in MB/s) to override both
> read_bandwidth and write_bandwidth for the corresponding NUMA node. The
> value is then propagated to the internal node_bw_table via
> mempolicy_set_node_perf().
>
> This interface is intended for debugging and experimentation only.
How to use this new interface? Can you provide an example?
---
Best Regards,
Huang, Ying
> [1] Link:
> https://lkml.kernel.org/r/20250505182328.4148265-1-joshua.hahnjy@gmail.com
>
[snip]
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 9:42 ` Huang, Ying
@ 2026-03-12 10:26 ` Yee Li
0 siblings, 0 replies; 10+ messages in thread
From: Yee Li @ 2026-03-12 10:26 UTC (permalink / raw)
To: Huang, Ying
Cc: akpm, david, dan.j.williams, linux-mm, joshua.hahnjy,
linux-kernel, Jonathan.Cameron, linux-cxl, dave.jiang
> How to use this new interface? Can you provide an example?
Hi Ying,
A quick example:
1. Check current (original) bandwidth
cd /sys/kernel/mm/mempolicy/weighted_interleave
$ cat bw_node*
20000(MB/s)
10000(MB/s)
0
2. Check current interleave weights
$ cat node*
21
10
1
3. Override a node's bandwidth
$ echo 10000 > bw_node2
4. After override, check results:
$ cat bw_node*
20000(MB/s)
10000(MB/s)
10000(MB/s)
$ cat node*
2
1
1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 9:12 [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth YeeLi
2026-03-12 9:42 ` Huang, Ying
@ 2026-03-12 11:58 ` Jonathan Cameron
2026-03-13 3:05 ` Yee Li
2026-03-12 15:00 ` Joshua Hahn
2026-03-12 16:12 ` Gregory Price
3 siblings, 1 reply; 10+ messages in thread
From: Jonathan Cameron @ 2026-03-12 11:58 UTC (permalink / raw)
To: YeeLi
Cc: akpm, david, dan.j.williams, ying.huang, linux-mm, joshua.hahnjy,
linux-kernel, linux-cxl, dave.jiang
On Thu, 12 Mar 2026 17:12:07 +0800
YeeLi <seven.yi.lee@gmail.com> wrote:
> From: yeeli <seven.yi.lee@gmail.com>
>
> Automatic tuning for weighted interleaving [1] provides real benefits on
> systems with CXL support. However, platforms that lack HMAT or CDAT
> information cannot make use of this feature.
>
> If the bandwidth reported by firmware or the device deviates from the
> actual measured bandwidth, administrators also lack a clear way to adjust
> the per-node weight values.
>
> This patch introduces an optional Kconfig option,
> CONFIG_NUMA_BW_MANUAL_OVERRIDE (default n), which exposes node bandwidth
> R/W sysfs attributes under:
>
> /sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN
>
> The sysfs files are created and removed dynamically on node hotplug
> events, in sync with the existing weighted_interleave/nodeN attributes.
>
> Userspace can write a single bandwidth value (in MB/s) to override both
> read_bandwidth and write_bandwidth for the corresponding NUMA node. The
> value is then propagated to the internal node_bw_table via
> mempolicy_set_node_perf().
>
> This interface is intended for debugging and experimentation only.
>
> [1] Link:
> https://lkml.kernel.org/r/20250505182328.4148265-1-joshua.hahnjy@gmail.com
>
> Signed-off-by: yeeli <seven.yi.lee@gmail.com>
As I note below, I'm not convinced this is the best layer to be injecting
data at if we want a debug interface that allows us to reflect real
hardware. Might be the sort of patch that is useful outside the tree though
so thanks for sharing it!
> ---
> mm/Kconfig | 20 +++++++
> mm/mempolicy.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 168 insertions(+)
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index bd0ea5454af8..40554df18edc 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1441,6 +1441,26 @@ config NUMA_EMU
> into virtual nodes when booted with "numa=fake=N", where N is the
> number of nodes. This is only useful for debugging.
>
> +config NUMA_BW_MANUAL_OVERRIDE
> + bool "Allow manual override of per-NUMA-node bandwidth for weighted interleave"
> + depends on NUMA && SYSFS
> + default n
Drop this. Everything is default n (by default!)
> + help
> + This option exposes writable sysfs attributes under
> + /sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN, allowing
> + userspace to manually set read/write bandwidth values for each NUMA node.
> +
> + These values update the internal node_bw_table and can influence
> + weighted interleave auto-tuning (if enabled).
> +
> + WARNING: This is intended for debugging, development, or platforms
> + with incorrect HMAT/CDAT firmware data. Overriding hardware-reported
> + bandwidth can lead to suboptimal performance, instability, or
> + incorrect resource allocation decisions.
If it was an SRAT (for GPs) / HMAT issue I'd suggest table injection is
an adequate way to do testing and debug.
CDAT is trickier, but maybe we should be thinking about a debug path to inject
that as well - similar to what we do for ACPI tables.
From my point of view I'd rather see debug / testing interfaces that test the
whole flow rather than just this top layer control.
However, I can see this is a useful patch for developers. But if you
are developing debugging bandwidth-aware memory policies, I'm going to assume
you don't mind building a kernel and adding this patch to make your life
easier!
So to me, useful tool but I'm not 'yet' convinced it should be upstream.
> +
> + Say N unless you are actively developing or debugging bandwidth-aware
> + memory policies.
> +
> config ARCH_HAS_USER_SHADOW_STACK
> bool
> help
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 68a98ba57882..0b7f42491748 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -226,6 +226,7 @@ int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
>
> bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
> new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
> +
Stray change. Always check for these bnefore posting as they slow down
patch sets for no good reason.
> if (!new_bw)
> return -ENOMEM;
>
> @@ -3614,6 +3615,9 @@ struct iw_node_attr {
> struct sysfs_wi_group {
> struct kobject wi_kobj;
> struct mutex kobj_lock;
> +#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
> + struct iw_node_attr *bw_attrs[MAX_NUMNODES];
Ok. It's debug so I guess this is ok rather than what is
done for nattrs with nr_node_ids.
Alternative would be a pointer and a separate allocation.
> +#endif
> struct iw_node_attr *nattrs[];
> };
>
> @@ -3855,6 +3859,128 @@ static int sysfs_wi_node_add(int nid)
> return ret;
> }
>
> +#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
> +static ssize_t bw_node_show(struct kobject *kobj,
> + struct kobj_attribute *attr,
> + char *buf)
Can fit on fewer lines without going past 80 chars.
> +{
> + struct iw_node_attr *node_attr;
> +
> + node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
> +
> + /*A Node without CDAT or HMAT*/
/* A node without either CDAT or HMAT */
> + if (!node_bw_table)
> + return sprintf(buf, "N/A\n");
> +
> + if (!node_bw_table[node_attr->nid])
> + return sprintf(buf, "0\n");
> +
> + return sprintf(buf, "%u(MB/s)\n", node_bw_table[node_attr->nid]);
> +}
> +
> +static ssize_t bw_node_store(struct kobject *kobj,
> + struct kobj_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct iw_node_attr *node_attr;
> + unsigned long val = 0;
> + int ret;
> + struct access_coordinate coords = {
> + .read_bandwidth = 0,
> + .write_bandwidth = 0,
> + };
> +
> + node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
Can do that at declaration of node_Attr above.
> +
> + ret = kstrtoul(buf, 0, &val);
Check ret before filling in values.
> +
> + coords.read_bandwidth = val;
> + coords.write_bandwidth = val;
> +
> + if (ret)
> + return ret;
> +
> + if (val > UINT_MAX)
> + return -EINVAL;
> +
> + ret = mempolicy_set_node_perf(node_attr->nid, &coords);
> + if (ret)
> + return ret;
> +
> + return count;
> +}
> +
> +static int sysfs_bw_node_add(int nid)
> +{
> + int ret;
> + char *name;
> + struct iw_node_attr *new_attr;
> +
> + if (nid < 0 || nid >= nr_node_ids) {
> + pr_err("invalid node id: %d\n", nid);
> + return -EINVAL;
> + }
> +
> + new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
> + if (!new_attr)
> + return -ENOMEM;
> +
> + name = kasprintf(GFP_KERNEL, "bw_node%d", nid);
> + if (!name) {
> + kfree(new_attr);
> + return -ENOMEM;
> + }
> +
> + sysfs_attr_init(&new_attr->kobj_attr.attr);
> + new_attr->kobj_attr.attr.name = name;
> + new_attr->kobj_attr.attr.mode = 0644;
> + new_attr->kobj_attr.show = bw_node_show;
> + new_attr->kobj_attr.store = bw_node_store;
> + new_attr->nid = nid;
> +
> + mutex_lock(&wi_group->kobj_lock);
> + if (wi_group->bw_attrs[nid]) {
Add another label and put the mutex unlock in the error path.
> + mutex_unlock(&wi_group->kobj_lock);
> + ret = -EEXIST;
> + goto out;
> + }
> +
> + ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
> +
No blank line here.
> + if (ret) {
> + mutex_unlock(&wi_group->kobj_lock);
> + goto out;
> + }
> + wi_group->bw_attrs[nid] = new_attr;
> + mutex_unlock(&wi_group->kobj_lock);
> + return 0;
> +
> +out:
> + kfree(new_attr->kobj_attr.attr.name);
> + kfree(new_attr);
> + return ret;
> +}
> +
> +static void sysfs_bw_node_delete(int nid)
> +{
> + struct iw_node_attr *attr;
> +
> + if (nid < 0 || nid >= nr_node_ids)
> + return;
> +
> + mutex_lock(&wi_group->kobj_lock);
> + attr = wi_group->bw_attrs[nid];
> +
> + if (attr) {
> + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
> + kfree(attr->kobj_attr.attr.name);
> + kfree(attr);
> + wi_group->nattrs[nid] = NULL;
> + }
> + mutex_unlock(&wi_group->kobj_lock);
> +}
> +#endif
> +
> static int wi_node_notifier(struct notifier_block *nb,
> unsigned long action, void *data)
> {
> @@ -3868,9 +3994,22 @@ static int wi_node_notifier(struct notifier_block *nb,
> if (err)
> pr_err("failed to add sysfs for node%d during hotplug: %d\n",
> nid, err);
> +
> +#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
> + err = sysfs_bw_node_add(nid);
> + if (err)
> + pr_err("failed to add sysfs bw_node%d: %d\n",
> + nid, err);
> +#endif
> break;
> +
> case NODE_REMOVED_LAST_MEMORY:
> sysfs_wi_node_delete(nid);
> +
> +#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
Probably better to stub the function than to scatter ifdefs throughout.
> + sysfs_bw_node_delete(nid);
> +#endif
> +
> break;
> }
>
> @@ -3906,6 +4045,15 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
> nid, err);
> goto err_cleanup_kobj;
> }
> +
> +#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
> + err = sysfs_bw_node_add(nid);
> + if (err) {
> + pr_err("failed to add sysfs bw_node%d during init: %d\n", nid, err);
> + goto err_cleanup_kobj;
> + }
> +#endif
> +
> }
>
> hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 11:58 ` Jonathan Cameron
@ 2026-03-13 3:05 ` Yee Li
0 siblings, 0 replies; 10+ messages in thread
From: Yee Li @ 2026-03-13 3:05 UTC (permalink / raw)
To: Jonathan Cameron
Cc: akpm, david, dan.j.williams, ying.huang, linux-mm, joshua.hahnjy,
linux-kernel, linux-cxl, dave.jiang
Hi Jonathan,
Thanks for the review.
> As I note below, I'm not convinced this is the best layer to be injecting
> data at if we want a debug interface that allows us to reflect real
> hardware. Might be the sort of patch that is useful outside the tree though
> so thanks for sharing it!
Perhaps a small tool to compute weights from the desired bandwidth
would be a better approach.
> If it was an SRAT (for GPs) / HMAT issue I'd suggest table injection is
> an adequate way to do testing and debug.
>
> CDAT is trickier, but maybe we should be thinking about a debug path to inject
> that as well - similar to what we do for ACPI tables.
>
> From my point of view I'd rather see debug / testing interfaces that test the
> whole flow rather than just this top layer control.
>
> However, I can see this is a useful patch for developers. But if you
> are developing debugging bandwidth-aware memory policies, I'm going to assume
> you don't mind building a kernel and adding this patch to make your life
> easier!
>
> So to me, useful tool but I'm not 'yet' convinced it should be upstream.
You are right, table injection is likely the better approach.
During early hardware development the platform is often unstable, and
firmware may not be able to report it accurately. Letting the OS handle
debugging and evaluate the results can make experimentation easier.
And, thank you for checking the code.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 9:12 [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth YeeLi
2026-03-12 9:42 ` Huang, Ying
2026-03-12 11:58 ` Jonathan Cameron
@ 2026-03-12 15:00 ` Joshua Hahn
2026-03-12 15:05 ` Joshua Hahn
2026-03-13 3:39 ` Yee Li
2026-03-12 16:12 ` Gregory Price
3 siblings, 2 replies; 10+ messages in thread
From: Joshua Hahn @ 2026-03-12 15:00 UTC (permalink / raw)
To: YeeLi
Cc: akpm, david, dan.j.williams, ying.huang, linux-mm, joshua.hahnjy,
linux-kernel, Jonathan.Cameron, linux-cxl, dave.jiang, gourry
On Thu, 12 Mar 2026 17:12:07 +0800 YeeLi <seven.yi.lee@gmail.com> wrote:
Hello YeeLi,
I hope you are doing well! Thank you for the patch, I think it is an
interesting and helpful debugging tool.
> Automatic tuning for weighted interleaving [1] provides real benefits on
> systems with CXL support. However, platforms that lack HMAT or CDAT
> information cannot make use of this feature.
Thank you for the kind words : -) I'm also CCing Gregory Price, who
is the original author of the weighted interleave mempolicy.
> If the bandwidth reported by firmware or the device deviates from the
> actual measured bandwidth, administrators also lack a clear way to adjust
> the per-node weight values.
I just wanted to touch on this. Is this true? Weighted interleave already
exposes sysfs that lets the user set the weights they want. On one hand
I think it would be helpful for there to be a tool to see what the
effects of setting bandwidth information on a system could be, if it
had other side effects. I'm not sure if there is a strong case to do this
purely for weighted interleave auto-tuning, which already has a manual
weight override and pretty simple calculations : -)
> This patch introduces an optional Kconfig option,
> CONFIG_NUMA_BW_MANUAL_OVERRIDE (default n), which exposes node bandwidth
> R/W sysfs attributes under:
>
> /sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN
>
> The sysfs files are created and removed dynamically on node hotplug
> events, in sync with the existing weighted_interleave/nodeN attributes.
>
> Userspace can write a single bandwidth value (in MB/s) to override both
> read_bandwidth and write_bandwidth for the corresponding NUMA node. The
> value is then propagated to the internal node_bw_table via
> mempolicy_set_node_perf().
>
> This interface is intended for debugging and experimentation only.
Thank you for the helpful clarification. Just to be sure, are there
other intended things that the user can test other than what weights
weighted interleave would resolve to?
Thank you again for the patch, I hope you have a great day!
Joshua
> [1] Link:
> https://lkml.kernel.org/r/20250505182328.4148265-1-joshua.hahnjy@gmail.com
>
> Signed-off-by: yeeli <seven.yi.lee@gmail.com>
> ---
> mm/Kconfig | 20 +++++++
> mm/mempolicy.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 168 insertions(+)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 15:00 ` Joshua Hahn
@ 2026-03-12 15:05 ` Joshua Hahn
2026-03-13 3:39 ` Yee Li
1 sibling, 0 replies; 10+ messages in thread
From: Joshua Hahn @ 2026-03-12 15:05 UTC (permalink / raw)
To: Joshua Hahn
Cc: YeeLi, akpm, david, dan.j.williams, ying.huang, linux-mm,
linux-kernel, Jonathan.Cameron, linux-cxl, dave.jiang, gourry
On Thu, 12 Mar 2026 08:00:21 -0700 Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
> On Thu, 12 Mar 2026 17:12:07 +0800 YeeLi <seven.yi.lee@gmail.com> wrote:
Cc-ing Gregory's REAL address, this time... sorry for the noise!
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 15:00 ` Joshua Hahn
2026-03-12 15:05 ` Joshua Hahn
@ 2026-03-13 3:39 ` Yee Li
1 sibling, 0 replies; 10+ messages in thread
From: Yee Li @ 2026-03-13 3:39 UTC (permalink / raw)
To: Joshua Hahn
Cc: akpm, david, dan.j.williams, ying.huang, linux-mm, linux-kernel,
Jonathan.Cameron, linux-cxl, dave.jiang, gourry
Hi Joshua,
Thanks for your feedback and kind wishes.
> I just wanted to touch on this. Is this true? Weighted interleave already
> exposes sysfs that lets the user set the weights they want. On one hand
> I think it would be helpful for there to be a tool to see what the
> effects of setting bandwidth information on a system could be, if it
> had other side effects. I'm not sure if there is a strong case to do this
> purely for weighted interleave auto-tuning, which already has a manual
> weight override and pretty simple calculations : -)
I agree — writing a small userspace tool to compute the weights based on
actual measured bandwidths (from mlc or stream) is a really good approach.
> Thank you for the helpful clarification. Just to be sure, are there
> other intended things that the user can test other than what weights
> weighted interleave would resolve to?
We can continue DRAM + CXL weighted interleave performance testing
later, following Gregory Price's experiments [1].
[1] Link:
https://lore.kernel.org/linux-mm/20240103224209.2541-1-gregory.price@memverge.com/
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 9:12 [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth YeeLi
` (2 preceding siblings ...)
2026-03-12 15:00 ` Joshua Hahn
@ 2026-03-12 16:12 ` Gregory Price
2026-03-13 3:51 ` Yee Li
3 siblings, 1 reply; 10+ messages in thread
From: Gregory Price @ 2026-03-12 16:12 UTC (permalink / raw)
To: YeeLi
Cc: akpm, david, dan.j.williams, ying.huang, linux-mm, joshua.hahnjy,
linux-kernel, Jonathan.Cameron, linux-cxl, dave.jiang
On Thu, Mar 12, 2026 at 05:12:07PM +0800, YeeLi wrote:
> From: yeeli <seven.yi.lee@gmail.com>
>
> If the bandwidth reported by firmware or the device deviates from the
> actual measured bandwidth, administrators also lack a clear way to adjust
> the per-node weight values.
>
If userland is setting the raw bandwidth stat, why can't they do a tiny
bit of math to figure out the preferred weighting and just write the
weights?
>
> Userspace can write a single bandwidth value (in MB/s) to override both
> read_bandwidth and write_bandwidth for the corresponding NUMA node. The
> value is then propagated to the internal node_bw_table via
> mempolicy_set_node_perf().
>
> This interface is intended for debugging and experimentation only.
>
Should be posted as [RFC] then, I don't think we want this upstream.
~Gregory
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth
2026-03-12 16:12 ` Gregory Price
@ 2026-03-13 3:51 ` Yee Li
0 siblings, 0 replies; 10+ messages in thread
From: Yee Li @ 2026-03-13 3:51 UTC (permalink / raw)
To: Gregory Price
Cc: akpm, david, dan.j.williams, ying.huang, linux-mm, joshua.hahnjy,
linux-kernel, Jonathan.Cameron, linux-cxl, dave.jiang
Hi Gregory,
> If userland is setting the raw bandwidth stat, why can't they do a tiny
> bit of math to figure out the preferred weighting and just write the
> weights?
We're not setting the raw bandwidth stat directly, but overriding
the top-level bandwidth to observe the final effect.
> Should be posted as [RFC] then, I don't think we want this upstream.
Thanks for the suggestion.
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2026-03-13 3:51 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-12 9:12 [PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth YeeLi
2026-03-12 9:42 ` Huang, Ying
2026-03-12 10:26 ` Yee Li
2026-03-12 11:58 ` Jonathan Cameron
2026-03-13 3:05 ` Yee Li
2026-03-12 15:00 ` Joshua Hahn
2026-03-12 15:05 ` Joshua Hahn
2026-03-13 3:39 ` Yee Li
2026-03-12 16:12 ` Gregory Price
2026-03-13 3:51 ` Yee Li
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox