* [PATCH v3 1/4] mm/page_owner: add filter infrastructure
2026-04-28 7:11 [PATCH v3 0/4] mm/page_owner: add filter infrastructure for print_mode and NUMA filtering Zhen Ni
@ 2026-04-28 7:11 ` Zhen Ni
2026-04-28 7:11 ` [PATCH v3 2/4] mm/page_owner: add print_mode filter Zhen Ni
` (3 subsequent siblings)
4 siblings, 0 replies; 18+ messages in thread
From: Zhen Ni @ 2026-04-28 7:11 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
Add data structure for page_owner filtering functionality and create
debugfs directory for filter controls.
This adds:
- enum page_owner_print_mode with values for full_stack and stack_handle
- struct page_owner_filter with print_mode and nid_mask fields
- Static owner_filter instance initialized with default values
- page_owner_filter debugfs directory
The filter infrastructure will be used to add print_mode and NUMA node
filtering capabilities in subsequent commits.
Link: https://lore.kernel.org/linux-mm/20260417154638.22370-2-zhen.ni@easystack.cn/
Suggested-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Changes in v2:
- Use enum page_owner_print_mode instead of bool 'compact' for better clarity
- Use nodemask_t instead of int 'nid' to support multi-node filtering
Changes in v3:
- No code changes
---
mm/page_owner.c | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 8178e0be557f..5884d883837e 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -54,6 +54,21 @@ struct stack_print_ctx {
u8 flags;
};
+enum page_owner_print_mode {
+ PAGE_OWNER_PRINT_FULL_STACK,
+ PAGE_OWNER_PRINT_STACK_HANDLE,
+};
+
+struct page_owner_filter {
+ enum page_owner_print_mode print_mode;
+ nodemask_t nid_mask;
+};
+
+static struct page_owner_filter owner_filter = {
+ .print_mode = PAGE_OWNER_PRINT_FULL_STACK,
+ .nid_mask = NODE_MASK_NONE,
+};
+
static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
@@ -973,7 +988,7 @@ DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get,
static int __init pageowner_init(void)
{
- struct dentry *dir;
+ struct dentry *dir, *filter_dir;
if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
@@ -981,6 +996,9 @@ static int __init pageowner_init(void)
}
debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops);
+
+ filter_dir = debugfs_create_dir("page_owner_filter", NULL);
+
dir = debugfs_create_dir("page_owner_stacks", NULL);
debugfs_create_file("show_stacks", 0400, dir,
(void *)(STACK_PRINT_FLAG_STACK |
--
2.20.1
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH v3 2/4] mm/page_owner: add print_mode filter
2026-04-28 7:11 [PATCH v3 0/4] mm/page_owner: add filter infrastructure for print_mode and NUMA filtering Zhen Ni
2026-04-28 7:11 ` [PATCH v3 1/4] mm/page_owner: add filter infrastructure Zhen Ni
@ 2026-04-28 7:11 ` Zhen Ni
2026-04-29 0:57 ` SeongJae Park
2026-04-28 7:11 ` [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support Zhen Ni
` (2 subsequent siblings)
4 siblings, 1 reply; 18+ messages in thread
From: Zhen Ni @ 2026-04-28 7:11 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
Add print_mode functionality to reduce page_owner output size by
printing only the stack handle instead of the full stack trace.
Example output with print_mode enabled:
Page allocated via order 0, mask 0x42800(GFP_NOWAIT|__GFP_COMP),
pid 1, tgid 1 (systemd), ts 349667370 ns
PFN 0xa00a2 type Unmovable Block 1280 type Unmovable
Flags 0x33fffe0000004124(referenced|lru|active|private|node=3|zone=0|
lastcpupid=0x1ffff)
handle: 17432583
Charged to memcg /
Print mode significantly reduces output size while preserving all
other page allocation information. The correspondence between handles
and stack traces can be obtained through the show_stacks_handles interface.
Link: https://lore.kernel.org/linux-mm/20260417154638.22370-3-zhen.ni@easystack.cn/
Suggested-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Changes in v2:
- Renamed from 'compact mode' to 'print_mode' for better clarity
- Use enum values (0=full_stack, 1=stack_handle) instead of boolean
- Update debugfs filename from 'compact' to 'print_mode'
Changes in v3:
- No code changes
---
mm/page_owner.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 5884d883837e..6d87b6948cfa 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -590,7 +590,13 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
migratetype_names[pageblock_mt],
&page->flags);
- ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
+ /* Print mode: full stack or stack handle */
+ if (READ_ONCE(owner_filter.print_mode) == PAGE_OWNER_PRINT_STACK_HANDLE) {
+ ret += scnprintf(kbuf + ret, count - ret,
+ "handle: %d\n", handle);
+ } else {
+ ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
+ }
if (ret >= count)
goto err;
@@ -985,6 +991,24 @@ static int page_owner_threshold_set(void *data, u64 val)
DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get,
&page_owner_threshold_set, "%llu");
+static int page_owner_print_mode_get(void *data, u64 *val)
+{
+ *val = READ_ONCE(owner_filter.print_mode);
+ return 0;
+}
+
+static int page_owner_print_mode_set(void *data, u64 val)
+{
+ if (val > PAGE_OWNER_PRINT_STACK_HANDLE)
+ return -EINVAL;
+ WRITE_ONCE(owner_filter.print_mode, val);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(page_owner_print_mode_fops,
+ &page_owner_print_mode_get,
+ &page_owner_print_mode_set, "%lld");
+
static int __init pageowner_init(void)
{
@@ -998,6 +1022,8 @@ static int __init pageowner_init(void)
debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops);
filter_dir = debugfs_create_dir("page_owner_filter", NULL);
+ debugfs_create_file("print_mode", 0600, filter_dir, NULL,
+ &page_owner_print_mode_fops);
dir = debugfs_create_dir("page_owner_stacks", NULL);
debugfs_create_file("show_stacks", 0400, dir,
--
2.20.1
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH v3 2/4] mm/page_owner: add print_mode filter
2026-04-28 7:11 ` [PATCH v3 2/4] mm/page_owner: add print_mode filter Zhen Ni
@ 2026-04-29 0:57 ` SeongJae Park
2026-04-29 8:19 ` zhen.ni
0 siblings, 1 reply; 18+ messages in thread
From: SeongJae Park @ 2026-04-29 0:57 UTC (permalink / raw)
To: Zhen Ni
Cc: SeongJae Park, akpm, vbabka, surenb, mhocko, jackmanb, hannes,
ziy, linux-mm, linux-kernel
On Tue, 28 Apr 2026 15:11:10 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> Add print_mode functionality to reduce page_owner output size by
> printing only the stack handle instead of the full stack trace.
>
> Example output with print_mode enabled:
> Page allocated via order 0, mask 0x42800(GFP_NOWAIT|__GFP_COMP),
> pid 1, tgid 1 (systemd), ts 349667370 ns
> PFN 0xa00a2 type Unmovable Block 1280 type Unmovable
> Flags 0x33fffe0000004124(referenced|lru|active|private|node=3|zone=0|
> lastcpupid=0x1ffff)
> handle: 17432583
> Charged to memcg /
Looks nice to me. I have just trivial cosmetic comments below.
>
> Print mode significantly reduces output size while preserving all
> other page allocation information. The correspondence between handles
> and stack traces can be obtained through the show_stacks_handles interface.
I understand the mode was introduced for the stack handle mode, but it exists
for both full stack and stack handle mode? The wording makes me assume the
mode is only enabled vs disabled (boolean). It is true that there are only two
modes, but I feel like this commit message might be better written.
>
> Link: https://lore.kernel.org/linux-mm/20260417154638.22370-3-zhen.ni@easystack.cn/
Seems this is a link to v1 of this patch. I think adding the context or moving
this to the changelog [1] with the brief context explanation would be nice.
> Suggested-by: Zi Yan <ziy@nvidia.com>
> Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
> ---
>
> Changes in v2:
> - Renamed from 'compact mode' to 'print_mode' for better clarity
> - Use enum values (0=full_stack, 1=stack_handle) instead of boolean
> - Update debugfs filename from 'compact' to 'print_mode'
>
> Changes in v3:
> - No code changes
> ---
> mm/page_owner.c | 28 +++++++++++++++++++++++++++-
> 1 file changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_owner.c b/mm/page_owner.c
> index 5884d883837e..6d87b6948cfa 100644
> --- a/mm/page_owner.c
> +++ b/mm/page_owner.c
> @@ -590,7 +590,13 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
> migratetype_names[pageblock_mt],
> &page->flags);
>
> - ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
> + /* Print mode: full stack or stack handle */
I think this comment would be useful if it were still called compact_mode. But
because it is now called print_mode and the values have good, self-explanatory
names, I'm not sure if this comment is really needed.
> + if (READ_ONCE(owner_filter.print_mode) == PAGE_OWNER_PRINT_STACK_HANDLE) {
> + ret += scnprintf(kbuf + ret, count - ret,
> + "handle: %d\n", handle);
> + } else {
> + ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
> + }
Braces are not needed [2] here?
> if (ret >= count)
> goto err;
>
> @@ -985,6 +991,24 @@ static int page_owner_threshold_set(void *data, u64 val)
> DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get,
> &page_owner_threshold_set, "%llu");
>
> +static int page_owner_print_mode_get(void *data, u64 *val)
> +{
> + *val = READ_ONCE(owner_filter.print_mode);
> + return 0;
> +}
> +
> +static int page_owner_print_mode_set(void *data, u64 val)
> +{
> + if (val > PAGE_OWNER_PRINT_STACK_HANDLE)
> + return -EINVAL;
> + WRITE_ONCE(owner_filter.print_mode, val);
> + return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(page_owner_print_mode_fops,
> + &page_owner_print_mode_get,
> + &page_owner_print_mode_set, "%lld");
> +
>
> static int __init pageowner_init(void)
> {
> @@ -998,6 +1022,8 @@ static int __init pageowner_init(void)
> debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops);
>
> filter_dir = debugfs_create_dir("page_owner_filter", NULL);
> + debugfs_create_file("print_mode", 0600, filter_dir, NULL,
> + &page_owner_print_mode_fops);
>
> dir = debugfs_create_dir("page_owner_stacks", NULL);
> debugfs_create_file("show_stacks", 0400, dir,
> --
> 2.20.1
[1] https://docs.kernel.org/process/submitting-patches.html#commentary
[2] https://docs.kernel.org/process/coding-style.html#placing-braces-and-spaces
Thanks,
SJ
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 2/4] mm/page_owner: add print_mode filter
2026-04-29 0:57 ` SeongJae Park
@ 2026-04-29 8:19 ` zhen.ni
0 siblings, 0 replies; 18+ messages in thread
From: zhen.ni @ 2026-04-29 8:19 UTC (permalink / raw)
To: SeongJae Park
Cc: akpm, vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
在 2026/4/29 08:57, SeongJae Park 写道:
> On Tue, 28 Apr 2026 15:11:10 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
>
>> Add print_mode functionality to reduce page_owner output size by
>> printing only the stack handle instead of the full stack trace.
>>
>> Example output with print_mode enabled:
>> Page allocated via order 0, mask 0x42800(GFP_NOWAIT|__GFP_COMP),
>> pid 1, tgid 1 (systemd), ts 349667370 ns
>> PFN 0xa00a2 type Unmovable Block 1280 type Unmovable
>> Flags 0x33fffe0000004124(referenced|lru|active|private|node=3|zone=0|
>> lastcpupid=0x1ffff)
>> handle: 17432583
>> Charged to memcg /
>
> Looks nice to me. I have just trivial cosmetic comments below.
>
>>
>> Print mode significantly reduces output size while preserving all
>> other page allocation information. The correspondence between handles
>> and stack traces can be obtained through the show_stacks_handles interface.
>
> I understand the mode was introduced for the stack handle mode, but it exists
> for both full stack and stack handle mode? The wording makes me assume the
> mode is only enabled vs disabled (boolean). It is true that there are only two
> modes, but I feel like this commit message might be better written.
>
The commit message could be clearer.
>>
>> Link: https://lore.kernel.org/linux-mm/20260417154638.22370-3-zhen.ni@easystack.cn/
>
> Seems this is a link to v1 of this patch. I think adding the context or moving
> this to the changelog [1] with the brief context explanation would be nice.
>
Good suggestion.
>> Suggested-by: Zi Yan <ziy@nvidia.com>
>> Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
>> ---
>>
>> Changes in v2:
>> - Renamed from 'compact mode' to 'print_mode' for better clarity
>> - Use enum values (0=full_stack, 1=stack_handle) instead of boolean
>> - Update debugfs filename from 'compact' to 'print_mode'
>>
>> Changes in v3:
>> - No code changes
>> ---
>> mm/page_owner.c | 28 +++++++++++++++++++++++++++-
>> 1 file changed, 27 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/page_owner.c b/mm/page_owner.c
>> index 5884d883837e..6d87b6948cfa 100644
>> --- a/mm/page_owner.c
>> +++ b/mm/page_owner.c
>> @@ -590,7 +590,13 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
>> migratetype_names[pageblock_mt],
>> &page->flags);
>>
>> - ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
>> + /* Print mode: full stack or stack handle */
>
> I think this comment would be useful if it were still called compact_mode. But
> because it is now called print_mode and the values have good, self-explanatory
> names, I'm not sure if this comment is really needed.
>
I'll remove the redundant comment
>> + if (READ_ONCE(owner_filter.print_mode) == PAGE_OWNER_PRINT_STACK_HANDLE) {
>> + ret += scnprintf(kbuf + ret, count - ret,
>> + "handle: %d\n", handle);
>> + } else {
>> + ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
>> + }
>
> Braces are not needed [2] here?
>
I'll remove the unnecessary braces
Thanks for the detailed review
>> if (ret >= count)
>> goto err;
>>
>> @@ -985,6 +991,24 @@ static int page_owner_threshold_set(void *data, u64 val)
>> DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get,
>> &page_owner_threshold_set, "%llu");
>>
>> +static int page_owner_print_mode_get(void *data, u64 *val)
>> +{
>> + *val = READ_ONCE(owner_filter.print_mode);
>> + return 0;
>> +}
>> +
>> +static int page_owner_print_mode_set(void *data, u64 val)
>> +{
>> + if (val > PAGE_OWNER_PRINT_STACK_HANDLE)
>> + return -EINVAL;
>> + WRITE_ONCE(owner_filter.print_mode, val);
>> + return 0;
>> +}
>> +
>> +DEFINE_SIMPLE_ATTRIBUTE(page_owner_print_mode_fops,
>> + &page_owner_print_mode_get,
>> + &page_owner_print_mode_set, "%lld");
>> +
>>
>> static int __init pageowner_init(void)
>> {
>> @@ -998,6 +1022,8 @@ static int __init pageowner_init(void)
>> debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops);
>>
>> filter_dir = debugfs_create_dir("page_owner_filter", NULL);
>> + debugfs_create_file("print_mode", 0600, filter_dir, NULL,
>> + &page_owner_print_mode_fops);
>>
>> dir = debugfs_create_dir("page_owner_stacks", NULL);
>> debugfs_create_file("show_stacks", 0400, dir,
>> --
>> 2.20.1
>
> [1] https://docs.kernel.org/process/submitting-patches.html#commentary
> [2] https://docs.kernel.org/process/coding-style.html#placing-braces-and-spaces
>
>
> Thanks,
> SJ
>
>
Best regards,
Zhen Ni
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-28 7:11 [PATCH v3 0/4] mm/page_owner: add filter infrastructure for print_mode and NUMA filtering Zhen Ni
2026-04-28 7:11 ` [PATCH v3 1/4] mm/page_owner: add filter infrastructure Zhen Ni
2026-04-28 7:11 ` [PATCH v3 2/4] mm/page_owner: add print_mode filter Zhen Ni
@ 2026-04-28 7:11 ` Zhen Ni
2026-04-28 14:16 ` Andrew Morton
2026-04-29 1:28 ` SeongJae Park
2026-04-28 7:11 ` [PATCH v3 4/4] mm/page_owner: document page_owner filter features Zhen Ni
2026-04-28 14:15 ` [PATCH v3 0/4] mm/page_owner: add filter infrastructure for print_mode and NUMA filtering Andrew Morton
4 siblings, 2 replies; 18+ messages in thread
From: Zhen Ni @ 2026-04-28 7:11 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
Add NUMA node filtering functionality to page_owner to allow
filtering pages by specific NUMA node(s) using nodelist format.
The filter allows users to focus on pages from specific NUMA nodes,
which is useful for NUMA-aware memory allocation analysis and debugging.
Supported input formats:
- Single node: echo "2" > nid
- Multiple nodes: echo "0,2,3" > nid
- Node range: echo "0-3" > nid
- Mixed format: echo "0,2-4,7" > nid
- Disable filter: echo "-1" > nid
Link: https://lore.kernel.org/linux-mm/20260417154638.22370-4-zhen.ni@easystack.cn/
Link: https://lore.kernel.org/linux-mm/20260419155540.376847-4-zhen.ni@easystack.cn/
Suggested-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Changes in v2:
- Use nodemask_t instead of int to support multiple nodes
- Implement nodelist_parse() to support flexible input formats
* Single node: "0", "2"
* Multiple nodes: "0,2,3"
* Ranges: "0-3"
* Mixed: "0,2-4,7"
- Use %*pbl format for output (e.g., "0-2", "0,2-4,7")
- Use dynamic memory allocation (kmalloc) to handle variable-length input
- Follow cpuset's max_write_len pattern: (100 + 6 * MAX_NUMNODES)
Changes in v3:
- Remove READ_ONCE/WRITE_ONCE for nodemask_t (fixes compilation errors)
* nodemask_t is a large structure (128 bytes) that triggers compile-time asserts
* Direct assignment is safe for this use case
- Add comment explaining input length calculation formula
* 6 bytes = ",NNNNN" (comma + 5-digit node number)
- Simplify "-1" check using kstrtoint() instead of dual strcmp()
- Move nodemask_t mask read outside PFN iteration loop for performance
* Avoids 128-byte structure copy on each iteration
---
mm/page_owner.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 82 insertions(+)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 6d87b6948cfa..e674a374669a 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -685,6 +685,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct page_ext *page_ext;
struct page_owner *page_owner;
depot_stack_handle_t handle;
+ nodemask_t mask;
if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
@@ -698,6 +699,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
pfn++;
+ mask = owner_filter.nid_mask;
+
/* Find an allocated page */
for (; pfn < max_pfn; pfn++) {
/*
@@ -730,6 +733,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
if (unlikely(!page_ext))
continue;
+ /* NUMA node filter using bitmask */
+ if (!nodes_empty(mask)) {
+ int nid = page_to_nid(page);
+
+ if (!node_isset(nid, mask))
+ goto ext_put_continue;
+ }
+
/*
* Some pages could be missed by concurrent allocation or free,
* because we don't hold the zone lock.
@@ -1009,6 +1020,75 @@ DEFINE_SIMPLE_ATTRIBUTE(page_owner_print_mode_fops,
&page_owner_print_mode_get,
&page_owner_print_mode_set, "%lld");
+static ssize_t nid_filter_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *kbuf;
+ nodemask_t mask;
+ int ret;
+ int val;
+
+ /*
+ * Limit input size to handle worst-case nodelist (all nodes).
+ * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes.
+ * Formula: 100 bytes overhead + 6 * MAX_NUMNODES
+ */
+ if (count > (100 + 6 * MAX_NUMNODES))
+ return -EINVAL;
+
+ kbuf = kmalloc(count + 1, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ if (copy_from_user(kbuf, buf, count)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+ kbuf[count] = '\0';
+
+ /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
+ if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
+ nodes_clear(mask);
+ else if (nodelist_parse(kbuf, mask)) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ owner_filter.nid_mask = mask;
+ ret = count;
+
+out_free:
+ kfree(kbuf);
+ return ret;
+}
+
+static int nid_filter_show(struct seq_file *m, void *v)
+{
+ nodemask_t mask = owner_filter.nid_mask;
+
+ if (nodes_empty(mask))
+ seq_puts(m, "-1\n");
+ else
+ seq_printf(m, "%*pbl\n", nodemask_pr_args(&mask));
+
+ return 0;
+}
+
+static int nid_filter_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nid_filter_show, NULL);
+}
+
+static const struct file_operations nid_filter_fops = {
+ .owner = THIS_MODULE,
+ .open = nid_filter_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = nid_filter_write,
+ .release = single_release,
+};
+
static int __init pageowner_init(void)
{
@@ -1024,6 +1104,8 @@ static int __init pageowner_init(void)
filter_dir = debugfs_create_dir("page_owner_filter", NULL);
debugfs_create_file("print_mode", 0600, filter_dir, NULL,
&page_owner_print_mode_fops);
+ debugfs_create_file("nid", 0600, filter_dir, NULL,
+ &nid_filter_fops);
dir = debugfs_create_dir("page_owner_stacks", NULL);
debugfs_create_file("show_stacks", 0400, dir,
--
2.20.1
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-28 7:11 ` [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support Zhen Ni
@ 2026-04-28 14:16 ` Andrew Morton
2026-04-29 7:30 ` zhen.ni
2026-04-29 1:28 ` SeongJae Park
1 sibling, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2026-04-28 14:16 UTC (permalink / raw)
To: Zhen Ni
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
On Tue, 28 Apr 2026 15:11:11 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> Add NUMA node filtering functionality to page_owner to allow
> filtering pages by specific NUMA node(s) using nodelist format.
>
> The filter allows users to focus on pages from specific NUMA nodes,
> which is useful for NUMA-aware memory allocation analysis and debugging.
>
> Supported input formats:
> - Single node: echo "2" > nid
> - Multiple nodes: echo "0,2,3" > nid
> - Node range: echo "0-3" > nid
> - Mixed format: echo "0,2-4,7" > nid
> - Disable filter: echo "-1" > nid
>
> ...
>
> +static ssize_t nid_filter_write(struct file *file,
> + const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + char *kbuf;
> + nodemask_t mask;
> + int ret;
> + int val;
> +
> + /*
> + * Limit input size to handle worst-case nodelist (all nodes).
> + * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes.
> + * Formula: 100 bytes overhead + 6 * MAX_NUMNODES
> + */
> + if (count > (100 + 6 * MAX_NUMNODES))
> + return -EINVAL;
> +
> + kbuf = kmalloc(count + 1, GFP_KERNEL);
> + if (!kbuf)
> + return -ENOMEM;
> +
> + if (copy_from_user(kbuf, buf, count)) {
> + ret = -EFAULT;
> + goto out_free;
> + }
> + kbuf[count] = '\0';
strncpy_from_user() was not useful here?
> + /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
> + if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
> + nodes_clear(mask);
> + else if (nodelist_parse(kbuf, mask)) {
> + ret = -EINVAL;
> + goto out_free;
> + }
> +
> + owner_filter.nid_mask = mask;
> + ret = count;
> +
> +out_free:
> + kfree(kbuf);
> + return ret;
> +}
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-28 14:16 ` Andrew Morton
@ 2026-04-29 7:30 ` zhen.ni
0 siblings, 0 replies; 18+ messages in thread
From: zhen.ni @ 2026-04-29 7:30 UTC (permalink / raw)
To: Andrew Morton
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
在 2026/4/28 22:16, Andrew Morton 写道:
> On Tue, 28 Apr 2026 15:11:11 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
>
>> Add NUMA node filtering functionality to page_owner to allow
>> filtering pages by specific NUMA node(s) using nodelist format.
>>
>> The filter allows users to focus on pages from specific NUMA nodes,
>> which is useful for NUMA-aware memory allocation analysis and debugging.
>>
>> Supported input formats:
>> - Single node: echo "2" > nid
>> - Multiple nodes: echo "0,2,3" > nid
>> - Node range: echo "0-3" > nid
>> - Mixed format: echo "0,2-4,7" > nid
>> - Disable filter: echo "-1" > nid
>>
>> ...
>>
>> +static ssize_t nid_filter_write(struct file *file,
>> + const char __user *buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + char *kbuf;
>> + nodemask_t mask;
>> + int ret;
>> + int val;
>> +
>> + /*
>> + * Limit input size to handle worst-case nodelist (all nodes).
>> + * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes.
>> + * Formula: 100 bytes overhead + 6 * MAX_NUMNODES
>> + */
>> + if (count > (100 + 6 * MAX_NUMNODES))
>> + return -EINVAL;
>> +
>> + kbuf = kmalloc(count + 1, GFP_KERNEL);
>> + if (!kbuf)
>> + return -ENOMEM;
>> +
>> + if (copy_from_user(kbuf, buf, count)) {
>> + ret = -EFAULT;
>> + goto out_free;
>> + }
>> + kbuf[count] = '\0';
>
> strncpy_from_user() was not useful here?
>
After checking similar usage in mm/ (e.g., mm/kmemleak.c),
I'll switch to strncpy_from_user().
The change is straightforward:
long len;
len = strncpy_from_user(kbuf, buf, count + 1);
if (len < 0) {
ret = -EFAULT;
goto out_free;
}
kbuf[count] = '\0';
I'll make this change in the next version.
>> + /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
>> + if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
>> + nodes_clear(mask);
>> + else if (nodelist_parse(kbuf, mask)) {
>> + ret = -EINVAL;
>> + goto out_free;
>> + }
>> +
>> + owner_filter.nid_mask = mask;
>> + ret = count;
>> +
>> +out_free:
>> + kfree(kbuf);
>> + return ret;
>> +}
>
>
>
Best regards,
Zhen Ni
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-28 7:11 ` [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support Zhen Ni
2026-04-28 14:16 ` Andrew Morton
@ 2026-04-29 1:28 ` SeongJae Park
2026-04-29 9:03 ` zhen.ni
1 sibling, 1 reply; 18+ messages in thread
From: SeongJae Park @ 2026-04-29 1:28 UTC (permalink / raw)
To: Zhen Ni
Cc: SeongJae Park, akpm, vbabka, surenb, mhocko, jackmanb, hannes,
ziy, linux-mm, linux-kernel
On Tue, 28 Apr 2026 15:11:11 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> Add NUMA node filtering functionality to page_owner to allow
> filtering pages by specific NUMA node(s) using nodelist format.
>
> The filter allows users to focus on pages from specific NUMA nodes,
> which is useful for NUMA-aware memory allocation analysis and debugging.
>
> Supported input formats:
> - Single node: echo "2" > nid
> - Multiple nodes: echo "0,2,3" > nid
> - Node range: echo "0-3" > nid
> - Mixed format: echo "0,2-4,7" > nid
> - Disable filter: echo "-1" > nid
>
> Link: https://lore.kernel.org/linux-mm/20260417154638.22370-4-zhen.ni@easystack.cn/
> Link: https://lore.kernel.org/linux-mm/20260419155540.376847-4-zhen.ni@easystack.cn/
Seems the above two links are for v1 and v2 of this patch. I think putting
those with the context at commentary area [1] could be useful.
> Suggested-by: Zi Yan <ziy@nvidia.com>
> Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
> ---
[...]
> diff --git a/mm/page_owner.c b/mm/page_owner.c
> index 6d87b6948cfa..e674a374669a 100644
> --- a/mm/page_owner.c
> +++ b/mm/page_owner.c
> @@ -685,6 +685,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> struct page_ext *page_ext;
> struct page_owner *page_owner;
> depot_stack_handle_t handle;
> + nodemask_t mask;
>
> if (!static_branch_unlikely(&page_owner_inited))
> return -EINVAL;
> @@ -698,6 +699,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
> pfn++;
>
> + mask = owner_filter.nid_mask;
> +
READ_ONCE() was used for owner_filter.print_mode. Should nid_mask also be read
using READ_ONCE()?
> /* Find an allocated page */
> for (; pfn < max_pfn; pfn++) {
> /*
> @@ -730,6 +733,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> if (unlikely(!page_ext))
> continue;
>
> + /* NUMA node filter using bitmask */
> + if (!nodes_empty(mask)) {
> + int nid = page_to_nid(page);
> +
> + if (!node_isset(nid, mask))
> + goto ext_put_continue;
> + }
> +
> /*
> * Some pages could be missed by concurrent allocation or free,
> * because we don't hold the zone lock.
> @@ -1009,6 +1020,75 @@ DEFINE_SIMPLE_ATTRIBUTE(page_owner_print_mode_fops,
> &page_owner_print_mode_get,
> &page_owner_print_mode_set, "%lld");
>
> +static ssize_t nid_filter_write(struct file *file,
> + const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + char *kbuf;
> + nodemask_t mask;
> + int ret;
> + int val;
> +
> + /*
> + * Limit input size to handle worst-case nodelist (all nodes).
> + * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes.
> + * Formula: 100 bytes overhead + 6 * MAX_NUMNODES
> + */
> + if (count > (100 + 6 * MAX_NUMNODES))
> + return -EINVAL;
> +
> + kbuf = kmalloc(count + 1, GFP_KERNEL);
> + if (!kbuf)
> + return -ENOMEM;
> +
> + if (copy_from_user(kbuf, buf, count)) {
> + ret = -EFAULT;
> + goto out_free;
> + }
> + kbuf[count] = '\0';
> +
> + /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
> + if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
> + nodes_clear(mask);
> + else if (nodelist_parse(kbuf, mask)) {
> + ret = -EINVAL;
> + goto out_free;
> + }
Doesn't an empty string input to nodelist_parse() clear the mask? Can't that be
reused?
> +
> + owner_filter.nid_mask = mask;
> + ret = count;
> +
> +out_free:
> + kfree(kbuf);
> + return ret;
> +}
> +
> +static int nid_filter_show(struct seq_file *m, void *v)
> +{
> + nodemask_t mask = owner_filter.nid_mask;
> +
> + if (nodes_empty(mask))
> + seq_puts(m, "-1\n");
> + else
> + seq_printf(m, "%*pbl\n", nodemask_pr_args(&mask));
> +
> + return 0;
> +}
> +
> +static int nid_filter_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, nid_filter_show, NULL);
> +}
> +
> +static const struct file_operations nid_filter_fops = {
> + .owner = THIS_MODULE,
> + .open = nid_filter_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .write = nid_filter_write,
> + .release = single_release,
> +};
> +
>
> static int __init pageowner_init(void)
> {
> @@ -1024,6 +1104,8 @@ static int __init pageowner_init(void)
> filter_dir = debugfs_create_dir("page_owner_filter", NULL);
> debugfs_create_file("print_mode", 0600, filter_dir, NULL,
> &page_owner_print_mode_fops);
> + debugfs_create_file("nid", 0600, filter_dir, NULL,
> + &nid_filter_fops);
Why don't you use 'page_owner_' prefix like other fops, for consistency?
>
> dir = debugfs_create_dir("page_owner_stacks", NULL);
> debugfs_create_file("show_stacks", 0400, dir,
> --
> 2.20.1
[1] https://docs.kernel.org/process/submitting-patches.html#commentary
Thanks,
SJ
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-29 1:28 ` SeongJae Park
@ 2026-04-29 9:03 ` zhen.ni
2026-04-29 14:56 ` SeongJae Park
0 siblings, 1 reply; 18+ messages in thread
From: zhen.ni @ 2026-04-29 9:03 UTC (permalink / raw)
To: SeongJae Park
Cc: akpm, vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
在 2026/4/29 09:28, SeongJae Park 写道:
> On Tue, 28 Apr 2026 15:11:11 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
>
>> Add NUMA node filtering functionality to page_owner to allow
>> filtering pages by specific NUMA node(s) using nodelist format.
>>
>> The filter allows users to focus on pages from specific NUMA nodes,
>> which is useful for NUMA-aware memory allocation analysis and debugging.
>>
>> Supported input formats:
>> - Single node: echo "2" > nid
>> - Multiple nodes: echo "0,2,3" > nid
>> - Node range: echo "0-3" > nid
>> - Mixed format: echo "0,2-4,7" > nid
>> - Disable filter: echo "-1" > nid
>>
>> Link: https://lore.kernel.org/linux-mm/20260417154638.22370-4-zhen.ni@easystack.cn/
>> Link: https://lore.kernel.org/linux-mm/20260419155540.376847-4-zhen.ni@easystack.cn/
>
> Seems the above two links are for v1 and v2 of this patch. I think putting
> those with the context at commentary area [1] could be useful.
>
Good suggestion.
>> Suggested-by: Zi Yan <ziy@nvidia.com>
>> Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
>> ---
> [...]
>> diff --git a/mm/page_owner.c b/mm/page_owner.c
>> index 6d87b6948cfa..e674a374669a 100644
>> --- a/mm/page_owner.c
>> +++ b/mm/page_owner.c
>> @@ -685,6 +685,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>> struct page_ext *page_ext;
>> struct page_owner *page_owner;
>> depot_stack_handle_t handle;
>> + nodemask_t mask;
>>
>> if (!static_branch_unlikely(&page_owner_inited))
>> return -EINVAL;
>> @@ -698,6 +699,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>> while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
>> pfn++;
>>
>> + mask = owner_filter.nid_mask;
>> +
>
> READ_ONCE() was used for owner_filter.print_mode. Should nid_mask also read
> using READ_ONCE()?
>
The reason is that `owner_filter.nid_mask` is a nodemask_t, which is a
128-byte structure. READ_ONCE() only supports types up to 8 bytes and
will trigger a compile-time assertion failure for larger structures.
This was actually an issue in v2 - the AI review tool (sashiko.dev) and
Andrew both caught the compilation error with READ_ONCE/WRITE_ONCE on
nodemask_t, so v3 removed them.
>> /* Find an allocated page */
>> for (; pfn < max_pfn; pfn++) {
>> /*
>> @@ -730,6 +733,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>> if (unlikely(!page_ext))
>> continue;
>>
>> + /* NUMA node filter using bitmask */
>> + if (!nodes_empty(mask)) {
>> + int nid = page_to_nid(page);
>> +
>> + if (!node_isset(nid, mask))
>> + goto ext_put_continue;
>> + }
>> +
>> /*
>> * Some pages could be missed by concurrent allocation or free,
>> * because we don't hold the zone lock.
>> @@ -1009,6 +1020,75 @@ DEFINE_SIMPLE_ATTRIBUTE(page_owner_print_mode_fops,
>> &page_owner_print_mode_get,
>> &page_owner_print_mode_set, "%lld");
>>
>> +static ssize_t nid_filter_write(struct file *file,
>> + const char __user *buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + char *kbuf;
>> + nodemask_t mask;
>> + int ret;
>> + int val;
>> +
>> + /*
>> + * Limit input size to handle worst-case nodelist (all nodes).
>> + * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes.
>> + * Formula: 100 bytes overhead + 6 * MAX_NUMNODES
>> + */
>> + if (count > (100 + 6 * MAX_NUMNODES))
>> + return -EINVAL;
>> +
>> + kbuf = kmalloc(count + 1, GFP_KERNEL);
>> + if (!kbuf)
>> + return -ENOMEM;
>> +
>> + if (copy_from_user(kbuf, buf, count)) {
>> + ret = -EFAULT;
>> + goto out_free;
>> + }
>> + kbuf[count] = '\0';
>> +
>> + /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
>> + if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
>> + nodes_clear(mask);
>> + else if (nodelist_parse(kbuf, mask)) {
>> + ret = -EINVAL;
>> + goto out_free;
>> + }
>
> Doesn't empty string input to nodelist_parse() clears the mask? Can't it be
> reused?
>
Yes, empty input (echo > nid) works because nodelist_parse() handles it
correctly. However, nodelist_parse() - which is implemented via
bitmap_parselist() - cannot handle "-1" as it's not a valid range format
and would return an error. The explicit "-1" check is necessary to
support `echo "-1" > nid` without returning an error.
So the "-1" check handles a case that nodelist_parse() cannot handle.
>> +
>> + owner_filter.nid_mask = mask;
>> + ret = count;
>> +
>> +out_free:
>> + kfree(kbuf);
>> + return ret;
>> +}
>> +
>> +static int nid_filter_show(struct seq_file *m, void *v)
>> +{
>> + nodemask_t mask = owner_filter.nid_mask;
>> +
>> + if (nodes_empty(mask))
>> + seq_puts(m, "-1\n");
>> + else
>> + seq_printf(m, "%*pbl\n", nodemask_pr_args(&mask));
>> +
>> + return 0;
>> +}
>> +
>> +static int nid_filter_open(struct inode *inode, struct file *file)
>> +{
>> + return single_open(file, nid_filter_show, NULL);
>> +}
>> +
>> +static const struct file_operations nid_filter_fops = {
>> + .owner = THIS_MODULE,
>> + .open = nid_filter_open,
>> + .read = seq_read,
>> + .llseek = seq_lseek,
>> + .write = nid_filter_write,
>> + .release = single_release,
>> +};
>> +
>>
>> static int __init pageowner_init(void)
>> {
>> @@ -1024,6 +1104,8 @@ static int __init pageowner_init(void)
>> filter_dir = debugfs_create_dir("page_owner_filter", NULL);
>> debugfs_create_file("print_mode", 0600, filter_dir, NULL,
>> &page_owner_print_mode_fops);
>> + debugfs_create_file("nid", 0600, filter_dir, NULL,
>> + &nid_filter_fops);
>
> Why don't you use 'page_owner_' prefix like other fops, for consistency?
>
For consistency with the other file_operations
in this module (page_owner_fops, page_owner_threshold_fops,
page_owner_print_mode_fops), I'll rename nid_filter_fops to
page_owner_nid_filter_fops.
I'll incorporate these improvements in the next version.
Thanks for the detailed review!
>>
>> dir = debugfs_create_dir("page_owner_stacks", NULL);
>> debugfs_create_file("show_stacks", 0400, dir,
>> --
>> 2.20.1
>
> [1] https://docs.kernel.org/process/submitting-patches.html#commentary
>
>
> Thanks,
> SJ
>
>
Best regards,
Zhen Ni
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-29 9:03 ` zhen.ni
@ 2026-04-29 14:56 ` SeongJae Park
2026-04-30 3:56 ` zhen.ni
0 siblings, 1 reply; 18+ messages in thread
From: SeongJae Park @ 2026-04-29 14:56 UTC (permalink / raw)
To: zhen.ni
Cc: SeongJae Park, akpm, vbabka, surenb, mhocko, jackmanb, hannes,
ziy, linux-mm, linux-kernel
On Wed, 29 Apr 2026 17:03:56 +0800 "zhen.ni" <zhen.ni@easystack.cn> wrote:
>
>
> 在 2026/4/29 09:28, SeongJae Park 写道:
> > On Tue, 28 Apr 2026 15:11:11 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
[...]
> >> @@ -685,6 +685,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> >> struct page_ext *page_ext;
> >> struct page_owner *page_owner;
> >> depot_stack_handle_t handle;
> >> + nodemask_t mask;
> >>
> >> if (!static_branch_unlikely(&page_owner_inited))
> >> return -EINVAL;
> >> @@ -698,6 +699,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> >> while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
> >> pfn++;
> >>
> >> + mask = owner_filter.nid_mask;
> >> +
> >
> > READ_ONCE() was used for owner_filter.print_mode. Should nid_mask also read
> > using READ_ONCE()?
> >
> The reason is that `owner_filter.nid_mask` is a nodemask_t, which is a
> 128-byte structure. READ_ONCE() only supports types up to 8 bytes and
> will trigger a compile-time assertion failure for larger structures.
>
> This was actually an issue in v2 - the AI review tool (sashiko.dev) and
> Andrew both caught the compilation error with READ_ONCE/WRITE_ONCE on
> nodemask_t, so v3 removed them.
Thank you for kindly sharing the context. Now I understand why READ_ONCE()
cannot be used. But, is plain load/store safe enough for nodemask_t?
Shouldn't it still be protected against races?
[...]
> >> +static ssize_t nid_filter_write(struct file *file,
> >> + const char __user *buf,
> >> + size_t count, loff_t *ppos)
> >> +{
> >> + char *kbuf;
> >> + nodemask_t mask;
> >> + int ret;
> >> + int val;
> >> +
> >> + /*
> >> + * Limit input size to handle worst-case nodelist (all nodes).
> >> + * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes.
> >> + * Formula: 100 bytes overhead + 6 * MAX_NUMNODES
> >> + */
> >> + if (count > (100 + 6 * MAX_NUMNODES))
> >> + return -EINVAL;
> >> +
> >> + kbuf = kmalloc(count + 1, GFP_KERNEL);
> >> + if (!kbuf)
> >> + return -ENOMEM;
> >> +
> >> + if (copy_from_user(kbuf, buf, count)) {
> >> + ret = -EFAULT;
> >> + goto out_free;
> >> + }
> >> + kbuf[count] = '\0';
> >> +
> >> + /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
> >> + if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
> >> + nodes_clear(mask);
> >> + else if (nodelist_parse(kbuf, mask)) {
> >> + ret = -EINVAL;
> >> + goto out_free;
> >> + }
> >
> > Doesn't empty string input to nodelist_parse() clears the mask? Can't it be
> > reused?
> >
> Yes, empty input (echo > nid) works because nodelist_parse() handles it
> correctly. However, nodelist_parse() - which is implemented via
> bitmap_parselist() - cannot handle "-1" as it's not a valid range format
> and would return an error. The explicit "-1" check is necessary to
> support `echo "-1" > nid` without returning an error.
>
> So the "-1" check handles a case that nodelist_parse() cannot handle.
Thank you for kindly explaining the reason. But, do we really need to support
"-1" input? Couldn't we just redefine the interface?
[...]
> >> + debugfs_create_file("nid", 0600, filter_dir, NULL,
> >> + &nid_filter_fops);
> >
> > Why don't you use 'page_owner_' prefix like other fops, for consistency?
> >
> For consistency with the other file_operations
> in this module (page_owner_fops, page_owner_threshold_fops,
> page_owner_print_mode_fops), I'll rename nid_filter_fops to
> page_owner_nid_filter_fops.
>
> I'll incorporate these improvements in the next version.
Thank you for kindly accepting my humble suggestions.
>
> Thanks for the detailed review!
Thank you for sharing this nice work, too!
Thanks,
SJ
[...]
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-29 14:56 ` SeongJae Park
@ 2026-04-30 3:56 ` zhen.ni
2026-04-30 5:16 ` SeongJae Park
0 siblings, 1 reply; 18+ messages in thread
From: zhen.ni @ 2026-04-30 3:56 UTC (permalink / raw)
To: SeongJae Park
Cc: akpm, vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
在 2026/4/29 22:56, SeongJae Park 写道:
> On Wed, 29 Apr 2026 17:03:56 +0800 "zhen.ni" <zhen.ni@easystack.cn> wrote:
>
>>
>>
>> 在 2026/4/29 09:28, SeongJae Park 写道:
>>> On Tue, 28 Apr 2026 15:11:11 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> [...]
>>>> @@ -685,6 +685,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>>>> struct page_ext *page_ext;
>>>> struct page_owner *page_owner;
>>>> depot_stack_handle_t handle;
>>>> + nodemask_t mask;
>>>>
>>>> if (!static_branch_unlikely(&page_owner_inited))
>>>> return -EINVAL;
>>>> @@ -698,6 +699,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>>>> while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
>>>> pfn++;
>>>>
>>>> + mask = owner_filter.nid_mask;
>>>> +
>>>
>>> READ_ONCE() was used for owner_filter.print_mode. Should nid_mask also read
>>> using READ_ONCE()?
>>>
>> The reason is that `owner_filter.nid_mask` is a nodemask_t, which is a
>> 128-byte structure. READ_ONCE() only supports types up to 8 bytes and
>> will trigger a compile-time assertion failure for larger structures.
>>
>> This was actually an issue in v2 - the AI review tool (sashiko.dev) and
>> Andrew both caught the compilation error with READ_ONCE/WRITE_ONCE on
>> nodemask_t, so v3 removed them.
>
> Thank you for kindly sharing the context. Now I understand why READ_ONCE()
> cannot be used. But, is plain load/store safe enough for nodemask_t?
> Shouldn't it still be protected against races?
>
Concurrency Safety:
I considered spinlock and RCU, but decided against them:
- Spinlock: Adds overhead on every read, overkill for a debug facility
- RCU: Requires dynamic allocation of 128-byte nodemask_t, too complex
- READ_ONCE/WRITE_ONCE: Not possible, exceeds 8-byte limit
Plain load/store is safe here because:
1. page_owner is debug code with low-frequency filter changes
2. Worst case of torn read is temporary inconsistency in debug output
3. Similar debugfs interfaces use the same approach
The overhead of locking doesn't justify the benefit for this debug use case.
Do you think this is acceptable, or would you prefer I add locking?
> [...]
>>>> +static ssize_t nid_filter_write(struct file *file,
>>>> + const char __user *buf,
>>>> + size_t count, loff_t *ppos)
>>>> +{
>>>> + char *kbuf;
>>>> + nodemask_t mask;
>>>> + int ret;
>>>> + int val;
>>>> +
>>>> + /*
>>>> + * Limit input size to handle worst-case nodelist (all nodes).
>>>> + * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes.
>>>> + * Formula: 100 bytes overhead + 6 * MAX_NUMNODES
>>>> + */
>>>> + if (count > (100 + 6 * MAX_NUMNODES))
>>>> + return -EINVAL;
>>>> +
>>>> + kbuf = kmalloc(count + 1, GFP_KERNEL);
>>>> + if (!kbuf)
>>>> + return -ENOMEM;
>>>> +
>>>> + if (copy_from_user(kbuf, buf, count)) {
>>>> + ret = -EFAULT;
>>>> + goto out_free;
>>>> + }
>>>> + kbuf[count] = '\0';
>>>> +
>>>> + /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
>>>> + if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
>>>> + nodes_clear(mask);
>>>> + else if (nodelist_parse(kbuf, mask)) {
>>>> + ret = -EINVAL;
>>>> + goto out_free;
>>>> + }
>>>
>>> Doesn't empty string input to nodelist_parse() clears the mask? Can't it be
>>> reused?
>>>
>> Yes, empty input (echo > nid) works because nodelist_parse() handles it
>> correctly. However, nodelist_parse() - which is implemented via
>> bitmap_parselist() - cannot handle "-1" as it's not a valid range format
>> and would return an error. The explicit "-1" check is necessary to
>> support `echo "-1" > nid` without returning an error.
>>
>> So the "-1" check handles a case that nodelist_parse() cannot handle.
>
> Thank you for kindly explaining the reason. But, do we really need to support
> "-1" input? Couldn't we just redefine the interface?
>
I chose "-1" to clearly differentiate from valid NUMA node IDs (0, 1, 2,
3...). Since node IDs are non-negative integers, "-1" naturally means
"invalid" or "no filter", which is an intuitive convention in Linux
(e.g., pid -1, signal -1).
Do you have a better suggestion for how to represent "clear filter"?
> [...]
>>>> + debugfs_create_file("nid", 0600, filter_dir, NULL,
>>>> + &nid_filter_fops);
>>>
>>> Why don't you use 'page_owner_' prefix like other fops, for consistency?
>>>
>> For consistency with the other file_operations
>> in this module (page_owner_fops, page_owner_threshold_fops,
>> page_owner_print_mode_fops), I'll rename nid_filter_fops to
>> page_owner_nid_filter_fops.
>>
>> I'll incorporate these improvements in the next version.
>
> Thank you for kindly accepting my humble suggestions.
>
>>
>> Thanks for the detailed review!
>
> Thank you for sharing this nice work, too!
>
>
> Thanks,
> SJ
>
> [...]
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-30 3:56 ` zhen.ni
@ 2026-04-30 5:16 ` SeongJae Park
2026-04-30 6:00 ` zhen.ni
0 siblings, 1 reply; 18+ messages in thread
From: SeongJae Park @ 2026-04-30 5:16 UTC (permalink / raw)
To: zhen.ni
Cc: SeongJae Park, akpm, vbabka, surenb, mhocko, jackmanb, hannes,
ziy, linux-mm, linux-kernel
On Thu, 30 Apr 2026 11:56:33 +0800 "zhen.ni" <zhen.ni@easystack.cn> wrote:
>
>
> 在 2026/4/29 22:56, SeongJae Park 写道:
> > On Wed, 29 Apr 2026 17:03:56 +0800 "zhen.ni" <zhen.ni@easystack.cn> wrote:
> >
> >>
> >>
> >> 在 2026/4/29 09:28, SeongJae Park 写道:
> >>> On Tue, 28 Apr 2026 15:11:11 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> > [...]
> >>>> @@ -685,6 +685,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> >>>> struct page_ext *page_ext;
> >>>> struct page_owner *page_owner;
> >>>> depot_stack_handle_t handle;
> >>>> + nodemask_t mask;
> >>>>
> >>>> if (!static_branch_unlikely(&page_owner_inited))
> >>>> return -EINVAL;
> >>>> @@ -698,6 +699,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> >>>> while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
> >>>> pfn++;
> >>>>
> >>>> + mask = owner_filter.nid_mask;
> >>>> +
> >>>
> >>> READ_ONCE() was used for owner_filter.print_mode. Should nid_mask also read
> >>> using READ_ONCE()?
> >>>
> >> The reason is that `owner_filter.nid_mask` is a nodemask_t, which is a
> >> 128-byte structure. READ_ONCE() only supports types up to 8 bytes and
> >> will trigger a compile-time assertion failure for larger structures.
> >>
> >> This was actually an issue in v2 - the AI review tool (sashiko.dev) and
> >> Andrew both caught the compilation error with READ_ONCE/WRITE_ONCE on
> >> nodemask_t, so v3 removed them.
> >
> > Thank you for kindly sharing the context. Now I understand why READ_ONCE()
> > cannot be used. But, is plain load/store safe enough for nodemask_t?
> > Shouldn't it still be protected against races?
> >
> Concurrency Safety:
> I considered spinlock and RCU, but decided against them:
>
> - Spinlock: Adds overhead on every read, overkill for a debug facility
> - RCU: Requires dynamic allocation of 128-byte nodemask_t, too complex
> - READ_ONCE/WRITE_ONCE: Not possible, exceeds 8-byte limit
>
> Plain load/store is safe here because:
> 1. page_owner is debug code with low-frequency filter changes
> 2. Worst case of torn read is temporary inconsistency in debug output
> 3. Similar debugfs interfaces use the same approach
>
> The overhead of locking doesn't justify the benefit for this debug use case.
>
> Do you think this is acceptable, or would you prefer I add locking?
Thank you for kindly explaining this. Unless others have different opinions, I
think this is ok. But, I think this would be good to be clearly documented, in
the code or the user documentation.
[...]
> >>>> + /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
> >>>> + if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
> >>>> + nodes_clear(mask);
> >>>> + else if (nodelist_parse(kbuf, mask)) {
> >>>> + ret = -EINVAL;
> >>>> + goto out_free;
> >>>> + }
> >>>
> >>> Doesn't empty string input to nodelist_parse() clears the mask? Can't it be
> >>> reused?
> >>>
> >> Yes, empty input (echo > nid) works because nodelist_parse() handles it
> >> correctly. However, nodelist_parse() - which is implemented via
> >> bitmap_parselist() - cannot handle "-1" as it's not a valid range format
> >> and would return an error. The explicit "-1" check is necessary to
> >> support `echo "-1" > nid` without returning an error.
> >>
> >> So the "-1" check handles a case that nodelist_parse() cannot handle.
> >
> > Thank you for kindly explaining the reason. But, do we really need to support
> > "-1" input? Couldn't we just redefine the interface?
> >
> I chose "-1" to clearly differentiate from valid NUMA node IDs (0, 1, 2,
> 3...).Since node IDs are non-negative integers, "-1" naturally means
> "invalid" or "no filter", which is an intuitive convention in Linux
> (e.g., pid -1, signal -1).
>
> Do you have a better suggestion for how to represent "clear filter"?
Seems my suggestion was too implicit. I'm suggesting using an empty string
instead of "-1". I think it is also clearly differentiated from valid NUMA node
IDs?
Thanks,
SJ
[...]
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support
2026-04-30 5:16 ` SeongJae Park
@ 2026-04-30 6:00 ` zhen.ni
0 siblings, 0 replies; 18+ messages in thread
From: zhen.ni @ 2026-04-30 6:00 UTC (permalink / raw)
To: SeongJae Park
Cc: akpm, vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
在 2026/4/30 13:16, SeongJae Park 写道:
> On Thu, 30 Apr 2026 11:56:33 +0800 "zhen.ni" <zhen.ni@easystack.cn> wrote:
>
>>
>>
>> 在 2026/4/29 22:56, SeongJae Park 写道:
>>> On Wed, 29 Apr 2026 17:03:56 +0800 "zhen.ni" <zhen.ni@easystack.cn> wrote:
>>>
>>>>
>>>>
>>>> 在 2026/4/29 09:28, SeongJae Park 写道:
>>>>> On Tue, 28 Apr 2026 15:11:11 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
>>> [...]
>>>>>> @@ -685,6 +685,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>>>>>> struct page_ext *page_ext;
>>>>>> struct page_owner *page_owner;
>>>>>> depot_stack_handle_t handle;
>>>>>> + nodemask_t mask;
>>>>>>
>>>>>> if (!static_branch_unlikely(&page_owner_inited))
>>>>>> return -EINVAL;
>>>>>> @@ -698,6 +699,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>>>>>> while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
>>>>>> pfn++;
>>>>>>
>>>>>> + mask = owner_filter.nid_mask;
>>>>>> +
>>>>>
>>>>> READ_ONCE() was used for owner_filter.print_mode. Should nid_mask also read
>>>>> using READ_ONCE()?
>>>>>
>>>> The reason is that `owner_filter.nid_mask` is a nodemask_t, which is a
>>>> 128-byte structure. READ_ONCE() only supports types up to 8 bytes and
>>>> will trigger a compile-time assertion failure for larger structures.
>>>>
>>>> This was actually an issue in v2 - the AI review tool (sashiko.dev) and
>>>> Andrew both caught the compilation error with READ_ONCE/WRITE_ONCE on
>>>> nodemask_t, so v3 removed them.
>>>
>>> Thank you for kindly sharing the context. Now I understand why READ_ONCE()
>>> cannot be used. But, is plain load/store safe enough for nodemask_t?
>>> Shouldn't it still be protected against races?
>>>
>> Concurrency Safety:
>> I considered spinlock and RCU, but decided against them:
>>
>> - Spinlock: Adds overhead on every read, overkill for a debug facility
>> - RCU: Requires dynamic allocation of 128-byte nodemask_t, too complex
>> - READ_ONCE/WRITE_ONCE: Not possible, exceeds 8-byte limit
>>
>> Plain load/store is safe here because:
>> 1. page_owner is debug code with low-frequency filter changes
>> 2. Worst case of torn read is temporary inconsistency in debug output
>> 3. Similar debugfs interfaces use the same approach
>>
>> The overhead of locking doesn't justify the benefit for this debug use case.
>>
>> Do you think this is acceptable, or would you prefer I add locking?
>
> Thank you for kindly explaining this. Unless others have different opinions, I
> think this is ok. But, I think this would be good to be clarly documented, on
> the code or the user documentation.
>
I'll add a comment in the code explaining the concurrency consideration.
> [...]
>>>>>> + /* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
>>>>>> + if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
>>>>>> + nodes_clear(mask);
>>>>>> + else if (nodelist_parse(kbuf, mask)) {
>>>>>> + ret = -EINVAL;
>>>>>> + goto out_free;
>>>>>> + }
>>>>>
>>>>> Doesn't empty string input to nodelist_parse() clears the mask? Can't it be
>>>>> reused?
>>>>>
>>>> Yes, empty input (echo > nid) works because nodelist_parse() handles it
>>>> correctly. However, nodelist_parse() - which is implemented via
>>>> bitmap_parselist() - cannot handle "-1" as it's not a valid range format
>>>> and would return an error. The explicit "-1" check is necessary to
>>>> support `echo "-1" > nid` without returning an error.
>>>>
>>>> So the "-1" check handles a case that nodelist_parse() cannot handle.
>>>
>>> Thank you for kindly explaining the reason. But, do we really need to support
>>> "-1" input? Couldn't we just redefine the interface?
>>>
>> I chose "-1" to clearly differentiate from valid NUMA node IDs (0, 1, 2,
>> 3...).Since node IDs are non-negative integers, "-1" naturally means
>> "invalid" or "no filter", which is an intuitive convention in Linux
>> (e.g., pid -1, signal -1).
>>
>> Do you have a better suggestion for how to represent "clear filter"?
>
> Seems my suggestion was too implicit. I'm suggesting using empty string
> instead of "-1". I think it is also clarly differentiated from valid NUMA node
> IDs?
>
I understand your point about simplifying the code by removing the "-1"
special case. I'll remove it and use only empty string for clearing the
filter.
Thank you for the suggestions.
>
> Thanks,
> SJ
>
> [...]
>
>
Thanks,
Zhen
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH v3 4/4] mm/page_owner: document page_owner filter features
2026-04-28 7:11 [PATCH v3 0/4] mm/page_owner: add filter infrastructure for print_mode and NUMA filtering Zhen Ni
` (2 preceding siblings ...)
2026-04-28 7:11 ` [PATCH v3 3/4] mm/page_owner: add NUMA node filter with nodelist support Zhen Ni
@ 2026-04-28 7:11 ` Zhen Ni
2026-04-29 1:35 ` SeongJae Park
2026-04-28 14:15 ` [PATCH v3 0/4] mm/page_owner: add filter infrastructure for print_mode and NUMA filtering Andrew Morton
4 siblings, 1 reply; 18+ messages in thread
From: Zhen Ni @ 2026-04-28 7:11 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
Add documentation for the page_owner filter functionality, including:
- Print mode filter (full stack vs stack handle)
- NUMA node filter (single node, multiple nodes, ranges)
- Usage examples for both filters
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Changes in v3:
- New patch to document filter features as requested by Andrew Morton
---
Documentation/mm/page_owner.rst | 55 ++++++++++++++++++++++++++++++++-
1 file changed, 54 insertions(+), 1 deletion(-)
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
index 6b12f3b007ec..6261366d33fe 100644
--- a/Documentation/mm/page_owner.rst
+++ b/Documentation/mm/page_owner.rst
@@ -74,7 +74,17 @@ Usage
3) Do the job that you want to debug.
-4) Analyze information from page owner::
+4) (Optional) Use filters to focus on specific memory allocations::
+
+ cd /sys/kernel/debug/page_owner_filter
+
+ # Print only stack handles instead of full traces
+ echo 1 > print_mode
+
+ # Filter by NUMA nodes
+ echo "0,2-3" > nid
+
+5) Analyze information from page owner::
cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
cat stacks.txt
@@ -238,6 +248,49 @@ Usage
./page_owner_sort <input> <output> --tgid=1,2,3
./page_owner_sort <input> <output> --name name1,name2
+Page Owner Filters
+==================
+
+The page_owner feature provides filtering capabilities to focus on specific
+memory allocations (e.g., by NUMA node). Filters are controlled through debugfs
+files in ``/sys/kernel/debug/page_owner_filter/``.
+
+Print Mode Filter
+-----------------
+
+The ``print_mode`` file controls the level of detail in stack trace output.
+
+Available modes:
+
+- ``0`` (default): Print full stack traces
+- ``1``: Print only stack handles
+
+The ``print_mode=1`` output format::
+
+ Page allocated via order 0, mask 0x42800(GFP_NOWAIT|__GFP_COMP),
+ pid 1, tgid 1 (systemd), ts 349667370 ns
+ PFN 0xa00a2 type Unmovable Block 1280 type Unmovable
+ Flags 0x33fffe0000004124(...)
+ handle: 17432583
+
+To retrieve the full stack trace for a handle, use::
+
+ cat /sys/kernel/debug/page_owner_stacks/show_stacks_handles
+
+NUMA Node Filter
+----------------
+
+The ``nid`` file filters pages by NUMA node. This is useful for NUMA-aware
+environments to analyze node-specific memory allocation.
+
+Supported input formats:
+
+- Single node: ``echo "2" > nid``
+- Multiple nodes: ``echo "0,2,3" > nid``
+- Node range: ``echo "0-3" > nid``
+- Mixed format: ``echo "0,2-4,7" > nid``
+- Disable filter: ``echo "-1" > nid``
+
STANDARD FORMAT SPECIFIERS
==========================
::
--
2.20.1
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH v3 4/4] mm/page_owner: document page_owner filter features
2026-04-28 7:11 ` [PATCH v3 4/4] mm/page_owner: document page_owner filter features Zhen Ni
@ 2026-04-29 1:35 ` SeongJae Park
2026-04-29 9:14 ` zhen.ni
0 siblings, 1 reply; 18+ messages in thread
From: SeongJae Park @ 2026-04-29 1:35 UTC (permalink / raw)
To: Zhen Ni
Cc: SeongJae Park, akpm, vbabka, surenb, mhocko, jackmanb, hannes,
ziy, linux-mm, linux-kernel
On Tue, 28 Apr 2026 15:11:12 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> Add documentation for the page_owner filter functionality, including:
> - Print mode filter (full stack vs stack handle)
> - NUMA node filter (single node, multiple nodes, ranges)
> - Usage examples for both filters
>
> Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
> ---
>
> Changes in v3:
> - New patch to document filter features as requested by Andrew Morton
> ---
> Documentation/mm/page_owner.rst | 55 ++++++++++++++++++++++++++++++++-
> 1 file changed, 54 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
> index 6b12f3b007ec..6261366d33fe 100644
> --- a/Documentation/mm/page_owner.rst
> +++ b/Documentation/mm/page_owner.rst
> @@ -74,7 +74,17 @@ Usage
>
> 3) Do the job that you want to debug.
>
> -4) Analyze information from page owner::
> +4) (Optional) Use filters to focus on specific memory allocations::
> +
> + cd /sys/kernel/debug/page_owner_filter
> +
> + # Print only stack handles instead of full traces
> + echo 1 > print_mode
> +
> + # Filter by NUMA nodes
> + echo "0,2-3" > nid
Nit. Other parts of the document use tabs for the indentation, while the above
new snippet is using four spaces. How about using tabs for the consistency?
> +
> +5) Analyze information from page owner::
>
> cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
> cat stacks.txt
> @@ -238,6 +248,49 @@ Usage
> ./page_owner_sort <input> <output> --tgid=1,2,3
> ./page_owner_sort <input> <output> --name name1,name2
>
> +Page Owner Filters
> +==================
> +
> +The page_owner feature provides filtering capabilities to focus on specific
> +memory allocations (e.g., by NUMA node). Filters are controlled through debugfs
> +files in ``/sys/kernel/debug/page_owner_filter/``.
> +
> +Print Mode Filter
> +-----------------
> +
> +The ``print_mode`` file controls the level of detail in stack trace output.
> +
> +Available modes:
> +
> +- ``0`` (default): Print full stack traces
> +- ``1``: Print only stack handles
> +
> +The ``print_mode=1`` output format::
> +
> + Page allocated via order 0, mask 0x42800(GFP_NOWAIT|__GFP_COMP),
> + pid 1, tgid 1 (systemd), ts 349667370 ns
> + PFN 0xa00a2 type Unmovable Block 1280 type Unmovable
> + Flags 0x33fffe0000004124(...)
> + handle: 17432583
Tab for indentation?
> +
> +To retrieve the full stack trace for a handle, use::
> +
> + cat /sys/kernel/debug/page_owner_stacks/show_stacks_handles
Tab?
> +
> +NUMA Node Filter
> +----------------
> +
> +The ``nid`` file filters pages by NUMA node. This is useful for NUMA-aware
> +environments to analyze node-specific memory allocation.
> +
> +Supported input formats:
> +
> +- Single node: ``echo "2" > nid``
> +- Multiple nodes: ``echo "0,2,3" > nid``
> +- Node range: ``echo "0-3" > nid``
> +- Mixed format: ``echo "0,2-4,7" > nid``
> +- Disable filter: ``echo "-1" > nid``
> +
> STANDARD FORMAT SPECIFIERS
> ==========================
> ::
> --
> 2.20.1
Thanks,
SJ
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 4/4] mm/page_owner: document page_owner filter features
2026-04-29 1:35 ` SeongJae Park
@ 2026-04-29 9:14 ` zhen.ni
0 siblings, 0 replies; 18+ messages in thread
From: zhen.ni @ 2026-04-29 9:14 UTC (permalink / raw)
To: SeongJae Park
Cc: akpm, vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
在 2026/4/29 09:35, SeongJae Park 写道:
> On Tue, 28 Apr 2026 15:11:12 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
>
>> Add documentation for the page_owner filter functionality, including:
>> - Print mode filter (full stack vs stack handle)
>> - NUMA node filter (single node, multiple nodes, ranges)
>> - Usage examples for both filters
>>
>> Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
>> ---
>>
>> Changes in v3:
>> - New patch to document filter features as requested by Andrew Morton
>> ---
>> Documentation/mm/page_owner.rst | 55 ++++++++++++++++++++++++++++++++-
>> 1 file changed, 54 insertions(+), 1 deletion(-)
>>
>> diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
>> index 6b12f3b007ec..6261366d33fe 100644
>> --- a/Documentation/mm/page_owner.rst
>> +++ b/Documentation/mm/page_owner.rst
>> @@ -74,7 +74,17 @@ Usage
>>
>> 3) Do the job that you want to debug.
>>
>> -4) Analyze information from page owner::
>> +4) (Optional) Use filters to focus on specific memory allocations::
>> +
>> + cd /sys/kernel/debug/page_owner_filter
>> +
>> + # Print only stack handles instead of full traces
>> + echo 1 > print_mode
>> +
>> + # Filter by NUMA nodes
>> + echo "0,2-3" > nid
>
> Nit. Other parts of the document uses tab for the indentation, while the above
> new snippet is using four spaces. How about using tab for the consistency?
>
>> +
>> +5) Analyze information from page owner::
>>
>> cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
>> cat stacks.txt
>> @@ -238,6 +248,49 @@ Usage
>> ./page_owner_sort <input> <output> --tgid=1,2,3
>> ./page_owner_sort <input> <output> --name name1,name2
>>
>> +Page Owner Filters
>> +==================
>> +
>> +The page_owner feature provides filtering capabilities to focus on specific
>> +memory allocations (e.g., by NUMA node). Filters are controlled through debugfs
>> +files in ``/sys/kernel/debug/page_owner_filter/``.
>> +
>> +Print Mode Filter
>> +-----------------
>> +
>> +The ``print_mode`` file controls the level of detail in stack trace output.
>> +
>> +Available modes:
>> +
>> +- ``0`` (default): Print full stack traces
>> +- ``1``: Print only stack handles
>> +
>> +The ``print_mode=1`` output format::
>> +
>> + Page allocated via order 0, mask 0x42800(GFP_NOWAIT|__GFP_COMP),
>> + pid 1, tgid 1 (systemd), ts 349667370 ns
>> + PFN 0xa00a2 type Unmovable Block 1280 type Unmovable
>> + Flags 0x33fffe0000004124(...)
>> + handle: 17432583
>
> Tab for indentation?
>
>> +
>> +To retrieve the full stack trace for a handle, use::
>> +
>> + cat /sys/kernel/debug/page_owner_stacks/show_stacks_handles
>
> Tab?
>
>> +
>> +NUMA Node Filter
>> +----------------
>> +
>> +The ``nid`` file filters pages by NUMA node. This is useful for NUMA-aware
>> +environments to analyze node-specific memory allocation.
>> +
>> +Supported input formats:
>> +
>> +- Single node: ``echo "2" > nid``
>> +- Multiple nodes: ``echo "0,2,3" > nid``
>> +- Node range: ``echo "0-3" > nid``
>> +- Mixed format: ``echo "0,2-4,7" > nid``
>> +- Disable filter: ``echo "-1" > nid``
>> +
>> STANDARD FORMAT SPECIFIERS
>> ==========================
>> ::
>> --
>> 2.20.1
>
>
> Thanks,
> SJ
>
>
I will fix all instances to use tabs.
Thanks,
Zhen
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v3 0/4] mm/page_owner: add filter infrastructure for print_mode and NUMA filtering
2026-04-28 7:11 [PATCH v3 0/4] mm/page_owner: add filter infrastructure for print_mode and NUMA filtering Zhen Ni
` (3 preceding siblings ...)
2026-04-28 7:11 ` [PATCH v3 4/4] mm/page_owner: document page_owner filter features Zhen Ni
@ 2026-04-28 14:15 ` Andrew Morton
4 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2026-04-28 14:15 UTC (permalink / raw)
To: Zhen Ni
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
On Tue, 28 Apr 2026 15:11:08 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> This patch series introduces filtering capabilities to the page_owner
> feature to address storage and performance challenges in production
> environments.
Thanks, I updated mm.git's mm-new branch to this version.
> Changes from v2:
> - Remove READ_ONCE/WRITE_ONCE for nodemask_t (fixes compilation errors)
> * nodemask_t is a large structure (128 bytes) that triggers compile-time asserts
> * Direct assignment is safe for this use case
> - Add comment explaining input length calculation formula
> * 6 bytes = ",NNNNN" (comma + 5-digit node number)
> - Simplify "-1" check using kstrtoint() instead of dual strcmp()
> - Move nodemask_t mask read outside PFN iteration loop for performance
> * Avoids 128-byte structure copy on each iteration
> - Add documentation for filter features (patch 4/4)
Here's how v3 altered mm.git:
Documentation/mm/page_owner.rst | 55 +++++++++++++++++++++++++++++-
mm/page_owner.c | 14 +++++--
2 files changed, 64 insertions(+), 5 deletions(-)
--- a/Documentation/mm/page_owner.rst~b
+++ a/Documentation/mm/page_owner.rst
@@ -74,7 +74,17 @@ Usage
3) Do the job that you want to debug.
-4) Analyze information from page owner::
+4) (Optional) Use filters to focus on specific memory allocations::
+
+ cd /sys/kernel/debug/page_owner_filter
+
+ # Print only stack handles instead of full traces
+ echo 1 > print_mode
+
+ # Filter by NUMA nodes
+ echo "0,2-3" > nid
+
+5) Analyze information from page owner::
cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
cat stacks.txt
@@ -238,6 +248,49 @@ Usage
./page_owner_sort <input> <output> --tgid=1,2,3
./page_owner_sort <input> <output> --name name1,name2
+Page Owner Filters
+==================
+
+The page_owner feature provides filtering capabilities to focus on specific
+memory allocations (e.g., by NUMA node). Filters are controlled through debugfs
+files in ``/sys/kernel/debug/page_owner_filter/``.
+
+Print Mode Filter
+-----------------
+
+The ``print_mode`` file controls the level of detail in stack trace output.
+
+Available modes:
+
+- ``0`` (default): Print full stack traces
+- ``1``: Print only stack handles
+
+The ``print_mode=1`` output format::
+
+ Page allocated via order 0, mask 0x42800(GFP_NOWAIT|__GFP_COMP),
+ pid 1, tgid 1 (systemd), ts 349667370 ns
+ PFN 0xa00a2 type Unmovable Block 1280 type Unmovable
+ Flags 0x33fffe0000004124(...)
+ handle: 17432583
+
+To retrieve the full stack trace for a handle, use::
+
+ cat /sys/kernel/debug/page_owner_stacks/show_stacks_handles
+
+NUMA Node Filter
+----------------
+
+The ``nid`` file filters pages by NUMA node. This is useful for NUMA-aware
+environments to analyze node-specific memory allocation.
+
+Supported input formats:
+
+- Single node: ``echo "2" > nid``
+- Multiple nodes: ``echo "0,2,3" > nid``
+- Node range: ``echo "0-3" > nid``
+- Mixed format: ``echo "0,2-4,7" > nid``
+- Disable filter: ``echo "-1" > nid``
+
STANDARD FORMAT SPECIFIERS
==========================
::
--- a/mm/page_owner.c~b
+++ a/mm/page_owner.c
@@ -685,6 +685,7 @@ read_page_owner(struct file *file, char
struct page_ext *page_ext;
struct page_owner *page_owner;
depot_stack_handle_t handle;
+ nodemask_t mask;
if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
@@ -698,6 +699,8 @@ read_page_owner(struct file *file, char
while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
pfn++;
+ mask = owner_filter.nid_mask;
+
/* Find an allocated page */
for (; pfn < max_pfn; pfn++) {
/*
@@ -707,7 +710,6 @@ read_page_owner(struct file *file, char
* user through copy_to_user() or GFP_KERNEL allocations.
*/
struct page_owner page_owner_tmp;
- nodemask_t mask;
/*
* If the new page is in a new MAX_ORDER_NR_PAGES area,
@@ -732,7 +734,6 @@ read_page_owner(struct file *file, char
continue;
/* NUMA node filter using bitmask */
- mask = owner_filter.nid_mask;
if (!nodes_empty(mask)) {
int nid = page_to_nid(page);
@@ -1026,8 +1027,13 @@ static ssize_t nid_filter_write(struct f
char *kbuf;
nodemask_t mask;
int ret;
+ int val;
- /* Limit input size to handle worst-case nodelist (all nodes) */
+ /*
+ * Limit input size to handle worst-case nodelist (all nodes).
+ * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes.
+ * Formula: 100 bytes overhead + 6 * MAX_NUMNODES
+ */
if (count > (100 + 6 * MAX_NUMNODES))
return -EINVAL;
@@ -1042,7 +1048,7 @@ static ssize_t nid_filter_write(struct f
kbuf[count] = '\0';
/* Support: "-1" to clear, or nodelist format like "0", "0,2", "0-3" */
- if (strcmp(kbuf, "-1\n") == 0 || strcmp(kbuf, "-1") == 0)
+ if (kstrtoint(kbuf, 10, &val) == 0 && val == -1)
nodes_clear(mask);
else if (nodelist_parse(kbuf, mask)) {
ret = -EINVAL;
_
^ permalink raw reply [flat|nested] 18+ messages in thread