* [RFC PATCH v1.1 10/11] mm/damon/sysfs: split out filters setup function
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Andrew Morton, damon, linux-kernel, linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
damon_sysfs_set_probe() is doing not only probe setup but also filters
setup. Split out filters setup for readability.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
mm/damon/sysfs.c | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 982d824f63c21..f3bb146b204df 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1899,16 +1899,11 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
return damon_set_attrs(ctx, &attrs);
}
-static int damon_sysfs_set_probe(struct damon_probe *probe,
- struct damon_sysfs_probe *sys_probe)
+static int damon_sysfs_set_filters(struct damon_probe *probe,
+ struct damon_sysfs_filters *sys_filters)
{
- struct damon_sysfs_filters *sys_filters;
int i;
- sys_filters = sys_probe->filters;
- if (!sys_filters)
- return 0;
-
for (i = 0; i < sys_filters->nr; i++) {
struct damon_sysfs_filter *sys_filter =
sys_filters->filters_arr[i];
@@ -1935,6 +1930,17 @@ static int damon_sysfs_set_probe(struct damon_probe *probe,
return 0;
}
+static int damon_sysfs_set_probe(struct damon_probe *probe,
+ struct damon_sysfs_probe *sys_probe)
+{
+ struct damon_sysfs_filters *sys_filters;
+
+ sys_filters = sys_probe->filters;
+ if (!sys_filters)
+ return 0;
+ return damon_sysfs_set_filters(probe, sys_filters);
+}
+
static int damon_sysfs_set_probes(struct damon_ctx *ctx,
struct damon_sysfs_probes *sys_probes)
{
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 09/11] mm/damon/sysfs: split probe setup function out
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Andrew Morton, damon, linux-kernel, linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
damon_sysfs_set_probes() function is relatively long. It has two nested
loops for setting two nested entities, namely probe and filter. Split
out the probe level setup for readability.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
mm/damon/sysfs.c | 80 ++++++++++++++++++++++++++++--------------------
1 file changed, 46 insertions(+), 34 deletions(-)
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 2e95e3bac774d..982d824f63c21 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1899,47 +1899,59 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
return damon_set_attrs(ctx, &attrs);
}
-static int damon_sysfs_set_probes(struct damon_ctx *ctx,
- struct damon_sysfs_probes *sys_probes)
+static int damon_sysfs_set_probe(struct damon_probe *probe,
+ struct damon_sysfs_probe *sys_probe)
{
+ struct damon_sysfs_filters *sys_filters;
int i;
- for (i = 0; i < sys_probes->nr; i++) {
- struct damon_sysfs_filters *sys_filters =
- sys_probes->probes_arr[i]->filters;
- struct damon_probe *c;
- int j;
+ sys_filters = sys_probe->filters;
+ if (!sys_filters)
+ return 0;
- if (!sys_filters)
- continue;
- c = damon_new_probe();
- if (!c)
+ for (i = 0; i < sys_filters->nr; i++) {
+ struct damon_sysfs_filter *sys_filter =
+ sys_filters->filters_arr[i];
+ struct damon_filter *filter;
+
+ filter = damon_new_filter(sys_filter->type,
+ sys_filter->matching,
+ sys_filter->allow);
+ if (!filter)
return -ENOMEM;
- damon_add_probe(ctx, c);
-
- for (j = 0; j < sys_filters->nr; j++) {
- struct damon_sysfs_filter *sys_filter =
- sys_filters->filters_arr[j];
- struct damon_filter *filter;
-
- filter = damon_new_filter(sys_filter->type,
- sys_filter->matching,
- sys_filter->allow);
- if (!filter)
- return -ENOMEM;
- if (filter->type == DAMON_FILTER_TYPE_MEMCG) {
- int err;
-
- err = damon_sysfs_memcg_path_to_id(
- sys_filter->path,
- &filter->memcg_id);
- if (err) {
- damon_destroy_filter(filter);
- return err;
- }
+ if (filter->type == DAMON_FILTER_TYPE_MEMCG) {
+ int err;
+
+ err = damon_sysfs_memcg_path_to_id(
+ sys_filter->path,
+ &filter->memcg_id);
+ if (err) {
+ damon_destroy_filter(filter);
+ return err;
}
- damon_add_filter(c, filter);
}
+ damon_add_filter(probe, filter);
+ }
+ return 0;
+}
+
+static int damon_sysfs_set_probes(struct damon_ctx *ctx,
+ struct damon_sysfs_probes *sys_probes)
+{
+ int i, err;
+
+ for (i = 0; i < sys_probes->nr; i++) {
+ struct damon_sysfs_probe *sys_probe;
+ struct damon_probe *p;
+
+ p = damon_new_probe();
+ if (!p)
+ return -ENOMEM;
+ damon_add_probe(ctx, p);
+ sys_probe = sys_probes->probes_arr[i];
+ err = damon_sysfs_set_probe(p, sys_probe);
+ if (err)
+ return err;
}
return 0;
}
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 08/11] mm/damon/core: reduce range setup in damon_commit_target_regions()
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Andrew Morton, damon, linux-kernel, linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
damon_commit_target_regions() calls damon_set_regions() for updating the
destination target's monitoring target region boundaries. It sets the
boundaries same to source target's monitoring regions, even if they are
adjacent. Meanwhile, damon_set_regions() sets the destination target
regions exactly the same to the source, only when the target regions are
empty. When there are existing target regions, only a few regions are
expanded or shrunk to fit on only the boundaries for disjoint regions in
the source. Hence the adjacent source ranges mean nothing in common
cases. When there are many regions, such adjacent range setup is only a
waste of time and space. We recently found [1] it is actually causing
memory overhead. Set up the ranges for only distinct ranges.
[1] https://lore.kernel.org/20260603112306.58490-1-akinobu.mita@gmail.com
Signed-off-by: SeongJae Park <sj@kernel.org>
---
mm/damon/core.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 7e4b9affc5b06..ce5294cb1b4f3 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1349,21 +1349,33 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx)
static int damon_commit_target_regions(struct damon_target *dst,
struct damon_target *src, unsigned long src_min_region_sz)
{
- struct damon_region *src_region;
+ struct damon_region *src_region, *prev = NULL;
struct damon_addr_range *ranges;
int i = 0, err;
- damon_for_each_region(src_region, src)
- i++;
+ damon_for_each_region(src_region, src) {
+ if (!prev || prev->ar.end != src_region->ar.start)
+ i++;
+ prev = src_region;
+ }
if (!i)
return 0;
ranges = kmalloc_objs(*ranges, i, GFP_KERNEL | __GFP_NOWARN);
if (!ranges)
return -ENOMEM;
+ prev = NULL;
i = 0;
- damon_for_each_region(src_region, src)
- ranges[i++] = src_region->ar;
+ damon_for_each_region(src_region, src) {
+ if (!prev) {
+ ranges[i].start = src_region->ar.start;
+ } else if (prev->ar.end != src_region->ar.start) {
+ ranges[i++].end = prev->ar.end;
+ ranges[i].start = src_region->ar.start;
+ }
+ prev = src_region;
+ }
+ ranges[i++].end = damon_last_region(src)->ar.end;
err = damon_set_regions(dst, ranges, i, src_min_region_sz);
kfree(ranges);
return err;
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 07/11] selftests/damon/sysfs.sh: test all files in quota goal dir
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Shuah Khan, damon, linux-kernel, linux-kselftest,
linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
DAMON sysfs interface for DAMOS quota has been extended quite a bit since
its initial introduction. The test case for that in DAMON sysfs interface
essential file operations test (sysfs.sh) has not been extended accordingly,
though. Extend the test case to test all existing files.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
tools/testing/selftests/damon/sysfs.sh | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index ffa8413b5ab3d..15fb9df928818 100755
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -199,6 +199,20 @@ test_goal()
ensure_dir "$goal_dir" "exist"
ensure_file "$goal_dir/target_value" "exist" "600"
ensure_file "$goal_dir/current_value" "exist" "600"
+ ensure_file "$goal_dir/target_metric" "exist" "600"
+ local fpath="$goal_dir/target_metric"
+ ensure_write_succ "$fpath" "user_input" "valid input"
+ ensure_write_succ "$fpath" "some_mem_psi_us" "valid input"
+ ensure_write_succ "$fpath" "node_mem_used_bp" "valid input"
+ ensure_write_succ "$fpath" "node_mem_free_bp" "valid input"
+ ensure_write_succ "$fpath" "node_memcg_used_bp" "valid input"
+ ensure_write_succ "$fpath" "node_memcg_free_bp" "valid input"
+ ensure_write_succ "$fpath" "active_mem_bp" "valid input"
+ ensure_write_succ "$fpath" "inactive_mem_bp" "valid input"
+ ensure_write_succ "$fpath" "node_eligible_mem_bp" "valid input"
+ ensure_write_fail "$fpath" "foo" "invalid input"
+ ensure_file "$goal_dir/nid" "exist" "600"
+ ensure_file "$goal_dir/path" "exist" "600"
}
test_goals()
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 06/11] selftests/damon/sysfs.sh: test dests dir
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Shuah Khan, damon, linux-kernel, linux-kselftest,
linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
DAMON sysfs interface essential file operations test (sysfs.sh) is
not testing DAMOS dests/ directory. Add the test.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
tools/testing/selftests/damon/sysfs.sh | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index 07a33995be852..ffa8413b5ab3d 100755
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -99,6 +99,29 @@ test_stats()
done
}
+test_dest()
+{
+ dest_dir=$1
+ ensure_file "$dest_dir/id" "exist"
+ ensure_file "$dest_dir/weight" "exist"
+}
+
+test_dests()
+{
+ dests_dir=$1
+ ensure_file "$dests_dir/nr_dests" "exist" "600"
+ ensure_write_succ "$dests_dir/nr_dests" "1" "valid input"
+ test_dest "$dests_dir/0"
+
+ ensure_write_succ "$dests_dir/nr_dests" "2" "valid input"
+ test_dest "$dests_dir/0"
+ test_dest "$dests_dir/1"
+
+ ensure_write_succ "$dests_dir/nr_dests" "0" "valid input"
+ ensure_dir "$dests_dir/0" "not_exist"
+ ensure_dir "$dests_dir/1" "not_exist"
+}
+
test_filter()
{
filter_dir=$1
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 05/11] selftests/damon/sysfs.sh: test {core,ops}_filters/ directories
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Shuah Khan, damon, linux-kernel, linux-kselftest,
linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
DAMON sysfs interface essential file operations test (sysfs.sh) is not
testing DAMOS {core,ops}_filters directories. Add the tests.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
tools/testing/selftests/damon/sysfs.sh | 28 ++++++++++++++++++++++----
1 file changed, 24 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index 0f2ef462a6b6a..07a33995be852 100755
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -103,10 +103,28 @@ test_filter()
{
filter_dir=$1
ensure_file "$filter_dir/type" "exist" "600"
- ensure_write_succ "$filter_dir/type" "anon" "valid input"
- ensure_write_succ "$filter_dir/type" "memcg" "valid input"
- ensure_write_succ "$filter_dir/type" "addr" "valid input"
- ensure_write_succ "$filter_dir/type" "target" "valid input"
+
+ local dir_name=$(basename "$(dirname "$filter_dir")")
+ if [ "$dir_name" = "filters" ] || [ "$dir_name" = "ops_filters" ]
+ then
+ ensure_write_succ "$filter_dir/type" "anon" "valid input"
+ ensure_write_succ "$filter_dir/type" "memcg" "valid input"
+ fi
+ if [ "$dir_name" = "filters" ] || [ "$dir_name" = "core_filters" ]
+ then
+ ensure_write_succ "$filter_dir/type" "addr" "valid input"
+ ensure_write_succ "$filter_dir/type" "target" "valid input"
+ fi
+ if [ "$dir_name" = "core_filters" ]
+ then
+ ensure_write_fail "$filter_dir/type" "anon" "ops type"
+ ensure_write_fail "$filter_dir/type" "memcg" "ops type"
+ fi
+ if [ "$dir_name" = "ops_filters" ]
+ then
+ ensure_write_fail "$filter_dir/type" "addr" "core type"
+ ensure_write_fail "$filter_dir/type" "target" "core type"
+ fi
ensure_write_fail "$filter_dir/type" "foo" "invalid input"
ensure_file "$filter_dir/matching" "exist" "600"
ensure_file "$filter_dir/memcg_path" "exist" "600"
@@ -208,6 +226,8 @@ test_scheme()
test_quotas "$scheme_dir/quotas"
test_watermarks "$scheme_dir/watermarks"
test_filters "$scheme_dir/filters"
+ test_filters "$scheme_dir/core_filters"
+ test_filters "$scheme_dir/ops_filters"
test_stats "$scheme_dir/stats"
test_tried_regions "$scheme_dir/tried_regions"
}
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 04/11] selftests/damon/sysfs.sh: test multiple probe dirs creation
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Shuah Khan, damon, linux-kernel, linux-kselftest,
linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
DAMON sysfs essential file operations test (sysfs.sh) was extended to
test DAMON probes sysfs directory, by commit 14885da09b0f
("selftests/damon/sysfs.sh: test probes dir"). Unlike other DAMON sysfs
files, it is testing only a single directory case. Extend it for
multiple directories.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
tools/testing/selftests/damon/sysfs.sh | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index 78f4badb5bebb..0f2ef462a6b6a 100755
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -346,8 +346,13 @@ test_probes()
ensure_write_succ "$probes_dir/nr_probes" "1" "valid input"
test_probe "$probes_dir/0"
+ ensure_write_succ "$probes_dir/nr_probes" "2" "valid input"
+ test_probe "$probes_dir/0"
+ test_probe "$probes_dir/1"
+
ensure_write_succ "$probes_dir/nr_probes" "0" "valid input"
ensure_dir "$probes_dir/0" "not_exist"
+ ensure_dir "$probes_dir/1" "not_exist"
}
test_monitoring_attrs()
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 03/11] mm/damon/tests/core-kunit: test damon_rand()
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Andrew Morton, Brendan Higgins, David Gow, damon,
kunit-dev, linux-kernel, linux-kselftest, linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
Commit 9012c4e647df ("mm/damon: replace damon_rand() with a per-ctx
lockless PRNG") optimized DAMON for better performance. Add a kunit
test for ensuring the pseudo randomness quality.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
mm/damon/tests/core-kunit.h | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 1cfb8c176b873..756f3b9e2ed3b 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -1460,6 +1460,26 @@ static void damon_test_is_last_region(struct kunit *test)
damon_free_target(t);
}
+static void damon_test_rand(struct kunit *test)
+{
+ struct damon_ctx ctx;
+ int counts[10] = {};
+ int i;
+
+ prandom_seed_state(&ctx.rnd_state, get_random_u64());
+ for (i = 0; i < 10000; i++) {
+ unsigned long rnd = damon_rand(&ctx, 0, 10);
+
+ KUNIT_EXPECT_GE(test, rnd, 0);
+ KUNIT_EXPECT_LE(test, rnd, 9);
+ counts[rnd]++;
+ }
+ for (i = 0; i < 10; i++) {
+ KUNIT_EXPECT_GE(test, counts[i], 900);
+ KUNIT_EXPECT_LE(test, counts[i], 1100);
+ }
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -1489,6 +1509,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_filters_default_reject),
KUNIT_CASE(damon_test_apply_min_nr_regions),
KUNIT_CASE(damon_test_is_last_region),
+ KUNIT_CASE(damon_test_rand),
{},
};
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 02/11] Docs/ABI/damon: document probe files
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Liam R. Howlett, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Michal Hocko, Mike Rapoport, Suren Baghdasaryan,
Vlastimil Babka, damon, linux-kernel, linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
DAMON ABI document is not updated for the DAMON probe sysfs files.
Update.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
.../ABI/testing/sysfs-kernel-mm-damon | 40 +++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index b73e6bc28ea5f..2815f6bc05c18 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -157,6 +157,46 @@ Description: Writing a value to this file sets the maximum number of
monitoring regions of the DAMON context as the value. Reading
this file returns the value.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/probes/nr_probes
+Date: May 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a number 'N' to this file creates the number of
+ directories for each DAMON probe named '0' to 'N-1' under the
+ probes/ directory.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/probes/<P>/filters/nr_filters
+Date: May 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a number 'N' to this file creates the number of
+ directories for each DAMON probe filter named '0' to 'N-1'
+ under the filters/ directory.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/probes/<P>/filters/<F>/type
+Date: May 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the type of
+ the memory of the interest.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/probes/<P>/filters/<F>/path
+Date: May 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: If 'memcg' is written to the 'type' file, writing to and
+ reading from this file sets and gets the path to the memory
+ cgroup of the interest.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/probes/<P>/filters/<F>/matching
+Date: May 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing 'Y' or 'N' to this file sets whether the filter is for
+ the memory of the 'type', or all except the 'type'.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/probes/<P>/filters/<F>/allow
+Date: May 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing 'Y' or 'N' to this file sets whether to allow or reject
+ hitting the probe for the memory that satisfies the 'type' and
+ the 'matching' of the directory.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/nr_targets
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 01/11] Docs/mm/damon/design: update for DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Liam R. Howlett, Andrew Morton, David Hildenbrand,
Jonathan Corbet, Lorenzo Stoakes, Michal Hocko, Mike Rapoport,
Shuah Khan, Suren Baghdasaryan, Vlastimil Babka, damon, linux-doc,
linux-kernel, linux-mm
In-Reply-To: <20260625050756.91115-1-sj@kernel.org>
Commit 9138e27a3bc3 ("mm/damon: add node_eligible_mem_bp goal metric")
introduced DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP but forgot to update the
DAMON design document for that. Update.
Signed-off-by: SeongJae Park <sj@kernel.org>
---
Documentation/mm/damon/design.rst | 2 ++
1 file changed, 2 insertions(+)
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 2da7ca0d3d17a..1ed02f2280790 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -686,6 +686,8 @@ mechanism tries to make ``current_value`` of ``target_metric`` be same to
(1/10,000).
- ``inactive_mem_bp``: Inactive to active + inactive (LRU) memory size ratio in
bp (1/10,000).
+- ``node_eligible_mem_bp``: Scheme target access pattern-eligible memory ratio
+ of a node in bp (1/10,000).
``nid`` is optionally required for only ``node_mem_used_bp``,
``node_mem_free_bp``, ``node_memcg_used_bp`` and ``node_memcg_free_bp`` to
--
2.47.3
^ permalink raw reply related
* [RFC PATCH v1.1 00/11] mm/damon: update, optimize, and clean up doc, tests, and code
From: SeongJae Park @ 2026-06-25 5:07 UTC (permalink / raw)
Cc: SeongJae Park, Liam R. Howlett, Andrew Morton, Brendan Higgins,
David Gow, David Hildenbrand, Jonathan Corbet, Lorenzo Stoakes,
Michal Hocko, Mike Rapoport, Shuah Khan, Shuah Khan,
Suren Baghdasaryan, Vlastimil Babka, damon, kunit-dev, linux-doc,
linux-kernel, linux-kselftest, linux-mm
Patches 1 and 2 update the design and ABI documents for recently added
DAMON features. Patches 3-7 add or update more unit and self tests for
DAMON to cover recently changed or added functions and sysfs files.
Patch 8 optimizes damon_commit_target_regions() to skip unnecessary
adjacent ranges setup. Patches 9-11 clean and fix up recently added
DAMON sysfs interface code for readability.
Changes from RFC
- RFC: https://lore.kernel.org/20260624142008.87180-1-sj@kernel.org
- Rebase directly to latest mm-new.
SeongJae Park (11):
Docs/mm/damon/design: update for DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP
Docs/ABI/damon: document probe files
mm/damon/tests/core-kunit: test damon_rand()
selftests/damon/sysfs.sh: test multiple probe dirs creation
selftests/damon/sysfs.sh: test {core,ops}_filters/ directories
selftests/damon/sysfs.sh: test dests dir
selftests/damon/sysfs.sh: test all files in quota goal dir
mm/damon/core: reduce range setup in damon_commit_target_regions()
mm/damon/sysfs: split probe setup function out
mm/damon/sysfs: split out filters setup function
mm/damon/sysfs: fix typos in probe_{add,rm}_dirs: s/attr/probe/
.../ABI/testing/sysfs-kernel-mm-damon | 40 +++++++
Documentation/mm/damon/design.rst | 2 +
mm/damon/core.c | 22 +++-
mm/damon/sysfs.c | 102 ++++++++++--------
mm/damon/tests/core-kunit.h | 21 ++++
tools/testing/selftests/damon/sysfs.sh | 70 +++++++++++-
6 files changed, 206 insertions(+), 51 deletions(-)
base-commit: 09ff70563340c38d31012044b9c6c18f225f4fbf
--
2.47.3
^ permalink raw reply
* Re: [PATCH] mm/rmap: use huge_ptep_get() in try_to_unmap_one()
From: Dev Jain @ 2026-06-25 5:06 UTC (permalink / raw)
To: Andrew Morton
Cc: david, ljs, riel, liam, vbabka, harry, jannh, kas, linux-mm,
linux-kernel, ryan.roberts, anshuman.khandual, stable
In-Reply-To: <20260624214230.4aff522ec8fa4a8f1607c942@linux-foundation.org>
On 25/06/26 10:12 am, Andrew Morton wrote:
> On Thu, 25 Jun 2026 04:28:51 +0000 Dev Jain <dev.jain@arm.com> wrote:
>
>> try_to_unmap_one() handles hugetlb folios when memory failure needs
>> to replace a poisoned hugetlb mapping with a hwpoison entry. In that
>> case page_vma_mapped_walk() returns the hugetlb entry in pvmw.pte, but
>> the code reads it with ptep_get() before decoding the PFN.
>>
>> That is wrong on architectures where hugetlb entries are not encoded as
>> regular PTEs. On s390, for example, a raw huge RSTE must be converted
>> by huge_ptep_get() before helpers such as pte_pfn() can inspect it. A
>> raw decode can select the wrong subpage, so try_to_unmap_one() can
>> install a hwpoison entry for the wrong PFN.
>>
>> The userspace-visible result is that a later access to the poisoned
>> hugetlb subpage can miss the expected SIGBUS. With DEBUG_VM, the wrong
>> subpage can also trip the PageHWPoison check.
>>
>> Use huge_ptep_get() for hugetlb mappings before decoding the PFN.
>>
>> Before c7ab0d2fdc84, the bug existed in the form of a plain dereference:
>> we would check the head page pfn of the hugetlb with pte_pfn(*pte), and
>> bail out on mismatch. This would mean that the hwpoisoned entry will not
>> get installed.
>>
>> I am not sure what is the procedure on such kinds of very old bugs - how
>> back should I really go?
>
> I think 9 years is enough ;)
>
>> There are similar old bugs present, in try_to_migrate_one(), check_pte(),
>> remove_migration_pte(), prot_none_hugetlb_entry().
>
> Why now? Was there some more recent (s390?) change which exposed this?
I was refactoring the hugetlb bits in try_to_unmap_one, so the bug got
caught in review by David (which reminds me to put a "Reported-by" tag
on this patch).
I guess if someone would run hugetlb-read-hwpoison.c on s390, this would
be caught. Turns out, this selftest is in a category of "destructive tests"
in run_vmtests.sh, so ./run_vmtests.sh or even ./run_vmtests.sh -a won't
run this. We are supposed to run this with ./run_vmtests.sh -d, and that
option was broken until one month ago, see 3432cbb291aa. So essentially
no one has been running that test.
>
^ permalink raw reply
* Re: [PATCH] mm/page_vma_mapped: guard check_pmd() with CONFIG_TRANSPARENT_HUGEPAGE
From: Andrew Morton @ 2026-06-25 4:59 UTC (permalink / raw)
To: Wei Yang
Cc: david, ljs, riel, liam, vbabka, harry, jannh, willy, linux-mm,
linux-kernel, lance.yang
In-Reply-To: <20260625034629.nmgdwl2c4luwod77@master>
On Thu, 25 Jun 2026 03:46:29 +0000 Wei Yang <richard.weiyang@gmail.com> wrote:
> >Sashiko had an off-topic complaint about the surrounding code:
> > https://lore.kernel.org/oe-kbuild-all/202606240042.ffPsEXVc-lkp@intel.com/
>
> I see this robot reply, but not see the Sashiko comment.
>
> How can I view Sashiko's comment?
oop sorry.
You can go to https://sashiko.dev/ and search for the email subject.
Or append your Message-ID to "https://sashiko.dev/#/patchset":
https://sashiko.dev/#/patchset/20260624082359.2869-1-richard.weiyang@gmail.com
^ permalink raw reply
* Re: [PATCH v11 0/4] mm/page_owner: add per-fd filter infrastructure for print_mode and NUMA filtering
From: Andrew Morton @ 2026-06-25 4:55 UTC (permalink / raw)
To: Zhen Ni
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel, Yichong Chen, Ye Liu
In-Reply-To: <20260625043101.338794-1-zhen.ni@easystack.cn>
On Thu, 25 Jun 2026 12:30:57 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> This patch series introduces per-file-descriptor filtering capabilities to the
> page_owner feature.
Well, I assume this work was inspired by your own operational
experience with page_owner. There's no better inspiration than this!
Review is thin (absent) at v11. This is typical with page_owner
changes :(. I'll add the series for testing while interested people
check over it (please).
AI review might have found a few things which you might choose to
address. Please check it out:
https://sashiko.dev/#/patchset/20260625043101.338794-1-zhen.ni@easystack.cn
^ permalink raw reply
* Re: [PATCH] mm/page_alloc: bypass watermark when PCP has pages for allocation
From: Harry Yoo @ 2026-06-25 4:51 UTC (permalink / raw)
To: suhua, akpm, vbabka, surenb, mhocko, jackmanb, hannes, ziy; +Cc: linux-mm
In-Reply-To: <20260623132917.104234-1-suhua.tanke@gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 2936 bytes --]
On 6/23/26 10:29 PM, suhua wrote:
> NR_FREE_PAGES only tracks buddy system pages, not pages cached in
> per-cpu (PCP) lists. The watermark check in get_page_from_freelist()
> uses zone_page_state(z, NR_FREE_PAGES), so PCP pages are invisible
> to it. When the watermark check fails, the allocator falls through
> to the slowpath (reclaim/compaction/OOM) even though PCP may still
> have cached pages that can satisfy the allocation directly.
When you hit the watermark, the system is low on memory.
Why do you think it's reasonable to ignore that and allocate memory?
> Add pcp_has_pages() which checks if the current CPU's PCP list has
> pages for the requested order and migratetype. If so, skip the
> watermark check and jump to rmqueue(), which will try PCP first via
> rmqueue_pcplist(). This is more direct than draining PCP to buddy:
> no page movement overhead, just a list_empty check.
You shouldn't bypass the watermark check just because you can.
--
Cheers,
Harry / Hyeonggon
> Signed-off-by: suhua <suhua.tanke@gmail.com>
> ---
> mm/page_alloc.c | 36 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 36 insertions(+)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index ee902a468c2f..70f5d0dc2485 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3784,6 +3784,32 @@ static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
> return alloc_flags;
> }
>
> +/*
> + * Check if the current CPU's PCP list has pages for the given order and
> + * migratetype. This is useful when watermark checks fail but PCP may
> + * still have cached pages that can satisfy the allocation directly.
> + */
> +static bool pcp_has_pages(struct zone *zone, unsigned int order,
> + int migratetype)
> +{
> + struct per_cpu_pages *pcp;
> + bool has_pages;
> + int pindex;
> +
> + if (!pcp_allowed_order(order))
> + return false;
> +
> + pcp = pcp_spin_trylock(zone->per_cpu_pageset);
> + if (!pcp)
> + return false;
> +
> + pindex = order_to_pindex(migratetype, order);
> + has_pages = !list_empty(&pcp->lists[pindex]);
> + pcp_spin_unlock(pcp);
> +
> + return has_pages;
> +}
> +
> /*
> * get_page_from_freelist goes through the zonelist trying to allocate
> * a page.
> @@ -3912,6 +3938,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
> if (_deferred_grow_zone(zone, order))
> goto try_this_zone;
> }
> +
> + /*
> + * NR_FREE_PAGES does not account for PCP pages.
> + * If PCP has cached pages for this order and
> + * migratetype, skip watermark and let rmqueue
> + * allocate directly from PCP.
> + */
> + if (pcp_has_pages(zone, order, ac->migratetype))
> + goto try_this_zone;
> +
> /* Checked here to keep the fast path fast */
> BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
> if (alloc_flags & ALLOC_NO_WATERMARKS)
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
* Re: [PATCH v11 3/4] tools/mm: add page_owner_filter userspace tool
From: Andrew Morton @ 2026-06-25 4:50 UTC (permalink / raw)
To: Zhen Ni
Cc: vbabka, surenb, mhocko, jackmanb, hannes, ziy, linux-mm,
linux-kernel
In-Reply-To: <20260625043101.338794-4-zhen.ni@easystack.cn>
On Thu, 25 Jun 2026 12:31:00 +0800 Zhen Ni <zhen.ni@easystack.cn> wrote:
> Add a userspace filtering tool for page_owner that supports per-fd
> filtering with print_mode and NUMA node filters.
Could this functionality have been built into page_owner_sort.c, or
does that not make any sense?
^ permalink raw reply
* Re: [PATCH] mm/rmap: use huge_ptep_get() in try_to_unmap_one()
From: Andrew Morton @ 2026-06-25 4:42 UTC (permalink / raw)
To: Dev Jain
Cc: david, ljs, riel, liam, vbabka, harry, jannh, kas, linux-mm,
linux-kernel, ryan.roberts, anshuman.khandual, stable
In-Reply-To: <20260625042853.2752898-1-dev.jain@arm.com>
On Thu, 25 Jun 2026 04:28:51 +0000 Dev Jain <dev.jain@arm.com> wrote:
> try_to_unmap_one() handles hugetlb folios when memory failure needs
> to replace a poisoned hugetlb mapping with a hwpoison entry. In that
> case page_vma_mapped_walk() returns the hugetlb entry in pvmw.pte, but
> the code reads it with ptep_get() before decoding the PFN.
>
> That is wrong on architectures where hugetlb entries are not encoded as
> regular PTEs. On s390, for example, a raw huge RSTE must be converted
> by huge_ptep_get() before helpers such as pte_pfn() can inspect it. A
> raw decode can select the wrong subpage, so try_to_unmap_one() can
> install a hwpoison entry for the wrong PFN.
>
> The userspace-visible result is that a later access to the poisoned
> hugetlb subpage can miss the expected SIGBUS. With DEBUG_VM, the wrong
> subpage can also trip the PageHWPoison check.
>
> Use huge_ptep_get() for hugetlb mappings before decoding the PFN.
>
> Before c7ab0d2fdc84, the bug existed in the form of a plain dereference:
> we would check the head page pfn of the hugetlb with pte_pfn(*pte), and
> bail out on mismatch. This would mean that the hwpoisoned entry will not
> get installed.
>
> I am not sure what the procedure is for such very old bugs - how
> far back should I really go?
I think 9 years is enough ;)
> There are similar old bugs present, in try_to_migrate_one(), check_pte(),
> remove_migration_pte(), prot_none_hugetlb_entry().
Why now? Was there some more recent (s390?) change which exposed this?
^ permalink raw reply
* Re: [PATCH v3] mm/slub: deduplicate NUMA policy calculation in allocation paths
From: Harry Yoo @ 2026-06-25 4:35 UTC (permalink / raw)
To: Hao Li, vbabka; +Cc: akpm, cl, rientjes, roman.gushchin, linux-mm, linux-kernel
In-Reply-To: <20260624100320.430115-1-hao.li@linux.dev>
[-- Attachment #1.1: Type: text/plain, Size: 1864 bytes --]
On 6/24/26 7:00 PM, Hao Li wrote:
> Currently, alloc_from_pcs() and __slab_alloc_node() both calculate the
> NUMA policy independently. Since they are called consecutively in paths
> like __kmalloc_nolock_noprof() and slab_alloc_node(), this leads to
> redundant code snippets.
>
> Introduce a helper function to resolve the NUMA policy once, eliminating
> the duplicated code and reducing execution overhead.
>
> Also remove __slab_alloc_node() function because it is almost empty.
> The callers of __slab_alloc_node now call ___slab_alloc() directly.
>
> Additional notes:
>
> Previously, when slab_strict_numa was enabled, alloc_from_pcs() and
> __slab_alloc_node() could each resolve the task mempolicy, so
> MPOL_INTERLEAVE or MPOL_WEIGHTED_INTERLEAVE could advance the
> interleave state twice for a single object allocation attempt.
> And each retry will also advance the interleave state.
>
> With this change, the strict NUMA node is resolved once and reused by
> both alloc_from_pcs() and ___slab_alloc() in each retry.
>
> This is a behavior change, but it better matches the intent of
> selecting one policy node for one allocation attempt.
>
> Signed-off-by: Hao Li <hao.li@linux.dev>
> ---
> Changes in v3:
> * Move apply_strict_numa_policy before retry label to simplify code (Thanks
> Harry)
>
> Changes in v2:
> * Use a better function name apply_strict_numa_policy() (Thanks Harry)
> * Remove almost empty function __slab_alloc_node.
> * Add a local variable, strict_node, so the retry path in
> __kmalloc_nolock_noprof() computes the strict NUMA node from the original
> node parameter instead of a previously resolved node value.
> ---
Looks good to me,
Reviewed-by: Harry Yoo (Oracle) <harry@kernel.org>
Thanks!
--
Cheers,
Harry / Hyeonggon
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
* [PATCH v11 4/4] mm/page_owner: document page_owner filter
From: Zhen Ni @ 2026-06-25 4:31 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
In-Reply-To: <20260625043101.338794-1-zhen.ni@easystack.cn>
Add documentation for the page_owner_filter userspace tool and
kernel-level filtering features.
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Changes in v11:
- No changes
Changes in v10:
- No changes
Changes in v9:
- No changes
Changes in v8:
- Fix Sphinx double colon warning
Changes in v7:
- document for per-file-descriptor implementation
Changes in v6:
- No code changes
Changes in v5:
- No code changes
Changes in v4:
- Update print_mode documentation to reflect string-based interface
* Change from "0/1" to "full_stack"/"stack_handle"
* Add bracket notation example: "[full_stack] stack_handle"
- Update NUMA filter documentation
* Remove "-1" example
* Add empty string as clear method
- Fix indentation: use tabs instead of spaces in code examples
Changes in v3:
- New patch to document filter features as requested by Andrew Morton
v10: https://lore.kernel.org/linux-mm/20260618035750.3724613-5-zhen.ni@easystack.cn/
v9: https://lore.kernel.org/linux-mm/20260525081652.2210206-5-zhen.ni@easystack.cn/
v8: https://lore.kernel.org/linux-mm/20260520075641.1931080-5-zhen.ni@easystack.cn/
v7: https://lore.kernel.org/linux-mm/20260515091942.1535677-5-zhen.ni@easystack.cn/
v6: https://lore.kernel.org/linux-mm/20260511033017.747781-4-zhen.ni@easystack.cn/
v5: https://lore.kernel.org/linux-mm/20260507064643.179187-4-zhen.ni@easystack.cn/
v4: https://lore.kernel.org/linux-mm/20260430163247.13628-4-zhen.ni@easystack.cn/
v3: https://lore.kernel.org/linux-mm/20260428071112.1420380-5-zhen.ni@easystack.cn/
---
Documentation/mm/page_owner.rst | 77 ++++++++++++++++++++++++++++++++-
1 file changed, 75 insertions(+), 2 deletions(-)
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
index 6b12f3b007ec..383e59c42743 100644
--- a/Documentation/mm/page_owner.rst
+++ b/Documentation/mm/page_owner.rst
@@ -65,7 +65,14 @@ un-tracking state.
Usage
=====
-1) Build user-space helper::
+1) Build user-space helpers:
+
+To filter page_owner output::
+
+ cd tools/mm
+ make page_owner_filter
+
+To sort and analyze page_owner output::
cd tools/mm
make page_owner_sort
@@ -74,7 +81,11 @@ Usage
3) Do the job that you want to debug.
-4) Analyze information from page owner::
+4) (Optional) Filter page_owner output::
+
+ ./page_owner_filter -m handle -n 0,1,2 > filtered_page_owner.txt
+
+5) Analyze information from page owner::
cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
cat stacks.txt
@@ -263,3 +274,65 @@ STANDARD FORMAT SPECIFIERS
f free whether the page has been released or not
st stacktrace stack trace of the page allocation
ator allocator memory allocator for pages
+
+Filtering page_owner output
+============================
+
+page_owner supports filtering output at the kernel level before reading,
+which reduces the amount of data that needs to be processed in userspace.
+
+The page_owner_filter tool provides a convenient interface for this filtering
+capability. It supports two types of filters:
+
+1. **print_mode filter**: Control what information is printed for each page
+ - ``stack``: Print full stack traces (default, compatible with existing usage)
+ - ``handle``: Print only stack handle numbers (much faster, smaller output)
+ - ``stack_handle``: Print both stack traces and handle numbers
+
+ The ``handle`` mode uses numeric identifiers instead of full stack traces.
+ The mapping from handles to actual stack traces can be obtained via the
+ show_stacks_handles interface.
+
+2. **NUMA node filter**: Filter pages by NUMA node ID
+ - Supports single node: ``-n 0``
+ - Multiple nodes: ``-n 0,1,2``
+ - Ranges: ``-n 0-3``
+ - Mixed format: ``-n 0,2-3,5``
+
+Usage examples::
+
+ # Filter by print mode
+ ./page_owner_filter -m handle
+ ./page_owner_filter -m stack_handle
+
+ # Filter by NUMA node
+ ./page_owner_filter -n 0
+ ./page_owner_filter -n 0-3
+
+ # Combined filters
+ ./page_owner_filter -m stack -n 0,1,2
+ ./page_owner_filter -m handle -n 0,2-3
+
+ # Save to file
+ ./page_owner_filter -m handle -o filtered_output.txt
+
+The handle mode is particularly useful for monitoring and performance-critical
+scenarios as it dramatically reduces output size. Testing shows handle mode can
+reduce output size by ~66% (84MB vs 244MB) and improve read performance by ~4.4x
+compared to full stack output.
+
+The NUMA node filter is useful for NUMA-aware memory allocation analysis and debugging.
+
+Behind the scenes, page_owner_filter opens /sys/kernel/debug/page_owner and
+writes filter commands before reading the filtered output. The filtering uses
+per-file-descriptor state, allowing each open() to have independent filter settings.
+
+Each file descriptor maintains its own filter state, so you can have multiple
+independent filtering operations running concurrently. For example, in different
+terminals you can run different filters simultaneously::
+
+ # Terminal 1: Filter node 0
+ ./page_owner_filter -n 0 > node0_output.txt
+
+ # Terminal 2: Filter node 1 (runs concurrently)
+ ./page_owner_filter -n 1 > node1_output.txt
--
2.20.1
^ permalink raw reply related
* [PATCH v11 3/4] tools/mm: add page_owner_filter userspace tool
From: Zhen Ni @ 2026-06-25 4:31 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
In-Reply-To: <20260625043101.338794-1-zhen.ni@easystack.cn>
Add a userspace filtering tool for page_owner that supports per-fd
filtering with print_mode and NUMA node filters.
Features:
- Three print modes: stack (default), handle, stack_handle
- NUMA node filtering with flexible formats (single: 0, multiple: 0,1,2,
range: 0-3, mixed: 0,2-3)
- Per-file-descriptor filter state for independent filtering
Usage examples:
# Filter by print mode
./page_owner_filter -m handle
./page_owner_filter -m stack_handle
# Filter by NUMA node
./page_owner_filter -n 0
./page_owner_filter -n 0-3
# Combined filters
./page_owner_filter -m stack -n 0,1,2
./page_owner_filter -m handle -n 0,2-3
The tool validates inputs before sending commands to the kernel and
provides clear error messages when the kernel does not support
per-fd filtering.
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Changes in v11:
- Add signal(SIGPIPE, SIG_IGN) to ignore SIGPIPE and handle EPIPE gracefully
- Treat EPIPE from fwrite() and fflush() as success (broken pipe case)
Changes in v10:
- Improve error handling: check fwrite() and fflush() return values
- Handle EPIPE correctly: treat broken pipe as success
Changes in v9:
- Fix isdigit() usage: cast to unsigned char to avoid undefined behavior with non-ASCII input
- Optimize I/O performance: replace fprintf() + fflush() in loop with fwrite() + single fflush() after loop
Changes in v8:
- Add validation to reject multiple dashes in nid list (e.g., "1-2-3")
- Fix snprintf return value handling to prevent command overflow
Changes in v7:
- New patch for userspace tool
v10: https://lore.kernel.org/linux-mm/20260618035750.3724613-4-zhen.ni@easystack.cn/
v9: https://lore.kernel.org/linux-mm/20260525081652.2210206-4-zhen.ni@easystack.cn/
v8: https://lore.kernel.org/linux-mm/20260520075641.1931080-4-zhen.ni@easystack.cn/
v7: https://lore.kernel.org/linux-mm/20260515091942.1535677-4-zhen.ni@easystack.cn/
---
tools/mm/Makefile | 4 +-
tools/mm/page_owner_filter.c | 310 +++++++++++++++++++++++++++++++++++
2 files changed, 312 insertions(+), 2 deletions(-)
create mode 100644 tools/mm/page_owner_filter.c
diff --git a/tools/mm/Makefile b/tools/mm/Makefile
index f5725b5c23aa..858186a6eefd 100644
--- a/tools/mm/Makefile
+++ b/tools/mm/Makefile
@@ -3,7 +3,7 @@
#
include ../scripts/Makefile.include
-BUILD_TARGETS=page-types slabinfo page_owner_sort thp_swap_allocator_test
+BUILD_TARGETS=page-types slabinfo page_owner_sort page_owner_filter thp_swap_allocator_test
INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps
LIB_DIR = ../lib/api
@@ -23,7 +23,7 @@ $(LIBS):
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
clean:
- $(RM) page-types slabinfo page_owner_sort thp_swap_allocator_test
+ $(RM) page-types slabinfo page_owner_sort page_owner_filter thp_swap_allocator_test
make -C $(LIB_DIR) clean
sbindir ?= /usr/sbin
diff --git a/tools/mm/page_owner_filter.c b/tools/mm/page_owner_filter.c
new file mode 100644
index 000000000000..1d1f0a38678a
--- /dev/null
+++ b/tools/mm/page_owner_filter.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * User-space helper to filter page_owner output per-fd
+ *
+ * Example use:
+ * ./page_owner_filter -m handle
+ * ./page_owner_filter -m stack_handle
+ * ./page_owner_filter -n 0,1,2
+ *
+ * See Documentation/mm/page_owner.rst
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <ctype.h>
+#include <getopt.h>
+#include <signal.h>
+
+#define MAX_CMD_LEN 512
+
+static void usage(const char *prog)
+{
+ fprintf(stderr, "Usage: %s [OPTIONS]\n", prog);
+ fprintf(stderr, "\nOptions:\n");
+ fprintf(stderr, " -m, --mode MODE : print_mode (stack, handle, or stack_handle)\n");
+ fprintf(stderr, " -n, --nid NID_LIST : NUMA node IDs (comma-separated or ranges)\n");
+ fprintf(stderr, " -o, --output FILE : output file (default: stdout)\n");
+ fprintf(stderr, " -h, --help : show this help message\n");
+ fprintf(stderr, "\nExamples:\n");
+ fprintf(stderr, " %s -m stack\n", prog);
+ fprintf(stderr, " %s -m handle\n", prog);
+ fprintf(stderr, " %s -m stack_handle\n", prog);
+ fprintf(stderr, " %s -m stack -o output.txt\n", prog);
+ fprintf(stderr, " %s -n 0,1,2\n", prog);
+ fprintf(stderr, " %s -m stack -n 0\n", prog);
+}
+
+static int validate_mode(const char *mode)
+{
+ if (strcmp(mode, "stack") == 0 ||
+ strcmp(mode, "handle") == 0 ||
+ strcmp(mode, "stack_handle") == 0)
+ return 0;
+
+ fprintf(stderr, "Error: Invalid mode '%s'\n", mode);
+ fprintf(stderr, "Valid modes: stack, handle, stack_handle\n");
+ return -1;
+}
+
+static int validate_nid_list(const char *nid_list)
+{
+ const char *p;
+ int i = 0;
+ int has_digit = 0;
+ int in_range = 0;
+ int prev_num = 0;
+ int curr_num = 0;
+
+ if (!nid_list || strlen(nid_list) == 0)
+ return 0;
+
+ for (p = nid_list; *p; p++) {
+ if (*p == ',') {
+ if (!has_digit) {
+ fprintf(stderr, "Error: Invalid nid_list format\n");
+ return -1;
+ }
+ if (in_range && prev_num > curr_num) {
+ fprintf(stderr,
+ "Error: Invalid range %d-%d (start must be <= end)\n",
+ prev_num, curr_num);
+ return -1;
+ }
+ i = 0;
+ has_digit = 0;
+ in_range = 0;
+ prev_num = 0;
+ curr_num = 0;
+ continue;
+ }
+
+ if (*p == '-') {
+ if (!has_digit) {
+ fprintf(stderr,
+ "Error: Invalid nid_list format ");
+ fprintf(stderr,
+ "(dash without preceding number)\n");
+ return -1;
+ }
+ if (in_range) {
+ fprintf(stderr, "Error: Multiple dashes in nid_list\n");
+ return -1;
+ }
+ prev_num = curr_num;
+ curr_num = 0;
+ i = 0;
+ has_digit = 0;
+ in_range = 1;
+ continue;
+ }
+
+ if (!isdigit((unsigned char)*p)) {
+ fprintf(stderr, "Error: Invalid character '%c' in nid_list\n", *p);
+ return -1;
+ }
+
+ if (i > 5) {
+ fprintf(stderr, "Error: NID too long (max 65536)\n");
+ return -1;
+ }
+ curr_num = curr_num * 10 + (*p - '0');
+ i++;
+ has_digit = 1;
+ }
+
+ if (!has_digit) {
+ fprintf(stderr, "Error: Invalid nid_list format\n");
+ return -1;
+ }
+
+ if (in_range && prev_num > curr_num) {
+ fprintf(stderr,
+ "Error: Invalid range %d-%d (start must be <= end)\n",
+ prev_num, curr_num);
+ return -1;
+ }
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ const char *output_file = NULL;
+ char filter_cmd[MAX_CMD_LEN];
+ FILE *output = NULL;
+ int fd = -1;
+ ssize_t ret;
+ char buf[4096];
+ int opt;
+ size_t cmd_len = 0;
+
+ signal(SIGPIPE, SIG_IGN);
+
+ static struct option long_options[] = {
+ {"mode", required_argument, 0, 'm'},
+ {"nid", required_argument, 0, 'n'},
+ {"output", required_argument, 0, 'o'},
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}
+ };
+
+ filter_cmd[0] = '\0';
+
+ if (argc > 1) {
+ for (int i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+ usage(argv[0]);
+ return 0;
+ }
+ }
+ }
+
+ /* Check if page_owner exists and is readable */
+ if (access("/sys/kernel/debug/page_owner", F_OK) != 0) {
+ if (errno == ENOENT)
+ fprintf(stderr, "Error: /sys/kernel/debug/page_owner does not exist\n");
+ else
+ perror("Error accessing /sys/kernel/debug/page_owner");
+ fprintf(stderr, "Make sure page_owner is enabled in kernel\n");
+ return 1;
+ }
+
+ while ((opt = getopt_long(argc, argv, "m:n:o:h", long_options, NULL)) != -1) {
+ int len;
+
+ switch (opt) {
+ case 'm': {
+ const char *mode = optarg;
+
+ if (validate_mode(mode) < 0)
+ return 1;
+ len = snprintf(filter_cmd + cmd_len, MAX_CMD_LEN - cmd_len,
+ "%smode=%s", cmd_len > 0 ? " " : "", mode);
+ if (len < 0 || cmd_len + len >= MAX_CMD_LEN) {
+ fprintf(stderr, "Error: Command too long\n");
+ return 1;
+ }
+ cmd_len += len;
+ break;
+ }
+ case 'n': {
+ const char *nid_list = optarg;
+
+ if (validate_nid_list(nid_list) < 0)
+ return 1;
+ len = snprintf(filter_cmd + cmd_len, MAX_CMD_LEN - cmd_len,
+ "%snid=%s", cmd_len > 0 ? " " : "", nid_list);
+ if (len < 0 || cmd_len + len >= MAX_CMD_LEN) {
+ fprintf(stderr, "Error: Command too long\n");
+ return 1;
+ }
+ cmd_len += len;
+ break;
+ }
+ case 'o':
+ output_file = optarg;
+ break;
+ case 'h':
+ /* Already handled above */
+ break;
+ default:
+ usage(argv[0]);
+ return 1;
+ }
+ }
+
+ /* At least one filter must be specified */
+ if (cmd_len == 0) {
+ fprintf(stderr, "Error: At least one filter (-m or -n) must be specified\n\n");
+ usage(argv[0]);
+ return 1;
+ }
+
+ /* Open page_owner for read-write - this will fail if kernel doesn't support write */
+ fd = open("/sys/kernel/debug/page_owner", O_RDWR);
+ if (fd < 0) {
+ if (errno == EACCES || errno == EPERM) {
+ fprintf(stderr, "Error: /sys/kernel/debug/page_owner ");
+ fprintf(stderr, "does not support write access\n");
+ fprintf(stderr, "This kernel does not support ");
+ fprintf(stderr, "per-fd filtering.\n");
+ fprintf(stderr, "Please ensure you have a kernel with ");
+ fprintf(stderr, "per-fd filtering support.\n");
+ } else {
+ perror("Error opening /sys/kernel/debug/page_owner");
+ }
+ return 1;
+ }
+
+ if (output_file) {
+ output = fopen(output_file, "w");
+ if (!output) {
+ perror("open output file");
+ close(fd);
+ return 1;
+ }
+ } else {
+ output = stdout;
+ }
+
+ ret = write(fd, filter_cmd, strlen(filter_cmd));
+
+ if (ret < 0) {
+ if (errno == EINVAL) {
+ fprintf(stderr, "Error: Kernel rejected the filter command.\n");
+ fprintf(stderr, "Possible causes:\n");
+ fprintf(stderr, " - Kernel does not support per-fd filtering\n");
+ fprintf(stderr, " - NUMA node has no memory\n");
+ fprintf(stderr, " - Unknown reason\n");
+ } else {
+ perror("write filter command");
+ }
+ goto out;
+ }
+
+ if ((size_t)ret != strlen(filter_cmd))
+ fprintf(stderr, "Warning: Partial write (%zd/%zu)\n", ret, strlen(filter_cmd));
+
+ /* Read and display filtered output */
+ ret = 0;
+ while ((ret = read(fd, buf, sizeof(buf))) > 0) {
+ size_t written = fwrite(buf, 1, ret, output);
+
+ if (written != (size_t)ret) {
+ if (errno == EPIPE) {
+ /* Pipe closed, treat as success */
+ ret = 0;
+ goto out;
+ }
+ perror("write output");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ if (ret < 0) {
+ perror("read page_owner");
+ goto out;
+ }
+
+ if (fflush(output)) {
+ if (errno == EPIPE) {
+ /* Pipe closed, treat as success */
+ ret = 0;
+ } else {
+ perror("flush output");
+ ret = -1;
+ }
+ }
+
+out:
+ close(fd);
+ if (output != stdout)
+ fclose(output);
+ return ret < 0 ? 1 : 0;
+}
--
2.20.1
^ permalink raw reply related
* [PATCH v11 2/4] mm/page_owner: add NUMA node filter
From: Zhen Ni @ 2026-06-25 4:30 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
In-Reply-To: <20260625043101.338794-1-zhen.ni@easystack.cn>
Add NUMA node filtering functionality to page_owner to allow filtering
pages by specific NUMA node(s). This is useful for NUMA-aware memory
allocation analysis and debugging.
The filter supports flexible input formats:
- Single node: nid=0
- Multiple nodes: nid=0,2,3
- Node range: nid=0-3
- Mixed format: nid=0,2-4,7
Example usage:
# Using the page_owner_filter tool (recommended)
./page_owner_filter -n 0-3
./page_owner_filter -m stack_handle -n 0,2-4,7
The implementation uses per-file-descriptor filter state stored in
file->private_data, allowing each opener to have independent filter
configuration. It uses nodemask_t for efficient multi-node filtering and
nodelist_parse() for flexible input parsing. Node validity is verified
using nodes_subset() to reject nodes without memory.
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Changes in v11:
- Remove 'nid' member from struct page_owner to save memory
- Read page->flags directly with poison checking
Changes in v10:
- Add 'nid' member to struct page_owner and record it at allocation time
- Remove cond_resched() in page iteration loop (unconditional call)
- Update NUMA filter to use saved nid instead of page_to_nid()
Changes in v9:
- Add spinlock protection for NUMA filter state access
- Use memdesc_nid() instead of page_to_nid() to bypass PF_POISONED_CHECK()
Changes in v8:
- Add cond_resched() in page iteration loop to prevent RCU stalls
- Reject empty nid list to avoid enabling an empty filter
- Improve comment: "Commit all filter changes"
Changes in v7:
- per-file-descriptor implementation
Changes in v6:
- Add node validity check using nodes_subset
to reject invalid node numbers that don't exist in the system
- Move bool filter_by_nid declaration to top of block
- Use kmalloc_objs instead of kmalloc
- Remove 100 bytes overhead
Changes in v5:
- Optimize nodes_empty() check in page iteration loop
- Add __data_racy qualifier to nid_mask field
Changes in v4:
- Remove "-1" support, use empty string to clear filter
- Use strncpy_from_user() instead of copy_from_user()
- Add concurrency safety documentation for nid_mask access
- Rename fops to page_owner_nid_filter_fops for consistency
Changes in v3:
- Remove READ_ONCE/WRITE_ONCE for nodemask_t (fixes compilation errors)
* nodemask_t is a large structure (128 bytes) that triggers compile-time asserts
* Direct assignment is safe for this use case
- Add comment explaining input length calculation formula
* 6 bytes = ",NNNNN" (comma + 5-digit node number)
- Simplify "-1" check using kstrtoint() instead of dual strcmp()
- Move nodemask_t mask read outside PFN iteration loop for performance
* Avoids 128-byte structure copy on each iteration
Changes in v2:
- Use nodemask_t instead of int to support multiple nodes
- Implement nodelist_parse() to support flexible input formats
* Single node: "0", "2"
* Multiple nodes: "0,2,3"
* Ranges: "0-3"
* Mixed: "0,2-4,7"
- Use %*pbl format for output (e.g., "0-2", "0,2-4,7")
- Use dynamic memory allocation (kmalloc) to handle variable-length input
- Follow cpuset's max_write_len pattern: (100 + 6 * MAX_NUMNODES)
v10: https://lore.kernel.org/linux-mm/20260618035750.3724613-3-zhen.ni@easystack.cn/
v9: https://lore.kernel.org/linux-mm/20260525081652.2210206-3-zhen.ni@easystack.cn/
v8: https://lore.kernel.org/linux-mm/20260520075641.1931080-3-zhen.ni@easystack.cn/
v7: https://lore.kernel.org/linux-mm/20260515091942.1535677-3-zhen.ni@easystack.cn/
v6: https://lore.kernel.org/linux-mm/20260511033017.747781-3-zhen.ni@easystack.cn/
v5: https://lore.kernel.org/linux-mm/20260507064643.179187-3-zhen.ni@easystack.cn/
v4: https://lore.kernel.org/linux-mm/20260430163247.13628-3-zhen.ni@easystack.cn/
v3: https://lore.kernel.org/linux-mm/20260428071112.1420380-4-zhen.ni@easystack.cn/
v2: https://lore.kernel.org/linux-mm/20260419155540.376847-4-zhen.ni@easystack.cn/
v1: https://lore.kernel.org/linux-mm/20260417154638.22370-4-zhen.ni@easystack.cn/
---
mm/page_owner.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 59 insertions(+), 2 deletions(-)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 7595735979bf..cae5abf0ac9a 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -68,6 +68,8 @@ static const char * const page_owner_print_mode_strings[] = {
struct page_owner_filter_state {
enum page_owner_print_mode print_mode;
+ nodemask_t nid_filter;
+ bool nid_filter_enabled;
spinlock_t lock;
};
@@ -698,6 +700,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct page_owner *page_owner;
depot_stack_handle_t handle;
struct page_owner_filter_state *state = file->private_data;
+ unsigned long flags;
if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
@@ -774,6 +777,27 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
if (!handle)
goto ext_put_continue;
+ spin_lock_irqsave(&state->lock, flags);
+ if (state->nid_filter_enabled) {
+ int nid;
+ memdesc_flags_t page_flags = READ_ONCE(page->flags);
+
+ /*
+ * Bypass PF_POISONED_CHECK() in page_to_nid() to avoid
+ * VM_BUG_ON when accessing poisoned pages.
+ */
+ if (page_flags.f == PAGE_POISON_PATTERN) {
+ spin_unlock_irqrestore(&state->lock, flags);
+ goto ext_put_continue;
+ }
+ nid = memdesc_nid(page_flags);
+ if (!node_isset(nid, state->nid_filter)) {
+ spin_unlock_irqrestore(&state->lock, flags);
+ goto ext_put_continue;
+ }
+ }
+ spin_unlock_irqrestore(&state->lock, flags);
+
/* Record the next PFN to read in the file offset */
*ppos = pfn + 1;
@@ -783,6 +807,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
&page_owner_tmp, handle, state);
ext_put_continue:
page_ext_put(page_ext);
+ cond_resched();
}
return 0;
@@ -891,6 +916,8 @@ static int page_owner_open(struct inode *inode, struct file *file)
spin_lock_init(&state->lock);
state->print_mode = PAGE_OWNER_PRINT_STACK;
+ nodes_clear(state->nid_filter);
+ state->nid_filter_enabled = false;
file->private_data = state;
return 0;
}
@@ -912,13 +939,18 @@ static ssize_t page_owner_write(struct file *file,
size_t max_input_len;
struct page_owner_filter_state *state = file->private_data;
enum page_owner_print_mode new_print_mode;
+ nodemask_t new_nid_filter;
+ bool new_nid_filter_enabled;
unsigned long flags;
/*
* Maximum input length for filter commands:
- * 32: print_mode command max length is 17 ("mode=stack_handle").
+ * - 32: print_mode command max length is 17 ("mode=stack_handle")
+ * with sufficient buffer
+ * - 6 * MAX_NUMNODES: worst case for nid list
+ * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes
*/
- max_input_len = 32;
+ max_input_len = 32 + 6 * MAX_NUMNODES;
if (count > max_input_len)
return -EINVAL;
@@ -931,6 +963,8 @@ static ssize_t page_owner_write(struct file *file,
spin_lock_irqsave(&state->lock, flags);
new_print_mode = state->print_mode;
+ new_nid_filter = state->nid_filter;
+ new_nid_filter_enabled = state->nid_filter_enabled;
spin_unlock_irqrestore(&state->lock, flags);
while ((token = strsep(&kbuf, " \t\n")) != NULL) {
@@ -943,14 +977,37 @@ static ssize_t page_owner_write(struct file *file,
if (ret < 0)
goto out_free;
new_print_mode = ret;
+ } else if (!strncmp(token, "nid=", 4)) {
+ ret = nodelist_parse(token + 4, new_nid_filter);
+ if (ret < 0)
+ goto out_free;
+
+ if (nodes_empty(new_nid_filter)) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ /*
+ * We want to filter memory allocations by numa nodes, so make sure
+ * that the specified nodes have memory.
+ */
+ if (!nodes_subset(new_nid_filter, node_states[N_MEMORY])) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ new_nid_filter_enabled = true;
} else {
ret = -EINVAL;
goto out_free;
}
}
+ /* Commit all filter changes */
spin_lock_irqsave(&state->lock, flags);
state->print_mode = new_print_mode;
+ state->nid_filter = new_nid_filter;
+ state->nid_filter_enabled = new_nid_filter_enabled;
spin_unlock_irqrestore(&state->lock, flags);
ret = count;
--
2.20.1
^ permalink raw reply related
* [PATCH v11 0/4] mm/page_owner: add per-fd filter infrastructure for print_mode and NUMA filtering
From: Zhen Ni @ 2026-06-25 4:30 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
This patch series introduces per-file-descriptor filtering capabilities to the
page_owner feature.
Changes in v11:
- Remove 'nid' member from struct page_owner to save memory
- Read page->flags directly with poison checking
- Add signal(SIGPIPE, SIG_IGN) to ignore SIGPIPE and handle EPIPE gracefully
- Treat EPIPE from fwrite() and fflush() as success (broken pipe case)
Changes in v10:
- Add 'nid' member to struct page_owner and record it at allocation time
- Remove cond_resched() in page iteration loop (unconditional call)
- Improve error handling: check fwrite() and fflush() return values
- Handle EPIPE correctly: treat broken pipe as success
Changes in v9:
- Add spinlock_t lock to struct page_owner_filter_state
- Use memdesc_nid() instead of page_to_nid()
- Fix isdigit() usage
- Optimize I/O performance
Changes in v8:
- Fix buffer overflow, strsep() memory corruption, and unsafe string handling issues
- Add cond_resched() to prevent RCU stalls in page iteration loop
- Improve validation and error handling (e.g., "1-2-3") in userspace tool
- Fix documentation warnings and improve code comments
v8 additional testing with invalid inputs:
./page_owner_filter -n 1-2-3
Error: Multiple dashes in nid_list
./page_owner_filter -n 0,1-2-3
Error: Multiple dashes in nid_list
./page_owner_filter -n 1-2-3,2-3
Error: Multiple dashes in nid_list
Changes in v7:
- print_mode and NUMA node filter implementation (patches 1-2)
- Add page_owner_filter userspace tool (patch 3)
- Update documentation for per-fd interface (patch 4)
Changes in v6:
- Address SeongJae Park's review comments for patch 1/3:
* Remove unnecessary braces in if/else statement
* Use stack array instead of kmalloc for input buffer
- Address SeongJae Park's review comments for patch 2/3:
* Add node validity check using nodes_subset() to reject non-existent nodes
* Separate variable declaration and statement
* Use kmalloc_objs() for consistency with kernel patterns
* Remove 100 bytes overhead
- Add lore links to all previous versions
Changes in v5:
- Optimize nodes_empty() check in page iteration loop
- Add __data_racy qualifier to nid_mask field
Changes in v4:
- Change print_mode from numeric (0/1) to string-based interface
* Use "full_stack"/"stack_handle" strings instead of numbers
* Display current mode with bracket notation: "[full_stack] stack_handle"
- Remove "-1" support from NUMA filter
* Use empty string to clear filter (echo > nid)
- Use strncpy_from_user() instead of copy_from_user()
- Rename nid_filter_fops to page_owner_nid_filter_fops for consistency
- Merge patch 1 (infrastructure) and patch 2 (print_mode) from v3
- Update documentation to match new interface
* String-based examples
* Tab indentation in code blocks
Changes in v3:
- Remove READ_ONCE/WRITE_ONCE for nodemask_t (fixes compilation errors)
* nodemask_t is a large structure (128 bytes) that triggers compile-time asserts
* Direct assignment is safe for this use case
- Add comment explaining input length calculation formula
* 6 bytes = ",NNNNN" (comma + 5-digit node number)
- Simplify "-1" check using kstrtoint() instead of dual strcmp()
- Move nodemask_t mask read outside PFN iteration loop for performance
* Avoids 128-byte structure copy on each iteration
- Add documentation for filter features (patch 3/3)
Changes in v2:
- Renamed 'compact' to 'print_mode' with enum type for better clarity
* PAGE_OWNER_PRINT_FULL_STACK (0): print full stack traces
* PAGE_OWNER_PRINT_STACK_HANDLE (1): print only stack handles
- Changed NUMA filter from single node to nodelist with bitmask support
* Uses nodelist_parse() to support "0", "0,2", "0-3", "0,2-4,7" formats
* Uses nodemask_t internally for efficient multi-node filtering
* Output uses %*pbl format (e.g., "0-2", "0,2-4,7")
- Improved memory handling in nid_filter_write using dynamic allocation
* Limit: (100 + 6 * MAX_NUMNODES) to handle worst-case input
Problem Statement
=================
In production environments with large memory configurations (e.g., 250GB+),
collecting page_owner information often results in files ranging from
several gigabytes to over 10GB. This creates significant challenges:
1. Storage pressure on production systems
2. Difficulty transferring large files from production environments
3. Post-processing overhead with tools/mm/page_owner_sort.c
The primary contributor to file size is redundant stack trace
information. While the kernel already deduplicates stacks via
stackdepot, page_owner retrieves and stores full stack traces for
each page, only to deduplicate them again during post-processing.
Additionally, in NUMA-aware environments (e.g., DPDK-based cloud
deployments where QEMU processes are bound to specific NUMA nodes),
OOM events are often node-specific rather than system-wide.
Previously, page_owner could not filter by NUMA node, forcing users to
collect and analyze data for all nodes.
Solution
========
This patch series introduces a per-file-descriptor filter infrastructure
with two initial filters:
1. **Print Mode Filter**: Selects what is printed for each page. In
handle mode, only stack handles are output instead of full stack
traces; the handle-to-stack mapping can be retrieved from the
existing show_stacks_handles interface. This dramatically
reduces output size while preserving all allocation metadata.
2. **NUMA Node Filter**: Allows filtering pages by specific NUMA node(s)
using flexible nodelist format, enabling targeted analysis of memory
issues in NUMA-aware deployments.
The per-fd design allows multiple concurrent page_owner reads with
different filters, solving coordination issues in multi-user production
environments.
Implementation
==============
The series is structured as follows:
- Patch 1: Implement print_mode filter infrastructure
* Add file->private_data to store per-fd filter state
* Add .open, .release, and .write file operations
* Support "stack", "handle", and "stack_handle" modes via "mode=" write commands
- Patch 2: Implement NUMA node filter infrastructure
* Add nid_filter field to per-fd state
* Support flexible nodelist format via "nid=" write commands (single, multiple, ranges)
* Validate nodes and reject non-existent nodes using nodes_subset()
- Patch 3: Add page_owner_filter userspace tool
* Manages per-fd filters via write() interface
* Provides user-friendly command-line interface
* Includes comprehensive input validation
- Patch 4: Document filter features and usage
Usage Example
=============
Using the page_owner_filter tool with per-fd filters:
# ./page_owner_filter -m stack_handle -n "0,2-3" -o page_owner.txt
The tool opens /sys/kernel/debug/page_owner, sets filters via write(),
then reads the filtered output to the specified file (or stdout).
Sample output in handle mode (-m handle), showing handles only:
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper),
ts 0 ns PFN 0x40000 type Unmovable Block 512 type Unmovable
Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x252000(__GFP_NOWARN|
__GFP_NORETRY|__GFP_COMP|__GFP_THISNODE), pid 0, tgid 0 (swapper),
ts 0 ns PFN 0x40002 type Unmovable Block 512 type Unmovable
Flags 0x23fffe0000000200(workingset|node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Testing
=======
Tested on a 4-node NUMA system. Verified that:
1. **Kernel without page_owner enabled**:
Tool properly detects and reports missing page_owner support:
```
$ ./page_owner_filter -m stack
Error: /sys/kernel/debug/page_owner does not exist
Make sure page_owner is enabled in kernel
```
2. **Kernel without per-fd filter support**:
Tool properly detects and reports missing filter support:
```
$ ./page_owner_filter -m stack
Error: Kernel rejected the filter command.
Possible causes:
- Kernel does not support per-fd filtering
- NUMA node has no memory
- Unknown reason
```
3. **Comprehensive userspace tool testing**:
Tested 26 test cases covering:
- Help messages (-h, --help)
- Invalid inputs (mode, nid format, range validation)
- Valid modes (stack, handle, stack_handle)
- Valid nid filters (single node, multiple nodes, ranges)
- Combined mode and nid filters
- Node validity verification (grep-based verification)
- Error handling for out-of-range nodes
Test script (test_page_owner_filter.sh):
```bash
#!/bin/bash
# Test script for page_owner_filter tool
cd "$(dirname "$0")"
echo "========================================="
echo "page_owner_filter Test Suite"
echo "========================================="
echo
echo "Test 1: -h"
echo "./page_owner_filter -h"
./page_owner_filter -h
echo
echo "Test 2: --help"
echo "./page_owner_filter --help"
./page_owner_filter --help
echo
echo "Test 3: Invalid mode"
echo ./page_owner_filter -m invalid
./page_owner_filter -m invalid
echo
echo "Test 4: Invalid nid with letters"
echo ./page_owner_filter -n 0,a,2
./page_owner_filter -n 0,a,2
echo
echo "Test 5: Invalid nid with double comma"
echo ./page_owner_filter -n 0,,2
./page_owner_filter -n 0,,2
echo
echo "Test 6: Invalid nid starting with comma"
echo ./page_owner_filter -n ,0,1
./page_owner_filter -n ,0,1
echo
echo "Test 7: Invalid nid ending with comma"
echo ./page_owner_filter -n "0,1,"
./page_owner_filter -n "0,1,"
echo
echo "Test 8: No filters specified"
echo ./page_owner_filter
./page_owner_filter
echo
echo "Test 9: Invalid nid - node 4 (out of range)"
echo ./page_owner_filter -n 4
./page_owner_filter -n 4
echo
echo "Test 10: Invalid nid - large number"
echo './page_owner_filter -n 65535'
./page_owner_filter -n 65535
echo
echo "Test 11: Invalid mode AND invalid nid"
echo ./page_owner_filter -m wrong -n abc
./page_owner_filter -m wrong -n abc
echo
echo "Test 12: Two invalid modes (try both)"
echo ./page_owner_filter -m wrong1 -m wrong2
./page_owner_filter -m wrong1 -m wrong2
echo
echo "Test 13: Valid mode - stack"
echo './page_owner_filter -m stack | head -20'
./page_owner_filter -m stack | head -20
echo
echo "Test 14: Valid mode - handle"
echo './page_owner_filter -m handle | head -20'
./page_owner_filter -m handle | head -20
echo
echo "Test 15: Valid mode - stack_handle"
echo './page_owner_filter -m stack_handle | head -20'
./page_owner_filter -m stack_handle | head -20
echo
echo "Test 16: All modes"
echo './page_owner_filter -m stack -m handle -m stack_handle | head -20'
./page_owner_filter -m stack -m handle -m stack_handle | head -20
echo
echo "Test 17: Valid nid - single"
echo './page_owner_filter -n 0 | head -20'
./page_owner_filter -n 0 | head -20
echo 'Verify: should have node=0, should NOT have node=1,2,3'
echo './page_owner_filter -n 0 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c'
./page_owner_filter -n 0 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
echo
echo "Test 18: Valid nid - multiple"
echo 'Verify: should have node=0,1,3, should NOT have node=2'
echo './page_owner_filter -n 0,1,3 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c'
./page_owner_filter -n 0,1,3 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
echo
echo "Test 19: Valid nid - range"
echo 'Verify: should have node=2,3, should NOT have node=0,1'
echo './page_owner_filter -n 2-3 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c'
./page_owner_filter -n 2-3 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
echo
echo "Test 20: Valid nid - range"
echo 'Verify: should have node=0,1,2,3'
echo './page_owner_filter -n 2-3,0-1 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c'
./page_owner_filter -n 2-3,0-1 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
echo
echo "Test 21: Valid nid - range"
echo 'Verify: should have node=2, should NOT have node=0,1,3'
echo './page_owner_filter -n 2-2 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c'
./page_owner_filter -n 2-2 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
echo
echo "Test 22: Invalid nid - range start must be <= end"
echo './page_owner_filter -n 3-0'
./page_owner_filter -n 3-0
echo
echo './page_owner_filter -n 1-0,0-1'
./page_owner_filter -n 1-0,0-1
echo
echo './page_owner_filter -n 2-3,1-0,0-1'
./page_owner_filter -n 2-3,1-0,0-1
echo
echo './page_owner_filter -n 3,1-0,1'
./page_owner_filter -n 3,1-0,1
echo
echo "Test 23: Invalid nid - NUMA node 4 and above have no memory"
echo './page_owner_filter -n 0-4'
./page_owner_filter -n 0-4
echo
echo './page_owner_filter -n 1,0-4'
./page_owner_filter -n 1,0-4
echo
echo './page_owner_filter -n 7-8'
./page_owner_filter -n 7-8
echo
echo './page_owner_filter -n 8-1'
./page_owner_filter -n 8-1
echo
echo "Test 24: Valid nid - range and comma mixed"
echo 'Verify: should have node=0,2,3, should NOT have node=1'
echo './page_owner_filter -n 2-3,0| grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c'
./page_owner_filter -n 2-3,0 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
echo
echo "Test 25: Valid nid - range and comma mixed"
echo 'Verify: should have node=1,2,3, should NOT have node=0'
echo './page_owner_filter -n 1,2-3| grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c'
./page_owner_filter -n 1,2-3 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
echo
echo "Test 26: Valid handle mode + nid filter"
echo './page_owner_filter -m handle -n "0,1" | head -20'
./page_owner_filter -m handle -n "0,1" | head -20
echo 'Verify: should show stacks, and only node=0,1 (not 2,3)'
echo './page_owner_filter -m handle -n "0,1" | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c'
./page_owner_filter -m handle -n "0,1" | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
echo
echo "========================================="
echo "Tests completed. Please check output above."
echo "========================================="
```
Test output:
```
=========================================
page_owner_filter Test Suite
=========================================
Test 1: -h
./page_owner_filter -h
Usage: ./page_owner_filter [OPTIONS]
Options:
-m, --mode MODE : print_mode (stack, handle, or stack_handle)
-n, --nid NID_LIST : NUMA node IDs (comma-separated or ranges)
-o, --output FILE : output file (default: stdout)
-h, --help : show this help message
Examples:
./page_owner_filter -m stack
./page_owner_filter -m handle
./page_owner_filter -m stack_handle
./page_owner_filter -m stack -o output.txt
./page_owner_filter -n 0,1,2
./page_owner_filter -m stack -n 0
Test 2: --help
./page_owner_filter --help
Usage: ./page_owner_filter [OPTIONS]
Options:
-m, --mode MODE : print_mode (stack, handle, or stack_handle)
-n, --nid NID_LIST : NUMA node IDs (comma-separated or ranges)
-o, --output FILE : output file (default: stdout)
-h, --help : show this help message
Examples:
./page_owner_filter -m stack
./page_owner_filter -m handle
./page_owner_filter -m stack_handle
./page_owner_filter -m stack -o output.txt
./page_owner_filter -n 0,1,2
./page_owner_filter -m stack -n 0
Test 3: Invalid mode
./page_owner_filter -m invalid
Error: Invalid mode 'invalid'
Valid modes: stack, handle, stack_handle
Test 4: Invalid nid with letters
./page_owner_filter -n 0,a,2
Error: Invalid character 'a' in nid_list
Test 5: Invalid nid with double comma
./page_owner_filter -n 0,,2
Error: Invalid nid_list format
Test 6: Invalid nid starting with comma
./page_owner_filter -n ,0,1
Error: Invalid nid_list format
Test 7: Invalid nid ending with comma
./page_owner_filter -n 0,1,
Error: Invalid nid_list format
Test 8: No filters specified
./page_owner_filter
Error: At least one filter (-m or -n) must be specified
Usage: ./page_owner_filter [OPTIONS]
Options:
-m, --mode MODE : print_mode (stack, handle, or stack_handle)
-n, --nid NID_LIST : NUMA node IDs (comma-separated or ranges)
-o, --output FILE : output file (default: stdout)
-h, --help : show this help message
Examples:
./page_owner_filter -m stack
./page_owner_filter -m handle
./page_owner_filter -m stack_handle
./page_owner_filter -m stack -o output.txt
./page_owner_filter -n 0,1,2
./page_owner_filter -m stack -n 0
Test 9: Invalid nid - node 4 (out of range)
./page_owner_filter -n 4
Error: Kernel rejected the filter command.
Possible causes:
- Kernel does not support per-fd filtering
- NUMA node has no memory
- Unknown reason
Test 10: Invalid nid - large number
./page_owner_filter -n 65535
write filter command: Numerical result out of range
Test 11: Invalid mode AND invalid nid
./page_owner_filter -m wrong -n abc
Error: Invalid mode 'wrong'
Valid modes: stack, handle, stack_handle
Test 12: Two invalid modes (try both)
./page_owner_filter -m wrong1 -m wrong2
Error: Invalid mode 'wrong1'
Valid modes: stack, handle, stack_handle
Test 13: Valid mode - stack
./page_owner_filter -m stack | head -20
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40000 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40001 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40002 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
Test 14: Valid mode - handle
./page_owner_filter -m handle | head -20
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40000 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40001 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40002 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40003 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40004 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000040(head|node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Test 15: Valid mode - stack_handle
./page_owner_filter -m stack_handle | head -20
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40000 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40001 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40002 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
Test 16: All modes
./page_owner_filter -m stack -m handle -m stack_handle | head -20
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40000 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40001 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40002 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
Test 17: Valid nid - single
./page_owner_filter -n 0 | head -20
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40000 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40001 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40002 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
register_early_stack+0x2c/0x70
init_page_owner+0x2c/0x460
page_ext_init+0x204/0x298
mm_core_init+0xdc/0x14c
Verify: should have node=0, should NOT have node=1,2,3
./page_owner_filter -n 0 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
91327 node=0
Test 18: Valid nid - multiple
Verify: should have node=0,1,3, should NOT have node=2
./page_owner_filter -n 0,1,3 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
91299 node=0
43515 node=1
110404 node=3
Test 19: Valid nid - range
Verify: should have node=2,3, should NOT have node=0,1
./page_owner_filter -n 2-3 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
19391 node=2
110287 node=3
Test 20: Valid nid - range
Verify: should have node=0,1,2,3
./page_owner_filter -n 2-3,0-1 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
91562 node=0
43527 node=1
19495 node=2
110286 node=3
Test 21: Valid nid - range
Verify: should have node=2, should NOT have node=0,1,3
./page_owner_filter -n 2-2 | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
19505 node=2
Test 22: Invalid nid - range start must be <= end
./page_owner_filter -n 3-0
Error: Invalid range 3-0 (start must be <= end)
./page_owner_filter -n 1-0,0-1
Error: Invalid range 1-0 (start must be <= end)
./page_owner_filter -n 2-3,1-0,0-1
Error: Invalid range 1-0 (start must be <= end)
./page_owner_filter -n 3,1-0,1
Error: Invalid range 1-0 (start must be <= end)
Test 23: Invalid nid - NUMA node 4 and above have no memory
./page_owner_filter -n 0-4
Error: Kernel rejected the filter command.
Possible causes:
- Kernel does not support per-fd filtering
- NUMA node has no memory
- Unknown reason
./page_owner_filter -n 1,0-4
Error: Kernel rejected the filter command.
Possible causes:
- Kernel does not support per-fd filtering
- NUMA node has no memory
- Unknown reason
./page_owner_filter -n 7-8
Error: Kernel rejected the filter command.
Possible causes:
- Kernel does not support per-fd filtering
- NUMA node has no memory
- Unknown reason
./page_owner_filter -n 8-1
Error: Invalid range 8-1 (start must be <= end)
Test 24: Valid nid - range and comma mixed
Verify: should have node=0,2,3, should NOT have node=1
./page_owner_filter -n 2-3,0| grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
91741 node=0
19389 node=2
110286 node=3
Test 25: Valid nid - range and comma mixed
Verify: should have node=1,2,3, should NOT have node=0
./page_owner_filter -n 1,2-3| grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
43462 node=1
19402 node=2
110288 node=3
Test 26: Valid handle mode + nid filter
./page_owner_filter -m handle -n "0,1" | head -20
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40000 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40001 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40002 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40003 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000000(node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Page allocated via order 0, mask 0x0(), pid 0, tgid 0 (swapper), ts 0 ns
PFN 0x40004 type Unmovable Block 512 type Unmovable Flags 0x3fffe0000000040(head|node=0|zone=0|lastcpupid=0x1ffff)
handle: 1048577
Verify: should show stacks, and only node=0,1 (not 2,3)
./page_owner_filter -m handle -n "0,1" | grep "PFN" | grep -o "node=[0-9]" | sort | uniq -c
91677 node=0
43458 node=1
=========================================
Tests completed. Please check output above.
=========================================
```
Future Enhancements
===================
The per-fd filter infrastructure is designed to be extensible. Potential
future filters could include:
- PID/TGID filtering
- Time range filtering (allocation timestamp windows)
- GFP flag filtering
- Migration type filtering
v10: https://lore.kernel.org/linux-mm/20260618035750.3724613-1-zhen.ni@easystack.cn/
v9: https://lore.kernel.org/linux-mm/20260525081652.2210206-1-zhen.ni@easystack.cn/
v8: https://lore.kernel.org/linux-mm/20260520075641.1931080-1-zhen.ni@easystack.cn/
v7: https://lore.kernel.org/linux-mm/20260515091942.1535677-1-zhen.ni@easystack.cn/
v6: https://lore.kernel.org/linux-mm/20260511024748.183550-1-zhen.ni@easystack.cn/
v5: https://lore.kernel.org/linux-mm/20260507064643.179187-1-zhen.ni@easystack.cn/
v4: https://lore.kernel.org/linux-mm/20260430163247.13628-1-zhen.ni@easystack.cn/
v3: https://lore.kernel.org/linux-mm/20260428071112.1420380-1-zhen.ni@easystack.cn/
v2: https://lore.kernel.org/linux-mm/20260419155540.376847-1-zhen.ni@easystack.cn/
v1: https://lore.kernel.org/linux-mm/20260417154638.22370-1-zhen.ni@easystack.cn/
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Zhen Ni (4):
mm/page_owner: add print_mode filter
mm/page_owner: add NUMA node filter
tools/mm: add page_owner_filter userspace tool
mm/page_owner: document page_owner filter
Documentation/mm/page_owner.rst | 77 +++++++-
mm/page_owner.c | 186 ++++++++++++++++++-
tools/mm/Makefile | 4 +-
tools/mm/page_owner_filter.c | 310 ++++++++++++++++++++++++++++++++
4 files changed, 567 insertions(+), 10 deletions(-)
create mode 100644 tools/mm/page_owner_filter.c
--
2.20.1
^ permalink raw reply
* [PATCH v11 1/4] mm/page_owner: add print_mode filter
From: Zhen Ni @ 2026-06-25 4:30 UTC (permalink / raw)
To: akpm, vbabka
Cc: surenb, mhocko, jackmanb, hannes, ziy, linux-mm, linux-kernel,
Zhen Ni
In-Reply-To: <20260625043101.338794-1-zhen.ni@easystack.cn>
Add a print_mode filter to page_owner that allows users to choose between
printing stack traces, stack handles, or both, providing flexibility for
different debugging and analysis scenarios.
The filter provides three modes via page_owner:
- Writing "mode=stack" prints stack traces for each page (default)
- Writing "mode=handle" prints only the handle number
- Writing "mode=stack_handle" prints both stack traces and handles
The default stack mode maintains backward compatibility with existing
usage, displaying complete stack traces for each page allocation.
The handle mode dramatically reduces log size and improves performance by
showing only the handle number instead of the full stack trace. Testing
shows handle mode reduces output size by ~66% (84MB vs 244MB) and
improves read performance by ~4.4x compared to full stack output. The
mapping from handles to actual stack traces can be obtained via the
show_stacks_handles interface.
The stack_handle mode prints both stack traces and handles, making it
easier to identify pages with the same allocation pattern by comparing
handle numbers instead of comparing large stack traces.
Example usage:
# Using the page_owner_filter tool (recommended)
./page_owner_filter -m stack # Print only stack traces (default)
./page_owner_filter -m handle # Print only handles
./page_owner_filter -m stack_handle # Print both stack and handles
Sample output (handle mode):
Page allocated via order 0, migratetype Unmovable, gfp_mask 0x1100ca,
pid 1, tgid 1 (systemd), ts 123456789 ns
PFN 0x1000 type Unmovable Block 1 type Unmovable
Flags 0x3fffe800000084(referenced|lru|active|private|node=0|zone=1)
handle: 17432583
...
This implementation uses per-file-descriptor filter state stored in
file->private_data, allowing each opener to have independent filter
configuration.
Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
---
Changes in v11:
- No changes
Changes in v10:
- No changes
Changes in v9:
- Add spinlock_t lock to struct page_owner_filter_state for concurrent access protection
Changes in v8:
- Fix buffer overflow by adding bounds check between stack_depot_snprint() and scnprintf()
- Fix unsafe string handling: use memdup_user_nul() instead of kmalloc_objs + strncpy_from_user()
- Fix strsep() memory corruption by saving original pointer before strsep() call
- Change format specifier from %d to %u for depot_stack_handle_t
Changes in v7:
- per-file-descriptor implementation
Changes in v6:
- Remove unnecessary braces in if/else statement (coding style)
- Use stack array (char kbuf[33]) instead of kmalloc for input buffer
Changes in v5:
- No code changes
Changes in v4:
- Change from numeric (0/1) to string-based interface ("full_stack"/"stack_handle")
- Merge infrastructure patch into this patch
Changes in v3:
- No code changes
Changes in v2:
- Renamed from 'compact mode' to 'print_mode' for better clarity
- Use enum values (0=full_stack, 1=stack_handle) instead of boolean
- Update debugfs filename from 'compact' to 'print_mode'
v10: https://lore.kernel.org/linux-mm/20260618035750.3724613-2-zhen.ni@easystack.cn/
v9: https://lore.kernel.org/linux-mm/20260525081652.2210206-2-zhen.ni@easystack.cn/
v8: https://lore.kernel.org/linux-mm/20260520075641.1931080-2-zhen.ni@easystack.cn/
v7: https://lore.kernel.org/linux-mm/20260515091942.1535677-2-zhen.ni@easystack.cn/
v6: https://lore.kernel.org/linux-mm/20260511033017.747781-2-zhen.ni@easystack.cn/
v5: https://lore.kernel.org/linux-mm/20260507064643.179187-2-zhen.ni@easystack.cn/
v4: https://lore.kernel.org/linux-mm/20260430163247.13628-2-zhen.ni@easystack.cn/
v3: https://lore.kernel.org/linux-mm/20260428071112.1420380-2-zhen.ni@easystack.cn/
https://lore.kernel.org/linux-mm/20260428071112.1420380-3-zhen.ni@easystack.cn/
v2: https://lore.kernel.org/linux-mm/20260419155540.376847-2-zhen.ni@easystack.cn/
https://lore.kernel.org/linux-mm/20260419155540.376847-3-zhen.ni@easystack.cn/
v1: https://lore.kernel.org/linux-mm/20260417154638.22370-2-zhen.ni@easystack.cn/
https://lore.kernel.org/linux-mm/20260417154638.22370-3-zhen.ni@easystack.cn/
---
mm/page_owner.c | 129 +++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 123 insertions(+), 6 deletions(-)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 8178e0be557f..7595735979bf 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -54,6 +54,23 @@ struct stack_print_ctx {
u8 flags;
};
+enum page_owner_print_mode {
+ PAGE_OWNER_PRINT_STACK,
+ PAGE_OWNER_PRINT_HANDLE,
+ PAGE_OWNER_PRINT_STACK_HANDLE,
+};
+
+static const char * const page_owner_print_mode_strings[] = {
+ [PAGE_OWNER_PRINT_STACK] = "stack",
+ [PAGE_OWNER_PRINT_HANDLE] = "handle",
+ [PAGE_OWNER_PRINT_STACK_HANDLE] = "stack_handle",
+};
+
+struct page_owner_filter_state {
+ enum page_owner_print_mode print_mode;
+ spinlock_t lock;
+};
+
static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
@@ -547,16 +564,23 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_owner *page_owner,
- depot_stack_handle_t handle)
+ depot_stack_handle_t handle,
+ struct page_owner_filter_state *state)
{
int ret, pageblock_mt, page_mt;
char *kbuf;
+ enum page_owner_print_mode print_mode;
+ unsigned long flags;
count = min_t(size_t, count, PAGE_SIZE);
kbuf = kmalloc(count, GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
+ spin_lock_irqsave(&state->lock, flags);
+ print_mode = state->print_mode;
+ spin_unlock_irqrestore(&state->lock, flags);
+
ret = scnprintf(kbuf, count,
"Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
@@ -575,9 +599,18 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
migratetype_names[pageblock_mt],
&page->flags);
- ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
- if (ret >= count)
- goto err;
+ if (print_mode != PAGE_OWNER_PRINT_HANDLE) {
+ ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
+ if (ret >= count)
+ goto err;
+ }
+
+ if (print_mode != PAGE_OWNER_PRINT_STACK) {
+ ret += scnprintf(kbuf + ret, count - ret, "handle: %u\n",
+ handle);
+ if (ret >= count)
+ goto err;
+ }
if (page_owner->last_migrate_reason != -1) {
ret += scnprintf(kbuf + ret, count - ret,
@@ -664,6 +697,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct page_ext *page_ext;
struct page_owner *page_owner;
depot_stack_handle_t handle;
+ struct page_owner_filter_state *state = file->private_data;
if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
@@ -746,7 +780,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
page_owner_tmp = *page_owner;
page_ext_put(page_ext);
return print_page_owner(buf, count, pfn, page,
- &page_owner_tmp, handle);
+ &page_owner_tmp, handle, state);
ext_put_continue:
page_ext_put(page_ext);
}
@@ -847,7 +881,90 @@ static void init_early_allocated_pages(void)
init_pages_in_zone(zone);
}
+static int page_owner_open(struct inode *inode, struct file *file)
+{
+ struct page_owner_filter_state *state;
+
+ state = kzalloc_obj(*state);
+ if (!state)
+ return -ENOMEM;
+
+ spin_lock_init(&state->lock);
+ state->print_mode = PAGE_OWNER_PRINT_STACK;
+ file->private_data = state;
+ return 0;
+}
+
+static int page_owner_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t page_owner_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *kbuf;
+ char *orig;
+ char *token;
+ int ret;
+ size_t max_input_len;
+ struct page_owner_filter_state *state = file->private_data;
+ enum page_owner_print_mode new_print_mode;
+ unsigned long flags;
+
+ /*
+ * Maximum input length for filter commands:
+ * 32: print_mode command max length is 17 ("mode=stack_handle").
+ */
+ max_input_len = 32;
+
+ if (count > max_input_len)
+ return -EINVAL;
+
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ orig = kbuf;
+
+ spin_lock_irqsave(&state->lock, flags);
+ new_print_mode = state->print_mode;
+ spin_unlock_irqrestore(&state->lock, flags);
+
+ while ((token = strsep(&kbuf, " \t\n")) != NULL) {
+ if (*token == '\0')
+ continue;
+
+ if (!strncmp(token, "mode=", 5)) {
+ ret = sysfs_match_string(page_owner_print_mode_strings,
+ token + 5);
+ if (ret < 0)
+ goto out_free;
+ new_print_mode = ret;
+ } else {
+ ret = -EINVAL;
+ goto out_free;
+ }
+ }
+
+ spin_lock_irqsave(&state->lock, flags);
+ state->print_mode = new_print_mode;
+ spin_unlock_irqrestore(&state->lock, flags);
+
+ ret = count;
+
+out_free:
+ kfree(orig);
+ return ret;
+}
+
static const struct file_operations page_owner_fops = {
+ .owner = THIS_MODULE,
+ .open = page_owner_open,
+ .release = page_owner_release,
+ .write = page_owner_write,
.read = read_page_owner,
.llseek = lseek_page_owner,
};
@@ -980,7 +1097,7 @@ static int __init pageowner_init(void)
return 0;
}
- debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops);
+ debugfs_create_file("page_owner", 0600, NULL, NULL, &page_owner_fops);
dir = debugfs_create_dir("page_owner_stacks", NULL);
debugfs_create_file("show_stacks", 0400, dir,
(void *)(STACK_PRINT_FLAG_STACK |
--
2.20.1
^ permalink raw reply related
* [PATCH] mm/rmap: use huge_ptep_get() in try_to_unmap_one()
From: Dev Jain @ 2026-06-25 4:28 UTC (permalink / raw)
To: akpm, david, ljs
Cc: Dev Jain, riel, liam, vbabka, harry, jannh, kas, linux-mm,
linux-kernel, ryan.roberts, anshuman.khandual, stable
try_to_unmap_one() handles hugetlb folios when memory failure needs
to replace a poisoned hugetlb mapping with a hwpoison entry. In that
case page_vma_mapped_walk() returns the hugetlb entry in pvmw.pte, but
the code reads it with ptep_get() before decoding the PFN.
That is wrong on architectures where hugetlb entries are not encoded as
regular PTEs. On s390, for example, a raw huge RSTE must be converted
by huge_ptep_get() before helpers such as pte_pfn() can inspect it. A
raw decode can select the wrong subpage, so try_to_unmap_one() can
install a hwpoison entry for the wrong PFN.
The userspace-visible result is that a later access to the poisoned
hugetlb subpage can miss the expected SIGBUS. With DEBUG_VM, the wrong
subpage can also trip the PageHWPoison check.
Use huge_ptep_get() for hugetlb mappings before decoding the PFN.
Before c7ab0d2fdc84, the bug existed in the form of a plain dereference:
we would check the head page pfn of the hugetlb with pte_pfn(*pte), and
bail out on mismatch. This meant that the hwpoison entry would not
get installed.
I am not sure what the procedure is for such very old bugs - how far
back should I really go?
Fixes: c7ab0d2fdc84 ("mm: convert try_to_unmap_one() to use page_vma_mapped_walk()")
Cc: stable@vger.kernel.org
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
Applies on mm-unstable (d17fe8a046a2).
Similar old bugs are present in try_to_migrate_one(), check_pte(),
remove_migration_pte(), and prot_none_hugetlb_entry().
mm/rmap.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 1c77d5dc06e9f..aa8a254efaecc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2095,11 +2095,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- /*
- * Handle PFN swap PTEs, such as device-exclusive ones, that
- * actually map pages.
- */
- pteval = ptep_get(pvmw.pte);
+ address = pvmw.address;
+ if (folio_test_hugetlb(folio)) {
+ pteval = huge_ptep_get(mm, address, pvmw.pte);
+ } else {
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones,
+ * that actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ }
if (likely(pte_present(pteval))) {
pfn = pte_pfn(pteval);
} else {
@@ -2110,7 +2115,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}
subpage = folio_page(folio, pfn - folio_pfn(folio));
- address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
From: Harry Yoo @ 2026-06-25 4:16 UTC (permalink / raw)
To: Qi Zheng, akpm, david, kasong, shakeel.butt, baohua,
axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
mhocko, roman.gushchin, ljs
Cc: linux-mm, linux-kernel, Qi Zheng, stable
In-Reply-To: <1d638906-6d64-4e57-a181-4b77683652b5@linux.dev>
[-- Attachment #1.1: Type: text/plain, Size: 6334 bytes --]
On 6/24/26 4:11 PM, Qi Zheng wrote:
> Hi Harry,
>
> On 6/24/26 12:29 PM, Harry Yoo wrote:
>> On 6/23/26 6:14 PM, Qi Zheng wrote:
>>> Hi Harry,
>>>
>>> On 6/23/26 4:18 PM, Harry Yoo wrote:
>>>> On 6/23/26 4:16 PM, Qi Zheng wrote:
>>>>> Hi Harry,
>>>>
>>>> Hi Qi!
>>>>
>>>>> On 6/23/26 2:17 PM, Harry Yoo wrote:
>>>>>> On 6/23/26 11:42 AM, Qi Zheng wrote:
>>>>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>>
>>>>>>> The mglru page table walker batches per-generation size deltas in
>>>>>>> walk->nr_pages while walking page tables without holding the lruvec
>>>>>>> lock.
>>>>>>> The reset_batch_size() later folds those deltas into walk->lruvec
>>>>>>> under
>>>>>>> the lruvec lock.
>>>>>>
>>>>>> Ouch.
>>>>>>
>>>>>> IIRC the user-visible impact of underestimated nr_pages in MGLRU
>>>>>> was premature OOMs because MGLRU does not try to reclaim memory when
>>>>>> nr_pages reaches zero, but there are still more pages.
>>>>>>
>>>>>> Perhaps worth mentioning in the changelog?
>>>>>
>>>>> Maybe this should be placed before "To fix it...".
>>>>
>>>> Thanks!
>>>>
>>>>>>> The page table walker can run concurrently with the memcg
>>>>>>> reparenting
>>>>>>> path
>>>>>>> as follows:
>>>>>>>
>>>>>>> CPU0 CPU1
>>>>>>> ==== ====
>>>>>>>
>>>>>>> walk_mm
>>>>>>> --> walk_page_range
>>>>>>> --> update_batch_size
>>>>>>> --> walk->nr_pages += delta
>>>>>>>
>>>>>>> mem_cgroup_css_offline
>>>>>>> --> memcg_reparent_objcgs
>>>>>>> --> lock lruvec
>>>>>>> lru_gen_reparent_memcg
>>>>>>> --> reparent child
>>>>>>> folios to
>>>>>>> parent
>>>>>>> unlock lruvec
>>>>>>>
>>>>>>> lock lruvec
>>>>>>> reset_batch_size
>>>>>>> --> child lrugen->nr_pages += delta
>>>>>>
>>>>>> The problem here is that, while grabbing a reference to memcg
>>>>>> (via mem_cgroup_iter(), for example) makes sure that the memcg is not
>>>>>> freed, it does not prevent offlining from happening, and
>>>>>> reset_batch_size()
>>>>>> doesn't check whether the lruvec has been reparented, or the lruvec
>>>>>> is going to be reparented.
>>>>>>
>>>>>>> This will trigger the following warning in lru_gen_exit_memcg():
>>>>>>>
>>>>>>> VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
>>>>>>> sizeof(lruvec->lrugen.nr_pages)));
>>>>>>>
>>>>>>> To fix it, add lrugen->reparented to remember the new owner of a
>>>>>>> reparented lruvec, and make reset_batch_size() charge pending
>>>>>>> deltas to
>>>>>>> that owner.
>>>>>>
>>>>>> Could you please explain why it is unavoidable to introduce the new
>>>>>> field and why checking whether the cgroup is dying (and charging
>>>>>> deltas
>>>>>> to non-dying parent) doesn't work?
>>>>>
>>>>> Peiyang tried doing this [1], but it doesn't work because
>>>>> ss->css_offline() is called before clearing the CSS_ONLINE flag.
>>>>
>>>> Right.
>>>>
>>>>> I also considered using mem_cgroup_tryget_online(), but that only
>>>>> prevents
>>>>> the memcg from being freed. It doesn't prevent the offlining.
>>>>
>>>> Right.
>>>>
>>>> I think checking CSS_DYING under RCU and grabbing the lruvec
>>>> of the first non-dying memcg should work (this pattern is already
>>>> used where we use RCU to guarantee memcgs are not freed).
>>>>
>>>> If we do not observe CSS_DYING flag, it is safe to charge deltas
>>>> to the lruvec because RCU guarantees that reparenting cannot happen
>>>> under us.
>>>>
>>>> If we do observe CSS_DYING, we can walk up the hierarchy and charge
>>>> deltas to the first non-dying memcg.
>>>
>>> Checking CSS_DYING looks feasible, but the rcu lock alone cannot prevent
>>> reparenting. We should recheck CSS_DYING after acquiring the lruvec
>>> lock, otherwise we might run into the following race:
>>
>> Haha, actually, I was thinking of checking CSS_DYING under both RCU and
>> lruvec lock. (because that's the pattern)
>>
>>> CPU0 reset_batch_size CPU1 memcg teardown
>>> ===================== ==================
>>>
>>> read !CSS_DYING
>>>
>>> set CSS_DYING
>>
>> Oh, I thought the entire critical section is covered by RCU.
>> (I see lock_batch_lruvec() you suggested below doesn't do that)
>>
>> Isn't RCU enough to prevent reparenting because RCU guarantees that
>> all readers who read !CSS_DYING complete before reparenting?
>
> Oh, I think you are right.
>
> I forgot that offlining is executed in the rcu work context.
It's confusing :)
> Let's walk through this again:
>
> cgroup_destroy_locked
> --> kill_css_sync
> --> css->flags |= CSS_DYING; 1)
> kill_css_finish
> --> css_killed_ref_fn
> --> css_killed_work_fn <-- RCU work !! 2)
> --> offline_css
> --> reparent memcg
>
> So while holding the rcu lock, if CSS_DYING is not observed,
> css_killed_work_fn() will not be called until rcu_read_unlock().
Right.
> So lock_batch_lruvec() can be implemented like this:
>
> #ifdef CONFIG_MEMCG
> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
> {
> struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>
> rcu_read_lock();
>
> /*
> * The memcg can be NULL when the memory controller is disabled.
> * Otherwise, the caller keeps the memcg owning @lruvec alive.
> */
> if (!memcg || !css_is_dying(&memcg->css))
> goto lock;
>
> do {
> memcg = parent_mem_cgroup(memcg);
> } while (memcg && css_is_dying(&memcg->css));
> lruvec = mem_cgroup_lruvec(memcg, pgdat);
>
> lock:
> spin_lock_irq(&lruvec->lru_lock);
>
> return lruvec;
> }
> #else
> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
> {
> lruvec_lock_irq(lruvec);
>
> return lruvec;
> }
> #endif
>
> Does this make sense?
Yes, looks good to me!
--
Cheers,
Harry / Hyeonggon
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox