LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH V4 2/2] tools/perf: Use scnprintf in buffer offset calculations
From: Athira Rajeev @ 2026-05-04 15:42 UTC (permalink / raw)
  To: acme, jolsa, adrian.hunter, mpetlan, tmricht, maddy, irogers,
	namhyung
  Cc: linux-perf-users, linuxppc-dev, atrajeev, hbathini, Tejas.Manhas1,
	Tanushree.Shah, shivani
In-Reply-To: <20260504154205.21394-1-atrajeev@linux.ibm.com>

Replace snprintf with scnprintf in buffer offset calculations to
ensure the 'used' count will not exceed the "len".

The current logic in perf_pmu__for_each_event uses an unconditional
+ 1 increment to buf_used to account for null terminators. This can
cause a stack buffer overflow in the subsequent scnprintf call.
When the local stack buffer buf (1024 bytes) is full, buf_used can
reach 1025. This causes the subsequent remaining space calculation
sizeof(buf) - buf_used to underflow.

Use sub_non_neg() to check whether space actually exists, and only
increment the offset if remaining space is present.

Changes include:
- Use sub_non_neg to check if space exists
- Replacing snprintf with scnprintf to ensure the return value
reflects the actual bytes written into the buffer.
- Only increment buf_used by 1 if space exists
- If a parameterized event uses a built-in perf keyword for its
parameter name (eg, config=?), the lexer parses it as a predefined
term token, which sets term->config to NULL. Add check to use
parse_events__term_type_str() if term->config is NULL.

Signed-off-by: Athira Rajeev <atrajeev@linux.ibm.com>
---
Changelog:
v2 -> v3:
- Split the scnprintf related changes in separate patch
- Handle the overflow issues and unconditional increment
wrapped around sub_non_neg addressing review comment from Sashiko

 tools/perf/util/pmu.c | 46 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 0b8d58543f17..4b9ade1a4cf9 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -2129,15 +2129,19 @@ static char *format_alias(char *buf, int len, const struct perf_pmu *pmu,
 		pr_err("Failure to parse '%s' terms '%s': %d\n",
 			alias->name, alias->terms, ret);
 		parse_events_terms__exit(&terms);
-		snprintf(buf, len, "%.*s/%s/", (int)pmu_name_len, pmu->name, alias->name);
+		scnprintf(buf, len, "%.*s/%s/", (int)pmu_name_len, pmu->name, alias->name);
 		return buf;
 	}
-	used = snprintf(buf, len, "%.*s/%s", (int)pmu_name_len, pmu->name, alias->name);
+	used = scnprintf(buf, len, "%.*s/%s", (int)pmu_name_len, pmu->name, alias->name);
 
 	list_for_each_entry(term, &terms.terms, list) {
+		const char *name = term->config;
+
+		if (!name)
+			name = parse_events__term_type_str(term->type_term);
 		if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR)
-			used += snprintf(buf + used, sub_non_neg(len, used),
-					",%s=%s", term->config,
+			used += scnprintf(buf + used, sub_non_neg(len, used),
+					",%s=%s", name,
 					term->val.str);
 	}
 	parse_events_terms__exit(&terms);
@@ -2201,6 +2205,7 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus,
 	int ret = 0;
 	struct hashmap_entry *entry;
 	size_t bkt;
+	size_t size_rem, len;
 
 	if (perf_pmu__is_tracepoint(pmu))
 		return tp_pmu__for_each_event(pmu, state, cb);
@@ -2234,17 +2239,36 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus,
 			}
 			buf_used = strlen(buf) + 1;
 		}
+
 		info.scale_unit = NULL;
 		if (strlen(event->unit) || event->scale != 1.0) {
-			info.scale_unit = buf + buf_used;
-			buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used,
-					"%G%s", event->scale, event->unit) + 1;
+			/* Check the remaining space */
+			size_rem = sub_non_neg(sizeof(buf), buf_used);
+
+			if (size_rem > 0) {
+				info.scale_unit = buf + buf_used;
+				len = scnprintf(buf + buf_used, size_rem, "%G%s",
+						event->scale, event->unit);
+				/*
+				 * Increment buf_used by 1 only if
+				 * it fits remaining space
+				 */
+				buf_used += min(len + 1, size_rem);
+			}
 		}
 		info.desc = event->desc;
 		info.long_desc = event->long_desc;
-		info.encoding_desc = buf + buf_used;
-		buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used,
-				"%.*s/%s/", (int)pmu_name_len, info.pmu_name, event->terms) + 1;
+		info.encoding_desc = NULL;
+
+		/* Check the remaining space */
+		size_rem = sub_non_neg(sizeof(buf), buf_used);
+		if (size_rem > 0) {
+			info.encoding_desc = buf + buf_used;
+			len = scnprintf(buf + buf_used, size_rem, "%.*s/%s/",
+					(int)pmu_name_len, info.pmu_name, event->terms);
+			buf_used += min(len + 1, size_rem);
+		}
+
 		info.str = event->terms;
 		info.topic = event->topic;
 		info.deprecated = perf_pmu_alias__check_deprecated(pmu, event);
@@ -2254,7 +2278,7 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus,
 	}
 	if (pmu->selectable) {
 		info.name = buf;
-		snprintf(buf, sizeof(buf), "%s//", pmu->name);
+		scnprintf(buf, sizeof(buf), "%s//", pmu->name);
 		info.alias = NULL;
 		info.scale_unit = NULL;
 		info.desc = NULL;
-- 
2.47.3



^ permalink raw reply related

* [PATCH V4 1/2] tools/perf: Fix the check for parameterized field in event term
From: Athira Rajeev @ 2026-05-04 15:42 UTC (permalink / raw)
  To: acme, jolsa, adrian.hunter, mpetlan, tmricht, maddy, irogers,
	namhyung
  Cc: linux-perf-users, linuxppc-dev, atrajeev, hbathini, Tejas.Manhas1,
	Tanushree.Shah, shivani

The format_alias() function in util/pmu.c has a check to
detect whether the event has parameterized field ( =? ).
The string alias->terms contains the event and if the event
has user configurable parameter, there will be presence of
sub string "=?" in the alias->terms.

Snippet of code:

 /* Paramemterized events have the parameters shown. */
       if (strstr(alias->terms, "=?")) {
               /* No parameters. */
               snprintf(buf, len, "%.*s/%s/", (int)pmu_name_len, pmu->name, alias->name);

If strstr() finds the substring, it returns a non-NULL pointer, so
the "No parameters" branch is taken for exactly those events that do
have parameters, which is the opposite of what is intended. Hence
"perf list" doesn't show the parameterized fields in the result.

Fix this check to use:

if (!strstr(alias->terms, "=?")) {

With this change, perf list shows the events correctly with
the strings showing parameters.

Before the fix:

 # ./perf list|grep -w PM_PAU_CYC
  hv_24x7/PM_PAU_CYC/                                [Kernel PMU event]

With this fix:

 # ./perf list|grep -w PM_PAU_CYC
  hv_24x7/PM_PAU_CYC,chip=?/                         [Kernel PMU event]

Signed-off-by: Athira Rajeev <atrajeev@linux.ibm.com>
---
Changelog:
v3 -> v4:
Updated commit message to show real example
addressing review comment from Namhyung.

v2 -> v3:
Split the strstr correction in a single patch

 tools/perf/util/pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 23337d2fa281..0b8d58543f17 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -2117,7 +2117,7 @@ static char *format_alias(char *buf, int len, const struct perf_pmu *pmu,
 						   skip_duplicate_pmus);
 
 	/* Paramemterized events have the parameters shown. */
-	if (strstr(alias->terms, "=?")) {
+	if (!strstr(alias->terms, "=?")) {
 		/* No parameters. */
 		snprintf(buf, len, "%.*s/%s/", (int)pmu_name_len, pmu->name, alias->name);
 		return buf;
-- 
2.47.3



^ permalink raw reply related

* [PATCH V5 2/2] tools/perf: Add kernel-doc comment to auxtrace_record__init() function
From: Athira Rajeev @ 2026-05-04 15:13 UTC (permalink / raw)
  To: acme, jolsa, adrian.hunter, mpetlan, tmricht, maddy, irogers,
	namhyung
  Cc: linux-perf-users, linuxppc-dev, atrajeev, hbathini, Tejas.Manhas1,
	Tanushree.Shah, shivani
In-Reply-To: <20260504151321.12346-1-atrajeev@linux.ibm.com>

Add documentation comment describing the parameters
and return code for auxtrace_record__init() in util/auxtrace.c

Signed-off-by: Athira Rajeev <atrajeev@linux.ibm.com>
---
Changelog:
v4 -> v5:
Addressed review comment from Namhyung:
- Used original placement for __maybe_unused as
"struct evlist *evlist __maybe_unused"
- Added return code expectation on success case

v3 -> v4:
Addressed review comment from Sashiko:
Update return value expectation for fail as
non zero return err code.

v2 -> v3:
Addressed review comment from Sashiko:
Update return value expectation for success and fail
correctly.

 tools/perf/util/auxtrace.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index a224687ffbc1..a9f007d47c0b 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -896,6 +896,21 @@ int auxtrace_parse_aux_action(struct evlist *evlist)
 	return 0;
 }
 
+/**
+ * auxtrace_record__init - Initialize an AUX area tracing record.
+ * @evlist: The list of events to check for AUX area tracing event.
+ * @err: Pointer to an integer to store return code.
+ *
+ * This function looks through the @evlist to determine which AUX area
+ * tracing hardware is being used and initializes the auxtrace_record
+ * structure.
+ *
+ * Return:
+ * a) A pointer to the struct auxtrace_record with @err = 0 on success.
+ * b) NULL with @err = 0 if no AUX area tracing event is found/supported
+ *    (not considered an error).
+ * c) NULL with non-zero @err on actual auxtrace_record__init failure.
+ */
 struct auxtrace_record *__weak
 auxtrace_record__init(struct evlist *evlist __maybe_unused, int *err)
 {
-- 
2.47.3



^ permalink raw reply related

* [PATCH V5 1/2] powerpc tools perf: Initialize error code in auxtrace_record_init function
From: Athira Rajeev @ 2026-05-04 15:13 UTC (permalink / raw)
  To: acme, jolsa, adrian.hunter, mpetlan, tmricht, maddy, irogers,
	namhyung
  Cc: linux-perf-users, linuxppc-dev, atrajeev, hbathini, Tejas.Manhas1,
	Tanushree.Shah, shivani

perf trace record fails in some cases on powerpc

 # ./perf test "perf trace record and replay"
 128: perf trace record and replay                                    : FAILED!

 # ./perf trace record sleep 1
 # echo $?
   32

This is happening because of non-zero err value from
auxtrace_record__init() function.

 static int record__auxtrace_init(struct record *rec)
 {
        int err;

        if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
            && record__threads_enabled(rec)) {
                pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
                return -EINVAL;
        }

        if (!rec->itr) {
                rec->itr = auxtrace_record__init(rec->evlist, &err);
                if (err)
                        return err;
        }

Here "int err" is not initialised. The code expects "err" to be set
from auxtrace_record__init() function.

Update auxtrace_record__init() in arch/powerpc/util/auxtrace.c to clear
err value in the beginning.

- Clear err value in beginning of function. Any fail later will
set appropriate return code to err.
- Even if we haven't found any event for auxtrace, perf record
should continue for other events. NULL return
will indicate that there is no auxtrace record initialized.
- Not having "err" set here will affect monitoring of other events
also because perf record will fail seeing random value in err.

Set err to -EINVAL before invoking auxtrace_record__init() in
builtin-record.c

With the fix,

 # ./perf trace record sleep 1
 [ perf record: Woken up 2 times to write data ]
 [ perf record: Captured and wrote 0.033 MB perf.data (228 samples) ]

Fixes: 1dbfaf94cf66 ("perf powerpc: Add basic CONFIG_AUXTRACE support for VPA pmu on powerpc")
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Athira Rajeev <atrajeev@linux.ibm.com>
---
Changelog:
v4:
Added Reviewed-by from Adrian

v1 -> v2
Addressed review comment from Adrian:
- Set err to -EINVAL before invoking auxtrace_record__init() in
  builtin-record.c
- Added kernel-doc to auxtrace_record__init() in tools/perf/util/auxtrace.c
Addressed review comment from Namhyung:
- Added fixes tag

 tools/perf/arch/powerpc/util/auxtrace.c | 6 ++++++
 tools/perf/builtin-record.c             | 1 +
 2 files changed, 7 insertions(+)

diff --git a/tools/perf/arch/powerpc/util/auxtrace.c b/tools/perf/arch/powerpc/util/auxtrace.c
index e39deff6c857..4600a1661b4f 100644
--- a/tools/perf/arch/powerpc/util/auxtrace.c
+++ b/tools/perf/arch/powerpc/util/auxtrace.c
@@ -71,6 +71,12 @@ struct auxtrace_record *auxtrace_record__init(struct evlist *evlist,
 	struct evsel *pos;
 	int found = 0;
 
+	/*
+	 * Set err value to zero here. Any fail later
+	 * will set appropriate return code to err.
+	 */
+	*err = 0;
+
 	evlist__for_each_entry(evlist, pos) {
 		if (strstarts(pos->name, "vpa_dtl")) {
 			found = 1;
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4a5eba498c02..708825747af5 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -865,6 +865,7 @@ static int record__auxtrace_init(struct record *rec)
 	}
 
 	if (!rec->itr) {
+		err = -EINVAL;
 		rec->itr = auxtrace_record__init(rec->evlist, &err);
 		if (err)
 			return err;
-- 
2.47.3



^ permalink raw reply related

* Re: [PATCH v2 0/5] mm: reduce mmap_lock contention and improve page fault performance
From: Barry Song @ 2026-05-04 14:15 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox, akpm, linux-mm, david, ljs, liam, vbabka, rppt,
	surenb, mhocko, pfalcato, wanglian, chentao, lianux.mm,
	kunwu.chan, liyangouwen1, chrisl, kasong, shikemeng, nphamcs, bhe,
	youngjun.park, linux-arm-kernel, linux-kernel, loongarch,
	linuxppc-dev, linux-riscv, linux-s390
In-Reply-To: <n5vsmp6ingbhfztl6wzcwmrz56gkkfbpzardk43scr7rcgqsw5@evtnov34c5bt>

On Mon, May 4, 2026 at 9:04 PM Jan Kara <jack@suse.cz> wrote:
>
> On Mon 04-05-26 03:55:43, Barry Song wrote:
> > On Mon, May 4, 2026 at 2:17 AM Jan Kara <jack@suse.cz> wrote:
> > > On Fri 01-05-26 18:57:52, Matthew Wilcox wrote:
> > > > On Sat, May 02, 2026 at 01:44:34AM +0800, Barry Song wrote:
> > > > > On Fri, May 1, 2026 at 10:57 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > > > > On Fri, May 01, 2026 at 06:49:58AM +0800, Barry Song wrote:
> > > > > > > 1. There is no deterministic latency for I/O completion. It depends on
> > > > > > > both the hardware and the software stack (bio/request queues and the
> > > > > > > block scheduler). Sometimes the latency is short; at other times it can
> > > > > > > be quite long. In such cases, a high-priority thread performing operations
> > > > > > > such as mprotect, unmap, prctl_set_vma, or madvise may be forced to wait
> > > > > > > for an unpredictable amount of time.
> > > > > >
> > > > > > But does that actually happen?  I find it hard to believe that thread A
> > > > > > unmaps a VMA while thread B is in the middle of taking a page fault in
> > > > > > that same VMA.  mprotect() and madvise() are more likely to happen, but
> > > > > > it still seems really unlikely to me.
> > > > >
> > > > > It doesn’t have to involve unmapping or applying mprotect to
> > > > > the entire VMA—just a portion of it is sufficient.
> > > >
> > > > Yes, but that still fails to answer "does this actually happen".  How much
> > > > performance is all this complexity in the page fault handler buying us?
> > > > If you don't answer this question, I'm just going to go in and rip it
> > > > all out.
> > >
> > > I fully agree with you we should verify whether the retry code still brings
> > > in real-world advantage today with VMA locks. After all the retry logic has
> > > been introduced in 2010. That being said if there are realistic loads where
> > > one thread needs VMA write lock while another thread is faulting the VMA,
> > > then the latencies can be indeed extreme. For example things like cgroup IO
> > > throttling happen on the IO path and thus can throttle IO of a low-priority
> > > thread for a long time.
> >
> > I’m quite sure that swap-in and VMA writes can occur
> > concurrently, and this is fairly common. For example,
> > Java GC may use mprotect or userfaultfd on a small
> > portion of a large Java heap while other portions are
> > still under do_swap_page().
>
> OK, makes sense.
>
> > If we start exploring different approaches for anon and
> > file, I agree I can revisit this on an Android phone if
> > there is a real, serious case where a file VMA can be
> > written and a page fault occurs at the same time.
> >
> > Please note that, as an Android developer, I am particularly
> > cautious about priority inversion. A recent issue causing
> > severe priority inversion is zram attempting to support
> > preemption[1]. When a task performing compression or
> > decompression is migrated to another CPU and then preempted
> > by other tasks, high-priority tasks waiting on the mutex may
> > be significantly delayed, impacting user experience.
>
> Well, container people are concerned about priority inversion as well. But
> usually this is with coarse lock (such as global filesystem locks) but VMA
> lock is specific to a task (and a VMA) so there the opportunity for
> priority inversion looks more limited.  But the example with Java where GC
> thread can presumably have higher priority than ordinary Java threads is an
> interesting one.

A major difference in Android apps is that each thread can
affect user experience differently. And it is not simply a matter
of whether a VMA writer has higher or lower priority than a
page-fault (PF) thread performing I/O.

For example, thread A handles a PF; thread B attempts to
modify the VMA where the PF occurs; thread C tries to modify
another VMA (requiring mmap_lock in write mode) or iterate
VMAs (requiring mmap_lock in read mode). Regardless of
thread B’s priority, it holds mmap_lock in write mode while
waiting for the VMA lock. The usual pattern for a VMA writer
is:

mmap_write_lock()
vma_start_write()

As a result, thread C can be blocked even if it has higher
priority but operates on a different VMA.

In essence, when a PF and a VMA write occur concurrently,
high-priority threads may be blocked even if they operate on
different VMAs, not necessarily the same one.

Thanks
Barry


^ permalink raw reply

* Re: [PATCH] lib/crypto: powerpc/md5: Drop powerpc optimized MD5 code
From: Ard Biesheuvel @ 2026-05-04 13:56 UTC (permalink / raw)
  To: Christophe Leroy (CS GROUP), Eric Biggers, linux-crypto
  Cc: linux-kernel, Jason A . Donenfeld, Herbert Xu, linuxppc-dev,
	Nicholas Piggin, Michael Ellerman, Madhavan Srinivasan
In-Reply-To: <111ea924-fef5-441e-9849-83f938c913a7@kernel.org>

Hello Christophe,

On Mon, 4 May 2026, at 15:28, Christophe Leroy (CS GROUP) wrote:
...
> I'm really concerned with the optimised MD5 going away now, and I'm also 
> wondering what will be the way to splice a file into the kernel and get 
> it's MD-5 hash from the TALITOS if AF_ALG goes away in medium-term.
>
> What is the way forward ? I'm open to any suggestion as I really can't 
> see where to go for now.
>

AF_ALG was created to give user space access to crypto accelerators that
require privileged execution, for sharing between clients, and for managing
DMA etc.

The fact that kernel crypto code that does not have this requirement was
exposed via AF_ALG too is a historical accident, and this is causing the
pain that Eric describes wrt attack surface etc.

It sounds like you have constructed a vertically integrated system where
the kernel provides the fallback when the Talitos engine is not available
via AF_ALG.

This fallback does not need to live in the kernel, and it would be much
better (as well as more efficient) if user space implemented MD5
itself when the Talitos cannot be accessed via AF_ALG. In user space, you
can use any implementation you like, generic or asm accelerated. This is
what all other architectures already implement, in OpenSSL etc.

Claiming that your user space software must only implement one code path,
and that punting this to the kernel is therefore required is not a
technical argument: this is just policy on your part that the community
is not bound to.

However, deprecating AF_ALG does not mean that we will ever be able to
remove it entirely. Especially the crypto accelerators that cannot be
accessed by user space in any other way will remain supported as long
as needed for legacy use cases.

But I think we should consider libkcapi as a general purpose crypto
library deprecated too, as well as any other use of AF_ALG in lieu of
user space libraries. It is not the kernel's job to execute user space
code that can easily execute non-privileged as well.

I suppose there will be more discussion soon about AF_ALG deprecation
for software crypto. It is likely that we will need to come up with
an allowlist of algorithms, in order to limit the attack surface to those
algorithms (such as your MD5) that are known to be relied upon by user space,
rather than any random combination of all the buggy template code and
null_ciphers etc.

Do you have any use cases where MD5 is a bottleneck, and the generic
implementation is too slow?

^ permalink raw reply

* Re: [PATCH v2 0/5] mm: reduce mmap_lock contention and improve page fault performance
From: Barry Song @ 2026-05-04 13:35 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox, akpm, linux-mm, david, ljs, liam, vbabka, rppt,
	surenb, mhocko, pfalcato, wanglian, chentao, lianux.mm,
	kunwu.chan, liyangouwen1, chrisl, kasong, shikemeng, nphamcs, bhe,
	youngjun.park, linux-arm-kernel, linux-kernel, loongarch,
	linuxppc-dev, linux-riscv, linux-s390
In-Reply-To: <n5vsmp6ingbhfztl6wzcwmrz56gkkfbpzardk43scr7rcgqsw5@evtnov34c5bt>

On Mon, May 4, 2026 at 9:04 PM Jan Kara <jack@suse.cz> wrote:
[...]
>
> > > BTW I'm not sure I quite understand Barry's priority inversion problem
> > > since I'd expect all threads of a task to generally be treated with the
> > > same priority...
> >
> > Exactly not. Maybe these slides[2] and this project[3] can give
> > you a hint—they aim to standardize things on Linux by
> > learning from Apple OS. Basically, tasks are classified
> > into five types:
> >
> > USER_INTERACTIVE: Requires immediate response.
> > USER_INITIATED: Tolerates a short delay, but must respond quickly still.
> > UTILITY: Tolerates long delays, but not prolonged ones.
> > BACKGROUND: Doesn’t mind prolonged delays.
> > DEFAULT: System default behavior.
>
> Again, this is a clasification of tasks but not really of threads in a task
> so at least for VMA lock there's no inversion so have?

I’m specifically referring to a task (i.e., a thread) when
discussing scheduler context. It may be clearer to use the
terms process and thread explicitly.

In a typical process sharing an mm_struct, each thread can
have a different priority.

In an Android app, some threads handle the UI and require
higher priority, such as the main thread and RenderThread;
otherwise, frame drops may occur.

The Linux scheduler can control scheduling policy and
priority for each thread.

Thanks
Barry


^ permalink raw reply

* Re: [PATCH] lib/crypto: powerpc/md5: Drop powerpc optimized MD5 code
From: Christophe Leroy (CS GROUP) @ 2026-05-04 13:28 UTC (permalink / raw)
  To: Eric Biggers, linux-crypto
  Cc: linux-kernel, Ard Biesheuvel, Jason A . Donenfeld, Herbert Xu,
	linuxppc-dev, Nicholas Piggin, Michael Ellerman,
	Madhavan Srinivasan
In-Reply-To: <20260504041448.15820-1-ebiggers@kernel.org>

Hi Eric,

Le 04/05/2026 à 06:14, Eric Biggers a écrit :
> Earlier the decision was made to keep this code for a while, despite no
> other architectures having optimized MD5 code anymore, because of
> someone using it via AF_ALG via libkcapi-hasher
> (https://lore.kernel.org/r/f0d771d5-ed70-444c-957a-ad4c16f6c115@csgroup.eu/)
> 
> However, with AF_ALG itself now being on its way out due to its
> continuous stream of security vulnerabilities
> (https://lore.kernel.org/r/20260430011544.31823-1-ebiggers@kernel.org/),
> it's time to be a bit more forceful with nudging people towards
> userspace crypto code.  It's always been the better solution anyway, and
> it's much more efficient if properly optimized code is used.

Ok, why not, but what do you propose as an alternative ? Let me explain 
the situation.

We have two versions of boards:
- One with powerpc MPC885E, which embeds a SECURITY Engine called 
TALITOS for offloading crypto operations
- One with powerpc MPC866, which doesn't have the security engine.

To use the security engine, our software uses the AF_ALG interface (via 
libkcapi).

Our software has to run on both boards, we can't afford two different 
versions of the software and the software shall have no dead code. 
Therefore we rely on the capability of the kernel to do the hash by 
itself when the TALITOS is not available.

The kernel has always been the place where we do board specific stuff, 
not the application. I can't see why the application would have to ask 
the kernel when the Talitos is there and have to do the hashing by 
itself when the Talitos is not there.

I'm really concerned with the optimised MD5 going away now, and I'm also 
wondering what will be the way to splice a file into the kernel and get 
its MD-5 hash from the TALITOS if AF_ALG goes away in the medium term.

What is the way forward ? I'm open to any suggestion as I really can't 
see where to go for now.

But please don't remove powerpc MD5 before we find an alternative solution.

Thanks
Christophe

> 
> Thus, drop the PowerPC optimized MD5 code.  Note that this code contains
> no privileged instructions and could be run in userspace just fine.
> 
> MD5 is still supported, just with the generic code only.  I.e., this
> commit only changes performance; it isn't a hard break.
> 
> This also has no effect on implementations of md5sum that already just
> use userspace code (as they should), for example the coreutils one.
> 
> Signed-off-by: Eric Biggers <ebiggers@kernel.org>
> ---
>   lib/crypto/Kconfig           |   5 -
>   lib/crypto/Makefile          |   4 -
>   lib/crypto/md5.c             |  20 ++-
>   lib/crypto/powerpc/md5-asm.S | 235 -----------------------------------
>   lib/crypto/powerpc/md5.h     |  12 --
>   5 files changed, 7 insertions(+), 269 deletions(-)
>   delete mode 100644 lib/crypto/powerpc/md5-asm.S
>   delete mode 100644 lib/crypto/powerpc/md5.h
> 
> diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
> index d3904b72dae7..591c1c2a7fb3 100644
> --- a/lib/crypto/Kconfig
> +++ b/lib/crypto/Kconfig
> @@ -129,15 +129,10 @@ config CRYPTO_LIB_MD5
>   	tristate
>   	help
>   	  The MD5 and HMAC-MD5 library functions.  Select this if your module
>   	  uses any of the functions from <crypto/md5.h>.
>   
> -config CRYPTO_LIB_MD5_ARCH
> -	bool
> -	depends on CRYPTO_LIB_MD5 && !UML
> -	default y if PPC
> -
>   config CRYPTO_LIB_MLDSA
>   	tristate
>   	select CRYPTO_LIB_SHA3
>   	help
>   	  The ML-DSA library functions.  Select this if your module uses any of
> diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
> index 4ad91f390038..f1e9bf89785f 100644
> --- a/lib/crypto/Makefile
> +++ b/lib/crypto/Makefile
> @@ -185,14 +185,10 @@ clean-files += powerpc/ghashp8-ppc.S
>   
>   ################################################################################
>   
>   obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o
>   libmd5-y := md5.o
> -ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y)
> -CFLAGS_md5.o += -I$(src)/$(SRCARCH)
> -libmd5-$(CONFIG_PPC) += powerpc/md5-asm.o
> -endif # CONFIG_CRYPTO_LIB_MD5_ARCH
>   
>   ################################################################################
>   
>   obj-$(CONFIG_CRYPTO_LIB_MLDSA) += libmldsa.o
>   libmldsa-y := mldsa.o
> diff --git a/lib/crypto/md5.c b/lib/crypto/md5.c
> index c4af57db0ea8..6bf130cfbbf9 100644
> --- a/lib/crypto/md5.c
> +++ b/lib/crypto/md5.c
> @@ -1,11 +1,11 @@
>   // SPDX-License-Identifier: GPL-2.0-or-later
>   /*
>    * MD5 and HMAC-MD5 library functions
>    *
> - * md5_block_generic() is derived from cryptoapi implementation, originally
> - * based on the public domain implementation written by Colin Plumb in 1993.
> + * md5_block() is derived from cryptoapi implementation, originally based on the
> + * public domain implementation written by Colin Plumb in 1993.
>    *
>    * Copyright (c) Cryptoapi developers.
>    * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
>    * Copyright 2025 Google LLC
>    */
> @@ -29,12 +29,12 @@ static const struct md5_block_state md5_iv = {
>   #define F4(x, y, z) (y ^ (x | ~z))
>   
>   #define MD5STEP(f, w, x, y, z, in, s) \
>   	(w += f(x, y, z) + in, w = rol32(w, s) + x)
>   
> -static void md5_block_generic(struct md5_block_state *state,
> -			      const u8 data[MD5_BLOCK_SIZE])
> +static void md5_block(struct md5_block_state *state,
> +		      const u8 data[MD5_BLOCK_SIZE])
>   {
>   	u32 in[MD5_BLOCK_WORDS];
>   	u32 a, b, c, d;
>   
>   	memcpy(in, data, MD5_BLOCK_SIZE);
> @@ -117,25 +117,19 @@ static void md5_block_generic(struct md5_block_state *state,
>   	state->h[1] += b;
>   	state->h[2] += c;
>   	state->h[3] += d;
>   }
>   
> -static void __maybe_unused md5_blocks_generic(struct md5_block_state *state,
> -					      const u8 *data, size_t nblocks)
> +static void md5_blocks(struct md5_block_state *state,
> +		       const u8 *data, size_t nblocks)
>   {
>   	do {
> -		md5_block_generic(state, data);
> +		md5_block(state, data);
>   		data += MD5_BLOCK_SIZE;
>   	} while (--nblocks);
>   }
>   
> -#ifdef CONFIG_CRYPTO_LIB_MD5_ARCH
> -#include "md5.h" /* $(SRCARCH)/md5.h */
> -#else
> -#define md5_blocks md5_blocks_generic
> -#endif
> -
>   void md5_init(struct md5_ctx *ctx)
>   {
>   	ctx->state = md5_iv;
>   	ctx->bytecount = 0;
>   }
> diff --git a/lib/crypto/powerpc/md5-asm.S b/lib/crypto/powerpc/md5-asm.S
> deleted file mode 100644
> index fa6bc440cf4a..000000000000
> --- a/lib/crypto/powerpc/md5-asm.S
> +++ /dev/null
> @@ -1,235 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-or-later */
> -/*
> - * Fast MD5 implementation for PPC
> - *
> - * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
> - */
> -#include <asm/ppc_asm.h>
> -#include <asm/asm-offsets.h>
> -#include <asm/asm-compat.h>
> -
> -#define rHP	r3
> -#define rWP	r4
> -
> -#define rH0	r0
> -#define rH1	r6
> -#define rH2	r7
> -#define rH3	r5
> -
> -#define rW00	r8
> -#define rW01	r9
> -#define rW02	r10
> -#define rW03	r11
> -#define rW04	r12
> -#define rW05	r14
> -#define rW06	r15
> -#define rW07	r16
> -#define rW08	r17
> -#define rW09	r18
> -#define rW10	r19
> -#define rW11	r20
> -#define rW12	r21
> -#define rW13	r22
> -#define rW14	r23
> -#define rW15	r24
> -
> -#define rT0	r25
> -#define rT1	r26
> -
> -#define INITIALIZE \
> -	PPC_STLU r1,-INT_FRAME_SIZE(r1); \
> -	SAVE_GPRS(14, 26, r1)		/* push registers onto stack	*/
> -
> -#define FINALIZE \
> -	REST_GPRS(14, 26, r1);		/* pop registers from stack	*/ \
> -	addi	r1,r1,INT_FRAME_SIZE
> -
> -#ifdef __BIG_ENDIAN__
> -#define LOAD_DATA(reg, off) \
> -	lwbrx		reg,0,rWP;	/* load data			*/
> -#define INC_PTR \
> -	addi		rWP,rWP,4;	/* increment per word		*/
> -#define NEXT_BLOCK			/* nothing to do		*/
> -#else
> -#define LOAD_DATA(reg, off) \
> -	lwz		reg,off(rWP);	/* load data			*/
> -#define INC_PTR				/* nothing to do		*/
> -#define NEXT_BLOCK \
> -	addi		rWP,rWP,64;	/* increment per block		*/
> -#endif
> -
> -#define R_00_15(a, b, c, d, w0, w1, p, q, off, k0h, k0l, k1h, k1l) \
> -	LOAD_DATA(w0, off)		/*    W				*/ \
> -	and		rT0,b,c;	/* 1: f = b and c		*/ \
> -	INC_PTR				/*    ptr++			*/ \
> -	andc		rT1,d,b;	/* 1: f' = ~b and d		*/ \
> -	LOAD_DATA(w1, off+4)		/*    W				*/ \
> -	or		rT0,rT0,rT1;	/* 1: f = f or f'		*/ \
> -	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
> -	add		a,a,rT0;	/* 1: a = a + f			*/ \
> -	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
> -	addis		w1,w1,k1h;	/* 2: wk = w + k		*/ \
> -	add		a,a,w0;		/* 1: a = a + wk		*/ \
> -	addi		w1,w1,k1l;	/* 2: wk = w + k'		*/ \
> -	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
> -	add		d,d,w1;		/* 2: a = a + wk		*/ \
> -	add		a,a,b;		/* 1: a = a + b			*/ \
> -	and		rT0,a,b;	/* 2: f = b and c		*/ \
> -	andc		rT1,c,a;	/* 2: f' = ~b and d		*/ \
> -	or		rT0,rT0,rT1;	/* 2: f = f or f'		*/ \
> -	add		d,d,rT0;	/* 2: a = a + f			*/ \
> -	INC_PTR				/*    ptr++			*/ \
> -	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
> -	add		d,d,a;		/* 2: a = a + b			*/
> -
> -#define R_16_31(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
> -	andc		rT0,c,d;	/* 1: f = c and ~d		*/ \
> -	and		rT1,b,d;	/* 1: f' = b and d		*/ \
> -	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
> -	or		rT0,rT0,rT1;	/* 1: f = f or f'		*/ \
> -	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
> -	add		a,a,rT0;	/* 1: a = a + f			*/ \
> -	addi		w1,w1,k1l;	/* 2: wk = w + k		*/ \
> -	add		a,a,w0;		/* 1: a = a + wk		*/ \
> -	addis		w1,w1,k1h;	/* 2: wk = w + k'		*/ \
> -	andc		rT0,b,c;	/* 2: f = c and ~d		*/ \
> -	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
> -	add		a,a,b;		/* 1: a = a + b			*/ \
> -	add		d,d,w1;		/* 2: a = a + wk		*/ \
> -	and		rT1,a,c;	/* 2: f' = b and d		*/ \
> -	or		rT0,rT0,rT1;	/* 2: f = f or f'		*/ \
> -	add		d,d,rT0;	/* 2: a = a + f			*/ \
> -	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
> -	add		d,d,a;		/* 2: a = a +b			*/
> -
> -#define R_32_47(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
> -	xor		rT0,b,c;	/* 1: f' = b xor c		*/ \
> -	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
> -	xor		rT1,rT0,d;	/* 1: f = f xor f'		*/ \
> -	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
> -	add		a,a,rT1;	/* 1: a = a + f			*/ \
> -	addi		w1,w1,k1l;	/* 2: wk = w + k		*/ \
> -	add		a,a,w0;		/* 1: a = a + wk		*/ \
> -	addis		w1,w1,k1h;	/* 2: wk = w + k'		*/ \
> -	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
> -	add		d,d,w1;		/* 2: a = a + wk		*/ \
> -	add		a,a,b;		/* 1: a = a + b			*/ \
> -	xor		rT1,rT0,a;	/* 2: f = b xor f'		*/ \
> -	add		d,d,rT1;	/* 2: a = a + f			*/ \
> -	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
> -	add		d,d,a;		/* 2: a = a + b			*/
> -
> -#define R_48_63(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
> -	addi		w0,w0,k0l;	/* 1: w = w + k			*/ \
> -	orc		rT0,b,d;	/* 1: f = b or ~d		*/ \
> -	addis		w0,w0,k0h;	/* 1: w = w + k'		*/ \
> -	xor		rT0,rT0,c;	/* 1: f = f xor c		*/ \
> -	add		a,a,w0;		/* 1: a = a + wk		*/ \
> -	addi		w1,w1,k1l;	/* 2: w = w + k			*/ \
> -	add		a,a,rT0;	/* 1: a = a + f			*/ \
> -	addis		w1,w1,k1h;	/* 2: w = w + k'		*/ \
> -	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
> -	add		a,a,b;		/* 1: a = a + b			*/ \
> -	orc		rT0,a,c;	/* 2: f = b or ~d		*/ \
> -	add		d,d,w1;		/* 2: a = a + wk		*/ \
> -	xor		rT0,rT0,b;	/* 2: f = f xor c		*/ \
> -	add		d,d,rT0;	/* 2: a = a + f			*/ \
> -	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
> -	add		d,d,a;		/* 2: a = a + b			*/
> -
> -_GLOBAL(ppc_md5_transform)
> -	INITIALIZE
> -
> -	mtctr		r5
> -	lwz		rH0,0(rHP)
> -	lwz		rH1,4(rHP)
> -	lwz		rH2,8(rHP)
> -	lwz		rH3,12(rHP)
> -
> -ppc_md5_main:
> -	R_00_15(rH0, rH1, rH2, rH3, rW00, rW01, 25, 20, 0,
> -		0xd76b, -23432, 0xe8c8, -18602)
> -	R_00_15(rH2, rH3, rH0, rH1, rW02, rW03, 15, 10, 8,
> -		0x2420, 0x70db, 0xc1be, -12562)
> -	R_00_15(rH0, rH1, rH2, rH3, rW04, rW05, 25, 20, 16,
> -		0xf57c, 0x0faf, 0x4788, -14806)
> -	R_00_15(rH2, rH3, rH0, rH1, rW06, rW07, 15, 10, 24,
> -		0xa830, 0x4613, 0xfd47, -27391)
> -	R_00_15(rH0, rH1, rH2, rH3, rW08, rW09, 25, 20, 32,
> -		0x6981, -26408, 0x8b45,  -2129)
> -	R_00_15(rH2, rH3, rH0, rH1, rW10, rW11, 15, 10, 40,
> -		0xffff, 0x5bb1, 0x895d, -10306)
> -	R_00_15(rH0, rH1, rH2, rH3, rW12, rW13, 25, 20, 48,
> -		0x6b90, 0x1122, 0xfd98, 0x7193)
> -	R_00_15(rH2, rH3, rH0, rH1, rW14, rW15, 15, 10, 56,
> -		0xa679, 0x438e, 0x49b4, 0x0821)
> -
> -	R_16_31(rH0, rH1, rH2, rH3, rW01, rW06, 27, 23,
> -		0x0d56, 0x6e0c, 0x1810, 0x6d2d)
> -	R_16_31(rH2, rH3, rH0, rH1, rW11, rW00, 18, 12,
> -		0x9d02, -32109, 0x124c, 0x2332)
> -	R_16_31(rH0, rH1, rH2, rH3, rW05, rW10, 27, 23,
> -		0x8ea7, 0x4a33, 0x0245, -18270)
> -	R_16_31(rH2, rH3, rH0, rH1, rW15, rW04, 18, 12,
> -		0x8eee,  -8608, 0xf258,  -5095)
> -	R_16_31(rH0, rH1, rH2, rH3, rW09, rW14, 27, 23,
> -		0x969d, -10697, 0x1cbe, -15288)
> -	R_16_31(rH2, rH3, rH0, rH1, rW03, rW08, 18, 12,
> -		0x3317, 0x3e99, 0xdbd9, 0x7c15)
> -	R_16_31(rH0, rH1, rH2, rH3, rW13, rW02, 27, 23,
> -		0xac4b, 0x7772, 0xd8cf, 0x331d)
> -	R_16_31(rH2, rH3, rH0, rH1, rW07, rW12, 18, 12,
> -		0x6a28, 0x6dd8, 0x219a, 0x3b68)
> -
> -	R_32_47(rH0, rH1, rH2, rH3, rW05, rW08, 28, 21,
> -		0x29cb, 0x28e5, 0x4218,  -7788)
> -	R_32_47(rH2, rH3, rH0, rH1, rW11, rW14, 16,  9,
> -		0x473f, 0x06d1, 0x3aae, 0x3036)
> -	R_32_47(rH0, rH1, rH2, rH3, rW01, rW04, 28, 21,
> -		0xaea1, -15134, 0x640b, -11295)
> -	R_32_47(rH2, rH3, rH0, rH1, rW07, rW10, 16,  9,
> -		0x8f4c, 0x4887, 0xbc7c, -22499)
> -	R_32_47(rH0, rH1, rH2, rH3, rW13, rW00, 28, 21,
> -		0x7eb8, -27199, 0x00ea, 0x6050)
> -	R_32_47(rH2, rH3, rH0, rH1, rW03, rW06, 16,  9,
> -		0xe01a, 0x22fe, 0x4447, 0x69c5)
> -	R_32_47(rH0, rH1, rH2, rH3, rW09, rW12, 28, 21,
> -		0xb7f3, 0x0253, 0x59b1, 0x4d5b)
> -	R_32_47(rH2, rH3, rH0, rH1, rW15, rW02, 16,  9,
> -		0x4701, -27017, 0xc7bd, -19859)
> -
> -	R_48_63(rH0, rH1, rH2, rH3, rW00, rW07, 26, 22,
> -		0x0988,  -1462, 0x4c70, -19401)
> -	R_48_63(rH2, rH3, rH0, rH1, rW14, rW05, 17, 11,
> -		0xadaf,  -5221, 0xfc99, 0x66f7)
> -	R_48_63(rH0, rH1, rH2, rH3, rW12, rW03, 26, 22,
> -		0x7e80, -16418, 0xba1e, -25587)
> -	R_48_63(rH2, rH3, rH0, rH1, rW10, rW01, 17, 11,
> -		0x4130, 0x380d, 0xe0c5, 0x738d)
> -	lwz		rW00,0(rHP)
> -	R_48_63(rH0, rH1, rH2, rH3, rW08, rW15, 26, 22,
> -		0xe837, -30770, 0xde8a, 0x69e8)
> -	lwz		rW14,4(rHP)
> -	R_48_63(rH2, rH3, rH0, rH1, rW06, rW13, 17, 11,
> -		0x9e79, 0x260f, 0x256d, -27941)
> -	lwz		rW12,8(rHP)
> -	R_48_63(rH0, rH1, rH2, rH3, rW04, rW11, 26, 22,
> -		0xab75, -20775, 0x4f9e, -28397)
> -	lwz		rW10,12(rHP)
> -	R_48_63(rH2, rH3, rH0, rH1, rW02, rW09, 17, 11,
> -		0x662b, 0x7c56, 0x11b2, 0x0358)
> -
> -	add		rH0,rH0,rW00
> -	stw		rH0,0(rHP)
> -	add		rH1,rH1,rW14
> -	stw		rH1,4(rHP)
> -	add		rH2,rH2,rW12
> -	stw		rH2,8(rHP)
> -	add		rH3,rH3,rW10
> -	stw		rH3,12(rHP)
> -	NEXT_BLOCK
> -
> -	bdnz		ppc_md5_main
> -
> -	FINALIZE
> -	blr
> diff --git a/lib/crypto/powerpc/md5.h b/lib/crypto/powerpc/md5.h
> deleted file mode 100644
> index 540b08e34d1d..000000000000
> --- a/lib/crypto/powerpc/md5.h
> +++ /dev/null
> @@ -1,12 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-or-later */
> -/*
> - * MD5 optimized for PowerPC
> - */
> -
> -void ppc_md5_transform(u32 *state, const u8 *data, size_t nblocks);
> -
> -static void md5_blocks(struct md5_block_state *state,
> -		       const u8 *data, size_t nblocks)
> -{
> -	ppc_md5_transform(state->h, data, nblocks);
> -}
> 
> base-commit: 7fd2df204f342fc17d1a0bfcd474b24232fb0f32



^ permalink raw reply

* Re: [PATCH v2 0/5] mm: reduce mmap_lock contention and improve page fault performance
From: Jan Kara @ 2026-05-04 13:03 UTC (permalink / raw)
  To: Barry Song
  Cc: Jan Kara, Matthew Wilcox, akpm, linux-mm, david, ljs, liam,
	vbabka, rppt, surenb, mhocko, pfalcato, wanglian, chentao,
	lianux.mm, kunwu.chan, liyangouwen1, chrisl, kasong, shikemeng,
	nphamcs, bhe, youngjun.park, linux-arm-kernel, linux-kernel,
	loongarch, linuxppc-dev, linux-riscv, linux-s390
In-Reply-To: <CAGsJ_4y8JtCxEy9hVxVd8eaf8D9WC8BcRQp9afUVu=FJA1-awA@mail.gmail.com>

On Mon 04-05-26 03:55:43, Barry Song wrote:
> On Mon, May 4, 2026 at 2:17 AM Jan Kara <jack@suse.cz> wrote:
> > On Fri 01-05-26 18:57:52, Matthew Wilcox wrote:
> > > On Sat, May 02, 2026 at 01:44:34AM +0800, Barry Song wrote:
> > > > On Fri, May 1, 2026 at 10:57 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > > > On Fri, May 01, 2026 at 06:49:58AM +0800, Barry Song wrote:
> > > > > > 1. There is no deterministic latency for I/O completion. It depends on
> > > > > > both the hardware and the software stack (bio/request queues and the
> > > > > > block scheduler). Sometimes the latency is short; at other times it can
> > > > > > be quite long. In such cases, a high-priority thread performing operations
> > > > > > such as mprotect, unmap, prctl_set_vma, or madvise may be forced to wait
> > > > > > for an unpredictable amount of time.
> > > > >
> > > > > But does that actually happen?  I find it hard to believe that thread A
> > > > > unmaps a VMA while thread B is in the middle of taking a page fault in
> > > > > that same VMA.  mprotect() and madvise() are more likely to happen, but
> > > > > it still seems really unlikely to me.
> > > >
> > > > It doesn’t have to involve unmapping or applying mprotect to
> > > > the entire VMA—just a portion of it is sufficient.
> > >
> > > Yes, but that still fails to answer "does this actually happen".  How much
> > > performance is all this complexity in the page fault handler buying us?
> > > If you don't answer this question, I'm just going to go in and rip it
> > > all out.
> >
> > I fully agree with you we should verify whether the retry code still brings
> > in real-world advantage today with VMA locks. After all the retry logic has
> > been introduced in 2010. That being said if there are realistic loads where
> > one thread needs VMA write lock while another thread is faulting the VMA,
> > then the latencies can be indeed extreme. For example things like cgroup IO
> > throttling happen on the IO path and thus can throttle IO of a low-priority
> > thread for a long time.
> 
> I’m quite sure that swap-in and VMA writes can occur
> concurrently, and this is fairly common. For example,
> Java GC may use mprotect or userfaultfd on a small
> portion of a large Java heap while other portions are
> still under do_swap_page().

OK, makes sense.

> If we start exploring different approaches for anon and
> file, I agree I can revisit this on an Android phone if
> there is a real, serious case where a file VMA can be
> written and a page fault occurs at the same time.
> 
> Please note that, as an Android developer, I am particularly
> cautious about priority inversion. A recent issue causing
> severe priority inversion is zram attempting to support
> preemption[1]. When a task performing compression or
> decompression is migrated to another CPU and then preempted
> by other tasks, high-priority tasks waiting on the mutex may
> be significantly delayed, impacting user experience.

Well, container people are concerned about priority inversion as well. But
usually this is with coarse locks (such as global filesystem locks), whereas
a VMA lock is specific to a task (and a VMA), so there the opportunity for
priority inversion looks more limited.  But the example with Java, where the
GC thread can presumably have higher priority than ordinary Java threads, is
an interesting one.

> > BTW I'm not sure I quite understand Barry's priority inversion problem
> > since I'd expect all threads of a task to generally be treated with the
> > same priority...
> 
> Exactly not. Maybe these slides[2] and this project[3] can give
> you a hint—they aim to standardize things on Linux by
> learning from Apple OS. Basically, tasks are classified
> into five types:
> 
> USER_INTERACTIVE: Requires immediate response.
> USER_INITIATED: Tolerates a short delay, but must respond quickly still.
> UTILITY: Tolerates long delays, but not prolonged ones.
> BACKGROUND: Doesn’t mind prolonged delays.
> DEFAULT: System default behavior.

Again, this is a classification of tasks but not really of threads in a task,
so at least for the VMA lock there's no inversion to have?

								Honza

> [1] https://lore.kernel.org/linux-mm/20250303022425.285971-3-senozhatsky@chromium.org/
> [2] https://lpc.events/event/19/contributions/2089/attachments/1797/3877/Userspace%20Assisted%20Scheduling%20via%20Sched%20QoS.pdf
> [3] https://lore.kernel.org/lkml/20260415000910.2h5misvwc45bdumu@airbuntu/
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


^ permalink raw reply

* Re: [PATCH] lib/crypto: powerpc/md5: Drop powerpc optimized MD5 code
From: Ard Biesheuvel @ 2026-05-04 11:43 UTC (permalink / raw)
  To: Eric Biggers, linux-crypto
  Cc: linux-kernel, Jason A . Donenfeld, Herbert Xu, linuxppc-dev,
	Christophe Leroy (CS GROUP), Nicholas Piggin, Michael Ellerman,
	Madhavan Srinivasan
In-Reply-To: <20260504041448.15820-1-ebiggers@kernel.org>


On Mon, 4 May 2026, at 06:14, Eric Biggers wrote:
> Earlier the decision was made to keep this code for a while, despite no
> other architectures having optimized MD5 code anymore, because of
> someone using it via AF_ALG via libkcapi-hasher
> (https://lore.kernel.org/r/f0d771d5-ed70-444c-957a-ad4c16f6c115@csgroup.eu/)
>
> However, with AF_ALG itself now being on its way out due to its
> continuous stream of security vulnerabilities
> (https://lore.kernel.org/r/20260430011544.31823-1-ebiggers@kernel.org/),
> it's time to be a bit more forceful with nudging people towards
> userspace crypto code.  It's always been the better solution anyway, and
> it's much more efficient if properly optimized code is used.
>
> Thus, drop the PowerPC optimized MD5 code.  Note that this code contains
> no privileged instructions and could be run in userspace just fine.
>
> MD5 is still supported, just with the generic code only.  I.e., this
> commit only changes performance; it isn't a hard break.
>
> This also has no effect on implementations of md5sum that already just
> use userspace code (as they should), for example the coreutils one.
>
> Signed-off-by: Eric Biggers <ebiggers@kernel.org>

Acked-by: Ard Biesheuvel <ardb@kernel.org>


^ permalink raw reply

* Re: [mainline][bpf] build failure: conflicting bpf_arena_* prototypes between vmlinux.h and bpf_arena_common.h
From: Alexei Starovoitov @ 2026-05-04  8:44 UTC (permalink / raw)
  To: Venkat Rao Bagalkote
  Cc: Saket Kumar Bhaskar, Hari Bathini, Madhavan Srinivasan, bpf,
	linuxppc-dev, LKML
In-Reply-To: <362eaba2-54dd-4b59-81f5-65abb916e916@linux.ibm.com>

On Mon, May 4, 2026 at 10:42 AM Venkat Rao Bagalkote
<venkat88@linux.ibm.com> wrote:
>
> Greetings!!!
>
> I’m seeing a CLANG-BPF build failure in tools/testing/selftests/bpf
> related to conflicting prototypes of bpf_arena_* helpers between
> vmlinux.h and bpf_arena_common.h, on mainline kernel.
>
>
> Error log:
>
>    In file included from progs/arena_strsearch.c:12:
>    In file included from
> tools/testing/selftests/bpf/bpf_arena_strsearch.h:4:
>    tools/testing/selftests/bpf/bpf_arena_common.h:47:15:
>      error: conflicting types for 'bpf_arena_alloc_pages'
>        void __arena* bpf_arena_alloc_pages(void *map,
>                                            void __arena *addr,
>                                            __u32 page_cnt,
>                                            __u32 node_id,
>                                            __u64 flags);
>
>    tools/testing/selftests/bpf/tools/include/vmlinux.h:135295:14:
>      note: previous declaration is here
>        extern void *bpf_arena_alloc_pages(void *p__map,
>                                           void *addr__ign,
>                                           u32 page_cnt,
>                                           int node_id,
>                                           u64 flags) __weak __ksym;
>
>    tools/testing/selftests/bpf/bpf_arena_common.h:49:5:
>      error: conflicting types for 'bpf_arena_reserve_pages'
>        int bpf_arena_reserve_pages(void *map,
>                                    void __arena *addr,
>                                    __u32 page_cnt) __ksym __weak;
>
>    tools/testing/selftests/bpf/tools/include/vmlinux.h:135297:12:
>      note: previous declaration is here
>        extern int bpf_arena_reserve_pages(void *p__map,
>                                           void *ptr__ign,
>                                           u32 page_cnt) __weak __ksym;
>
>    tools/testing/selftests/bpf/bpf_arena_common.h:50:6:
>      error: conflicting types for 'bpf_arena_free_pages'
>        void bpf_arena_free_pages(void *map,
>                                  void __arena *ptr,
>                                  __u32 page_cnt) __ksym __weak;
>
>    tools/testing/selftests/bpf/tools/include/vmlinux.h:135296:13:
>      note: previous declaration is here
>        extern void bpf_arena_free_pages(void *p__map,
>                                         void *ptr__ign,
>                                         u32 page_cnt) __weak __ksym;
>
> Tool chain Versions:
>
> gcc (GCC) 14.2.0
> # clang --version
> clang version 23.0.0git (https://github.com/llvm/llvm-project.git
> bd6bfba3e50343c112a04b639394ab85be17c29b)
>
> # llc --version
> LLVM (http://llvm.org/):
>    LLVM version 23.0.0git
>    Optimized build.
>    Default target: powerpc64le-unknown-linux-gnu
>    Host CPU: (unknown)
>
>    Registered Targets:
>      bpf     - BPF (host endian)
>      bpfeb   - BPF (big endian)
>      bpfel   - BPF (little endian)
>      ppc32   - PowerPC 32
>      ppc32le - PowerPC 32 LE
>      ppc64   - PowerPC 64
>      ppc64le - PowerPC 64 LE
>
> # objcopy --version | head -1
> GNU objcopy version 2.35.2-67.el9_7.1
> # ld --version | head -1
> GNU ld version 2.35.2-67.el9_7.1
>
>
> If you happen to fix this, please add below tag.
>
>
> Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>

The issue is in your setup.
It was brought up a couple times in the past.
Please search archives.


^ permalink raw reply

* [mainline][bpf] build failure: conflicting bpf_arena_* prototypes between vmlinux.h and bpf_arena_common.h
From: Venkat Rao Bagalkote @ 2026-05-04  8:42 UTC (permalink / raw)
  To: Saket Kumar Bhaskar, Hari Bathini, Madhavan Srinivasan, bpf,
	linuxppc-dev
  Cc: LKML

Greetings!!!

I’m seeing a CLANG-BPF build failure in tools/testing/selftests/bpf
related to conflicting prototypes of bpf_arena_* helpers between
vmlinux.h and bpf_arena_common.h, on mainline kernel.


Error log:

   In file included from progs/arena_strsearch.c:12:
   In file included from tools/testing/selftests/bpf/bpf_arena_strsearch.h:4:
   tools/testing/selftests/bpf/bpf_arena_common.h:47:15:
     error: conflicting types for 'bpf_arena_alloc_pages'
       void __arena* bpf_arena_alloc_pages(void *map,
                                           void __arena *addr,
                                           __u32 page_cnt,
                                           __u32 node_id,
                                           __u64 flags);

   tools/testing/selftests/bpf/tools/include/vmlinux.h:135295:14:
     note: previous declaration is here
       extern void *bpf_arena_alloc_pages(void *p__map,
                                          void *addr__ign,
                                          u32 page_cnt,
                                          int node_id,
                                          u64 flags) __weak __ksym;

   tools/testing/selftests/bpf/bpf_arena_common.h:49:5:
     error: conflicting types for 'bpf_arena_reserve_pages'
       int bpf_arena_reserve_pages(void *map,
                                   void __arena *addr,
                                   __u32 page_cnt) __ksym __weak;

   tools/testing/selftests/bpf/tools/include/vmlinux.h:135297:12:
     note: previous declaration is here
       extern int bpf_arena_reserve_pages(void *p__map,
                                          void *ptr__ign,
                                          u32 page_cnt) __weak __ksym;

   tools/testing/selftests/bpf/bpf_arena_common.h:50:6:
     error: conflicting types for 'bpf_arena_free_pages'
       void bpf_arena_free_pages(void *map,
                                 void __arena *ptr,
                                 __u32 page_cnt) __ksym __weak;

   tools/testing/selftests/bpf/tools/include/vmlinux.h:135296:13:
     note: previous declaration is here
       extern void bpf_arena_free_pages(void *p__map,
                                        void *ptr__ign,
                                        u32 page_cnt) __weak __ksym;

Tool chain Versions:

gcc (GCC) 14.2.0
# clang --version
clang version 23.0.0git (https://github.com/llvm/llvm-project.git 
bd6bfba3e50343c112a04b639394ab85be17c29b)

# llc --version
LLVM (http://llvm.org/):
   LLVM version 23.0.0git
   Optimized build.
   Default target: powerpc64le-unknown-linux-gnu
   Host CPU: (unknown)

   Registered Targets:
     bpf     - BPF (host endian)
     bpfeb   - BPF (big endian)
     bpfel   - BPF (little endian)
     ppc32   - PowerPC 32
     ppc32le - PowerPC 32 LE
     ppc64   - PowerPC 64
     ppc64le - PowerPC 64 LE

# objcopy --version | head -1
GNU objcopy version 2.35.2-67.el9_7.1
# ld --version | head -1
GNU ld version 2.35.2-67.el9_7.1


If you happen to fix this, please add the tag below.


Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>


Regards,

Venkat.




^ permalink raw reply

* [PATCH v6 05/14] selftests/mm: fix cgroup task placement and drop memory.current checks in hugetlb_reparenting_test.sh
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

The test currently moves the calling shell ($$) into the target cgroup
before executing write_to_hugetlbfs. This results in the shell and any
intermediate allocations being charged to the cgroup, introducing noise
and nondeterminism in accounting. It also requires moving the shell back
to the root cgroup after execution.

Spawn a helper process that joins the target cgroup and
exec()'s write_to_hugetlbfs. This ensures that only the workload is
accounted to the cgroup and avoids unintended charging from the shell.

The test currently validates both hugetlb usage and memory.current.
However, memory.current includes internal memcg allocations and
per-CPU batched accounting (MEMCG_CHARGE_BATCH), which are not
synchronized and can vary across systems, leading to
non-deterministic results.

Since hugetlb memory is accounted via hugetlb.<size>.current,
memory.current is not a reliable indicator here. Drop memory.current
checks and rely only on hugetlb controller statistics for stable
and accurate validation.

Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests")
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 .../selftests/mm/hugetlb_reparenting_test.sh  | 42 ++++++++-----------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index d724b6e45432..95f517c3bd16 100755
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -105,22 +105,17 @@ function assert_with_retry() {
 }
 
 function assert_state() {
-  local expected_a="$1"
-  local expected_a_hugetlb="$2"
-  local expected_b=""
+  local expected_a_hugetlb="$1"
   local expected_b_hugetlb=""
 
-  if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then
-    expected_b="$3"
-    expected_b_hugetlb="$4"
+  if [ ! -z ${2:-} ]; then
+    expected_b_hugetlb="$2"
   fi
 
-  assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a"
   assert_with_retry \
 	  "$CGROUP_ROOT/a/hugetlb.${MB_DISPLAY}${UNIT}.$usage_file" "$expected_a_hugetlb"
 
-  if [[ -n "$expected_b" && -n "$expected_b_hugetlb" ]]; then
-    assert_with_retry "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b"
+  if [[ -n "$expected_b_hugetlb" ]]; then
     assert_with_retry \
 	  "$CGROUP_ROOT/a/b/hugetlb.${MB_DISPLAY}${UNIT}.$usage_file" "$expected_b_hugetlb"
   fi
@@ -154,18 +149,17 @@ write_hugetlbfs() {
   local size="$3"
 
   if [[ $cgroup2 ]]; then
-    echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs
+    cg_file="$CGROUP_ROOT/$cgroup/cgroup.procs"
   else
     echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems
     echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus
-    echo $$ >"$CGROUP_ROOT/$cgroup/tasks"
-  fi
-  ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o
-  if [[ $cgroup2 ]]; then
-    echo $$ >$CGROUP_ROOT/cgroup.procs
-  else
-    echo $$ >"$CGROUP_ROOT/tasks"
+    cg_file="$CGROUP_ROOT/$cgroup/tasks"
   fi
+
+  # Spawn helper to join cgroup before exec to ensure correct cgroup accounting
+  bash -c 'echo $$ > "$1"; exec ./write_to_hugetlbfs -p "$2" -s "$3" -m 0 -o' _ \
+	  "$cg_file" "$path" "$size" & pid=$!
+  wait "$pid"
   echo
 }
 
@@ -203,21 +197,21 @@ if [[ ! $cgroup2 ]]; then
   write_hugetlbfs a "$MNT"/test $size
 
   echo Assert memory charged correctly for parent use.
-  assert_state 0 $size 0 0
+  assert_state $size 0
 
   write_hugetlbfs a/b "$MNT"/test2 $size
 
   echo Assert memory charged correctly for child use.
-  assert_state 0 $(($size * 2)) 0 $size
+  assert_state $(($size * 2)) $size
 
   rmdir "$CGROUP_ROOT"/a/b
   echo Assert memory reparent correctly.
-  assert_state 0 $(($size * 2))
+  assert_state $(($size * 2))
 
   rm -rf "$MNT"/*
   umount "$MNT"
   echo Assert memory uncharged correctly.
-  assert_state 0 0
+  assert_state 0
 
   cleanup
 fi
@@ -231,16 +225,16 @@ echo write
 write_hugetlbfs a/b "$MNT"/test2 $size
 
 echo Assert memory charged correctly for child only use.
-assert_state 0 $(($size)) 0 $size
+assert_state $(($size)) $size
 
 rmdir "$CGROUP_ROOT"/a/b
 echo Assert memory reparent correctly.
-assert_state 0 $size
+assert_state $size
 
 rm -rf "$MNT"/*
 umount "$MNT"
 echo Assert memory uncharged correctly.
-assert_state 0 0
+assert_state 0
 
 cleanup
 
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 13/14] selftests/mm: move hwpoison setup into run_test() and silence modprobe output for memory-failure category
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

run_vmtests.sh contains special handling to ensure the hwpoison_inject
module is available for the memory-failure tests. This logic was
implemented outside of run_test(), making the setup category-specific
but managed globally.

Move the hwpoison_inject handling into run_test() and restrict it
to the memory-failure category so that:
1. the module is checked and loaded only when memory-failure tests run,
2. the test is skipped if the module or the debugfs interface
(/sys/kernel/debug/hwpoison/) is not available, and
3. the module is unloaded after the test if it was loaded by the script.

This localizes category-specific setup and makes the test flow
consistent with other per-category preparations.

While updating this logic, fix the module availability check.
The script previously used:

	modprobe -R hwpoison_inject

The -R option prints the resolved module name to stdout, causing every
run to print:

	hwpoison_inject

in the test output, even when no action is required, introducing
unnecessary noise.

Replace this with:

	modprobe -n hwpoison_inject

which verifies that the module is loadable without producing output,
keeping the selftest logs clean and consistent.

Also, ensure that skipped tests do not override a previously recorded
failure. A skipped test currently sets exitcode to ksft_skip even if a
prior test has failed, which can mask failures in the final exit status.
Update the logic to only set exitcode to ksft_skip when no failure has
been recorded.

Fixes: ff4ef2fbd101 ("selftests/mm: add memory failure anonymous page test")
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/run_vmtests.sh | 52 ++++++++++++++---------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index d8468451b3a3..eb6f0ae48cc7 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -235,6 +235,7 @@ pretty_name() {
 run_test() {
 	if test_selected ${CATEGORY}; then
 		local skip=0
+		local LOADED_HWPOISON_INJECT_MOD=0
 
 		# On memory constrainted systems some tests can fail to allocate hugepages.
 		# perform some cleanup before the test for a higher success rate.
@@ -250,6 +251,28 @@ run_test() {
 			fi
 		fi
 
+		# Ensure hwpoison_inject is available for memory-failure tests
+		if [ "${CATEGORY}" = "memory-failure" ]; then
+			# Try to load hwpoison_inject if not present.
+			HWPOISON_DIR=/sys/kernel/debug/hwpoison/
+			if [ ! -d "$HWPOISON_DIR" ]; then
+				if ! modprobe -n hwpoison_inject > /dev/null 2>&1; then
+					echo "Module hwpoison_inject not found, skipping..." \
+						| tap_prefix
+					skip=1
+				else
+					modprobe hwpoison_inject > /dev/null 2>&1
+					LOADED_HWPOISON_INJECT_MOD=1
+					if [ ! -d "$HWPOISON_DIR" ]; then
+						echo "hwpoison debugfs interface not present" \
+							| tap_prefix
+						skip=1
+					fi
+				fi
+			fi
+
+		fi
+
 		local test=$(pretty_name "$*")
 		local title="running $*"
 		local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
@@ -261,6 +284,12 @@ run_test() {
 		else
 			local ret=$ksft_skip
 		fi
+
+		# Unload hwpoison_inject if we loaded it
+		if [ "${LOADED_HWPOISON_INJECT_MOD}" = "1" ]; then
+			modprobe -r hwpoison_inject > /dev/null 2>&1
+		fi
+
 		count_total=$(( count_total + 1 ))
 		if [ $ret -eq 0 ]; then
 			count_pass=$(( count_pass + 1 ))
@@ -270,7 +299,9 @@ run_test() {
 			count_skip=$(( count_skip + 1 ))
 			echo "[SKIP]" | tap_prefix
 			echo "ok ${count_total} ${test} # SKIP" | tap_output
-			exitcode=$ksft_skip
+			if [ $exitcode -eq 0 ]; then
+				exitcode=$ksft_skip
+			fi
 		else
 			count_fail=$(( count_fail + 1 ))
 			echo "[FAIL]" | tap_prefix
@@ -531,24 +562,7 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned
 
 CATEGORY="rmap" run_test ./rmap
 
-# Try to load hwpoison_inject if not present.
-HWPOISON_DIR=/sys/kernel/debug/hwpoison/
-if [ ! -d "$HWPOISON_DIR" ]; then
-	if ! modprobe -q -R hwpoison_inject; then
-		echo "Module hwpoison_inject not found, skipping..."
-	else
-		modprobe hwpoison_inject > /dev/null 2>&1
-		LOADED_MOD=1
-	fi
-fi
-
-if [ -d "$HWPOISON_DIR" ]; then
-	CATEGORY="memory-failure" run_test ./memory-failure
-fi
-
-if [ -n "${LOADED_MOD}" ]; then
-	modprobe -r hwpoison_inject > /dev/null 2>&1
-fi
+CATEGORY="memory-failure" run_test ./memory-failure
 
 if [ "${HAVE_HUGEPAGES}" = 1 ]; then
 	echo "$orig_nr_hugepgs" > /proc/sys/vm/nr_hugepages
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 14/14] selftests/mm: clarify alternate unmapping in compaction_test
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

Add a comment explaining that every other entry in the list is
unmapped to intentionally create fragmentation with
locked pages before invoking check_compaction().

Fixes: bd67d5c15cc1 ("Test compaction of mlocked memory")
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/compaction_test.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
index 30209c40b697..e1610e2fbdf6 100644
--- a/tools/testing/selftests/mm/compaction_test.c
+++ b/tools/testing/selftests/mm/compaction_test.c
@@ -261,6 +261,9 @@ int main(int argc, char **argv)
 		mem_fragmentable_MB -= MAP_SIZE_MB;
 	}
 
+	/* Unmap every other entry in the list to create fragmentation with
+	 * locked pages before invoking check_compaction().
+	 */
 	for (entry = list; entry != NULL; entry = entry->next) {
 		munmap(entry->map, MAP_SIZE);
 		if (!entry->next)
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 11/14] selftests/mm: skip uffd-wp-mremap if UFFD write-protect is unsupported
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

The uffd-wp-mremap test requires the UFFD_FEATURE_PAGEFAULT_FLAG_WP
capability. On systems where userfaultfd write-protect is
not supported, uffd_register() fails and the test reports failures.

Check for the required feature at startup and skip the test when the
UFFD_FEATURE_PAGEFAULT_FLAG_WP capability is not present,
preventing false failures on unsupported configurations.

Before patch:
 running ./uffd-wp-mremap
 ------------------------
  [INFO] detected THP size: 256 KiB
  [INFO] detected THP size: 512 KiB
  [INFO] detected THP size: 1024 KiB
  [INFO] detected THP size: 2048 KiB
  [INFO] detected hugetlb page size: 2048 KiB
  [INFO] detected hugetlb page size: 1048576 KiB
 1..24
  [RUN] test_one_folio(size=65536, private=false, swapout=false,
  hugetlb=false)
 not ok 1 uffd_register() failed
  [RUN] test_one_folio(size=65536, private=true, swapout=false,
  hugetlb=false)
 not ok 2 uffd_register() failed
  [RUN] test_one_folio(size=65536, private=false, swapout=true,
  hugetlb=false)
 not ok 3 uffd_register() failed
  [RUN] test_one_folio(size=65536, private=true, swapout=true,
  hugetlb=false)
 not ok 4 uffd_register() failed
  [RUN] test_one_folio(size=262144, private=false, swapout=false,
  hugetlb=false)
 not ok 5 uffd_register() failed
  [RUN] test_one_folio(size=524288, private=false, swapout=false,
  hugetlb=false)
 not ok 6 uffd_register() failed
 .
 .
 .
 Bail out! 24 out of 24 tests failed
  Totals: pass:0 fail:24 xfail:0 xpass:0 skip:0 error:0
 [FAIL]
not ok 1 uffd-wp-mremap # exit=1

After patch:
 running ./uffd-wp-mremap
 ------------------------
 1..0 # SKIP uffd-wp feature not supported
 [SKIP]
ok 1 uffd-wp-mremap # SKIP

Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/uffd-wp-mremap.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c
index 17186d4a4147..8f288484d5f5 100644
--- a/tools/testing/selftests/mm/uffd-wp-mremap.c
+++ b/tools/testing/selftests/mm/uffd-wp-mremap.c
@@ -19,6 +19,17 @@ static size_t thpsizes[20];
 static int nr_hugetlbsizes;
 static size_t hugetlbsizes[10];
 
+static void check_uffd_wp_feature_supported(void)
+{
+	uint64_t features = 0;
+
+	if (uffd_get_features(&features))
+		ksft_exit_skip("failed to get available features (%d)\n", errno);
+
+	if (!(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP))
+		ksft_exit_skip("uffd-wp feature not supported\n");
+}
+
 static int detect_thp_sizes(size_t sizes[], int max)
 {
 	int count = 0;
@@ -336,6 +347,8 @@ int main(int argc, char **argv)
 	struct thp_settings settings;
 	int i, j, plan = 0;
 
+	check_uffd_wp_feature_supported();
+
 	pagesize = getpagesize();
 	nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
 	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 12/14] selftests/mm: skip uffd-stress test when nr_pages_per_cpu is zero
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

uffd-stress currently fails when the computed nr_pages_per_cpu
evaluates to zero:

nr_pages_per_cpu = bytes / page_size / nr_parallel

This can occur on systems with large hugepage sizes (e.g. 1GB) and a
high number of CPUs, where the total allocated memory is sufficient
overall but not enough to provide at least one page per CPU.

In such cases, the failure is due to insufficient test resources
rather than incorrect kernel behaviour. Update the test
to treat this condition as a test skip instead of reporting an error.

Fixes: db0f1c138f18 ("selftests/mm: print some details when uffd-stress gets bad params")
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/uffd-stress.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
index 700fbaa18d44..b8f22ea859a6 100644
--- a/tools/testing/selftests/mm/uffd-stress.c
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -491,9 +491,9 @@ int main(int argc, char **argv)
 
 	gopts->nr_pages_per_cpu = bytes / gopts->page_size / gopts->nr_parallel;
 	if (!gopts->nr_pages_per_cpu) {
-		_err("pages_per_cpu = 0, cannot test (%lu / %lu / %lu)",
-			bytes, gopts->page_size, gopts->nr_parallel);
-		usage();
+		ksft_print_msg("pages_per_cpu = 0, cannot test (%lu / %lu / %lu)\n",
+			       bytes, gopts->page_size, gopts->nr_parallel);
+		return KSFT_SKIP;
 	}
 
 	bounces = atoi(argv[3]);
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 10/14] selftests/mm: ensure destination is hugetlb-backed in hugepage-mremap
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

The hugepage-mremap selftest reserves the destination address using an
anonymous base-page mapping before calling mremap() with MREMAP_FIXED,
while the source region is hugetlb-backed.

Remapping a hugetlb mapping into a base-page VMA may fail with:

    mremap: Device or resource busy

This is observed on powerpc hash MMU systems where slice constraints
and page size incompatibilities prevent the remap.

Ensure the destination region is created using MAP_HUGETLB so that both
source and destination VMAs are hugetlb-backed and compatible.

Update the FLAGS macro to include MAP_HUGETLB | MAP_SHARED
so that both mappings are hugetlb-backed and compatible.
Also use the macro for the mmap() calls to avoid repeating
the flag combination.

This ensures the test reliably exercises hugetlb mremap instead of
failing due to VMA type mismatch.

Fixes: 12b613206474 ("mm, hugepages: add hugetlb vma mremap() test")
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/hugepage-mremap.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
index 68c35d1d8a5f..a442d01908cc 100644
--- a/tools/testing/selftests/mm/hugepage-mremap.c
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
@@ -31,7 +31,7 @@
 #define MB_TO_BYTES(x) (x * 1024 * 1024)
 
 #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
-#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
+#define FLAGS (MAP_HUGETLB | MAP_SHARED)
 #define ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1))
 
 static void check_bytes(char *addr)
@@ -131,23 +131,20 @@ int main(int argc, char *argv[])
 
 	/* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
 	unsigned long suggested_addr = 0x7eaa40000000;
-	void *haddr = mmap((void *)suggested_addr, length, PROTECTION,
-			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+	void *haddr = mmap((void *)suggested_addr, length, PROTECTION, FLAGS, fd, 0);
 	ksft_print_msg("Map haddr: Returned address is %p\n", haddr);
 	if (haddr == MAP_FAILED)
 		ksft_exit_fail_msg("mmap1: %s\n", strerror(errno));
 
 	/* mmap again to a dummy address to hopefully trigger pmd sharing. */
 	suggested_addr = 0x7daa40000000;
-	void *daddr = mmap((void *)suggested_addr, length, PROTECTION,
-			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+	void *daddr = mmap((void *)suggested_addr, length, PROTECTION, FLAGS, fd, 0);
 	ksft_print_msg("Map daddr: Returned address is %p\n", daddr);
 	if (daddr == MAP_FAILED)
 		ksft_exit_fail_msg("mmap3: %s\n", strerror(errno));
 
 	suggested_addr = 0x7faa40000000;
-	void *vaddr =
-		mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0);
+	void *vaddr = mmap((void *)suggested_addr, length, PROTECTION, FLAGS, fd, 0);
 	ksft_print_msg("Map vaddr: Returned address is %p\n", vaddr);
 	if (vaddr == MAP_FAILED)
 		ksft_exit_fail_msg("mmap2: %s\n", strerror(errno));
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 09/14] selftest/mm: register existing mapping with userfaultfd in hugepage-mremap
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

Previously, register_region_with_uffd() created a new anonymous
mapping and overwrote the address supplied by the caller before
registering the range with userfaultfd.

As a result, userfaultfd was applied to an unrelated anonymous mapping
instead of the hugetlb region used by the test.

Remove the extra mmap() and register the caller-provided address range
directly using UFFDIO_REGISTER_MODE_MISSING, so that faults are
generated for the hugetlb mapping used by the test.

This ensures userfaultfd operates on the actual hugetlb test region and
validates the expected fault handling.

Before patch:
 running ./hugepage-mremap
 -------------------------
 TAP version 13
 1..1
  Map haddr: Returned address is 0x7eaa40000000
  Map daddr: Returned address is 0x7daa40000000
  Map vaddr: Returned address is 0x7faa40000000
  Address returned by mmap() = 0x7fff9d000000
  Mremap: Returned address is 0x7faa40000000
  First hex is 0
  First hex is 3020100
 ok 1 Read same data
 Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0
 [PASS]
 ok 1 hugepage-mremap

After patch:
 running ./hugepage-mremap
 -------------------------
 TAP version 13
 1..1
  Map haddr: Returned address is 0x7eaa40000000
  Map daddr: Returned address is 0x7daa40000000
  Map vaddr: Returned address is 0x7faa40000000
  Registered memory at address 0x7eaa40000000 with userfaultfd
  Mremap: Returned address is 0x7faa40000000
  First hex is 0
  First hex is 3020100
 ok 1 Read same data
 Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0
 [PASS]
 ok 1 hugepage-mremap

Fixes: 12b613206474 ("mm, hugepages: add hugetlb vma mremap() test")
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/hugepage-mremap.c | 21 +++++---------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
index f66e4d806477..68c35d1d8a5f 100644
--- a/tools/testing/selftests/mm/hugepage-mremap.c
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
@@ -86,25 +86,14 @@ static void register_region_with_uffd(char *addr, size_t len)
 	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
 		ksft_exit_fail_msg("ioctl-UFFDIO_API: %s\n", strerror(errno));
 
-	/* Create a private anonymous mapping. The memory will be
-	 * demand-zero paged--that is, not yet allocated. When we
-	 * actually touch the memory, it will be allocated via
-	 * the userfaultfd.
-	 */
-
-	addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
-
-	ksft_print_msg("Address returned by mmap() = %p\n", addr);
-
-	/* Register the memory range of the mapping we just created for
-	 * handling by the userfaultfd object. In mode, we request to track
-	 * missing pages (i.e., pages that have not yet been faulted in).
+	/* Register the passed memory range for handling by the userfaultfd object.
+	 * In mode, we request to track missing pages
+	 * (i.e., pages that have not yet been faulted in).
 	 */
 	if (uffd_register(uffd, addr, len, true, false, false))
 		ksft_exit_fail_msg("ioctl-UFFDIO_REGISTER: %s\n", strerror(errno));
+
+	ksft_print_msg("Registered memory at address %p with userfaultfd\n", addr);
 }
 
 int main(int argc, char *argv[])
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 08/14] selftest/mm: align memory size to huge page size in hugepage-mremap test
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

The hugepage-mremap selftest uses a default mapping size of 10MB
when no argument is provided. This size is not guaranteed to be
aligned to the system hugepage size, which can cause munmap() to fail
and mremap() to succeed where a failure is expected.

Align the mapping length to the runtime hugepage size using
default_huge_page_size() to ensure the mapping is properly
aligned. Also handle the case where
default_huge_page_size() returns 0 by skipping the test.

Before patch:
  running ./hugepage-mremap
  ------------------------------
  TAP version 13
  1..1
  Map haddr: Returned address is 0x7eaa40000000
  Map daddr: Returned address is 0x7daa40000000
  Map vaddr: Returned address is 0x7faa40000000
  Address returned by mmap() = 0x7fffaa600000
  Mremap: Returned address is 0x7faa40000000
  First hex is 0
  First hex is 3020100
  Bail out! mremap: Expected failure, but call succeeded
  Planned tests != run tests (1 != 0)
  Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
  [FAIL]
not ok 1 hugepage-mremap # exit=1

After patch:
  running ./hugepage-mremap
  -------------------------
  TAP version 13
  1..1
  Map haddr: Returned address is 0x7eaa40000000
  Map daddr: Returned address is 0x7daa40000000
  Map vaddr: Returned address is 0x7faa40000000
  Address returned by mmap() = 0x7fff13000000
  Mremap: Returned address is 0x7faa40000000
  First hex is 0
  First hex is 3020100
  ok 1 Read same data
  Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0
  [PASS]
ok 1 hugepage-mremap

Fixes: f77a286de48c ("mm, hugepages: make memory size variable in hugepage-mremap selftest")
Reviewed-by: Zi Yan <ziy@nvidia.com>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/hugepage-mremap.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
index b8f7d92e5a35..f66e4d806477 100644
--- a/tools/testing/selftests/mm/hugepage-mremap.c
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
@@ -32,6 +32,7 @@
 
 #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
 #define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
+#define ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1))
 
 static void check_bytes(char *addr)
 {
@@ -110,6 +111,7 @@ int main(int argc, char *argv[])
 {
 	size_t length = 0;
 	int ret = 0, fd;
+	size_t hpage_size;
 
 	ksft_print_header();
 	ksft_set_plan(1);
@@ -126,6 +128,14 @@ int main(int argc, char *argv[])
 		length = DEFAULT_LENGTH_MB;
 
 	length = MB_TO_BYTES(length);
+
+	hpage_size = default_huge_page_size();
+	if (!hpage_size)
+		ksft_exit_skip("Unable to determine huge page size\n");
+
+	/* Ensure length is hugepage aligned */
+	length = ALIGN(length, hpage_size);
+
 	fd = memfd_create(argv[0], MFD_HUGETLB);
 	if (fd < 0)
 		ksft_exit_fail_msg("Open failed: %s\n", strerror(errno));
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 07/14] selftests/mm: free dynamically allocated PMD-sized buffers in split_huge_page_test
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

Dynamically allocated buffers of PMD size for file-backed
THP operations (file_buf1 and file_buf2) were not freed on
the success path and some failure paths. Since the
function is called repeatedly in a loop for each split order,
this can cause significant memory leaks.

On architectures with large PMD sizes, repeated leaks
could exhaust system memory and trigger the OOM killer
during test execution.

Ensure all allocated buffers are freed to maintain
stable repeated test runs.

Fixes: 035a112e5fd5 ("selftests/mm: make file-backed THP split work by writing PMD size data")
Reviewed-by: Zi Yan <ziy@nvidia.com>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 .../selftests/mm/split_huge_page_test.c       | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 02938f38b880..ee256afecd8f 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -473,12 +473,15 @@ static void split_file_backed_thp(int order)
 	unsigned long size = 2 * pmd_pagesize;
 	char opts[64];
 	ssize_t num_written, num_read;
-	char *file_buf1, *file_buf2;
+	char *file_buf1 = NULL, *file_buf2 = NULL;
 	uint64_t pgoff_start = 0, pgoff_end = 1024;
 	int i;
 
 	ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n");
 
+	if (!tmpfs_loc)
+		ksft_exit_fail_msg("mkdtemp failed\n");
+
 	file_buf1 = (char *)malloc(pmd_pagesize);
 	file_buf2 = (char *)malloc(pmd_pagesize);
 
@@ -494,8 +497,10 @@ static void split_file_backed_thp(int order)
 	snprintf(opts, sizeof(opts), "huge=always,size=%lu", size);
 	status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, opts);
 
-	if (status)
-		ksft_exit_fail_msg("Unable to create a tmpfs for testing\n");
+	if (status) {
+		ksft_print_msg("Unable to create a tmpfs for testing\n");
+		goto out;
+	}
 
 	status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
 	if (status >= INPUT_MAX) {
@@ -547,10 +552,13 @@ static void split_file_backed_thp(int order)
 
 	status = umount(tmpfs_loc);
 	if (status) {
-		rmdir(tmpfs_loc);
-		ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc);
+		ksft_print_msg("Unable to umount %s\n", tmpfs_loc);
+		goto out;
 	}
 
+	free(file_buf1);
+	free(file_buf2);
+
 	status = rmdir(tmpfs_loc);
 	if (status)
 		ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno));
@@ -563,8 +571,10 @@ static void split_file_backed_thp(int order)
 	close(fd);
 cleanup:
 	umount(tmpfs_loc);
-	rmdir(tmpfs_loc);
 out:
+	free(file_buf1);
+	free(file_buf2);
+	rmdir(tmpfs_loc);
 	ksft_exit_fail_msg("Error occurred\n");
 }
 
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 06/14] selftests/mm: size tmpfs according to PMD page size in split_huge_page_test
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

The split_file_backed_thp() test mounts a tmpfs with a fixed size of
"4m". This works on systems with smaller PMD page sizes,
but fails on configurations where the PMD huge page size is
larger (e.g. 16MB).

On such systems, the fixed 4MB tmpfs is insufficient to allocate even
a single PMD-sized THP, causing the test to fail.

Fix this by sizing the tmpfs dynamically based on the runtime
pmd_pagesize, allocating space for two PMD-sized pages.

Before patch:
  running ./split_huge_page_test /tmp/xfs_dir_YTrI5E
  --------------------------------------------------
  TAP version 13
  1..55
  ok 1 Split zero filled huge pages successful
  ok 2 Split huge pages to order 0 successful
  ok 3 Split huge pages to order 2 successful
  ok 4 Split huge pages to order 3 successful
  ok 5 Split huge pages to order 4 successful
  ok 6 Split huge pages to order 5 successful
  ok 7 Split huge pages to order 6 successful
  ok 8 Split huge pages to order 7 successful
  ok 9 Split PTE-mapped huge pages successful
   Please enable pr_debug in split_huge_pages_in_file() for more info.
   Failed to write data to testing file: Success (0)
  Bail out! Error occurred
   Planned tests != run tests (55 != 9)
   Totals: pass:9 fail:0 xfail:0 xpass:0 skip:0 error:0
 [FAIL]

After patch:
  running ./split_huge_page_test /tmp/xfs_dir_bMvj6o
  --------------------------------------------------
  TAP version 13
  1..55
  ok 1 Split zero filled huge pages successful
  ok 2 Split huge pages to order 0 successful
  ok 3 Split huge pages to order 2 successful
  ok 4 Split huge pages to order 3 successful
  ok 5 Split huge pages to order 4 successful
  ok 6 Split huge pages to order 5 successful
  ok 7 Split huge pages to order 6 successful
  ok 8 Split huge pages to order 7 successful
  ok 9 Split PTE-mapped huge pages successful
   Please enable pr_debug in split_huge_pages_in_file() for more info.
   Please check dmesg for more information
  ok 10 File-backed THP split to order 0 test done
   Please enable pr_debug in split_huge_pages_in_file() for more info.
   Please check dmesg for more information
  ok 11 File-backed THP split to order 1 test done
   Please enable pr_debug in split_huge_pages_in_file() for more info.
   Please check dmesg for more information
  ok 12 File-backed THP split to order 2 test done
...
  ok 55 Split PMD-mapped pagecache folio to order 7 at
    in-folio offset 128 passed
   Totals: pass:55 fail:0 xfail:0 xpass:0 skip:0 error:0
   [PASS]
ok 1 split_huge_page_test /tmp/xfs_dir_bMvj6o

Fixes: fbe37501b252 ("mm: huge_memory: debugfs for file-backed THP split")
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: David Hildenbrand (Arm) <david@kernel.org>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/split_huge_page_test.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 500d07c4938b..02938f38b880 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -470,6 +470,8 @@ static void split_file_backed_thp(int order)
 	char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
 	const char *tmpfs_loc = mkdtemp(tmpfs_template);
 	char testfile[INPUT_MAX];
+	unsigned long size = 2 * pmd_pagesize;
+	char opts[64];
 	ssize_t num_written, num_read;
 	char *file_buf1, *file_buf2;
 	uint64_t pgoff_start = 0, pgoff_end = 1024;
@@ -489,7 +491,8 @@ static void split_file_backed_thp(int order)
 		file_buf1[i] = (char)i;
 	memset(file_buf2, 0, pmd_pagesize);
 
-	status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
+	snprintf(opts, sizeof(opts), "huge=always,size=%lu", size);
+	status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, opts);
 
 	if (status)
 		ksft_exit_fail_msg("Unable to create a tmpfs for testing\n");
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 04/14] selftests/mm: fix hugetlb pathname construction in hugetlb_reparenting_test.sh
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

The hugetlb_reparenting_test.sh script constructs hugetlb cgroup
memory interface file names based on the configured huge page size. The
script formats the size only in MB units, which causes mismatches on
systems using larger huge pages where the kernel exposes normalized
units (e.g. "1GB" instead of "1024MB").

As a result, the test fails to locate the corresponding cgroup files
when 1GB huge pages are configured.

Update the script to detect the huge page size and select the
appropriate unit (MB or GB) so that the constructed paths match the
kernel's hugetlb controller naming.

Also print an explicit "FAIL" message when a test failure occurs to
improve result visibility.

Fixes: e487a5d513cb ("selftest/mm: make hugetlb_reparenting_test tolerant to async reparenting")
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 .../selftests/mm/hugetlb_reparenting_test.sh       | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index 11f914831146..d724b6e45432 100755
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -48,6 +48,13 @@ function get_machine_hugepage_size() {
 }
 
 MB=$(get_machine_hugepage_size)
+if (( MB >= 1024 )); then
+  UNIT="GB"
+  MB_DISPLAY=$((MB / 1024))
+else
+  UNIT="MB"
+  MB_DISPLAY=$MB
+fi
 
 function cleanup() {
   echo cleanup
@@ -88,6 +95,7 @@ function assert_with_retry() {
     if [[ $elapsed -ge $timeout ]]; then
       echo "actual = $((${actual%% *} / 1024 / 1024)) MB"
       echo "expected = $((${expected%% *} / 1024 / 1024)) MB"
+      echo FAIL
       cleanup
       exit 1
     fi
@@ -108,11 +116,13 @@ function assert_state() {
   fi
 
   assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a"
-  assert_with_retry "$CGROUP_ROOT/a/hugetlb.${MB}MB.$usage_file" "$expected_a_hugetlb"
+  assert_with_retry \
+	  "$CGROUP_ROOT/a/hugetlb.${MB_DISPLAY}${UNIT}.$usage_file" "$expected_a_hugetlb"
 
   if [[ -n "$expected_b" && -n "$expected_b_hugetlb" ]]; then
     assert_with_retry "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b"
-    assert_with_retry "$CGROUP_ROOT/a/b/hugetlb.${MB}MB.$usage_file" "$expected_b_hugetlb"
+    assert_with_retry \
+	  "$CGROUP_ROOT/a/b/hugetlb.${MB_DISPLAY}${UNIT}.$usage_file" "$expected_b_hugetlb"
   fi
 }
 
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 02/14] selftests/mm: fix hugetlb pathname construction in charge_reserved_hugetlb.sh
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

The charge_reserved_hugetlb.sh script assumes hugetlb cgroup memory
interface file names use the "<size>MB" format
(e.g. hugetlb.1024MB.current).
This assumption breaks on systems with larger huge pages such as 1GB,
where the kernel exposes normalized units:
    hugetlb.1GB.current
    hugetlb.1GB.max
    hugetlb.1GB.rsvd.max
    ...

As a result, the script attempts to access files like
hugetlb.1024MB.current, which do not exist when the kernel reports the
size in GB.

Normalize the huge page size and construct the pathname using the
appropriate unit (MB or GB), matching the hugetlb controller naming.

Fixes: 209376ed2a84 ("selftests/vm: make charge_reserved_hugetlb.sh work with existing cgroup setting")
Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests")
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 .../selftests/mm/charge_reserved_hugetlb.sh   | 42 +++++++++++++------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index e1945901fd20..a1cfd3a349db 100755
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -94,6 +94,15 @@ function get_machine_hugepage_size() {
 }
 
 MB=$(get_machine_hugepage_size)
+if (( MB >= 1024 )); then
+        # For 1GB hugepages
+        UNIT="GB"
+        MB_DISPLAY=$((MB / 1024))
+else
+        # For 2MB hugepages
+        UNIT="MB"
+        MB_DISPLAY=$MB
+fi
 
 function setup_cgroup() {
   local name="$1"
@@ -103,11 +112,12 @@ function setup_cgroup() {
   mkdir $cgroup_path/$name
 
   echo writing cgroup limit: "$cgroup_limit"
-  echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file
+  echo "$cgroup_limit" > \
+	  $cgroup_path/$name/hugetlb.${MB_DISPLAY}${UNIT}.$fault_limit_file
 
   echo writing reservation limit: "$reservation_limit"
   echo "$reservation_limit" > \
-    $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file
+    $cgroup_path/$name/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_limit_file
 
   if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then
     echo 0 >$cgroup_path/$name/cpuset.cpus
@@ -142,7 +152,7 @@ function wait_for_file_value() {
 
 function wait_for_hugetlb_memory_to_get_depleted() {
   local cgroup="$1"
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file"
 
   wait_for_file_value "$path" "0"
 }
@@ -150,7 +160,7 @@ function wait_for_hugetlb_memory_to_get_depleted() {
 function wait_for_hugetlb_memory_to_get_reserved() {
   local cgroup="$1"
   local size="$2"
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file"
 
   wait_for_file_value "$path" "$size"
 }
@@ -158,7 +168,7 @@ function wait_for_hugetlb_memory_to_get_reserved() {
 function wait_for_hugetlb_memory_to_get_written() {
   local cgroup="$1"
   local size="$2"
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file"
 
   wait_for_file_value "$path" "$size"
 }
@@ -180,8 +190,8 @@ function write_hugetlbfs_and_get_usage() {
   hugetlb_difference=0
   reserved_difference=0
 
-  local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file
-  local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file
+  local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file
+  local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file
 
   local hugetlb_before=$(cat $hugetlb_usage)
   local reserved_before=$(cat $reserved_usage)
@@ -312,8 +322,10 @@ function run_test() {
 
   cleanup_hugetlb_memory "hugetlb_cgroup_test"
 
-  local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file)
-  local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file)
+  local final_hugetlb=$(cat \
+	 $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file)
+  local final_reservation=$(cat \
+	  $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file)
 
   echo $hugetlb_difference
   echo $reserved_difference
@@ -369,10 +381,14 @@ function run_multiple_cgroup_test() {
   reservation_failed1=$reservation_failed
   oom_killed1=$oom_killed
 
-  local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file
-  local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file
-  local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file
-  local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file
+  local cgroup1_hugetlb_usage=\
+	  $cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file
+  local cgroup1_reservation_usage=\
+	  $cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file
+  local cgroup2_hugetlb_usage=\
+	  $cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file
+  local cgroup2_reservation_usage=\
+	  $cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file
 
   local usage_before_second_write=$(cat $cgroup1_hugetlb_usage)
   local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage)
-- 
2.52.0



^ permalink raw reply related

* [PATCH v6 01/14] selftests/mm: restore default nr_hugepages value via exit trap in charge_reserved_hugetlb.sh
From: Sayali Patil @ 2026-05-04  8:24 UTC (permalink / raw)
  To: Andrew Morton, Shuah Khan, linux-mm, linux-kernel,
	linux-kselftest, Ritesh Harjani
  Cc: David Hildenbrand, Zi Yan, Michal Hocko, Oscar Salvador,
	Lorenzo Stoakes, Dev Jain, Liam.Howlett, linuxppc-dev, Miaohe Lin,
	Venkat Rao Bagalkote, Sayali Patil
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

cleanup() resets nr_hugepages to 0 on every invocation, while the test
reconfigures it again in the next iteration. This leads to repeated
allocation and freeing of large numbers of hugepages, especially when
the original value is high.

Additionally, with set -e, failures in earlier cleanup steps (e.g.,
rmdir or umount returning EBUSY while background activity is still
ongoing) can cause the script to exit before restoring the original
value, leaving the system in a modified state.

Introduce a trap on EXIT, INT, and TERM to restore the original
nr_hugepages value once at script termination. This avoids
unnecessary allocation churn and ensures the original value
is reliably restored on all exit paths.

Fixes: 7d695b1c3695b ("selftests/mm: save and restore nr_hugepages value")
Acked-by: Zi Yan <ziy@nvidia.com>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index 44f4e703deb9..e1945901fd20 100755
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -17,6 +17,7 @@ if ! command -v killall >/dev/null 2>&1; then
 fi
 
 nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+trap 'echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages' EXIT INT TERM
 
 fault_limit_file=limit_in_bytes
 reservation_limit_file=rsvd.limit_in_bytes
@@ -70,7 +71,6 @@ function cleanup() {
   if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then
     rmdir $cgroup_path/hugetlb_cgroup_test2
   fi
-  echo 0 >/proc/sys/vm/nr_hugepages
   echo CLEANUP DONE
 }
 
@@ -599,4 +599,3 @@ if [[ $do_umount ]]; then
   rmdir $cgroup_path
 fi
 
-echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
-- 
2.52.0



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox