* [PATCH v2] mm/damon: replace damon_rand() with a per-ctx lockless PRNG
@ 2026-05-05 14:52 Jiayuan Chen
2026-05-06 16:37 ` SeongJae Park
From: Jiayuan Chen @ 2026-05-05 14:52 UTC
To: damon
Cc: Jiayuan Chen, Jiayuan Chen, SeongJae Park, Andrew Morton,
Shu Anzai, Quanmin Yan, linux-mm, linux-kernel
From: Jiayuan Chen <jiayuan.chen@shopee.com>
damon_rand() on the sampling_addr hot path called
get_random_u32_below(), which takes a local_lock_irqsave() around a
per-CPU batched entropy pool and periodically refills it with
ChaCha20. At elevated nr_regions counts (20k+), the lock_acquire /
local_lock pair plus __get_random_u32_below() dominate kdamond perf
profiles.
Replace the helper with a lockless lfsr113 generator (struct rnd_state)
held per damon_ctx and seeded from get_random_u64() in damon_new_ctx().
kdamond is the single consumer of a given ctx, so no synchronization
is required. Range mapping uses Lemire's (u64)rnd * span >> 32 on the
fast path; for spans larger than U32_MAX (only reachable on 64-bit) the
slow path combines two u32 outputs and uses mul_u64_u64_shr() at 64-bit
width. On 32-bit the slow path is dead code and gets eliminated by
the compiler.
The new helper takes a ctx parameter; damon_split_regions_of() and
the kunit tests that call it directly are updated accordingly.
lfsr113 is a linear PRNG and MUST NOT be used for anything
security-sensitive. DAMON's sampling_addr is not exposed to userspace
and is only consumed as a probe point for PTE accessed-bit sampling,
so a non-cryptographic PRNG is appropriate here.
Tested with paddr monitoring and max_nr_regions=20000: kdamond CPU
usage reduced from ~72% to ~50% of one core.
Link: https://lore.kernel.org/damon/20260426173346.86238-1-sj@kernel.org/T/#m4f1fd74112728f83a41511e394e8c3fef703039c
Cc: Jiayuan Chen <jiayuan.chen@linux.dev>
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
---
include/linux/damon.h | 27 +++++++++++++++++++++------
mm/damon/core.c | 12 ++++++++----
mm/damon/paddr.c | 8 ++++----
mm/damon/tests/core-kunit.h | 28 ++++++++++++++++++++++------
mm/damon/vaddr.c | 7 ++++---
5 files changed, 59 insertions(+), 23 deletions(-)
diff --git a/include/linux/damon.h b/include/linux/damon.h
index f2cdb7c3f5e6..e16012a7f41a 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -8,8 +8,10 @@
#ifndef _DAMON_H_
#define _DAMON_H_
+#include <linux/math64.h>
#include <linux/memcontrol.h>
#include <linux/mutex.h>
+#include <linux/prandom.h>
#include <linux/time64.h>
#include <linux/types.h>
#include <linux/random.h>
@@ -19,12 +21,6 @@
/* Max priority score for DAMON-based operation schemes */
#define DAMOS_MAX_SCORE (99)
-/* Get a random number in [l, r) */
-static inline unsigned long damon_rand(unsigned long l, unsigned long r)
-{
- return l + get_random_u32_below(r - l);
-}
-
/**
* struct damon_addr_range - Represents an address region of [@start, @end).
* @start: Start address of the region (inclusive).
@@ -843,8 +839,27 @@ struct damon_ctx {
struct list_head adaptive_targets;
struct list_head schemes;
+
+ /* Per-ctx PRNG state for damon_rand(); kdamond is the sole consumer. */
+ struct rnd_state rnd_state;
};
+/* Get a random number in [@l, @r) using @ctx's lockless PRNG. */
+static inline unsigned long damon_rand(struct damon_ctx *ctx,
+ unsigned long l, unsigned long r)
+{
+ unsigned long span = r - l;
+ u64 rnd;
+
+ if (span <= U32_MAX) {
+ rnd = prandom_u32_state(&ctx->rnd_state);
+ return l + (unsigned long)((rnd * span) >> 32);
+ }
+ rnd = ((u64)prandom_u32_state(&ctx->rnd_state) << 32) |
+ prandom_u32_state(&ctx->rnd_state);
+ return l + mul_u64_u64_shr(rnd, span, 64);
+}
+
static inline struct damon_region *damon_next_region(struct damon_region *r)
{
return container_of(r->list.next, struct damon_region, list);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 3dbbbfdeff71..cbbb35a72b18 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -607,6 +607,8 @@ struct damon_ctx *damon_new_ctx(void)
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
+ prandom_seed_state(&ctx->rnd_state, get_random_u64());
+
return ctx;
}
@@ -2715,8 +2717,9 @@ static void damon_split_region_at(struct damon_target *t,
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_target *t, int nr_subs,
- unsigned long min_region_sz)
+static void damon_split_regions_of(struct damon_ctx *ctx,
+ struct damon_target *t, int nr_subs,
+ unsigned long min_region_sz)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
@@ -2731,7 +2734,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs,
* Randomly select size of left sub-region to be at
* least 10 percent and at most 90% of original region
*/
- sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
+ sz_sub = ALIGN_DOWN(damon_rand(ctx, 1, 10) *
sz_region / 10, min_region_sz);
/* Do not allow blank region */
if (sz_sub == 0 || sz_sub >= sz_region)
@@ -2772,7 +2775,8 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(t, nr_subregions, ctx->min_region_sz);
+ damon_split_regions_of(ctx, t, nr_subregions,
+ ctx->min_region_sz);
last_nr_regions = nr_regions;
}
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 5cdcc5037cbc..c4738cd5e221 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -49,11 +49,11 @@ static void damon_pa_mkold(phys_addr_t paddr)
}
static void __damon_pa_prepare_access_check(struct damon_region *r,
- unsigned long addr_unit)
+ struct damon_ctx *ctx)
{
- r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
+ r->sampling_addr = damon_rand(ctx, r->ar.start, r->ar.end);
- damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit));
+ damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, ctx->addr_unit));
}
static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
@@ -63,7 +63,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- __damon_pa_prepare_access_check(r, ctx->addr_unit);
+ __damon_pa_prepare_access_check(r, ctx);
}
}
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 9e5904c2beeb..ee0a58eeb25c 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -273,54 +273,70 @@ static void damon_test_merge_regions_of(struct kunit *test)
static void damon_test_split_regions_of(struct kunit *test)
{
+ struct damon_ctx *c;
struct damon_target *t;
struct damon_region *r;
unsigned long sa[] = {0, 300, 500};
unsigned long ea[] = {220, 400, 700};
int i;
+ c = damon_new_ctx();
+ if (!c)
+ kunit_skip(test, "ctx alloc fail");
+
t = damon_new_target();
- if (!t)
+ if (!t) {
+ damon_destroy_ctx(c);
kunit_skip(test, "target alloc fail");
+ }
r = damon_new_region(0, 22);
if (!r) {
damon_free_target(t);
+ damon_destroy_ctx(c);
kunit_skip(test, "region alloc fail");
}
damon_add_region(r, t);
- damon_split_regions_of(t, 2, 1);
+ damon_split_regions_of(c, t, 2, 1);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
- if (!t)
+ if (!t) {
+ damon_destroy_ctx(c);
kunit_skip(test, "second target alloc fail");
+ }
r = damon_new_region(0, 220);
if (!r) {
damon_free_target(t);
+ damon_destroy_ctx(c);
kunit_skip(test, "second region alloc fail");
}
damon_add_region(r, t);
- damon_split_regions_of(t, 4, 1);
+ damon_split_regions_of(c, t, 4, 1);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
t = damon_new_target();
- if (!t)
+ if (!t) {
+ damon_destroy_ctx(c);
kunit_skip(test, "third target alloc fail");
+ }
for (i = 0; i < ARRAY_SIZE(sa); i++) {
r = damon_new_region(sa[i], ea[i]);
if (!r) {
damon_free_target(t);
+ damon_destroy_ctx(c);
kunit_skip(test, "region alloc fail");
}
damon_add_region(r, t);
}
- damon_split_regions_of(t, 4, 5);
+ damon_split_regions_of(c, t, 4, 5);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 12u);
damon_for_each_region(r, t)
KUNIT_EXPECT_GE(test, damon_sz_region(r) % 5ul, 0ul);
damon_free_target(t);
+
+ damon_destroy_ctx(c);
}
static void damon_test_ops_registration(struct kunit *test)
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b069dbc7e3d2..2e936f04de3d 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -333,9 +333,10 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
*/
static void __damon_va_prepare_access_check(struct mm_struct *mm,
- struct damon_region *r)
+ struct damon_region *r,
+ struct damon_ctx *ctx)
{
- r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
+ r->sampling_addr = damon_rand(ctx, r->ar.start, r->ar.end);
damon_va_mkold(mm, r->sampling_addr);
}
@@ -351,7 +352,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
if (!mm)
continue;
damon_for_each_region(r, t)
- __damon_va_prepare_access_check(mm, r);
+ __damon_va_prepare_access_check(mm, r, ctx);
mmput(mm);
}
}
--
2.43.0
* Re: [PATCH v2] mm/damon: replace damon_rand() with a per-ctx lockless PRNG
2026-05-05 14:52 [PATCH v2] mm/damon: replace damon_rand() with a per-ctx lockless PRNG Jiayuan Chen
@ 2026-05-06 16:37 ` SeongJae Park
2026-05-06 23:41 ` Jiayuan Chen
From: SeongJae Park @ 2026-05-06 16:37 UTC
To: Jiayuan Chen
Cc: SeongJae Park, damon, Jiayuan Chen, Andrew Morton, Shu Anzai,
Quanmin Yan, linux-mm, linux-kernel
Hello Jiayuan,
Thank you for posting this second version!
On Tue, 5 May 2026 22:52:06 +0800 Jiayuan Chen <jiayuan.chen@linux.dev> wrote:
> From: Jiayuan Chen <jiayuan.chen@shopee.com>
>
> damon_rand() on the sampling_addr hot path called
> get_random_u32_below(), which takes a local_lock_irqsave() around a
> per-CPU batched entropy pool and periodically refills it with
> ChaCha20. At elevated nr_regions counts (20k+), the lock_acquire /
> local_lock pair plus __get_random_u32_below() dominate kdamond perf
> profiles.
>
> Replace the helper with a lockless lfsr113 generator (struct rnd_state)
> held per damon_ctx and seeded from get_random_u64() in damon_new_ctx().
> kdamond is the single consumer of a given ctx, so no synchronization
> is required. Range mapping uses Lemire's (u64)rnd * span >> 32 on the
> fast path;
Could we add some links for the algorithm? I found a blog [1] via Google and
I'm guessing that is what you're describing, but I'm not really sure since I
haven't found time to thoroughly read it.
> for spans larger than U32_MAX (only reachable on 64-bit) the
> slow path combines two u32 outputs and uses mul_u64_u64_shr() at 64-bit
> width. On 32-bit the slow path is dead code and gets eliminated by
> the compiler.
>
> The new helper takes a ctx parameter; damon_split_regions_of() and
> the kunit tests that call it directly are updated accordingly.
>
> lfsr113 is a linear PRNG and MUST NOT be used for anything
> security-sensitive. DAMON's sampling_addr is not exposed to userspace
> and is only consumed as a probe point for PTE accessed-bit sampling,
> so a non-cryptographic PRNG is appropriate here.
>
> Tested with paddr monitoring and max_nr_regions=20000: kdamond CPU
> usage reduced from ~72% to ~50% of one core.
>
> Link: https://lore.kernel.org/damon/20260426173346.86238-1-sj@kernel.org/T/#m4f1fd74112728f83a41511e394e8c3fef703039c
Why are you adding the above link? It would be good to add a description of
the link in the commit message.
Other than that, commit message looks good to me.
> Cc: Jiayuan Chen <jiayuan.chen@linux.dev>
> Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
> ---
> include/linux/damon.h | 27 +++++++++++++++++++++------
> mm/damon/core.c | 12 ++++++++----
> mm/damon/paddr.c | 8 ++++----
> mm/damon/tests/core-kunit.h | 28 ++++++++++++++++++++++------
> mm/damon/vaddr.c | 7 ++++---
> 5 files changed, 59 insertions(+), 23 deletions(-)
>
> diff --git a/include/linux/damon.h b/include/linux/damon.h
> index f2cdb7c3f5e6..e16012a7f41a 100644
> --- a/include/linux/damon.h
> +++ b/include/linux/damon.h
> @@ -8,8 +8,10 @@
> #ifndef _DAMON_H_
> #define _DAMON_H_
>
> +#include <linux/math64.h>
> #include <linux/memcontrol.h>
> #include <linux/mutex.h>
> +#include <linux/prandom.h>
> #include <linux/time64.h>
> #include <linux/types.h>
> #include <linux/random.h>
Maybe we can remove random.h?
> @@ -19,12 +21,6 @@
> /* Max priority score for DAMON-based operation schemes */
> #define DAMOS_MAX_SCORE (99)
>
> -/* Get a random number in [l, r) */
> -static inline unsigned long damon_rand(unsigned long l, unsigned long r)
> -{
> - return l + get_random_u32_below(r - l);
> -}
> -
> /**
> * struct damon_addr_range - Represents an address region of [@start, @end).
> * @start: Start address of the region (inclusive).
> @@ -843,8 +839,27 @@ struct damon_ctx {
>
> struct list_head adaptive_targets;
> struct list_head schemes;
> +
> + /* Per-ctx PRNG state for damon_rand(); kdamond is the sole consumer. */
> + struct rnd_state rnd_state;
> };
>
> +/* Get a random number in [@l, @r) using @ctx's lockless PRNG. */
> +static inline unsigned long damon_rand(struct damon_ctx *ctx,
> + unsigned long l, unsigned long r)
> +{
> + unsigned long span = r - l;
> + u64 rnd;
> +
> + if (span <= U32_MAX) {
> + rnd = prandom_u32_state(&ctx->rnd_state);
> + return l + (unsigned long)((rnd * span) >> 32);
> + }
> + rnd = ((u64)prandom_u32_state(&ctx->rnd_state) << 32) |
> + prandom_u32_state(&ctx->rnd_state);
> + return l + mul_u64_u64_shr(rnd, span, 64);
> +}
> +
I was unable to find time to thoroughly read the link I found, or to
understand the algorithm from the code alone, so I tested it myself as below.
'''
$ cat ./foo.c
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdint.h>

int main(int argc, char *argv[])
{
	unsigned long span;
	int i;

	if (argc != 2) {
		printf("Usage: %s <span>\n", argv[0]);
		return -1;
	}
	span = atoi(argv[1]);

	for (i = 0; i < 1000; i++) {
		uint64_t rnd = rand();

		printf("%lu\n", (uint64_t)((rnd * span) >> 32));
	}
	return 0;
}
$ gcc ./foo.c
$ ./a.out 10 | sort -n | uniq --count
195 0
194 1
188 2
223 3
200 4
'''
So I expected the program to generate numbers 0-9 randomly in similar
proportions, but it generates only numbers 0-4. I confirmed that doubling the
'span' value makes it work as I expected. Should we do that here, too?
[...]
Other than the above, the code changes look good to me.
[1] https://lemire.me/blog/2024/08/17/faster-random-integer-generation-with-batching/
Thanks,
SJ
* Re: [PATCH v2] mm/damon: replace damon_rand() with a per-ctx lockless PRNG
2026-05-06 16:37 ` SeongJae Park
@ 2026-05-06 23:41 ` Jiayuan Chen
2026-05-07 8:10 ` SeongJae Park
From: Jiayuan Chen @ 2026-05-06 23:41 UTC
To: SeongJae Park
Cc: damon, Jiayuan Chen, Andrew Morton, Shu Anzai, Quanmin Yan,
linux-mm, linux-kernel
Hello SJ,
On 5/7/26 12:37 AM, SeongJae Park wrote:
> Hello Jiayuan,
>
>
> Thank you for posting this second version!
>
> On Tue, 5 May 2026 22:52:06 +0800 Jiayuan Chen <jiayuan.chen@linux.dev> wrote:
>
>> From: Jiayuan Chen <jiayuan.chen@shopee.com>
>>
>> damon_rand() on the sampling_addr hot path called
>> get_random_u32_below(), which takes a local_lock_irqsave() around a
>> per-CPU batched entropy pool and periodically refills it with
>> ChaCha20. At elevated nr_regions counts (20k+), the lock_acquire /
>> local_lock pair plus __get_random_u32_below() dominate kdamond perf
>> profiles.
>>
>> Replace the helper with a lockless lfsr113 generator (struct rnd_state)
>> held per damon_ctx and seeded from get_random_u64() in damon_new_ctx().
>> kdamond is the single consumer of a given ctx, so no synchronization
>> is required. Range mapping uses Lemire's (u64)rnd * span >> 32 on the
>> fast path;
> Could we add some links for the algorithm? I found a blog [1] via Google and
> I'm guessing that is what you're describing, but I'm not really sure since I
> haven't found time to thoroughly read it.
The link covers a related family of techniques, but I don't
think we need an external reference. The pattern is just the
multiply-shift fast path that the kernel already uses in
__get_random_u32_below() (drivers/char/random.c) -- (u64)rnd *
span >> 32 -- which the existing comment there calls "traditional
reciprocal multiplication". Of course I can add simple comment like
"traditional reciprocal multiplication, similar as get_random_u32_below()"
>> for spans larger than U32_MAX (only reachable on 64-bit) the
>> slow path combines two u32 outputs and uses mul_u64_u64_shr() at 64-bit
>> width. On 32-bit the slow path is dead code and gets eliminated by
>> the compiler.
>>
>> The new helper takes a ctx parameter; damon_split_regions_of() and
>> the kunit tests that call it directly are updated accordingly.
>>
>> lfsr113 is a linear PRNG and MUST NOT be used for anything
>> security-sensitive. DAMON's sampling_addr is not exposed to userspace
>> and is only consumed as a probe point for PTE accessed-bit sampling,
>> so a non-cryptographic PRNG is appropriate here.
>>
>> Tested with paddr monitoring and max_nr_regions=20000: kdamond CPU
>> usage reduced from ~72% to ~50% of one core.
>>
>> Link: https://lore.kernel.org/damon/20260426173346.86238-1-sj@kernel.org/T/#m4f1fd74112728f83a41511e394e8c3fef703039c
> Why are you adding the above link? It would be good to add a description of
> the link in the commit message.
>
> Other than that, commit message looks good to me.
Will move it to the patch changelog (after ---) instead of the
commit message
>> Cc: Jiayuan Chen <jiayuan.chen@linux.dev>
>> Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
>> ---
>> include/linux/damon.h | 27 +++++++++++++++++++++------
>> mm/damon/core.c | 12 ++++++++----
>> mm/damon/paddr.c | 8 ++++----
>> mm/damon/tests/core-kunit.h | 28 ++++++++++++++++++++++------
>> mm/damon/vaddr.c | 7 ++++---
>> 5 files changed, 59 insertions(+), 23 deletions(-)
>>
>> diff --git a/include/linux/damon.h b/include/linux/damon.h
>> index f2cdb7c3f5e6..e16012a7f41a 100644
>> --- a/include/linux/damon.h
>> +++ b/include/linux/damon.h
>> @@ -8,8 +8,10 @@
>> #ifndef _DAMON_H_
>> #define _DAMON_H_
>>
>> +#include <linux/math64.h>
>> #include <linux/memcontrol.h>
>> #include <linux/mutex.h>
>> +#include <linux/prandom.h>
>> #include <linux/time64.h>
>> #include <linux/types.h>
>> #include <linux/random.h>
> Maybe we can remove random.h?
Right!
>> @@ -19,12 +21,6 @@
>> /* Max priority score for DAMON-based operation schemes */
>> #define DAMOS_MAX_SCORE (99)
>>
>> -/* Get a random number in [l, r) */
>> -static inline unsigned long damon_rand(unsigned long l, unsigned long r)
>> -{
>> - return l + get_random_u32_below(r - l);
>> -}
>> -
>> /**
>> * struct damon_addr_range - Represents an address region of [@start, @end).
>> * @start: Start address of the region (inclusive).
>> @@ -843,8 +839,27 @@ struct damon_ctx {
>>
>> struct list_head adaptive_targets;
>> struct list_head schemes;
>> +
>> + /* Per-ctx PRNG state for damon_rand(); kdamond is the sole consumer. */
>> + struct rnd_state rnd_state;
>> };
>>
>> +/* Get a random number in [@l, @r) using @ctx's lockless PRNG. */
>> +static inline unsigned long damon_rand(struct damon_ctx *ctx,
>> + unsigned long l, unsigned long r)
>> +{
>> + unsigned long span = r - l;
>> + u64 rnd;
>> +
>> + if (span <= U32_MAX) {
>> + rnd = prandom_u32_state(&ctx->rnd_state);
>> + return l + (unsigned long)((rnd * span) >> 32);
>> + }
>> + rnd = ((u64)prandom_u32_state(&ctx->rnd_state) << 32) |
>> + prandom_u32_state(&ctx->rnd_state);
>> + return l + mul_u64_u64_shr(rnd, span, 64);
>> +}
>> +
> I was unable to find time to thoroughly read the link I found, or to
> understand the algorithm from the code alone, so I tested it myself as below.
>
> '''
> $ cat ./foo.c
> #include <stdio.h>
> #include <stdbool.h>
> #include <stdlib.h>
> #include <stdint.h>
>
> int main(int argc, char *argv[])
> {
> unsigned long span;
> int i;
>
> if (argc != 2) {
> printf("Usge: %s <span>\n", argv[0]);
> return -1;
> }
> span = atoi(argv[1]);
>
> for (i = 0; i < 1000; i++) {
> uint64_t rnd = rand();
>
> printf("%lu\n", (uint64_t)((rnd * span) >> 32));
> }
> return 0;
> }
> $ gcc ./foo.c
> $ ./a.out 10 | sort -n | uniq --count
> 195 0
> 194 1
> 188 2
> 223 3
> 200 4
> '''
>
> So I expected the program to generate numbers 0-9 randomly in similar
> proportions, but it generates only numbers 0-4. I confirmed that doubling
> the 'span' value makes it work as I expected. Should we do that here, too?
I wrote a kmod to verify. The helper is a verbatim copy of
damon_rand() from the patch:
#include <linux/module.h>
#include <linux/prandom.h>
#include <linux/random.h>
#include <linux/math64.h>

#define SPAN		10UL	/* matches the dmesg output below */
#define ITERATIONS	1000UL

static inline unsigned long damon_rand_test(struct rnd_state *state,
					    unsigned long l, unsigned long r)
{
	unsigned long span = r - l;
	u64 rnd;

	if (span <= U32_MAX) {
		rnd = prandom_u32_state(state);
		return l + (unsigned long)((rnd * span) >> 32);
	}
	rnd = ((u64)prandom_u32_state(state) << 32) |
	      prandom_u32_state(state);
	return l + mul_u64_u64_shr(rnd, span, 64);
}

static int __init damon_rand_test_init(void)
{
	struct rnd_state state;
	unsigned long counts[SPAN] = {0};
	unsigned long i, v;
	int j;

	prandom_seed_state(&state, get_random_u64());

	for (i = 0; i < ITERATIONS; i++) {
		v = damon_rand_test(&state, 0, SPAN);
		if (v >= SPAN) {
			pr_err("out of range: %lu\n", v);
			return -EINVAL;
		}
		counts[v]++;
	}

	pr_info("damon_rand_test: span=%lu iterations=%lu\n",
		SPAN, ITERATIONS);
	for (j = 0; j < SPAN; j++)
		pr_info("damon_rand_test: %d -> %lu\n", j, counts[j]);

	return 0;
}

module_init(damon_rand_test_init);
MODULE_LICENSE("GPL");
dmesg output (span=10, 1000 iterations):
damon_rand_test: span=10 iterations=1000
damon_rand_test: 0 -> 99
damon_rand_test: 1 -> 107
damon_rand_test: 2 -> 79
damon_rand_test: 3 -> 94
damon_rand_test: 4 -> 97
damon_rand_test: 5 -> 97
damon_rand_test: 6 -> 94
damon_rand_test: 7 -> 100
damon_rand_test: 8 -> 114
damon_rand_test: 9 -> 119
All 10 buckets get hits, roughly uniform.
Looking into the userspace test program, the issue is that libc rand()
returns a value in [0, RAND_MAX]. On glibc RAND_MAX is
2^31 - 1 (not 2^32 - 1), so the upper bit is missing and
the multiply-shift only fills the lower half of the output
range. prandom_u32_state() returns a full u32 in [0, 2^32),
so the in-kernel formula behaves correctly without doubling.
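FWIW, a minimally fixed variant of the userspace test (just a sketch of one
option: splice two rand() calls into a full 32-bit value; any full-width
32-bit source would do) should spread across all ten buckets:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
	unsigned long span = 10;
	int i;

	srand(42);
	for (i = 0; i < 1000; i++) {
		/* glibc rand() yields only 31 bits; splice two calls
		 * into a full 32-bit value before the multiply-shift */
		uint64_t rnd = ((uint64_t)(rand() & 0xffff) << 16) |
			       (rand() & 0xffff);

		printf("%lu\n", (uint64_t)((rnd * span) >> 32));
	}
	return 0;
}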
> [...]
>
> Other than the above, the code changes look good to me.
>
> [1] https://lemire.me/blog/2024/08/17/faster-random-integer-generation-with-batching/
>
>
> Thanks,
> SJ
* Re: [PATCH v2] mm/damon: replace damon_rand() with a per-ctx lockless PRNG
2026-05-06 23:41 ` Jiayuan Chen
@ 2026-05-07 8:10 ` SeongJae Park
From: SeongJae Park @ 2026-05-07 8:10 UTC
To: Jiayuan Chen
Cc: SeongJae Park, damon, Jiayuan Chen, Andrew Morton, Shu Anzai,
Quanmin Yan, linux-mm, linux-kernel
On Thu, 7 May 2026 07:41:11 +0800 Jiayuan Chen <jiayuan.chen@linux.dev> wrote:
> Hello SJ,
>
>
> On 5/7/26 12:37 AM, SeongJae Park wrote:
> > Hello Jiayuan,
> >
> >
> > Thank you for posting this second version!
> >
> > On Tue, 5 May 2026 22:52:06 +0800 Jiayuan Chen <jiayuan.chen@linux.dev> wrote:
[...]
> >> is required. Range mapping uses Lemire's (u64)rnd * span >> 32 on the
> >> fast path;
> > Could we add some links for the algorithm? I found a blog [1] via Google and
> > I'm guessing that is what you're describing, but I'm not really sure since I
> > haven't found time to thoroughly read it.
>
>
>
> The link covers a related family of techniques, but I don't
> think we need an external reference. The pattern is just the
> multiply-shift fast path that the kernel already uses in
> __get_random_u32_below() (drivers/char/random.c) -- (u64)rnd *
> span >> 32 -- which the existing comment there calls "traditional
> reciprocal multiplication". Of course I can add simple comment like
>
> "traditional reciprocal multiplication, similar as get_random_u32_below()"
Makes sense, thank you for enlightening me! And yes, your suggestion looks
very good to me!
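For the record, the wording I have in mind is along these lines (only a
sketch; the exact comment is of course up to you in the next revision):

/*
 * Get a random number in [@l, @r) using @ctx's lockless PRNG.  Range
 * mapping uses traditional reciprocal multiplication (rnd * span >> 32),
 * as in get_random_u32_below().
 */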
[...]
> >> Link: https://lore.kernel.org/damon/20260426173346.86238-1-sj@kernel.org/T/#m4f1fd74112728f83a41511e394e8c3fef703039c
> > Why are you adding the above link? It would be good to add a description of
> > the link in the commit message.
> >
> > Other than that, commit message looks good to me.
>
> Will move it to the patch changelog (after ---) instead of the
> commit message
Oh, so the intention was the patch changelog. Yes, let's add a formal
changelog in the commentary area [1] from next time.
[...]
> >> +#include <linux/math64.h>
> >> #include <linux/memcontrol.h>
> >> #include <linux/mutex.h>
> >> +#include <linux/prandom.h>
> >> #include <linux/time64.h>
> >> #include <linux/types.h>
> >> #include <linux/random.h>
> > Maybe we can remove random.h?
>
> Right!
Thank you for confirming!
[...]
> >> +/* Get a random number in [@l, @r) using @ctx's lockless PRNG. */
> >> +static inline unsigned long damon_rand(struct damon_ctx *ctx,
> >> + unsigned long l, unsigned long r)
> >> +{
> >> + unsigned long span = r - l;
> >> + u64 rnd;
> >> +
> >> + if (span <= U32_MAX) {
> >> + rnd = prandom_u32_state(&ctx->rnd_state);
> >> + return l + (unsigned long)((rnd * span) >> 32);
> >> + }
> >> + rnd = ((u64)prandom_u32_state(&ctx->rnd_state) << 32) |
> >> + prandom_u32_state(&ctx->rnd_state);
> >> + return l + mul_u64_u64_shr(rnd, span, 64);
> >> +}
> >> +
> > I was unable to find time to thoroughly read the link I found, or to
> > understand the algorithm from the code alone, so I tested it myself as below.
> >
> > '''
> > $ cat ./foo.c
> > #include <stdio.h>
> > #include <stdbool.h>
> > #include <stdlib.h>
> > #include <stdint.h>
> >
> > int main(int argc, char *argv[])
> > {
> > unsigned long span;
> > int i;
> >
> > if (argc != 2) {
> > printf("Usge: %s <span>\n", argv[0]);
> > return -1;
> > }
> > span = atoi(argv[1]);
> >
> > for (i = 0; i < 1000; i++) {
> > uint64_t rnd = rand();
> >
> > printf("%lu\n", (uint64_t)((rnd * span) >> 32));
> > }
> > return 0;
> > }
> > $ gcc ./foo.c
> > $ ./a.out 10 | sort -n | uniq --count
> > 195 0
> > 194 1
> > 188 2
> > 223 3
> > 200 4
> > '''
> >
> > So I expected the program to generate numbers 0-9 randomly in similar
> > proportions, but it generates only numbers 0-4. I confirmed that doubling
> > the 'span' value makes it work as I expected. Should we do that here, too?
>
>
>
> I wrote a kmod to verify. The helper is a verbatim copy of
> damon_rand() from the patch:
>
> static inline unsigned long damon_rand_test(struct rnd_state *state,
> unsigned long l, unsigned long r)
> {
> unsigned long span = r - l;
> u64 rnd;
>
> if (span <= U32_MAX) {
> rnd = prandom_u32_state(state);
> return l + (unsigned long)((rnd * span) >> 32);
> }
> rnd = ((u64)prandom_u32_state(state) << 32) |
> prandom_u32_state(state);
> return l + mul_u64_u64_shr(rnd, span, 64);
> }
>
> static int __init damon_rand_test_init(void)
> {
> struct rnd_state state;
> unsigned long counts[SPAN] = {0};
> unsigned long i, v;
> int j;
>
> prandom_seed_state(&state, get_random_u64());
>
> for (i = 0; i < ITERATIONS; i++) {
> v = damon_rand_test(&state, 0, SPAN);
> if (v >= SPAN) {
> pr_err("out of range: %lu\n", v);
> return -EINVAL;
> }
> counts[v]++;
> }
>
> pr_info("damon_rand_test: span=%lu iterations=%lu\n",
> SPAN, ITERATIONS);
> for (j = 0; j < SPAN; j++)
> pr_info("damon_rand_test: %d -> %lu\n", j, counts[j]);
>
> return 0;
> }
>
> dmesg output (span=10, 1000 iterations):
>
> damon_rand_test: span=10 iterations=1000
> damon_rand_test: 0 -> 99
> damon_rand_test: 1 -> 107
> damon_rand_test: 2 -> 79
> damon_rand_test: 3 -> 94
> damon_rand_test: 4 -> 97
> damon_rand_test: 5 -> 97
> damon_rand_test: 6 -> 94
> damon_rand_test: 7 -> 100
> damon_rand_test: 8 -> 114
> damon_rand_test: 9 -> 119
>
> All 10 buckets get hits, roughly uniform.
>
> Looking into the userspace test program, the issue is that libc rand()
> returns a value in [0, RAND_MAX]. On glibc RAND_MAX is
> 2^31 - 1 (not 2^32 - 1), so the upper bit is missing and
> the multiply-shift only fills the lower half of the output
> range. prandom_u32_state() returns a full u32 in [0, 2^32),
> so the in-kernel formula behaves correctly without doubling.
Thank you for kindly checking all these details and sharing them with me. It
all makes sense to me.
Reviewed-by: SeongJae Park <sj@kernel.org>
I added this patch to my damon/next tree [2] with small modifications,
including removal of the random.h include, a simple update of the
range-mapping description, and addition of my Reviewed-by: tag.
I will repost my version for mm.git inclusion soon (maybe within 1-2 days)
unless Andrew picks this into mm.git or you send a new version before then.
The reposting plan is only a fallback in case someone drops the ball, so
please feel free to take action before my repost if you want and don't mind.
[1] https://docs.kernel.org/process/submitting-patches.html#commentary
[2] https://git.kernel.org/pub/scm/linux/kernel/git/sj/damon-hack.git/tree/patches/next/mm-damon-replace-damon_rand-with-a-per-ctx-lockless-.patch
Thanks,
SJ