* [PATCH 1/7] mm/migrate: rename PAGE_ migration flags to FOLIO_
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
@ 2026-04-28 15:50 ` Shivank Garg
2026-04-30 9:07 ` Huang, Ying
2026-04-28 15:50 ` [PATCH 2/7] mm/migrate: use migrate_info field instead of private Shivank Garg
` (8 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Shivank Garg @ 2026-04-28 15:50 UTC (permalink / raw)
To: akpm, david
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm, Shivank Garg, Baolin Wang, Lance Yang
These flags only track folio-specific state during migration and are
not used for movable_ops pages. Rename the enum values and the
old_page_state variable to match.
No functional change.
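For reference, the packing in __migrate_folio_record() works because
struct anon_vma is slab-allocated and at least word-aligned, so the two
low bits of the pointer stashed in dst->private are always zero and can
carry the two flags. A minimal sketch of the round trip (illustration
only, mirroring the helpers in the diff below):

    /* record: fold the state bits into the aligned pointer */
    dst->private = (void *)anon_vma + old_folio_state;

    /* extract: the low bits are the state, the rest is the pointer */
    private = (unsigned long)dst->private;
    anon_vma = (struct anon_vma *)(private & ~FOLIO_OLD_STATES);
    old_folio_state = private & FOLIO_OLD_STATES;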
Suggested-by: David Hildenbrand <david@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
mm/migrate.c | 48 +++++++++++++++++++++++-------------------------
1 file changed, 23 insertions(+), 25 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 8a64291ab5b4..0c6a0ab6ecce 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1135,26 +1135,24 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
* This is safe because nobody is using it except us.
*/
enum {
- PAGE_WAS_MAPPED = BIT(0),
- PAGE_WAS_MLOCKED = BIT(1),
- PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
+ FOLIO_WAS_MAPPED = BIT(0),
+ FOLIO_WAS_MLOCKED = BIT(1),
+ FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED,
};
static void __migrate_folio_record(struct folio *dst,
- int old_page_state,
- struct anon_vma *anon_vma)
+ int old_folio_state, struct anon_vma *anon_vma)
{
- dst->private = (void *)anon_vma + old_page_state;
+ dst->private = (void *)anon_vma + old_folio_state;
}
static void __migrate_folio_extract(struct folio *dst,
- int *old_page_state,
- struct anon_vma **anon_vmap)
+ int *old_folio_state, struct anon_vma **anon_vmap)
{
unsigned long private = (unsigned long)dst->private;
- *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
- *old_page_state = private & PAGE_OLD_STATES;
+ *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES);
+ *old_folio_state = private & FOLIO_OLD_STATES;
dst->private = NULL;
}
@@ -1209,7 +1207,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
{
struct folio *dst;
int rc = -EAGAIN;
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
bool locked = false;
bool dst_locked = false;
@@ -1253,7 +1251,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
}
locked = true;
if (folio_test_mlocked(src))
- old_page_state |= PAGE_WAS_MLOCKED;
+ old_folio_state |= FOLIO_WAS_MLOCKED;
if (folio_test_writeback(src)) {
/*
@@ -1302,7 +1300,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
dst_locked = true;
if (unlikely(page_has_movable_ops(&src->page))) {
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return 0;
}
@@ -1328,11 +1326,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
!folio_test_ksm(src) && !anon_vma, src);
try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
- old_page_state |= PAGE_WAS_MAPPED;
+ old_folio_state |= FOLIO_WAS_MAPPED;
}
if (!folio_mapped(src)) {
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return 0;
}
@@ -1344,7 +1342,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (rc == -EAGAIN)
ret = NULL;
- migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, locked, ret);
migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
@@ -1358,13 +1356,13 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
struct list_head *ret)
{
int rc;
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
bool src_deferred_split = false;
bool src_partially_mapped = false;
struct list_head *prev;
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+ __migrate_folio_extract(dst, &old_folio_state, &anon_vma);
prev = dst->lru.prev;
list_del(&dst->lru);
@@ -1404,10 +1402,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
* isolated from the unevictable LRU: but this case is the easiest.
*/
folio_add_lru(dst);
- if (old_page_state & PAGE_WAS_MLOCKED)
+ if (old_folio_state & FOLIO_WAS_MLOCKED)
lru_add_drain();
- if (old_page_state & PAGE_WAS_MAPPED)
+ if (old_folio_state & FOLIO_WAS_MAPPED)
remove_migration_ptes(src, dst, 0);
out_unlock_both:
@@ -1439,11 +1437,11 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
*/
if (rc == -EAGAIN) {
list_add(&dst->lru, prev);
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return rc;
}
- migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, true, ret);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
@@ -1777,11 +1775,11 @@ static void migrate_folios_undo(struct list_head *src_folios,
dst = list_first_entry(dst_folios, struct folio, lru);
dst2 = list_next_entry(dst, lru);
list_for_each_entry_safe(folio, folio2, src_folios, lru) {
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
- migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+ __migrate_folio_extract(dst, &old_folio_state, &anon_vma);
+ migrate_folio_undo_src(folio, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, true, ret_folios);
list_del(&dst->lru);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
--
2.43.0
* Re: [PATCH 1/7] mm/migrate: rename PAGE_ migration flags to FOLIO_
2026-04-28 15:50 ` [PATCH 1/7] mm/migrate: rename PAGE_ migration flags to FOLIO_ Shivank Garg
@ 2026-04-30 9:07 ` Huang, Ying
0 siblings, 0 replies; 20+ messages in thread
From: Huang, Ying @ 2026-04-30 9:07 UTC (permalink / raw)
To: Shivank Garg
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm, Baolin Wang, Lance Yang
Shivank Garg <shivankg@amd.com> writes:
> These flags only track folio-specific state during migration and are
> not used for movable_ops pages. Rename the enum values and the
> old_page_state variable to match.
>
> No functional change.
>
> Suggested-by: David Hildenbrand <david@kernel.org>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Signed-off-by: Shivank Garg <shivankg@amd.com>
LGTM, thanks! Feel free to add my
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
in future versions.
[snip]
---
Best Regards,
Huang, Ying
* [PATCH 2/7] mm/migrate: use migrate_info field instead of private
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
2026-04-28 15:50 ` [PATCH 1/7] mm/migrate: rename PAGE_ migration flags to FOLIO_ Shivank Garg
@ 2026-04-28 15:50 ` Shivank Garg
2026-05-07 9:43 ` Huang, Ying
2026-04-28 15:50 ` [PATCH 3/7] mm/migrate: skip data copy for already-copied folios Shivank Garg
` (7 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Shivank Garg @ 2026-04-28 15:50 UTC (permalink / raw)
To: akpm, david
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm, Shivank Garg
Add an unsigned long migrate_info member to the struct folio union and
use it to store migration state (anon_vma pointer and FOLIO_WAS_*
flags) instead of using folio->private.
No functional change.
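Since migrate_info aliases the existing private member in the same
union, the storage and bit layout are unchanged; only the accessor and
the packing arithmetic change, e.g. (from the hunk below):

    dst->migrate_info = (unsigned long)anon_vma | old_folio_state;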
Suggested-by: David Hildenbrand <david@kernel.org>
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
include/linux/mm_types.h | 1 +
mm/migrate.c | 14 +++++++-------
2 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a308e2c23b82..f52818dcf4d2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -426,6 +426,7 @@ struct folio {
union {
void *private;
swp_entry_t swap;
+ unsigned long migrate_info;
};
atomic_t _mapcount;
atomic_t _refcount;
diff --git a/mm/migrate.c b/mm/migrate.c
index 0c6a0ab6ecce..03c2a6f7e5e4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1130,7 +1130,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
}
/*
- * To record some information during migration, we use unused private
+ * To record some information during migration, we use the migrate_info
* field of struct folio of the newly allocated destination folio.
* This is safe because nobody is using it except us.
*/
@@ -1143,17 +1143,17 @@ enum {
static void __migrate_folio_record(struct folio *dst,
int old_folio_state, struct anon_vma *anon_vma)
{
- dst->private = (void *)anon_vma + old_folio_state;
+ dst->migrate_info = (unsigned long)anon_vma | old_folio_state;
}
static void __migrate_folio_extract(struct folio *dst,
int *old_folio_state, struct anon_vma **anon_vmap)
{
- unsigned long private = (unsigned long)dst->private;
+ unsigned long info = dst->migrate_info;
- *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES);
- *old_folio_state = private & FOLIO_OLD_STATES;
- dst->private = NULL;
+ *anon_vmap = (struct anon_vma *)(info & ~FOLIO_OLD_STATES);
+ *old_folio_state = info & FOLIO_OLD_STATES;
+ dst->migrate_info = 0;
}
/* Restore the source folio to the original state upon failure */
@@ -1217,7 +1217,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
return -ENOMEM;
*dstp = dst;
- dst->private = NULL;
+ dst->migrate_info = 0;
if (!folio_trylock(src)) {
if (mode == MIGRATE_ASYNC)
--
2.43.0
* Re: [PATCH 2/7] mm/migrate: use migrate_info field instead of private
2026-04-28 15:50 ` [PATCH 2/7] mm/migrate: use migrate_info field instead of private Shivank Garg
@ 2026-05-07 9:43 ` Huang, Ying
0 siblings, 0 replies; 20+ messages in thread
From: Huang, Ying @ 2026-05-07 9:43 UTC (permalink / raw)
To: Shivank Garg
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm
Shivank Garg <shivankg@amd.com> writes:
> Add an unsigned long migrate_info member to the struct folio union and
> use it to store migration state (anon_vma pointer and FOLIO_WAS_*
> flags) instead of using folio->private.
>
> No functional change.
>
> Suggested-by: David Hildenbrand <david@kernel.org>
> Signed-off-by: Shivank Garg <shivankg@amd.com>
> ---
> include/linux/mm_types.h | 1 +
> mm/migrate.c | 14 +++++++-------
> 2 files changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index a308e2c23b82..f52818dcf4d2 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -426,6 +426,7 @@ struct folio {
> union {
> void *private;
> swp_entry_t swap;
> + unsigned long migrate_info;
> };
> atomic_t _mapcount;
> atomic_t _refcount;
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 0c6a0ab6ecce..03c2a6f7e5e4 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1130,7 +1130,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
> }
>
> /*
> - * To record some information during migration, we use unused private
> + * To record some information during migration, we use the migrate_info
> * field of struct folio of the newly allocated destination folio.
> * This is safe because nobody is using it except us.
> */
> @@ -1143,17 +1143,17 @@ enum {
> static void __migrate_folio_record(struct folio *dst,
> int old_folio_state, struct anon_vma *anon_vma)
> {
> - dst->private = (void *)anon_vma + old_folio_state;
> + dst->migrate_info = (unsigned long)anon_vma | old_folio_state;
> }
>
> static void __migrate_folio_extract(struct folio *dst,
> int *old_folio_state, struct anon_vma **anon_vmap)
> {
> - unsigned long private = (unsigned long)dst->private;
> + unsigned long info = dst->migrate_info;
>
> - *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES);
> - *old_folio_state = private & FOLIO_OLD_STATES;
> - dst->private = NULL;
> + *anon_vmap = (struct anon_vma *)(info & ~FOLIO_OLD_STATES);
> + *old_folio_state = info & FOLIO_OLD_STATES;
> + dst->migrate_info = 0;
> }
>
> /* Restore the source folio to the original state upon failure */
> @@ -1217,7 +1217,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
> return -ENOMEM;
> *dstp = dst;
>
> - dst->private = NULL;
> + dst->migrate_info = 0;
>
> if (!folio_trylock(src)) {
> if (mode == MIGRATE_ASYNC)
LGTM! Feel free to add my
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
in future versions.
---
Best Regards,
Huang, Ying
* [PATCH 3/7] mm/migrate: skip data copy for already-copied folios
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
2026-04-28 15:50 ` [PATCH 1/7] mm/migrate: rename PAGE_ migration flags to FOLIO_ Shivank Garg
2026-04-28 15:50 ` [PATCH 2/7] mm/migrate: use migrate_info field instead of private Shivank Garg
@ 2026-04-28 15:50 ` Shivank Garg
2026-04-28 15:50 ` [PATCH 4/7] mm/migrate: add batch-copy path in migrate_pages_batch Shivank Garg
` (6 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Shivank Garg @ 2026-04-28 15:50 UTC (permalink / raw)
To: akpm, david
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm, Shivank Garg
Add a FOLIO_ALREADY_COPIED flag to the dst->migrate_info migration
state. When set, __migrate_folio() skips folio_mc_copy() and
performs metadata-only migration. All callers currently pass
already_copied=false. A subsequent patch enables it in the batch-copy
path.
Move the dst->migrate_info state enum earlier in the file so
__migrate_folio() and move_to_new_folio() can see FOLIO_ALREADY_COPIED.
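The intended lifecycle of the flag, as a rough sketch (the real code
is in the hunks below): the batch path marks dst before the move, and
__migrate_folio() consumes the mark instead of copying again.

    /* move phase, batch path (wired up by a later patch): */
    move_to_new_folio(dst, src, mode, /* already_copied = */ true);
        /* sets dst->migrate_info = FOLIO_ALREADY_COPIED */

    /* ... which __migrate_folio() then consumes: */
    already_copied = (dst->migrate_info & FOLIO_ALREADY_COPIED);
    if (already_copied)
        dst->migrate_info = 0;          /* one-shot: clear the mark */
    if (!already_copied)
        rc = folio_mc_copy(dst, src);   /* metadata-only otherwise */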
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
mm/migrate.c | 53 +++++++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 21 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 03c2a6f7e5e4..c493e67e359d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -850,6 +850,19 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
}
EXPORT_SYMBOL(folio_migrate_flags);
+/*
+ * To record some information during migration, we use the migrate_info
+ * field of struct folio of the newly allocated destination folio.
+ * This is safe because nobody is using it except us.
+ */
+enum {
+ FOLIO_WAS_MAPPED = BIT(0),
+ FOLIO_WAS_MLOCKED = BIT(1),
+ FOLIO_ALREADY_COPIED = BIT(2),
+ FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED |
+ FOLIO_ALREADY_COPIED,
+};
+
/************************************************************
* Migration functions
***********************************************************/
@@ -859,14 +872,20 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
enum migrate_mode mode)
{
int rc, expected_count = folio_expected_ref_count(src) + 1;
+ bool already_copied = (dst->migrate_info & FOLIO_ALREADY_COPIED);
+
+ if (already_copied)
+ dst->migrate_info = 0;
/* Check whether src does not have extra refs before we do more work */
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
- rc = folio_mc_copy(dst, src);
- if (unlikely(rc))
- return rc;
+ if (!already_copied) {
+ rc = folio_mc_copy(dst, src);
+ if (unlikely(rc))
+ return rc;
+ }
rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
if (rc)
@@ -1090,7 +1109,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
* 0 - success
*/
static int move_to_new_folio(struct folio *dst, struct folio *src,
- enum migrate_mode mode)
+ enum migrate_mode mode, bool already_copied)
{
struct address_space *mapping = folio_mapping(src);
int rc = -EAGAIN;
@@ -1098,6 +1117,9 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
+ if (already_copied)
+ dst->migrate_info = FOLIO_ALREADY_COPIED;
+
if (!mapping)
rc = migrate_folio(mapping, dst, src, mode);
else if (mapping_inaccessible(mapping))
@@ -1129,17 +1151,6 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
return rc;
}
-/*
- * To record some information during migration, we use the migrate_info
- * field of struct folio of the newly allocated destination folio.
- * This is safe because nobody is using it except us.
- */
-enum {
- FOLIO_WAS_MAPPED = BIT(0),
- FOLIO_WAS_MLOCKED = BIT(1),
- FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED,
-};
-
static void __migrate_folio_record(struct folio *dst,
int old_folio_state, struct anon_vma *anon_vma)
{
@@ -1353,7 +1364,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
struct folio *src, struct folio *dst,
enum migrate_mode mode, enum migrate_reason reason,
- struct list_head *ret)
+ struct list_head *ret, bool already_copied)
{
int rc;
int old_folio_state = 0;
@@ -1379,7 +1390,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
src_partially_mapped = folio_test_partially_mapped(src);
}
- rc = move_to_new_folio(dst, src, mode);
+ rc = move_to_new_folio(dst, src, mode, already_copied);
if (rc)
goto out;
@@ -1536,7 +1547,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
}
if (!folio_mapped(src))
- rc = move_to_new_folio(dst, src, mode);
+ rc = move_to_new_folio(dst, src, mode, false);
if (page_was_mapped)
remove_migration_ptes(src, !rc ? dst : src, ttu);
@@ -1720,7 +1731,7 @@ static void migrate_folios_move(struct list_head *src_folios,
struct list_head *ret_folios,
struct migrate_pages_stats *stats,
int *retry, int *thp_retry, int *nr_failed,
- int *nr_retry_pages)
+ int *nr_retry_pages, bool already_copied)
{
struct folio *folio, *folio2, *dst, *dst2;
bool is_thp;
@@ -1737,7 +1748,7 @@ static void migrate_folios_move(struct list_head *src_folios,
rc = migrate_folio_move(put_new_folio, private,
folio, dst, mode,
- reason, ret_folios);
+ reason, ret_folios, already_copied);
/*
* The rules are:
* 0: folio will be freed
@@ -1994,7 +2005,7 @@ static int migrate_pages_batch(struct list_head *from,
migrate_folios_move(&unmap_folios, &dst_folios,
put_new_folio, private, mode, reason,
ret_folios, stats, &retry, &thp_retry,
- &nr_failed, &nr_retry_pages);
+ &nr_failed, &nr_retry_pages, false);
}
nr_failed += retry;
stats->nr_thp_failed += thp_retry;
--
2.43.0
* [PATCH 4/7] mm/migrate: add batch-copy path in migrate_pages_batch
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
` (2 preceding siblings ...)
2026-04-28 15:50 ` [PATCH 3/7] mm/migrate: skip data copy for already-copied folios Shivank Garg
@ 2026-04-28 15:50 ` Shivank Garg
2026-04-28 15:50 ` [PATCH 5/7] mm/migrate: add copy offload registration infrastructure Shivank Garg
` (5 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Shivank Garg @ 2026-04-28 15:50 UTC (permalink / raw)
To: akpm, david
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm, Shivank Garg
Add folios_mc_copy(), which walks the src and dst folio lists in
lockstep and copies folio contents via folio_mc_copy(). The folios_cnt
parameter is unused here, but is part of the offload_copy callback
signature used by later patches in the series.
Split unmapped folios into batch-eligible (unmap_batch/dst_batch) and
standard (unmap_single/dst_single) lists, gated by the
migrate_offload_enabled static branch, which is off by default. When
no offload driver is active, the branch is never taken and everything
goes through the standard path.
After the TLB flush, batch-copy the eligible folios via folios_mc_copy()
and pass already_copied=true into migrate_folios_move() so
__migrate_folio() skips the per-folio copy.
On batch copy failure, the already_copied flag stays false and each
folio falls back to an individual copy.
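Putting it together, the move phase after this patch looks roughly
like this sketch (see the migrate_pages_batch() hunk below):

    if (!list_empty(&unmap_batch)) {
        rc = folios_mc_copy(&dst_batch, &unmap_batch, nr_batch);
        batch_copied = (rc == 0);   /* on failure, stays false */
    }
    /* batch list: skip per-folio copy only if the batch copy worked */
    migrate_folios_move(&unmap_batch, &dst_batch, ..., batch_copied);
    /* standard list: always copy per folio */
    migrate_folios_move(&unmap_single, &dst_single, ..., false);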
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
include/linux/mm.h | 2 ++
mm/migrate.c | 61 +++++++++++++++++++++++++++++++++++-----------
mm/util.c | 30 +++++++++++++++++++++++
3 files changed, 79 insertions(+), 14 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b776907152e..e6ab9bc3de8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1917,6 +1917,8 @@ void __folio_put(struct folio *folio);
void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);
int folio_mc_copy(struct folio *dst, struct folio *src);
+int folios_mc_copy(struct list_head *dst_list, struct list_head *src_list,
+ unsigned int __always_unused folios_cnt);
unsigned long nr_free_buffer_pages(void);
diff --git a/mm/migrate.c b/mm/migrate.c
index c493e67e359d..6c2f1cb66f96 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -43,6 +43,7 @@
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/pagewalk.h>
+#include <linux/jump_label.h>
#include <asm/tlbflush.h>
@@ -51,6 +52,8 @@
#include "internal.h"
#include "swap.h"
+DEFINE_STATIC_KEY_FALSE(migrate_offload_enabled);
+
static const struct movable_operations *offline_movable_ops;
static const struct movable_operations *zsmalloc_movable_ops;
@@ -1724,6 +1727,12 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
return nr_failed;
}
+/* movable_ops folios have their own migrate path */
+static bool folio_supports_batch_copy(struct folio *folio)
+{
+ return likely(!page_has_movable_ops(&folio->page));
+}
+
static void migrate_folios_move(struct list_head *src_folios,
struct list_head *dst_folios,
free_folio_t put_new_folio, unsigned long private,
@@ -1752,7 +1761,7 @@ static void migrate_folios_move(struct list_head *src_folios,
/*
* The rules are:
* 0: folio will be freed
- * -EAGAIN: stay on the unmap_folios list
+ * -EAGAIN: stay on the src_folios list
* Other errno: put on ret_folios list
*/
switch (rc) {
@@ -1823,8 +1832,12 @@ static int migrate_pages_batch(struct list_head *from,
bool is_large = false;
struct folio *folio, *folio2, *dst = NULL;
int rc, rc_saved = 0, nr_pages;
- LIST_HEAD(unmap_folios);
- LIST_HEAD(dst_folios);
+ unsigned int nr_batch = 0;
+ bool batch_copied = false;
+ LIST_HEAD(unmap_batch);
+ LIST_HEAD(dst_batch);
+ LIST_HEAD(unmap_single);
+ LIST_HEAD(dst_single);
bool nosplit = (reason == MR_NUMA_MISPLACED);
VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
@@ -1919,8 +1932,8 @@ static int migrate_pages_batch(struct list_head *from,
private, folio, &dst, mode, ret_folios);
/*
* The rules are:
- * 0: folio will be put on unmap_folios list,
- * dst folio put on dst_folios list
+ * 0: folio put on unmap_batch or unmap_single,
+ * dst folio put on dst_batch or dst_single
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1961,7 +1974,7 @@ static int migrate_pages_batch(struct list_head *from,
/* nr_failed isn't updated for not used */
stats->nr_thp_failed += thp_retry;
rc_saved = rc;
- if (list_empty(&unmap_folios))
+ if (list_empty(&unmap_batch) && list_empty(&unmap_single))
goto out;
else
goto move;
@@ -1971,8 +1984,15 @@ static int migrate_pages_batch(struct list_head *from,
nr_retry_pages += nr_pages;
break;
case 0:
- list_move_tail(&folio->lru, &unmap_folios);
- list_add_tail(&dst->lru, &dst_folios);
+ if (static_branch_unlikely(&migrate_offload_enabled) &&
+ folio_supports_batch_copy(folio)) {
+ list_move_tail(&folio->lru, &unmap_batch);
+ list_add_tail(&dst->lru, &dst_batch);
+ nr_batch++;
+ } else {
+ list_move_tail(&folio->lru, &unmap_single);
+ list_add_tail(&dst->lru, &dst_single);
+ }
break;
default:
/*
@@ -1995,17 +2015,28 @@ static int migrate_pages_batch(struct list_head *from,
/* Flush TLBs for all unmapped folios */
try_to_unmap_flush();
+ /* Batch-copy eligible folios before the move phase */
+ if (!list_empty(&unmap_batch)) {
+ rc = folios_mc_copy(&dst_batch, &unmap_batch, nr_batch);
+ batch_copied = (rc == 0);
+ }
+
retry = 1;
for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
thp_retry = 0;
nr_retry_pages = 0;
- /* Move the unmapped folios */
- migrate_folios_move(&unmap_folios, &dst_folios,
- put_new_folio, private, mode, reason,
- ret_folios, stats, &retry, &thp_retry,
- &nr_failed, &nr_retry_pages, false);
+ if (!list_empty(&unmap_batch))
+ migrate_folios_move(&unmap_batch, &dst_batch, put_new_folio,
+ private, mode, reason, ret_folios, stats,
+ &retry, &thp_retry, &nr_failed,
+ &nr_retry_pages, batch_copied);
+ if (!list_empty(&unmap_single))
+ migrate_folios_move(&unmap_single, &dst_single, put_new_folio,
+ private, mode, reason, ret_folios, stats,
+ &retry, &thp_retry, &nr_failed,
+ &nr_retry_pages, false);
}
nr_failed += retry;
stats->nr_thp_failed += thp_retry;
@@ -2014,7 +2045,9 @@ static int migrate_pages_batch(struct list_head *from,
rc = rc_saved ? : nr_failed;
out:
/* Cleanup remaining folios */
- migrate_folios_undo(&unmap_folios, &dst_folios,
+ migrate_folios_undo(&unmap_batch, &dst_batch,
+ put_new_folio, private, ret_folios);
+ migrate_folios_undo(&unmap_single, &dst_single,
put_new_folio, private, ret_folios);
return rc;
diff --git a/mm/util.c b/mm/util.c
index 232c3930a662..77eeb285def1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -778,6 +778,36 @@ int folio_mc_copy(struct folio *dst, struct folio *src)
}
EXPORT_SYMBOL(folio_mc_copy);
+/**
+ * folios_mc_copy - Copy the contents of list of folios.
+ * @dst_list: destination folio list.
+ * @src_list: source folio list.
+ * @folios_cnt: unused here, present for callback signature compatibility.
+ *
+ * Walks the src and dst folio lists in lockstep and copies folio
+ * content via folio_mc_copy(). The caller must ensure both lists have
+ * the same number of entries. This may sleep.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int folios_mc_copy(struct list_head *dst_list, struct list_head *src_list,
+ unsigned int __always_unused folios_cnt)
+{
+ struct folio *src, *dst;
+ int ret;
+
+ dst = list_first_entry(dst_list, struct folio, lru);
+ list_for_each_entry(src, src_list, lru) {
+ ret = folio_mc_copy(dst, src);
+ if (ret)
+ return ret;
+ dst = list_next_entry(dst, lru);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(folios_mc_copy);
+
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
static int sysctl_overcommit_ratio __read_mostly = 50;
static unsigned long sysctl_overcommit_kbytes __read_mostly;
--
2.43.0
* [PATCH 5/7] mm/migrate: add copy offload registration infrastructure
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
` (3 preceding siblings ...)
2026-04-28 15:50 ` [PATCH 4/7] mm/migrate: add batch-copy path in migrate_pages_batch Shivank Garg
@ 2026-04-28 15:50 ` Shivank Garg
2026-04-28 15:50 ` [PATCH 6/7] drivers/migrate_offload: add DMA batch copy driver (dcbm) Shivank Garg
` (4 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Shivank Garg @ 2026-04-28 15:50 UTC (permalink / raw)
To: akpm, david
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm, Shivank Garg, Mike Day
Add a registration interface that lets a single offload provider
(DMA, multi-threaded CPU copy, etc.) take over the batch folio copy
performed by migrate_pages_batch().
The provider fills in a struct migrator with an offload_copy()
callback and calls migrate_offload_register(). Registration patches
the migrate_offload_copy() static_call and flips the
migrate_offload_enabled static branch. migrate_offload_unregister()
reverts both.
Whether a migration reason is batch-copy eligible is decided by the
core in migrate_offload_do_batch(). A migrator may decline a particular
batch (e.g. when nr_batch is too small to amortize setup) by returning
-EOPNOTSUPP, and the move phase falls back to per-folio CPU copy.
Only one migrator can be active at a time. A second registration
returns -EBUSY, and only the active migrator can unregister itself.
The static_call dispatch is protected by SRCU so that the
synchronize_srcu() in unregister waits for all in-flight copies before
the module reference is dropped.
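For illustration, a minimal provider might look like the hypothetical
sketch below; a real driver would program its copy engine instead of
deferring to folios_mc_copy():

    #include <linux/module.h>
    #include <linux/mm.h>
    #include <linux/migrate_copy_offload.h>

    static int example_offload_copy(struct list_head *dst_list,
                                    struct list_head *src_list,
                                    unsigned int folio_cnt)
    {
            /* decline tiny batches; the core falls back to CPU copy */
            if (folio_cnt < 8)
                    return -EOPNOTSUPP;
            return folios_mc_copy(dst_list, src_list, folio_cnt);
    }

    static struct migrator example_migrator = {
            .name           = "example",
            .offload_copy   = example_offload_copy,
            .owner          = THIS_MODULE,
    };

    static int __init example_init(void)
    {
            return migrate_offload_register(&example_migrator);
    }

    static void __exit example_exit(void)
    {
            migrate_offload_unregister(&example_migrator);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");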
Co-developed-by: Mike Day <michael.day@amd.com>
Signed-off-by: Mike Day <michael.day@amd.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
include/linux/migrate_copy_offload.h | 44 +++++++++++++
mm/Kconfig | 6 ++
mm/Makefile | 1 +
mm/migrate.c | 57 +++++++++++++++--
mm/migrate_copy_offload.c | 94 ++++++++++++++++++++++++++++
5 files changed, 198 insertions(+), 4 deletions(-)
create mode 100644 include/linux/migrate_copy_offload.h
create mode 100644 mm/migrate_copy_offload.c
diff --git a/include/linux/migrate_copy_offload.h b/include/linux/migrate_copy_offload.h
new file mode 100644
index 000000000000..d68b10a84743
--- /dev/null
+++ b/include/linux/migrate_copy_offload.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MIGRATE_COPY_OFFLOAD_H
+#define _LINUX_MIGRATE_COPY_OFFLOAD_H
+
+#include <linux/errno.h>
+#include <linux/jump_label.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+
+struct list_head;
+struct module;
+
+#define MIGRATOR_NAME_LEN 32
+
+/**
+ * struct migrator - batch-copy provider for page migration.
+ * @name: name of the provider.
+ * @offload_copy: copy @folio_cnt folios from @src_list to @dst_list.
+ *
+ * The migrator may inspect @folio_cnt to decide whether the batch
+ * is worth offloading, e.g. skip when the batch is too small to
+ * amortize setup cost. If it returns an error, the core falls back to
+ * CPU copy.
+ *
+ * @owner: module providing the migrator.
+ */
+struct migrator {
+ char name[MIGRATOR_NAME_LEN];
+ int (*offload_copy)(struct list_head *dst_list,
+ struct list_head *src_list,
+ unsigned int folio_cnt);
+ struct module *owner;
+};
+
+#ifdef CONFIG_MIGRATION_COPY_OFFLOAD
+extern struct static_key_false migrate_offload_enabled;
+extern struct srcu_struct migrate_offload_srcu;
+int migrate_offload_register(struct migrator *m);
+int migrate_offload_unregister(struct migrator *m);
+#else
+static inline int migrate_offload_register(struct migrator *m) { return -EOPNOTSUPP; }
+static inline int migrate_offload_unregister(struct migrator *m) { return -EOPNOTSUPP; }
+#endif
+
+#endif /* _LINUX_MIGRATE_COPY_OFFLOAD_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index e8bf1e9e6ad9..325d79619680 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -647,6 +647,12 @@ config MIGRATION
config DEVICE_MIGRATION
def_bool MIGRATION && ZONE_DEVICE
+# Page-migration batch-copy offload infrastructure.
+# Selected by migrator drivers (e.g. CONFIG_DCBM_DMA).
+config MIGRATION_COPY_OFFLOAD
+ bool
+ depends on MIGRATION
+
config ARCH_ENABLE_HUGEPAGE_MIGRATION
bool
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..db1ac8097089 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_MIGRATION_COPY_OFFLOAD) += migrate_copy_offload.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
diff --git a/mm/migrate.c b/mm/migrate.c
index 6c2f1cb66f96..9af070f9a1f2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -44,6 +44,8 @@
#include <linux/memory-tiers.h>
#include <linux/pagewalk.h>
#include <linux/jump_label.h>
+#include <linux/static_call.h>
+#include <linux/migrate_copy_offload.h>
#include <asm/tlbflush.h>
@@ -54,6 +56,51 @@
DEFINE_STATIC_KEY_FALSE(migrate_offload_enabled);
+#ifdef CONFIG_MIGRATION_COPY_OFFLOAD
+DEFINE_SRCU(migrate_offload_srcu);
+DEFINE_STATIC_CALL(migrate_offload_copy, folios_mc_copy);
+
+static bool migrate_offload_do_batch(int reason)
+{
+ if (!static_branch_unlikely(&migrate_offload_enabled))
+ return false;
+
+ switch (reason) {
+ case MR_COMPACTION:
+ case MR_SYSCALL:
+ case MR_DEMOTION:
+ case MR_NUMA_MISPLACED:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int migrate_offload_batch_copy(struct list_head *dst_batch,
+ struct list_head *src_batch,
+ unsigned int nr_batch)
+{
+ int idx, rc;
+
+ idx = srcu_read_lock(&migrate_offload_srcu);
+ rc = static_call(migrate_offload_copy)(dst_batch, src_batch, nr_batch);
+ srcu_read_unlock(&migrate_offload_srcu, idx);
+ return rc;
+}
+#else
+static bool migrate_offload_do_batch(int reason)
+{
+ return false;
+}
+
+static int migrate_offload_batch_copy(struct list_head *dst_batch,
+ struct list_head *src_batch,
+ unsigned int nr_batch)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
static const struct movable_operations *offline_movable_ops;
static const struct movable_operations *zsmalloc_movable_ops;
@@ -1833,7 +1880,7 @@ static int migrate_pages_batch(struct list_head *from,
struct folio *folio, *folio2, *dst = NULL;
int rc, rc_saved = 0, nr_pages;
unsigned int nr_batch = 0;
- bool batch_copied = false;
+ bool do_batch = false, batch_copied = false;
LIST_HEAD(unmap_batch);
LIST_HEAD(dst_batch);
LIST_HEAD(unmap_single);
@@ -1843,6 +1890,8 @@ static int migrate_pages_batch(struct list_head *from,
VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
!list_empty(from) && !list_is_singular(from));
+ do_batch = migrate_offload_do_batch(reason);
+
for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
thp_retry = 0;
@@ -1984,8 +2033,7 @@ static int migrate_pages_batch(struct list_head *from,
nr_retry_pages += nr_pages;
break;
case 0:
- if (static_branch_unlikely(&migrate_offload_enabled) &&
- folio_supports_batch_copy(folio)) {
+ if (do_batch && folio_supports_batch_copy(folio)) {
list_move_tail(&folio->lru, &unmap_batch);
list_add_tail(&dst->lru, &dst_batch);
nr_batch++;
@@ -2017,7 +2065,8 @@ static int migrate_pages_batch(struct list_head *from,
/* Batch-copy eligible folios before the move phase */
if (!list_empty(&unmap_batch)) {
- rc = folios_mc_copy(&dst_batch, &unmap_batch, nr_batch);
+ rc = migrate_offload_batch_copy(&dst_batch, &unmap_batch,
+ nr_batch);
batch_copied = (rc == 0);
}
diff --git a/mm/migrate_copy_offload.c b/mm/migrate_copy_offload.c
new file mode 100644
index 000000000000..6f837c725239
--- /dev/null
+++ b/mm/migrate_copy_offload.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/jump_label.h>
+#include <linux/module.h>
+#include <linux/srcu.h>
+#include <linux/migrate.h>
+#include <linux/migrate_copy_offload.h>
+#include <linux/static_call.h>
+
+static DEFINE_MUTEX(migrator_mutex);
+static struct migrator *active_migrator;
+
+DECLARE_STATIC_CALL(migrate_offload_copy, folios_mc_copy);
+
+/**
+ * migrate_offload_register - register a batch-copy provider for page migration.
+ * @m: migrator to install.
+ *
+ * Only one provider can be active at a time; -EBUSY is returned if
+ * another migrator is already registered.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int migrate_offload_register(struct migrator *m)
+{
+ int ret = 0;
+
+ if (!m || !m->offload_copy || !m->owner)
+ return -EINVAL;
+
+ mutex_lock(&migrator_mutex);
+ if (active_migrator) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ if (!try_module_get(m->owner)) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
+ static_call_update(migrate_offload_copy, m->offload_copy);
+ active_migrator = m;
+ static_branch_enable(&migrate_offload_enabled);
+
+unlock:
+ mutex_unlock(&migrator_mutex);
+
+ if (ret)
+ pr_err("migrate_offload: %s: failed to register (%d)\n",
+ m->name, ret);
+ else
+ pr_info("migrate_offload: enabled by %s\n", m->name);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(migrate_offload_register);
+
+/**
+ * migrate_offload_unregister - unregister the active batch-copy provider.
+ * @m: migrator to remove (must be the currently active one).
+ *
+ * Reverts the static_call target and waits for an SRCU grace period
+ * so that no in-flight migration is still calling the driver functions
+ * before releasing the module.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int migrate_offload_unregister(struct migrator *m)
+{
+ struct module *owner;
+
+ mutex_lock(&migrator_mutex);
+ if (active_migrator != m) {
+ mutex_unlock(&migrator_mutex);
+ return -EINVAL;
+ }
+
+ /*
+ * Disable the static branch first so new migrate_pages_batch calls
+ * won't enter the batch copy path.
+ */
+ static_branch_disable(&migrate_offload_enabled);
+ static_call_update(migrate_offload_copy, folios_mc_copy);
+ owner = active_migrator->owner;
+ active_migrator = NULL;
+ mutex_unlock(&migrator_mutex);
+
+ /* Wait for all in-flight callers to finish before module_put(). */
+ synchronize_srcu(&migrate_offload_srcu);
+ module_put(owner);
+
+ pr_info("migrate_offload: disabled by %s\n", m->name);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(migrate_offload_unregister);
--
2.43.0
* [PATCH 6/7] drivers/migrate_offload: add DMA batch copy driver (dcbm)
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
` (4 preceding siblings ...)
2026-04-28 15:50 ` [PATCH 5/7] mm/migrate: add copy offload registration infrastructure Shivank Garg
@ 2026-04-28 15:50 ` Shivank Garg
2026-04-28 15:50 ` [PATCH 7/7] mm/migrate: adjust NR_MAX_BATCHED_MIGRATION for testing Shivank Garg
` (3 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Shivank Garg @ 2026-04-28 15:50 UTC (permalink / raw)
To: akpm, david
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm, Shivank Garg
Add a simple DMAEngine-based migrator that plugs into the page
migration copy offload infrastructure to batch-copy folios via DMA
memcpy channels. It is intended for testing the offload plumbing and
as a template for future migrators (SDXI, multi-threaded CPU copy,
etc.).
When DMA fails, the callback returns an error and the migration path
falls back to per-folio CPU copy.
Loading the module exposes attributes under /sys/module/dcbm/:
offloading - enable/disable DMA offload
nr_dma_chan - max DMA channels to use
folios_migrated - folios copied via DMA
folios_failures - folios that fell back to CPU copy
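For example, to enable offload with up to four channels and check the
counters afterwards (assuming a DMA_MEMCPY-capable provider is present):

    echo 4 > /sys/module/dcbm/nr_dma_chan
    echo 1 > /sys/module/dcbm/offloading
    # ... run a migration workload, e.g. migratepages(8) ...
    cat /sys/module/dcbm/folios_migrated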
CONFIG_DCBM_DMA selects MIGRATION_COPY_OFFLOAD so enabling the
driver pulls in the infrastructure automatically.
Channel acquisition uses dma_request_chan_by_mask(DMA_MEMCPY), which
works for providers that set DMA_PRIVATE (e.g. AMD PTDMA). Generic
mem-to-mem engines that do not set DMA_PRIVATE (e.g. SDXI) should
acquire channels via dma_find_channel(DMA_MEMCPY) or the async_tx
APIs, which can be added in a follow-up.
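Such a follow-up could use the existing dmaengine client API, roughly
(untested sketch):

    /* for engines without DMA_PRIVATE: use the public channel pool */
    dmaengine_get();                     /* pin client channel refs */
    chan = dma_find_channel(DMA_MEMCPY); /* may return NULL */
    /* ... issue memcpy descriptors as before ... */
    dmaengine_put();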
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
drivers/Kconfig | 2 +
drivers/Makefile | 2 +
drivers/migrate_offload/Kconfig | 9 +
drivers/migrate_offload/Makefile | 1 +
drivers/migrate_offload/dcbm/Makefile | 1 +
drivers/migrate_offload/dcbm/dcbm.c | 440 ++++++++++++++++++++++++++
6 files changed, 455 insertions(+)
create mode 100644 drivers/migrate_offload/Kconfig
create mode 100644 drivers/migrate_offload/Makefile
create mode 100644 drivers/migrate_offload/dcbm/Makefile
create mode 100644 drivers/migrate_offload/dcbm/dcbm.c
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f2bed2ddeb66..3e83a1475cbc 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -253,4 +253,6 @@ source "drivers/cdx/Kconfig"
source "drivers/resctrl/Kconfig"
+source "drivers/migrate_offload/Kconfig"
+
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 0841ea851847..88cb8e3e88df 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -42,6 +42,8 @@ obj-y += clk/
# really early.
obj-$(CONFIG_DMADEVICES) += dma/
+obj-$(CONFIG_MIGRATION_COPY_OFFLOAD) += migrate_offload/
+
# SOC specific infrastructure drivers.
obj-y += soc/
obj-$(CONFIG_PM_GENERIC_DOMAINS) += pmdomain/
diff --git a/drivers/migrate_offload/Kconfig b/drivers/migrate_offload/Kconfig
new file mode 100644
index 000000000000..930d8605c15d
--- /dev/null
+++ b/drivers/migrate_offload/Kconfig
@@ -0,0 +1,9 @@
+config DCBM_DMA
+ tristate "DMA Core Batch Migrator"
+ depends on MIGRATION && DMA_ENGINE
+ select MIGRATION_COPY_OFFLOAD
+ help
+ DMA-based batch copy engine for page migration. Uses
+ DMAEngine memcpy channels to offload folio data copies
+ during migration. Primarily intended for testing the copy
+ offload infrastructure.
diff --git a/drivers/migrate_offload/Makefile b/drivers/migrate_offload/Makefile
new file mode 100644
index 000000000000..9e16018beb15
--- /dev/null
+++ b/drivers/migrate_offload/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DCBM_DMA) += dcbm/
diff --git a/drivers/migrate_offload/dcbm/Makefile b/drivers/migrate_offload/dcbm/Makefile
new file mode 100644
index 000000000000..56ba47cce0f1
--- /dev/null
+++ b/drivers/migrate_offload/dcbm/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DCBM_DMA) += dcbm.o
diff --git a/drivers/migrate_offload/dcbm/dcbm.c b/drivers/migrate_offload/dcbm/dcbm.c
new file mode 100644
index 000000000000..893580cb9fac
--- /dev/null
+++ b/drivers/migrate_offload/dcbm/dcbm.c
@@ -0,0 +1,440 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * DMA Core Batch Migrator (DCBM)
+ *
+ * Uses DMAEngine memcpy channels to offload batch folio copies during
+ * page migration. Reference driver meant for testing the offload
+ * infrastructure.
+ *
+ * Copyright (C) 2024-26 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/migrate.h>
+#include <linux/migrate_copy_offload.h>
+
+#define MAX_DMA_CHANNELS 16
+
+static atomic_long_t folios_migrated;
+static atomic_long_t folios_failures;
+
+static bool offloading_enabled;
+static unsigned int nr_dma_channels = 1;
+static DEFINE_MUTEX(dcbm_mutex);
+
+struct dma_work {
+ struct dma_chan *chan;
+ struct completion done;
+ atomic_t pending;
+ struct sg_table *src_sgt;
+ struct sg_table *dst_sgt;
+ bool mapped;
+};
+
+static void dma_completion_callback(void *data)
+{
+ struct dma_work *work = data;
+
+ if (atomic_dec_and_test(&work->pending))
+ complete(&work->done);
+}
+
+static int setup_sg_tables(struct dma_work *work, struct list_head **src_pos,
+ struct list_head **dst_pos, int nr)
+{
+ struct scatterlist *sg_src, *sg_dst;
+ struct device *dev;
+ int i, ret;
+
+ work->src_sgt = kmalloc_obj(*work->src_sgt, GFP_KERNEL);
+ if (!work->src_sgt)
+ return -ENOMEM;
+ work->dst_sgt = kmalloc_obj(*work->dst_sgt, GFP_KERNEL);
+ if (!work->dst_sgt) {
+ ret = -ENOMEM;
+ goto err_free_src;
+ }
+
+ ret = sg_alloc_table(work->src_sgt, nr, GFP_KERNEL);
+ if (ret)
+ goto err_free_dst;
+ ret = sg_alloc_table(work->dst_sgt, nr, GFP_KERNEL);
+ if (ret)
+ goto err_free_src_table;
+
+ sg_src = work->src_sgt->sgl;
+ sg_dst = work->dst_sgt->sgl;
+ for (i = 0; i < nr; i++) {
+ struct folio *src = list_entry(*src_pos, struct folio, lru);
+ struct folio *dst = list_entry(*dst_pos, struct folio, lru);
+
+ sg_set_folio(sg_src, src, folio_size(src), 0);
+ sg_set_folio(sg_dst, dst, folio_size(dst), 0);
+
+ *src_pos = (*src_pos)->next;
+ *dst_pos = (*dst_pos)->next;
+
+ if (i < nr - 1) {
+ sg_src = sg_next(sg_src);
+ sg_dst = sg_next(sg_dst);
+ }
+ }
+
+ dev = dmaengine_get_dma_device(work->chan);
+ if (!dev) {
+ ret = -ENODEV;
+ goto err_free_dst_table;
+ }
+ ret = dma_map_sgtable(dev, work->src_sgt, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+ if (ret)
+ goto err_free_dst_table;
+ ret = dma_map_sgtable(dev, work->dst_sgt, DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+ if (ret)
+ goto err_unmap_src;
+
+ /*
+ * TODO: the IOMMU may merge segments unevenly on the two sides; bail
+ * out so the core falls back to CPU copy. In practice, I have not
+ * observed merging in tests. Handling unequal nents is left for
+ * follow-up.
+ */
+ if (work->src_sgt->nents != work->dst_sgt->nents) {
+ ret = -EINVAL;
+ goto err_unmap_dst;
+ }
+ work->mapped = true;
+ return 0;
+
+err_unmap_dst:
+ dma_unmap_sgtable(dev, work->dst_sgt, DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+err_unmap_src:
+ dma_unmap_sgtable(dev, work->src_sgt, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+err_free_dst_table:
+ sg_free_table(work->dst_sgt);
+err_free_src_table:
+ sg_free_table(work->src_sgt);
+err_free_dst:
+ kfree(work->dst_sgt);
+ work->dst_sgt = NULL;
+err_free_src:
+ kfree(work->src_sgt);
+ work->src_sgt = NULL;
+ return ret;
+}
+
+static void cleanup_dma_work(struct dma_work *works, int actual_channels)
+{
+ struct device *dev;
+ int i;
+
+ if (!works)
+ return;
+
+ for (i = 0; i < actual_channels; i++) {
+ if (!works[i].chan)
+ continue;
+
+ dev = dmaengine_get_dma_device(works[i].chan);
+
+ if (works[i].mapped)
+ dmaengine_terminate_sync(works[i].chan);
+
+ if (dev && works[i].mapped) {
+ if (works[i].src_sgt) {
+ dma_unmap_sgtable(dev, works[i].src_sgt,
+ DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_NO_KERNEL_MAPPING);
+ sg_free_table(works[i].src_sgt);
+ kfree(works[i].src_sgt);
+ }
+ if (works[i].dst_sgt) {
+ dma_unmap_sgtable(dev, works[i].dst_sgt,
+ DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_NO_KERNEL_MAPPING);
+ sg_free_table(works[i].dst_sgt);
+ kfree(works[i].dst_sgt);
+ }
+ }
+ dma_release_channel(works[i].chan);
+ }
+ kfree(works);
+}
+
+static int submit_dma_transfers(struct dma_work *work)
+{
+ struct scatterlist *sg_src, *sg_dst;
+ struct dma_async_tx_descriptor *tx;
+ unsigned long flags = DMA_CTRL_ACK;
+ dma_cookie_t cookie;
+ int i;
+
+ atomic_set(&work->pending, 1);
+
+ sg_src = work->src_sgt->sgl;
+ sg_dst = work->dst_sgt->sgl;
+ for_each_sgtable_dma_sg(work->src_sgt, sg_src, i) {
+ if (i == work->src_sgt->nents - 1)
+ flags |= DMA_PREP_INTERRUPT;
+
+ tx = dmaengine_prep_dma_memcpy(work->chan,
+ sg_dma_address(sg_dst),
+ sg_dma_address(sg_src),
+ sg_dma_len(sg_src), flags);
+ if (!tx) {
+ atomic_set(&work->pending, 0);
+ return -EIO;
+ }
+
+ if (i == work->src_sgt->nents - 1) {
+ tx->callback = dma_completion_callback;
+ tx->callback_param = work;
+ }
+
+ cookie = dmaengine_submit(tx);
+ if (dma_submit_error(cookie)) {
+ atomic_set(&work->pending, 0);
+ return -EIO;
+ }
+ sg_dst = sg_next(sg_dst);
+ }
+ return 0;
+}
+
+/**
+ * folios_copy_dma - copy a batch of folios via DMA memcpy
+ * @dst_list: destination folio list
+ * @src_list: source folio list
+ * @nr_folios: number of folios in each list
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+static int folios_copy_dma(struct list_head *dst_list,
+ struct list_head *src_list, unsigned int nr_folios)
+{
+ struct dma_work *works;
+ struct list_head *src_pos = src_list->next;
+ struct list_head *dst_pos = dst_list->next;
+ int i, folios_per_chan, ret;
+ dma_cap_mask_t mask;
+ int actual_channels = 0;
+ unsigned int max_channels;
+
+ max_channels = min3(nr_dma_channels, nr_folios,
+ (unsigned int)MAX_DMA_CHANNELS);
+
+ works = kcalloc(max_channels, sizeof(*works), GFP_KERNEL);
+ if (!works)
+ return -ENOMEM;
+
+ dma_cap_zero(mask);
+ dma_cap_set(DMA_MEMCPY, mask);
+
+ for (i = 0; i < max_channels; i++) {
+ works[actual_channels].chan = dma_request_chan_by_mask(&mask);
+ if (IS_ERR(works[actual_channels].chan))
+ break;
+ init_completion(&works[actual_channels].done);
+ actual_channels++;
+ }
+
+ if (actual_channels == 0) {
+ kfree(works);
+ return -ENODEV;
+ }
+
+ for (i = 0; i < actual_channels; i++) {
+ folios_per_chan = nr_folios * (i + 1) / actual_channels -
+ (nr_folios * i) / actual_channels;
+ if (folios_per_chan == 0)
+ continue;
+
+ ret = setup_sg_tables(&works[i], &src_pos, &dst_pos,
+ folios_per_chan);
+ if (ret)
+ goto err_cleanup;
+ }
+
+ for (i = 0; i < actual_channels; i++) {
+ ret = submit_dma_transfers(&works[i]);
+ if (ret)
+ goto err_cleanup;
+ }
+
+ for (i = 0; i < actual_channels; i++) {
+ if (atomic_read(&works[i].pending) > 0)
+ dma_async_issue_pending(works[i].chan);
+ }
+
+ for (i = 0; i < actual_channels; i++) {
+ if (atomic_read(&works[i].pending) == 0)
+ continue;
+ if (!wait_for_completion_timeout(&works[i].done,
+ msecs_to_jiffies(10000))) {
+ ret = -ETIMEDOUT;
+ goto err_cleanup;
+ }
+ }
+
+ cleanup_dma_work(works, actual_channels);
+
+ atomic_long_add(nr_folios, &folios_migrated);
+ return 0;
+
+err_cleanup:
+ pr_warn_ratelimited("dcbm: DMA copy failed (%d), falling back to CPU\n",
+ ret);
+ cleanup_dma_work(works, actual_channels);
+
+ atomic_long_add(nr_folios, &folios_failures);
+ return ret;
+}
+
+static struct migrator dma_migrator = {
+ .name = "DCBM",
+ .offload_copy = folios_copy_dma,
+ .owner = THIS_MODULE,
+};
+
+static ssize_t offloading_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", offloading_enabled);
+}
+
+static ssize_t offloading_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ bool enable;
+ int ret;
+
+ ret = kstrtobool(buf, &enable);
+ if (ret)
+ return ret;
+
+ mutex_lock(&dcbm_mutex);
+
+ if (enable == offloading_enabled)
+ goto out;
+
+ if (enable) {
+ ret = migrate_offload_register(&dma_migrator);
+ if (ret) {
+ mutex_unlock(&dcbm_mutex);
+ return ret;
+ }
+ offloading_enabled = true;
+ } else {
+ migrate_offload_unregister(&dma_migrator);
+ offloading_enabled = false;
+ }
+out:
+ mutex_unlock(&dcbm_mutex);
+ return count;
+}
+
+static ssize_t folios_migrated_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", atomic_long_read(&folios_migrated));
+}
+
+static ssize_t folios_migrated_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ atomic_long_set(&folios_migrated, 0);
+ return count;
+}
+
+static ssize_t folios_failures_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", atomic_long_read(&folios_failures));
+}
+
+static ssize_t folios_failures_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ atomic_long_set(&folios_failures, 0);
+ return count;
+}
+
+static ssize_t nr_dma_chan_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", nr_dma_channels);
+}
+
+static ssize_t nr_dma_chan_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ unsigned int val;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 1 || val > MAX_DMA_CHANNELS)
+ return -EINVAL;
+
+ mutex_lock(&dcbm_mutex);
+ nr_dma_channels = val;
+ mutex_unlock(&dcbm_mutex);
+ return count;
+}
+
+static struct kobj_attribute offloading_attr = __ATTR_RW(offloading);
+static struct kobj_attribute nr_dma_chan_attr = __ATTR_RW(nr_dma_chan);
+static struct kobj_attribute folios_migrated_attr = __ATTR_RW(folios_migrated);
+static struct kobj_attribute folios_failures_attr = __ATTR_RW(folios_failures);
+
+static struct attribute *dcbm_attrs[] = {
+ &offloading_attr.attr,
+ &nr_dma_chan_attr.attr,
+ &folios_migrated_attr.attr,
+ &folios_failures_attr.attr,
+ NULL
+};
+
+static const struct attribute_group dcbm_attr_group = {
+ .attrs = dcbm_attrs,
+};
+
+static int __init dcbm_init(void)
+{
+ int ret;
+
+ ret = sysfs_create_group(&THIS_MODULE->mkobj.kobj, &dcbm_attr_group);
+ if (ret)
+ return ret;
+
+ pr_info("dcbm: DMA Core Batch Migrator initialized\n");
+ return 0;
+}
+
+static void __exit dcbm_exit(void)
+{
+ mutex_lock(&dcbm_mutex);
+ if (offloading_enabled) {
+ migrate_offload_unregister(&dma_migrator);
+ offloading_enabled = false;
+ }
+ mutex_unlock(&dcbm_mutex);
+
+ sysfs_remove_group(&THIS_MODULE->mkobj.kobj, &dcbm_attr_group);
+ pr_info("dcbm: DMA Core Batch Migrator unloaded\n");
+}
+
+module_init(dcbm_init);
+module_exit(dcbm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Shivank Garg");
+MODULE_DESCRIPTION("DMA Core Batch Migrator");
--
2.43.0
* [PATCH 7/7] mm/migrate: adjust NR_MAX_BATCHED_MIGRATION for testing
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
` (5 preceding siblings ...)
2026-04-28 15:50 ` [PATCH 6/7] drivers/migrate_offload: add DMA batch copy driver (dcbm) Shivank Garg
@ 2026-04-28 15:50 ` Shivank Garg
2026-04-28 17:11 ` [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Garg, Shivank
` (2 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Shivank Garg @ 2026-04-28 15:50 UTC (permalink / raw)
To: akpm, david
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm, Shivank Garg
From: Zi Yan <ziy@nvidia.com>
Change NR_MAX_BATCHED_MIGRATION to HPAGE_PUD_NR to allow batching THP
copies.
This is for testing purposes only.
Signed-off-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
mm/migrate.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 9af070f9a1f2..a16c009d31d0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1658,7 +1658,7 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR
+#define NR_MAX_BATCHED_MIGRATION HPAGE_PUD_NR
#else
#define NR_MAX_BATCHED_MIGRATION 512
#endif
--
2.43.0
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
` (6 preceding siblings ...)
2026-04-28 15:50 ` [PATCH 7/7] mm/migrate: adjust NR_MAX_BATCHED_MIGRATION for testing Shivank Garg
@ 2026-04-28 17:11 ` Garg, Shivank
2026-04-28 19:33 ` David Hildenbrand (Arm)
2026-04-30 8:47 ` Huang, Ying
2026-05-07 9:58 ` Huang, Ying
9 siblings, 1 reply; 20+ messages in thread
From: Garg, Shivank @ 2026-04-28 17:11 UTC (permalink / raw)
To: akpm, david
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm
Hi all,
Apologies. The subject prefix should have been [RFC PATCH v5 0/7].
This is the fifth RFC, as mentioned in the cover letter, but I
missed the prefix while formatting the patches. Please treat this
round as RFC v5.
Thanks,
Shivank
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-04-28 17:11 ` [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Garg, Shivank
@ 2026-04-28 19:33 ` David Hildenbrand (Arm)
2026-04-29 5:51 ` Garg, Shivank
0 siblings, 1 reply; 20+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-28 19:33 UTC (permalink / raw)
To: Garg, Shivank, akpm
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm
On 4/28/26 19:11, Garg, Shivank wrote:
> Hi all,
>
> Apologies. The subject prefix should have been [RFC PATCH v5 0/7].
>
> This is the fifth RFC, as mentioned in the cover letter, but I
> missed the prefix while formatting the patches. Please treat this
> round as RFC v5.
Ever since I switched to b4 for patch management, the quality of my life improved :)
$ b4 prep -n SERIES -f mm/mm-unstable
$ b4 prep --set-prefixes RFC
... add patches
$ b4 prep --auto-to-cc
$ b4 prep --edit-cover
$ b4 send --no-sign
--
Cheers,
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-04-28 19:33 ` David Hildenbrand (Arm)
@ 2026-04-29 5:51 ` Garg, Shivank
0 siblings, 0 replies; 20+ messages in thread
From: Garg, Shivank @ 2026-04-29 5:51 UTC (permalink / raw)
To: David Hildenbrand (Arm), akpm
Cc: kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy, rppt, surenb,
mhocko, ziy, matthew.brost, joshua.hahnjy, rakie.kim, byungchul,
gourry, ying.huang, apopple, dave, Jonathan.Cameron, rkodsara,
vkoul, bharata, sj, rientjes, xuezhengchu, yiannis, dave.hansen,
hannes, jhubbard, peterx, riel, shakeel.butt, stalexan, tj,
nifan.cxl, jic23, aneesh.kumar, nathan.lynch, Frank.li, djbw,
linux-kernel, linux-mm
On 4/29/2026 1:03 AM, David Hildenbrand (Arm) wrote:
> On 4/28/26 19:11, Garg, Shivank wrote:
>> Hi all,
>>
>> Apologies. The subject prefix should have been [RFC PATCH v5 0/7].
>>
>> This is the fifth RFC, as mentioned in the cover letter, but I
>> missed the prefix while formatting the patches. Please treat this
>> round as RFC v5.
>
> Ever since I switched to b4 for patch management, the quality of my life improved :)
>
> $ b4 prep -n SERIES -f mm/mm-unstable
> $ b4 prep --set-prefixes RFC
> ... add patches
> $ b4 prep --auto-to-cc
> $ b4 prep --edit-cover
> $ b4 send --no-sign
>
Thanks, appreciate the pointers. :)
I'll switch to b4.
Best regards,
Shivank
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
` (7 preceding siblings ...)
2026-04-28 17:11 ` [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Garg, Shivank
@ 2026-04-30 8:47 ` Huang, Ying
2026-05-08 11:04 ` Garg, Shivank
2026-05-07 9:58 ` Huang, Ying
9 siblings, 1 reply; 20+ messages in thread
From: Huang, Ying @ 2026-04-30 8:47 UTC (permalink / raw)
To: Shivank Garg
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm
Shivank Garg <shivankg@amd.com> writes:
> This is the fifth RFC of the patchset to enhance page migration by
> batching folio-copy operations and enabling acceleration via DMA offload.
>
> Single-threaded, folio-by-folio copying bottlenecks page migration in
> modern systems with deep memory hierarchies, especially for large folios
> where copy overhead dominates, leaving significant hardware potential
> untapped.
>
> By batching the copy phase, we create an opportunity for hardware
> acceleration. This series builds the framework and provides a DMA
> offload driver (dcbm) as a reference implementation, targeting bulk
> migration workloads where offloading the copy improves throughput
> and latency while freeing the CPU cycles.
>
> See the RFC V3 cover letter [2] for motivation.
>
> Changelog since V4:
> -------------------
>
> 1. Renamed PAGE_* migration state flags to FOLIO_*. (David)
> 2. Use the new folio->migrate_info field instead of folio->private
> for migration state. (David)
> 3. Fold folios_mc_copy patch in batch-copy implementation patch. (David)
> 4. Renamed migrate_offload_start()/stop() to register()/unregister().
> (Huang, Ying)
> 5. Dropped should_batch() callback from struct migrator. Reason-based
> policy now lives in migrate_pages_batch(). Migrators can still skip
> a batch they don't want (size-based policy). (Huang, Ying)
> 6. CONFIG_MIGRATION_COPY_OFFLOAD is now hidden and selected by the
> migrator driver. CONFIG_DCBM_DMA is tristate. (Huang Ying, Gregory Price).
> 7. Wrapped the SRCU + static_call dispatch in a small helper. (Huang, Ying)
> 8. Require m->owner in migrate_offload_register(); the SRCU sync at
> unregister relies on it. Counters are atomic_long_t to avoid a lock-order
> issue.
> 9. Moved DCBM sysfs from /sys/kernel/dcbm to /sys/module/dcbm (Huang, Ying)
> 10. Rebased on v7.1-rc1.
>
>
> DESIGN:
> -------
>
> New Migration Flow:
>
> [ migrate_pages_batch() ]
> |
> |--> do_batch = migrate_offload_do_batch(reason) // core filters by migration reason
> |
> |--> for each folio:
> | migrate_folio_unmap() // unmap the folio
> | |
> | +--> (success):
> | if do_batch && folio_supports_batch_copy():
> | -> unmap_batch / dst_batch // batch list for copy offloading
> | else:
> | -> unmap_single / dst_single // single lists for per-folio CPU copy
> |
> |--> try_to_unmap_flush() // single batched TLB flush
> |
> |--> Batch copy (if unmap_batch not empty):
> | - Migrator is configurable at runtime via sysfs.
> |
> | static_call(migrate_offload_copy) // Pluggable Migrators
> | / | \
> | v v v
> | [ Default ] [ DMA Offload ] [ ... ]
> |
> | On -EOPNOTSUPP or other error, batch falls back to per-folio CPU copy.
> |
> +--> migrate_folios_move() // metadata, update PTEs, finalize
> (batch list with already_copied=true, single list with false)
>
> Offload Registration:
>
> Driver fills struct migrator { .name, .offload_copy, .owner } and calls
> migrate_offload_register(). This:
> - Pins the module via try_module_get()
> - Patches the migrate_offload_copy() static_call target
> - Enables the migrate_offload_enabled static branch
>
> migrate_offload_unregister() disables the static branch and reverts
> the static_call, then synchronize_srcu() waits for in-flight migrations
> before module_put().
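> A minimal migrator module then looks roughly like this (sketch only;
> the offload_copy prototype is abbreviated here, patch 5 has the real
> one):
>
> static int my_copy(struct list_head *dst_folios, struct list_head *src_folios)
> {
> /* copy each dst/src folio pair; -EOPNOTSUPP falls back to CPU copy */
> return -EOPNOTSUPP;
> }
> static struct migrator my_migrator = {
> .name = "my_migrator",
> .offload_copy = my_copy,
> .owner = THIS_MODULE,
> };
> static int __init my_init(void)
> {
> return migrate_offload_register(&my_migrator);
> }
> static void __exit my_exit(void)
> {
> migrate_offload_unregister(&my_migrator);
> }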
>
> PERFORMANCE RESULTS:
> --------------------
>
> Re-ran the V4 workload on v7.1-rc1 with this series; relative
> speedups match V4 (~6x for 2MB folios at 16 DMA channels). No design
> change in V5 alters this picture; please refer to the V4 cover letter
> for the throughput tables [1].
IMHO, it's better to copy performance data here.
In addition to the performance benefit, I want to know the downside as
well. For example, the migration latency of the first folio may be
longer. If so, by how much? Can you measure the batch number vs. total
migration time (benefit) and first folio migration time (downside)?
That can be used to determine the optimal batch number.
> PLAN:
> -----
>
> Patches 1-4 (the batching infrastructure) don't depend on the migrator
> interface, so if it helps I can split them off and post them ahead of
> the migrator and DCBM bits, which still have a few open questions to
> work through.
>
> I would appreciate guidance on splitting the infrastructure portion
> ahead of the migrator interface if that matches maintainers' preference.
>
> OPEN QUESTIONS:
> ---------------
>
> 1. Should the batch path run without a registered migrator? Patches 1-4
> are self-contained and use folios_mc_copy() (CPU). I have several
> options like making batch path always-on for eligible folios, or
> giving admin an option to flip the static branch, or keep the gate.
> I'm leaning toward always-on.
>
> 2. Carrying already_copied via folio->migrate_info vs changing the
> migrate_folio() callback signature (Huang, Ying). I went with the
> field for now to avoid touching every fs callback before the design
> settles. Happy to revisit.
>
> 3. Per-caller offload selection: Today eligibility is by migrate_reason
> only. Some are latency-tolerant, others may be not. Is reason the
> right granularity, or do we want a per-caller hint?
>
> 4. Cgroup integration: How should per-cgroup be accounted for different
> migrators (e.g.: any accounting for DMA-busy time)?
>
> 5. Tuning migrate_pages callers for offloading. For instance, in
> compaction COMPACT_CLUSTER_MAX = 32 caps DMA's payoff for compaction
> (V4 experiment).
>
> 6. Where do batch-size thresholds live, and how are they tuned? Per
> Huang Ying's split, that policy lives in the migrator. DCBM has no
> threshold today. Open whether it should later be a per-migrator
> sysfs knob or hard-coded; probably clearer once a second migrator
> (SDXI, mtcopy) shows the trade-off.
>
>
> FOLLOW-UPS:
> --------------
>
> 1. dmaengine_prep_dma_memcpy_sg() in DCBM (Vinod Koul). The SG-prep
> variant cuts per-batch prep/submit cost (=CPU savings), but ptdma does
> not implement the SG hook yet [10]. The end-to-end migration throughput
> delta is small because per-descriptor execute time dominates.
> I'll post the ptdma SG hook + DCBM switch as a follow-up.
>
> 2. SDXI as a second migrator. The SDXI series [11] is in review. SDXI is
> a generic memcpy engine without DMA_PRIVATE, so channel acquisition
> goes through dma_find_channel() or async_tx rather than
> dma_request_chan_by_mask(). I have a local DCBM variant working on top
> of the SDXI driver. I'm planning to send it as a follow-up once the
> SDXI series settles.
>
> 3. IOMMU SG merging in DCBM (Gregory). dma_map_sgtable() may merge
> contiguous PFNs unevenly, so src.nents != dst.nents. DCBM falls back
> to CPU for safety. Though I haven't seen it on Zen3 + PTDMA. I'll
> understand this and address it in a follow-up.
>
> 4. Revisit Multi-threaded CPU copy migrator once the infra is settled.
>
> EARLIER POSTINGS:
> -----------------
> [1] RFC V4: https://lore.kernel.org/all/20260309120725.308854-3-shivankg@amd.com
> [2] RFC V3: https://lore.kernel.org/all/20250923174752.35701-1-shivankg@amd.com
> [3] RFC V2: https://lore.kernel.org/all/20250319192211.10092-1-shivankg@amd.com
> [4] RFC V1: https://lore.kernel.org/all/20240614221525.19170-1-shivankg@amd.com
> [5] RFC from Zi Yan: https://lore.kernel.org/all/20250103172419.4148674-1-ziy@nvidia.com
>
> RELATED DISCUSSIONS:
> --------------------
> [6] MM-alignment Session [Nov 12, 2025]:
> https://lore.kernel.org/linux-mm/bd6a3c75-b9f0-cbcf-f7c4-1ef5dff06d24@google.com
> [7] Linux Memory Hotness and Promotion call [Nov 6, 2025]:
> https://lore.kernel.org/linux-mm/8ff2fd10-c9ac-4912-cf56-7ecd4afd2770@google.com
> [8] LSFMM 2025:
> https://lore.kernel.org/all/cf6fc05d-c0b0-4de3-985e-5403977aa3aa@amd.com
> [9] OSS India:
> https://ossindia2025.sched.com/event/23Jk1
> [10] DMA_MEMCPY_SG comparison:
> https://lore.kernel.org/linux-mm/3e73addb-ac01-4a05-bc75-c6c1c56072df@amd.com
> [11] SDXI V1:
> https://lore.kernel.org/all/20260410-sdxi-base-v1-0-1d184cb5c60a@amd.com
>
> Thanks to everyone who reviewed, tested or participated in discussions
> around this series. Your feedback helped me throughout the development
> process.
>
> Best Regards,
> Shivank
>
>
> Shivank Garg (6):
> mm/migrate: rename PAGE_ migration flags to FOLIO_
> mm/migrate: use migrate_info field instead of private
> mm/migrate: skip data copy for already-copied folios
> mm/migrate: add batch-copy path in migrate_pages_batch
> mm/migrate: add copy offload registration infrastructure
> drivers/migrate_offload: add DMA batch copy driver (dcbm)
>
> Zi Yan (1):
> mm/migrate: adjust NR_MAX_BATCHED_MIGRATION for testing
>
> drivers/Kconfig | 2 +
> drivers/Makefile | 2 +
> drivers/migrate_offload/Kconfig | 9 +
> drivers/migrate_offload/Makefile | 1 +
> drivers/migrate_offload/dcbm/Makefile | 1 +
> drivers/migrate_offload/dcbm/dcbm.c | 440 ++++++++++++++++++++++++++
> include/linux/migrate_copy_offload.h | 44 +++
> include/linux/mm.h | 2 +
> include/linux/mm_types.h | 1 +
> mm/Kconfig | 6 +
> mm/Makefile | 1 +
> mm/migrate.c | 211 ++++++++----
> mm/migrate_copy_offload.c | 94 ++++++
> mm/util.c | 30 ++
> 14 files changed, 784 insertions(+), 60 deletions(-)
> create mode 100644 drivers/migrate_offload/Kconfig
> create mode 100644 drivers/migrate_offload/Makefile
> create mode 100644 drivers/migrate_offload/dcbm/Makefile
> create mode 100644 drivers/migrate_offload/dcbm/dcbm.c
> create mode 100644 include/linux/migrate_copy_offload.h
> create mode 100644 mm/migrate_copy_offload.c
>
>
> base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
---
Best Regards,
Huang, Ying
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-04-30 8:47 ` Huang, Ying
@ 2026-05-08 11:04 ` Garg, Shivank
2026-05-08 11:28 ` Huang, Ying
0 siblings, 1 reply; 20+ messages in thread
From: Garg, Shivank @ 2026-05-08 11:04 UTC (permalink / raw)
To: Huang, Ying
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm
On 4/30/2026 2:17 PM, Huang, Ying wrote:
> Shivank Garg <shivankg@amd.com> writes:
>> PERFORMANCE RESULTS:
>> --------------------
>>
>> Re-ran the V4 workload on v7.1-rc1 with this series; relative
>> speedups match V4 (~6x for 2MB folios at 16 DMA channels). No design
>> change in V5 alters this picture; please refer to the V4 cover letter
>> for the throughput tables [1].
>
> IMHO, it's better to copy performance data here.
>
> In addition to the performance benefit, I want to know the downside as
> well. For example, the migration latency of the first folio may be
> longer. If so, by how much? Can you measure the batch number vs. total
> migration time (benefit) and first folio migration time (downside)?
> That can be used to determine the optimal batch number.
>
System Info: AMD Zen 3 EPYC server (2-sockets, 32 cores, SMT Enabled),
1 NUMA node per socket, v7.1-rc1, DVFS set to Performance, PTDMA hardware.
Benchmark: move_pages() syscall to move pages between two NUMA nodes.
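The measured core of each run is essentially the following (simplified
sketch of my test program, which is not posted with this series):
	/* 'pages' holds count page-aligned buffers on the source node,
	 * nodes[i] is the destination node for every page */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	ret = move_pages(0, count, pages, nodes, status, MPOL_MF_MOVE);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	/* GB/s = total_bytes / elapsed(t0, t1) */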
1). Moving different sized folios such that total transfer size is constant
(1GB), with different number of DMA channels. Throughput in GB/s.
a. Baseline (vanilla kernel, single-threaded, serial folio_copy):
================================================================================
4K | 16K | 64K | 256K | 1M | 2M |
================================================================================
3.31±0.18 | 5.61±0.07 | 6.66±0.03 | 7.01±0.03 | 7.13±0.08 | 11.02±0.17 |
b. DMA offload (Patched Kernel, dcbm driver, N DMA channels):
============================================================================================
N channel| 4K | 16K | 64K | 256K | 1M | 2M |
============================================================================================
1 | 2.16±0.14 | 2.58±0.02 | 3.00±0.04 | 4.56±0.28 | 4.62±0.02 | 12.65±0.08 |
2 | 2.68±0.09 | 3.69±0.15 | 4.52±0.04 | 6.75±0.06 | 7.19±0.19 | 14.38±0.06 |
4 | 3.07±0.13 | 4.62±0.09 | 6.47±0.56 | 9.22±0.15 | 10.24±0.47 | 27.01±0.11 |
8 | 3.43±0.09 | 5.40±0.16 | 7.67±0.08 | 11.25±0.17 | 12.60±0.60 | 45.62±0.52 |
12 | 3.50±0.11 | 5.66±0.16 | 8.12±0.10 | 11.97±0.19 | 13.43±0.08 | 61.02±0.92 |
16 | 3.54±0.12 | 5.79±0.14 | 8.50±0.13 | 12.59±0.15 | 17.21±6.40 | 65.23±1.70 |
2). First-folio latency: Instrumented with custom tracepoints to measure latency per migrate_pages_batch() call.
Result: throughput (GB/s) and first-folio latency (in microseconds), median of 10 runs.
A). Vanilla Kernel:
Here, n = workload size passed to move_pages() in folios. Move n number of folios with move_pages().
NR_MAX_BATCHED_MIGRATION is upstream default value 512.
--- Order 0 (4K folios) ---
n vanilla/cpu
(folios) GB/s | first(us)
--------------------------
1 0.04 | 24
4 0.16 | 25
8 0.29 | 31
16 0.54 | 27
64 1.15 | 68
256 1.86 | 162
512 2.21 | 264
2048 2.62 | 208
4096 2.74 | 182
16384 2.73 | 173
65536 3.28 | 166
262144 3.20 | 167
--- Order 9 (2M folios) ---
n vanilla/cpu
(folios) GB/s | first(us)
--------------------------
1 7.05 | 194
4 8.78 | 186
8 8.47 | 188
16 7.20 | 193
64 8.23 | 191
256 10.51 | 180
512 10.88 | 173
Takeaway:
In each migrate_pages_batch() call, folios are first unmapped, then try_to_unmap_flush(),
and only then folios enter move_to_new_folio(). So first-folio latency is bounded by the
per-batch unmap+flush cost, and then plateaus once workload is large enough.
B). Patched kernel:
Here, N = NR_MAX_BATCHED_MIGRATION (in page). Total migrated data is fixed at 1 GB.
Change N with a knob to measure impact of different max batched size.
--- ORDER 0 (4K folios) ---
N offload/dma1 offload/dma4 offload/dma16
GB/s | first(us) GB/s | first(us) GB/s | first(us)
------------------------------------------------------------------------
512 2.13 | 639 3.23 | 290 3.27 | 253
1024 2.17 | 1261 3.44 | 582 3.58 | 536
2048 2.01 | 2769 3.09 | 1360 3.45 | 1083
4096 2.10 | 5059 3.13 | 2737 3.58 | 2115
8192 2.21 | 9320 3.17 | 5015 3.75 | 3617
16384 2.15 | 18689 3.31 | 9623 3.87 | 6937
32768 2.12 | 42692 3.38 | 18893 3.83 | 14255
65536 2.09 | 81956 3.38 | 38556 3.64 | 29003
131072 2.02 | 169563 3.22 | 81082 3.63 | 62236
262144 2.21 | 318424 3.12 | 170174 3.50 | 129413
--- ORDER 9 (2M folios) ---
N offload/dma1 offload/dma4 offload/dma16
GB/s | first(us) GB/s | first(us) GB/s | first(us)
-------------------------------------------------------------------------
512 11.66 | 160 11.68 | 160 11.65 | 160
1024 12.16 | 310 13.67 | 275 13.64 | 276
2048 12.30 | 613 25.47 | 290 25.48 | 291
4096 12.48 | 1215 26.19 | 566 42.59 | 335
8192 12.56 | 2424 26.57 | 1118 58.72 | 470 *
16384 12.61 | 4839 26.77 | 2218 61.94 | 896
32768 12.60 | 9667 26.98 | 4422 63.75 | 1748
65536 12.63 | 19318 26.99 | 8838 60.66 | 3543
131072 12.64 | 38935 27.02 | 17935 61.06 | 7178
262144 12.66 | 77694 26.85 | 35871 65.06 | 14129
In the batch-copy offload approach, DMA copy phase is inserted between unmap/flush and move,
So larger N increases first-folio wall clock latency. Throughput improves but with diminishing
returns.
For DCBM+PTDMA setup, the optimal batch for 2M folios sits around N=8192-16384,
because a larger batch allows the driver to distribute more folios across available DMA channels.
This is where we get most throughput while keeping the first folio latency in check.
This optimal batch value is hardware-specific. Other engines (eg. SDXI) and memory tier (eg. CXL)
will likely have different curves.
Does this approach and experiment look good to you?
Thanks,
Shivank
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-05-08 11:04 ` Garg, Shivank
@ 2026-05-08 11:28 ` Huang, Ying
2026-05-08 12:34 ` Garg, Shivank
0 siblings, 1 reply; 20+ messages in thread
From: Huang, Ying @ 2026-05-08 11:28 UTC (permalink / raw)
To: Garg, Shivank
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm
Hi, Shivank,
"Garg, Shivank" <shivankg@amd.com> writes:
> On 4/30/2026 2:17 PM, Huang, Ying wrote:
>> Shivank Garg <shivankg@amd.com> writes:
>
>>> PERFORMANCE RESULTS:
>>> --------------------
>>>
>>> Re-ran the V4 workload on v7.1-rc1 with this series; relative
>>> speedups match V4 (~6x for 2MB folios at 16 DMA channels). No design
>>> change in V5 alters this picture; please refer to the V4 cover letter
>>> for the throughput tables [1].
>>
>> IMHO, it's better to copy performance data here.
>>
>> In addition to the performance benefit, I want to know the downside as
>> well. For example, the migration latency of the first folio may be
>> longer. If so, by how much? Can you measure the batch number vs. total
>> migration time (benefit) and first folio migration time (downside)?
>> That can be used to determine the optimal batch number.
>>
>
> System Info: AMD Zen 3 EPYC server (2-sockets, 32 cores, SMT Enabled),
> 1 NUMA node per socket, v7.1-rc1, DVFS set to Performance, PTDMA hardware.
>
> Benchmark: move_pages() syscall to move pages between two NUMA nodes.
>
> 1). Moving different sized folios such that total transfer size is constant
> (1GB), with different number of DMA channels. Throughput in GB/s.
>
> a. Baseline (vanilla kernel, single-threaded, serial folio_copy):
>
> ================================================================================
> 4K | 16K | 64K | 256K | 1M | 2M |
> ================================================================================
> 3.31±0.18 | 5.61±0.07 | 6.66±0.03 | 7.01±0.03 | 7.13±0.08 | 11.02±0.17 |
>
>
> b. DMA offload (Patched Kernel, dcbm driver, N DMA channels):
>
> ============================================================================================
> N channel| 4K | 16K | 64K | 256K | 1M | 2M |
> ============================================================================================
> 1 | 2.16±0.14 | 2.58±0.02 | 3.00±0.04 | 4.56±0.28 | 4.62±0.02 | 12.65±0.08 |
> 2 | 2.68±0.09 | 3.69±0.15 | 4.52±0.04 | 6.75±0.06 | 7.19±0.19 | 14.38±0.06 |
> 4 | 3.07±0.13 | 4.62±0.09 | 6.47±0.56 | 9.22±0.15 | 10.24±0.47 | 27.01±0.11 |
> 8 | 3.43±0.09 | 5.40±0.16 | 7.67±0.08 | 11.25±0.17 | 12.60±0.60 | 45.62±0.52 |
> 12 | 3.50±0.11 | 5.66±0.16 | 8.12±0.10 | 11.97±0.19 | 13.43±0.08 | 61.02±0.92 |
> 16 | 3.54±0.12 | 5.79±0.14 | 8.50±0.13 | 12.59±0.15 | 17.21±6.40 | 65.23±1.70 |
>
>
> 2). First-folio latency: Instrumented with custom tracepoints to measure latency per migrate_pages_batch() call.
> Result: throughput (GB/s) and first-folio latency (in microseconds), median of 10 runs.
Thanks for detailed data. Per my understanding, the run time of
migrate_pages_batch() may be not good enough for measuring first folio
latency. IIUC, the migration procedure is something like,
for each folio
unmap
flush
for each folio
copy
remap ===> first folio migrated
Some tracepoint should be better to measure it.
> A). Vanilla Kernel:
>
> Here, n = workload size passed to move_pages() in folios. Move n number of folios with move_pages().
> NR_MAX_BATCHED_MIGRATION is upstream default value 512.
>
> --- Order 0 (4K folios) ---
> n vanilla/cpu
> (folios) GB/s | first(us)
> --------------------------
> 1 0.04 | 24
> 4 0.16 | 25
> 8 0.29 | 31
> 16 0.54 | 27
> 64 1.15 | 68
> 256 1.86 | 162
> 512 2.21 | 264
> 2048 2.62 | 208
> 4096 2.74 | 182
> 16384 2.73 | 173
> 65536 3.28 | 166
> 262144 3.20 | 167
>
> --- Order 9 (2M folios) ---
> n vanilla/cpu
> (folios) GB/s | first(us)
> --------------------------
> 1 7.05 | 194
> 4 8.78 | 186
> 8 8.47 | 188
> 16 7.20 | 193
> 64 8.23 | 191
> 256 10.51 | 180
> 512 10.88 | 173
>
> Takeaway:
> In each migrate_pages_batch() call, folios are first unmapped, then try_to_unmap_flush(),
> and only then folios enter move_to_new_folio(). So first-folio latency is bounded by the
> per-batch unmap+flush cost, and then plateaus once workload is large enough.
>
>
> B). Patched kernel:
>
> Here, N = NR_MAX_BATCHED_MIGRATION (in page). Total migrated data is fixed at 1 GB.
Emm, so NR_MAX_BATCHED_MIGRATION could be very large? I think that it
needs to be bounded. If it is too large, too many pages may be in an
inaccessible state for a longer time. That will hurt the workload
performance, although it is optimal for migration performance.
> Change N with a knob to measure impact of different max batched size.
>
> --- ORDER 0 (4K folios) ---
> N offload/dma1 offload/dma4 offload/dma16
> GB/s | first(us) GB/s | first(us) GB/s | first(us)
> ------------------------------------------------------------------------
> 512 2.13 | 639 3.23 | 290 3.27 | 253
> 1024 2.17 | 1261 3.44 | 582 3.58 | 536
> 2048 2.01 | 2769 3.09 | 1360 3.45 | 1083
> 4096 2.10 | 5059 3.13 | 2737 3.58 | 2115
> 8192 2.21 | 9320 3.17 | 5015 3.75 | 3617
> 16384 2.15 | 18689 3.31 | 9623 3.87 | 6937
> 32768 2.12 | 42692 3.38 | 18893 3.83 | 14255
> 65536 2.09 | 81956 3.38 | 38556 3.64 | 29003
> 131072 2.02 | 169563 3.22 | 81082 3.63 | 62236
> 262144 2.21 | 318424 3.12 | 170174 3.50 | 129413
>
> --- ORDER 9 (2M folios) ---
> N offload/dma1 offload/dma4 offload/dma16
> GB/s | first(us) GB/s | first(us) GB/s | first(us)
> -------------------------------------------------------------------------
> 512 11.66 | 160 11.68 | 160 11.65 | 160
> 1024 12.16 | 310 13.67 | 275 13.64 | 276
> 2048 12.30 | 613 25.47 | 290 25.48 | 291
> 4096 12.48 | 1215 26.19 | 566 42.59 | 335
> 8192 12.56 | 2424 26.57 | 1118 58.72 | 470 *
> 16384 12.61 | 4839 26.77 | 2218 61.94 | 896
> 32768 12.60 | 9667 26.98 | 4422 63.75 | 1748
> 65536 12.63 | 19318 26.99 | 8838 60.66 | 3543
> 131072 12.64 | 38935 27.02 | 17935 61.06 | 7178
> 262144 12.66 | 77694 26.85 | 35871 65.06 | 14129
>
> In the batch-copy offload approach, DMA copy phase is inserted between unmap/flush and move,
> So larger N increases first-folio wall clock latency. Throughput improves but with diminishing
> returns.
>
> For DCBM+PTDMA setup, the optimal batch for 2M folios sits around N=8192-16384,
> because a larger batch allows the driver to distribute more folios across available DMA channels.
> This is where we get most throughput while keeping the first folio latency in check.
>
> This optimal batch value is hardware-specific. Other engines (eg. SDXI) and memory tier (eg. CXL)
> will likely have different curves.
>
> Does this approach and experiment look good to you?
---
Best Regards,
Huang, Ying
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-05-08 11:28 ` Huang, Ying
@ 2026-05-08 12:34 ` Garg, Shivank
2026-05-09 7:49 ` Huang, Ying
0 siblings, 1 reply; 20+ messages in thread
From: Garg, Shivank @ 2026-05-08 12:34 UTC (permalink / raw)
To: Huang, Ying
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm
On 5/8/2026 4:58 PM, Huang, Ying wrote:
> Hi, Shivank,
>
> "Garg, Shivank" <shivankg@amd.com> writes:
>
>> On 4/30/2026 2:17 PM, Huang, Ying wrote:
>>> Shivank Garg <shivankg@amd.com> writes:
>>
>>>> PERFORMANCE RESULTS:
>>>> --------------------
>>>>
>>>> Re-ran the V4 workload on v7.1-rc1 with this series; relative
>>>> speedups match V4 (~6x for 2MB folios at 16 DMA channels). No design
>>>> change in V5 alters this picture; please refer to the V4 cover letter
>>>> for the throughput tables [1].
>>>
>>> IMHO, it's better to copy performance data here.
>>>
>>> In addition to the performance benefit, I want to know the downside as
>>> well. For example, the migration latency of the first folio may be
>>> longer. If so, by how much? Can you measure the batch number vs. total
>>> migration time (benefit) and first folio migration time (downside)?
>>> That can be used to determine the optimal batch number.
>>>
>>
>> System Info: AMD Zen 3 EPYC server (2-sockets, 32 cores, SMT Enabled),
>> 1 NUMA node per socket, v7.1-rc1, DVFS set to Performance, PTDMA hardware.
>>
>> Benchmark: move_pages() syscall to move pages between two NUMA nodes.
>>
>> 1). Moving different sized folios such that total transfer size is constant
>> (1GB), with different number of DMA channels. Throughput in GB/s.
>>
>> a. Baseline (vanilla kernel, single-threaded, serial folio_copy):
>>
>> ================================================================================
>> 4K | 16K | 64K | 256K | 1M | 2M |
>> ================================================================================
>> 3.31±0.18 | 5.61±0.07 | 6.66±0.03 | 7.01±0.03 | 7.13±0.08 | 11.02±0.17 |
>>
>>
>> b. DMA offload (Patched Kernel, dcbm driver, N DMA channels):
>>
>> ============================================================================================
>> N channel| 4K | 16K | 64K | 256K | 1M | 2M |
>> ============================================================================================
>> 1 | 2.16±0.14 | 2.58±0.02 | 3.00±0.04 | 4.56±0.28 | 4.62±0.02 | 12.65±0.08 |
>> 2 | 2.68±0.09 | 3.69±0.15 | 4.52±0.04 | 6.75±0.06 | 7.19±0.19 | 14.38±0.06 |
>> 4 | 3.07±0.13 | 4.62±0.09 | 6.47±0.56 | 9.22±0.15 | 10.24±0.47 | 27.01±0.11 |
>> 8 | 3.43±0.09 | 5.40±0.16 | 7.67±0.08 | 11.25±0.17 | 12.60±0.60 | 45.62±0.52 |
>> 12 | 3.50±0.11 | 5.66±0.16 | 8.12±0.10 | 11.97±0.19 | 13.43±0.08 | 61.02±0.92 |
>> 16 | 3.54±0.12 | 5.79±0.14 | 8.50±0.13 | 12.59±0.15 | 17.21±6.40 | 65.23±1.70 |
>>
>>
>> 2). First-folio latency: Instrumented with custom tracepoints to measure latency per migrate_pages_batch() call.
>> Result: throughput (GB/s) and first-folio latency (in microseconds), median of 10 runs.
>
> Thanks for detailed data. Per my understanding, the run time of
> migrate_pages_batch() may be not good enough for measuring first folio
> latency. IIUC, the migration procedure is something like,
>
> for each folio
> unmap
> flush
> for each folio
> copy
> remap ===> first folio migrated
>
> Some tracepoint should be better to measure it.
Sorry, my earlier write-up was unclear.
For first folio latency, I add two tracepoints: one at the start of migrate_pages_batch()
and one in migrate_folio_done().
I agree that the user-accessible point tracepoint should be right after remove_migration_ptes().
Though, migrate_folio_done() runs only a few operations later, and will have a constant
offset, so it's unlikely to change the shape of the trade-off curve.
I'll move the tracepoint right after remove_migration_ptes() for new posting.
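For reference, the instrumentation is roughly the following (local debug
patch only, not part of the series; the event names here are made up):
	/* entry of migrate_pages_batch(): batch start */
	trace_mm_migrate_batch_start(nr_pages);
	...
	/* in migrate_folio_done(), a few operations after
	 * remove_migration_ptes() */
	trace_mm_migrate_folio_done(folio);
first(us) in the tables is the delta between a batch-start event and the
first folio-done event of that batch.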
>
>> A). Vanilla Kernel:
>>
>> Here, n = workload size passed to move_pages() in folios. Move n number of folios with move_pages().
>> NR_MAX_BATCHED_MIGRATION is upstream default value 512.
>>
>> --- Order 0 (4K folios) ---
>> n vanilla/cpu
>> (folios) GB/s | first(us)
>> --------------------------
>> 1 0.04 | 24
>> 4 0.16 | 25
>> 8 0.29 | 31
>> 16 0.54 | 27
>> 64 1.15 | 68
>> 256 1.86 | 162
>> 512 2.21 | 264
>> 2048 2.62 | 208
>> 4096 2.74 | 182
>> 16384 2.73 | 173
>> 65536 3.28 | 166
>> 262144 3.20 | 167
>>
>> --- Order 9 (2M folios) ---
>> n vanilla/cpu
>> (folios) GB/s | first(us)
>> --------------------------
>> 1 7.05 | 194
>> 4 8.78 | 186
>> 8 8.47 | 188
>> 16 7.20 | 193
>> 64 8.23 | 191
>> 256 10.51 | 180
>> 512 10.88 | 173
>>
>> Takeaway:
>> In each migrate_pages_batch() call, folios are first unmapped, then try_to_unmap_flush(),
>> and only then folios enter move_to_new_folio(). So first-folio latency is bounded by the
>> per-batch unmap+flush cost, and then plateaus once workload is large enough.
>>
>>
>> B). Patched kernel:
>>
>> Here, N = NR_MAX_BATCHED_MIGRATION (in page). Total migrated data is fixed at 1 GB.
>
> Emm, so NR_MAX_BATCHED_MIGRATION could be very large? I think that it
> needs to be bounded. If it is too large, too many pages may be in an
> inaccessible state for a longer time. That will hurt the workload
> performance, although it is optimal for migration performance.
>
Agreed, it must be bounded.
>> Change N with a knob to measure impact of different max batched size.
>>
>> --- ORDER 0 (4K folios) ---
>> N offload/dma1 offload/dma4 offload/dma16
>> GB/s | first(us) GB/s | first(us) GB/s | first(us)
>> ------------------------------------------------------------------------
>> 512 2.13 | 639 3.23 | 290 3.27 | 253
>> 1024 2.17 | 1261 3.44 | 582 3.58 | 536
>> 2048 2.01 | 2769 3.09 | 1360 3.45 | 1083
>> 4096 2.10 | 5059 3.13 | 2737 3.58 | 2115
>> 8192 2.21 | 9320 3.17 | 5015 3.75 | 3617
>> 16384 2.15 | 18689 3.31 | 9623 3.87 | 6937
>> 32768 2.12 | 42692 3.38 | 18893 3.83 | 14255
>> 65536 2.09 | 81956 3.38 | 38556 3.64 | 29003
>> 131072 2.02 | 169563 3.22 | 81082 3.63 | 62236
>> 262144 2.21 | 318424 3.12 | 170174 3.50 | 129413
>>
>> --- ORDER 9 (2M folios) ---
>> N offload/dma1 offload/dma4 offload/dma16
>> GB/s | first(us) GB/s | first(us) GB/s | first(us)
>> -------------------------------------------------------------------------
>> 512 11.66 | 160 11.68 | 160 11.65 | 160
>> 1024 12.16 | 310 13.67 | 275 13.64 | 276
>> 2048 12.30 | 613 25.47 | 290 25.48 | 291
>> 4096 12.48 | 1215 26.19 | 566 42.59 | 335
>> 8192 12.56 | 2424 26.57 | 1118 58.72 | 470 *
>> 16384 12.61 | 4839 26.77 | 2218 61.94 | 896
>> 32768 12.60 | 9667 26.98 | 4422 63.75 | 1748
>> 65536 12.63 | 19318 26.99 | 8838 60.66 | 3543
>> 131072 12.64 | 38935 27.02 | 17935 61.06 | 7178
>> 262144 12.66 | 77694 26.85 | 35871 65.06 | 14129
>>
>> In the batch-copy offload approach, DMA copy phase is inserted between unmap/flush and move,
>> So larger N increases first-folio wall clock latency. Throughput improves but with diminishing
>> returns.
>>
>> For DCBM+PTDMA setup, the optimal batch for 2M folios sits around N=8192-16384,
>> because a larger batch allows the driver to distribute more folios across available DMA channels.
>> This is where we get most throughput while keeping the first folio latency in check.
>>
>> This optimal batch value is hardware-specific. Other engines (eg. SDXI) and memory tier (eg. CXL)
>> will likely have different curves.
>>
>> Does this approach and experiment look good to you?
>
> ---
> Best Regards,
> Huang, Ying
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-05-08 12:34 ` Garg, Shivank
@ 2026-05-09 7:49 ` Huang, Ying
2026-05-10 15:03 ` Garg, Shivank
0 siblings, 1 reply; 20+ messages in thread
From: Huang, Ying @ 2026-05-09 7:49 UTC (permalink / raw)
To: Garg, Shivank
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm
"Garg, Shivank" <shivankg@amd.com> writes:
> On 5/8/2026 4:58 PM, Huang, Ying wrote:
>> Hi, Shivank,
>>
>> "Garg, Shivank" <shivankg@amd.com> writes:
>>
>>> On 4/30/2026 2:17 PM, Huang, Ying wrote:
>>>> Shivank Garg <shivankg@amd.com> writes:
>>>
>>>>> PERFORMANCE RESULTS:
>>>>> --------------------
>>>>>
>>>>> Re-ran the V4 workload on v7.1-rc1 with this series; relative
>>>>> speedups match V4 (~6x for 2MB folios at 16 DMA channels). No design
>>>>> change in V5 alters this picture; please refer to the V4 cover letter
>>>>> for the throughput tables [1].
>>>>
>>>> IMHO, it's better to copy performance data here.
>>>>
>>>> In addition to the performance benefit, I want to know the downside as
>>>> well. For example, the migration latency of the first folio may be
>>>> longer. If so, by how much? Can you measure the batch number vs. total
>>>> migration time (benefit) and first folio migration time (downside)?
>>>> That can be used to determine the optimal batch number.
>>>>
>>>
>>> System Info: AMD Zen 3 EPYC server (2-sockets, 32 cores, SMT Enabled),
>>> 1 NUMA node per socket, v7.1-rc1, DVFS set to Performance, PTDMA hardware.
>>>
>>> Benchmark: move_pages() syscall to move pages between two NUMA nodes.
>>>
>>> 1). Moving different sized folios such that total transfer size is constant
>>> (1GB), with different number of DMA channels. Throughput in GB/s.
>>>
>>> a. Baseline (vanilla kernel, single-threaded, serial folio_copy):
>>>
>>> ================================================================================
>>> 4K | 16K | 64K | 256K | 1M | 2M |
>>> ================================================================================
>>> 3.31±0.18 | 5.61±0.07 | 6.66±0.03 | 7.01±0.03 | 7.13±0.08 | 11.02±0.17 |
>>>
>>>
>>> b. DMA offload (Patched Kernel, dcbm driver, N DMA channels):
>>>
>>> ============================================================================================
>>> N channel| 4K | 16K | 64K | 256K | 1M | 2M |
>>> ============================================================================================
>>> 1 | 2.16±0.14 | 2.58±0.02 | 3.00±0.04 | 4.56±0.28 | 4.62±0.02 | 12.65±0.08 |
>>> 2 | 2.68±0.09 | 3.69±0.15 | 4.52±0.04 | 6.75±0.06 | 7.19±0.19 | 14.38±0.06 |
>>> 4 | 3.07±0.13 | 4.62±0.09 | 6.47±0.56 | 9.22±0.15 | 10.24±0.47 | 27.01±0.11 |
>>> 8 | 3.43±0.09 | 5.40±0.16 | 7.67±0.08 | 11.25±0.17 | 12.60±0.60 | 45.62±0.52 |
>>> 12 | 3.50±0.11 | 5.66±0.16 | 8.12±0.10 | 11.97±0.19 | 13.43±0.08 | 61.02±0.92 |
>>> 16 | 3.54±0.12 | 5.79±0.14 | 8.50±0.13 | 12.59±0.15 | 17.21±6.40 | 65.23±1.70 |
>>>
>>>
>>> 2). First-folio latency: Instrumented with custom tracepoints to
>>> measure latency per migrate_pages_batch() call.
>>> Result: throughput (GB/s) and first-folio latency (in microseconds), median of 10 runs.
>>
>> Thanks for detailed data. Per my understanding, the run time of
>> migrate_pages_batch() may be not good enough for measuring first folio
>> latency. IIUC, the migration procedure is something like,
>>
>> for each folio
>> unmap
>> flush
>> for each folio
>> copy
>> remap ===> first folio migrated
>>
>> Some tracepoint should be better to measure it.
>
> Sorry, my earlier write-up was unclear.
> For first folio latency, I add two tracepoints: one at the start of migrate_pages_batch()
> and one in migrate_folio_done().
>
> I agree that the user-accessible point tracepoint should be right after remove_migration_ptes().
> Though, migrate_folio_done() runs only a few operations later, and will have a constant
> offset, so it's unlikely to change the shape of the trade-off curve.
> I'll move the tracepoint right after remove_migration_ptes() for new posting.
Thanks for explanation. Trace point in migrate_folio_done() should be OK.
>>
>>> A). Vanilla Kernel:
>>>
>>> Here, n = workload size passed to move_pages() in folios. Move n number of folios with move_pages().
>>> NR_MAX_BATCHED_MIGRATION is upstream default value 512.
>>>
>>> --- Order 0 (4K folios) ---
>>> n vanilla/cpu
>>> (folios) GB/s | first(us)
>>> --------------------------
>>> 1 0.04 | 24
>>> 4 0.16 | 25
>>> 8 0.29 | 31
>>> 16 0.54 | 27
>>> 64 1.15 | 68
>>> 256 1.86 | 162
>>> 512 2.21 | 264
>>> 2048 2.62 | 208
>>> 4096 2.74 | 182
>>> 16384 2.73 | 173
>>> 65536 3.28 | 166
>>> 262144 3.20 | 167
>>>
>>> --- Order 9 (2M folios) ---
>>> n vanilla/cpu
>>> (folios) GB/s | first(us)
>>> --------------------------
>>> 1 7.05 | 194
>>> 4 8.78 | 186
>>> 8 8.47 | 188
>>> 16 7.20 | 193
>>> 64 8.23 | 191
>>> 256 10.51 | 180
>>> 512 10.88 | 173
>>>
>>> Takeaway:
>>> In each migrate_pages_batch() call, folios are first unmapped, then try_to_unmap_flush(),
>>> and only then folios enter move_to_new_folio(). So first-folio latency is bounded by the
>>> per-batch unmap+flush cost, and then plateaus once workload is large enough.
>>>
>>>
>>> B). Patched kernel:
>>>
>>> Here, N = NR_MAX_BATCHED_MIGRATION (in page). Total migrated data is fixed at 1 GB.
>>
>> Emm, so NR_MAX_BATCHED_MIGRATION could be very large? I think that it
>> needs to be bounded. If it is too large, too many pages may be in an
>> inaccessible state for a longer time. That will hurt the workload
>> performance, although it is optimal for migration performance.
>>
>
> Agreed, it must be bounded.
Thanks! Could you retest with a bounded NR_MAX_BATCHED_MIGRATION? If the
upstream default doesn't work well for you, we can find a better one
that balances throughput and latency well.
>>> Change N with a knob to measure impact of different max batched size.
>>>
>>> --- ORDER 0 (4K folios) ---
>>> N offload/dma1 offload/dma4 offload/dma16
>>> GB/s | first(us) GB/s | first(us) GB/s | first(us)
>>> ------------------------------------------------------------------------
>>> 512 2.13 | 639 3.23 | 290 3.27 | 253
>>> 1024 2.17 | 1261 3.44 | 582 3.58 | 536
>>> 2048 2.01 | 2769 3.09 | 1360 3.45 | 1083
>>> 4096 2.10 | 5059 3.13 | 2737 3.58 | 2115
>>> 8192 2.21 | 9320 3.17 | 5015 3.75 | 3617
>>> 16384 2.15 | 18689 3.31 | 9623 3.87 | 6937
>>> 32768 2.12 | 42692 3.38 | 18893 3.83 | 14255
>>> 65536 2.09 | 81956 3.38 | 38556 3.64 | 29003
>>> 131072 2.02 | 169563 3.22 | 81082 3.63 | 62236
>>> 262144 2.21 | 318424 3.12 | 170174 3.50 | 129413
>>>
>>> --- ORDER 9 (2M folios) ---
>>> N offload/dma1 offload/dma4 offload/dma16
>>> GB/s | first(us) GB/s | first(us) GB/s | first(us)
>>> -------------------------------------------------------------------------
>>> 512 11.66 | 160 11.68 | 160 11.65 | 160
>>> 1024 12.16 | 310 13.67 | 275 13.64 | 276
>>> 2048 12.30 | 613 25.47 | 290 25.48 | 291
>>> 4096 12.48 | 1215 26.19 | 566 42.59 | 335
>>> 8192 12.56 | 2424 26.57 | 1118 58.72 | 470 *
>>> 16384 12.61 | 4839 26.77 | 2218 61.94 | 896
>>> 32768 12.60 | 9667 26.98 | 4422 63.75 | 1748
>>> 65536 12.63 | 19318 26.99 | 8838 60.66 | 3543
>>> 131072 12.64 | 38935 27.02 | 17935 61.06 | 7178
>>> 262144 12.66 | 77694 26.85 | 35871 65.06 | 14129
>>>
>>> In the batch-copy offload approach, DMA copy phase is inserted between unmap/flush and move,
>>> So larger N increases first-folio wall clock latency. Throughput improves but with diminishing
>>> returns.
>>>
>>> For DCBM+PTDMA setup, the optimal batch for 2M folios sits around N=8192-16384,
>>> because a larger batch allows the driver to distribute more folios across available DMA channels.
>>> This is where we get most throughput while keeping the first folio latency in check.
>>>
>>> This optimal batch value is hardware-specific. Other engines (eg. SDXI) and memory tier (eg. CXL)
>>> will likely have different curves.
>>>
>>> Does this approach and experiment look good to you?
---
Best Regards,
Huang, Ying
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-05-09 7:49 ` Huang, Ying
@ 2026-05-10 15:03 ` Garg, Shivank
0 siblings, 0 replies; 20+ messages in thread
From: Garg, Shivank @ 2026-05-10 15:03 UTC (permalink / raw)
To: Huang, Ying
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm
On 5/9/2026 1:19 PM, Huang, Ying wrote:
> "Garg, Shivank" <shivankg@amd.com> writes:
>
>> On 5/8/2026 4:58 PM, Huang, Ying wrote:
>>> Hi, Shivank,
>>>
>>> "Garg, Shivank" <shivankg@amd.com> writes:
>>>
>>>> On 4/30/2026 2:17 PM, Huang, Ying wrote:
>>>>> Shivank Garg <shivankg@amd.com> writes:
>>>>
>>>>>> PERFORMANCE RESULTS:
>>>>>> --------------------
>>>>>>
>>>>>> Re-ran the V4 workload on v7.1-rc1 with this series; relative
>>>>>> speedups match V4 (~6x for 2MB folios at 16 DMA channels). No design
>>>>>> change in V5 alters this picture; please refer to the V4 cover letter
>>>>>> for the throughput tables [1].
>>>>>
>>>>> IMHO, it's better to copy performance data here.
>>>>>
>>>>> In addition to the performance benefit, I want to know the downside as
>>>>> well. For example, the migration latency of the first folio may be
>>>>> longer. If so, by how much? Can you measure the batch number vs. total
>>>>> migration time (benefit) and first folio migration time (downside)?
>>>>> That can be used to determine the optimal batch number.
>>>>>
>>>>
>>>> System Info: AMD Zen 3 EPYC server (2-sockets, 32 cores, SMT Enabled),
>>>> 1 NUMA node per socket, v7.1-rc1, DVFS set to Performance, PTDMA hardware.
>>>>
>>>> Benchmark: move_pages() syscall to move pages between two NUMA nodes.
>>>>
>>>> 1). Moving different sized folios such that total transfer size is constant
>>>> (1GB), with different number of DMA channels. Throughput in GB/s.
>>>>
>>>> a. Baseline (vanilla kernel, single-threaded, serial folio_copy):
>>>>
>>>> ================================================================================
>>>> 4K | 16K | 64K | 256K | 1M | 2M |
>>>> ================================================================================
>>>> 3.31±0.18 | 5.61±0.07 | 6.66±0.03 | 7.01±0.03 | 7.13±0.08 | 11.02±0.17 |
>>>>
>>>>
>>>> b. DMA offload (Patched Kernel, dcbm driver, N DMA channels):
>>>>
>>>> ============================================================================================
>>>> N channel| 4K | 16K | 64K | 256K | 1M | 2M |
>>>> ============================================================================================
>>>> 1 | 2.16±0.14 | 2.58±0.02 | 3.00±0.04 | 4.56±0.28 | 4.62±0.02 | 12.65±0.08 |
>>>> 2 | 2.68±0.09 | 3.69±0.15 | 4.52±0.04 | 6.75±0.06 | 7.19±0.19 | 14.38±0.06 |
>>>> 4 | 3.07±0.13 | 4.62±0.09 | 6.47±0.56 | 9.22±0.15 | 10.24±0.47 | 27.01±0.11 |
>>>> 8 | 3.43±0.09 | 5.40±0.16 | 7.67±0.08 | 11.25±0.17 | 12.60±0.60 | 45.62±0.52 |
>>>> 12 | 3.50±0.11 | 5.66±0.16 | 8.12±0.10 | 11.97±0.19 | 13.43±0.08 | 61.02±0.92 |
>>>> 16 | 3.54±0.12 | 5.79±0.14 | 8.50±0.13 | 12.59±0.15 | 17.21±6.40 | 65.23±1.70 |
>>>>
>>>>
>>>> 2). First-folio latency: Instrumented with custom tracepoints to
>>>> measure latency per migrate_pages_batch() call.
>>>> Result: throughput (GB/s) and first-folio latency (in microseconds), median of 10 runs.
>>>
>>> Thanks for detailed data. Per my understanding, the run time of
>>> migrate_pages_batch() may be not good enough for measuring first folio
>>> latency. IIUC, the migration procedure is something like,
>>>
>>> for each folio
>>> unmap
>>> flush
>>> for each folio
>>> copy
>>> remap ===> first folio migrated
>>>
>>> Some tracepoint should be better to measure it.
>>
>> Sorry, my earlier write-up was unclear.
>> For first folio latency, I add two tracepoints: one at the start of migrate_pages_batch()
>> and one in migrate_folio_done().
>>
>> I agree that the user-accessible point tracepoint should be right after remove_migration_ptes().
>> Though, migrate_folio_done() runs only a few operations later, and will have a constant
>> offset, so it's unlikely to change the shape of the trade-off curve.
>> I'll move the tracepoint right after remove_migration_ptes() for new posting.
>
> Thanks for explanation. Trace point in migrate_folio_done() should be OK.
>
>>>
>>>> A). Vanilla Kernel:
>>>>
>>>> Here, n = workload size passed to move_pages() in folios. Move n number of folios with move_pages().
>>>> NR_MAX_BATCHED_MIGRATION is upstream default value 512.
>>>>
>>>> --- Order 0 (4K folios) ---
>>>> n vanilla/cpu
>>>> (folios) GB/s | first(us)
>>>> --------------------------
>>>> 1 0.04 | 24
>>>> 4 0.16 | 25
>>>> 8 0.29 | 31
>>>> 16 0.54 | 27
>>>> 64 1.15 | 68
>>>> 256 1.86 | 162
>>>> 512 2.21 | 264
>>>> 2048 2.62 | 208
>>>> 4096 2.74 | 182
>>>> 16384 2.73 | 173
>>>> 65536 3.28 | 166
>>>> 262144 3.20 | 167
>>>>
>>>> --- Order 9 (2M folios) ---
>>>> n vanilla/cpu
>>>> (folios) GB/s | first(us)
>>>> --------------------------
>>>> 1 7.05 | 194
>>>> 4 8.78 | 186
>>>> 8 8.47 | 188
>>>> 16 7.20 | 193
>>>> 64 8.23 | 191
>>>> 256 10.51 | 180
>>>> 512 10.88 | 173
>>>>
>>>> Takeaway:
>>>> In each migrate_pages_batch() call, folios are first unmapped, then try_to_unmap_flush(),
>>>> and only then folios enter move_to_new_folio(). So first-folio latency is bounded by the
>>>> per-batch unmap+flush cost, and then plateaus once workload is large enough.
>>>>
>>>>
>>>> B). Patched kernel:
>>>>
>>>> Here, N = NR_MAX_BATCHED_MIGRATION (in page). Total migrated data is fixed at 1 GB.
>>>
>>> Emm, so NR_MAX_BATCHED_MIGRATION could be very large? I think that it
>>> needs to be bounded. If it is too large, too many pages may be in an
>>> inaccessible state for a longer time. That will hurt the workload
>>> performance, although it is optimal for migration performance.
>>>
>>
>> Agreed, it must be bounded.
>
> Thanks! Could you retest with a bounded NR_MAX_BATCHED_MIGRATION? If the
> upstream default doesn't work well for you, we can find a better one
> that balances throughput and latency well.
>
Thanks. The tables below sweep NR_MAX_BATCHED_MIGRATION from 512 up to 262144. For 2M folios
on 16-channel PTDMA, the knee is at N=8192-16384 (16x to 32x the upstream default of 512).
>>>> 8192 12.56 | 2424 26.57 | 1118 58.72 | 470 *
One thing worth flagging on the "bounded default": a 2M folio is 512 4K pages, so at the
upstream cap of 512 pages migrate_pages_batch() receives at most one 2M folio per call.
DCBM hands one 2M folio to each channel, so PTDMA can then use only one of its 16 channels
per batch and the offload degenerates to the vanilla numbers. The larger-N rows are what
exercise the channel parallelism in the PTDMA case.
Memory-to-memory data movers like SDXI [1] should reach good throughput with just one
channel, and thus may not require a larger NR_MAX_BATCHED_MIGRATION.
I'm not tying this series to a specific performance default for now; the design review
(batch-copy path, migrator interface, registration, static_call dispatch) is the part I'd
like to converge on first, with threshold tuning after that. Does that ordering work?
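For anyone skimming the thread, the dispatch under review is essentially
(sketch; names approximate, the real helper lives in
mm/migrate_copy_offload.c):
	static int migrate_offload_copy_call(struct list_head *dst_folios,
					     struct list_head *src_folios)
	{
		int idx, ret;

		/* read side pairs with synchronize_srcu() in unregister */
		idx = srcu_read_lock(&migrate_offload_srcu);
		ret = static_call(migrate_offload_copy)(dst_folios, src_folios);
		srcu_read_unlock(&migrate_offload_srcu, idx);
		return ret;
	}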
[1] https://lore.kernel.org/all/20260410-sdxi-base-v1-0-1d184cb5c60a@amd.com
Best regards,
Shivank
>>>> Change N with a knob to measure impact of different max batched size.
>>>>
>>>> --- ORDER 0 (4K folios) ---
>>>> N offload/dma1 offload/dma4 offload/dma16
>>>> GB/s | first(us) GB/s | first(us) GB/s | first(us)
>>>> ------------------------------------------------------------------------
>>>> 512 2.13 | 639 3.23 | 290 3.27 | 253
>>>> 1024 2.17 | 1261 3.44 | 582 3.58 | 536
>>>> 2048 2.01 | 2769 3.09 | 1360 3.45 | 1083
>>>> 4096 2.10 | 5059 3.13 | 2737 3.58 | 2115
>>>> 8192 2.21 | 9320 3.17 | 5015 3.75 | 3617
>>>> 16384 2.15 | 18689 3.31 | 9623 3.87 | 6937
>>>> 32768 2.12 | 42692 3.38 | 18893 3.83 | 14255
>>>> 65536 2.09 | 81956 3.38 | 38556 3.64 | 29003
>>>> 131072 2.02 | 169563 3.22 | 81082 3.63 | 62236
>>>> 262144 2.21 | 318424 3.12 | 170174 3.50 | 129413
>>>>
>>>> --- ORDER 9 (2M folios) ---
>>>> N offload/dma1 offload/dma4 offload/dma16
>>>> GB/s | first(us) GB/s | first(us) GB/s | first(us)
>>>> -------------------------------------------------------------------------
>>>> 512 11.66 | 160 11.68 | 160 11.65 | 160
>>>> 1024 12.16 | 310 13.67 | 275 13.64 | 276
>>>> 2048 12.30 | 613 25.47 | 290 25.48 | 291
>>>> 4096 12.48 | 1215 26.19 | 566 42.59 | 335
>>>> 8192 12.56 | 2424 26.57 | 1118 58.72 | 470 *
>>>> 16384 12.61 | 4839 26.77 | 2218 61.94 | 896
>>>> 32768 12.60 | 9667 26.98 | 4422 63.75 | 1748
>>>> 65536 12.63 | 19318 26.99 | 8838 60.66 | 3543
>>>> 131072 12.64 | 38935 27.02 | 17935 61.06 | 7178
>>>> 262144 12.66 | 77694 26.85 | 35871 65.06 | 14129
>>>>
>>>> In the batch-copy offload approach, DMA copy phase is inserted between unmap/flush and move,
>>>> So larger N increases first-folio wall clock latency. Throughput improves but with diminishing
>>>> returns.
>>>>
>>>> For DCBM+PTDMA setup, the optimal batch for 2M folios sits around N=8192-16384,
>>>> because a larger batch allows the driver to distribute more folios across available DMA channels.
>>>> This is where we get most throughput while keeping the first folio latency in check.
>>>>
>>>> This optimal batch value is hardware-specific. Other engines (eg. SDXI) and memory tier (eg. CXL)
>>>> will likely have different curves.
>>>>
>>>> Does this approach and experiment look good to you?
>
> ---
> Best Regards,
> Huang, Ying
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/7] Accelerate page migration with batch copying and hardware offload
2026-04-28 15:50 [PATCH 0/7] Accelerate page migration with batch copying and hardware offload Shivank Garg
` (8 preceding siblings ...)
2026-04-30 8:47 ` Huang, Ying
@ 2026-05-07 9:58 ` Huang, Ying
9 siblings, 0 replies; 20+ messages in thread
From: Huang, Ying @ 2026-05-07 9:58 UTC (permalink / raw)
To: Shivank Garg
Cc: akpm, david, kinseyho, weixugc, ljs, Liam.Howlett, vbabka, willy,
rppt, surenb, mhocko, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, gourry, apopple, dave, Jonathan.Cameron,
rkodsara, vkoul, bharata, sj, rientjes, xuezhengchu, yiannis,
dave.hansen, hannes, jhubbard, peterx, riel, shakeel.butt,
stalexan, tj, nifan.cxl, jic23, aneesh.kumar, nathan.lynch,
Frank.li, djbw, linux-kernel, linux-mm
Shivank Garg <shivankg@amd.com> writes:
> This is the fifth RFC of the patchset to enhance page migration by
> batching folio-copy operations and enabling acceleration via DMA offload.
>
> Single-threaded, folio-by-folio copying bottlenecks page migration in
> modern systems with deep memory hierarchies, especially for large folios
> where copy overhead dominates, leaving significant hardware potential
> untapped.
>
> By batching the copy phase, we create an opportunity for hardware
> acceleration. This series builds the framework and provides a DMA
> offload driver (dcbm) as a reference implementation, targeting bulk
> migration workloads where offloading the copy improves throughput
> and latency while freeing the CPU cycles.
>
> See the RFC V3 cover letter [2] for motivation.
>
> Changelog since V4:
> -------------------
>
> 1. Renamed PAGE_* migration state flags to FOLIO_*. (David)
> 2. Use the new folio->migrate_info field instead of folio->private
> for migration state. (David)
> 3. Fold folios_mc_copy patch in batch-copy implementation patch. (David)
> 4. Renamed migrate_offload_start()/stop() to register()/unregister().
> (Huang, Ying)
> 5. Dropped should_batch() callback from struct migrator. Reason-based
> policy now lives in migrate_pages_batch(). Migrators can still skip
> a batch they don't want (size-based policy). (Huang, Ying)
> 6. CONFIG_MIGRATION_COPY_OFFLOAD is now hidden and selected by the
> migrator driver. CONFIG_DCBM_DMA is tristate. (Huang Ying, Gregory Price).
> 7. Wrapped the SRCU + static_call dispatch in a small helper. (Huang, Ying)
> 8. Require m->owner in migrate_offload_register(); the SRCU sync at
> unregister relies on it. Counters are atomic_long_t to avoid a lock-order
> issue.
> 9. Moved DCBM sysfs from /sys/kernel/dcbm to /sys/module/dcbm (Huang, Ying)
> 10. Rebased on v7.1-rc1.
>
>
> DESIGN:
> -------
>
> New Migration Flow:
>
> [ migrate_pages_batch() ]
> |
> |--> do_batch = migrate_offload_do_batch(reason) // core filters by migration reason
> |
> |--> for each folio:
> | migrate_folio_unmap() // unmap the folio
> | |
> | +--> (success):
> | if do_batch && folio_supports_batch_copy():
> | -> unmap_batch / dst_batch // batch list for copy offloading
> | else:
> | -> unmap_single / dst_single // single lists for per-folio CPU copy
> |
> |--> try_to_unmap_flush() // single batched TLB flush
> |
> |--> Batch copy (if unmap_batch not empty):
> | - Migrator is configurable at runtime via sysfs.
> |
> | static_call(migrate_offload_copy) // Pluggable Migrators
> | / | \
> | v v v
> | [ Default ] [ DMA Offload ] [ ... ]
> |
> | On -EOPNOTSUPP or other error, batch falls back to per-folio CPU copy.
> |
> +--> migrate_folios_move() // metadata, update PTEs, finalize
> (batch list with already_copied=true, single list with false)
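>
> To make the dispatch concrete, here is a minimal sketch of the helper
> (the callback signature and the SRCU object name are illustrative,
> not the exact code in the series):
>
>     static int migrate_offload_copy_folios(struct list_head *dst_list,
>                                            struct list_head *src_list)
>     {
>         int idx, rc;
>
>         if (!static_branch_unlikely(&migrate_offload_enabled))
>             return -EOPNOTSUPP;
>
>         /* SRCU keeps the registered migrator module alive for the call. */
>         idx = srcu_read_lock(&migrate_offload_srcu);
>         rc = static_call(migrate_offload_copy)(dst_list, src_list);
>         srcu_read_unlock(&migrate_offload_srcu, idx);
>
>         /* On any error, the caller falls back to per-folio CPU copy. */
>         return rc;
>     }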
>
> Offload Registration:
>
> Driver fills struct migrator { .name, .offload_copy, .owner } and calls
> migrate_offload_register(). This:
> - Pins the module via try_module_get()
> - Patches the migrate_offload_copy() static_call target
> - Enables the migrate_offload_enabled static branch
>
> migrate_offload_unregister() disables the static branch and reverts
> the static_call, then synchronize_srcu() waits for in-flight migrations
> before module_put().
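>
> For illustration, a minimal sketch of a driver registering itself
> (the offload_copy prototype is assumed for this sketch; the real one
> is declared in include/linux/migrate_copy_offload.h):
>
>     static int my_offload_copy(struct list_head *dst_list,
>                                struct list_head *src_list)
>     {
>         /* Returning -EOPNOTSUPP makes the core fall back to CPU copy. */
>         return -EOPNOTSUPP;
>     }
>
>     static struct migrator my_migrator = {
>         .name         = "my_engine",
>         .offload_copy = my_offload_copy,
>         .owner        = THIS_MODULE,
>     };
>
>     static int __init my_init(void)
>     {
>         /* Pins the module and patches the static_call target. */
>         return migrate_offload_register(&my_migrator);
>     }
>     module_init(my_init);
>
>     static void __exit my_exit(void)
>     {
>         /* Reverts the static_call; SRCU sync runs before module_put(). */
>         migrate_offload_unregister(&my_migrator);
>     }
>     module_exit(my_exit);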
>
> PERFORMANCE RESULTS:
> --------------------
>
> Re-ran the V4 workload on v7.1-rc1 with this series; relative
> speedups match V4 (~6x for 2MB folios at 16 DMA channels). No design
> change in V5 alters this picture; please refer to the V4 cover letter
> for the throughput tables [1].
>
>
> PLAN:
> -----
>
> Patches 1-4 (the batching infrastructure) don't depend on the migrator
> interface, so I can split them off and post them ahead of the migrator
> and DCBM bits, which still have a few open questions to work through.
> Guidance on whether such a split matches the maintainers' preference
> would be appreciated.
>
> OPEN QUESTIONS:
> ---------------
>
> 1. Should the batch path run without a registered migrator? Patches 1-4
> are self-contained and use folios_mc_copy() (CPU). There are several
> options: make the batch path always-on for eligible folios, give the
> admin an option to flip the static branch, or keep the gate. I'm
> leaning toward always-on.
>
> 2. Carrying already_copied via folio->migrate_info vs changing the
> migrate_folio() callback signature (Huang, Ying). I went with the
> field for now to avoid touching every fs callback before the design
> settles. Happy to revisit.
Personally, I still prefer changing the migrate_folio() callbacks for
better readability.
> 3. Per-caller offload selection: Today eligibility is by migrate_reason
> only. Some callers are latency-tolerant, others may not be. Is reason the
> right granularity, or do we want a per-caller hint?
>
> 4. Cgroup integration: How should per-cgroup accounting work for
> different migrators (e.g., should DMA-busy time be accounted)?
>
> 5. Tuning migrate_pages() callers for offloading. For instance,
> COMPACT_CLUSTER_MAX = 32 caps DMA's payoff for compaction
> (V4 experiment).
>
> 6. Where do batch-size thresholds live, and how are they tuned? Per
> Huang Ying's split, that policy lives in the migrator. DCBM has no
> threshold today. It is an open question whether it should later be a
> per-migrator sysfs knob or hard-coded; that will probably become
> clearer once a second migrator (SDXI, mtcopy) shows the trade-off.
>
>
> FOLLOW-UPS:
> -----------
>
> 1. dmaengine_prep_dma_memcpy_sg() in DCBM (Vinod Koul). The SG-prep
> variant cuts per-batch prep/submit cost (i.e., CPU savings), but ptdma does
> not implement the SG hook yet [10]. The end-to-end migration throughput
> delta is small because per-descriptor execute time dominates.
> I'll post the ptdma SG hook + DCBM switch as a follow-up.
>
> 2. SDXI as a second migrator. The SDXI series [11] is in review. SDXI is
> a generic memcpy engine without DMA_PRIVATE, so channel acquisition
> goes through dma_find_channel() or async_tx rather than
> dma_request_chan_by_mask() (see the sketch after this list). I have
> a local DCBM variant working on top of the SDXI driver and plan to
> send it as a follow-up once the SDXI series settles.
>
> 3. IOMMU SG merging in DCBM (Gregory). dma_map_sgtable() may merge
> contiguous PFNs unevenly, so src.nents != dst.nents. DCBM falls back
> to CPU for safety, though I haven't seen this happen on Zen3 + PTDMA.
> I'll investigate and address it in a follow-up.
>
> 4. Revisit a multi-threaded CPU copy migrator once the infra is settled.
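>
> For reference, a rough sketch of the two channel-acquisition paths
> from item 2 above (illustrative only; error handling omitted):
>
>     /* Private channels (e.g. ptdma): request a dedicated channel. */
>     dma_cap_mask_t mask;
>     struct dma_chan *chan;
>
>     dma_cap_zero(mask);
>     dma_cap_set(DMA_MEMCPY, mask);
>     chan = dma_request_chan_by_mask(&mask);
>
>     /* Public channels (e.g. SDXI): share the generic memcpy pool. */
>     dmaengine_get();
>     chan = dma_find_channel(DMA_MEMCPY);
>     /* ... submit copies ... */
>     dmaengine_put();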
>
> EARLIER POSTINGS:
> -----------------
> [1] RFC V4: https://lore.kernel.org/all/20260309120725.308854-3-shivankg@amd.com
> [2] RFC V3: https://lore.kernel.org/all/20250923174752.35701-1-shivankg@amd.com
> [3] RFC V2: https://lore.kernel.org/all/20250319192211.10092-1-shivankg@amd.com
> [4] RFC V1: https://lore.kernel.org/all/20240614221525.19170-1-shivankg@amd.com
> [5] RFC from Zi Yan: https://lore.kernel.org/all/20250103172419.4148674-1-ziy@nvidia.com
>
> RELATED DISCUSSIONS:
> --------------------
> [6] MM-alignment Session [Nov 12, 2025]:
> https://lore.kernel.org/linux-mm/bd6a3c75-b9f0-cbcf-f7c4-1ef5dff06d24@google.com
> [7] Linux Memory Hotness and Promotion call [Nov 6, 2025]:
> https://lore.kernel.org/linux-mm/8ff2fd10-c9ac-4912-cf56-7ecd4afd2770@google.com
> [8] LSFMM 2025:
> https://lore.kernel.org/all/cf6fc05d-c0b0-4de3-985e-5403977aa3aa@amd.com
> [9] OSS India:
> https://ossindia2025.sched.com/event/23Jk1
> [10] DMA_MEMCPY_SG comparison:
> https://lore.kernel.org/linux-mm/3e73addb-ac01-4a05-bc75-c6c1c56072df@amd.com
> [11] SDXI V1:
> https://lore.kernel.org/all/20260410-sdxi-base-v1-0-1d184cb5c60a@amd.com
>
> Thanks to everyone who reviewed, tested or participated in discussions
> around this series. Your feedback helped me throughout the development
> process.
>
> Best Regards,
> Shivank
>
>
> Shivank Garg (6):
> mm/migrate: rename PAGE_ migration flags to FOLIO_
> mm/migrate: use migrate_info field instead of private
> mm/migrate: skip data copy for already-copied folios
> mm/migrate: add batch-copy path in migrate_pages_batch
> mm/migrate: add copy offload registration infrastructure
> drivers/migrate_offload: add DMA batch copy driver (dcbm)
>
> Zi Yan (1):
> mm/migrate: adjust NR_MAX_BATCHED_MIGRATION for testing
>
> drivers/Kconfig | 2 +
> drivers/Makefile | 2 +
> drivers/migrate_offload/Kconfig | 9 +
> drivers/migrate_offload/Makefile | 1 +
> drivers/migrate_offload/dcbm/Makefile | 1 +
> drivers/migrate_offload/dcbm/dcbm.c | 440 ++++++++++++++++++++++++++
> include/linux/migrate_copy_offload.h | 44 +++
> include/linux/mm.h | 2 +
> include/linux/mm_types.h | 1 +
> mm/Kconfig | 6 +
> mm/Makefile | 1 +
> mm/migrate.c | 211 ++++++++----
> mm/migrate_copy_offload.c | 94 ++++++
> mm/util.c | 30 ++
> 14 files changed, 784 insertions(+), 60 deletions(-)
> create mode 100644 drivers/migrate_offload/Kconfig
> create mode 100644 drivers/migrate_offload/Makefile
> create mode 100644 drivers/migrate_offload/dcbm/Makefile
> create mode 100644 drivers/migrate_offload/dcbm/dcbm.c
> create mode 100644 include/linux/migrate_copy_offload.h
> create mode 100644 mm/migrate_copy_offload.c
>
>
> base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
---
Best Regards,
Huang, Ying
^ permalink raw reply [flat|nested] 20+ messages in thread