[PATCH] mm: Add Kcompressd for accelerated memory compression

linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] mm: Add Kcompressd for accelerated memory compression
@ 2025-04-30  8:26 Qun-Wei Lin
  2025-04-30 17:05 ` Nhat Pham
                   ` (5 more replies)
  0 siblings, 6 replies; 20+ messages in thread
From: Qun-Wei Lin @ 2025-04-30  8:26 UTC (permalink / raw)
  To: Andrew Morton, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Nhat Pham, Sergey Senozhatsky,
	Minchan Kim
  Cc: linux-mm, linux-kernel, linux-arm-kernel, linux-mediatek,
	Casper Li, Chinwen Chang, Andrew Yang, James Hsu, Qun-Wei Lin,
	Barry Song

This patch series introduces a new mechanism called kcompressd to
improve the efficiency of memory reclaiming in the operating system.

Problem:
  In the current system, the kswapd thread is responsible for both scanning
  the LRU pages and handling memory compression tasks (such as those
  involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
  to significant performance bottlenecks, especially under high memory
  pressure. The kswapd thread becomes a single point of contention, causing
  delays in memory reclaiming and overall system performance degradation.

Solution:
  Introduced kcompressd to handle asynchronous compression during memory
  reclaim, improving efficiency by offloading compression tasks from
  kswapd. This allows kswapd to focus on its primary task of page reclaim
  without being burdened by the additional overhead of compression.

In our handheld devices, we found that applying this mechanism under high
memory pressure scenarios can increase the rate of pgsteal_anon per second
by over 260% compared to the situation with only kswapd. Additionally, we
observed a reduction of over 50% in page allocation stall occurrences,
further demonstrating the effectiveness of kcompressd in alleviating memory
pressure and improving system responsiveness.

Co-developed-by: Barry Song <21cnbao@gmail.com>
Signed-off-by: Barry Song <21cnbao@gmail.com>
Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
Reference: Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd - Barry Song
           https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
---
 include/linux/mmzone.h |  6 ++++
 mm/mm_init.c           |  1 +
 mm/page_io.c           | 71 ++++++++++++++++++++++++++++++++++++++++++
 mm/swap.h              |  6 ++++
 mm/vmscan.c            | 25 +++++++++++++++
 5 files changed, 109 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6ccec1bf2896..93c9195a54ae 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -23,6 +23,7 @@
 #include <linux/page-flags.h>
 #include <linux/local_lock.h>
 #include <linux/zswap.h>
+#include <linux/kfifo.h>
 #include <asm/page.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -1398,6 +1399,11 @@ typedef struct pglist_data {
 
 	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
 
+#define KCOMPRESS_FIFO_SIZE 256
+	wait_queue_head_t kcompressd_wait;
+	struct task_struct *kcompressd;
+	struct kfifo kcompress_fifo;
+
 #ifdef CONFIG_COMPACTION
 	int kcompactd_max_order;
 	enum zone_type kcompactd_highest_zoneidx;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 9659689b8ace..49bae1dd4584 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1410,6 +1410,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 	pgdat_init_kcompactd(pgdat);
 
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->kcompressd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 
 	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
diff --git a/mm/page_io.c b/mm/page_io.c
index 4bce19df557b..d85deb494a6a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -233,6 +233,38 @@ static void swap_zeromap_folio_clear(struct folio *folio)
 	}
 }
 
+static bool swap_sched_async_compress(struct folio *folio)
+{
+	struct swap_info_struct *sis = swp_swap_info(folio->swap);
+	int nid = numa_node_id();
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	if (unlikely(!pgdat->kcompressd))
+		return false;
+
+	if (!current_is_kswapd())
+		return false;
+
+	if (!folio_test_anon(folio))
+		return false;
+	/*
+	 * This case needs to synchronously return AOP_WRITEPAGE_ACTIVATE
+	 */
+	if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio)))
+		return false;
+
+	sis = swp_swap_info(folio->swap);
+	if (zswap_is_enabled() || data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
+		if (kfifo_avail(&pgdat->kcompress_fifo) >= sizeof(folio) &&
+			kfifo_in(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
+			wake_up_interruptible(&pgdat->kcompressd_wait);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 /*
  * We may have stale swap cache pages in memory: notice
  * them here and get rid of the unnecessary final write.
@@ -275,6 +307,15 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		 */
 		swap_zeromap_folio_clear(folio);
 	}
+
+	/*
+	 * Compression within zswap and zram might block rmap, unmap
+	 * of both file and anon pages, try to do compression async
+	 * if possible
+	 */
+	if (swap_sched_async_compress(folio))
+		return 0;
+
 	if (zswap_store(folio)) {
 		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
 		folio_unlock(folio);
@@ -289,6 +330,36 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 	return 0;
 }
 
+int kcompressd(void *p)
+{
+	pg_data_t *pgdat = (pg_data_t *)p;
+	struct folio *folio;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = SWAP_CLUSTER_MAX,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+		.for_reclaim = 1,
+	};
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(pgdat->kcompressd_wait,
+				!kfifo_is_empty(&pgdat->kcompress_fifo));
+
+		while (!kfifo_is_empty(&pgdat->kcompress_fifo)) {
+			if (kfifo_out(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
+				if (zswap_store(folio)) {
+					count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
+					folio_unlock(folio);
+					continue;
+				}
+				__swap_writepage(folio, &wbc);
+			}
+		}
+	}
+	return 0;
+}
+
 static inline void count_swpout_vm_event(struct folio *folio)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/swap.h b/mm/swap.h
index 6f4a3f927edb..3579da413dc2 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -22,6 +22,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 void swap_write_unplug(struct swap_iocb *sio);
 int swap_writepage(struct page *page, struct writeback_control *wbc);
 void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
+int kcompressd(void *p);
 
 /* linux/mm/swap_state.c */
 /* One swap address space for each 64M swap space */
@@ -199,6 +200,11 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
 	return 0;
 }
 
+static inline int kcompressd(void *p)
+{
+	return 0;
+}
+
 #endif /* CONFIG_SWAP */
 
 #endif /* _MM_SWAP_H */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3783e45bfc92..2d7b9167bfd6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7420,6 +7420,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 void __meminit kswapd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret;
 
 	pgdat_kswapd_lock(pgdat);
 	if (!pgdat->kswapd) {
@@ -7433,7 +7434,26 @@ void __meminit kswapd_run(int nid)
 		} else {
 			wake_up_process(pgdat->kswapd);
 		}
+		ret = kfifo_alloc(&pgdat->kcompress_fifo,
+				KCOMPRESS_FIFO_SIZE * sizeof(struct folio *),
+				GFP_KERNEL);
+		if (ret) {
+			pr_err("%s: fail to kfifo_alloc\n", __func__);
+			goto out;
+		}
+
+		pgdat->kcompressd = kthread_create_on_node(kcompressd, pgdat, nid,
+				"kcompressd%d", nid);
+		if (IS_ERR(pgdat->kcompressd)) {
+			pr_err("Failed to start kcompressd on node %d，ret=%ld\n",
+					nid, PTR_ERR(pgdat->kcompressd));
+			pgdat->kcompressd = NULL;
+			kfifo_free(&pgdat->kcompress_fifo);
+		} else {
+			wake_up_process(pgdat->kcompressd);
+		}
 	}
+out:
 	pgdat_kswapd_unlock(pgdat);
 }
 
@@ -7452,6 +7472,11 @@ void __meminit kswapd_stop(int nid)
 		kthread_stop(kswapd);
 		pgdat->kswapd = NULL;
 	}
+	if (pgdat->kcompressd) {
+		kthread_stop(pgdat->kcompressd);
+		pgdat->kcompressd = NULL;
+		kfifo_free(&pgdat->kcompress_fifo);
+	}
 	pgdat_kswapd_unlock(pgdat);
 }
 
-- 
2.45.2



^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-04-30  8:26 [PATCH] mm: Add Kcompressd for accelerated memory compression Qun-Wei Lin
@ 2025-04-30 17:05 ` Nhat Pham
  2025-04-30 17:22 ` Nhat Pham
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 20+ messages in thread
From: Nhat Pham @ 2025-04-30 17:05 UTC (permalink / raw)
  To: Qun-Wei Lin
  Cc: Andrew Morton, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Sergey Senozhatsky, Minchan Kim,
	linux-mm, linux-kernel, linux-arm-kernel, linux-mediatek,
	Casper Li, Chinwen Chang, Andrew Yang, James Hsu, Barry Song,
	Joshua Hahn

On Wed, Apr 30, 2025 at 1:27 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
>
> This patch series introduces a new mechanism called kcompressd to
> improve the efficiency of memory reclaiming in the operating system.
>
> Problem:
>   In the current system, the kswapd thread is responsible for both scanning
>   the LRU pages and handling memory compression tasks (such as those
>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
>   to significant performance bottlenecks, especially under high memory
>   pressure. The kswapd thread becomes a single point of contention, causing
>   delays in memory reclaiming and overall system performance degradation.
>
> Solution:
>   Introduced kcompressd to handle asynchronous compression during memory
>   reclaim, improving efficiency by offloading compression tasks from
>   kswapd. This allows kswapd to focus on its primary task of page reclaim
>   without being burdened by the additional overhead of compression.
>
> In our handheld devices, we found that applying this mechanism under high
> memory pressure scenarios can increase the rate of pgsteal_anon per second
> by over 260% compared to the situation with only kswapd. Additionally, we
> observed a reduction of over 50% in page allocation stall occurrences,
> further demonstrating the effectiveness of kcompressd in alleviating memory
> pressure and improving system responsiveness.
>
> Co-developed-by: Barry Song <21cnbao@gmail.com>
> Signed-off-by: Barry Song <21cnbao@gmail.com>
> Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> Reference: Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd - Barry Song
>            https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
> ---
>  include/linux/mmzone.h |  6 ++++
>  mm/mm_init.c           |  1 +
>  mm/page_io.c           | 71 ++++++++++++++++++++++++++++++++++++++++++
>  mm/swap.h              |  6 ++++
>  mm/vmscan.c            | 25 +++++++++++++++
>  5 files changed, 109 insertions(+)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 6ccec1bf2896..93c9195a54ae 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -23,6 +23,7 @@
>  #include <linux/page-flags.h>
>  #include <linux/local_lock.h>
>  #include <linux/zswap.h>
> +#include <linux/kfifo.h>
>  #include <asm/page.h>
>
>  /* Free memory management - zoned buddy allocator.  */
> @@ -1398,6 +1399,11 @@ typedef struct pglist_data {
>
>         int kswapd_failures;            /* Number of 'reclaimed == 0' runs */
>
> +#define KCOMPRESS_FIFO_SIZE 256
> +       wait_queue_head_t kcompressd_wait;
> +       struct task_struct *kcompressd;
> +       struct kfifo kcompress_fifo;
> +
>  #ifdef CONFIG_COMPACTION
>         int kcompactd_max_order;
>         enum zone_type kcompactd_highest_zoneidx;
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 9659689b8ace..49bae1dd4584 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1410,6 +1410,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>         pgdat_init_kcompactd(pgdat);
>
>         init_waitqueue_head(&pgdat->kswapd_wait);
> +       init_waitqueue_head(&pgdat->kcompressd_wait);
>         init_waitqueue_head(&pgdat->pfmemalloc_wait);
>
>         for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 4bce19df557b..d85deb494a6a 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -233,6 +233,38 @@ static void swap_zeromap_folio_clear(struct folio *folio)
>         }
>  }
>
> +static bool swap_sched_async_compress(struct folio *folio)
> +{
> +       struct swap_info_struct *sis = swp_swap_info(folio->swap);
> +       int nid = numa_node_id();
> +       pg_data_t *pgdat = NODE_DATA(nid);
> +
> +       if (unlikely(!pgdat->kcompressd))
> +               return false;
> +
> +       if (!current_is_kswapd())
> +               return false;
> +
> +       if (!folio_test_anon(folio))
> +               return false;
> +       /*
> +        * This case needs to synchronously return AOP_WRITEPAGE_ACTIVATE
> +        */
> +       if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio)))
> +               return false;

Ah, this is unfortunate.

At this point, we do not know whether the page is compressible yet. If
we decide to perform async compression here, and the page is
incompressible, and we disable zswap writeback, we risk not being able
to activate it down the line, making it more likely that we try it
again too soon :(

Hopefully we can remove this limitation, when Joshua's work to store
incompressible pages in the zswap LRU lands. Then, even if the page is
incompressible, we won't retry it and just put it in the zswap LRU...

> +
> +       sis = swp_swap_info(folio->swap);

There's a slight hitch here. Upstream-wise, zswap differs slightly
from zram: it is cgroup-controlled. zswap can be disabled on a
per-cgroup basis. This is useful, for e.g, when we know for certain
that a workload's data are not compressible, and/or they are not
latency-sensitive so might as well use disk swap.

If the folio's cgroup has its zswap limit reached/disables zswap, then
we should fallback to disk swapping right away, instead of holding the
page. I think we should check it here. Maybe add a
mem_cgroup_may_zswap() helper (see obj_cgroup_may_zswap() for
implementation details - should be a simple-ish refactor), and check
here, in addition to zswap_is_enabled() check? Something like:

if ((zswap_is_enabled() && mem_cgroup_may_zswap(folio_memcg(folio)))
|| data_race(sis->flags & SWP_SYNCHRONOUS_IO))

Does that sound reasonable, Qun-Wei and Barry?

> +       if (zswap_is_enabled() || data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
> +               if (kfifo_avail(&pgdat->kcompress_fifo) >= sizeof(folio) &&
> +                       kfifo_in(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
> +                       wake_up_interruptible(&pgdat->kcompressd_wait);
> +                       return true;
> +               }
> +       }
> +
> +       return false;
> +}
> +
>  /*
>   * We may have stale swap cache pages in memory: notice
>   * them here and get rid of the unnecessary final write.
> @@ -275,6 +307,15 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>                  */
>                 swap_zeromap_folio_clear(folio);
>         }
> +
> +       /*
> +        * Compression within zswap and zram might block rmap, unmap
> +        * of both file and anon pages, try to do compression async
> +        * if possible
> +        */
> +       if (swap_sched_async_compress(folio))
> +               return 0;
> +
>         if (zswap_store(folio)) {
>                 count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
>                 folio_unlock(folio);
> @@ -289,6 +330,36 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>         return 0;
>  }
>
> +int kcompressd(void *p)
> +{
> +       pg_data_t *pgdat = (pg_data_t *)p;
> +       struct folio *folio;
> +       struct writeback_control wbc = {
> +               .sync_mode = WB_SYNC_NONE,
> +               .nr_to_write = SWAP_CLUSTER_MAX,
> +               .range_start = 0,
> +               .range_end = LLONG_MAX,
> +               .for_reclaim = 1,
> +       };
> +
> +       while (!kthread_should_stop()) {
> +               wait_event_interruptible(pgdat->kcompressd_wait,
> +                               !kfifo_is_empty(&pgdat->kcompress_fifo));
> +
> +               while (!kfifo_is_empty(&pgdat->kcompress_fifo)) {
> +                       if (kfifo_out(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
> +                               if (zswap_store(folio)) {
> +                                       count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
> +                                       folio_unlock(folio);
> +                                       continue;
> +                               }
> +                               __swap_writepage(folio, &wbc);
> +                       }
> +               }
> +       }
> +       return 0;
> +}
> +
>  static inline void count_swpout_vm_event(struct folio *folio)
>  {
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> diff --git a/mm/swap.h b/mm/swap.h
> index 6f4a3f927edb..3579da413dc2 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -22,6 +22,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
>  void swap_write_unplug(struct swap_iocb *sio);
>  int swap_writepage(struct page *page, struct writeback_control *wbc);
>  void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
> +int kcompressd(void *p);
>
>  /* linux/mm/swap_state.c */
>  /* One swap address space for each 64M swap space */
> @@ -199,6 +200,11 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
>         return 0;
>  }
>
> +static inline int kcompressd(void *p)
> +{
> +       return 0;
> +}
> +
>  #endif /* CONFIG_SWAP */
>
>  #endif /* _MM_SWAP_H */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 3783e45bfc92..2d7b9167bfd6 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -7420,6 +7420,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>  void __meminit kswapd_run(int nid)
>  {
>         pg_data_t *pgdat = NODE_DATA(nid);
> +       int ret;
>
>         pgdat_kswapd_lock(pgdat);
>         if (!pgdat->kswapd) {
> @@ -7433,7 +7434,26 @@ void __meminit kswapd_run(int nid)
>                 } else {
>                         wake_up_process(pgdat->kswapd);
>                 }
> +               ret = kfifo_alloc(&pgdat->kcompress_fifo,
> +                               KCOMPRESS_FIFO_SIZE * sizeof(struct folio *),
> +                               GFP_KERNEL);
> +               if (ret) {
> +                       pr_err("%s: fail to kfifo_alloc\n", __func__);
> +                       goto out;
> +               }
> +
> +               pgdat->kcompressd = kthread_create_on_node(kcompressd, pgdat, nid,
> +                               "kcompressd%d", nid);
> +               if (IS_ERR(pgdat->kcompressd)) {
> +                       pr_err("Failed to start kcompressd on node %d，ret=%ld\n",
> +                                       nid, PTR_ERR(pgdat->kcompressd));
> +                       pgdat->kcompressd = NULL;
> +                       kfifo_free(&pgdat->kcompress_fifo);
> +               } else {
> +                       wake_up_process(pgdat->kcompressd);
> +               }
>         }
> +out:
>         pgdat_kswapd_unlock(pgdat);
>  }
>
> @@ -7452,6 +7472,11 @@ void __meminit kswapd_stop(int nid)
>                 kthread_stop(kswapd);
>                 pgdat->kswapd = NULL;
>         }
> +       if (pgdat->kcompressd) {
> +               kthread_stop(pgdat->kcompressd);
> +               pgdat->kcompressd = NULL;
> +               kfifo_free(&pgdat->kcompress_fifo);
> +       }
>         pgdat_kswapd_unlock(pgdat);
>  }
>
> --
> 2.45.2
>


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-04-30  8:26 [PATCH] mm: Add Kcompressd for accelerated memory compression Qun-Wei Lin
  2025-04-30 17:05 ` Nhat Pham
@ 2025-04-30 17:22 ` Nhat Pham
  2025-04-30 21:51 ` Andrew Morton
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 20+ messages in thread
From: Nhat Pham @ 2025-04-30 17:22 UTC (permalink / raw)
  To: Qun-Wei Lin
  Cc: Andrew Morton, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Sergey Senozhatsky, Minchan Kim,
	linux-mm, linux-kernel, linux-arm-kernel, linux-mediatek,
	Casper Li, Chinwen Chang, Andrew Yang, James Hsu, Barry Song,
	Yosry Ahmed, Johannes Weiner, Kairui Song, Shakeel Butt,
	Joshua Hahn, Chengming Zhou

On Wed, Apr 30, 2025 at 1:27 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
>

cc-ing a couple more folks who are interested/working on this area
(Kairui, Shakeel, Johannes, Yosry, Chengming, etc.).


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-04-30  8:26 [PATCH] mm: Add Kcompressd for accelerated memory compression Qun-Wei Lin
  2025-04-30 17:05 ` Nhat Pham
  2025-04-30 17:22 ` Nhat Pham
@ 2025-04-30 21:51 ` Andrew Morton
  2025-04-30 22:49   ` Barry Song
  2025-05-01 14:02 ` Johannes Weiner
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2025-04-30 21:51 UTC (permalink / raw)
  To: Qun-Wei Lin
  Cc: Mike Rapoport, Matthias Brugger, AngeloGioacchino Del Regno,
	Nhat Pham, Sergey Senozhatsky, Minchan Kim, linux-mm,
	linux-kernel, linux-arm-kernel, linux-mediatek, Casper Li,
	Chinwen Chang, Andrew Yang, James Hsu, Barry Song

On Wed, 30 Apr 2025 16:26:41 +0800 Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:

> This patch series introduces a new mechanism called kcompressd to
> improve the efficiency of memory reclaiming in the operating system.
> 
> Problem:
>   In the current system, the kswapd thread is responsible for both scanning
>   the LRU pages and handling memory compression tasks (such as those
>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
>   to significant performance bottlenecks, especially under high memory
>   pressure. The kswapd thread becomes a single point of contention, causing
>   delays in memory reclaiming and overall system performance degradation.
> 
> Solution:
>   Introduced kcompressd to handle asynchronous compression during memory
>   reclaim, improving efficiency by offloading compression tasks from
>   kswapd. This allows kswapd to focus on its primary task of page reclaim
>   without being burdened by the additional overhead of compression.
> 
> In our handheld devices, we found that applying this mechanism under high
> memory pressure scenarios can increase the rate of pgsteal_anon per second
> by over 260% compared to the situation with only kswapd. Additionally, we
> observed a reduction of over 50% in page allocation stall occurrences,
> further demonstrating the effectiveness of kcompressd in alleviating memory
> pressure and improving system responsiveness.

It's a significant change and I'm thinking that broader performance
testing across a broader range of machines is needed before we can
confidently upstream such a change.

Also, it's presumably a small net loss on single-CPU machines (do these
exist any more?).  Is it hard to disable this feature on such machines?

>  
> +static bool swap_sched_async_compress(struct folio *folio)
> +{
> +	struct swap_info_struct *sis = swp_swap_info(folio->swap);
> +	int nid = numa_node_id();
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +
> +	if (unlikely(!pgdat->kcompressd))
> +		return false;
> +
> +	if (!current_is_kswapd())
> +		return false;
> +
> +	if (!folio_test_anon(folio))
> +		return false;

Are you sure the above three tests are really needed?




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-04-30 21:51 ` Andrew Morton
@ 2025-04-30 22:49   ` Barry Song
  2025-05-07 15:11     ` Nhat Pham
  0 siblings, 1 reply; 20+ messages in thread
From: Barry Song @ 2025-04-30 22:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Qun-Wei Lin, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Nhat Pham, Sergey Senozhatsky,
	Minchan Kim, linux-mm, linux-kernel, linux-arm-kernel,
	linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu

On Thu, May 1, 2025 at 9:51 AM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Wed, 30 Apr 2025 16:26:41 +0800 Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
>
> > This patch series introduces a new mechanism called kcompressd to
> > improve the efficiency of memory reclaiming in the operating system.
> >
> > Problem:
> >   In the current system, the kswapd thread is responsible for both scanning
> >   the LRU pages and handling memory compression tasks (such as those
> >   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
> >   to significant performance bottlenecks, especially under high memory
> >   pressure. The kswapd thread becomes a single point of contention, causing
> >   delays in memory reclaiming and overall system performance degradation.
> >
> > Solution:
> >   Introduced kcompressd to handle asynchronous compression during memory
> >   reclaim, improving efficiency by offloading compression tasks from
> >   kswapd. This allows kswapd to focus on its primary task of page reclaim
> >   without being burdened by the additional overhead of compression.
> >
> > In our handheld devices, we found that applying this mechanism under high
> > memory pressure scenarios can increase the rate of pgsteal_anon per second
> > by over 260% compared to the situation with only kswapd. Additionally, we
> > observed a reduction of over 50% in page allocation stall occurrences,
> > further demonstrating the effectiveness of kcompressd in alleviating memory
> > pressure and improving system responsiveness.
>
> It's a significant change and I'm thinking that broader performance
> testing across a broader range of machines is needed before we can
> confidently upstream such a change.

We ran the same test on our phones and saw the same results as Qun-Wei.
The async compression significantly reduces allocation stalls and improves
reclamation speed. However, I agree that broader testing is needed, and
we’ll also need the zswap team’s help with testing zswap cases.

>
> Also, it's presumably a small net loss on single-CPU machines (do these
> exist any more?).  Is it hard to disable this feature on such machines?

A net loss is possible, but kswapd can sometimes enter sleep contexts,
allowing the parallel kcompressd thread to continue compression.
This could actually be a win. But I agree that additional testing on
single-CPU machines may be necessary.

It could be disabled by the following if we discover any regression on
single-CPU machines?

if (num_online_cpus() == 1)
     return false;

>
> >
> > +static bool swap_sched_async_compress(struct folio *folio)
> > +{
> > +     struct swap_info_struct *sis = swp_swap_info(folio->swap);
> > +     int nid = numa_node_id();
> > +     pg_data_t *pgdat = NODE_DATA(nid);
> > +
> > +     if (unlikely(!pgdat->kcompressd))
> > +             return false;
> > +
> > +     if (!current_is_kswapd())
> > +             return false;
> > +
> > +     if (!folio_test_anon(folio))
> > +             return false;
>
> Are you sure the above three tests are really needed?

Currently, it runs as a per-node thread mainly to accelerate asynchronous
reclamation, which effectively reduces direct reclamation. Since direct
reclamation already follows the slow path, asynchronous compression offers
limited additional benefit in that context. Moreover, it's difficult
to determine
the optimal number of threads for direct reclamation, whereas the  compression
in the current direct reclamation allows it to utilize all CPUs.

The first condition checks whether kcompressd is present. The second
ensures that we're in kswapd asynchronous reclamation, not direct
reclamation. The third condition might be optimized or dropped, at least for
swap-backed shmem, and similar cases.

Thanks
Barry


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-04-30  8:26 [PATCH] mm: Add Kcompressd for accelerated memory compression Qun-Wei Lin
                   ` (2 preceding siblings ...)
  2025-04-30 21:51 ` Andrew Morton
@ 2025-05-01 14:02 ` Johannes Weiner
  2025-05-01 15:12   ` Nhat Pham
  2025-05-02  9:16   ` Qun-wei Lin (林群崴)
  2025-05-01 15:50 ` Nhat Pham
  2025-05-07  1:12 ` Harry Yoo
  5 siblings, 2 replies; 20+ messages in thread
From: Johannes Weiner @ 2025-05-01 14:02 UTC (permalink / raw)
  To: Qun-Wei Lin
  Cc: Andrew Morton, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Nhat Pham, Sergey Senozhatsky,
	Minchan Kim, linux-mm, linux-kernel, linux-arm-kernel,
	linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu,
	Barry Song

On Wed, Apr 30, 2025 at 04:26:41PM +0800, Qun-Wei Lin wrote:
> This patch series introduces a new mechanism called kcompressd to
> improve the efficiency of memory reclaiming in the operating system.
> 
> Problem:
>   In the current system, the kswapd thread is responsible for both scanning
>   the LRU pages and handling memory compression tasks (such as those
>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
>   to significant performance bottlenecks, especially under high memory
>   pressure. The kswapd thread becomes a single point of contention, causing
>   delays in memory reclaiming and overall system performance degradation.
> 
> Solution:
>   Introduced kcompressd to handle asynchronous compression during memory
>   reclaim, improving efficiency by offloading compression tasks from
>   kswapd. This allows kswapd to focus on its primary task of page reclaim
>   without being burdened by the additional overhead of compression.
> 
> In our handheld devices, we found that applying this mechanism under high
> memory pressure scenarios can increase the rate of pgsteal_anon per second
> by over 260% compared to the situation with only kswapd. Additionally, we
> observed a reduction of over 50% in page allocation stall occurrences,
> further demonstrating the effectiveness of kcompressd in alleviating memory
> pressure and improving system responsiveness.

Yes, I think parallelizing this work makes a lot of sense.

> Co-developed-by: Barry Song <21cnbao@gmail.com>
> Signed-off-by: Barry Song <21cnbao@gmail.com>
> Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> Reference: Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd - Barry Song
>            https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
> ---
>  include/linux/mmzone.h |  6 ++++
>  mm/mm_init.c           |  1 +
>  mm/page_io.c           | 71 ++++++++++++++++++++++++++++++++++++++++++
>  mm/swap.h              |  6 ++++
>  mm/vmscan.c            | 25 +++++++++++++++
>  5 files changed, 109 insertions(+)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 6ccec1bf2896..93c9195a54ae 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -23,6 +23,7 @@
>  #include <linux/page-flags.h>
>  #include <linux/local_lock.h>
>  #include <linux/zswap.h>
> +#include <linux/kfifo.h>
>  #include <asm/page.h>
>  
>  /* Free memory management - zoned buddy allocator.  */
> @@ -1398,6 +1399,11 @@ typedef struct pglist_data {
>  
>  	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
>  
> +#define KCOMPRESS_FIFO_SIZE 256
> +	wait_queue_head_t kcompressd_wait;
> +	struct task_struct *kcompressd;
> +	struct kfifo kcompress_fifo;

The way you implemented this adds time-and-space overhead even on
systems that don't have any sort of swap compression enabled.

That seems unnecessary. There is an existing method for asynchronous
writeback, and pageout() is naturally fully set up to handle this.

IMO the better way to do this is to make zswap_store() (and
zram_bio_write()?) asynchronous. Make those functions queue the work
and wake the compression daemon, and then have the daemon call
folio_end_writeback() / bio_endio() when it's done with it.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-05-01 14:02 ` Johannes Weiner
@ 2025-05-01 15:12   ` Nhat Pham
  2025-06-16  3:41     ` Barry Song
  2025-05-02  9:16   ` Qun-wei Lin (林群崴)
  1 sibling, 1 reply; 20+ messages in thread
From: Nhat Pham @ 2025-05-01 15:12 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Qun-Wei Lin, Andrew Morton, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Sergey Senozhatsky, Minchan Kim,
	linux-mm, linux-kernel, linux-arm-kernel, linux-mediatek,
	Casper Li, Chinwen Chang, Andrew Yang, James Hsu, Barry Song

On Thu, May 1, 2025 at 7:02 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
>
> The way you implemented this adds time-and-space overhead even on
> systems that don't have any sort of swap compression enabled.
>
> That seems unnecessary. There is an existing method for asynchronous
> writeback, and pageout() is naturally fully set up to handle this.
>
> IMO the better way to do this is to make zswap_store() (and
> zram_bio_write()?) asynchronous. Make those functions queue the work
> and wake the compression daemon, and then have the daemon call
> folio_end_writeback() / bio_endio() when it's done with it.

+1.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-04-30  8:26 [PATCH] mm: Add Kcompressd for accelerated memory compression Qun-Wei Lin
                   ` (3 preceding siblings ...)
  2025-05-01 14:02 ` Johannes Weiner
@ 2025-05-01 15:50 ` Nhat Pham
  2025-05-07  1:12 ` Harry Yoo
  5 siblings, 0 replies; 20+ messages in thread
From: Nhat Pham @ 2025-05-01 15:50 UTC (permalink / raw)
  To: Qun-Wei Lin
  Cc: Andrew Morton, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Sergey Senozhatsky, Minchan Kim,
	linux-mm, linux-kernel, linux-arm-kernel, linux-mediatek,
	Casper Li, Chinwen Chang, Andrew Yang, James Hsu, Barry Song,
	Johannes Weiner, Yosry Ahmed, Chengming Zhou, Shakeel Butt,
	Kairui Song, Joshua Hahn

On Wed, Apr 30, 2025 at 1:27 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
>
> This patch series introduces a new mechanism called kcompressd to
> improve the efficiency of memory reclaiming in the operating system.
>
> Problem:
>   In the current system, the kswapd thread is responsible for both scanning
>   the LRU pages and handling memory compression tasks (such as those
>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
>   to significant performance bottlenecks, especially under high memory
>   pressure. The kswapd thread becomes a single point of contention, causing
>   delays in memory reclaiming and overall system performance degradation.
>
> Solution:
>   Introduced kcompressd to handle asynchronous compression during memory
>   reclaim, improving efficiency by offloading compression tasks from
>   kswapd. This allows kswapd to focus on its primary task of page reclaim
>   without being burdened by the additional overhead of compression.
>
> In our handheld devices, we found that applying this mechanism under high
> memory pressure scenarios can increase the rate of pgsteal_anon per second
> by over 260% compared to the situation with only kswapd. Additionally, we
> observed a reduction of over 50% in page allocation stall occurrences,
> further demonstrating the effectiveness of kcompressd in alleviating memory
> pressure and improving system responsiveness.
>

Oh btw, testing this on a simple kernel building task triggers this:

[  133.349908] WARNING: CPU: 0 PID: 50 at mm/memcontrol.c:5330
obj_cgroup_charge_zswap+0x22e/0x250
[  133.350505] Modules linked in: virtio_net pata_acpi net_failover
failover virtio_rng rng_core ata_piix libata scsi_mod scsi_common
[  133.351366] CPU: 0 UID: 0 PID: 50 Comm: kcompressd0 Not tainted
6.14.0-ge65b549702a5 #218
[  133.351940] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[  133.352717] RIP: 0010:obj_cgroup_charge_zswap+0x22e/0x250
[  133.353118] Code: d2 ff 85 c0 0f 85 7a fe ff ff be ff ff ff ff 48
c7 c7 88 da f1 91 e8 a1 b4 a3 00 85 c0 0f 85 61 fe ff ff 0f 0b e9 5a
fe ff ff <0f> 0b e9 f5 fd ff ff e8 36 ae a3 00 e9 78 fe ff ff e8 2c ae
a3 00
[  133.354372] RSP: 0018:ffff9f99803bbc00 EFLAGS: 00010246
[  133.354782] RAX: ffff970f42a9a900 RBX: 000000000000013e RCX: 0000000000000002
[  133.355269] RDX: 0000000000000000 RSI: 000000000000013e RDI: ffff970f475eab40
[  133.355774] RBP: ffff970f475eab40 R08: 0000000000000000 R09: 0000000000000000
[  133.356269] R10: ffffffff90a21205 R11: ffffffff90a211ab R12: ffffffff90a21205
[  133.356782] R13: ffffc4984041ff40 R14: ffff970f42e66000 R15: 000000000000013e
[  133.357279] FS:  0000000000000000(0000) GS:ffff970fbdc00000(0000)
knlGS:0000000000000000
[  133.357807] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  133.358186] CR2: 00007f33950c5030 CR3: 00000000038ea000 CR4: 00000000000006f0
[  133.358656] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  133.359121] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  133.359597] Call Trace:
[  133.359767]  <TASK>
[  133.359914]  ? __warn+0x94/0x190
[  133.360136]  ? obj_cgroup_charge_zswap+0x22e/0x250
[  133.360476]  ? report_bug+0x168/0x170
[  133.360742]  ? handle_bug+0x53/0x90
[  133.360982]  ? exc_invalid_op+0x18/0x70
[  133.361240]  ? asm_exc_invalid_op+0x1a/0x20
[  133.361536]  ? zswap_store+0x755/0xf80
[  133.361798]  ? zswap_store+0x6fb/0xf80
[  133.362071]  ? zswap_store+0x755/0xf80
[  133.362338]  ? obj_cgroup_charge_zswap+0x22e/0x250
[  133.362661]  ? zswap_store+0x755/0xf80
[  133.362943]  zswap_store+0x7e7/0xf80
[  133.363203]  ? __pfx_kcompressd+0x10/0x10
[  133.363472]  kcompressd+0xb1/0x180
[  133.363724]  ? __pfx_autoremove_wake_function+0x10/0x10
[  133.364082]  kthread+0xef/0x230
[  133.364298]  ? __pfx_kthread+0x10/0x10
[  133.364548]  ret_from_fork+0x34/0x50
[  133.364810]  ? __pfx_kthread+0x10/0x10
[  133.365063]  ret_from_fork_asm+0x1a/0x30
[  133.365321]  </TASK>
[  133.365471] irq event stamp: 18
[  133.365680] hardirqs last  enabled at (17): [<ffffffff914bd0ef>]
_raw_spin_unlock_irqrestore+0x4f/0x60
[  133.366289] hardirqs last disabled at (18): [<ffffffff914b2031>]
__schedule+0x6b1/0xe80
[  133.366824] softirqs last  enabled at (0): [<ffffffff906b1caf>]
copy_process+0x9af/0x2b50
[  133.367366] softirqs last disabled at (0): [<0000000000000000>] 0x0
[  133.367844] ---[ end trace 0000000000000000 ]---

Seems like we're trigger this warning in the zswap cgroup check (see
obj_cgroup_may_zswap() in mm/memcontrol.c for more details):

VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));

Might wanna fix this...


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-05-01 14:02 ` Johannes Weiner
  2025-05-01 15:12   ` Nhat Pham
@ 2025-05-02  9:16   ` Qun-wei Lin (林群崴)
  1 sibling, 0 replies; 20+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-05-02  9:16 UTC (permalink / raw)
  To: hannes@cmpxchg.org
  Cc: Andrew Yang (楊智強), rppt@kernel.org,
	nphamcs@gmail.com, 21cnbao@gmail.com,
	James Hsu (徐慶薰), AngeloGioacchino Del Regno,
	akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	linux-mediatek@lists.infradead.org, linux-mm@kvack.org,
	Chinwen Chang (張錦文),
	Casper Li (李中榮), minchan@kernel.org,
	linux-arm-kernel@lists.infradead.org, matthias.bgg@gmail.com,
	senozhatsky@chromium.org

On Thu, 2025-05-01 at 10:02 -0400, Johannes Weiner wrote:


> External email : Please do not click links or open attachments until
you have verified the sender or the content.
> 
> 
> On Wed, Apr 30, 2025 at 04:26:41PM +0800, Qun-Wei Lin wrote:
> 
> > This patch series introduces a new mechanism called kcompressd to
> > improve the efficiency of memory reclaiming in the operating
system.
> > 
> > Problem:
> >   In the current system, the kswapd thread is responsible for both
scanning
> >   the LRU pages and handling memory compression tasks (such as
those
> >   involving ZSWAP/ZRAM, if enabled). This combined responsibility
can lead
> >   to significant performance bottlenecks, especially under high
memory
> >   pressure. The kswapd thread becomes a single point of contention,
causing
> >   delays in memory reclaiming and overall system performance
degradation.
> > 
> > Solution:
> >   Introduced kcompressd to handle asynchronous compression during
memory
> >   reclaim, improving efficiency by offloading compression tasks
from
> >   kswapd. This allows kswapd to focus on its primary task of page
reclaim
> >   without being burdened by the additional overhead of compression.
> > 
> > In our handheld devices, we found that applying this mechanism
under high
> > memory pressure scenarios can increase the rate of pgsteal_anon per
second
> > by over 260% compared to the situation with only kswapd.
Additionally, we
> > observed a reduction of over 50% in page allocation stall
occurrences,
> > further demonstrating the effectiveness of kcompressd in
alleviating memory
> > pressure and improving system responsiveness.
> 
> 
> Yes, I think parallelizing this work makes a lot of sense.
> 
> 
> > Co-developed-by: Barry Song
<[21cnbao@gmail.com](mailto:21cnbao@gmail.com)>
> > Signed-off-by: Barry Song
<[21cnbao@gmail.com](mailto:21cnbao@gmail.com)>
> > Signed-off-by: Qun-Wei Lin
<[qun-wei.lin@mediatek.com](mailto:qun-wei.lin@mediatek.com)>
> > Reference: Re: [PATCH 0/2] Improve Zram by separating compression
context from kswapd - Barry Song
> >           
[https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/](https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/)
> > ---
> >  include/linux/mmzone.h |  6 ++++
> >  mm/mm_init.c           |  1 +
> >  mm/page_io.c           | 71
++++++++++++++++++++++++++++++++++++++++++
> >  mm/swap.h              |  6 ++++
> >  mm/vmscan.c            | 25 +++++++++++++++
> >  5 files changed, 109 insertions(+)
> > 
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 6ccec1bf2896..93c9195a54ae 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -23,6 +23,7 @@
> >  #include <linux/page-flags.h>
> >  #include <linux/local_lock.h>
> >  #include <linux/zswap.h>
> > +#include <linux/kfifo.h>
> >  #include <asm/page.h>
> > 
> >  /* Free memory management - zoned buddy allocator.  */
> > @@ -1398,6 +1399,11 @@ typedef struct pglist_data {
> > 
> >       int kswapd_failures;            /* Number of 'reclaimed == 0'
runs */
> > 
> > +#define KCOMPRESS_FIFO_SIZE 256
> > +     wait_queue_head_t kcompressd_wait;
> > +     struct task_struct *kcompressd;
> > +     struct kfifo kcompress_fifo;
> 
> 
> The way you implemented this adds time-and-space overhead even on
> systems that don't have any sort of swap compression enabled.
>


To address the overhead concern, perhaps we can embed only a single
kcompressd pointer within pglist_data and perform lazy initialization
only when a zram device is added or zswap is enabled.


> That seems unnecessary. There is an existing method for asynchronous
> writeback, and pageout() is naturally fully set up to handle this.
> 
> IMO the better way to do this is to make zswap_store() (and
> zram_bio_write()?) asynchronous. Make those functions queue the work
> and wake the compression daemon, and then have the daemon call
> folio_end_writeback() / bio_endio() when it's done with it.



Perhaps we could add an enqueue/wake-upkcompressd interface and call it
within zswap_store() and zram_bio_write(). This would leverage the
existing obj_cgroup_may_zswap() check in zswap_store(), it solved the
problem that zswap is re-compressed too soon. as mentioned by Nhat.

In outline:

1. Per-node pointer in pglist_data:  

   typedef struct pglist_data {  
   ...  
   struct kcompressd_node *kcompressd;  
   ...  
   }

2. Global register/unregister hooks:  

   kcompressd_register_backend(): Register a new backend (zram/zswap).
   Initialize the kcompressd structure and kfifo if this is the first 
   call.
   
   kcompressd_unregister_backend(): Unregister a backend (zram/zswap).
   Use a per-node refcount and bitmap to track how manyzswap/zram   
   instances are active. If the last backend is unregistered, free   
   the kcompressd resources.

> > A net loss is possible, but kswapd can sometimes enter sleep
> > contexts,
> > allowing the parallel kcompressd thread to continue compression.
> > This could actually be a win. But I agree that additional testing
on
> > single-CPU machines may be necessary.
> 
> It could be disabled by the following if we discover any regression
> on
> single-CPU machines?
> 
> if (num_online_cpus() == 1)
>      return false;
>

   We can add this check in the register/unregister function.

3. Enqueue API:  

   kcompressd_enqueue_folio(folio) /kcompressd_enqueue_bio(bio): Push a
   job to the kcompressd’s FIFO and wake up the kcompressd daemon.

With this approach, there is zero runtime cost on nodes when no backend
is active and only one allocation per node.


Thank you for your feedback!  
Please let me know what you think.

Best Regards,  
Qun-wei




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-04-30  8:26 [PATCH] mm: Add Kcompressd for accelerated memory compression Qun-Wei Lin
                   ` (4 preceding siblings ...)
  2025-05-01 15:50 ` Nhat Pham
@ 2025-05-07  1:12 ` Harry Yoo
  2025-05-07  1:50   ` Zi Yan
  5 siblings, 1 reply; 20+ messages in thread
From: Harry Yoo @ 2025-05-07  1:12 UTC (permalink / raw)
  To: Qun-Wei Lin
  Cc: Andrew Morton, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Nhat Pham, Sergey Senozhatsky,
	Minchan Kim, linux-mm, linux-kernel, linux-arm-kernel,
	linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu,
	Barry Song, Zi Yan

On Wed, Apr 30, 2025 at 04:26:41PM +0800, Qun-Wei Lin wrote:
> This patch series introduces a new mechanism called kcompressd to
> improve the efficiency of memory reclaiming in the operating system.
> 
> Problem:
>   In the current system, the kswapd thread is responsible for both scanning
>   the LRU pages and handling memory compression tasks (such as those
>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
>   to significant performance bottlenecks, especially under high memory
>   pressure. The kswapd thread becomes a single point of contention, causing
>   delays in memory reclaiming and overall system performance degradation.
> 
> Solution:
>   Introduced kcompressd to handle asynchronous compression during memory
>   reclaim, improving efficiency by offloading compression tasks from
>   kswapd. This allows kswapd to focus on its primary task of page reclaim
>   without being burdened by the additional overhead of compression.
> 
> In our handheld devices, we found that applying this mechanism under high
> memory pressure scenarios can increase the rate of pgsteal_anon per second
> by over 260% compared to the situation with only kswapd. Additionally, we
> observed a reduction of over 50% in page allocation stall occurrences,
> further demonstrating the effectiveness of kcompressd in alleviating memory
> pressure and improving system responsiveness.
> 
> Co-developed-by: Barry Song <21cnbao@gmail.com>
> Signed-off-by: Barry Song <21cnbao@gmail.com>
> Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> Reference: Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd - Barry Song
>            https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
> ---

+Cc Zi Yan, who might be interested in writing a framework (or improving
the existing one, padata) for parallelizing jobs (e.g. migration/compression)

>  include/linux/mmzone.h |  6 ++++
>  mm/mm_init.c           |  1 +
>  mm/page_io.c           | 71 ++++++++++++++++++++++++++++++++++++++++++
>  mm/swap.h              |  6 ++++
>  mm/vmscan.c            | 25 +++++++++++++++
>  5 files changed, 109 insertions(+)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 6ccec1bf2896..93c9195a54ae 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -23,6 +23,7 @@
>  #include <linux/page-flags.h>
>  #include <linux/local_lock.h>
>  #include <linux/zswap.h>
> +#include <linux/kfifo.h>
>  #include <asm/page.h>
>  
>  /* Free memory management - zoned buddy allocator.  */
> @@ -1398,6 +1399,11 @@ typedef struct pglist_data {
>  
>  	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
>  
> +#define KCOMPRESS_FIFO_SIZE 256
> +	wait_queue_head_t kcompressd_wait;
> +	struct task_struct *kcompressd;
> +	struct kfifo kcompress_fifo;
> +
>  #ifdef CONFIG_COMPACTION
>  	int kcompactd_max_order;
>  	enum zone_type kcompactd_highest_zoneidx;
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 9659689b8ace..49bae1dd4584 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1410,6 +1410,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>  	pgdat_init_kcompactd(pgdat);
>  
>  	init_waitqueue_head(&pgdat->kswapd_wait);
> +	init_waitqueue_head(&pgdat->kcompressd_wait);
>  	init_waitqueue_head(&pgdat->pfmemalloc_wait);
>  
>  	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 4bce19df557b..d85deb494a6a 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -233,6 +233,38 @@ static void swap_zeromap_folio_clear(struct folio *folio)
>  	}
>  }
>  
> +static bool swap_sched_async_compress(struct folio *folio)
> +{
> +	struct swap_info_struct *sis = swp_swap_info(folio->swap);
> +	int nid = numa_node_id();
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +
> +	if (unlikely(!pgdat->kcompressd))
> +		return false;
> +
> +	if (!current_is_kswapd())
> +		return false;
> +
> +	if (!folio_test_anon(folio))
> +		return false;
> +	/*
> +	 * This case needs to synchronously return AOP_WRITEPAGE_ACTIVATE
> +	 */
> +	if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio)))
> +		return false;
> +
> +	sis = swp_swap_info(folio->swap);
> +	if (zswap_is_enabled() || data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
> +		if (kfifo_avail(&pgdat->kcompress_fifo) >= sizeof(folio) &&
> +			kfifo_in(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
> +			wake_up_interruptible(&pgdat->kcompressd_wait);
> +			return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
>  /*
>   * We may have stale swap cache pages in memory: notice
>   * them here and get rid of the unnecessary final write.
> @@ -275,6 +307,15 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>  		 */
>  		swap_zeromap_folio_clear(folio);
>  	}
> +
> +	/*
> +	 * Compression within zswap and zram might block rmap, unmap
> +	 * of both file and anon pages, try to do compression async
> +	 * if possible
> +	 */
> +	if (swap_sched_async_compress(folio))
> +		return 0;
> +
>  	if (zswap_store(folio)) {
>  		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
>  		folio_unlock(folio);
> @@ -289,6 +330,36 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>  	return 0;
>  }
>  
> +int kcompressd(void *p)
> +{
> +	pg_data_t *pgdat = (pg_data_t *)p;
> +	struct folio *folio;
> +	struct writeback_control wbc = {
> +		.sync_mode = WB_SYNC_NONE,
> +		.nr_to_write = SWAP_CLUSTER_MAX,
> +		.range_start = 0,
> +		.range_end = LLONG_MAX,
> +		.for_reclaim = 1,
> +	};
> +
> +	while (!kthread_should_stop()) {
> +		wait_event_interruptible(pgdat->kcompressd_wait,
> +				!kfifo_is_empty(&pgdat->kcompress_fifo));
> +
> +		while (!kfifo_is_empty(&pgdat->kcompress_fifo)) {
> +			if (kfifo_out(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
> +				if (zswap_store(folio)) {
> +					count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
> +					folio_unlock(folio);
> +					continue;
> +				}
> +				__swap_writepage(folio, &wbc);
> +			}
> +		}
> +	}
> +	return 0;
> +}
> +
>  static inline void count_swpout_vm_event(struct folio *folio)
>  {
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> diff --git a/mm/swap.h b/mm/swap.h
> index 6f4a3f927edb..3579da413dc2 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -22,6 +22,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
>  void swap_write_unplug(struct swap_iocb *sio);
>  int swap_writepage(struct page *page, struct writeback_control *wbc);
>  void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
> +int kcompressd(void *p);
>  
>  /* linux/mm/swap_state.c */
>  /* One swap address space for each 64M swap space */
> @@ -199,6 +200,11 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
>  	return 0;
>  }
>  
> +static inline int kcompressd(void *p)
> +{
> +	return 0;
> +}
> +
>  #endif /* CONFIG_SWAP */
>  
>  #endif /* _MM_SWAP_H */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 3783e45bfc92..2d7b9167bfd6 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -7420,6 +7420,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>  void __meminit kswapd_run(int nid)
>  {
>  	pg_data_t *pgdat = NODE_DATA(nid);
> +	int ret;
>  
>  	pgdat_kswapd_lock(pgdat);
>  	if (!pgdat->kswapd) {
> @@ -7433,7 +7434,26 @@ void __meminit kswapd_run(int nid)
>  		} else {
>  			wake_up_process(pgdat->kswapd);
>  		}
> +		ret = kfifo_alloc(&pgdat->kcompress_fifo,
> +				KCOMPRESS_FIFO_SIZE * sizeof(struct folio *),
> +				GFP_KERNEL);
> +		if (ret) {
> +			pr_err("%s: fail to kfifo_alloc\n", __func__);
> +			goto out;
> +		}
> +
> +		pgdat->kcompressd = kthread_create_on_node(kcompressd, pgdat, nid,
> +				"kcompressd%d", nid);
> +		if (IS_ERR(pgdat->kcompressd)) {
> +			pr_err("Failed to start kcompressd on node %d，ret=%ld\n",
> +					nid, PTR_ERR(pgdat->kcompressd));
> +			pgdat->kcompressd = NULL;
> +			kfifo_free(&pgdat->kcompress_fifo);
> +		} else {
> +			wake_up_process(pgdat->kcompressd);
> +		}
>  	}
> +out:
>  	pgdat_kswapd_unlock(pgdat);
>  }
>  
> @@ -7452,6 +7472,11 @@ void __meminit kswapd_stop(int nid)
>  		kthread_stop(kswapd);
>  		pgdat->kswapd = NULL;
>  	}
> +	if (pgdat->kcompressd) {
> +		kthread_stop(pgdat->kcompressd);
> +		pgdat->kcompressd = NULL;
> +		kfifo_free(&pgdat->kcompress_fifo);
> +	}
>  	pgdat_kswapd_unlock(pgdat);
>  }
>  
> -- 
> 2.45.2
> 
> 

-- 
Cheers,
Harry / Hyeonggon


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-05-07  1:12 ` Harry Yoo
@ 2025-05-07  1:50   ` Zi Yan
  2025-05-07  2:04     ` Barry Song
  0 siblings, 1 reply; 20+ messages in thread
From: Zi Yan @ 2025-05-07  1:50 UTC (permalink / raw)
  To: Harry Yoo
  Cc: Qun-Wei Lin, Andrew Morton, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Nhat Pham, Sergey Senozhatsky,
	Minchan Kim, linux-mm, linux-kernel, linux-arm-kernel,
	linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu,
	Barry Song

On 6 May 2025, at 21:12, Harry Yoo wrote:

> On Wed, Apr 30, 2025 at 04:26:41PM +0800, Qun-Wei Lin wrote:
>> This patch series introduces a new mechanism called kcompressd to
>> improve the efficiency of memory reclaiming in the operating system.
>>
>> Problem:
>>   In the current system, the kswapd thread is responsible for both scanning
>>   the LRU pages and handling memory compression tasks (such as those
>>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
>>   to significant performance bottlenecks, especially under high memory
>>   pressure. The kswapd thread becomes a single point of contention, causing
>>   delays in memory reclaiming and overall system performance degradation.
>>
>> Solution:
>>   Introduced kcompressd to handle asynchronous compression during memory
>>   reclaim, improving efficiency by offloading compression tasks from
>>   kswapd. This allows kswapd to focus on its primary task of page reclaim
>>   without being burdened by the additional overhead of compression.
>>
>> In our handheld devices, we found that applying this mechanism under high
>> memory pressure scenarios can increase the rate of pgsteal_anon per second
>> by over 260% compared to the situation with only kswapd. Additionally, we
>> observed a reduction of over 50% in page allocation stall occurrences,
>> further demonstrating the effectiveness of kcompressd in alleviating memory
>> pressure and improving system responsiveness.
>>
>> Co-developed-by: Barry Song <21cnbao@gmail.com>
>> Signed-off-by: Barry Song <21cnbao@gmail.com>
>> Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
>> Reference: Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd - Barry Song
>>           https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
>> ---
>
> +Cc Zi Yan, who might be interested in writing a framework (or improving
> the existing one, padata) for parallelizing jobs (e.g. migration/compression)

Thanks.

I am currently looking into padata [1] to perform multithreaded page migration
copy job. But based on this patch, it seems that kcompressed is just an additional
kernel thread of executing zswap_store(). Is there any need for performing
compression with multiple threads?

BTW, I also notice that zswap IAA compress batching patchset[2] is using
hardware accelerator (Intel Analytics Accelerator) to speed up zswap.
I wonder if the handheld devices have similar hardware to get a similar benefit.


[1] https://docs.kernel.org/core-api/padata.html
[2] https://lore.kernel.org/linux-crypto/20250303084724.6490-1-kanchana.p.sridhar@intel.com/
>
>>  include/linux/mmzone.h |  6 ++++
>>  mm/mm_init.c           |  1 +
>>  mm/page_io.c           | 71 ++++++++++++++++++++++++++++++++++++++++++
>>  mm/swap.h              |  6 ++++
>>  mm/vmscan.c            | 25 +++++++++++++++
>>  5 files changed, 109 insertions(+)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 6ccec1bf2896..93c9195a54ae 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -23,6 +23,7 @@
>>  #include <linux/page-flags.h>
>>  #include <linux/local_lock.h>
>>  #include <linux/zswap.h>
>> +#include <linux/kfifo.h>
>>  #include <asm/page.h>
>>
>>  /* Free memory management - zoned buddy allocator.  */
>> @@ -1398,6 +1399,11 @@ typedef struct pglist_data {
>>
>>  	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
>>
>> +#define KCOMPRESS_FIFO_SIZE 256
>> +	wait_queue_head_t kcompressd_wait;
>> +	struct task_struct *kcompressd;
>> +	struct kfifo kcompress_fifo;
>> +
>>  #ifdef CONFIG_COMPACTION
>>  	int kcompactd_max_order;
>>  	enum zone_type kcompactd_highest_zoneidx;
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index 9659689b8ace..49bae1dd4584 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -1410,6 +1410,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>>  	pgdat_init_kcompactd(pgdat);
>>
>>  	init_waitqueue_head(&pgdat->kswapd_wait);
>> +	init_waitqueue_head(&pgdat->kcompressd_wait);
>>  	init_waitqueue_head(&pgdat->pfmemalloc_wait);
>>
>>  	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
>> diff --git a/mm/page_io.c b/mm/page_io.c
>> index 4bce19df557b..d85deb494a6a 100644
>> --- a/mm/page_io.c
>> +++ b/mm/page_io.c
>> @@ -233,6 +233,38 @@ static void swap_zeromap_folio_clear(struct folio *folio)
>>  	}
>>  }
>>
>> +static bool swap_sched_async_compress(struct folio *folio)
>> +{
>> +	struct swap_info_struct *sis = swp_swap_info(folio->swap);
>> +	int nid = numa_node_id();
>> +	pg_data_t *pgdat = NODE_DATA(nid);
>> +
>> +	if (unlikely(!pgdat->kcompressd))
>> +		return false;
>> +
>> +	if (!current_is_kswapd())
>> +		return false;
>> +
>> +	if (!folio_test_anon(folio))
>> +		return false;
>> +	/*
>> +	 * This case needs to synchronously return AOP_WRITEPAGE_ACTIVATE
>> +	 */
>> +	if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio)))
>> +		return false;
>> +
>> +	sis = swp_swap_info(folio->swap);
>> +	if (zswap_is_enabled() || data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
>> +		if (kfifo_avail(&pgdat->kcompress_fifo) >= sizeof(folio) &&
>> +			kfifo_in(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
>> +			wake_up_interruptible(&pgdat->kcompressd_wait);
>> +			return true;
>> +		}
>> +	}
>> +
>> +	return false;
>> +}
>> +
>>  /*
>>   * We may have stale swap cache pages in memory: notice
>>   * them here and get rid of the unnecessary final write.
>> @@ -275,6 +307,15 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>>  		 */
>>  		swap_zeromap_folio_clear(folio);
>>  	}
>> +
>> +	/*
>> +	 * Compression within zswap and zram might block rmap, unmap
>> +	 * of both file and anon pages, try to do compression async
>> +	 * if possible
>> +	 */
>> +	if (swap_sched_async_compress(folio))
>> +		return 0;
>> +
>>  	if (zswap_store(folio)) {
>>  		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
>>  		folio_unlock(folio);
>> @@ -289,6 +330,36 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>>  	return 0;
>>  }
>>
>> +int kcompressd(void *p)
>> +{
>> +	pg_data_t *pgdat = (pg_data_t *)p;
>> +	struct folio *folio;
>> +	struct writeback_control wbc = {
>> +		.sync_mode = WB_SYNC_NONE,
>> +		.nr_to_write = SWAP_CLUSTER_MAX,
>> +		.range_start = 0,
>> +		.range_end = LLONG_MAX,
>> +		.for_reclaim = 1,
>> +	};
>> +
>> +	while (!kthread_should_stop()) {
>> +		wait_event_interruptible(pgdat->kcompressd_wait,
>> +				!kfifo_is_empty(&pgdat->kcompress_fifo));
>> +
>> +		while (!kfifo_is_empty(&pgdat->kcompress_fifo)) {
>> +			if (kfifo_out(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
>> +				if (zswap_store(folio)) {
>> +					count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
>> +					folio_unlock(folio);
>> +					continue;
>> +				}
>> +				__swap_writepage(folio, &wbc);
>> +			}
>> +		}
>> +	}
>> +	return 0;
>> +}
>> +
>>  static inline void count_swpout_vm_event(struct folio *folio)
>>  {
>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> diff --git a/mm/swap.h b/mm/swap.h
>> index 6f4a3f927edb..3579da413dc2 100644
>> --- a/mm/swap.h
>> +++ b/mm/swap.h
>> @@ -22,6 +22,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
>>  void swap_write_unplug(struct swap_iocb *sio);
>>  int swap_writepage(struct page *page, struct writeback_control *wbc);
>>  void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
>> +int kcompressd(void *p);
>>
>>  /* linux/mm/swap_state.c */
>>  /* One swap address space for each 64M swap space */
>> @@ -199,6 +200,11 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
>>  	return 0;
>>  }
>>
>> +static inline int kcompressd(void *p)
>> +{
>> +	return 0;
>> +}
>> +
>>  #endif /* CONFIG_SWAP */
>>
>>  #endif /* _MM_SWAP_H */
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 3783e45bfc92..2d7b9167bfd6 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -7420,6 +7420,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>>  void __meminit kswapd_run(int nid)
>>  {
>>  	pg_data_t *pgdat = NODE_DATA(nid);
>> +	int ret;
>>
>>  	pgdat_kswapd_lock(pgdat);
>>  	if (!pgdat->kswapd) {
>> @@ -7433,7 +7434,26 @@ void __meminit kswapd_run(int nid)
>>  		} else {
>>  			wake_up_process(pgdat->kswapd);
>>  		}
>> +		ret = kfifo_alloc(&pgdat->kcompress_fifo,
>> +				KCOMPRESS_FIFO_SIZE * sizeof(struct folio *),
>> +				GFP_KERNEL);
>> +		if (ret) {
>> +			pr_err("%s: fail to kfifo_alloc\n", __func__);
>> +			goto out;
>> +		}
>> +
>> +		pgdat->kcompressd = kthread_create_on_node(kcompressd, pgdat, nid,
>> +				"kcompressd%d", nid);
>> +		if (IS_ERR(pgdat->kcompressd)) {
>> +			pr_err("Failed to start kcompressd on node %d，ret=%ld\n",
>> +					nid, PTR_ERR(pgdat->kcompressd));
>> +			pgdat->kcompressd = NULL;
>> +			kfifo_free(&pgdat->kcompress_fifo);
>> +		} else {
>> +			wake_up_process(pgdat->kcompressd);
>> +		}
>>  	}
>> +out:
>>  	pgdat_kswapd_unlock(pgdat);
>>  }
>>
>> @@ -7452,6 +7472,11 @@ void __meminit kswapd_stop(int nid)
>>  		kthread_stop(kswapd);
>>  		pgdat->kswapd = NULL;
>>  	}
>> +	if (pgdat->kcompressd) {
>> +		kthread_stop(pgdat->kcompressd);
>> +		pgdat->kcompressd = NULL;
>> +		kfifo_free(&pgdat->kcompress_fifo);
>> +	}
>>  	pgdat_kswapd_unlock(pgdat);
>>  }
>>
>> -- 
>> 2.45.2
>>
>>
>
> -- 
> Cheers,
> Harry / Hyeonggon


--
Best Regards,
Yan, Zi


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-05-07  1:50   ` Zi Yan
@ 2025-05-07  2:04     ` Barry Song
  2025-05-07 15:00       ` Nhat Pham
  0 siblings, 1 reply; 20+ messages in thread
From: Barry Song @ 2025-05-07  2:04 UTC (permalink / raw)
  To: Zi Yan
  Cc: Harry Yoo, Qun-Wei Lin, Andrew Morton, Mike Rapoport,
	Matthias Brugger, AngeloGioacchino Del Regno, Nhat Pham,
	Sergey Senozhatsky, Minchan Kim, linux-mm, linux-kernel,
	linux-arm-kernel, linux-mediatek, Casper Li, Chinwen Chang,
	Andrew Yang, James Hsu

On Wed, May 7, 2025 at 1:50 PM Zi Yan <ziy@nvidia.com> wrote:
>
> On 6 May 2025, at 21:12, Harry Yoo wrote:
>
> > On Wed, Apr 30, 2025 at 04:26:41PM +0800, Qun-Wei Lin wrote:
> >> This patch series introduces a new mechanism called kcompressd to
> >> improve the efficiency of memory reclaiming in the operating system.
> >>
> >> Problem:
> >>   In the current system, the kswapd thread is responsible for both scanning
> >>   the LRU pages and handling memory compression tasks (such as those
> >>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
> >>   to significant performance bottlenecks, especially under high memory
> >>   pressure. The kswapd thread becomes a single point of contention, causing
> >>   delays in memory reclaiming and overall system performance degradation.
> >>
> >> Solution:
> >>   Introduced kcompressd to handle asynchronous compression during memory
> >>   reclaim, improving efficiency by offloading compression tasks from
> >>   kswapd. This allows kswapd to focus on its primary task of page reclaim
> >>   without being burdened by the additional overhead of compression.
> >>
> >> In our handheld devices, we found that applying this mechanism under high
> >> memory pressure scenarios can increase the rate of pgsteal_anon per second
> >> by over 260% compared to the situation with only kswapd. Additionally, we
> >> observed a reduction of over 50% in page allocation stall occurrences,
> >> further demonstrating the effectiveness of kcompressd in alleviating memory
> >> pressure and improving system responsiveness.
> >>
> >> Co-developed-by: Barry Song <21cnbao@gmail.com>
> >> Signed-off-by: Barry Song <21cnbao@gmail.com>
> >> Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> >> Reference: Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd - Barry Song
> >>           https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
> >> ---
> >
> > +Cc Zi Yan, who might be interested in writing a framework (or improving
> > the existing one, padata) for parallelizing jobs (e.g. migration/compression)
>
> Thanks.
>
> I am currently looking into padata [1] to perform multithreaded page migration
> copy job. But based on this patch, it seems that kcompressed is just an additional
> kernel thread of executing zswap_store(). Is there any need for performing
> compression with multiple threads?

The current focus is on enabling kswapd to perform asynchronous compression,
which can significantly reduce direct reclaim and allocstall events.
Therefore, the work begins with supporting a single thread. Supporting
multiple threads might be possible in the future, but it could be difficult
to control—especially on busy phones—since it consumes more power and may
interfere with other threads impacting user experience.

>
> BTW, I also notice that zswap IAA compress batching patchset[2] is using
> hardware accelerator (Intel Analytics Accelerator) to speed up zswap.
> I wonder if the handheld devices have similar hardware to get a similar benefit.

Usually, the answer is no. We use zRAM and CPU, but this patch aims to provide
a common capability that can be shared by both zRAM and zswap.

>
>
> [1] https://docs.kernel.org/core-api/padata.html
> [2] https://lore.kernel.org/linux-crypto/20250303084724.6490-1-kanchana.p.sridhar@intel.com/
> >
> >>  include/linux/mmzone.h |  6 ++++
> >>  mm/mm_init.c           |  1 +
> >>  mm/page_io.c           | 71 ++++++++++++++++++++++++++++++++++++++++++
> >>  mm/swap.h              |  6 ++++
> >>  mm/vmscan.c            | 25 +++++++++++++++
> >>  5 files changed, 109 insertions(+)
> >>
> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >> index 6ccec1bf2896..93c9195a54ae 100644
> >> --- a/include/linux/mmzone.h
> >> +++ b/include/linux/mmzone.h
> >> @@ -23,6 +23,7 @@
> >>  #include <linux/page-flags.h>
> >>  #include <linux/local_lock.h>
> >>  #include <linux/zswap.h>
> >> +#include <linux/kfifo.h>
> >>  #include <asm/page.h>
> >>
> >>  /* Free memory management - zoned buddy allocator.  */
> >> @@ -1398,6 +1399,11 @@ typedef struct pglist_data {
> >>
> >>      int kswapd_failures;            /* Number of 'reclaimed == 0' runs */
> >>
> >> +#define KCOMPRESS_FIFO_SIZE 256
> >> +    wait_queue_head_t kcompressd_wait;
> >> +    struct task_struct *kcompressd;
> >> +    struct kfifo kcompress_fifo;
> >> +
> >>  #ifdef CONFIG_COMPACTION
> >>      int kcompactd_max_order;
> >>      enum zone_type kcompactd_highest_zoneidx;
> >> diff --git a/mm/mm_init.c b/mm/mm_init.c
> >> index 9659689b8ace..49bae1dd4584 100644
> >> --- a/mm/mm_init.c
> >> +++ b/mm/mm_init.c
> >> @@ -1410,6 +1410,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> >>      pgdat_init_kcompactd(pgdat);
> >>
> >>      init_waitqueue_head(&pgdat->kswapd_wait);
> >> +    init_waitqueue_head(&pgdat->kcompressd_wait);
> >>      init_waitqueue_head(&pgdat->pfmemalloc_wait);
> >>
> >>      for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
> >> diff --git a/mm/page_io.c b/mm/page_io.c
> >> index 4bce19df557b..d85deb494a6a 100644
> >> --- a/mm/page_io.c
> >> +++ b/mm/page_io.c
> >> @@ -233,6 +233,38 @@ static void swap_zeromap_folio_clear(struct folio *folio)
> >>      }
> >>  }
> >>
> >> +static bool swap_sched_async_compress(struct folio *folio)
> >> +{
> >> +    struct swap_info_struct *sis = swp_swap_info(folio->swap);
> >> +    int nid = numa_node_id();
> >> +    pg_data_t *pgdat = NODE_DATA(nid);
> >> +
> >> +    if (unlikely(!pgdat->kcompressd))
> >> +            return false;
> >> +
> >> +    if (!current_is_kswapd())
> >> +            return false;
> >> +
> >> +    if (!folio_test_anon(folio))
> >> +            return false;
> >> +    /*
> >> +     * This case needs to synchronously return AOP_WRITEPAGE_ACTIVATE
> >> +     */
> >> +    if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio)))
> >> +            return false;
> >> +
> >> +    sis = swp_swap_info(folio->swap);
> >> +    if (zswap_is_enabled() || data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
> >> +            if (kfifo_avail(&pgdat->kcompress_fifo) >= sizeof(folio) &&
> >> +                    kfifo_in(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
> >> +                    wake_up_interruptible(&pgdat->kcompressd_wait);
> >> +                    return true;
> >> +            }
> >> +    }
> >> +
> >> +    return false;
> >> +}
> >> +
> >>  /*
> >>   * We may have stale swap cache pages in memory: notice
> >>   * them here and get rid of the unnecessary final write.
> >> @@ -275,6 +307,15 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
> >>               */
> >>              swap_zeromap_folio_clear(folio);
> >>      }
> >> +
> >> +    /*
> >> +     * Compression within zswap and zram might block rmap, unmap
> >> +     * of both file and anon pages, try to do compression async
> >> +     * if possible
> >> +     */
> >> +    if (swap_sched_async_compress(folio))
> >> +            return 0;
> >> +
> >>      if (zswap_store(folio)) {
> >>              count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
> >>              folio_unlock(folio);
> >> @@ -289,6 +330,36 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
> >>      return 0;
> >>  }
> >>
> >> +int kcompressd(void *p)
> >> +{
> >> +    pg_data_t *pgdat = (pg_data_t *)p;
> >> +    struct folio *folio;
> >> +    struct writeback_control wbc = {
> >> +            .sync_mode = WB_SYNC_NONE,
> >> +            .nr_to_write = SWAP_CLUSTER_MAX,
> >> +            .range_start = 0,
> >> +            .range_end = LLONG_MAX,
> >> +            .for_reclaim = 1,
> >> +    };
> >> +
> >> +    while (!kthread_should_stop()) {
> >> +            wait_event_interruptible(pgdat->kcompressd_wait,
> >> +                            !kfifo_is_empty(&pgdat->kcompress_fifo));
> >> +
> >> +            while (!kfifo_is_empty(&pgdat->kcompress_fifo)) {
> >> +                    if (kfifo_out(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
> >> +                            if (zswap_store(folio)) {
> >> +                                    count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
> >> +                                    folio_unlock(folio);
> >> +                                    continue;
> >> +                            }
> >> +                            __swap_writepage(folio, &wbc);
> >> +                    }
> >> +            }
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >>  static inline void count_swpout_vm_event(struct folio *folio)
> >>  {
> >>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >> diff --git a/mm/swap.h b/mm/swap.h
> >> index 6f4a3f927edb..3579da413dc2 100644
> >> --- a/mm/swap.h
> >> +++ b/mm/swap.h
> >> @@ -22,6 +22,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> >>  void swap_write_unplug(struct swap_iocb *sio);
> >>  int swap_writepage(struct page *page, struct writeback_control *wbc);
> >>  void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
> >> +int kcompressd(void *p);
> >>
> >>  /* linux/mm/swap_state.c */
> >>  /* One swap address space for each 64M swap space */
> >> @@ -199,6 +200,11 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
> >>      return 0;
> >>  }
> >>
> >> +static inline int kcompressd(void *p)
> >> +{
> >> +    return 0;
> >> +}
> >> +
> >>  #endif /* CONFIG_SWAP */
> >>
> >>  #endif /* _MM_SWAP_H */
> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> index 3783e45bfc92..2d7b9167bfd6 100644
> >> --- a/mm/vmscan.c
> >> +++ b/mm/vmscan.c
> >> @@ -7420,6 +7420,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
> >>  void __meminit kswapd_run(int nid)
> >>  {
> >>      pg_data_t *pgdat = NODE_DATA(nid);
> >> +    int ret;
> >>
> >>      pgdat_kswapd_lock(pgdat);
> >>      if (!pgdat->kswapd) {
> >> @@ -7433,7 +7434,26 @@ void __meminit kswapd_run(int nid)
> >>              } else {
> >>                      wake_up_process(pgdat->kswapd);
> >>              }
> >> +            ret = kfifo_alloc(&pgdat->kcompress_fifo,
> >> +                            KCOMPRESS_FIFO_SIZE * sizeof(struct folio *),
> >> +                            GFP_KERNEL);
> >> +            if (ret) {
> >> +                    pr_err("%s: fail to kfifo_alloc\n", __func__);
> >> +                    goto out;
> >> +            }
> >> +
> >> +            pgdat->kcompressd = kthread_create_on_node(kcompressd, pgdat, nid,
> >> +                            "kcompressd%d", nid);
> >> +            if (IS_ERR(pgdat->kcompressd)) {
> >> +                    pr_err("Failed to start kcompressd on node %d，ret=%ld\n",
> >> +                                    nid, PTR_ERR(pgdat->kcompressd));
> >> +                    pgdat->kcompressd = NULL;
> >> +                    kfifo_free(&pgdat->kcompress_fifo);
> >> +            } else {
> >> +                    wake_up_process(pgdat->kcompressd);
> >> +            }
> >>      }
> >> +out:
> >>      pgdat_kswapd_unlock(pgdat);
> >>  }
> >>
> >> @@ -7452,6 +7472,11 @@ void __meminit kswapd_stop(int nid)
> >>              kthread_stop(kswapd);
> >>              pgdat->kswapd = NULL;
> >>      }
> >> +    if (pgdat->kcompressd) {
> >> +            kthread_stop(pgdat->kcompressd);
> >> +            pgdat->kcompressd = NULL;
> >> +            kfifo_free(&pgdat->kcompress_fifo);
> >> +    }
> >>      pgdat_kswapd_unlock(pgdat);
> >>  }
> >>
> >> --
> >> 2.45.2
> >>
> >>
> >
> > --
> > Cheers,
> > Harry / Hyeonggon
>
>
> --
> Best Regards,
> Yan, Zi

Thanks
Barry


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-05-07  2:04     ` Barry Song
@ 2025-05-07 15:00       ` Nhat Pham
  2025-05-07 15:12         ` Zi Yan
  0 siblings, 1 reply; 20+ messages in thread
From: Nhat Pham @ 2025-05-07 15:00 UTC (permalink / raw)
  To: Barry Song
  Cc: Zi Yan, Harry Yoo, Qun-Wei Lin, Andrew Morton, Mike Rapoport,
	Matthias Brugger, AngeloGioacchino Del Regno, Sergey Senozhatsky,
	Minchan Kim, linux-mm, linux-kernel, linux-arm-kernel,
	linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu

On Tue, May 6, 2025 at 7:04 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, May 7, 2025 at 1:50 PM Zi Yan <ziy@nvidia.com> wrote:
> >
> > On 6 May 2025, at 21:12, Harry Yoo wrote:
> >
> > > On Wed, Apr 30, 2025 at 04:26:41PM +0800, Qun-Wei Lin wrote:
> > >> This patch series introduces a new mechanism called kcompressd to
> > >> improve the efficiency of memory reclaiming in the operating system.
> > >>
> > >> Problem:
> > >>   In the current system, the kswapd thread is responsible for both scanning
> > >>   the LRU pages and handling memory compression tasks (such as those
> > >>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
> > >>   to significant performance bottlenecks, especially under high memory
> > >>   pressure. The kswapd thread becomes a single point of contention, causing
> > >>   delays in memory reclaiming and overall system performance degradation.
> > >>
> > >> Solution:
> > >>   Introduced kcompressd to handle asynchronous compression during memory
> > >>   reclaim, improving efficiency by offloading compression tasks from
> > >>   kswapd. This allows kswapd to focus on its primary task of page reclaim
> > >>   without being burdened by the additional overhead of compression.
> > >>
> > >> In our handheld devices, we found that applying this mechanism under high
> > >> memory pressure scenarios can increase the rate of pgsteal_anon per second
> > >> by over 260% compared to the situation with only kswapd. Additionally, we
> > >> observed a reduction of over 50% in page allocation stall occurrences,
> > >> further demonstrating the effectiveness of kcompressd in alleviating memory
> > >> pressure and improving system responsiveness.
> > >>
> > >> Co-developed-by: Barry Song <21cnbao@gmail.com>
> > >> Signed-off-by: Barry Song <21cnbao@gmail.com>
> > >> Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> > >> Reference: Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd - Barry Song
> > >>           https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
> > >> ---
> > >
> > > +Cc Zi Yan, who might be interested in writing a framework (or improving
> > > the existing one, padata) for parallelizing jobs (e.g. migration/compression)
> >
> > Thanks.
> >
> > I am currently looking into padata [1] to perform multithreaded page migration

TIL about padata :)

> > copy job. But based on this patch, it seems that kcompressed is just an additional
> > kernel thread of executing zswap_store(). Is there any need for performing
> > compression with multiple threads?
>
> The current focus is on enabling kswapd to perform asynchronous compression,
> which can significantly reduce direct reclaim and allocstall events.
> Therefore, the work begins with supporting a single thread. Supporting
> multiple threads might be possible in the future, but it could be difficult
> to control—especially on busy phones—since it consumes more power and may
> interfere with other threads impacting user experience.

Right, yeah.

>
> >
> > BTW, I also notice that zswap IAA compress batching patchset[2] is using
> > hardware accelerator (Intel Analytics Accelerator) to speed up zswap.
> > I wonder if the handheld devices have similar hardware to get a similar benefit.
>
> Usually, the answer is no. We use zRAM and CPU, but this patch aims to provide
> a common capability that can be shared by both zRAM and zswap.
>

Also, not everyone and every setup has access to hardware compression
accelerators :) This provides benefits for all users.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-04-30 22:49   ` Barry Song
@ 2025-05-07 15:11     ` Nhat Pham
  0 siblings, 0 replies; 20+ messages in thread
From: Nhat Pham @ 2025-05-07 15:11 UTC (permalink / raw)
  To: Barry Song
  Cc: Andrew Morton, Qun-Wei Lin, Mike Rapoport, Matthias Brugger,
	AngeloGioacchino Del Regno, Sergey Senozhatsky, Minchan Kim,
	linux-mm, linux-kernel, linux-arm-kernel, linux-mediatek,
	Casper Li, Chinwen Chang, Andrew Yang, James Hsu

On Wed, Apr 30, 2025 at 3:50 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Thu, May 1, 2025 at 9:51 AM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Wed, 30 Apr 2025 16:26:41 +0800 Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
> >
> > > This patch series introduces a new mechanism called kcompressd to
> > > improve the efficiency of memory reclaiming in the operating system.
> > >
> > > Problem:
> > >   In the current system, the kswapd thread is responsible for both scanning
> > >   the LRU pages and handling memory compression tasks (such as those
> > >   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
> > >   to significant performance bottlenecks, especially under high memory
> > >   pressure. The kswapd thread becomes a single point of contention, causing
> > >   delays in memory reclaiming and overall system performance degradation.
> > >
> > > Solution:
> > >   Introduced kcompressd to handle asynchronous compression during memory
> > >   reclaim, improving efficiency by offloading compression tasks from
> > >   kswapd. This allows kswapd to focus on its primary task of page reclaim
> > >   without being burdened by the additional overhead of compression.
> > >
> > > In our handheld devices, we found that applying this mechanism under high
> > > memory pressure scenarios can increase the rate of pgsteal_anon per second
> > > by over 260% compared to the situation with only kswapd. Additionally, we
> > > observed a reduction of over 50% in page allocation stall occurrences,
> > > further demonstrating the effectiveness of kcompressd in alleviating memory
> > > pressure and improving system responsiveness.
> >
> > It's a significant change and I'm thinking that broader performance
> > testing across a broader range of machines is needed before we can
> > confidently upstream such a change.
>
> We ran the same test on our phones and saw the same results as Qun-Wei.
> The async compression significantly reduces allocation stalls and improves
> reclamation speed. However, I agree that broader testing is needed, and
> we’ll also need the zswap team’s help with testing zswap cases.

The warning aside (which I got around by setting and unsetting
PF_MEMALLOC in kcompressd()), I run kernel building tests with zswap.
There is not much performance difference with and without kcompressd.
That probably means kernel building is a mediocre benchmark more than
anything. Ideally, I want to experiment with some real workloads, but
that is a bit more involved to set up, unfortunately :(

I can try again once you have sent v2 that incorporates our review, at
least to make sure everything is stable and there is no obvious
regression. Hopefully I can set up a proper experiment at some point
too...

>
> >
> > Also, it's presumably a small net loss on single-CPU machines (do these
> > exist any more?).  Is it hard to disable this feature on such machines?
>
> A net loss is possible, but kswapd can sometimes enter sleep contexts,
> allowing the parallel kcompressd thread to continue compression.
> This could actually be a win. But I agree that additional testing on
> single-CPU machines may be necessary.
>
> It could be disabled by the following if we discover any regression on
> single-CPU machines?
>
> if (num_online_cpus() == 1)
>      return false;
>
> >
> > >
> > > +static bool swap_sched_async_compress(struct folio *folio)
> > > +{
> > > +     struct swap_info_struct *sis = swp_swap_info(folio->swap);
> > > +     int nid = numa_node_id();
> > > +     pg_data_t *pgdat = NODE_DATA(nid);
> > > +
> > > +     if (unlikely(!pgdat->kcompressd))
> > > +             return false;
> > > +
> > > +     if (!current_is_kswapd())
> > > +             return false;
> > > +
> > > +     if (!folio_test_anon(folio))
> > > +             return false;
> >
> > Are you sure the above three tests are really needed?
>
> Currently, it runs as a per-node thread mainly to accelerate asynchronous
> reclamation, which effectively reduces direct reclamation. Since direct
> reclamation already follows the slow path, asynchronous compression offers
> limited additional benefit in that context. Moreover, it's difficult
> to determine
> the optimal number of threads for direct reclamation, whereas the  compression
> in the current direct reclamation allows it to utilize all CPUs.
>
> The first condition checks whether kcompressd is present. The second
> ensures that we're in kswapd asynchronous reclamation, not direct
> reclamation. The third condition might be optimized or dropped, at least for
> swap-backed shmem, and similar cases.
>
> Thanks
> Barry


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-05-07 15:00       ` Nhat Pham
@ 2025-05-07 15:12         ` Zi Yan
  0 siblings, 0 replies; 20+ messages in thread
From: Zi Yan @ 2025-05-07 15:12 UTC (permalink / raw)
  To: Nhat Pham, Barry Song
  Cc: Harry Yoo, Qun-Wei Lin, Andrew Morton, Mike Rapoport,
	Matthias Brugger, AngeloGioacchino Del Regno, Sergey Senozhatsky,
	Minchan Kim, linux-mm, linux-kernel, linux-arm-kernel,
	linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu

On 7 May 2025, at 11:00, Nhat Pham wrote:

> On Tue, May 6, 2025 at 7:04 PM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Wed, May 7, 2025 at 1:50 PM Zi Yan <ziy@nvidia.com> wrote:
>>>
>>> On 6 May 2025, at 21:12, Harry Yoo wrote:
>>>
>>>> On Wed, Apr 30, 2025 at 04:26:41PM +0800, Qun-Wei Lin wrote:
>>>>> This patch series introduces a new mechanism called kcompressd to
>>>>> improve the efficiency of memory reclaiming in the operating system.
>>>>>
>>>>> Problem:
>>>>>   In the current system, the kswapd thread is responsible for both scanning
>>>>>   the LRU pages and handling memory compression tasks (such as those
>>>>>   involving ZSWAP/ZRAM, if enabled). This combined responsibility can lead
>>>>>   to significant performance bottlenecks, especially under high memory
>>>>>   pressure. The kswapd thread becomes a single point of contention, causing
>>>>>   delays in memory reclaiming and overall system performance degradation.
>>>>>
>>>>> Solution:
>>>>>   Introduced kcompressd to handle asynchronous compression during memory
>>>>>   reclaim, improving efficiency by offloading compression tasks from
>>>>>   kswapd. This allows kswapd to focus on its primary task of page reclaim
>>>>>   without being burdened by the additional overhead of compression.
>>>>>
>>>>> In our handheld devices, we found that applying this mechanism under high
>>>>> memory pressure scenarios can increase the rate of pgsteal_anon per second
>>>>> by over 260% compared to the situation with only kswapd. Additionally, we
>>>>> observed a reduction of over 50% in page allocation stall occurrences,
>>>>> further demonstrating the effectiveness of kcompressd in alleviating memory
>>>>> pressure and improving system responsiveness.
>>>>>
>>>>> Co-developed-by: Barry Song <21cnbao@gmail.com>
>>>>> Signed-off-by: Barry Song <21cnbao@gmail.com>
>>>>> Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
>>>>> Reference: Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd - Barry Song
>>>>>          https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
>>>>> ---
>>>>
>>>> +Cc Zi Yan, who might be interested in writing a framework (or improving
>>>> the existing one, padata) for parallelizing jobs (e.g. migration/compression)
>>>
>>> Thanks.
>>>
>>> I am currently looking into padata [1] to perform multithreaded page migration
>
> TIL about padata :)
>
>>> copy job. But based on this patch, it seems that kcompressed is just an additional
>>> kernel thread of executing zswap_store(). Is there any need for performing
>>> compression with multiple threads?
>>
>> The current focus is on enabling kswapd to perform asynchronous compression,
>> which can significantly reduce direct reclaim and allocstall events.
>> Therefore, the work begins with supporting a single thread. Supporting
>> multiple threads might be possible in the future, but it could be difficult
>> to control—especially on busy phones—since it consumes more power and may
>> interfere with other threads impacting user experience.
>
> Right, yeah.
>
>>
>>>
>>> BTW, I also notice that zswap IAA compress batching patchset[2] is using
>>> hardware accelerator (Intel Analytics Accelerator) to speed up zswap.
>>> I wonder if the handheld devices have similar hardware to get a similar benefit.
>>
>> Usually, the answer is no. We use zRAM and CPU, but this patch aims to provide
>> a common capability that can be shared by both zRAM and zswap.
>>
>
> Also, not everyone and every setup has access to hardware compression
> accelerators :) This provides benefits for all users.

Got it. Thanks for the explanation.


--
Best Regards,
Yan, Zi


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-05-01 15:12   ` Nhat Pham
@ 2025-06-16  3:41     ` Barry Song
  2025-06-17 14:21       ` Nhat Pham
  0 siblings, 1 reply; 20+ messages in thread
From: Barry Song @ 2025-06-16  3:41 UTC (permalink / raw)
  To: nphamcs, hannes
  Cc: 21cnbao, akpm, andrew.yang, angelogioacchino.delregno, casper.li,
	chinwen.chang, james.hsu, linux-arm-kernel, linux-kernel,
	linux-mediatek, linux-mm, matthias.bgg, minchan, qun-wei.lin,
	rppt, senozhatsky

Hi Nhat, Johannes,

>> The way you implemented this adds time-and-space overhead even on
>> systems that don't have any sort of swap compression enabled.

I agree — we can eliminate the time and space overhead by refining the  
code to hook kcompressed only when zswap or zram is enabled.

>>
>> That seems unnecessary. There is an existing method for asynchronous
>> writeback, and pageout() is naturally fully set up to handle this.
>>
>> IMO the better way to do this is to make zswap_store() (and
>> zram_bio_write()?) asynchronous. Make those functions queue the work
>> and wake the compression daemon, and then have the daemon call
>> folio_end_writeback() / bio_endio() when it's done with it.

> +1.

But,
How could this be possible for zswap? zswap_store() is only a frontend —  
we still need its return value to determine whether __swap_writepage()  
is required. Waiting for the result of zswap_store() is inherently a  
synchronous step.

My point is that folio_end_writeback() and bio_endio() can only be  
called after the entire zswap_store() → __swap_writepage() sequence is  
completed. That’s why both are placed in the new kcompressed.

The use of folio_end_writeback() and bio_endio() was the case for zRAM  
in Qun-Wei's RFC.

https://lore.kernel.org/linux-mm/20250307120141.1566673-3-qun-wei.lin@mediatek.com/

However, the implementation tightly coupled zRAM with reclamation logic.  
For example, zRAM needed to know whether it was running in the kswapd  
context, which is not ideal for a generic block device — the role zRAM  
is supposed to play. Additionally, the code was not shared between zswap  
and zRAM.

Thanks
Barry

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-06-16  3:41     ` Barry Song
@ 2025-06-17 14:21       ` Nhat Pham
  2025-06-23  5:16         ` Barry Song
  0 siblings, 1 reply; 20+ messages in thread
From: Nhat Pham @ 2025-06-17 14:21 UTC (permalink / raw)
  To: Barry Song
  Cc: hannes, akpm, andrew.yang, angelogioacchino.delregno, casper.li,
	chinwen.chang, james.hsu, linux-arm-kernel, linux-kernel,
	linux-mediatek, linux-mm, matthias.bgg, minchan, qun-wei.lin,
	rppt, senozhatsky, SeongJae Park

On Sun, Jun 15, 2025 at 8:41 PM Barry Song <21cnbao@gmail.com> wrote:
>
> Hi Nhat, Johannes,
>
> >> The way you implemented this adds time-and-space overhead even on
> >> systems that don't have any sort of swap compression enabled.
>
> I agree — we can eliminate the time and space overhead by refining the
> code to hook kcompressed only when zswap or zram is enabled.
>
> >>
> >> That seems unnecessary. There is an existing method for asynchronous
> >> writeback, and pageout() is naturally fully set up to handle this.
> >>
> >> IMO the better way to do this is to make zswap_store() (and
> >> zram_bio_write()?) asynchronous. Make those functions queue the work
> >> and wake the compression daemon, and then have the daemon call
> >> folio_end_writeback() / bio_endio() when it's done with it.
>
> > +1.
>
>
> But,
> How could this be possible for zswap? zswap_store() is only a frontend —
> we still need its return value to determine whether __swap_writepage()
> is required. Waiting for the result of zswap_store() is inherently a
> synchronous step.

Hmm, I might be misunderstanding either of you, but it sounds like
what you're describing here does not contradict what Johannes is
proposing?

>
> My point is that folio_end_writeback() and bio_endio() can only be
> called after the entire zswap_store() → __swap_writepage() sequence is
> completed. That’s why both are placed in the new kcompressed.

Hmm, how about:

1. Inside zswap_store(), we first obtain the obj_cgroup reference,
check cgroup and pool limit, and grab a zswap pool reference (in
effect, determining the slot allocator and compressor).

2. Next, we try to queue the work to kcompressd, saving the folio and
the zswap pool (and whatever else we need for the continuation). If
this fails, we can proceed with the old synchronous path.

3. In kcompressed daemon, we perform the continuation of
zswap_store(): compression, slot allocation, storing, zswap's LRU
modification, etc. If this fails, we check if the mem_cgroup enables
writeback. If it's enabled, we can call __swap_writepage(). Ideally,
if writeback is disabled, we should activate the page, but it might
not be possible since shrink_folio_list() might already re-add the
page to the inactive lru. Maybe some modification of pageout() and
shrink_folio_list() can make this work, but I haven't thought too
deeply about it :) If it's impossible, we can perform async
compression only for cgroups that enable writeback for now. Once we
fix zswap's handling of incompressible pages, we can revisit this
decision (+ SJ).

TLDR: move the work-queueing step forward a bit, into the middle of
zswap_store().

One benefit of this is we skip pages of cgroups that disable zswap, or
when zswap pool is full.

>
> The use of folio_end_writeback() and bio_endio() was the case for zRAM
> in Qun-Wei's RFC.
>
> https://lore.kernel.org/linux-mm/20250307120141.1566673-3-qun-wei.lin@mediatek.com/
>
> However, the implementation tightly coupled zRAM with reclamation logic.
> For example, zRAM needed to know whether it was running in the kswapd
> context, which is not ideal for a generic block device — the role zRAM
> is supposed to play. Additionally, the code was not shared between zswap
> and zRAM.
>
> Thanks
> Barry

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-06-17 14:21       ` Nhat Pham
@ 2025-06-23  5:16         ` Barry Song
  2025-06-27 23:21           ` Nhat Pham
  0 siblings, 1 reply; 20+ messages in thread
From: Barry Song @ 2025-06-23  5:16 UTC (permalink / raw)
  To: nphamcs
  Cc: 21cnbao, akpm, andrew.yang, angelogioacchino.delregno, casper.li,
	chinwen.chang, hannes, james.hsu, linux-arm-kernel, linux-kernel,
	linux-mediatek, linux-mm, matthias.bgg, minchan, qun-wei.lin,
	rppt, senozhatsky, sj

Hi Nhat,

On Wed, Jun 18, 2025 at 2:21 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Sun, Jun 15, 2025 at 8:41 PM Barry Song <21cnbao@gmail.com> wrote:
> > >>
> > >> That seems unnecessary. There is an existing method for asynchronous
> > >> writeback, and pageout() is naturally fully set up to handle this.
> > >>
> > >> IMO the better way to do this is to make zswap_store() (and
> > >> zram_bio_write()?) asynchronous. Make those functions queue the work
> > >> and wake the compression daemon, and then have the daemon call
> > >> folio_end_writeback() / bio_endio() when it's done with it.
> >
> > > +1.
> >
> >
> > But,
> > How could this be possible for zswap? zswap_store() is only a frontend —
> > we still need its return value to determine whether __swap_writepage()
> > is required. Waiting for the result of zswap_store() is inherently a
> > synchronous step.
>
> Hmm, I might be misunderstanding either of you, but it sounds like
> what you're describing here does not contradict what Johannes is
> proposing?

It seems contradictory: Johannes proposes that zswap could behave like zRAM
by invoking `folio_end_writeback()` or `bio_endio()`, but this doesn’t align
with actual behavior since zswap_store might not end `swap_writeout()`—it may
still proceed to `__swap_writeback()` to complete the final steps.

Meanwhile, Qun-wei’s RFC has already explored using `folio_end_writeback()` and
`bio_endio()` at the end of `__swap_writepage()` for zRAM, though that approach
also has its own issues.

>
> >
> > My point is that folio_end_writeback() and bio_endio() can only be
> > called after the entire zswap_store() → __swap_writepage() sequence is
> > completed. That’s why both are placed in the new kcompressed.
>
> Hmm, how about:
>
> 1. Inside zswap_store(), we first obtain the obj_cgroup reference,
> check cgroup and pool limit, and grab a zswap pool reference (in
> effect, determining the slot allocator and compressor).
>
> 2. Next, we try to queue the work to kcompressd, saving the folio and
> the zswap pool (and whatever else we need for the continuation). If
> this fails, we can proceed with the old synchronous path.
>
> 3. In kcompressed daemon, we perform the continuation of
> zswap_store(): compression, slot allocation, storing, zswap's LRU
> modification, etc. If this fails, we check if the mem_cgroup enables
> writeback. If it's enabled, we can call __swap_writepage(). Ideally,
> if writeback is disabled, we should activate the page, but it might
> not be possible since shrink_folio_list() might already re-add the
> page to the inactive lru. Maybe some modification of pageout() and
> shrink_folio_list() can make this work, but I haven't thought too
> deeply about it :) If it's impossible, we can perform async
> compression only for cgroups that enable writeback for now. Once we
> fix zswap's handling of incompressible pages, we can revisit this
> decision (+ SJ).
>
> TLDR: move the work-queueing step forward a bit, into the middle of
> zswap_store().
>
> One benefit of this is we skip pages of cgroups that disable zswap, or
> when zswap pool is full.

I assume you meant something like the following:

bool try_to_sched_async_zswap_store()
{
	get_obj_cgroup_from_folio()
	if (err) goto xxx;
	zswap_check_limits();
	if (err) goto xxx;
	zswap_pool_current_get()
	if (err) goto xxx;
	   
	queue_folio_to_kcompressd(folio);
	return true;

xxx:
	error handler things;
	return false;
}

If this function returns true, it suggests that compression requests  
have been queued to kcompressd. Following that, in kcompressd():

int __zswap_store(folio)
{
	for(i=0;i<nr_pages;i++) {
		zswap_store_page();
		if (err) return err;
	}
	return 0;
}

kcompressd()
{
	while(folio_queue_is_not_empty) {
		folio = dequeue_folio();
		if (folio_queued_by_zswap(folio)) {
			if(!__zswap_store(folio))
				continue;
		}
		if ((zswap_store_page_fails && mem_cgroup_zswap_writeback_enabled()) ||
		    folio_queued_by_zram) {
			__swap_writepage();
		}
	}
}

In kswapd, we will need to do
int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
{
	...
	if (try_to_sched_async_zswap_store(folio))
		return;
	if (is_sync_comp_blkdev(swap)) {
		queue_folio_to_kcompressd(folio);
		return;
	}
	__swap_writepage();
}

To be honest, I'm not sure if there's a flag that indicates whether the  
folio was queued by zswap or zram. If not, we may need to add a member  
associated with folio pointers in the queue between kswapd and kcompressd,  
since we need to identify zswap cases. Maybe we can reuse bit 0 of the  
folio pointer?

What I mean is: while queuing, if the folio is queued by zswap, we do  
`pointer |= BIT(0)`. Then in kcompressd, we restore the original folio  
with `folio = pointer & ~BIT(0)`. It's a bit ugly, but I’m not sure  
there’s a better approach.

Thanks
Barry


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-06-23  5:16         ` Barry Song
@ 2025-06-27 23:21           ` Nhat Pham
  2025-07-09  3:25             ` Qun-wei Lin (林群崴)
  0 siblings, 1 reply; 20+ messages in thread
From: Nhat Pham @ 2025-06-27 23:21 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, andrew.yang, angelogioacchino.delregno, casper.li,
	chinwen.chang, hannes, james.hsu, linux-arm-kernel, linux-kernel,
	linux-mediatek, linux-mm, matthias.bgg, minchan, qun-wei.lin,
	rppt, senozhatsky, sj

On Sun, Jun 22, 2025 at 10:16 PM Barry Song <21cnbao@gmail.com> wrote:
>
> Hi Nhat,
>
> On Wed, Jun 18, 2025 at 2:21 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Sun, Jun 15, 2025 at 8:41 PM Barry Song <21cnbao@gmail.com> wrote:
> > > >>
> > > >> That seems unnecessary. There is an existing method for asynchronous
> > > >> writeback, and pageout() is naturally fully set up to handle this.
> > > >>
> > > >> IMO the better way to do this is to make zswap_store() (and
> > > >> zram_bio_write()?) asynchronous. Make those functions queue the work
> > > >> and wake the compression daemon, and then have the daemon call
> > > >> folio_end_writeback() / bio_endio() when it's done with it.
> > >
> > > > +1.
> > >
> > >
> > > But,
> > > How could this be possible for zswap? zswap_store() is only a frontend —
> > > we still need its return value to determine whether __swap_writepage()
> > > is required. Waiting for the result of zswap_store() is inherently a
> > > synchronous step.
> >
> > Hmm, I might be misunderstanding either of you, but it sounds like
> > what you're describing here does not contradict what Johannes is
> > proposing?
>
> It seems contradictory: Johannes proposes that zswap could behave like zRAM
> by invoking `folio_end_writeback()` or `bio_endio()`, but this doesn’t align
> with actual behavior since zswap_store might not end `swap_writeout()`—it may
> still proceed to `__swap_writeback()` to complete the final steps.
>
> Meanwhile, Qun-wei’s RFC has already explored using `folio_end_writeback()` and
> `bio_endio()` at the end of `__swap_writepage()` for zRAM, though that approach
> also has its own issues.


Hmm OK. I'll let Johannes comment on this then :)

>
> >
> > >
> > > My point is that folio_end_writeback() and bio_endio() can only be
> > > called after the entire zswap_store() → __swap_writepage() sequence is
> > > completed. That’s why both are placed in the new kcompressed.
> >
> > Hmm, how about:
> >
> > 1. Inside zswap_store(), we first obtain the obj_cgroup reference,
> > check cgroup and pool limit, and grab a zswap pool reference (in
> > effect, determining the slot allocator and compressor).
> >
> > 2. Next, we try to queue the work to kcompressd, saving the folio and
> > the zswap pool (and whatever else we need for the continuation). If
> > this fails, we can proceed with the old synchronous path.
> >
> > 3. In kcompressed daemon, we perform the continuation of
> > zswap_store(): compression, slot allocation, storing, zswap's LRU
> > modification, etc. If this fails, we check if the mem_cgroup enables
> > writeback. If it's enabled, we can call __swap_writepage(). Ideally,
> > if writeback is disabled, we should activate the page, but it might
> > not be possible since shrink_folio_list() might already re-add the
> > page to the inactive lru. Maybe some modification of pageout() and
> > shrink_folio_list() can make this work, but I haven't thought too
> > deeply about it :) If it's impossible, we can perform async
> > compression only for cgroups that enable writeback for now. Once we
> > fix zswap's handling of incompressible pages, we can revisit this
> > decision (+ SJ).
> >
> > TLDR: move the work-queueing step forward a bit, into the middle of
> > zswap_store().
> >
> > One benefit of this is we skip pages of cgroups that disable zswap, or
> > when zswap pool is full.
>
> I assume you meant something like the following:
>
> bool try_to_sched_async_zswap_store()
> {
>         get_obj_cgroup_from_folio()
>         if (err) goto xxx;
>         zswap_check_limits();
>         if (err) goto xxx;
>         zswap_pool_current_get()
>         if (err) goto xxx;
>
>         queue_folio_to_kcompressd(folio);

Something like this, yeah. Can queue_folio_to_kcompressd() fail? If
so, we can also try synchronous compression on failure here
(__zswap_store() ?).


>         return true;
>
> xxx:
>         error handler things;
>         return false;
> }
>
> If this function returns true, it suggests that compression requests
> have been queued to kcompressd. Following that, in kcompressd():
>
> int __zswap_store(folio)
> {
>         for(i=0;i<nr_pages;i++) {
>                 zswap_store_page();
>                 if (err) return err;
>         }
>         return 0;
> }
>
> kcompressd()
> {
>         while(folio_queue_is_not_empty) {
>                 folio = dequeue_folio();
>                 if (folio_queued_by_zswap(folio)) {
>                         if(!__zswap_store(folio))
>                                 continue;
>                 }
>                 if ((zswap_store_page_fails && mem_cgroup_zswap_writeback_enabled()) ||
>                     folio_queued_by_zram) {

If !mem_cgroup_zswap_writeback_enabled(), I wonder if we can activate
the page here?

>                         __swap_writepage();
>                 }
>         }
> }
>
> In kswapd, we will need to do
> int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> {
>         ...
>         if (try_to_sched_async_zswap_store(folio))
>                 return;
>         if (is_sync_comp_blkdev(swap)) {
>                 queue_folio_to_kcompressd(folio);
>                 return;
>         }
>         __swap_writepage();
> }
>
> To be honest, I'm not sure if there's a flag that indicates whether the
> folio was queued by zswap or zram. If not, we may need to add a member

I don't think there is.

> associated with folio pointers in the queue between kswapd and kcompressd,
> since we need to identify zswap cases. Maybe we can reuse bit 0 of the
> folio pointer?
>
> What I mean is: while queuing, if the folio is queued by zswap, we do
> `pointer |= BIT(0)`. Then in kcompressd, we restore the original folio
> with `folio = pointer & ~BIT(0)`. It's a bit ugly, but I’m not sure
> there’s a better approach.

I think this approach is fine.

We can also hack struct zswap_entry, but that would require an extra
xarray look up. OTOH, if we can assume that zram users will not enable
zswap, we might optimize that lookup away? Not sure if it's much
cleaner than just pointer tagging though.

>
> Thanks
> Barry


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mm: Add Kcompressd for accelerated memory compression
  2025-06-27 23:21           ` Nhat Pham
@ 2025-07-09  3:25             ` Qun-wei Lin (林群崴)
  0 siblings, 0 replies; 20+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-07-09  3:25 UTC (permalink / raw)
  To: hannes@cmpxchg.org
  Cc: Andrew Yang (楊智強), rppt@kernel.org,
	nphamcs@gmail.com, sj@kernel.org, 21cnbao@gmail.com,
	James Hsu (徐慶薰), AngeloGioacchino Del Regno,
	akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	linux-mediatek@lists.infradead.org, linux-mm@kvack.org,
	Chinwen Chang (張錦文),
	Casper Li (李中榮), minchan@kernel.org,
	linux-arm-kernel@lists.infradead.org, matthias.bgg@gmail.com,
	senozhatsky@chromium.org

On Fri, 2025-06-27 at 16:21 -0700, Nhat Pham wrote:
> 
> External email : Please do not click links or open attachments until
> you have verified the sender or the content.
> 
> 
> On Sun, Jun 22, 2025 at 10:16 PM Barry Song <21cnbao@gmail.com>
> wrote:
> > 
> > Hi Nhat,
> > 
> > On Wed, Jun 18, 2025 at 2:21 AM Nhat Pham <nphamcs@gmail.com>
> > wrote:
> > > 
> > > On Sun, Jun 15, 2025 at 8:41 PM Barry Song <21cnbao@gmail.com>
> > > wrote:
> > > > > > 
> > > > > > That seems unnecessary. There is an existing method for
> > > > > > asynchronous
> > > > > > writeback, and pageout() is naturally fully set up to
> > > > > > handle this.
> > > > > > 
> > > > > > IMO the better way to do this is to make zswap_store() (and
> > > > > > zram_bio_write()?) asynchronous. Make those functions queue
> > > > > > the work
> > > > > > and wake the compression daemon, and then have the daemon
> > > > > > call
> > > > > > folio_end_writeback() / bio_endio() when it's done with it.
> > > > 
> > > > > +1.
> > > > 
> > > > 
> > > > But,
> > > > How could this be possible for zswap? zswap_store() is only a
> > > > frontend —
> > > > we still need its return value to determine whether
> > > > __swap_writepage()
> > > > is required. Waiting for the result of zswap_store() is
> > > > inherently a
> > > > synchronous step.
> > > 
> > > Hmm, I might be misunderstanding either of you, but it sounds
> > > like
> > > what you're describing here does not contradict what Johannes is
> > > proposing?
> > 
> > It seems contradictory: Johannes proposes that zswap could behave
> > like zRAM
> > by invoking `folio_end_writeback()` or `bio_endio()`, but this
> > doesn’t align
> > with actual behavior since zswap_store might not end
> > `swap_writeout()`—it may
> > still proceed to `__swap_writeback()` to complete the final steps.
> > 
> > Meanwhile, Qun-wei’s RFC has already explored using
> > `folio_end_writeback()` and
> > `bio_endio()` at the end of `__swap_writepage()` for zRAM, though
> > that approach
> > also has its own issues.
> 
> 
> Hmm OK. I'll let Johannes comment on this then :)

Hi Johannes,

Would appreciate your feedback when you have a moment.

> 
> > 
> > > 
> > > > 
> > > > My point is that folio_end_writeback() and bio_endio() can only
> > > > be
> > > > called after the entire zswap_store() → __swap_writepage()
> > > > sequence is
> > > > completed. That’s why both are placed in the new kcompressed.
> > > 
> > > Hmm, how about:
> > > 
> > > 1. Inside zswap_store(), we first obtain the obj_cgroup
> > > reference,
> > > check cgroup and pool limit, and grab a zswap pool reference (in
> > > effect, determining the slot allocator and compressor).
> > > 
> > > 2. Next, we try to queue the work to kcompressd, saving the folio
> > > and
> > > the zswap pool (and whatever else we need for the continuation).
> > > If
> > > this fails, we can proceed with the old synchronous path.
> > > 
> > > 3. In kcompressed daemon, we perform the continuation of
> > > zswap_store(): compression, slot allocation, storing, zswap's LRU
> > > modification, etc. If this fails, we check if the mem_cgroup
> > > enables
> > > writeback. If it's enabled, we can call __swap_writepage().
> > > Ideally,
> > > if writeback is disabled, we should activate the page, but it
> > > might
> > > not be possible since shrink_folio_list() might already re-add
> > > the
> > > page to the inactive lru. Maybe some modification of pageout()
> > > and
> > > shrink_folio_list() can make this work, but I haven't thought too
> > > deeply about it :) If it's impossible, we can perform async
> > > compression only for cgroups that enable writeback for now. Once
> > > we
> > > fix zswap's handling of incompressible pages, we can revisit this
> > > decision (+ SJ).
> > > 
> > > TLDR: move the work-queueing step forward a bit, into the middle
> > > of
> > > zswap_store().
> > > 
> > > One benefit of this is we skip pages of cgroups that disable
> > > zswap, or
> > > when zswap pool is full.
> > 
> > I assume you meant something like the following:
> > 
> > bool try_to_sched_async_zswap_store()
> > {
> >         get_obj_cgroup_from_folio()
> >         if (err) goto xxx;
> >         zswap_check_limits();
> >         if (err) goto xxx;
> >         zswap_pool_current_get()
> >         if (err) goto xxx;
> > 
> >         queue_folio_to_kcompressd(folio);
> 
> Something like this, yeah. Can queue_folio_to_kcompressd() fail? If
> so, we can also try synchronous compression on failure here
> (__zswap_store() ?).
> 
> 
> >         return true;
> > 
> > xxx:
> >         error handler things;
> >         return false;
> > }
> > 
> > If this function returns true, it suggests that compression
> > requests
> > have been queued to kcompressd. Following that, in kcompressd():
> > 
> > int __zswap_store(folio)
> > {
> >         for(i=0;i<nr_pages;i++) {
> >                 zswap_store_page();
> >                 if (err) return err;
> >         }
> >         return 0;
> > }
> > 
> > kcompressd()
> > {
> >         while(folio_queue_is_not_empty) {
> >                 folio = dequeue_folio();
> >                 if (folio_queued_by_zswap(folio)) {
> >                         if(!__zswap_store(folio))
> >                                 continue;
> >                 }
> >                 if ((zswap_store_page_fails &&
> > mem_cgroup_zswap_writeback_enabled()) ||
> >                     folio_queued_by_zram) {
> 
> If !mem_cgroup_zswap_writeback_enabled(), I wonder if we can activate
> the page here?
> 
> >                         __swap_writepage();
> >                 }
> >         }
> > }
> > 
> > In kswapd, we will need to do
> > int swap_writeout(struct folio *folio, struct swap_iocb
> > **swap_plug)
> > {
> >         ...
> >         if (try_to_sched_async_zswap_store(folio))
> >                 return;
> >         if (is_sync_comp_blkdev(swap)) {
> >                 queue_folio_to_kcompressd(folio);
> >                 return;
> >         }
> >         __swap_writepage();
> > }
> > 
> > To be honest, I'm not sure if there's a flag that indicates whether
> > the
> > folio was queued by zswap or zram. If not, we may need to add a
> > member
> 
> I don't think there is.
> 
> > associated with folio pointers in the queue between kswapd and
> > kcompressd,
> > since we need to identify zswap cases. Maybe we can reuse bit 0 of
> > the
> > folio pointer?
> > 
> > What I mean is: while queuing, if the folio is queued by zswap, we
> > do
> > `pointer |= BIT(0)`. Then in kcompressd, we restore the original
> > folio
> > with `folio = pointer & ~BIT(0)`. It's a bit ugly, but I’m not sure
> > there’s a better approach.
> 
> I think this approach is fine.
> 
> We can also hack struct zswap_entry, but that would require an extra
> xarray look up. OTOH, if we can assume that zram users will not
> enable
> zswap, we might optimize that lookup away? Not sure if it's much
> cleaner than just pointer tagging though.
> 
> > 
> > Thanks
> > Barry

Best regards,
Qun-Wei


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2025-07-09  3:28 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-30  8:26 [PATCH] mm: Add Kcompressd for accelerated memory compression Qun-Wei Lin
2025-04-30 17:05 ` Nhat Pham
2025-04-30 17:22 ` Nhat Pham
2025-04-30 21:51 ` Andrew Morton
2025-04-30 22:49   ` Barry Song
2025-05-07 15:11     ` Nhat Pham
2025-05-01 14:02 ` Johannes Weiner
2025-05-01 15:12   ` Nhat Pham
2025-06-16  3:41     ` Barry Song
2025-06-17 14:21       ` Nhat Pham
2025-06-23  5:16         ` Barry Song
2025-06-27 23:21           ` Nhat Pham
2025-07-09  3:25             ` Qun-wei Lin (林群崴)
2025-05-02  9:16   ` Qun-wei Lin (林群崴)
2025-05-01 15:50 ` Nhat Pham
2025-05-07  1:12 ` Harry Yoo
2025-05-07  1:50   ` Zi Yan
2025-05-07  2:04     ` Barry Song
2025-05-07 15:00       ` Nhat Pham
2025-05-07 15:12         ` Zi Yan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).