Linux-mm Archive on lore.kernel.org

Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: better block swap batching and a different take on swap_ops v2
From: Jianyue Wu @ 2026-06-14 15:52 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andrew Morton, Baoquan He, Chris Li, Kairui Song, linux-mm,
	Nhat Pham, Kemeng Shi, usama.arif, Youngjun Park

Hi Christoph,

FYI, I posted a small follow-up series on top of your swap_iocb /
swap_ops rework:

  [PATCH v1 0/3] mm/zram: route block swap I/O through swap_ops

Link: https://lore.kernel.org/linux-mm/20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com/T/

It does three things:
  - Add swap_register_block_ops() so a block driver can install custom
    submit_read/submit_write handlers at swapon time.
  - Wire zram through those hooks instead of building one bio per page.
  - Move slot_free_notify into swap_ops next to the other zram callbacks.

Happy to rebase if your series changes before it lands.

Thanks,
Jianyue


^ permalink raw reply

* [PATCH 3/3] mm/swap: route slot free notifications through swap_ops
From: Jianyue Wu @ 2026-06-14 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham, Barry Song,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc, Jianyue Wu
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com>

Dispatch slot_free_notify through swap_ops instead of
block_device_operations. Zram keeps slot-free handling alongside its
other swap_ops methods.

Move slot_trylock() into the CONFIG_SWAP block. Otherwise, with
CONFIG_SWAP=n, it has no callers and the build fails with
-Werror=unused-function.

Document the callback locking rules in include/linux/swap.h. Remove
the outdated locking.rst note for swap_slot_free_notify.

Signed-off-by: Jianyue Wu <wujianyue000@gmail.com>
---
 Documentation/filesystems/locking.rst |  5 --
 drivers/block/zram/zram_drv.c         | 88 ++++++++++++++++++-----------------
 include/linux/blkdev.h                |  2 -
 include/linux/swap.h                  |  7 +++
 mm/swapfile.c                         | 13 ++----
 rust/kernel/block/mq/gen_disk.rs      |  1 -
 6 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 70481bdc031d..964c841bf917 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -443,7 +443,6 @@ prototypes::
 				unsigned long *);
 	void (*unlock_native_capacity) (struct gendisk *);
 	int (*getgeo)(struct gendisk *, struct hd_geometry *);
-	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 
 locking rules:
 
@@ -457,12 +456,8 @@ compat_ioctl:		no
 direct_access:		no
 unlock_native_capacity:	no
 getgeo:			no
-swap_slot_free_notify:	no	(see below)
 ======================= ===================
 
-swap_slot_free_notify is called with swap_lock and sometimes the page lock
-held.
-
 
 file_operations
 ===============
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9b2bd0287402..b78246dc1746 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -72,31 +72,6 @@ static void slot_lock_init(struct zram *zram, u32 index)
 			 &__key, 0);
 }
 
-/*
- * entry locking rules:
- *
- * 1) Lock is exclusive
- *
- * 2) lock() function can sleep waiting for the lock
- *
- * 3) Lock owner can sleep
- *
- * 4) Use TRY lock variant when in atomic context
- *    - must check return value and handle locking failers
- */
-static __must_check bool slot_trylock(struct zram *zram, u32 index)
-{
-	unsigned long *lock = &zram->table[index].__lock;
-
-	if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
-		mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_);
-		lock_acquired(slot_dep_map(zram, index), _RET_IP_);
-		return true;
-	}
-
-	return false;
-}
-
 static void slot_lock(struct zram *zram, u32 index)
 {
 	unsigned long *lock = &zram->table[index].__lock;
@@ -2798,23 +2773,6 @@ static void zram_submit_bio(struct bio *bio)
 	}
 }
 
-static void zram_slot_free_notify(struct block_device *bdev,
-				unsigned long index)
-{
-	struct zram *zram;
-
-	zram = bdev->bd_disk->private_data;
-
-	atomic64_inc(&zram->stats.notify_free);
-	if (!slot_trylock(zram, index)) {
-		atomic64_inc(&zram->stats.miss_free);
-		return;
-	}
-
-	slot_free(zram, index);
-	slot_unlock(zram, index);
-}
-
 static void zram_comp_params_reset(struct zram *zram)
 {
 	u32 prio;
@@ -3058,6 +3016,50 @@ static void zram_swap_submit_write(struct swap_io_ctx *ctx)
 	swap_write_end(sio, failed);
 }
 
+/*
+ * entry locking rules:
+ *
+ * 1) Lock is exclusive
+ *
+ * 2) lock() function can sleep waiting for the lock
+ *
+ * 3) Lock owner can sleep
+ *
+ * 4) Use TRY lock variant when in atomic context
+ *    - must check return value and handle locking failures
+ */
+static __must_check bool slot_trylock(struct zram *zram, u32 index)
+{
+	unsigned long *lock = &zram->table[index].__lock;
+
+	if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
+		mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_);
+		lock_acquired(slot_dep_map(zram, index), _RET_IP_);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * swap_range_free() holds the swap cluster lock. Use slot_trylock() so
+ * we never block on a slot that is already locked elsewhere.
+ */
+static void zram_swap_slot_free_notify(struct swap_info_struct *sis,
+				       unsigned long index)
+{
+	struct zram *zram = sis->bdev->bd_disk->private_data;
+
+	atomic64_inc(&zram->stats.notify_free);
+	if (!slot_trylock(zram, index)) {
+		atomic64_inc(&zram->stats.miss_free);
+		return;
+	}
+
+	slot_free(zram, index);
+	slot_unlock(zram, index);
+}
+
 /*
  * No ->can_merge: block rules exist to grow bios on contiguous sectors and
  * matching blkcg.  zram already batches through swap_iocb, and
@@ -3068,6 +3070,7 @@ static void zram_swap_submit_write(struct swap_io_ctx *ctx)
 static const struct swap_ops zram_swap_ops = {
 	.submit_read		= zram_swap_submit_read,
 	.submit_write		= zram_swap_submit_write,
+	.slot_free_notify	= zram_swap_slot_free_notify,
 };
 
 #endif /* CONFIG_SWAP */
@@ -3075,7 +3078,6 @@ static const struct swap_ops zram_swap_ops = {
 static const struct block_device_operations zram_devops = {
 	.open = zram_open,
 	.submit_bio = zram_submit_bio,
-	.swap_slot_free_notify = zram_slot_free_notify,
 	.owner = THIS_MODULE
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..f861ceed39eb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1669,8 +1669,6 @@ struct block_device_operations {
 	int (*getgeo)(struct gendisk *, struct hd_geometry *);
 	int (*set_read_only)(struct block_device *bdev, bool ro);
 	void (*free_disk)(struct gendisk *disk);
-	/* this callback is with swap_lock and sometimes page table lock held */
-	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 	int (*report_zones)(struct gendisk *, sector_t sector,
 			    unsigned int nr_zones,
 			    struct blk_report_zones_args *args);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 70bf6f3f04dc..09640eb5a45d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -40,6 +40,11 @@ struct swap_io_ctx {
  *             the iocb is full or the plug is flushed.
  * @submit_write: flush the accumulated write ctx to the backend.
  * @submit_read: flush the accumulated read ctx to the backend.
+ * @slot_free_notify: optional callback invoked when a swap slot
+ *                    becomes free. swap_range_free() calls it with the
+ *                    swap cluster lock held. The folio lock may also be
+ *                    held on swap-cache teardown paths. Must not sleep
+ *                    or block.
  */
 struct swap_ops {
 	unsigned int		flags;
@@ -49,6 +54,8 @@ struct swap_ops {
 					     size_t prev_folio_size, int rw);
 	void			(*submit_write)(struct swap_io_ctx *ctx);
 	void			(*submit_read)(struct swap_io_ctx *ctx);
+	void			(*slot_free_notify)(struct swap_info_struct *sis,
+						    unsigned long offset);
 };
 
 int swap_register_block_ops(const struct block_device_operations *fops,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ebdc96092961..79a4166fb9bf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1311,21 +1311,18 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 			    unsigned int nr_entries)
 {
 	unsigned long end = offset + nr_entries - 1;
-	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+	void (*slot_free_notify)(struct swap_info_struct *sis,
+				 unsigned long offset);
 	unsigned int i;
 
 	for (i = 0; i < nr_entries; i++)
 		zswap_invalidate(swp_entry(si->type, offset + i));
 
-	if (si->flags & SWP_BLKDEV)
-		swap_slot_free_notify =
-			si->bdev->bd_disk->fops->swap_slot_free_notify;
-	else
-		swap_slot_free_notify = NULL;
+	slot_free_notify = si->ops->slot_free_notify;
 	while (offset <= end) {
 		arch_swap_invalidate_page(si->type, offset);
-		if (swap_slot_free_notify)
-			swap_slot_free_notify(si->bdev, offset);
+		if (slot_free_notify)
+			slot_free_notify(si, offset);
 		offset++;
 	}
 
diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
index 912cb805caf5..25552d69f711 100644
--- a/rust/kernel/block/mq/gen_disk.rs
+++ b/rust/kernel/block/mq/gen_disk.rs
@@ -135,7 +135,6 @@ pub fn build<T: Operations>(
             unlock_native_capacity: None,
             getgeo: None,
             set_read_only: None,
-            swap_slot_free_notify: None,
             report_zones: None,
             devnode: None,
             alternative_gpt_sector: None,

-- 
2.43.0



^ permalink raw reply related

* [PATCH 2/3] mm/zram: handle swap read/write via swap_ops
From: Jianyue Wu @ 2026-06-14 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham, Barry Song,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc, Jianyue Wu
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com>

Register zram_swap_ops at module init.  The swap core still batches
folios into a swap_iocb; on flush, zram_swap_submit_write() maps each
folio page to its swap slot index and stores it via zram_write_page()
into the zspool, avoiding one bio per page.

For swap-in, zram_swap_submit_read() walks the same batch.  Without a
backing device, each slot is decompressed with read_from_zspool() while
slot_lock is held and mark_slot_accessed() runs in the same critical
section, so idle writeback cannot take the slot between read and mark.
When backing_dev is set, delegate the entire iocb to
swap_bdev_submit_read() because the batch may mix ZRAM_WB slots that
live on the backing block device.

Omit ->can_merge: zram batches through swap_iocb and compresses each
slot by index.  Block-sector merge rules do not apply.

Export swap_iocb_nr_folios(), swap_iocb_folio(), swap_read_end(),
swap_write_end(), and swap_bdev_submit_read() for the custom swap I/O
path.

Fail zram_init() if swap_register_block_ops() fails so the module
does not load without its swap path registered.

Signed-off-by: Jianyue Wu <wujianyue000@gmail.com>
---
 drivers/block/zram/zram_drv.c | 127 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/swap.h          |   5 ++
 mm/page_io.c                  |  81 ++++++++++++++++++++++++++-
 3 files changed, 210 insertions(+), 3 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7917fc7a2a29..9b2bd0287402 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -34,6 +34,8 @@
 #include <linux/part_stat.h>
 #include <linux/kernel_read_file.h>
 #include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 #include "zram_drv.h"
 
@@ -55,6 +57,9 @@ static unsigned int num_devices = 1;
 static size_t huge_class_size;
 
 static const struct block_device_operations zram_devops;
+#if IS_ENABLED(CONFIG_SWAP)
+static bool zram_swap_ops_registered;
+#endif
 
 static void slot_free(struct zram *zram, u32 index);
 #define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map)
@@ -2958,6 +2963,115 @@ static int zram_open(struct gendisk *disk, blk_mode_t mode)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_SWAP)
+static void zram_swap_submit_read(struct swap_io_ctx *ctx)
+{
+	struct zram *zram = ctx->sis->bdev->bd_disk->private_data;
+	struct swap_iocb *sio = ctx->sio;
+	int nr = swap_iocb_nr_folios(sio);
+	bool failed = false;
+	int i, j;
+
+	/*
+	 * With a backing device configured, the batch may include ZRAM_WB
+	 * slots.  Fall back to the block read path for the whole iocb
+	 * instead of checking each slot.
+	 */
+#ifdef CONFIG_ZRAM_WRITEBACK
+	if (zram->backing_dev) {
+		swap_bdev_submit_read(ctx);
+		return;
+	}
+#endif
+
+	for (i = 0; i < nr; i++) {
+		struct folio *folio = swap_iocb_folio(sio, i);
+		u32 base = swp_offset(folio->swap);
+
+		for (j = 0; j < folio_nr_pages(folio); j++) {
+			u32 idx = base + j;
+			struct page *page = folio_page(folio, j);
+			int ret;
+
+			/*
+			 * read_from_zspool() and mark_slot_accessed() must run
+			 * under the same slot_lock.  zram_read_page() unlocks
+			 * before returning, which leaves a window where
+			 * writeback can pick an idle slot we just read.
+			 */
+			slot_lock(zram, idx);
+			ret = read_from_zspool(zram, page, idx);
+			if (!ret)
+				mark_slot_accessed(zram, idx);
+			slot_unlock(zram, idx);
+			if (ret) {
+				failed = true;
+				atomic64_inc(&zram->stats.failed_reads);
+				pr_alert_ratelimited("Read-error on swap-device %s at index %u: err=%d\n",
+						     zram->disk->disk_name, idx, ret);
+				goto out;
+			}
+			flush_dcache_page(page);
+		}
+	}
+out:
+	swap_read_end(sio, failed);
+}
+
+static void zram_swap_submit_write(struct swap_io_ctx *ctx)
+{
+	struct zram *zram = ctx->sis->bdev->bd_disk->private_data;
+	struct swap_iocb *sio = ctx->sio;
+	int nr = swap_iocb_nr_folios(sio);
+	bool failed = false;
+	int i, j, ret = 0;
+	u32 idx = 0;
+
+	for (i = 0; i < nr; i++) {
+		struct folio *folio = swap_iocb_folio(sio, i);
+		u32 base = swp_offset(folio->swap);
+
+		for (j = 0; j < folio_nr_pages(folio); j++) {
+			idx = base + j;
+			ret = zram_write_page(zram, folio_page(folio, j), idx);
+			if (ret) {
+				/*
+				 * Leave partial zram data in place, same as the bio
+				 * write path.  swap_write_end() re-dirties every
+				 * page in the batch so they stay in swapcache with
+				 * their swap entries.  Freeing zram slots here would
+				 * leave entries pointing at empty indices until
+				 * slot_free_notify runs.
+				 */
+				failed = true;
+				atomic64_inc(&zram->stats.failed_writes);
+				pr_alert_ratelimited("Write-error on swap-device %s at index %u: err=%d\n",
+						     zram->disk->disk_name, idx, ret);
+				goto out;
+			}
+			slot_lock(zram, idx);
+			mark_slot_accessed(zram, idx);
+			slot_unlock(zram, idx);
+		}
+	}
+out:
+	swap_write_end(sio, failed);
+}
+
+/*
+ * No ->can_merge: block rules exist to grow bios on contiguous sectors and
+ * matching blkcg.  zram already batches through swap_iocb, and
+ * submit_write() compresses each slot by index, not by sector layout.
+ * Reusing swap_bdev_can_merge() would only split batches without helping
+ * zspool I/O.
+ */
+static const struct swap_ops zram_swap_ops = {
+	.submit_read		= zram_swap_submit_read,
+	.submit_write		= zram_swap_submit_write,
+};
+
+#endif /* CONFIG_SWAP */
+
 static const struct block_device_operations zram_devops = {
 	.open = zram_open,
 	.submit_bio = zram_submit_bio,
@@ -3233,6 +3347,10 @@ static int zram_remove_cb(int id, void *ptr, void *data)
 
 static void destroy_devices(void)
 {
+#if IS_ENABLED(CONFIG_SWAP)
+	if (zram_swap_ops_registered)
+		swap_unregister_block_ops(&zram_devops);
+#endif
 	class_unregister(&zram_control_class);
 	idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
 	zram_debugfs_destroy();
@@ -3269,6 +3387,15 @@ static int __init zram_init(void)
 		return -EBUSY;
 	}
 
+#if IS_ENABLED(CONFIG_SWAP)
+	ret = swap_register_block_ops(&zram_devops, &zram_swap_ops);
+	if (ret) {
+		pr_err("zram: failed to register swap ops (%d)\n", ret);
+		goto out_error;
+	}
+	zram_swap_ops_registered = true;
+#endif
+
 	while (num_devices != 0) {
 		mutex_lock(&zram_index_mutex);
 		ret = zram_add();
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1d51df4179c1..70bf6f3f04dc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -54,6 +54,11 @@ struct swap_ops {
 int swap_register_block_ops(const struct block_device_operations *fops,
 			    const struct swap_ops *ops);
 void swap_unregister_block_ops(const struct block_device_operations *fops);
+int swap_iocb_nr_folios(struct swap_iocb *sio);
+struct folio *swap_iocb_folio(struct swap_iocb *sio, int idx);
+void swap_read_end(struct swap_iocb *sio, bool failed);
+void swap_write_end(struct swap_iocb *sio, bool failed);
+void swap_bdev_submit_read(struct swap_io_ctx *ctx);
 
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
diff --git a/mm/page_io.c b/mm/page_io.c
index 3ab620860379..7c17e44823d1 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -486,7 +486,21 @@ void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
 	delayacct_swapin_end();
 }
 
-static void swap_write_end(struct swap_iocb *sio, bool failed)
+/**
+ * swap_write_end - finish a swap write iocb
+ * @sio:    swap_iocb whose pages were just written
+ * @failed: true if any of the underlying writes failed
+ *
+ * Ends writeback on every page captured by @sio. On failure each page
+ * is also re-dirtied and PG_reclaim is cleared, mirroring the bio
+ * write completion path. @sio is returned to the swap iocb mempool.
+ *
+ * swap_ops providers must call this exactly once per submit_write()
+ * ctx (typically at the end of their submit_write callback).
+ *
+ * Context: any context the submit_write() callback runs in.
+ */
+void swap_write_end(struct swap_iocb *sio, bool failed)
 {
 	int p;
 
@@ -501,6 +515,7 @@ static void swap_write_end(struct swap_iocb *sio, bool failed)
 	}
 	mempool_free(sio, sio_pool);
 }
+EXPORT_SYMBOL_GPL(swap_write_end);
 
 static void swap_fs_write_complete(struct kiocb *iocb, long ret)
 {
@@ -536,7 +551,26 @@ static void end_swap_bio_write(struct bio *bio)
 	swap_write_end(sio, failed);
 }
 
-static void swap_read_end(struct swap_iocb *sio, bool failed)
+/**
+ * swap_read_end - finish a swap read iocb
+ * @sio:    swap_iocb whose folios were just read in
+ * @failed: true if any of the underlying reads failed
+ *
+ * Unlocks every folio captured by @sio. On success each folio is also
+ * marked uptodate and swap-in counters (PSWPIN, mTHP, memcg) are bumped
+ * by folio_nr_pages(). On failure folios are left not-uptodate so the
+ * caller observes the failure and retries or surfaces an error. @sio is
+ * returned to the swap iocb mempool.
+ *
+ * swap_ops providers must call this exactly once per submit_read() ctx
+ * (typically at the end of their submit_read callback). If the provider
+ * defers to swap_bdev_ops.submit_read() for fallback, the bdev path
+ * will call swap_read_end() itself and the provider must not call it
+ * again for the same ctx.
+ *
+ * Context: any context the submit_read() callback runs in.
+ */
+void swap_read_end(struct swap_iocb *sio, bool failed)
 {
 	int p;
 
@@ -557,6 +591,34 @@ static void swap_read_end(struct swap_iocb *sio, bool failed)
 
 	mempool_free(sio, sio_pool);
 }
+EXPORT_SYMBOL_GPL(swap_read_end);
+
+/**
+ * swap_iocb_nr_folios - number of folios in a swap I/O batch
+ * @sio: swap_iocb passed to a swap_ops submit callback.
+ *
+ * Returns how many folios the swap core has batched into @sio. Used
+ * together with swap_iocb_folio() so swap_ops providers can walk the
+ * batch without depending on the swap core's internal iocb layout.
+ */
+int swap_iocb_nr_folios(struct swap_iocb *sio)
+{
+	return sio->nr_bvecs;
+}
+EXPORT_SYMBOL_GPL(swap_iocb_nr_folios);
+
+/**
+ * swap_iocb_folio - folio at slot @idx in a swap I/O batch
+ * @sio: swap_iocb passed to a swap_ops submit callback.
+ * @idx: index in the range [0, swap_iocb_nr_folios(@sio)).
+ *
+ * Returns the folio at the given batch slot.
+ */
+struct folio *swap_iocb_folio(struct swap_iocb *sio, int idx)
+{
+	return page_folio(sio->bvecs[idx].bv_page);
+}
+EXPORT_SYMBOL_GPL(swap_iocb_folio);
 
 static void swap_fs_read_complete(struct kiocb *iocb, long ret)
 {
@@ -613,7 +675,19 @@ static void swap_bdev_submit_write(struct swap_io_ctx *ctx)
 	}
 }
 
-static void swap_bdev_submit_read(struct swap_io_ctx *ctx)
+/**
+ * swap_bdev_submit_read - fall back to the default block-device read path
+ * @ctx: in-progress submit_read context.
+ *
+ * Builds a bio for the accumulated ctx and submits it through the
+ * normal block layer. swap_ops providers can call this when they
+ * cannot serve a particular ctx themselves (for example zram folios
+ * stored on a backing device). The bio completion path takes care of
+ * calling swap_read_end() on @ctx. The caller must not call it again.
+ *
+ * Context: any context the submit_read() callback runs in.
+ */
+void swap_bdev_submit_read(struct swap_io_ctx *ctx)
 {
 	struct swap_iocb *sio = ctx->sio;
 	struct bio *bio = &sio->bio;
@@ -638,6 +712,7 @@ static void swap_bdev_submit_read(struct swap_io_ctx *ctx)
 		submit_bio(bio);
 	}
 }
+EXPORT_SYMBOL_GPL(swap_bdev_submit_read);
 
 static bool swap_bdev_can_merge(struct folio *folio, struct folio *prev_folio,
 		size_t prev_folio_size, int rw)

-- 
2.43.0



^ permalink raw reply related

* [PATCH 1/3] mm/page_io: let block drivers register custom swap I/O ops
From: Jianyue Wu @ 2026-06-14 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham, Barry Song,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc, Jianyue Wu
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com>

Add swap_register_block_ops() so a block driver can install custom
swap read/write handlers instead of always building bios.

When swapon targets a block device (S_ISBLK), setup_swap_extents()
checks whether that driver's block_device_operations were registered.
If yes, sis->ops points at the driver table. Otherwise sis->ops
stays on swap_bdev_ops.

Swap files are unchanged. They still use the filesystem path and
extent tree, because their page index is not a raw disk sector.

Register swap_ops in a single global slot keyed by the driver's
block_device_operations. lookup_swap_block_ops() matches sis->bdev
fops at swapon. Registration fails with -EBUSY if the slot is already
taken. A single slot is enough while only zram needs custom swap I/O.
Several block drivers would need a per-fops lookup table instead.

swap_unregister_block_ops() must be passed the same fops that was
registered. Swap areas created before unregister keep the old ops
until swapoff. The driver module must remain loaded while they are
in use.

Signed-off-by: Jianyue Wu <wujianyue000@gmail.com>
---
 include/linux/swap.h |  35 +++++++++++++++++
 mm/page_io.c         | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/swap.h            |  18 +--------
 mm/swapfile.c        |   4 ++
 4 files changed, 147 insertions(+), 16 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 636d94108166..1d51df4179c1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -19,6 +19,41 @@
 struct notifier_block;
 
 struct bio;
+struct block_device_operations;
+struct folio;
+struct swap_iocb;
+struct swap_info_struct;
+
+struct swap_io_ctx {
+	struct swap_iocb	*sio;
+	struct swap_info_struct	*sis;
+};
+
+/* Set when the swap backend requires GFP_NOFS allocations. */
+#define SWAP_OPS_F_NOFS		(1U << 0)
+
+/**
+ * struct swap_ops - per-swap-area I/O batching callbacks
+ * @can_merge: optional. Return true iff @folio can be appended to a ctx
+ *             that already holds @prev_folio of @prev_folio_size bytes.
+ *             When NULL, folios on the same swap area are batched until
+ *             the iocb is full or the plug is flushed.
+ * @submit_write: flush the accumulated write ctx to the backend.
+ * @submit_read: flush the accumulated read ctx to the backend.
+ */
+struct swap_ops {
+	unsigned int		flags;
+
+	bool			(*can_merge)(struct folio *folio,
+					     struct folio *prev_folio,
+					     size_t prev_folio_size, int rw);
+	void			(*submit_write)(struct swap_io_ctx *ctx);
+	void			(*submit_read)(struct swap_io_ctx *ctx);
+};
+
+int swap_register_block_ops(const struct block_device_operations *fops,
+			    const struct swap_ops *ops);
+void swap_unregister_block_ops(const struct block_device_operations *fops);
 
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
diff --git a/mm/page_io.c b/mm/page_io.c
index c020e8ebf966..3ab620860379 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -24,6 +24,8 @@
 #include <linux/uio.h>
 #include <linux/sched/task.h>
 #include <linux/delayacct.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
 #include <linux/zswap.h>
 #include "swap.h"
 #include "swap_table.h"
@@ -325,6 +327,8 @@ static bool swap_can_merge(struct swap_io_ctx *ctx, struct folio *folio,
 
 	if (ctx->sis != sis)
 		return false;
+	if (!sis->ops->can_merge)
+		return true;
 	return sis->ops->can_merge(folio, prev_folio, prev_folio_size, rw);
 }
 
@@ -577,6 +581,18 @@ static void swap_bio_read_end_io(struct bio *bio)
 	swap_read_end(sio, failed);
 }
 
+/**
+ * swap_bdev_submit_write - default block-device write path for swap
+ * @ctx: in-progress submit_write context.
+ *
+ * Builds a bio for the accumulated ctx and submits it through the normal
+ * block layer. This is the submit_write implementation used by swap_bdev_ops
+ * for ordinary block swap areas. swap_ops providers that override submit_write
+ * (e.g. zram) but still fall back to the block layer for some I/Os should use
+ * their own bio construction; this function is not exported.
+ *
+ * Context: process context (may sleep if SWP_SYNCHRONOUS_IO is set).
+ */
 static void swap_bdev_submit_write(struct swap_io_ctx *ctx)
 {
 	struct swap_iocb *sio = ctx->sio;
@@ -640,6 +656,96 @@ const struct swap_ops swap_bdev_ops = {
 	.can_merge		= swap_bdev_can_merge,
 };
 
+static DEFINE_MUTEX(swap_block_ops_lock);
+static const struct block_device_operations *swap_block_fops;
+static const struct swap_ops *swap_block_ops;
+
+/**
+ * swap_register_block_ops - install swap callbacks for a block driver
+ * @fops: block_device_operations identifying the driver. Used as a
+ *        match key in setup_swap_extents(): a S_ISBLK swap area is
+ *        routed to @ops when its bdev's gendisk fops equals @fops.
+ * @ops:  swap_ops vtable selected for matching swap areas. Must populate
+ *        ->submit_read and ->submit_write. ->can_merge is optional.
+ *
+ * Lets a block driver (zram and similar) replace the default
+ * swap_bdev_ops with its own submit_read / submit_write implementation.
+ *
+ * Returns 0 on success, -EINVAL when @fops or @ops are bad (a required
+ * callback is missing), or -EBUSY when the single registration slot is
+ * already taken. That slot is enough while only zram needs custom swap I/O.
+ * Several block drivers would need a per-fops lookup table instead.
+ *
+ * Context: process context, may sleep.
+ */
+int swap_register_block_ops(const struct block_device_operations *fops,
+			    const struct swap_ops *ops)
+{
+	int ret;
+
+	if (WARN_ON_ONCE(!fops || !ops || !ops->submit_read ||
+			 !ops->submit_write))
+		return -EINVAL;
+
+	mutex_lock(&swap_block_ops_lock);
+	if (swap_block_fops || swap_block_ops) {
+		ret = -EBUSY;
+		goto out;
+	}
+	swap_block_fops = fops;
+	swap_block_ops = ops;
+	ret = 0;
+out:
+	mutex_unlock(&swap_block_ops_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(swap_register_block_ops);
+
+/**
+ * swap_unregister_block_ops - undo swap_register_block_ops()
+ * @fops: same block_device_operations passed to swap_register_block_ops().
+ *
+ * Clears the registered fops/ops slot so future swapon calls fall back
+ * to swap_bdev_ops. The @fops match acts as a soft owner check so a
+ * driver cannot accidentally tear down another driver's registration.
+ * A mismatch is treated as a bug and triggers WARN_ON_ONCE. Swap areas
+ * that already captured the registered ops keep their sis->ops pointer.
+ * The caller must ensure the module owning the ops outlives any such
+ * swap area. For block drivers this is guaranteed by the bdev open
+ * reference held across swapon.
+ * Calling unregister before a successful register is a no-op.
+ *
+ * Context: process context, may sleep.
+ */
+void swap_unregister_block_ops(const struct block_device_operations *fops)
+{
+	mutex_lock(&swap_block_ops_lock);
+	/* never registered or already unregistered. */
+	if (!swap_block_fops)
+		goto out;
+	if (WARN_ON_ONCE(swap_block_fops != fops))
+		goto out;
+	swap_block_fops = NULL;
+	swap_block_ops = NULL;
+out:
+	mutex_unlock(&swap_block_ops_lock);
+}
+EXPORT_SYMBOL_GPL(swap_unregister_block_ops);
+
+const struct swap_ops *lookup_swap_block_ops(struct swap_info_struct *sis)
+{
+	const struct swap_ops *ops = NULL;
+
+	if (!sis->bdev)
+		return NULL;
+
+	mutex_lock(&swap_block_ops_lock);
+	if (swap_block_fops && sis->bdev->bd_disk->fops == swap_block_fops)
+		ops = swap_block_ops;
+	mutex_unlock(&swap_block_ops_lock);
+	return ops;
+}
+
 static void swap_fs_submit(struct swap_io_ctx *ctx, int rw)
 {
 	struct swap_iocb *sio = ctx->sio;
diff --git a/mm/swap.h b/mm/swap.h
index edb512e619ee..4bdd38f7a5e8 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -4,6 +4,7 @@
 
 #include <linux/atomic.h> /* for atomic_long_t */
 #include <linux/mm.h> /* for PAGE_SHIFT */
+#include <linux/swap.h>
 
 struct mempolicy;
 struct swap_iocb;
@@ -79,22 +80,6 @@ enum swap_cluster_flags {
 	CLUSTER_FLAG_MAX,
 };
 
-struct swap_io_ctx {
-	struct swap_iocb	*sio;
-	struct swap_info_struct	*sis;
-};
-
-#define SWAP_OPS_F_NOFS		(1U << 0)
-
-struct swap_ops {
-	unsigned int		flags;
-
-	bool (*can_merge)(struct folio *folio, struct folio *prev_folio,
-			size_t prev_folio_size, int rw);
-	void (*submit_write)(struct swap_io_ctx *ctx);
-	void (*submit_read)(struct swap_io_ctx *ctx);
-};
-
 #ifdef CONFIG_SWAP
 #include <linux/swapops.h> /* for swp_offset */
 #include <linux/blk_types.h> /* for bio_end_io_t */
@@ -472,6 +457,7 @@ static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci,
 #endif /* CONFIG_SWAP */
 
 extern const struct swap_ops swap_bdev_ops;
+const struct swap_ops *lookup_swap_block_ops(struct swap_info_struct *sis);
 
 int shmem_writeout(struct swap_io_ctx *ctx, struct folio *folio,
 		struct list_head *folio_list);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 284eebc40a70..ebdc96092961 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2849,6 +2849,10 @@ static int setup_swap_extents(struct swap_info_struct *sis,
 	sis->ops = &swap_bdev_ops;
 
 	if (S_ISBLK(inode->i_mode)) {
+		const struct swap_ops *block_ops = lookup_swap_block_ops(sis);
+
+		if (block_ops)
+			sis->ops = block_ops;
 		ret = add_swap_extent(sis, 0, sis->max, 0);
 		*span = sis->pages;
 		return ret;

-- 
2.43.0



^ permalink raw reply related

* [PATCH 0/3] mm/zram: route block swap I/O through swap_ops
From: Jianyue Wu @ 2026-06-14 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham, Barry Song,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc, Jianyue Wu

This series builds on Christoph Hellwig's swap batching rework that
moves block swap onto struct swap_iocb and per-backend struct
swap_ops handlers [1].  Christoph's patches unify batching for
ordinary block devices and swap files.  zram still needs a custom
path because swap slots map to compressed pages, not disk sectors.

The first patch adds swap_register_block_ops() so a block driver can
install custom submit_read/submit_write handlers when swapon targets
its block device.  The default swap_bdev_ops path is unchanged for
devices that do not register.

The second patch registers zram_swap_ops at module init.  On write,
the swap core still batches folios into a swap_iocb.  zram maps each
folio to a slot index and stores it through zram_write_page() instead
of building one bio per page.  Read handling keeps slot_lock and
mark_slot_accessed() in one critical section.  Writeback-enabled zram
falls back to swap_bdev_submit_read() for ZRAM_WB slots.

The third patch moves slot_free_notify into swap_ops next to the
other zram swap callbacks, and documents the locking contract for
that hook.

Applied on top of Christoph Hellwig's "better block swap batching and
a different take on swap_ops" series [1].

[1] https://lore.kernel.org/linux-mm/?q=better+block+swap+batching

To: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chris Li <chrisl@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Youngjun Park <youngjun.park@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-block@vger.kernel.org
Cc: linux-doc@vger.kernel.org

Signed-off-by: Jianyue Wu <wujianyue000@gmail.com>
---
Jianyue Wu (3):
      mm/page_io: let block drivers register custom swap I/O ops
      mm/zram: handle swap read/write via swap_ops
      mm/swap: route slot free notifications through swap_ops

 Documentation/filesystems/locking.rst |   5 -
 drivers/block/zram/zram_drv.c         | 215 +++++++++++++++++++++++++++-------
 include/linux/blkdev.h                |   2 -
 include/linux/swap.h                  |  47 ++++++++
 mm/page_io.c                          | 187 ++++++++++++++++++++++++++++-
 mm/swap.h                             |  18 +--
 mm/swapfile.c                         |  17 +--
 rust/kernel/block/mq/gen_disk.rs      |   1 -
 8 files changed, 414 insertions(+), 78 deletions(-)
---
base-commit: 842f51deada6449843f811bfa22e536a01ae5a0c
change-id: 20260614-zram-swap-ops-block-register-a1b2c3d4e5f6

Best regards,
-- 
Jianyue Wu <wujianyue000@gmail.com>

^ permalink raw reply

* Re: [PATCH v2 2/5] binder: Make shrinker rely solely on per-VMA lock
From: Carlos Llamas @ 2026-06-14 14:10 UTC (permalink / raw)
  To: Alice Ryhl
  Cc: Dave Hansen, Suren Baghdasaryan, Vlastimil Babka (SUSE),
	Dave Hansen, linux-kernel, Andrew Morton,
	Arve Hjønnevåg, Christian Brauner, David Ahern,
	David S. Miller, Greg Kroah-Hartman, Liam R. Howlett, linux-mm,
	Lorenzo Stoakes, netdev, Shakeel Butt, Todd Kjos
In-Reply-To: <aixi-DxMuc0MiGeO@google.com>

On Fri, Jun 12, 2026 at 07:50:16PM +0000, Alice Ryhl wrote:
> On Fri, Jun 12, 2026 at 11:47:59AM -0700, Dave Hansen wrote:
> > On 6/12/26 10:44, Suren Baghdasaryan wrote:
> > >> It's not impossible, but I do think it is irrelevant. Or at least that
> > >> the *VMA* is irrelevant in this case. binder_alloc_is_mapped()==false
> > >> means that the binder VMA is gone. It's not in the maple tree, and it's
> > >> not coming back. If a VMA is found, it's an impostor.
> > > Right, but before your change we were bailing out early. With your
> > > change we would be generating the traces and freeing the page. I think
> > > that's a functional change. Was that your intention?
> > 
> > Yeah, it was intentional.
> > 
> > I think the existing behavior is buggy. It also complicates the goal of
> > removing the mmap lock fallback. I've broken that behavior change out
> > into a separate patch. (attached here)
> 
> I think you can just:
> 
> 1. do a lock_vma_under_rcu().
> 2. if it fails, check binder_alloc_is_mapped().
> 3. if still mapped, return LRU_SKIP, otherwise behave like a failed
>    vma_lookup() does today under the mmap read lock.

Right! This is the same suggestion I sent.

...
Also, I would _prefer_ if the commit message was more accurate. The
mmap_lock fallback was there because of "compatibility", as per-vma
locking is technically behind CONFIG_PER_VMA_LOCK. This would be the
only part that IMO describes the actual reason for the change:

> Now that per-VMA locks are universally available, lock_vma_under_rcu()
> will not persistently fail. Rely on it alone and simplify the code.

Cheers,
--
Carlos Llamas


^ permalink raw reply

* Re: [PATCH v6 05/12] PCI: liveupdate: Keep bus numbers constant during Live Update
From: Pasha Tatashin @ 2026-06-14 14:01 UTC (permalink / raw)
  To: Pranjal Shrivastava
  Cc: David Matlack, kexec, linux-doc, linux-kernel, linux-mm,
	linux-pci, Adithya Jayachandran, Alexander Graf, Alex Williamson,
	Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
	Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
	Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
	Pratyush Yadav, Saeed Mahameed, Samiullah Khawaja, Shuah Khan,
	Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <aiQAJRINEKiwCmVm@google.com>

On 2026-06-06 11:10:29+00:00, Pranjal Shrivastava wrote:
> On Fri, May 22, 2026 at 08:24:03PM +0000, David Matlack wrote:
> 
> > During a Live Update, preserved devices must be allowed to continue
> > performing memory transactions so the kernel cannot change the fabric
> > topology, including bus numbers, since that would require disabling
> > and flushing any memory transactions first.
> > 
> > To keep bus numbers constant, always inherit the secondary and
> > subordinate bus numbers assigned to bridges during scanning, instead of
> > assigning new ones, if any PCI devices are being preserved. Note that
> > the kernel inherits bus numbers even on bridges without any downstream
> > endpoints that were preserved. This avoids accidentally assigning a
> > bridge a new window that overlaps with a preserved device that is
> > downstream of a different bridge.
> > 
> > If a bridge is scanned with a broken topology or has no bus numbers
> > set during a Live Update, refuse to assign it new bus numbers and refuse
> > to enumerate devices below it until the Live Update is finished. This is
> > a safety measure to prevent topology conflicts.
> > 
> > Require that CONFIG_CARDBUS is not enabled to enable
> > CONFIG_PCI_LIVEUPDATE since inheriting bus numbers on PCI-to-CardBus
> > bridges requires additional work but is not a priority at the moment.
> > 
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  .../admin-guide/kernel-parameters.txt         |  6 +-
> >  drivers/pci/Kconfig                           |  2 +-
> >  drivers/pci/liveupdate.c                      | 83 ++++++++++++++++++-
> >  drivers/pci/liveupdate.h                      | 14 ++++
> >  drivers/pci/probe.c                           | 17 +++-
> >  include/linux/pci_liveupdate.h                |  4 +
> >  6 files changed, 119 insertions(+), 7 deletions(-)
> 
> [...]
> 
> > +		incoming = pci_liveupdate_flb_get_incoming();
> > +		if (!incoming) {
> > +			dev->liveupdate.inherit_buses = false;
> > +			goto out;
> > +		}
> > +
> > +		/*
> > +		 * It is safe to sample incoming->ser->nr_devices and then
> > +		 * drop the rwsem since nr_devices will only decrease. Thus the
> > +		 * only "race" is that the current scan will be overly
> > +		 * conservative and force bus inheritance.
> > +		 */
> > +		dev->liveupdate.inherit_buses = incoming->ser->nr_devices;
> 
> Nit: inherit_buses is a bool, while compiler will handle it correctly,
> maybe we could:
> 
> dev->liveupdate.inherit_buses = !!incoming->ser->nr_devices 

+1

> 
> OR
> 
> dev->liveupdate.inherit_buses = (incoming->ser->nr_devices > 0)
> 
> for readability?
> 
> > +		pci_liveupdate_flb_put_incoming();
> > +	}
> > +
> > +out:
> > +	return dev->liveupdate.inherit_buses;
> > +}
> > +
> 
> [...]
> 
> >  		/*
> > @@ -1497,8 +1501,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
> >  		 * do in the second pass.
> >  		 */
> >  		if (!pass) {
> > -			if (pcibios_assign_all_busses() || broken)
> > -
> > +			if (assign_new_buses || broken)
> >  				/*
> >  				 * Temporarily disable forwarding of the
> >  				 * configuration cycles on all bridges in
> > @@ -1512,6 +1515,11 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
> >  			goto out;
> >  		}
> >  
> > +		if (liveupdate) {
> > +			pci_err(dev, "Cannot reconfigure bridge during Live Update, skipping\n");
> > +			goto out;
> > +		}
> 
> Quite helpful! Thanks :)
> 
> > +
> >  		/* Clear errors */
> >  		pci_write_config_word(dev, PCI_STATUS, 0xffff);
> >  
> > @@ -1572,6 +1580,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
> >  	pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
> >  
> >  	pm_runtime_put(&dev->dev);
> > +	pci_liveupdate_scan_bridge_end(dev, pass);
> >  
> >  	return max;
> >  }
> 
> With the minor nit above,
> Reviewed-by: Pranjal Shrivastava <praan@google.com>
> 
> Thanks,
> Praan




^ permalink raw reply

* Re: [PATCH v6 05/12] PCI: liveupdate: Keep bus numbers constant during Live Update
From: Pasha Tatashin @ 2026-06-14 13:57 UTC (permalink / raw)
  To: David Matlack
  Cc: kexec, linux-doc, linux-kernel, linux-mm, linux-pci,
	Adithya Jayachandran, Alexander Graf, Alex Williamson,
	Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
	Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
	Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
	Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed,
	Samiullah Khawaja, Shuah Khan, Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-6-dmatlack@google.com>

On Fri, 22 May 2026 20:24:03 +0000, David Matlack <dmatlack@google.com> wrote:
> diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
> index 4f2ec6ffdd16..2421bc218916 100644
> --- a/drivers/pci/liveupdate.c
> +++ b/drivers/pci/liveupdate.c
> @@ -103,7 +118,7 @@
>  /**
>   * struct pci_liveupdate_global - Global state for PCI Live Update support
>   * @rwsem: Reader/writer semaphore used to protect the incoming and outgoing
> - *         FLBs, and the references to them in struct pci_dev.
> + *         FLBs and references to them in struct pci_dev.

This change does not belong to this patch.

> @@ -396,6 +411,72 @@ static void pci_liveupdate_flb_put_incoming(void)
>  	liveupdate_flb_put_incoming(&pci_liveupdate_flb);
>  }
>  
> +bool pci_liveupdate_scan_bridge_begin(struct pci_bus *bus, struct pci_dev *dev,
> +				      int pass)

This function requires a header comment; it is public and not self-descriptive.

-- 
Pasha Tatashin <pasha.tatashin@soleen.com>


^ permalink raw reply

* Re: [PATCH v6 04/12] PCI: liveupdate: Document driver binding responsibilities
From: Pasha Tatashin @ 2026-06-14 13:41 UTC (permalink / raw)
  To: David Matlack
  Cc: kexec, linux-doc, linux-kernel, linux-mm, linux-pci,
	Adithya Jayachandran, Alexander Graf, Alex Williamson,
	Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
	Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
	Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
	Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed,
	Samiullah Khawaja, Shuah Khan, Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-5-dmatlack@google.com>

On Fri, 22 May 2026 20:24:02 +0000, David Matlack <dmatlack@google.com> wrote:
> Document how driver binding works during a Live Update and what the PCI
> core expects of drivers and users. Note that this is only a description
> of the current division of responsibilities. These can change in the
> future if we decide.

Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>

-- 
Pasha Tatashin <pasha.tatashin@soleen.com>


^ permalink raw reply

* Re: [PATCH v6 03/12] PCI: liveupdate: Track incoming preserved PCI devices
From: Pasha Tatashin @ 2026-06-14 13:38 UTC (permalink / raw)
  To: David Matlack
  Cc: kexec, linux-doc, linux-kernel, linux-mm, linux-pci,
	Adithya Jayachandran, Alexander Graf, Alex Williamson,
	Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
	Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
	Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
	Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed,
	Samiullah Khawaja, Shuah Khan, Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-4-dmatlack@google.com>

On Fri, 22 May 2026 20:24:01 +0000, David Matlack <dmatlack@google.com> wrote:
> diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
> index 10c9b65aa242..e68ae5c172d4 100644
> --- a/drivers/pci/Kconfig
> +++ b/drivers/pci/Kconfig
> @@ -330,7 +330,7 @@ config VGA_ARB_MAX_GPUS
>  
>  config PCI_LIVEUPDATE
>  	bool "PCI Live Update Support"
> -	depends on PCI && LIVEUPDATE
> +	depends on PCI && LIVEUPDATE && 64BIT

Please move this to the first patch, fewer changes between patches, and 
also KHO does not support anything but 64-bit mode.

>
> diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
> index 065d5af822f7..96c43b84532c 100644
> --- a/drivers/pci/liveupdate.c
> +++ b/drivers/pci/liveupdate.c
> @@ -128,13 +157,49 @@ static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
> [ ... skip 31 lines ... ]
> +
> +err_xa_destroy:
> +	xa_destroy(&incoming->xa);
> +	kfree(incoming);
> +err_restore_free:
> +	kho_restore_free(ser);

This is the pattern we have been enforcing in other places in LUO. If 
the first retrieval fails, return the same error thereafter.

> @@ -270,6 +335,91 @@ void pci_liveupdate_unpreserve(struct pci_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(pci_liveupdate_unpreserve);
>  
> +static struct pci_flb_incoming *pci_liveupdate_flb_get_incoming(void)
> +{
> +	struct pci_flb_incoming *incoming = NULL;
> +	int ret;

Maybe make the error return static, and avoid another search through compatible 
FLBs if it failed before?

1. Add "saved_err;"; if it is set, return it right away.
2. Change all errors to use goto save_err;, and at the end of the 
function, assign ret to saved_err;

> [ ... skip 15 lines ... ]
> +	 * This could mean that no PCI FLB data was passed by the previous
> +	 * kernel, but it could also mean the previous kernel used a different
> +	 * compatibility string (i.e. a different ABI).
> +	 */
> +	if (ret == -ENOENT) {
> +		pr_info_once("No incoming FLB matched %s\n", pci_liveupdate_flb.compatible);

I would assume this is very normal, e.g., no devices were preserved but 
memfd+hugetlb was preserved. Maybe use pr_debug_once().

> +		return NULL;
> +	}
> +
> +	/*
> +	 * There is incoming FLB data that matches pci_liveupdate_flb.compatible
> +	 * but it cannot be retrieved.
> +	 */
> +	if (ret) {
> +		WARN_ONCE(ret, "Failed to retrieve incoming FLB data\n");

No need to print backtrace, please just print a warning:
pr_warn_once("Failed to retrieve incoming FLB data: %pe\n", ERR_PTR(ret));

> [ ... skip 34 lines ... ]
> +	 * through pci_liveupdate_finish(). This can happen if PCI core probes
> +	 * the same device multiple times, e.g. due to hotplug.
> +	 */
> +	if (!dev_ser->refcount) {
> +		pci_liveupdate_flb_put_incoming();
> +		return;

Please use 'goto put_incoming'

> +	}
> +
> +	pci_info(dev, "Device was preserved by previous kernel across Live Update\n");
> +	dev->liveupdate.incoming = dev_ser;
> +
> +	/*
> +	 * Hold the ref on the incoming FLB until pci_liveupdate_finish() so
> +	 * that dev->liveupdate.incoming does not get freed while it is in use.
> +	 */

How would that work? If finish is not called FLB stays around until the 
next reboot.

-- 
Pasha Tatashin <pasha.tatashin@soleen.com>


^ permalink raw reply

* [PATCH v2] cgroup/cpuset: rebind mm mempolicy to effective_mems, not mems_allowed
From: Farhad Alemi @ 2026-06-14 13:25 UTC (permalink / raw)
  To: Andrew Morton, Waiman Long
  Cc: Farhad Alemi, David Hildenbrand, Gregory Price, Yury Norov,
	Joshua Hahn, Zi Yan, Matthew Brost, Rakie Kim, Byungchul Park,
	Ying Huang, Alistair Popple, Rasmus Villemoes, linux-mm,
	linux-kernel, cgroups, stable
In-Reply-To: <CA+0ovCg05rUk1-3k2ysdxmbcER8aG-wVh9SSTrrbp6LPWpPHYA@mail.gmail.com>

Creating a child cpuset where cpuset.mems is never set leads to a div/0
when a VMA mempolicy with MPOL_F_RELATIVE_NODES rebinds in response to a
CPU hotplug event.

Reproduction steps:
 1) Create a cgroup w/ cpuset controls (do not set cpuset.mems)
 2) Move the task into the child cpuset
 3) Create a VMA mempolicy for that task with MPOL_F_RELATIVE_NODES
 4) unplug and hotplug a cpu
      echo 0 > /sys/devices/system/cpu/cpu1/online
      echo 1 > /sys/devices/system/cpu/cpu1/online
 5) mempolicy rebind does a div/0 in mpol_relative_nodemask on the
    call to __nodes_fold()

The cpuset code passes (cs->mems_allowed) which is not guaranteed to have
nodes to the rebind routine.  Use cs->effective_mems instead, which is
guaranteed to have a non-empty nodemask.

Link: https://lore.kernel.org/linux-mm/CA+0ovCgxbZkXa+OU8w3s84R3KNPNxxRfmsNR-udh+afQBbGNmw@mail.gmail.com/
Link: https://lore.kernel.org/all/CA+0ovCiEz6SP_sn3kN4Tb+_oC=eHMXy_Ffj=usV3wREdQrUtww@mail.gmail.com/
Fixes: ae1c802382f7 ("cpuset: apply cs->effective_{cpus,mems}")
Suggested-by: Gregory Price <gourry@gourry.net>
Suggested-by: Waiman Long <longman@redhat.com>
Signed-off-by: Farhad Alemi <farhad.alemi@berkeley.edu>
Cc: stable@vger.kernel.org
---
v2: rebind to cs->effective_mems instead of newmems (Waiman Long);
    condense the changelog.

 kernel/cgroup/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2649,7 +2649,7 @@ void cpuset_update_tasks_nodemask(struct cpuset *cs)

 		migrate = is_memory_migrate(cs);

-		mpol_rebind_mm(mm, &cs->mems_allowed);
+		mpol_rebind_mm(mm, &cs->effective_mems);
 		if (migrate)
 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
 		else
-- 
2.43.0


^ permalink raw reply

* Re: [PATCH 1/1] liveupdate: luo_file: Add internal APIs for file preservation
From: Pranjal Shrivastava @ 2026-06-14 12:48 UTC (permalink / raw)
  To: Samiullah Khawaja
  Cc: Pasha Tatashin, Mike Rapoport, Pratyush Yadav, Alexander Graf,
	David Matlack, tarunsahu, open list,
	open list:KEXEC HANDOVER (KHO), open list:KEXEC HANDOVER (KHO)
In-Reply-To: <20260613012521.835490-2-skhawaja@google.com>

On Sat, Jun 13, 2026 at 01:25:20AM +0000, Samiullah Khawaja wrote:
> From: Pasha Tatashin <pasha.tatashin@soleen.com>
> 
> Live update orchestrator file handlers depend on the preservation of
> other files. To make sure that the dependency is preserved, the file
> handlers needs to fetch the preservation token of the preserved
> dependency. Similarly during restore, a file handler wants to fetch the
> restored file of the dependency.
> 
> Add APIs that allows fetching token of dependency during preservation,
> and fetching the restored file dependency during restore.
> 
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> ---
>  include/linux/liveupdate.h       | 21 ++++++++++
>  kernel/liveupdate/luo_file.c     | 69 ++++++++++++++++++++++++++++++++
>  kernel/liveupdate/luo_internal.h | 17 ++++++++
>  3 files changed, 107 insertions(+)
>
[...]
> +EXPORT_SYMBOL_GPL(liveupdate_get_token_outgoing);
> +
> +/**
> + * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use.
> + * @s:      The incoming liveupdate session (restored from the previous kernel).
> + * @token:  The unique token identifying the file to retrieve.
> + * @filep:  On success, this will be populated with a pointer to the retrieved
> + *          'struct file'.
> + *
> + * Provides a kernel-internal API for other subsystems to retrieve their
> + * preserved files after a live update. This function is a simple wrapper
> + * around luo_retrieve_file(), allowing callers to find a file by its token.
> + *
> + * The caller receives a new reference to the file and must call fput() when it
> + * is no longer needed. The file's lifetime is managed by LUO and any userspace
> + * file descriptors.
> + *
> + * Context: It must be called with session mutex acquired of a restored session.
> + * Return: 0 on success. Returns -ENOENT if no file with the matching token is
> + *         found, or any other negative errno on failure.
> + */
> +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
> +				 struct file **filep)
> +{
> +	return luo_retrieve_file(luo_file_set_from_session_locked(s),
> +				 token, filep);
> +}
> +EXPORT_SYMBOL_GPL(liveupdate_get_file_incoming);

Thanks for modifying the comment (as discussed in [1]). 
This looks good now, along with the EXPORT_SYMBOL_GPL.

Reviewed-by: Pranjal Shrivastava <praan@google.com>

Thanks,
Praan

[1] https://lore.kernel.org/all/agr6yoyYYq2QFxjL@google.com/


^ permalink raw reply

* Re: [PATCH v2 00/10] sh: remove NUMA and SPARSEMEM support
From: John Paul Adrian Glaubitz @ 2026-06-14 12:32 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-sh, Andrew Morton, Arnd Bergmann, Rich Felker,
	Yoshinori Sato, linux-kernel, linux-mm
In-Reply-To: <ai6Diz0UgSGccbhY@kernel.org>

Hi Mike,

On Sun, 2026-06-14 at 13:33 +0300, Mike Rapoport wrote:
> On Tue, Jun 09, 2026 at 09:21:37AM +0200, John Paul Adrian Glaubitz wrote:
> > Hi Mike,
> > 
> > On Wed, 2026-06-03 at 18:32 +0300, Mike Rapoport wrote:
> > > On Mon, May 18, 2026 at 02:05:39PM +0200, John Paul Adrian Glaubitz wrote:
> > > > Hi Mike,
> > > > 
> > > > On Mon, 2026-05-18 at 14:43 +0300, Mike Rapoport wrote:
> > > > > Gentle ping?
> > > > 
> > > > It's on my TODO list for this week!
> > > 
> > > It's sad to see this being dragged since mid April (if we count v1 and
> > > there were really minor changes in v2).
> > 
> > I apologize. I am doing the maintenance as a hobby in my free time, it's
> > not my primary job and it can sometimes take me a bit longer to take up
> > changes.
> > 
> > > If you don't have time to take care of that, just say so and we'll take
> > > this via one of the mm trees in the next cycle.
> > 
> > It should be better this week. I've been recently busy with CVE fixes during
> > my dayjob and the workload was extremely high.
> > 
> > I am not going to let this slip, don't worry. It's just been a bit too much
> > stress the past weeks due to the AI CVE reporting.
> 
> I understand that this is a hobby for you and there are a day job and other
> obligations and you don't have time for timely responses for arch/sh
> patches.
>  
> I just don't understand why do you insist on taking this via sh tree given
> you don't have the resources to timely deal with the patches.

Because I want to learn something in the process and also perform some basic
testing where possible. I know it takes longer and I can only ask for some
patience, but I will take care of it.

I will get these changes landed for 7.2, promised.

> This set can perfectly go via mm tree as it cleanups a memory management
> feature that should not have been added to sh at the first place.

I appreciate your efforts in cleaning up arch/sh, so I don't want to stress
your patience too much. Again, I will get this into 7.2. Don't worry!

Adrian

-- 
 .''`.  John Paul Adrian Glaubitz
: :' :  Debian Developer
`. `'   Physicist
  `-    GPG: 62FF 8A75 84E0 2956 9546  0006 7426 3B37 F5B5 F913


^ permalink raw reply

* Re: [PATCH v2 17/18] memblock: allow calculating reserved size by flags
From: Mike Rapoport @ 2026-06-14 12:02 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Pasha Tatashin, Alexander Graf, Muchun Song,
	Oscar Salvador, David Hildenbrand, Andrew Morton, Jason Miu,
	Jork Loeser, kexec, linux-mm, linux-kernel
In-Reply-To: <20260605183501.3884950-18-pratyush@kernel.org>

On Fri, 05 Jun 2026 20:34:50 +0200, Pratyush Yadav <pratyush@kernel.org> wrote:
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index b3b4a6145fad..a3b57066611d 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -487,7 +487,8 @@ static inline __init_memblock bool memblock_bottom_up(void)
>  
>  phys_addr_t memblock_phys_mem_size(void);
>  phys_addr_t memblock_reserved_size(void);
> -phys_addr_t memblock_reserved_kern_size(phys_addr_t limit, int nid);
> +phys_addr_t memblock_reserved_size_flags(phys_addr_t limit, int nid,
> +					 enum memblock_flags flags);

Ugh, I'd hate memblock_reserved_hugetlb_size() less ;-)

-- 
Sincerely yours,
Mike.



^ permalink raw reply

* Re: [PATCH v2 16/18] memblock: make HugeTLB bootmem allocation work with KHO
From: Mike Rapoport @ 2026-06-14 12:02 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Pasha Tatashin, Alexander Graf, Muchun Song,
	Oscar Salvador, David Hildenbrand, Andrew Morton, Jason Miu,
	Jork Loeser, kexec, linux-mm, linux-kernel
In-Reply-To: <20260605183501.3884950-17-pratyush@kernel.org>

On Fri, 05 Jun 2026 20:34:49 +0200, Pratyush Yadav <pratyush@kernel.org> wrote:
> Gigantic huge page allocation is somewhat broken currently when KHO is
> used.
> 
> Firstly, they break KHO scratch size accounting. RSRV_KERN is used to
> track how much memory is reserved for use by the kernel. Since
> alloc_bootmem() calls the memblock_alloc*() APIs, the hugepages

hugetlb::alloc_bootmem()

> [...]
> First, it does not use mirrored memory for hugetlb. Mirrored memory is a
> limited resource that is best saved for kernel data structures, not user
> memory.
> 
> Second, if the memory found overlaps with KHO scratch areas, it discards
> the memory and retries.

This sentence is somewhat hard to parse.

>
>
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 6349c48154f4..131e54dd5d8d 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -1756,6 +1761,69 @@ void * __init memblock_alloc_try_nid_raw(
> [ ... skip 51 lines ... ]
> +		if (memblock_bottom_up())
> +			start = addr + size;
> +		else
> +			start = addr - size;
> +
> +		goto retry;

Hmm, two goto retry don't seem nice :/
Although I can't see how to improve it really.

Maybe add a helper for going the node fallback?

-- 
Sincerely yours,
Mike.



^ permalink raw reply

* Re: [PATCH v2 15/18] kho: extend scratch
From: Mike Rapoport @ 2026-06-14 12:02 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Pasha Tatashin, Alexander Graf, Muchun Song,
	Oscar Salvador, David Hildenbrand, Andrew Morton, Jason Miu,
	Jork Loeser, kexec, linux-mm, linux-kernel
In-Reply-To: <20260605183501.3884950-16-pratyush@kernel.org>

On Fri, 05 Jun 2026 20:34:48 +0200, Pratyush Yadav <pratyush@kernel.org> wrote:
> diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
> index af22086ca2d6..8540608b8602 100644
> --- a/kernel/liveupdate/kexec_handover.c
> +++ b/kernel/liveupdate/kexec_handover.c
> @@ -869,6 +886,119 @@ static void __init kho_reserve_scratch(void)
>  	kho_enable = false;
>  }
>  
> +#define KHO_EXT_SHIFT 30 /* 1 GiB */

Please add a comment why exactly 1 GiB.
I'd also define a size macro as SZ_1G and make shift const_ilog2(SIZE)

> +
> +static int __init kho_ext_walk_key(unsigned long key, void *data)

Maybe _leaf? No strong feelings though

> +{
> +	struct kho_radix_tree *tree = data;

Would be nice to say which tree in the variable name ;)

> [ ... skip 15 lines ... ]
> +	return 0;
> +}
> +
> +static int __init kho_ext_walk_node(phys_addr_t phys, void *data)
> +{
> +	struct kho_radix_tree *tree = data;

Ditto

> [ ... skip 15 lines ... ]
> +
> +	*prev_end = start + (1UL << KHO_EXT_SHIFT);
> +	return 0;
> +}
> +
> +/**

I don't think we expose statics as kernel-doc somewhere, so this
probably shouldn't be a kernel-doc comment

> + * kho_extend_scratch - Extend the scratch regions
> + *
> + * The KHO radix tree mixes both physical address and order into a single key.

Here it's rather the preserved memory map radix tree or something like
that.

> + * This makes it hard to look for free ranges directly. This function first
> + * walks the radix tree and digests it down into another radix tree, whose keys

Here as well, maybe don't even mention radix to make it shorter, the
important part is that we walk the preserved memory map and create a
radix tree that identifies blocks around the preserved memory.

> + * identify blocks of KHO_EXT_SHIFT which contain preserved memory.
> + *
> + * Then it walks the digested radix tree and marks everything that doesn't have
> + * preserved memory as scratch.
> + *
> + * NOTE: This function allocates memory so it should be called when scratch has
> + * available space.
> + *
> + * NOTE: The pages of the KHO radix tree tables are not marked as preserved in

^ preserved memory map radix tree :)

> + * the KHO tree. But they are expected to remain untouched until the tree is
> + * fully parsed. So this function also considers them to be "preserved memory"
> + * and marks their blocks as busy.
> + */
> +static void __init kho_extend_scratch(void)
> +{
> +	const struct kho_radix_walk_cb kho_cb = {
> +		.leaf = kho_ext_walk_key,
> +		.node = kho_ext_walk_node,
> +	};
> +	const struct kho_radix_walk_cb ext_cb = {
> +		.leaf = kho_ext_mark_scratch,
> +	};
> +	struct kho_radix_tree radix;

sashiko says:

  Is it possible for the radix variable to contain uninitialized stack memory
  here?
  If radix is uninitialized, tree->root might contain garbage data when passed
  to kho_radix_init_tree()

and I agree :)

This should be

	struct kho_radix_tree radix = { 0 };

-- 
Sincerely yours,
Mike.



^ permalink raw reply

* Re: [PATCH v2 14/18] memblock: use kho_scratch_overlap() to decide migratetype
From: Mike Rapoport @ 2026-06-14 12:02 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Pasha Tatashin, Alexander Graf, Muchun Song,
	Oscar Salvador, David Hildenbrand, Andrew Morton, Jason Miu,
	Jork Loeser, kexec, linux-mm, linux-kernel
In-Reply-To: <20260605183501.3884950-15-pratyush@kernel.org>

On Fri, 05 Jun 2026 20:34:47 +0200, Pratyush Yadav <pratyush@kernel.org> wrote:
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 5afcd99aa8c1..546d7ef798b8 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -618,7 +619,7 @@ bool memblock_is_kho_scratch_memory(phys_addr_t addr);
>  static inline enum migratetype kho_scratch_migratetype(unsigned long pfn,
>  						       enum migratetype mt)
>  {
> -	if (memblock_is_kho_scratch_memory(PFN_PHYS(pfn)))
> +	if (kho_scratch_overlap(PFN_PHYS(pfn), pageblock_nr_pages << PAGE_SHIFT))

I'd move it to kexec_handover.h and kill
memblock_is_kho_scratch_memory() in the same patch

BTW, please double check that the extended scratch does not require
updates to efi_init()::reserve_regions().

-- 
Sincerely yours,
Mike.



^ permalink raw reply

* Re: [PATCH v2 12/18] kho: export kho_scratch_overlap()
From: Mike Rapoport @ 2026-06-14 12:02 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Pasha Tatashin, Alexander Graf, Muchun Song,
	Oscar Salvador, David Hildenbrand, Andrew Morton, Jason Miu,
	Jork Loeser, kexec, linux-mm, linux-kernel
In-Reply-To: <20260605183501.3884950-13-pratyush@kernel.org>

On Fri, 05 Jun 2026 20:34:45 +0200, Pratyush Yadav <pratyush@kernel.org> wrote:

Subject: kho: export kho_scratch_overlap()

Nit: we are not really exporting this in the sense of EXPORT_SYMBOL, we
rather make it available for !kho users and regardless of KHO_DEBUG.

-- 
Sincerely yours,
Mike.



^ permalink raw reply

* Re: [PATCH v2 03/18] kho: return virtual address of mem_map
From: Mike Rapoport @ 2026-06-14 12:02 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Pasha Tatashin, Alexander Graf, Muchun Song,
	Oscar Salvador, David Hildenbrand, Andrew Morton, Jason Miu,
	Jork Loeser, kexec, linux-mm, linux-kernel
In-Reply-To: <20260605183501.3884950-4-pratyush@kernel.org>

On Fri, 05 Jun 2026 20:34:36 +0200, Pratyush Yadav <pratyush@kernel.org> wrote:
> diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
> index e8454dc5b489..d8dd0ede4f87 100644
> --- a/kernel/liveupdate/kexec_handover.c
> +++ b/kernel/liveupdate/kexec_handover.c
> @@ -521,7 +522,11 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
>  		return 0;

sashiko:

  Does this error path need to be updated to return NULL?
  Since the function signature was changed to return a void pointer
  instead of phys_addr_t, returning a plain integer 0 instead of NULL
  might trigger static analysis warnings.

> @@ -1668,8 +1672,8 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
>  		goto unmap_fdt;
>  	}
>  
> -	mem_map_phys = kho_get_mem_map_phys(fdt);
> -	if (!mem_map_phys)
> +	mem_map = kho_get_mem_map(fdt);
> +	if (!mem_map)

Here we can't use mem_map as an actual pointer because phys_to_virt()
returns a pre-KASLR address.

Since we only call get_mem_map() here to verify that it's available in
the FDT and don't care about the actual virtual address, let's add a
comment about that, drop the mem_map variable and directly check

	if (!kho_get_mem_map_phys())

-- 
Sincerely yours,
Mike.



^ permalink raw reply

* Re: [PATCH v2 00/18] kho: make boot time huge page allocation work nicely with KHO
From: Mike Rapoport @ 2026-06-14 12:02 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Pasha Tatashin, Alexander Graf, Muchun Song,
	Oscar Salvador, David Hildenbrand, Andrew Morton, Jason Miu,
	Jork Loeser, kexec, linux-mm, linux-kernel
In-Reply-To: <20260605183501.3884950-1-pratyush@kernel.org>

On Fri, 05 Jun 2026 20:34:33 +0200, Pratyush Yadav <pratyush@kernel.org> wrote:

Hi,

> [...]
> allocated from scratch, they will fail to be preserved with the upcoming
> hugetlb preservation series [0].
> 
> Fix this by introducing the concept of extended scratch areas. They are
> areas that the kernel discovers on boot by walking the radix tree and
> finding free memory ranges. See patch 10 for more details.

Overall LGTM.

I have some small comments here and there for now as I didn't get into
all the details yet.

-- 
Sincerely yours,
Mike.



^ permalink raw reply

* Re: [PATCH v2 00/10] sh: remove NUMA and SPARSEMEM support
From: Mike Rapoport @ 2026-06-14 10:33 UTC (permalink / raw)
  To: John Paul Adrian Glaubitz
  Cc: linux-sh, Andrew Morton, Arnd Bergmann, Rich Felker,
	Yoshinori Sato, linux-kernel, linux-mm
In-Reply-To: <865aaa7aa64ab69ca9020d64d86baa8d9b700bcb.camel@physik.fu-berlin.de>

Hi Adrian,

On Tue, Jun 09, 2026 at 09:21:37AM +0200, John Paul Adrian Glaubitz wrote:
> Hi Mike,
> 
> On Wed, 2026-06-03 at 18:32 +0300, Mike Rapoport wrote:
> > On Mon, May 18, 2026 at 02:05:39PM +0200, John Paul Adrian Glaubitz wrote:
> > > Hi Mike,
> > > 
> > > On Mon, 2026-05-18 at 14:43 +0300, Mike Rapoport wrote:
> > > > Gentle ping?
> > > 
> > > It's on my TODO list for this week!
> > 
> > It's sad to see this being dragged since mid April (if we count v1 and
> > there were really minor changes in v2).
> 
> I apologize. I am doing the maintenance as a hobby in my free time, it's
> not my primary job and it can sometimes take me a bit longer to take up
> changes.
>
> > If you don't have time to take care of that, just say so and we'll take
> > this via one of the mm trees in the next cycle.
> 
> It should be better this week. I've been recently busy with CVE fixes during
> my dayjob and the workload was extremely high.
> 
> I am not going to let this slip, don't worry. It's just been a bit too much
> stress the past weeks due to the AI CVE reporting.

I understand that this is a hobby for you and there are a day job and other
obligations and you don't have time for timely responses for arch/sh
patches.
 
I just don't understand why you insist on taking this via the sh tree given
you don't have the resources to deal with the patches in a timely manner.

This set can perfectly well go via the mm tree as it cleans up a memory
management feature that should not have been added to sh in the first place.
 
> Adrian

-- 
Sincerely yours,
Mike.


^ permalink raw reply

* Re: [PATCH v3 3/3] selftests/kho: add LoongArch vmtest support
From: Mike Rapoport @ 2026-06-14 10:23 UTC (permalink / raw)
  To: George Guo
  Cc: Huacai Chen, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
	Shuah Khan, George Guo, WANG Xuerui, Alexander Graf, loongarch,
	linux-kernel, kexec, linux-mm, linux-kselftest, Kexin Liu
In-Reply-To: <20260601093930.112758-3-dongtai.guo@linux.dev>

On Mon, 01 Jun 2026 17:39:30 +0800, George Guo <dongtai.guo@linux.dev> wrote:

Hi,

> Add loongarch.conf to configure QEMU's LoongArch virt machine with a

Please spell out "virtual machine" here and below or use the VM abbreviation.

>
>
> diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh
> index 49fdac8e8b15..918698b6dd2a 100755
> --- a/tools/testing/selftests/kho/vmtest.sh
> +++ b/tools/testing/selftests/kho/vmtest.sh
> @@ -107,12 +107,20 @@ function run_qemu() {
> [ ... skip 14 lines ... ]
> +		-kernel "$kernel"
> +		-initrd "$initrd"
> +	)
> +
> +	if [[ -n "${QEMU_TIMEOUT:-}" ]]; then
> +		timeout "$QEMU_TIMEOUT" $qemu_cmd "${qemu_args[@]}" || true

Running with a timeout can actually be useful for other architectures. Let's
make a local variable for a timeout of 120 seconds and always run qemu with a timeout.

-- 
Sincerely yours,
Mike.



^ permalink raw reply

* Re: [PATCH v3 14/19] mm/hugetlb: Free cross-zone bootmem gigantic pages after allocation
From: Mike Rapoport @ 2026-06-14  9:55 UTC (permalink / raw)
  To: Muchun Song
  Cc: Muchun Song, Oscar Salvador, David Hildenbrand, Andrew Morton,
	Madhavan Srinivasan, Michael Ellerman, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, linux-mm, linux-kernel,
	Nicholas Piggin, Christophe Leroy (CS GROUP),
	Ritesh Harjani (IBM), Aneesh Kumar K.V, linuxppc-dev, rppt
In-Reply-To: <1ABCD934-CA2C-4541-9B76-32FAEFD398FF@linux.dev>

On Wed, Jun 03, 2026 at 10:53:04AM +0800, Muchun Song wrote:
> > On Tue, 02 Jun 2026 18:10:34 +0800, Muchun Song <songmuchun@bytedance.com> wrote:
> >> 
> >> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> >> index 5e557c05d80a..218fb1ca45f4 100644
> >> --- a/mm/hugetlb.c
> >> +++ b/mm/hugetlb.c
> >> @@ -3073,22 +3076,38 @@ static bool __init alloc_bootmem_huge_page(struct hstate *h, int nid)
> >> [ ... skip 26 lines ... ]
> >> + 	* pages belonging to the requested node.
> >> + 	*/
> >> + 	if (WARN_ON_ONCE(nid_request != NUMA_NO_NODE && nid != nid_request))
> >> + 		list_add(&m->list, &huge_boot_pages[nid_request]);
> >> + 	else
> >> + 		list_add(&m->list, &huge_boot_pages[nid]);
> > 
> > Can we just memblock_free() the page that intersects zones here?
> 
> I had previously considered doing this, but then I realized that if we free the
> allocated cross-zone memory here, memblock is very likely to select the exact
> same block for the next allocation. This means we'd just end up with this
> cross-zone memory again, degrading allocation efficiency. Unless there is a way
> to mark the block so memblock avoids reallocating it, I ultimately chose to
> defer the release to prevent this issue from happening.

You are right, there's no simple way to avoid memblock using the same
range.

The comment at hugetlb_hstate_alloc_pages() hints that we might want to
split allocation of gigantic pages to be more explicit as a followup
rework and then freeing of cross-zone pages would be cleaner as well.
 
> Thanks.

-- 
Sincerely yours,
Mike.


^ permalink raw reply

* Re: [PATCH v4 14/19] mm/hugetlb: Free cross-zone bootmem gigantic pages after allocation
From: Mike Rapoport @ 2026-06-14  9:46 UTC (permalink / raw)
  To: Muchun Song
  Cc: Oscar Salvador, David Hildenbrand, Andrew Morton,
	Madhavan Srinivasan, Michael Ellerman, Muchun Song,
	Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, linux-mm,
	linux-kernel, Nicholas Piggin, Christophe Leroy, Ritesh Harjani,
	Aneesh Kumar K . V, linuxppc-dev, Mike Kravetz
In-Reply-To: <20260612035903.2468601-15-songmuchun@bytedance.com>

On Fri, Jun 12, 2026 at 11:58:58AM +0800, Muchun Song wrote:
> Now that hugetlb reservation runs after zone initialization, bootmem
> gigantic page allocation can detect pages that span multiple zones.
> 
> Keep those cross-zone pages separate during allocation and free them
> after allocation completes, so later hugetlb initialization only sees
> zone-valid gigantic pages.
> 
> This chooses to free cross-zone gigantic pages directly instead of
> retrying allocation. In practice, such cross-zone cases are expected to
> be very rare, so adding retry logic does not seem justified at this
> point. Keeping the handling simple also preserves the previous behavior.
> If similar real-world reports show up later, retry support can be
> reconsidered then.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>

Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

> ---
>  mm/hugetlb.c | 75 ++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 64 insertions(+), 11 deletions(-)

-- 
Sincerely yours,
Mike.


^ permalink raw reply

* Re: [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs
From: Will Deacon @ 2026-06-14  9:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi, Peter Zijlstra, Catalin Marinas,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	Andrew Morton, David Hildenbrand, Mike Rapoport, Emil Tsalapatis,
	sched-ext, bpf, x86, linux-arm-kernel, linux-mm, linux-kernel
In-Reply-To: <20260522172219.1423324-2-tj@kernel.org>

On Fri, May 22, 2026 at 07:22:12AM -1000, Tejun Heo wrote:
> Add ptep_try_set(ptep, new_pte): atomically set *ptep to new_pte iff it is
> currently pte_none(). Returns true on success, false if the slot was already
> populated or the arch has no implementation.
> 
> The intended caller is the upcoming bpf_arena kernel-side fault recovery
> path. The install runs from a page fault that can be nested under locks
> held by the faulting kernel caller (e.g. a BPF program holding
> raw_res_spin_lock_irqsave on its arena's spinlock), so trylock-and-retry
> would A-A deadlock. Lock-free cmpxchg is the only viable option, which
> constrains this helper to special kernel page tables where concurrent
> writers cooperate via atomic accessors.
> 
> The generic version in <linux/pgtable.h> returns false. x86 and arm64
> override with try_cmpxchg-based implementations on the underlying pteval.
> Other architectures get the false stub - the callers there already fall
> through to oops.
> 
> v2: Rename to ptep_try_set(). Tighten kerneldoc. (David, Alexei)
> v3: Note that strict-zero cmpxchg is narrower than pte_none(). (Andrea)
> 
> Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reviewed-by: Andrea Righi <arighi@nvidia.com>
> Cc: David Hildenbrand <david@kernel.org>
> ---
>  arch/arm64/include/asm/pgtable.h | 12 ++++++++++++
>  arch/x86/include/asm/pgtable.h   | 12 ++++++++++++
>  include/linux/pgtable.h          | 25 +++++++++++++++++++++++++
>  3 files changed, 49 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 9029b81ccbe8..28bada97d443 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -1830,6 +1830,18 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>  	return __ptep_get_and_clear(mm, addr, ptep);
>  }
>  
> +/*
> + * Note: strictly-zero compare is narrower than pte_none(), but the gap is
> + * harmless: a fresh kernel PTE has no software bits set.
> + */

This comment really confused me :/

What is a "fresh" kernel PTE and why do you specifically call out "software
bits" if the CAS requires all 64 bits to be 0? Why is that narrower than
pte_none() given that pte_none() for arm64 is:

#define pte_none(pte)           (!pte_val(pte))

Will


^ permalink raw reply

page: next (older)
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox