Linux Documentation
 help / color / mirror / Atom feed
* [PATCH 4/4] mips: vmcore_info: export mips arch-specific struct offsets to vmcoreinfo
From: Pnina Feder @ 2026-06-22 21:14 UTC (permalink / raw)
  To: Andrew Morton, Baoquan He, Mike Rapoport, Pasha Tatashin,
	Pratyush Yadav, Thomas Bogendoerfer, Paul Walmsley,
	Palmer Dabbelt, Albert Ou
  Cc: Dave Young, Jonathan Corbet, Alexandre Ghiti, kexec, linux-kernel,
	linux-mips, linux-riscv, linux-doc, Pnina Feder
In-Reply-To: <20260622211430.4008899-1-pnina.feder@mobileye.com>

Export MIPS architecture-specific struct offsets needed by the
vmcore-tasks tool, including signal frame layouts and register
context structures used to reconstruct user-space register state
from a vmcore dump.

Signed-off-by: Pnina Feder <pnina.feder@mobileye.com>
---
 .../admin-guide/kdump/vmcoreinfo.rst          | 34 +++++++++++++++++++
 arch/mips/kernel/Makefile                     |  1 +
 arch/mips/kernel/signal.c                     |  8 +++++
 arch/mips/kernel/vmcore_info.c                | 22 ++++++++++++
 4 files changed, 65 insertions(+)
 create mode 100644 arch/mips/kernel/vmcore_info.c

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 3c364434b846..4af32ddf5615 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -494,6 +494,40 @@ Used to get the vmalloc_start address from the high_memory symbol.
 
 The maximum number of CPUs.
 
+MIPS
+====
+
+(rt_sigframe, rs_uc)
+--------------------
+
+Offset of the ucontext member within the MIPS rt_sigframe structure.
+Used to locate the signal context within a signal frame on the user
+stack.
+
+(sigcontext, sc_regs)
+---------------------
+
+Offset of the saved register array within struct sigcontext. Used to
+extract user-space register state from signal frames in a vmcore dump.
+
+PAGE_SHIFT
+----------
+
+The base-2 logarithm of the page size. Used for page frame number
+calculations during address translation.
+
+_PFN_MASK|_PAGE_PRESENT|_PAGE_VALID|_PAGE_GLOBAL
+-------------------------------------------------
+
+Page table entry bit masks and flags. Used for walking MIPS page tables
+and translating virtual to physical addresses in a vmcore dump.
+
+PTRS_PER_PGD|PTRS_PER_PMD|PTRS_PER_PTE
+---------------------------------------
+
+Number of entries per page table level. Used for page table walking
+during virtual-to-physical address translation.
+
 powerpc
 =======
 
diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
index 95a1e674fd67..99f2961f6ee1 100644
--- a/arch/mips/kernel/Makefile
+++ b/arch/mips/kernel/Makefile
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_perf_event_mipsxx.o = $(CC_FLAGS_FTRACE)
 endif
 
 obj-$(CONFIG_CEVT_BCM1480)	+= cevt-bcm1480.o
+obj-$(CONFIG_VMCORE_INFO)	+= vmcore_info.o
 obj-$(CONFIG_CEVT_R4K)		+= cevt-r4k.o
 obj-$(CONFIG_CEVT_DS1287)	+= cevt-ds1287.o
 obj-$(CONFIG_CEVT_GT641XX)	+= cevt-gt641xx.o
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 4a10f18a8806..f2241f52fa17 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -26,6 +26,7 @@
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
 #include <linux/resume_user_mode.h>
+#include <linux/vmcore_info.h>
 
 #include <asm/abi.h>
 #include <asm/asm.h>
@@ -62,6 +63,13 @@ struct rt_sigframe {
 	struct ucontext rs_uc;
 };
 
+#ifdef CONFIG_VMCORE_INFO
+void mips_rt_signal_frame(void)
+{
+	VMCOREINFO_OFFSET(rt_sigframe, rs_uc);
+}
+#endif
+
 #ifdef CONFIG_MIPS_FP_SUPPORT
 
 /*
diff --git a/arch/mips/kernel/vmcore_info.c b/arch/mips/kernel/vmcore_info.c
new file mode 100644
index 000000000000..5d7fdc662065
--- /dev/null
+++ b/arch/mips/kernel/vmcore_info.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/vmcore_info.h>
+
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+
+extern void mips_rt_signal_frame(void);
+
+void arch_crash_save_vmcoreinfo(void)
+{
+	mips_rt_signal_frame();
+	VMCOREINFO_OFFSET(sigcontext, sc_regs);
+	VMCOREINFO_NUMBER(PAGE_SHIFT);
+	VMCOREINFO_NUMBER(_PFN_MASK);
+	VMCOREINFO_NUMBER(_PAGE_PRESENT);
+	VMCOREINFO_NUMBER(_PAGE_VALID);
+	VMCOREINFO_NUMBER(_PAGE_GLOBAL);
+	VMCOREINFO_NUMBER(PTRS_PER_PGD);
+	VMCOREINFO_NUMBER(PTRS_PER_PMD);
+	VMCOREINFO_NUMBER(PTRS_PER_PTE);
+}
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v4 0/5] mm/zswap: Implement per-cgroup proactive writeback
From: Yosry Ahmed @ 2026-06-22 21:29 UTC (permalink / raw)
  To: Youngjun Park
  Cc: Hao Jia, Muchun Song, akpm, tj, hannes, shakeel.butt, mhocko,
	mkoutny, nphamcs, chengming.zhou, roman.gushchin, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <ajkIkyajJEW2b7/0@yjaykim-PowerEdge-T330>

On Mon, Jun 22, 2026 at 3:04 AM Youngjun Park <youngjun.park@lge.com> wrote:
>
> On Mon, Jun 22, 2026 at 02:08:49PM +0800, Hao Jia wrote:
> >
> >
> > On 2026/6/21 12:20, Muchun Song wrote:
> > >
> > >
> > > > On Jun 18, 2026, at 12:48, Hao Jia <jiahao.kernel@gmail.com> wrote:
> > > >
> > > > From: Hao Jia <jiahao1@lixiang.com>
> > > >
> > > > Zswap currently writes back pages to backing swap reactively, triggered
> > > > either by the shrinker or by the pool reaching its size limit. Although
> > > > proactive memory reclaim can automatically write back a portion of zswap
> > > > pages via the shrinker, it cannot explicitly control the amount of
> > > > writeback for a specific memory cgroup. Moreover, proactive memory reclaim
> > > > may not always be triggered during a steady state.
> > > >
> > > > In certain scenarios, it is desirable to trigger writeback in advance to
> > > > free up memory. For example, users may want to prepare for an upcoming
> > > > memory-intensive workload by flushing cold memory to the backing storage
> > > > when the system is relatively idle.
> > > >
> > > > This patch series introduces a "zswap_writeback_only" key to memory.reclaim
> > > > cgroup interface, allowing users to proactively write back cold compressed
> > > > data from zswap to the backing swap device. When specified, this key
> > > > bypasses standard memory reclaim and exclusively performs proactive zswap
> > > > writeback up to the requested budget. If omitted, the default reclaim
> > > > behavior remains unchanged.
> > > >
> > > > Example usage:
> > > >   # Write back 10MB of compressed data from zswap to the backing swap
> > > >   echo "10M zswap_writeback_only" > memory.reclaim
> > >
> > > I’m not entirely sure if other candidate names were already brought up
> > > in previous discussions, so my apologies if I'm repeating something here!
> > > I do think expanding memory.reclaim is a great approach. That said, I
> > > was wondering if we could make the interface a bit more concise while
> > > keeping it flexible for future extensions.
> > >
> > > Essentially, what we want is to control the specific targets of the reclaim
> > > process—such as file, anon, or zswap. What do you think about using
> > > something like "source=zswap"? For instance, if we want to reclaim 10M from
> > > zswap, the command would look like this:
> > >
> > >     echo "10M source=zswap" > memory.reclaim

I like this suggestion, but I think ultimately we want proactive zswap
writeback to be part of a more general proactive swap demotion, and
zswap is just a swap tier.

> > >
[..]

>
> I also preferred sharing the `memory.reclaim` interface in the future swap demotion,
> since it already takes `zswap_writeback_only`.
> https://lore.kernel.org/all/aieUQUBHI+E3uNPW@yjaykim-PowerEdge-T330/
>
> Alternatively, we could use a separate interface as Yosry suggested
> (e.g. 'swap.tiers.demote'?).
>
> But as Nhat pointed out, allowing user-triggered demotion from the swap tier
> perspective could lead to issues like LRU inversion. We probably need to
> discuss whether this kind of user-triggered tier demotion will actually be
> supported at all.
> https://lore.kernel.org/linux-mm/CAKEwX=NfSy0XiD_UMsDOHGCwpE7sYmBmhV4Y9vk_cbnnr6J6PQ@mail.gmail.com/

I believe what Nhat said is that swap demotion may be used to
prevent/alleviate LRU inversion, not cause it. I don't see how
demotion can cause LRU inversion.

>
> So, IMHO..
>
> 1. If swap tier demotion is NOT exposed.
>
> We can simply choose between "source=" and `zswap_writeback_only` based
> on preference. (since there is no need to consider "swap_tier" demotion.)
>
> However, "source=" seems to offer better extensibility if it is expanded
> to file and anon use cases in the future.
>
> 2. If swap tier demotion IS exposed.
> We need to consider integration vs decoupling.
>
> (In my view, This is a design consideration. avoiding potentially
> redundant interfaces vs adding a new one if it is architecturally correct.)
>
> 2.1 Integration
>  - Integrating into 'memory.reclaim':
>   - "source=": Seems easier to integrate by explicitly specifying the target. (Your suggestion)
>   - 'zswap_writeback_only': Harder to integrate than "source=".
>
>  - Integrating into 'memory.swap.tiers.demote'
>   - 'memory.swap.tiers.demote' could absorb the memory.reclaim functionality.
>   (But since we only want to allow tiering for vswap+zswap cases like
>   the zswap writeback feature as we discussed, the reclaim interface behavior might
>   still need to stay for zswap only.)
>
> 2.2 Decoupling
>  - 'memory.swap.tiers.demote' handles other swap devices (excluding zswap),
> while "source=" or 'zswap_writeback_only' handles only zswap.

I personally think making proactive zswap writeback one use case of
proactive swap demotion makes sense. I think swap demotion in general
makes sense.

^ permalink raw reply

* Re: [PATCH 1/4] nfs: store the full NFS fileid in inode->i_ino
From: Jeff Layton @ 2026-06-22 22:38 UTC (permalink / raw)
  To: Mark Brown
  Cc: Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	linux-nfs, linux-kernel, linux-doc
In-Reply-To: <0750912a-f8dc-4714-ae11-4592d2e8eca7@sirena.org.uk>

On Mon, 2026-06-22 at 22:05 +0100, Mark Brown wrote:
> On Tue, May 12, 2026 at 12:12:42PM -0400, Jeff Layton wrote:
> > Now that inode->i_ino is a 64-bit value, store the full NFS fileid in
> > it directly instead of an XOR-folded hash. This makes NFS_FILEID() and
> > set_nfs_fileid() operate on inode->i_ino rather than the separate
> > nfsi->fileid field.
> 
> This patch is in -next now and is triggering a failure for in the LTP
> ioctl10.c test for me on arm:
> 
> tst_buffers.c:57: TINFO: Test is using guarded buffers
> tst_test.c:2047: TINFO: LTP version: 20260130
> tst_test.c:2050: TINFO: Tested kernel: 7.1.0-next-20260622 #1 SMP @1782128788 armv7l
> 
> ...
> 
> ioctl10.c:111: TFAIL: q->inode (11493907226) != entry.vm_inode (4294967295)
> 

Note that the vm_inode value is arm32's ULONG_MAX.

> arm64 seems unaffected, I didn't really investigate but I'll note that
> unsigned long is 32 bit on arm.
> 
> Full log:
> 
>    https://lava.sirena.org.uk/scheduler/job/2904745#L3852
> 
> bisect log with more test job links:
> 


The testcase does this:

static void parse_maps_file(const char *filename, const char *keyword, struct map_entry *entry)
{
        FILE *fp = SAFE_FOPEN(filename, "r");

        char line[1024];

        while (fgets(line, sizeof(line), fp) != NULL) {
                if (fnmatch(keyword, line, 0) == 0) {
                        if (sscanf(line, "%lx-%lx %s %lx %x:%x %lu %s",
                                                &entry->vm_start, &entry->vm_end, entry->vm_flags_str,
                                                &entry->vm_pgoff, &entry->vm_major, &entry->vm_minor,
                                                &entry->vm_inode, entry->vm_name) < 7)
                                tst_brk(TFAIL, "parse maps file /proc/self/maps failed");

                        entry->vm_flags = parse_vm_flags(entry->vm_flags_str);

                        SAFE_FCLOSE(fp);
                        return;
                }
        }

        SAFE_FCLOSE(fp);
        tst_brk(TFAIL, "parse maps file /proc/self/maps failed");
}

Note that it's trying to stuff the inode number field into an unsigned
long. Before this patch, the maps file would have printed the old
(hashed) inode number on 32-bit. Now, it prints the full 64-bit inode
number.

I asked The Big Pickle and it says:

"In glibc (userspace): The C standard says this is undefined behavior.
In practice, glibc's scanf internally uses strtoul/strtoull, which on
overflow store ULONG_MAX/ULLONG_MAX and set errno = ERANGE. However,
scanf itself does not propagate ERANGE to the caller — it still returns
1 (success). So you'd silently get ULONG_MAX stored."

We could argue that this is a bug in the testcase. It assumes that the
maps file will never print a value larger than ULONG_MAX in that field,
and I don't see why it would make that assumption in this day and age.

Are there actual programs in the field that scrape the maps file that
might be affected by this change?
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply

* [PATCH 1/2] cgroup/cpuset: Avoid unnecessary cpus & mems update in cpuset_hotplug_update_tasks()
From: Waiman Long @ 2026-06-22 22:45 UTC (permalink / raw)
  To: Tejun Heo, Johannes Weiner, Michal Koutný, Ridong Chen,
	Jonathan Corbet, Shuah Khan
  Cc: cgroups, linux-kernel, linux-doc, Waiman Long

As reported by sashiko [1], cpuset_hotplug_update_tasks() may perform
unnecessary task iteration and updating of tasks' CPU and node masks
when mems_allowed and/or cpus_allowed are not set in cpuset v2. It is
due to the fact that the temporary new_cpus and new_mems masks do not
inherit parent's effective_cpus/mems when they are empty which is the
expected behavior for cpuset v2 since commit 4ec22e9c5a90 ("cpuset:
Enable cpuset controller in default hierarchy").

Fix that and avoid unnecessary work by adding the empty mask checks and
inheriting the parent's versions if empty.

[1] https://sashiko.dev/#/patchset/20260621032816.1806773-1-longman%40redhat.com

Fixes: 4ec22e9c5a90 ("cpuset: Enable cpuset controller in default hierarchy")
Signed-off-by: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/cpuset.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index aff86acea701..bc0207fd6e57 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3925,6 +3925,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 	compute_effective_cpumask(&new_cpus, cs, parent);
 	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
 
+	if (is_in_v2_mode()) {
+		/* Inherit parent's effective_cpus/mems if empty */
+		if (cpumask_empty(&new_cpus))
+			cpumask_copy(&new_cpus, parent->effective_cpus);
+		if (nodes_empty(new_mems))
+			new_mems = parent->effective_mems;
+	}
+
 	if (!tmp || !cs->partition_root_state)
 		goto update_tasks;
 
-- 
2.54.0


^ permalink raw reply related

* [PATCH 2/2] cgroup/cpuset: Rebind/migrate mm only for threadgroup leader in cpuset_update_tasks_nodemask()
From: Waiman Long @ 2026-06-22 22:45 UTC (permalink / raw)
  To: Tejun Heo, Johannes Weiner, Michal Koutný, Ridong Chen,
	Jonathan Corbet, Shuah Khan
  Cc: cgroups, linux-kernel, linux-doc, Waiman Long
In-Reply-To: <20260622224509.1927419-1-longman@redhat.com>

As reported by sashiko [1], cpuset_update_tasks_nodemask() will do
mpol_rebind_mm() and possibly cpuset_migrate_mm() for all threads of
a multithreaded process. Since commit 3df9ca0a2b8b ("cpuset: migrate
memory only for threadgroup leaders"), cpuset_attach() had been updated
to rebind and migrate memory only for threadgroup leaders to mark the
group leader as the owner of the mm_struct.

To be consistent and avoid unnecessary performance overhead for heavily
multithreaded processes, follow the cpuset_attach() example and perform
memory rebind and migration only for threadgroup leaders.

Also add a paragraph in cgroup-v2.rst under cpuset.mems stating that the
threadgroup leader is the memory owner of that threadgroup. Therefore
the non-leading threads shouldn't be in other cgroups whose "cpuset.mems"
doesn't fully overlap that of the group leader.

[1] https://sashiko.dev/#/patchset/20260621032816.1806773-1-longman%40redhat.com

Signed-off-by: Waiman Long <longman@redhat.com>
---
 Documentation/admin-guide/cgroup-v2.rst | 7 +++++++
 kernel/cgroup/cpuset.c                  | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 993446ab66d0..341037c7ec9d 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2527,6 +2527,13 @@ Cpuset Interface Files
 	a need to change "cpuset.mems" with active tasks, it shouldn't
 	be done frequently.
 
+	For a multithreaded process, the threadgroup leader is
+	considered the owner of the group's memory. Memory policy
+	rebinding and migration will only happen with respect to the
+	threadgroup leader. To avoid unexpected results, non-leading
+	threads shouldn't be put into another cgroup whose "cpuset.mems"
+	doesn't fully overlap that of the threadgroup leader.
+
   cpuset.mems.effective
 	A read-only multiple values file which exists on all
 	cpuset-enabled cgroups.
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index bc0207fd6e57..27bc7a466468 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2659,6 +2659,10 @@ void cpuset_update_tasks_nodemask(struct cpuset *cs)
 
 		cpuset_change_task_nodemask(task, &newmems);
 
+		/* Rebind and migrate mm only for task group leader */
+		if (task != task->group_leader)
+			continue;
+
 		mm = get_task_mm(task);
 		if (!mm)
 			continue;
-- 
2.54.0


^ permalink raw reply related

* Re: [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test
From: Ackerley Tng @ 2026-06-22 23:01 UTC (permalink / raw)
  To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
In-Reply-To: <deb20fbe3584a8c6bfda276447fe464c6553737d.1780676742.git.tarunsahu@google.com>

Tarun Sahu <tarunsahu@google.com> writes:

> Add a new KVM selftest `guest_memfd_preservation_test` to verify that
> guest memory backed by guest_memfd is preserved properly.
>

Don't think using backticks in commit messages is a common practice but
I might be wrong here.

> The test leverages the Live Update Orchestrator (LUO) infrastructure
> to validate that memory folios and configuration layouts are
> successfully saved and then restored during kernel live updates,
> preventing any memory loss for the guest.
>
> Here, I have used the kvm selftests framework by creating a new
> vm and mapping two memory slots to it. One is the code that is executed
> inside the vm and other is the guest_memfd whose memory is being
> written by the guest code.
>

Don't think commit messages with "I" are common either

> In Phase 1: Once data is written the vm exits and wait for the user
> to trigger the kexec.
>
> In Phase 2: A new vm is created with retrieved kvm and again two
> memory slots are assigned. Once for guest code, and another is for
> retrieved guest_memfd where guest_memfd memory is verified by the
> executed guest code. If verification succeeds, The test passes.
>
>
> [...snip...]
>
> +#define SESSION_NAME "gmem_vm_preservation_session"
> +#define VM_TOKEN 0x1001
> +#define GMEM_TOKEN 0x1002
> +
> +#define GMEM_SIZE (16ULL * 1024 * 1024)
> +#define DATA_SIZE (5ULL * 1024 * 1024)
> +
> +static size_t page_size;
> +
> +/* Deterministic byte pattern generation based on offset */
> +static inline uint8_t get_pattern_byte(size_t offset)
> +{
> +	return (uint8_t)(offset ^ 0x5A);
> +}
> +
> +static void guest_code_phase1(uint64_t gpa, uint64_t size, uint64_t data_size)
> +{
> +	uint8_t *mem = (uint8_t *)gpa;
> +	size_t i;
> +
> +	for (i = 0; i < data_size; i++)
> +		mem[i] = get_pattern_byte(i);
> +
> +	GUEST_DONE();
> +}
> +
> +static void guest_code_phase2(uint64_t gpa, uint64_t size, uint64_t data_size)
> +{
> +	uint8_t *mem = (uint8_t *)gpa;
> +	size_t i;
> +
> +	for (i = 0; i < data_size; i++) {
> +		uint8_t val = get_pattern_byte(i);
> +
> +		__GUEST_ASSERT(mem[i] == val,
> +			       "Data mismatch at offset %lu! Expected 0x%x, got 0x%x",
> +			       i, val, mem[i]);
> +	}
> +
> +	GUEST_DONE();
> +}
> +
> +static void do_phase1(void)
> +{
> +	uint64_t flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;

Is there a reason to set GUEST_MEMFD_FLAG_MMAP? We're not really
accessing that memory from the host in this test.

> +	int gmem_fd, dev_luo_fd, session_fd, ret;
> +	const uint64_t gpa = SZ_4G;
> +	struct kvm_vcpu *vcpu;
> +	const int slot = 1;
> +	struct kvm_vm *vm;
> +
> +	vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1,
> +					guest_code_phase1);
> +	gmem_fd = vm_create_guest_memfd(vm, GMEM_SIZE, flags);
> +	vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
> +				 gmem_fd, 0);
> +
> +	for (size_t i = 0; i < GMEM_SIZE; i += page_size)
> +		virt_pg_map(vm, gpa + i, gpa + i);
> +
> +	vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);

If GMEM_SIZE and DATA_SIZE are static I think we don't have to set those
as vcpu_args_set(), they can be used as macros from within the guest.

> +
> +	vcpu_run(vcpu);
> +	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
> +
> +	dev_luo_fd = luo_open_device();
> +	TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
> +
> +	session_fd = luo_create_session(dev_luo_fd, SESSION_NAME);
> +	TEST_ASSERT(session_fd >= 0, "Failed to create LUO session");
> +
> +	ret = luo_session_preserve_fd(session_fd, vm->fd, VM_TOKEN);
> +	TEST_ASSERT(ret == 0, "Failed to preserve VM file descriptor");
> +
> +	ret = luo_session_preserve_fd(session_fd, gmem_fd, GMEM_TOKEN);
> +	TEST_ASSERT(ret == 0, "Failed to preserve guest_memfd file descriptor");
> +

Thanks for showing how this works :)

> +	printf("\n============================================================\n");
> +	printf("Phase 1 Complete Successfully!\n");
> +	printf("VM file and guest_memfd file have been preserved via LUO.\n");
> +	printf("Tokens: VM_TOKEN=0x%x, GMEM_TOKEN=0x%x\n", VM_TOKEN, GMEM_TOKEN);
> +	printf("Machine Size: %llu MB, Data Size: %llu MB\n", GMEM_SIZE / SZ_1M,
> +				 DATA_SIZE / SZ_1M);
> +	printf("------------------------------------------------------------\n");
> +
> +	daemonize_and_wait();
> +}
> +
> +static struct kvm_vm *vm_create_from_fd(int resurrected_vm_fd,
> +					struct vm_shape shape)
> +{
> +	struct kvm_vm *vm;
> +
> +	vm = calloc(1, sizeof(*vm));
> +	TEST_ASSERT(vm != NULL, "Insufficient Memory");
> +
> +	vm_init_fields(vm, shape);

What would happen if the shape was changed between preserving and
restoring?

> +
> +	vm->kvm_fd = open_path_or_exit(KVM_DEV_PATH, O_RDWR);
> +	vm->fd = resurrected_vm_fd;
> +
> +	if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
> +		vm->stats.fd = vm_get_stats_fd(vm);
> +	else
> +		vm->stats.fd = -1;
> +
> +	vm_init_memory_properties(vm);
> +
> +	return vm;
> +}
> +

I think vm_create_from_fd() could be introduced in an earlier patch to
reduce the amount of new code in this patch. Also, I think it could
perhaps be moved to kvm_util.c assuming that other tests will use it too.

> +static void do_phase2(void)
> +{
> +	int retrieved_vm_fd, retrieved_gmem_fd, dev_luo_fd, session_fd;
> +	struct vm_shape shape = VM_SHAPE_DEFAULT;
> +	const uint64_t gpa = SZ_4G;
> +	struct kvm_vcpu *vcpu;
> +	const int slot = 1;
> +	struct kvm_vm *vm;
> +
> +	dev_luo_fd = luo_open_device();
> +	TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
> +
> +	session_fd = luo_retrieve_session(dev_luo_fd, SESSION_NAME);
> +	TEST_ASSERT(session_fd >= 0, "Failed to retrieve LUO session");
> +
> +	retrieved_vm_fd = luo_session_retrieve_fd(session_fd, VM_TOKEN);
> +	TEST_ASSERT(retrieved_vm_fd >= 0, "Failed to retrieve VM file descriptor");
> +
> +	retrieved_gmem_fd = luo_session_retrieve_fd(session_fd, GMEM_TOKEN);
> +	TEST_ASSERT(retrieved_gmem_fd >= 0, "Failed to retrieve guest_memfd file descriptor");
> +
> +	vm = vm_create_from_fd(retrieved_vm_fd, shape);
> +
> +	u64 nr_pages = 2048; /* 8MB is plenty for slot0 pages */
> +

I don't think declarations are usually mixed with regular code.

> +	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
> +	kvm_vm_elf_load(vm, program_invocation_name);
> +
> +	for (int i = 0; i < NR_MEM_REGIONS; i++)
> +		vm->memslots[i] = 0;
> +
> +	struct userspace_mem_region *slot0 = memslot2region(vm, 0);
> +
> +	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
> +
> +	vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
> +				   retrieved_gmem_fd, 0);
> +
> +	for (size_t i = 0; i < GMEM_SIZE; i += page_size)
> +		virt_pg_map(vm, gpa + i, gpa + i);
> +
> +	vcpu = vm_vcpu_add(vm, 0, guest_code_phase2);
> +	kvm_arch_vm_finalize_vcpus(vm);
> +
> +	vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
> +
> +	printf("Resuming / Running VM in Phase 2...\n");
> +	vcpu_run(vcpu);
> +	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
> +
> +	printf("\nSUCCESS: Phase 2 Complete! All 5MB complex data verified intact!\n");
> +
> +	luo_session_finish(session_fd);
> +	close(session_fd);
> +	close(dev_luo_fd);
> +	/* This will also close the vm_fd */
> +	kvm_vm_free(vm);
> +	close(retrieved_gmem_fd);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	bool phase2 = false;
> +
> +	TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
> +	page_size = getpagesize();
> +
> +	for (int i = 1; i < argc; i++) {
> +		if (strcmp(argv[i], "--phase2") == 0)
> +			phase2 = true;
> +	}
> +

Maybe use getopt() here?

> +	if (phase2)
> +		do_phase2();
> +	else
> +		do_phase1();
> +
> +	return 0;
> +}
> --
> 2.54.0.1032.g2f8565e1d1-goog

I think we also need tests for trying to allocate while frozen, and
conversion while frozen, and trying to preserve while preservation is
not allowed.

^ permalink raw reply

* Re: [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation
From: Ackerley Tng @ 2026-06-22 23:27 UTC (permalink / raw)
  To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
In-Reply-To: <4b2216f5c459fe699a3f62464cbc765624e20ae6.1780676742.git.tarunsahu@google.com>

Tarun Sahu <tarunsahu@google.com> writes:

> This patch sets up the basic infrastructure to preserve the guest_memfd.
> Currently this supports only fully shared guest_memfd and backed by
> PAGE_SIZE pages.
>
> It registers a new LUO file handler for guest_memfd files to serialize
> and deserialize guest memory. This allows preserving guest memory backed
> by guest_memfd across updates, ensuring that guest instances can be
> resumed seamlessly without losing their memory contents.
>
> Preservation is straight forward. It walks through the folios and
> serialize them.
>
> There is kvm_gmem_freeze call on preserve which freeze the guest_memfd
> inode. It avoids any changes to inode mapping with fallocate calls or
> any new fault allocation (fails) on or after preservation. No need to check
> this during the page fault as preservation is only supported for
> pre-faulted/pre-allocated guest_memfd.
>
> While retrieving the guest_memfd, it requires the struct kvm to create
> new guest_memfd. So it first get the vm_file from the same session using
> the token passed during the preservation. And use it to get
> vm_file->kvm.
>
> This change also update the MAINTAINERS list.
>
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
> ---
>  MAINTAINERS                 |   1 +
>  include/linux/kho/abi/kvm.h |  79 +++++-
>  virt/kvm/Makefile.kvm       |   2 +-
>  virt/kvm/guest_memfd_luo.c  | 485 ++++++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c         |   7 +
>  virt/kvm/kvm_mm.h           |   4 +
>  6 files changed, 571 insertions(+), 7 deletions(-)
>  create mode 100644 virt/kvm/guest_memfd_luo.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 9bfc3c1f6676..16cba790a84d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -14418,6 +14418,7 @@ L:	kexec@lists.infradead.org
>  L:	kvm@vger.kernel.org
>  S:	Maintained
>  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
> +F:	virt/kvm/guest_memfd_luo.c
>  F:	virt/kvm/kvm_luo.c
>
>  KVM PARAVIRT (KVM/paravirt)
> diff --git a/include/linux/kho/abi/kvm.h b/include/linux/kho/abi/kvm.h
> index 718db68a541a..42074d76e04a 100644
> --- a/include/linux/kho/abi/kvm.h
> +++ b/include/linux/kho/abi/kvm.h
> @@ -9,20 +9,23 @@
>  #define _LINUX_KHO_ABI_KVM_H
>
>  #include <linux/types.h>
> +#include <linux/bits.h>
>  #include <linux/kho/abi/kexec_handover.h>
>
>  /**
> - * DOC: KVM Live Update ABI
> + * DOC: KVM and guest_memfd Live Update ABI
>   *
> - * KVM uses the ABI defined below for preserving its state
> + * KVM and guest_memfd use the ABI defined below for preserving their states
>   * across a kexec reboot using the LUO.
>   *
> - * The state is serialized into a packed structure `struct kvm_luo_ser`
> - * which is handed over to the next kernel via the KHO mechanism.
> + * The state is serialized into packed structures (struct kvm_luo_ser and
> + * struct guest_memfd_luo_ser) which are handed over to the next kernel via
> + * the KHO mechanism.
>   *
> - * This interface is a contract. Any modification to the structure layout
> + * This interface is a contract. Any modification to the structure layouts
>   * constitutes a breaking change. Such changes require incrementing the
> - * version number in the KVM_LUO_FH_COMPATIBLE compatibility string.
> + * version number in the KVM_LUO_FH_COMPATIBLE or
> + * GUEST_MEMFD_LUO_FH_COMPATIBLE compatibility strings.
>   */
>
>  /**
> @@ -36,4 +39,68 @@ struct kvm_luo_ser {
>  /* The compatibility string for KVM VM file handler */
>  #define KVM_LUO_FH_COMPATIBLE	"kvm_vm_luo_v1"
>
> +/**
> + * struct guest_memfd_luo_folio_ser - Serialization layout for a single folio in guest_memfd.
> + * @pfn:   Page Frame Number of the folio.
> + * @index: Page offset of the folio within the file.
> + * @flags: State flags associated with the folio.
> + */
> +struct guest_memfd_luo_folio_ser {
> +	u64 pfn:52;
> +	u64 flags:12;
> +	u64 index;
> +} __packed;
> +
> +/**
> + * GUEST_MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date.
> + *
> + * This flag is per folio to check if the folio is uptodate.
> + */
> +#define GUEST_MEMFD_LUO_FOLIO_UPTODATE	BIT(0)
> +
> +
> +/**
> + * GUEST_MEMFD_LUO_FLAG_MMAP - The guest_memfd supports mmap.
> + *
> + * This flag indicates that the guest_memfd supports host-side mmap.
> + */
> +#define GUEST_MEMFD_LUO_FLAG_MMAP		BIT(0)
> +
> +/**
> + * GUEST_MEMFD_LUO_FLAG_INIT_SHARED - Initialize memory as shared.
> + *
> + * This flag indicates that the guest_memfd has been initialized as shared
> + * memory.
> + */
> +#define GUEST_MEMFD_LUO_FLAG_INIT_SHARED	BIT(1)
> +
> +/**
> + * GUEST_MEMFD_LUO_SUPPORTED_FLAGS - Supported guest_memfd LUO flags mask.
> + *
> + * A mask of all guest_memfd preservation flags supported by this version
> + * of the KVM LUO ABI.
> + */
> +#define GUEST_MEMFD_LUO_SUPPORTED_FLAGS	(GUEST_MEMFD_LUO_FLAG_MMAP | \
> +						 GUEST_MEMFD_LUO_FLAG_INIT_SHARED)
> +
> +/**
> + * struct guest_memfd_luo_ser - Main serialization structure for guest_memfd.
> + * @size:      The size of the file in bytes.
> + * @flags:     File-level flags.
> + * @nr_folios: Number of folios in the folios array.
> + * @vm_token:  Token of the associated KVM VM instance.
> + * @folios:    KHO vmalloc descriptor pointing to the array of
> + *             struct guest_memfd_luo_folio_ser.
> + */
> +struct guest_memfd_luo_ser {
> +	u64 size;
> +	u64 flags;
> +	u64 nr_folios;
> +	u64 vm_token;
> +	struct kho_vmalloc folios;
> +} __packed;
> +
> +/* The compatibility string for GUEST_MEMFD file handler */
> +#define GUEST_MEMFD_LUO_FH_COMPATIBLE	"guest_memfd_luo_v1"
> +
>  #endif /* _LINUX_KHO_ABI_KVM_H */
> diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> index c1a962159264..d30fca094c42 100644
> --- a/virt/kvm/Makefile.kvm
> +++ b/virt/kvm/Makefile.kvm
> @@ -13,4 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
>  kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
>  kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
>  kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
> -kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/kvm_luo.o
> +kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/guest_memfd_luo.o $(KVM)/kvm_luo.o
> diff --git a/virt/kvm/guest_memfd_luo.c b/virt/kvm/guest_memfd_luo.c
> new file mode 100644
> index 000000000000..d466f889c9aa
> --- /dev/null
> +++ b/virt/kvm/guest_memfd_luo.c
> @@ -0,0 +1,485 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright (c) 2026, Google LLC.
> + * Tarun Sahu <tarunsahu@google.com>
> + *
> + * Guestmemfd Preservation for Live Update Orchestrator (LUO)
> + */
> +
> +/**
> + * DOC: Guestmemfd Preservation via LUO
> + *
> + * Overview
> + * ========
> + *
> + * Guest memory file descriptors (guest_memfd) can be preserved over a kexec
> + * reboot using the Live Update Orchestrator (LUO) file preservation. This
> + * allows userspace to preserve VM memory across kexec reboots.
> + *
> + * The preservation is not intended to be transparent. Only select properties
> + * of the guest_memfd are preserved, while others are reset to default.
> + *
> + * Preserved Properties
> + * ====================
> + *
> + * The following properties of guest_memfd are preserved across kexec:
> + *
> + * File Size
> + *   The size of the file is preserved.
> + *
> + * File Contents
> + *   All folios present in the page cache are preserved.
> + *
> + * File-level Flags
> + *   The file-level flags (such as MMAP support and INIT_SHARED default mapping)
> + *   are preserved.
> + *
> + * Non-Preserved Properties
> + * ========================
> + *
> + * NUMA Memory Policy
> + *   NUMA memory policies associated with the guest_memfd are not preserved.
> + */
> +#include <linux/liveupdate.h>
> +#include <linux/kvm_host.h>
> +#include <linux/pagemap.h>
> +#include <linux/file.h>
> +#include <linux/err.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/magic.h>
> +#include <linux/kexec_handover.h>
> +#include <linux/kho/abi/kexec_handover.h>
> +#include <linux/kho/abi/kvm.h>
> +#include "guest_memfd.h"
> +
> +static int kvm_gmem_luo_walk_folios(struct address_space *mapping,
> +		pgoff_t end_index, struct guest_memfd_luo_folio_ser *folios_ser,
> +		u64 *out_count)
> +{
> +	struct folio_batch fbatch;
> +	pgoff_t index = 0;
> +	u64 count = 0;
> +	int err = 0;
> +
> +	folio_batch_init(&fbatch);
> +	while (index < end_index) {
> +		unsigned int nr, i;
> +
> +		nr = filemap_get_folios(mapping, &index, end_index - 1, &fbatch);
> +		if (nr == 0)
> +			break;
> +
> +		for (i = 0; i < nr; i++) {
> +			struct folio *folio = fbatch.folios[i];
> +
> +			if (folios_ser) {
> +				if (folio_test_hwpoison(folio)) {
> +					err = -EHWPOISON;
> +					folio_batch_release(&fbatch);
> +					goto out;
> +				}
> +				err = kho_preserve_folio(folio);
> +				if (err) {
> +					folio_batch_release(&fbatch);
> +					goto out;
> +				}
> +
> +				folios_ser[count].pfn = folio_pfn(folio);
> +				folios_ser[count].index = folio->index;
> +				folios_ser[count].flags = folio_test_uptodate(folio) ?
> +							  GUEST_MEMFD_LUO_FOLIO_UPTODATE : 0;
> +			}
> +			count++;
> +		}
> +		folio_batch_release(&fbatch);
> +		cond_resched();
> +	}
> +
> +out:
> +	*out_count = count;
> +	return err;
> +}
> +
> +static bool kvm_gmem_luo_can_preserve(struct liveupdate_file_handler *handler, struct file *file)
> +{
> +	struct inode *inode = file_inode(file);
> +	struct gmem_file *gmem_file = file->private_data;
> +	struct kvm *kvm = gmem_file->kvm;
> +
> +	if (inode->i_sb->s_magic != GUEST_MEMFD_MAGIC)
> +		return 0;
> +

How does .can_preserve decide to route to this function? If it already
routes here, wouldn't this inode definitely be a guest_memfd file?

> +	if (kvm_arch_has_private_mem(kvm))
> +		return 0;
> +
> +	if (mapping_large_folio_support(inode->i_mapping))
> +		return 0;
> +
> +	return 1;

Let's return true and false rather than relying on casting.

> +}
> +
> +static int kvm_gmem_luo_preserve(struct liveupdate_file_op_args *args)
> +{
> +	struct guest_memfd_luo_folio_ser *folios_ser = NULL;
> +	u64 count = 0, gmem_flags, abi_flags = 0;
> +	struct guest_memfd_luo_ser *ser;
> +	struct address_space *mapping;
> +	struct gmem_file *gmem_file;
> +	struct inode *inode;
> +	pgoff_t end_index;
> +	struct kvm *kvm;
> +	int err = 0;
> +	long size;
> +
> +	inode = file_inode(args->file);

I think to lock out all allocates, you'd have to take
filemap_invalidate_lock() before freezing.

> +	kvm_gmem_freeze(inode, true);
> +
> +	mapping = inode->i_mapping;
> +	size = i_size_read(inode);
> +	if (!size) {
> +		err = -EINVAL;
> +		goto err_unfreeze_inode;
> +	}
> +
> +	if (WARN_ON_ONCE(!PAGE_ALIGNED(size))) {
> +		err = -EINVAL;
> +		goto err_unfreeze_inode;
> +	}
> +
> +	gmem_file = args->file->private_data;
> +	kvm = gmem_file->kvm;
> +
> +	gmem_flags = READ_ONCE(GMEM_I(inode)->flags);
> +	if (gmem_flags & ~(GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED

Why condition this on MMAP?

After conversions lands, we'd have to iterate to check that the entire
guest_memfd is shared offset-by-offset instead of checking for INIT_SHARED.

> +				| GUEST_MEMFD_F_MAPPING_FROZEN)) {

This would always be true since kvm_gmem_freeze() is done above.

> +		err = -EOPNOTSUPP;
> +		goto err_unfreeze_inode;
> +	}
> +
> +	if (gmem_flags & GUEST_MEMFD_FLAG_MMAP)
> +		abi_flags |= GUEST_MEMFD_LUO_FLAG_MMAP;
> +	if (gmem_flags & GUEST_MEMFD_FLAG_INIT_SHARED)
> +		abi_flags |= GUEST_MEMFD_LUO_FLAG_INIT_SHARED;
> +

Is it intentional to have a different set of flags that are actually
preserved? I think we should refactor out a function to transfer the
flags over.

> +	end_index = size >> PAGE_SHIFT;
> +
> +	ser = kho_alloc_preserve(sizeof(*ser));
> +	if (IS_ERR(ser)) {
> +		err = PTR_ERR(ser);
> +		goto err_unfreeze_inode;
> +	}
> +
> +	/* First pass: Count the folios present in the page cache */
> +	err = kvm_gmem_luo_walk_folios(mapping, end_index, NULL, &count);
> +	if (err)
> +		goto err_free_ser;
> +
> +	ser->size = size;
> +	ser->flags = abi_flags;
> +	ser->nr_folios = count;
> +	ser->vm_token = 0; // It will be set during the kvm_gmem_luo_freeze()

I don't think // is commonly used.

> +
> +	if (count > 0) {
> +		folios_ser = vcalloc(count, sizeof(*folios_ser));
> +		if (!folios_ser) {
> +			err = -ENOMEM;
> +			goto err_free_ser;
> +		}
> +
> +		/* Second pass: Fill the metadata array and preserve folios */
> +		err = kvm_gmem_luo_walk_folios(mapping, end_index, folios_ser, &count);

I think it's clearer to just define 2 functions rather than using the
same function twice to do these different things. The comments on the
two passes can then be dropped.

> +		if (err)
> +			goto err_unpreserve_unlocked;
> +
> +		if (WARN_ON_ONCE(count != ser->nr_folios)) {
> +			err = -EINVAL;
> +			goto err_unpreserve_unlocked;
> +		}
> +	}
> +
> +	if (count > 0) {
> +		err = kho_preserve_vmalloc(folios_ser, &ser->folios);
> +		if (err)
> +			goto err_unpreserve_unlocked;
> +	}
> +
> +	args->serialized_data = virt_to_phys(ser);
> +	args->private_data = folios_ser;
> +
> +	return 0;
> +
> +err_unpreserve_unlocked:
> +	for (long i = (long)count - 1; i >= 0; i--) {

Not sure if it's common to define long i inline.

> +		struct folio *folio = pfn_folio(folios_ser[i].pfn);
> +
> +		kho_unpreserve_folio(folio);
> +	}
> +	vfree(folios_ser);
> +err_free_ser:
> +	kho_unpreserve_free(ser);
> +err_unfreeze_inode:
> +	kvm_gmem_freeze(inode, false);
> +	return err;
> +}
> +
> +static int kvm_gmem_luo_freeze(struct liveupdate_file_op_args *args)
> +{
> +	struct guest_memfd_luo_ser *ser;
> +	struct gmem_file *gmem_file;
> +	struct kvm *kvm;
> +	struct file *kvm_file;
> +	u64 vm_token;
> +	int err;
> +
> +	if (WARN_ON_ONCE(!args->serialized_data))
> +		return -EINVAL;
> +
> +	ser = phys_to_virt(args->serialized_data);
> +
> +	gmem_file = args->file->private_data;
> +	kvm = gmem_file->kvm;
> +
> +	/*
> +	 * Obtain a strong reference to kvm->vm_file to prevent the SLAB_TYPESAFE_BY_RCU
> +	 * file memory from being reallocated while it is being processed.
> +	 */
> +	kvm_file = get_file_active(&kvm->vm_file);
> +	if (!kvm_file)
> +		return -ENOENT;
> +
> +	err = liveupdate_get_token_outgoing(args->session, kvm_file, &vm_token);
> +	fput(kvm_file);
> +	if (err)
> +		return err;
> +
> +	ser->vm_token = vm_token;
> +	return 0;
> +}
> +
> +static void kvm_gmem_luo_discard_folios(
> +	const struct guest_memfd_luo_folio_ser *folios_ser,
> +	u64 nr_folios, u64 start_idx)
> +{
> +	long i;
> +
> +	for (i = start_idx; i < nr_folios; i++) {
> +		struct folio *folio;
> +		phys_addr_t phys;
> +
> +		if (!folios_ser[i].pfn)
> +			continue;
> +
> +		phys = PFN_PHYS(folios_ser[i].pfn);
> +		folio = kho_restore_folio(phys);
> +		if (folio)
> +			folio_put(folio);
> +	}
> +}
> +
> +static void kvm_gmem_luo_unpreserve(struct liveupdate_file_op_args *args)
> +{
> +	struct guest_memfd_luo_folio_ser *folios_ser = args->private_data;
> +	struct guest_memfd_luo_ser *ser;
> +	long i;
> +
> +	if (WARN_ON_ONCE(!args->serialized_data))
> +		return;
> +
> +	ser = phys_to_virt(args->serialized_data);
> +	if (!ser)
> +		return;
> +
> +	if (ser->nr_folios > 0)
> +		kho_unpreserve_vmalloc(&ser->folios);
> +	for (i = ser->nr_folios - 1; i >= 0; i--) {
> +		struct folio *folio;
> +
> +		if (!folios_ser[i].pfn)

Is it possible for pfn to be 0 here? Perhaps this should be a
WARN_ON_ONCE().

> +			continue;
> +
> +		folio = pfn_folio(folios_ser[i].pfn);
> +		kho_unpreserve_folio(folio);
> +	}
> +	vfree(folios_ser);
> +
> +	kho_unpreserve_free(ser);
> +	kvm_gmem_freeze(file_inode(args->file), false);
> +}
> +
>
> [...snip...]
>

^ permalink raw reply

* Re: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability
From: Yosry Ahmed @ 2026-06-22 23:33 UTC (permalink / raw)
  To: Hao Jia
  Cc: akpm, tj, hannes, shakeel.butt, mhocko, mkoutny, nphamcs,
	chengming.zhou, muchun.song, roman.gushchin, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <20260618044857.69439-2-jiahao.kernel@gmail.com>

On Thu, Jun 18, 2026 at 12:48:53PM +0800, Hao Jia wrote:
> From: Hao Jia <jiahao1@lixiang.com>
> 
> Currently, shrink_memcg() writes back at most one entry per-node
> during its traversal. This makes shrink_worker() inefficient, as
> it must repeatedly re-enter shrink_memcg() to make any substantial
> progress.
> 
> To address this, extend shrink_memcg() and rewrite its LRU iteration
> logic to support batch writeback. Introduce the nr_to_writeback
> parameter to support a writeback budget based on compressed size.
> This enables batch writeback in the shrink_worker() path, while
> maintaining a low writeback budget in the zswap_store() path.
> 
> Additionally, to prepare for future proactive writeback, update
> the return value semantics of shrink_memcg(): a positive value now
> represents the actual number of compressed bytes written back, 0
> indicates that candidates existed but no writeback succeeded, and
> a negative value represents an error code.
> 
> Suggested-by: Yosry Ahmed <yosry@kernel.org>
> Signed-off-by: Hao Jia <jiahao1@lixiang.com>
> ---
>  mm/zswap.c | 116 ++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 97 insertions(+), 19 deletions(-)
> 
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 761cd699e0a3..d7d031dee4cd 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -160,6 +160,11 @@ struct zswap_pool {
>  	char tfm_name[CRYPTO_MAX_ALG_NAME];
>  };
>  
> +struct zswap_shrink_walk_arg {
> +	unsigned long bytes_written;
> +	bool encountered_page_in_swapcache;
> +};
> +
>  /* Global LRU lists shared by all zswap pools. */
>  static struct list_lru zswap_list_lru;
>  
> @@ -1089,8 +1094,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>  				       void *arg)
>  {
>  	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
> -	bool *encountered_page_in_swapcache = (bool *)arg;
> +	struct zswap_shrink_walk_arg *walk_arg = arg;
>  	swp_entry_t swpentry;
> +	unsigned int length;
>  	enum lru_status ret = LRU_REMOVED_RETRY;
>  	int writeback_result;
>  
> @@ -1135,8 +1141,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>  	 * Once the lru lock is dropped, the entry might get freed. The
>  	 * swpentry is copied to the stack, and entry isn't deref'd again
>  	 * until the entry is verified to still be alive in the tree.
> +	 *
> +	 * entry->length is also copied while the lock is held, because
> +	 * zswap_writeback_entry() frees the entry on success and we still
> +	 * need its compressed size to account for writeback.

Hmm that's unnecessary, just update "The swpentry is copied to the
stack.." above to "Copy needed fields to the stack.." or something.

>  	 */
>  	swpentry = entry->swpentry;
> +	length = entry->length;
>  
>  	/*
>  	 * It's safe to drop the lock here because we return either
> @@ -1155,12 +1166,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>  		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
>  		 * shrinker context).
>  		 */
> -		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
> +		if (writeback_result == -EEXIST) {
>  			ret = LRU_STOP;
> -			*encountered_page_in_swapcache = true;
> +			walk_arg->encountered_page_in_swapcache = true;
>  		}
>  	} else {
>  		zswap_written_back_pages++;
> +		walk_arg->bytes_written += length;
>  	}
>  
>  	return ret;
> @@ -1169,8 +1181,11 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>  static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>  		struct shrink_control *sc)
>  {
> +	struct zswap_shrink_walk_arg walk_arg = {
> +		.bytes_written = 0,
> +		.encountered_page_in_swapcache = false,
> +	};
>  	unsigned long shrink_ret;
> -	bool encountered_page_in_swapcache = false;
>  
>  	if (!zswap_shrinker_enabled ||
>  			!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
> @@ -1179,9 +1194,9 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>  	}
>  
>  	shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
> -		&encountered_page_in_swapcache);
> +		&walk_arg);
>  
> -	if (encountered_page_in_swapcache)
> +	if (walk_arg.encountered_page_in_swapcache)
>  		return SHRINK_STOP;
>  
>  	return shrink_ret ? shrink_ret : SHRINK_STOP;
> @@ -1275,10 +1290,32 @@ static struct shrinker *zswap_alloc_shrinker(void)
>  	return shrinker;
>  }
>  
> -static int shrink_memcg(struct mem_cgroup *memcg)
> -{
> -	int nid, shrunk = 0, scanned = 0;
> +/*
> + * The maximum acceptable scan cost factor for writing back
> + * PAGE_SIZE bytes of compressed data.
> + */
> +#define ZSWAP_WB_SCAN_FACTOR	16UL
> +#define NR_ZSWAP_WB_BATCH	64UL
>  
> +/*
> + * Iterate over the per-node zswap LRUs of @memcg in batches, writing back
> + * up to @nr_to_writeback * PAGE_SIZE bytes of compressed data.
> + *
> + * Return: The number of bytes written back, or -ENOENT if @memcg has
> + * writeback disabled, is a zombie cgroup, or has empty zswap LRUs.
> + */
> +static long shrink_memcg(struct mem_cgroup *memcg,
> +			 unsigned long nr_to_writeback)


Is nr_to_writeback supposed to be the number of pages we want to
writeback (regardless of their compressed size), or the compressed bytes
we want to writeback divided by PAGE_SIZE?

The way it's being used below seems like it's the latter, but the batch
size should be in terms of scanned pages (i.e. uncompressed pages). So
this is confusing.

The zswap_store() path expects to reclaim one uncompressed page, but
this will reclaim PAGE_SIZE worth of compressed memory when passing 1
IIUC (actually maybe more, see below).

> +{
> +	struct zswap_shrink_walk_arg walk_arg = {
> +		.bytes_written = 0,
> +		.encountered_page_in_swapcache = false,
> +	};
> +	u64 bytes_to_writeback = nr_to_writeback << PAGE_SHIFT;
> +	bool memcg_list_is_empty = true;
> +	int nid;
> +
> +	/* Memcg with zswap writeback disabled are not candidates. */

The comment is unnecessary here, it should be obvious.

>  	if (!mem_cgroup_zswap_writeback_enabled(memcg))
>  		return -ENOENT;
>  
> @@ -1290,24 +1327,65 @@ static int shrink_memcg(struct mem_cgroup *memcg)
>  		return -ENOENT;
>  
>  	for_each_node_state(nid, N_NORMAL_MEMORY) {
> -		unsigned long nr_to_walk = 1;
> +		unsigned long nr_to_scan, nr_scanned = 0;
> +		unsigned long remain;
> +		walk_arg.encountered_page_in_swapcache = false;
> +		/*
> +		 * Cap by LRU length: bounds rewalks when referenced
> +		 * entries keep rotating to the tail.
> +		 */
> +		nr_to_scan = list_lru_count_one(&zswap_list_lru, nid, memcg);
> +		if (!nr_to_scan)
> +			continue;

Hmm generally if we are running out of pages to scan then we should scan
the rotated entries, and reclaim them on the second pass, right? So this
should be working as intended. But I guess this doesn't work well when
iterating multiple memcgs, as we don't want to drain referenced entries
in one memcg before reclaiming already rotated entries on another.

So I think the assumption here is that the caller will retry if needed,
handling balancing scanning between multiple memcgs if needed. Maybe we
should document this in the function doc above? We should explain that
referenced entries will be rotated but not reclaimed as part of the same
call.

> +		memcg_list_is_empty = false;
> +
> +		/*
> +		 * Cap by SCAN_FACTOR * remain budget: bounds scan cost
> +		 * to the remaining writeback budget.
> +		 */
> +		remain = DIV_ROUND_UP(bytes_to_writeback - walk_arg.bytes_written, PAGE_SIZE);
> +		nr_to_scan = min(nr_to_scan,
> +				 remain * ZSWAP_WB_SCAN_FACTOR);

For the zswap_store() path bytes_to_writeback=PAGE_SIZE, so remain will
initially be 1. But then we multiply by this factor and now we scan 16
pages? Also, where did this factor and equation come from?

We'll also loop over nodes, so we may end up scanning 32 or more pages
depending on the number of nodes in the system.

If this is just a heuristic, we should really just start simple and add
heuristics later as needed. The caller should probably pass in the
number of pages to scan (i.e. uncompressed pages), and leave it to the
caller to decide when to retry if the actual memory savings are
realized.

>  
> -		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
> -					    &shrink_memcg_cb, NULL, &nr_to_walk);
> -		scanned += 1 - nr_to_walk;
> +		while (nr_scanned < nr_to_scan) {
> +			unsigned long nr_to_walk = min(NR_ZSWAP_WB_BATCH,
> +						       nr_to_scan - nr_scanned);
> +
> +			/*
> +			 * Account for the committed budget rather than the walker's
> +			 * actual delta. If the list is emptied concurrently, the
> +			 * walker visits nothing and nr_scanned would never advance.
> +			 */
> +			nr_scanned += nr_to_walk;
> +
> +			list_lru_walk_one(&zswap_list_lru, nid, memcg,
> +					  &shrink_memcg_cb,
> +					  &walk_arg,
> +					  &nr_to_walk);
> +
> +			if (walk_arg.bytes_written >= bytes_to_writeback)
> +				return walk_arg.bytes_written;
> +
> +			if (walk_arg.encountered_page_in_swapcache)
> +				break;
> +
> +			cond_resched();
> +		}

If the caller is expected to have a retry loop anyway, should we
simplify this and just scan each per-node LRU once?

We should also probably bail early if the number of scanned pages has
already been reached? Currently shrink_memcg() scans one page at a time,
so if it scans a bit more to balance between the nodes it's probably
fine.

But with batching, we could end up scanning hundreds of extra pages just
to balance between all nodes. Is node imbalance a real issue?

>  	}
>  
> -	if (!scanned)
> +	/* Return -ENOENT if all zswap LRU lists are empty. */
> +	if (memcg_list_is_empty)
>  		return -ENOENT;
>  
> -	return shrunk ? 0 : -EAGAIN;
> +	return walk_arg.bytes_written;
>  }
>  
>  static void shrink_worker(struct work_struct *w)
>  {
>  	struct mem_cgroup *memcg;
> -	int ret, failures = 0, attempts = 0;
> +	int failures = 0, attempts = 0;
>  	unsigned long thr;
> +	long ret;
>  
>  	/* Reclaim down to the accept threshold */
>  	thr = zswap_accept_thr_pages();
> @@ -1368,7 +1446,7 @@ static void shrink_worker(struct work_struct *w)
>  			goto resched;
>  		}
>  
> -		ret = shrink_memcg(memcg);
> +		ret = shrink_memcg(memcg, NR_ZSWAP_WB_BATCH);
>  		/* drop the extra reference */
>  		mem_cgroup_put(memcg);
>  
> @@ -1382,7 +1460,7 @@ static void shrink_worker(struct work_struct *w)
>  			continue;
>  		++attempts;
>  
> -		if (ret && ++failures == MAX_RECLAIM_RETRIES)
> +		if (ret <= 0 && ++failures == MAX_RECLAIM_RETRIES)
>  			break;
>  resched:
>  		cond_resched();
> @@ -1492,7 +1570,7 @@ bool zswap_store(struct folio *folio)
>  	objcg = get_obj_cgroup_from_folio(folio);
>  	if (objcg && !obj_cgroup_may_zswap(objcg)) {
>  		memcg = get_mem_cgroup_from_objcg(objcg);
> -		if (shrink_memcg(memcg)) {
> +		if (shrink_memcg(memcg, 1) <= 0) {
>  			mem_cgroup_put(memcg);
>  			goto put_objcg;
>  		}
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [PATCH v4 2/5] mm/zswap: Factor writeback loop out of shrink_worker()
From: Yosry Ahmed @ 2026-06-22 23:36 UTC (permalink / raw)
  To: Hao Jia
  Cc: akpm, tj, hannes, shakeel.butt, mhocko, mkoutny, nphamcs,
	chengming.zhou, muchun.song, roman.gushchin, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <20260618044857.69439-3-jiahao.kernel@gmail.com>

> +/*
> + * Walk the memcg tree and write back zswap pages until the
> + * (lower_pages, upper_pages) window closes, or abort encounter
> + * MAX_RECLAIM_RETRIES times of the following conditions:
> + * - No writeback-candidate memcgs found in a memcg tree walk.
> + * - Shrinking a writeback-candidate memcg failed.
> + *
> + * For shrink_worker(), it passes lower=thr and upper=zswap_total_pages().
> + * The @upper limit is refreshed in each iteration by re-evaluating
> + * zswap_total_pages(), and the window closes once the total falls
> + * below the threshold.

This is the wrong abstraction level, and it's obvious by the fact that
the function calls zswap_total_pages() again to recalculate
'upper_pages'. It gets much worse in the next patch as well.

The lower_pages and upper_pages thing is also unnecessarily hard to
follow.

The core of the reuse here is the retry logic. So maybe keep the memcg
iteration in the callers, and define a function that takes in one memcg
and reclaims one batch from it? failures and attempts can be passed into
the function to maintain the state across scans of different memcgs,
like zswap_shrink_walk_arg?

WDYT?

> + */
> +static void zswap_try_to_writeback(unsigned long lower_pages,
> +				   unsigned long upper_pages)
> +{
> +	int failures = 0, attempts = 0;
> +	struct mem_cgroup *iter_memcg;
> +
> +	while (lower_pages < upper_pages) {
> +		unsigned long batch_size;
> +		long shrunk;
>  
> -		if (!memcg) {
> +		cond_resched();
> +
> +		iter_memcg = zswap_iter_global();
> +		if (!iter_memcg) {
>  			/*
>  			 * Continue shrinking without incrementing failures if
>  			 * we found candidate memcgs in the last tree walk.
> @@ -1443,12 +1457,16 @@ static void shrink_worker(struct work_struct *w)
>  				break;
>  
>  			attempts = 0;
> -			goto resched;
> +			continue;
>  		}
>  
> -		ret = shrink_memcg(memcg, NR_ZSWAP_WB_BATCH);
> +		batch_size = min(upper_pages - lower_pages, NR_ZSWAP_WB_BATCH);
> +		shrunk = shrink_memcg(iter_memcg, batch_size);
>  		/* drop the extra reference */
> -		mem_cgroup_put(memcg);
> +		mem_cgroup_put(iter_memcg);
> +
> +		/* zswap total pages might have changed, refresh it. */
> +		upper_pages = zswap_total_pages();
>  
>  		/*
>  		 * There are no writeback-candidate pages in the memcg.
> @@ -1456,15 +1474,23 @@ static void shrink_worker(struct work_struct *w)
>  		 * with pages in zswap. Skip this without incrementing attempts
>  		 * and failures.
>  		 */
> -		if (ret == -ENOENT)
> +		if (shrunk == -ENOENT)
>  			continue;
>  		++attempts;
>  
> -		if (ret <= 0 && ++failures == MAX_RECLAIM_RETRIES)
> +		if (shrunk <= 0 && ++failures == MAX_RECLAIM_RETRIES)
>  			break;
> -resched:
> -		cond_resched();
> -	} while (zswap_total_pages() > thr);
> +	}
> +}
> +
> +static void shrink_worker(struct work_struct *w)
> +{
> +	unsigned long thr;
> +
> +	/* Reclaim down to the accept threshold */
> +	thr = zswap_accept_thr_pages();
> +
> +	zswap_try_to_writeback(thr, zswap_total_pages());
>  }
>  
>  /*********************************
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [PATCH v4 3/5] mm/zswap: Implement proactive writeback
From: Yosry Ahmed @ 2026-06-22 23:40 UTC (permalink / raw)
  To: Hao Jia
  Cc: akpm, tj, hannes, shakeel.butt, mhocko, mkoutny, nphamcs,
	chengming.zhou, muchun.song, roman.gushchin, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <20260618044857.69439-4-jiahao.kernel@gmail.com>

On Thu, Jun 18, 2026 at 12:48:55PM +0800, Hao Jia wrote:
> From: Hao Jia <jiahao1@lixiang.com>
> 
> Zswap currently writes back pages to backing swap reactively, triggered
> either by the shrinker or when the pool reaches its size limit. There is
> no mechanism to control the amount of writeback for a specific memory
> cgroup. However, users may want to proactively write back zswap pages,
> e.g., to free up memory for other applications or to prepare for
> memory-intensive workloads.
> 
> Introduce a "zswap_writeback_only" key to the memory.reclaim cgroup
> interface. When specified, this key bypasses standard memory reclaim
> and exclusively performs proactive zswap writeback up to the requested
> budget. If omitted, the default reclaim behavior remains unchanged.
> 
> Example usage:
>   # Write back 10MB of compressed data from zswap to the backing swap
>   echo "10M zswap_writeback_only" > memory.reclaim
> 
> Note that the actual amount of compressed data written back may be less
> than requested due to the zswap second-chance algorithm: referenced
> entries are rotated on the LRU on the first encounter and only written
> back on a second pass. If fewer bytes are written back than requested,
> -EAGAIN is returned, matching the existing memory.reclaim semantics.
> 
> Internally, extend user_proactive_reclaim() to parse the new
> "zswap_writeback_only" token and invoke the dedicated handler
> zswap_proactive_writeback(). This handler reuses
> zswap_try_to_writeback() to walk the target memcg subtree, draining
> per-node zswap LRUs through list_lru_walk_one() with the
> shrink_memcg_cb() callback.

I won't comment on the memcg interface as this is more-or-less a
placeholder until an interface is finalized.

> 
> Suggested-by: Yosry Ahmed <yosry@kernel.org>
> Suggested-by: Nhat Pham <nphamcs@gmail.com>
> Signed-off-by: Hao Jia <jiahao1@lixiang.com>
[..]
> diff --git a/mm/zswap.c b/mm/zswap.c
> index e29f8a61412d..28200552dde3 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1423,6 +1423,27 @@ static struct mem_cgroup *zswap_iter_global(void)
>  	return memcg;
>  }
>  
> +/*
> + * Local iteration uses a local cursor to select from online memcgs
> + * under @root in a round-robin fashion.
> + *
> + * Pass the previous return value as @prev to advance the round-robin
> + * iteration, or pass NULL to start a new walk. If exiting early before
> + * the iteration completes, the caller must call mem_cgroup_iter_break()
> + * to release the cursor reference.
> + */
> +static struct mem_cgroup *zswap_iter_local(struct mem_cgroup *root,
> +					   struct mem_cgroup *prev)
> +{
> +	struct mem_cgroup *memcg;
> +
> +	do {
> +		memcg = mem_cgroup_iter(root, prev, NULL);
> +		prev = memcg;
> +	} while (memcg && !mem_cgroup_tryget_online(memcg));
> +	return memcg;
> +}
> +
>  /*
>   * Walk the memcg tree and write back zswap pages until the
>   * (lower_pages, upper_pages) window closes, or abort encounter
> @@ -1430,16 +1451,23 @@ static struct mem_cgroup *zswap_iter_global(void)
>   * - No writeback-candidate memcgs found in a memcg tree walk.
>   * - Shrinking a writeback-candidate memcg failed.
>   *
> - * For shrink_worker(), it passes lower=thr and upper=zswap_total_pages().
> - * The @upper limit is refreshed in each iteration by re-evaluating
> - * zswap_total_pages(), and the window closes once the total falls
> - * below the threshold.
> + * For shrink_worker() (proactive=false), it passes lower=thr and
> + * upper=zswap_total_pages(). The @upper limit is refreshed in each
> + * iteration by re-evaluating zswap_total_pages(), and the window
> + * closes once the total falls below the threshold.
> + *
> + * For zswap_proactive_writeback() (proactive=true), it passes lower=0
> + * and upper=nr_to_writeback. The @lower limit is advanced by the
> + * compressed bytes written back via shrink_memcg(). The window closes
> + * once @nr_to_writeback pages of compressed data have been written back.
>   */
> -static void zswap_try_to_writeback(unsigned long lower_pages,
> -				   unsigned long upper_pages)
> +static int zswap_try_to_writeback(struct mem_cgroup *memcg,
> +				  unsigned long lower_pages,
> +				  unsigned long upper_pages, bool proactive)

As I mentioned in the previous patch, this is the wrong abstraction. The
function is extremely tightly coupled to the callers, and needing to
pass in things like proactive makes it even worse.

It should be limited to reclaiming one batch of pages from a memcg, and
the retry logic. Everything else (memcg iteration logic, scan goal
checks) should be in the caller.

[..]  
>  static void shrink_worker(struct work_struct *w)
> @@ -1490,7 +1536,7 @@ static void shrink_worker(struct work_struct *w)
>  	/* Reclaim down to the accept threshold */
>  	thr = zswap_accept_thr_pages();
>  
> -	zswap_try_to_writeback(thr, zswap_total_pages());
> +	zswap_try_to_writeback(NULL, thr, zswap_total_pages(), false);
>  }
>  
>  /*********************************
> @@ -1736,6 +1782,19 @@ int zswap_load(struct folio *folio)
>  	return 0;
>  }
>  
> +int zswap_proactive_writeback(struct mem_cgroup *memcg,
> +			      unsigned long nr_to_writeback)
> +{
> +	if (!memcg)
> +		return -EINVAL;
> +	if (!mem_cgroup_zswap_writeback_enabled(memcg))
> +		return -EINVAL;
> +	if (!nr_to_writeback)
> +		return 0;
> +
> +	return zswap_try_to_writeback(memcg, 0, nr_to_writeback, true);

The memcg loop should be here, together with a check on the written
bytes to check if the reclaim goal was achieved. I think nr_to_writeback
is also very confusing, it's really the reclaim target in bytes divided
by PAGE_SIZE. I think you need to pass in the number of bytes to
reclaim/writeback directly.

> +}
> +
>  void zswap_invalidate(swp_entry_t swp)
>  {
>  	pgoff_t offset = swp_offset(swp);
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [PATCH v4 4/5] mm/zswap: Add per-memcg stat for proactive writeback
From: Yosry Ahmed @ 2026-06-22 23:42 UTC (permalink / raw)
  To: Hao Jia
  Cc: akpm, tj, hannes, shakeel.butt, mhocko, mkoutny, nphamcs,
	chengming.zhou, muchun.song, roman.gushchin, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <20260618044857.69439-5-jiahao.kernel@gmail.com>

[..]
>  static int zswap_writeback_entry(struct zswap_entry *entry,
> -				 swp_entry_t swpentry)
> +				 swp_entry_t swpentry,
> +				 bool proactive)

IIUC, if we refactor the code as I suggested in previous changes, we
don't really need to add an argument here..

>  {
>  	struct xarray *tree;
>  	pgoff_t offset = swp_offset(swpentry);
> @@ -1045,6 +1047,15 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	if (entry->objcg)
>  		count_objcg_events(entry->objcg, ZSWPWB, 1);
>  
> +	if (proactive && entry->objcg) {
> +		struct mem_cgroup *memcg;
> +
> +		rcu_read_lock();
> +		memcg = obj_cgroup_memcg(entry->objcg);
> +		mod_memcg_state(memcg, MEMCG_ZSWPWB_PROACTIVE_B, entry->length);
> +		rcu_read_unlock();
> +	}

..and this chunk of code would end up in zswap_proactive_writeback().

^ permalink raw reply

* [PATCH] crypto: af_alg - Add af_alg_restrict sysctl, defaulting to 1
From: Eric Biggers @ 2026-06-22 23:48 UTC (permalink / raw)
  To: linux-crypto, Herbert Xu
  Cc: linux-kernel, linux-doc, linux-bluetooth, iwd, linux-hardening,
	Milan Broz, Demi Marie Obenour, Andy Lutomirski, Eric Biggers

AF_ALG is a frequent source of vulnerabilities and a maintenance
nightmare.  It exposes far more functionality to userspace than ever
should have been exposed, especially to unprivileged processes.  Recent
exploits have targeted kernel internal implementation details like
"authencesn" that have zero use case for userspace access.

Fortunately, AF_ALG is rarely used in practice, as userspace crypto
libraries exist.  And when it is used, only some functionality is known
to be used, and many users are known to hold capabilities already.
iwd for example requires CAP_NET_ADMIN and has a known algorithm list
(https://lore.kernel.org/linux-crypto/bcbbef00-5881-421b-8892-7be6c04b832d@gmail.com/).

Thus, let's restrict the set of allowed algorithms by default, depending
on the capabilities held.

Add a sysctl /proc/sys/crypto/af_alg_restrict with meaning:

    0: unrestricted
    1: limited functionality
    2: completely disabled

Set the default value to 1, which enables an algorithm allowlist for
unprivileged processes and a slightly longer allowlist for privileged
processes.

Note that the list may be tweaked in the future.  However, the common
use cases such as iwd and bluez are taken into account already.  I've
tested that iwd still works with the default value of 1.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 Documentation/admin-guide/sysctl/crypto.rst | 36 +++++++++++
 Documentation/crypto/userspace-if.rst       | 13 +++-
 crypto/af_alg.c                             | 72 +++++++++++++++++++--
 crypto/algif_aead.c                         | 11 ++++
 crypto/algif_hash.c                         | 24 +++++++
 crypto/algif_rng.c                          |  9 +++
 crypto/algif_skcipher.c                     | 20 ++++++
 include/crypto/if_alg.h                     |  8 +++
 8 files changed, 184 insertions(+), 9 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/crypto.rst b/Documentation/admin-guide/sysctl/crypto.rst
index b707bd314a64..9a1bd53287f4 100644
--- a/Documentation/admin-guide/sysctl/crypto.rst
+++ b/Documentation/admin-guide/sysctl/crypto.rst
@@ -5,10 +5,46 @@
 These files show up in ``/proc/sys/crypto/``, depending on the
 kernel configuration:
 
 .. contents:: :local:
 
+.. _af_alg_restrict:
+
+af_alg_restrict
+===============
+
+Controls the level of restriction of AF_ALG.
+
+AF_ALG is a deprecated and rarely-used userspace interface that is a
+frequent source of vulnerabilities. It also unnecessarily exposes a
+large number of kernel implementation details. For more information
+about AF_ALG, see :ref:`Documentation/crypto/userspace-if.rst
+<crypto_userspace_interface>`.
+
+Starting in Linux v7.3, AF_ALG supports only a limited set of
+algorithms by default. This sysctl allows the system administrator to
+remove this restriction when needed for compatibility reasons, or to
+go further and disable AF_ALG entirely. The default value is 1.
+
+===  ==================================================================
+0    AF_ALG is unrestricted.
+
+1    AF_ALG is supported with a limited list of algorithms. The list
+     is designed for compatibility with known users such as iwd and
+     bluez that haven't yet been fixed to use userspace crypto code.
+
+     Specifically, there is an allowlist for unprivileged processes
+     and a somewhat longer allowlist for processes that hold
+     CAP_SYS_ADMIN or CAP_NET_ADMIN in the initial user namespace.
+
+     Attempts to bind() an AF_ALG socket with a disallowed algorithm
+     fail with ENOENT.
+
+2    AF_ALG is completely disabled. Attempts to create an AF_ALG
+     socket fail with EAFNOSUPPORT.
+===  ==================================================================
+
 fips_enabled
 ============
 
 Read-only flag that indicates whether FIPS mode is enabled.
 
diff --git a/Documentation/crypto/userspace-if.rst b/Documentation/crypto/userspace-if.rst
index ab93300c8e04..d6194346e366 100644
--- a/Documentation/crypto/userspace-if.rst
+++ b/Documentation/crypto/userspace-if.rst
@@ -1,5 +1,7 @@
+.. _crypto_userspace_interface:
+
 User Space Interface
 ====================
 
 Introduction
 ------------
@@ -10,13 +12,18 @@ code.
 
 AF_ALG is insecure and is deprecated. Originally added to the kernel in 2010,
 most kernel developers now consider it to be a mistake. Support for hardware
 accelerators, which was the original purpose of AF_ALG, has been removed.
 
-AF_ALG continues to be supported only for backwards compatibility. On systems
-where no programs using AF_ALG remain, the support for it should be disabled by
-disabling ``CONFIG_CRYPTO_USER_API_*``.
+AF_ALG continues to be supported only for backwards compatibility.
+
+Starting in Linux v7.3, the set of algorithms supported by AF_ALG is limited by
+default. See :ref:`/proc/sys/crypto/af_alg_restrict <af_alg_restrict>`.
+
+On systems where no programs using AF_ALG remain, the support for it should be
+disabled entirely by setting ``/proc/sys/crypto/af_alg_restrict`` to 2 or by
+disabling ``CONFIG_CRYPTO_USER_API_*`` in the kernel configuration.
 
 Deprecation
 -----------
 
 AF_ALG was originally intended to provide userspace programs access to crypto
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index cce000e8590e..34b801568fba 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -6,10 +6,11 @@
  *
  * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
  */
 
 #include <linux/atomic.h>
+#include <linux/capability.h>
 #include <crypto/if_alg.h>
 #include <linux/crypto.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/key.h>
@@ -20,14 +21,32 @@
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/sched/signal.h>
 #include <linux/security.h>
 #include <linux/string.h>
+#include <linux/sysctl.h>
+#include <linux/user_namespace.h>
 #include <keys/user-type.h>
 #include <keys/trusted-type.h>
 #include <keys/encrypted-type.h>
 
+static int af_alg_restrict = 1;
+
+static const struct ctl_table af_alg_table[] = {
+	{
+		.procname       = "af_alg_restrict",
+		.data           = &af_alg_restrict,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_TWO,
+	},
+};
+
+static struct ctl_table_header *af_alg_header;
+
 struct alg_type_list {
 	const struct af_alg_type *type;
 	struct list_head list;
 };
 
@@ -108,10 +127,43 @@ int af_alg_unregister_type(const struct af_alg_type *type)
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(af_alg_unregister_type);
 
+static bool af_alg_capable(void)
+{
+	return ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN) ||
+	       capable(CAP_SYS_ADMIN);
+}
+
+int af_alg_check_restriction(const char *name,
+			     const struct af_alg_allowlist_entry allowlist[])
+{
+	int level = READ_ONCE(af_alg_restrict);
+
+	if (level == 0)
+		return 0;
+	if (level == 1) {
+		for (const struct af_alg_allowlist_entry *ent = allowlist;
+		     ent->name; ent++) {
+			if (strcmp(name, ent->name) == 0 &&
+			    (!ent->privileged || af_alg_capable()))
+				return 0;
+		}
+	}
+	/*
+	 * Use -ENOENT (the error code for "algorithm not found") instead of
+	 * -EACCES or -EPERM, for the highest chance of correctly triggering
+	 * fallback code paths in userspace programs.
+	 *
+	 * Don't log a warning, since it would be noisy.  iwd tries to bind a
+	 * bunch of algorithms that it never uses.
+	 */
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(af_alg_check_restriction);
+
 static void alg_do_release(const struct af_alg_type *type, void *private)
 {
 	if (!type)
 		return;
 
@@ -504,10 +556,13 @@ static int alg_create(struct net *net, struct socket *sock, int protocol,
 		      int kern)
 {
 	struct sock *sk;
 	int err;
 
+	if (READ_ONCE(af_alg_restrict) == 2)
+		return -EAFNOSUPPORT;
+
 	if (sock->type != SOCK_SEQPACKET)
 		return -ESOCKTNOSUPPORT;
 	if (protocol != 0)
 		return -EPROTONOSUPPORT;
 
@@ -1220,31 +1275,36 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
 }
 EXPORT_SYMBOL_GPL(af_alg_get_rsgl);
 
 static int __init af_alg_init(void)
 {
-	int err = proto_register(&alg_proto, 0);
+	int err;
+
+	af_alg_header = register_sysctl("crypto", af_alg_table);
 
+	err = proto_register(&alg_proto, 0);
 	if (err)
-		goto out;
+		goto out_unregister_sysctl;
 
 	err = sock_register(&alg_family);
-	if (err != 0)
+	if (err)
 		goto out_unregister_proto;
 
-out:
-	return err;
+	return 0;
 
 out_unregister_proto:
 	proto_unregister(&alg_proto);
-	goto out;
+out_unregister_sysctl:
+	unregister_sysctl_table(af_alg_header);
+	return err;
 }
 
 static void __exit af_alg_exit(void)
 {
 	sock_unregister(PF_ALG);
 	proto_unregister(&alg_proto);
+	unregister_sysctl_table(af_alg_header);
 }
 
 module_init(af_alg_init);
 module_exit(af_alg_exit);
 MODULE_DESCRIPTION("Crypto userspace interface");
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 787aac8aeb24..b9217f9086aa 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -32,10 +32,15 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/net.h>
 #include <net/sock.h>
 
+static const struct af_alg_allowlist_entry aead_allowlist[] = {
+	{ "ccm(aes)", true }, /* bluez */
+	{},
+};
+
 static inline bool aead_sufficient_data(struct sock *sk)
 {
 	struct alg_sock *ask = alg_sk(sk);
 	struct sock *psk = ask->parent;
 	struct alg_sock *pask = alg_sk(psk);
@@ -342,10 +347,16 @@ static struct proto_ops algif_aead_ops_nokey = {
 	.poll		=	af_alg_poll,
 };
 
 static void *aead_bind(const char *name)
 {
+	int err;
+
+	err = af_alg_check_restriction(name, aead_allowlist);
+	if (err)
+		return ERR_PTR(err);
+
 	return crypto_alloc_aead(name, 0, AF_ALG_CRYPTOAPI_MASK);
 }
 
 static void aead_release(void *private)
 {
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 5452ad6c1506..a8d958d51ece 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -14,10 +14,28 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/net.h>
 #include <net/sock.h>
 
+static const struct af_alg_allowlist_entry hash_allowlist[] = {
+	{ "cmac(aes)", true }, /* iwd, bluez */
+	{ "hmac(md5)", true }, /* iwd */
+	{ "hmac(sha1)", true }, /* iwd */
+	{ "hmac(sha224)", true }, /* iwd */
+	{ "hmac(sha256)", true }, /* iwd */
+	{ "hmac(sha384)", true }, /* iwd */
+	{ "hmac(sha512)", true }, /* iwd, sha512hmac */
+	{ "md4", true }, /* iwd */
+	{ "md5", true }, /* iwd */
+	{ "sha1", false }, /* iwd, iproute2 < 7.0 */
+	{ "sha224", true }, /* iwd */
+	{ "sha256", true }, /* iwd */
+	{ "sha384", true }, /* iwd */
+	{ "sha512", true }, /* iwd */
+	{},
+};
+
 struct hash_ctx {
 	struct af_alg_sgl sgl;
 
 	u8 *result;
 
@@ -380,10 +398,16 @@ static struct proto_ops algif_hash_ops_nokey = {
 	.accept		=	hash_accept_nokey,
 };
 
 static void *hash_bind(const char *name)
 {
+	int err;
+
+	err = af_alg_check_restriction(name, hash_allowlist);
+	if (err)
+		return ERR_PTR(err);
+
 	return crypto_alloc_ahash(name, 0, AF_ALG_CRYPTOAPI_MASK);
 }
 
 static void hash_release(void *private)
 {
diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c
index 4dfe7899f8fa..bd522915d56d 100644
--- a/crypto/algif_rng.c
+++ b/crypto/algif_rng.c
@@ -48,10 +48,14 @@
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
 MODULE_DESCRIPTION("User-space interface for random number generators");
 
+static const struct af_alg_allowlist_entry rng_allowlist[] = {
+	{},
+};
+
 struct rng_ctx {
 #define MAXSIZE 128
 	unsigned int len;
 	struct crypto_rng *drng;
 	u8 *addtl;
@@ -199,10 +203,15 @@ static struct proto_ops __maybe_unused algif_rng_test_ops = {
 
 static void *rng_bind(const char *name)
 {
 	struct rng_parent_ctx *pctx;
 	struct crypto_rng *rng;
+	int err;
+
+	err = af_alg_check_restriction(name, rng_allowlist);
+	if (err)
+		return ERR_PTR(err);
 
 	pctx = kzalloc_obj(*pctx);
 	if (!pctx)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index df20bdfe1f1f..2b8069667974 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -32,10 +32,24 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/net.h>
 #include <net/sock.h>
 
+static const struct af_alg_allowlist_entry skcipher_allowlist[] = {
+	{ "adiantum(xchacha12,aes)", false }, /* cryptsetup */
+	{ "adiantum(xchacha20,aes)", false }, /* cryptsetup */
+	{ "cbc(aes)", true }, /* iwd */
+	{ "cbc(des)", true }, /* iwd */
+	{ "cbc(des3_ede)", true }, /* iwd */
+	{ "ctr(aes)", true }, /* iwd */
+	{ "ecb(aes)", true }, /* iwd, bluez */
+	{ "ecb(des)", true }, /* iwd */
+	{ "hctr2(aes)", false }, /* cryptsetup */
+	{ "xts(aes)", false }, /* cryptsetup benchmark */
+	{},
+};
+
 static int skcipher_sendmsg(struct socket *sock, struct msghdr *msg,
 			    size_t size)
 {
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
@@ -307,10 +321,16 @@ static struct proto_ops algif_skcipher_ops_nokey = {
 	.poll		=	af_alg_poll,
 };
 
 static void *skcipher_bind(const char *name)
 {
+	int err;
+
+	err = af_alg_check_restriction(name, skcipher_allowlist);
+	if (err)
+		return ERR_PTR(err);
+
 	return crypto_alloc_skcipher(name, 0, AF_ALG_CRYPTOAPI_MASK);
 }
 
 static void skcipher_release(void *private)
 {
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index 7643ba954125..4e9ed8e73403 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -159,13 +159,21 @@ struct af_alg_ctx {
 	unsigned int len;
 
 	unsigned int inflight;
 };
 
+struct af_alg_allowlist_entry {
+	const char *name;
+	bool privileged;
+};
+
 int af_alg_register_type(const struct af_alg_type *type);
 int af_alg_unregister_type(const struct af_alg_type *type);
 
+int af_alg_check_restriction(const char *name,
+			     const struct af_alg_allowlist_entry allowlist[]);
+
 int af_alg_release(struct socket *sock);
 void af_alg_release_parent(struct sock *sk);
 int af_alg_accept(struct sock *sk, struct socket *newsock,
 		  struct proto_accept_arg *arg);
 

base-commit: 1dc18801be29bc54709aa355b8acd80e183b03cd
-- 
2.54.0


^ permalink raw reply related

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
From: Ackerley Tng @ 2026-06-22 23:54 UTC (permalink / raw)
  To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
In-Reply-To: <48777f4749fa43d5648085dbb2037aa99c144a88.1780676742.git.tarunsahu@google.com>

Tarun Sahu <tarunsahu@google.com> writes:

> This patch introduces the freeze on gmem_inode which prevents

Can't find the reference now, but commit messages should take the
imperative mood and avoid "this patch" [*]

[*] https://lore.kernel.org/all/YKRWNaqzo4GVDxHP@google.com/

> the fallocate call and any new page fault allocation. This will avoid
> gmem file modification when it is being preserved
>
> Used srcu lock to synchronise the freeze call, where write blocks
> until all the reads are free. And reads are re-entrant.
>
> In case a fault fails, it returns -EPERM and VM_EXIT to userspace. Userspace
> must handle this properly as every new fault will fail.
>
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
>
> [...snip...]
>
> @@ -105,12 +108,20 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  	if (!IS_ERR(folio))
>  		return folio;
>
> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> +	if (kvm_gmem_is_frozen(inode)) {
> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +		return ERR_PTR(-EPERM);
> +	}
> +
>  	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
>  	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
>  					 FGP_LOCK | FGP_CREAT,
>  					 mapping_gfp_mask(inode->i_mapping), policy);
>  	mpol_cond_put(policy);
>
> +	srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +
>  	/*
>  	 * External interfaces like kvm_gmem_get_pfn() support dealing
>  	 * with hugepages to a degree, but internally, guest_memfd currently
> @@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
>  static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>  			       loff_t len)
>  {
> +	struct inode *inode = file_inode(file);
>  	int ret;
> +	int idx;
>
> -	if (!(mode & FALLOC_FL_KEEP_SIZE))
> -		return -EOPNOTSUPP;
> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> +	if (kvm_gmem_is_frozen(inode)) {
> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +		return -EPERM;
> +	}

fallocate may eventually go to kvm_gmem_get_folio(), so that would check
kvm_gmem_is_frozen() twice. Is this meant to catch the punch hole case?

>
> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> -		return -EOPNOTSUPP;
> +	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
>
> -	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> -		return -EINVAL;
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +
> +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}

There's some reordering here. Why not let the validation happen like
before, then check kvm_gmem_is_frozen()?

>
>  	if (mode & FALLOC_FL_PUNCH_HOLE)
>  		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
>
> [...snip...]
>
> +
> +/**
> + * kvm_gmem_freeze - Freeze or unfreeze a guest_memfd inode mapping.
> + * @inode: The guest_memfd inode.
> + * @freeze: True to freeze, false to unfreeze.
> + *
> + * This API is used strictly during the live update / preservation transition
> + * window to prevent host userspace and guest-side faults from making any
> + * mapping modifications (such as fallocate or page fault allocation)
> + * to the guest_memfd page cache.
> + *
> + * Synchronization Strategy (Sleepable RCU):
> + * To avoid high-contention VFS locks (like inode_lock or
> + * filemap_invalidate_lock) on the vCPU page fault hot paths, this subsystem
> + * implements a lightweight, system-wide Sleepable RCU (SRCU) mechanism
> + * (`kvm_gmem_freeze_srcu`):
> + *
> + * Global vs. Per-Inode SRCU
> + * ======================
> + * A single system-wide global static `srcu_struct` is used instead of a
> + * per-inode SRCU structure to completely prevent unprivileged users from
> + * exhausting the host's per-CPU memory allocator. Because
> + * `init_srcu_struct()` allocates per-CPU memory via `alloc_percpu()`, which
> + * is not accounted by memory cgroups (memcg),
> + * a per-inode SRCU structure would allow a tenant to bypass cgroup limits and
> + * trigger a system-wide Out-of-Memory (OOM) crash simply by spawning a large
> + * number of guest_memfd file descriptors (bounded only by RLIMIT_NOFILE).
> + *
> + * Flag Modification Note:
> + * Since `GUEST_MEMFD_F_MAPPING_FROZEN` is the ONLY flag in
> + * `GMEM_I(inode)->flags` that is mutated dynamically at runtime (all other
> + * flags are creation-time flags which remain strictly read-only), there is
> + * no possibility of concurrent bit-modification races. Therefore, a standard
> + * `WRITE_ONCE` is fully safe and does not require complex `cmpxchg`
> + * synchronization loops.
> + */
> +void kvm_gmem_freeze(struct inode *inode, bool freeze)
> +{
> +	u64 flags = READ_ONCE(GMEM_I(inode)->flags);
> +
> +	if (freeze)
> +		flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
> +	else
> +		flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
> +
> +	WRITE_ONCE(GMEM_I(inode)->flags, flags);
> +
> +	if (freeze)
> +		synchronize_srcu(&kvm_gmem_freeze_srcu);

Why only synchronize on freeze but not unfreeze?

> +}
> +
>
> [...snip...]
>

^ permalink raw reply

* Re: [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support
From: Ackerley Tng @ 2026-06-22 23:59 UTC (permalink / raw)
  To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
In-Reply-To: <20ae20f9d1a198b289444ebb4c824314cbba1bcf.1780676742.git.tarunsahu@google.com>

Tarun Sahu <tarunsahu@google.com> writes:

> Introduce core infrastructure to support VM preservation with LUO.
>
> First two changes are just refactoring, no functional change, third
> change introduces a new member in struct kvm.
> - Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
> - Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
>   and anon_inode_getfile() to provide a unified VM file creation API.
> - Track a weak reference to the backing file in struct kvm under
>   CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
>   without circular lifetime dependencies.
>

Given the above, I think this should be separate patches.

> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
> ---
>  include/linux/kvm_host.h | 14 +++++++
>  virt/kvm/kvm_main.c      | 79 +++++++++++++++++++++++++++++-----------
>  virt/kvm/kvm_mm.h        |  3 ++
>  3 files changed, 75 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4c14aee1fb06..9111a28637af 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -874,6 +874,18 @@ struct kvm {
>  #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
>  	/* Protected by slots_lock (for writes) and RCU (for reads) */
>  	struct xarray mem_attr_array;
> +#endif
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> +	/*
> +	 * Weak reference to the VFS file backing this KVM instance. Stored
> +	 * without incrementing the file refcount to prevent a circular lifetime
> +	 * dependency (since file->private_data already pins this struct kvm).
> +	 * Used exclusively to resolve the file pointer back from struct kvm.
> +	 *
> +	 * Written/cleared via rcu_assign_pointer() and read locklessly under
> +	 * RCU (e.g. via get_file_active() to prevent ABA races).
> +	 */
> +	struct file *vm_file;
>  #endif

We didn't really talk about this during the calls, but it seems weird to
preserve a vm_file with pretty much nothing other than the vm type. The
entire VM is re-created, which means it could potentially be a
completely different VM?

In some sense it's more flexible since the guest_memfd can be restored
with some completely different VM, but it seems like it could introduce
other issues.

I think other KVM folks would probably have more thoughts here.

>  	char stats_id[KVM_STATS_NAME_SIZE];
>  };
> @@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
>  bool kvm_get_kvm_safe(struct kvm *kvm);
>  void kvm_put_kvm(struct kvm *kvm);
>  bool file_is_kvm(struct file *file);
> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
>  void kvm_put_kvm_no_destroy(struct kvm *kvm);
> +void kvm_uevent_notify_vm_create(struct kvm *kvm);
>
>  static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
>  {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 89489996fbc1..65f0c5fb353e 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -67,9 +67,6 @@
>  #include <linux/kvm_dirty_ring.h>
>
>
> -/* Worst case buffer size needed for holding an integer. */
> -#define ITOA_MAX_LEN 12
> -
>  MODULE_AUTHOR("Qumranet");
>  MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
>  MODULE_LICENSE("GPL");
> @@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>  {
>  	struct kvm *kvm = filp->private_data;
>
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> +	/*
> +	 * Clear the weak reference of the vm file.
> +	 * In case vm file is closed by userspace, but kvm still has
> +	 * other users like vCPUs, clearing this pointer ensures
> +	 * that we don't have a dangling pointer to a closed file.
> +	 *
> +	 * Cleared via rcu_assign_pointer() to ensure proper memory visibility
> +	 * for concurrent lockless readers under RCU.
> +	 */
> +	rcu_assign_pointer(kvm->vm_file, NULL);
> +#endif
> +
>  	kvm_irqfd_release(kvm);
>
>  	kvm_put_kvm(kvm);
> @@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
>  }
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
>
> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
> +{
> +	struct kvm *kvm = kvm_create_vm(type, fdname);
> +	struct file *file;
> +
> +	if (IS_ERR(kvm))
> +		return ERR_CAST(kvm);
> +
> +	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> +	if (IS_ERR(file)) {
> +		kvm_put_kvm(kvm);
> +		return file;
> +	}
> +
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> +	/*
> +	 * Weak reference to the file (without get_file()) to prevent a circular
> +	 * dependency. Safe because the file's release path clears this pointer
> +	 * and drops its reference to the VM.
> +	 *
> +	 * Written via rcu_assign_pointer() because the pointer can be read
> +	 * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
> +	 * get_file_active() to prevent lockless ABA races).
> +	 */
> +	rcu_assign_pointer(kvm->vm_file, file);
> +#endif
> +
> +	/*
> +	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
> +	 * already set, with ->release() being kvm_vm_release().  In error
> +	 * cases it will be called by the final fput(file) and will take
> +	 * care of doing kvm_put_kvm(kvm).
> +	 */
> +
> +	return file;
> +}
> +
>  static int kvm_dev_ioctl_create_vm(unsigned long type)
>  {
>  	char fdname[ITOA_MAX_LEN + 1];
>  	int r, fd;
> -	struct kvm *kvm;
>  	struct file *file;
>
>  	fd = get_unused_fd_flags(O_CLOEXEC);
> @@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
>
>  	snprintf(fdname, sizeof(fdname), "%d", fd);
>
> -	kvm = kvm_create_vm(type, fdname);
> -	if (IS_ERR(kvm)) {
> -		r = PTR_ERR(kvm);
> -		goto put_fd;
> -	}
> -
> -	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> +	file = kvm_create_vm_file(type, fdname);
>  	if (IS_ERR(file)) {
>  		r = PTR_ERR(file);
> -		goto put_kvm;
> +		goto put_fd;
>  	}
>
> -	/*
> -	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
> -	 * already set, with ->release() being kvm_vm_release().  In error
> -	 * cases it will be called by the final fput(file) and will take
> -	 * care of doing kvm_put_kvm(kvm).
> -	 */
> -	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
> +	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);

Notifying with file->private_data threw me off... I would rather inline
the rcu_assign_pointer() in this function and have this line read
notify(..., kvm) like before.

>
>  	fd_install(fd, file);
>  	return fd;
>
> -put_kvm:
> -	kvm_put_kvm(kvm);
>  put_fd:
>  	put_unused_fd(fd);
>  	return r;
> @@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
>  	kfree(env);
>  }
>
> +void kvm_uevent_notify_vm_create(struct kvm *kvm)
> +{
> +	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
> +}
> +
>  static void kvm_init_debug(void)
>  {
>  	const struct file_operations *fops;
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 9fcc5d5b7f8d..7aa1d65c3d46 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -3,6 +3,9 @@
>  #ifndef __KVM_MM_H__
>  #define __KVM_MM_H__ 1
>
> +/* Worst case buffer size needed for holding an integer as a string. */
> +#define ITOA_MAX_LEN 12
> +
>  /*
>   * Architectures can choose whether to use an rwlock or spinlock
>   * for the mmu_lock.  These macros, for use in common code
> --
> 2.54.0.1032.g2f8565e1d1-goog

^ permalink raw reply

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
From: Sean Christopherson @ 2026-06-23  0:09 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, axelrasmussen, linux-kselftest,
	kexec, linux-kernel, linux-doc, kvm, linux-mm
In-Reply-To: <CAEvNRgFEHciT3T9y+qEYRvXhDwfrggoU7Rm=f9hT3OrV+wgpNQ@mail.gmail.com>

On Mon, Jun 22, 2026, Ackerley Tng wrote:
> Tarun Sahu <tarunsahu@google.com> writes:
> 
> > This patch introduces the freeze on gmem_inode which prevents
> 
> Can't find the reference now, but commit messages should take the
> imperative mood and avoid "this patch" [*]

From Documentation/process/submitting-patches.rst:

  Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
  instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
  to do frotz", as if you are giving orders to the codebase to change
  its behaviour.

Documentation/process/maintainer-tip.rst and Documentation/process/maintainer-kvm-x86.rst
elaborate more on the preferred style (I do most of the guest_memfd maintenance,
and so for all intents and purposes it's bound by KVM x86 "rules").

^ permalink raw reply

* Re: [PATCH v8 05/46] KVM: Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable
From: Sean Christopherson @ 2026-06-23  0:16 UTC (permalink / raw)
  To: Julian Braha
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <8e53844c-f2f8-4a4b-bf72-f3140c170d43@gmail.com>

On Fri, Jun 19, 2026, Julian Braha wrote:
> Hi Ackerley,
> 
> On 6/19/26 01:31, Ackerley Tng via B4 Relay wrote:
> 
> >  config KVM_VM_MEMORY_ATTRIBUTES
> > -	bool
> > +	depends on KVM_SW_PROTECTED_VM || KVM_INTEL_TDX || KVM_AMD_SEV
> > +	bool "Enable per-VM PRIVATE vs. SHARED attributes (for CoCo VMs)"
> 
> Sorry for the style nitpick, but could you keep the type and prompt as
> the first attribute in the Kconfig option definition (like the other
> options do)?

No need to be sorry, I've no idea why I put the "depends" first.  I don't even
know if that qualifies as a nit :-)

Ackerley, if you can provide your SoB (for Fuad's feedback), I can fixup when
applying (assuming nothing else necessitates v9).

^ permalink raw reply

* Re: [PATCH v8 13/46] KVM: guest_memfd: Add base support for KVM_SET_MEMORY_ATTRIBUTES2
From: Sean Christopherson @ 2026-06-23  0:22 UTC (permalink / raw)
  To: Fuad Tabba
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	willy, wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CA+EHjTx2xKjheiW5VzHw_TdWFUqdJqfgu=dOPa=_yaYBMY8uyw@mail.gmail.com>

On Fri, Jun 19, 2026, Fuad Tabba wrote:
> On Fri, 19 Jun 2026 at 01:31, Ackerley Tng via B4 Relay
> <devnull+ackerleytng.google.com@kernel.org> wrote:
> >
> > From: Ackerley Tng <ackerleytng@google.com>
> >
> > Introduce base support for KVM_SET_MEMORY_ATTRIBUTES2 in guest_memfd, which
> > just updates attributes tracked by guest_memfd.
> >
> > Validate input fields in general. Guard usage of KVM_SET_MEMORY_ATTRIBUTES2
> > by making sure requested attributes are supported for this instance of kvm.
> >
> > A new KVM_SET_MEMORY_ATTRIBUTES2 is defined to support writes (unlike
> > KVM_SET_MEMORY_ATTRIBUTES) in addition to reads so it can provide error
> > details to userspace. This will be used in a later patch.
> >
> > The two ioctls use their corresponding structs with no overlap, but
> > backward compatibility is baked in for future support of
> > KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2 in the VM
> > ioctl.
> >
> > The process of setting memory attributes is set up such that the later half
> > will not fail due to allocation. Any necessary checks are performed before
> > the point of no return.
> >
> > Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> > Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> > > Co-developed-by: Sean Christopherson <seanjc@google.com>
> > > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > Reviewed-by: Fuad Tabba <tabba@google.com>
> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 
> Not sure if it's user error on my part, if I'm applying this to the
> wrong base, but I found a build break here on patch 13:
> kvm_gmem_invalidate_start() doesn't exist in the base tree. The
> function is kvm_gmem_invalidate_begin() here. The rename
> (190cc5370a8b6) landed via a different merge path and isn't an
> ancestor of the stated base.
> 
> Patches 19 and 20 have the same mismatch. Fix for all three is
> s/kvm_gmem_invalidate_start/kvm_gmem_invalidate_begin/.

Ya, Ackerley used a slightly older kvm/next to send the patches.  I at least was
testing against kvm-x86/next, which does have the rename.

Other than noting that this should be applied against the current kvm/next, I
don't think there's anything else to be done?

^ permalink raw reply

* Re: [PATCH 1/2] cgroup/cpuset: Avoid unnecessary cpus & mems update in cpuset_hotplug_update_tasks()
From: Ridong Chen @ 2026-06-23  1:14 UTC (permalink / raw)
  To: Waiman Long, Tejun Heo, Johannes Weiner, Michal Koutný,
	Jonathan Corbet, Shuah Khan
  Cc: cgroups, linux-kernel, linux-doc
In-Reply-To: <20260622224509.1927419-1-longman@redhat.com>



On 6/23/2026 6:45 AM, Waiman Long wrote:
> As reported by sashiko [1], cpuset_hotplug_update_tasks() may perform
> unnecessary task iteration and updating of tasks' CPU and node masks
> when mems_allowed and/or cpus_allowed are not set in cpuset v2. It is
> due to the fact that the temporary new_cpus and new_mems masks do not
> inherit parent's effective_cpus/mems when they are empty which is the
> expected behavior for cpuset v2 since commit 4ec22e9c5a90 ("cpuset:
> Enable cpuset controller in default hierarchy").
> 
> Fix that and avoid unnecessary work by adding the empty mask checks and
> inheriting the parent's versions if empty.
> 
> [1] https://sashiko.dev/#/patchset/20260621032816.1806773-1-longman%40redhat.com
> 
> Fixes: 4ec22e9c5a90 ("cpuset: Enable cpuset controller in default hierarchy")
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>   kernel/cgroup/cpuset.c | 8 ++++++++
>   1 file changed, 8 insertions(+)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index aff86acea701..bc0207fd6e57 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -3925,6 +3925,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
>   	compute_effective_cpumask(&new_cpus, cs, parent);
>   	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
>   
> +	if (is_in_v2_mode()) {
> +		/* Inherit parent's effective_cpus/mems if empty */
> +		if (cpumask_empty(&new_cpus))
> +			cpumask_copy(&new_cpus, parent->effective_cpus);
> +		if (nodes_empty(new_mems))
> +			new_mems = parent->effective_mems;
> +	}
> +
>   	if (!tmp || !cs->partition_root_state)
>   		goto update_tasks;
>   

I noticed that compute_effective_cpumask(...) is called in several 
places, so I think the logic should be consolidated into that function.

```
/*
 * Compute the effective CPU mask of @cs into @new_cpus.
 *
 * @new_cpus: output mask, receives the result
 * @cs:       cpuset whose cpus_allowed is being restricted
 * @parent:   parent cpuset supplying effective_cpus
 *
 * The result is cs->cpus_allowed intersected with the parent's
 * effective_cpus. If that intersection is empty and cpuset is in v2 mode,
 * the parent's effective_cpus is copied into @new_cpus instead.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
				      struct cpuset *cs, struct cpuset *parent)
{
	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
	/* new_cpus is already a pointer here; do not take its address. */
	if (cpumask_empty(new_cpus) && is_in_v2_mode())
		cpumask_copy(new_cpus, parent->effective_cpus);
}

```

Similarly, for new_mems, should we introduce a dedicated helper like 
compute_effective_nodemask? The same fallback logic is needed in 
update_nodemasks_hier:


```
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
...
		bool has_mems = nodes_and(*new_mems, cp->mems_allowed, 
parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (is_in_v2_mode() && !has_mems)
			*new_mems = parent->effective_mems;
...
```

-- 
Best regards
Ridong


^ permalink raw reply

* Re: [PATCH v8 15/46] KVM: guest_memfd: Call arch invalidate hooks on conversion
From: Sean Christopherson @ 2026-06-23  1:15 UTC (permalink / raw)
  To: Fuad Tabba
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	willy, wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CA+EHjTx+3U++dnhGEkwh2SO82xMugAvvJ9ee1O__sxZCKL_X5A@mail.gmail.com>

On Fri, Jun 19, 2026, Fuad Tabba wrote:
> On Fri, 19 Jun 2026 at 01:31, Ackerley Tng via B4 Relay
> <devnull+ackerleytng.google.com@kernel.org> wrote:
> >
> > From: Ackerley Tng <ackerleytng@google.com>
> >
> > When memory in guest_memfd is converted from private to shared, the
> > platform-specific state associated with the guest-private pages must be
> > invalidated or cleaned up.
> >
> > Iterate over the folios in the affected range and call the
> > kvm_arch_gmem_invalidate() hook for each PFN range. This allows
> > architectures to perform necessary teardown, such as updating hardware
> > metadata or encryption states, before the pages are transitioned to the
> > shared state.
> >
> > Invoke this helper after indicating to KVM's mmu code that an invalidation
> > is in progress to stop in-flight page faults from succeeding.
> >
> > Reviewed-by: Fuad Tabba <tabba@google.com>
> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 
> Coming back to this after working through the arm64/pKVM side. My
> Reviewed-by here is from the previous round and the patch hasn't
> changed, but I missed an implication for arm64.
> 
> kvm_arch_gmem_invalidate() is now called from two paths with the same
> (start, end) signature: folio teardown (kvm_gmem_free_folio) and
> private->shared conversion (here). For SNP/TDX that's fine, conversion is
> destructive anyway. For pKVM the two need opposite content semantics:
> conversion must preserve the page in place (same physical page, the point
> of in-place conversion without encryption), while teardown must scrub it
> before returning it to the host.
>
> The hook gets only a pfn range with no indication of which caller it's
> serving, so arm64 can't give the two paths the behaviour they need. It
> would help to signal intent on the conversion path: a reason/flag, a
> separate hook, or not routing non-destructive conversion through the
> teardown hook.
> 
> arm64 isn't here yet, so this isn't urgent, but the hook is gaining a
> second caller now, and it's cheaper to leave room for the distinction
> than to change a generic contract other arches depend on later.

Crud.  It may not be urgent for arm64, but it's urgent for other reasons that
I "can't" describe in detail at the moment, and even if that weren't the case, I
think we should clean things up now.  More below.

> >  virt/kvm/guest_memfd.c | 41 +++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 41 insertions(+)
> >
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index 433f79047b9d1..3c94442bc8131 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -607,6 +607,42 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
> >         return safe;
> >  }
> >
> > +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> > +static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)

Not your fault, but kvm_arch_gmem_invalidate() is badly misnamed.  It's not
"invalidating" anything, it's much more of a "free" callback, as SNP uses it to
put physical pages back into a shared state when a maybe-private folio is freed.

As Fuad points out, (ab)using that hook for the private=>shared conversion case
"works", but not broadly.  And it makes the bad name worse, because it's called
from code that _is_ doing true invalidations.  For pKVM, it may not even need to
do anything invalidation-like.

To avoid a conflict with patches that are going to have priority over this series,
to set the stage for arm64 support, and to avoid bleeding vendor details
into guest_memfd, as if they are core guest_memfd behavior (only SNP needs the
"invalidation" on this specific transition), I think we should add an arch hook
to do conversions straightaway.

Unless there's a clever option I'm missing, it'll mean adding yet another
HAVE_KVM_ARCH_GMEM_XXX flag?  Hmm, especially because IIUC, arm64/pKVM doesn't
need a callback for this case, only the free_folio case.

> > +{
> > +       struct folio_batch fbatch;
> > +       pgoff_t next = start;
> > +       int i;
> > +
> > +       folio_batch_init(&fbatch);
> > +       while (filemap_get_folios(inode->i_mapping, &next, end - 1, &fbatch)) {
> > +               for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> > +                       struct folio *folio = fbatch.folios[i];
> > +                       pgoff_t start_index, end_index;
> > +                       kvm_pfn_t start_pfn, end_pfn;
> > +
> > +                       start_index = max(start, folio->index);
> > +                       end_index = min(end, folio_next_index(folio));
> > +                       /*
> > +                        * end_index is either in folio or points to
> > +                        * the first page of the next folio. Hence,
> > +                        * all pages in range [start_index, end_index)
> > +                        * are contiguous.
> > +                        */
> > +                       start_pfn = folio_file_pfn(folio, start_index);
> > +                       end_pfn = start_pfn + end_index - start_index;
> > +
> > +                       kvm_arch_gmem_invalidate(start_pfn, end_pfn);
> > +               }
> > +
> > +               folio_batch_release(&fbatch);
> > +               cond_resched();
> > +       }
> > +}
> > +#else
> > +static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end) {}
> > +#endif
> > +
> >  static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> >                                      size_t nr_pages, uint64_t attrs,
> >                                      pgoff_t *err_index)
> > @@ -647,7 +683,12 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> >          */
> >
> >         kvm_gmem_invalidate_start(inode, start, end);
> > +
> > +       if (!to_private)
> > +               kvm_gmem_invalidate(inode, start, end);

E.g. instead make this something like this?

	kvm_gmem_set_pfn_attributes(...)

Hrm, though that wastes folio lookups in the to_private case.  So maybe just this,
assuming pKVM doesn't need to take additional action on conversions?

	if (!to_private)
		kvm_gmem_make_shared(...)

Actually, if we do that, then we don't need a separate arch hook, just a separate
config.  It'll still bleed SNP details into guest_memfd, but it'll at least be
done in a way that's more explicitly arch specific (and it's no different than
what we already do for PREPARE...).

E.g. this?  There will still be a looming rename conflict, but that's easy enough
to handle.

diff --git virt/kvm/guest_memfd.c virt/kvm/guest_memfd.c
index 9ce5be7843f2..8aead0abd788 100644
--- virt/kvm/guest_memfd.c
+++ virt/kvm/guest_memfd.c
@@ -648,8 +648,8 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
        return safe;
 }
 
-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
-static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)
+#ifdef CONFIG_KVM_ARCH_GMEM_FREE_ON_SHARED_CONVERSION
+static void kvm_gmem_make_shared(struct inode *inode, pgoff_t start, pgoff_t end)
 {
        struct folio_batch fbatch;
        pgoff_t next = start;
@@ -681,7 +681,7 @@ static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)
        }
 }
 #else
-static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end) {}
+static void kvm_gmem_make_shared(struct inode *inode, pgoff_t start, pgoff_t end) { }
 #endif
 
 static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
@@ -729,7 +729,7 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
        kvm_gmem_invalidate_start(inode, start, end);
 
        if (!to_private)
-               kvm_gmem_invalidate(inode, start, end);
+               kvm_gmem_make_shared(inode, start, end);
 
        mas_store_prealloc(&mas, xa_mk_value(attrs));

^ permalink raw reply related

* Re: [PATCH v2 0/7] seg6: add SRv6 Mobile User Plane (RFC 9433) behaviors
From: Yuya Kusakabe @ 2026-06-23  1:18 UTC (permalink / raw)
  To: andrea
  Cc: Yuya Kusakabe, andrea.mayer, davem, edumazet, dsahern, kuba,
	pabeni, horms, justin.iurman, shuah, corbet, skhan, linux-kernel,
	netdev, linux-kselftest, linux-doc, stefano.salsano, ahabdels
In-Reply-To: <20260608023951.ccd278890d7c489dbfe21113@common-net.org>

Hi Andrea,

Thank you for the answers.

> On the placement, the new lwtunnel encap type you propose could be a way to
> implement the seg6_mobile.c separation. Since this touches UAPI in
> include/uapi/linux/lwtunnel.h beyond the SRv6 subsystem and cannot be
> undone once merged, it needs careful design.
[...]
> As far as I can see, RFC 9433 has only one Headend behavior, and no L2 or
> reduced variants. So a single LWTUNNEL_ENCAP_SEG6_MOBILE handling both
> End.M.* and H.M.GTP4.D could be viable if accepting both input families
> (ETH_P_IPV6 for End.M.*, ETH_P_IP for H.M.GTP4.D) is treated as a design
> choice of the new encap type, not a stretching of the seg6_local endpoint
> processing model.
>
> These trade-offs are worth weighing in the final design. [...] I think the
> lwtunnel direction will need feedback and comments from its community and
> maintainers.

Agreed. The first per-behavior RFC series (End.MAP) will introduce the
LWTUNNEL_ENCAP_SEG6_MOBILE encap type and the SEG6_MOBILE_* attribute
namespace, and explain in its cover letter that this is the shared
container for the RFC 9433 Section 6 behaviors, so the lwtunnel and
routing folks can weigh in early. The dual input family (ETH_P_IPV6
for End.M.*, ETH_P_IP for H.M.GTP4.D) is specific to H.M.GTP4.D, so I
will lay that out in the H.M.GTP4.D cover letter; keeping it last in
the posting order gives that discussion time to converge.

> If LWTUNNEL_ENCAP_SEG6_MOBILE is added, using SEG6_MOBILE_* attributes
> instead of SEG6_LOCAL_* removes the NH6/SRH/OIF overload raised in v2.
> After solving the above, additional issues remain in the patchset,
> for example src is overloaded across MUP behaviors, and v4_mask_len
> needs revision. These are independent of the lwtunnel decision.

Both will be addressed in the rework; the details are in my replies to
your patch 2 and patch 3 reviews. In short: v4_mask_len and the src
template will be removed from End.M.GTP4.E entirely (full 32-bit IPv4
DA/SA recovery only), src will mean the verbatim outer IPv6 SA for the
IPv6-emitting behaviors, and the H.M.GTP4.D "Source UPF Prefix"
template can get its own attribute name in that series if you prefer.

> I can lead it. I have been evaluating the SRv6 drop reasons with my
> research group, alongside other pending SRv6 patches.
>
> We can sync offline on which SRv6 reasons fit your MUP behaviors, which
> v2 MUP-specific reasons would fit better as SRv6 or generic, and what
> stays MUP-specific.

Thanks for taking the lead; happy to sync offline. Until the prep
series lands, the per-behavior series will carry no MUP-specific drop
reasons.

> Thanks. Maybe also worth covering bad packets, like fragmented input or
> malformed GTP-U extensions.

Will do; the C-helper selftests will cover malformed and truncated
GTP-U extension chains, a duplicated PDU Session Container, and
fragmented outer input (which the behaviors will reject explicitly).

> Works for me. What matters is that the upcoming patches are well structured
> so NF_HOOK can be wired in cleanly in the follow-up.
>
> I am already working on the fix.

Understood. Each behavior will keep a single strip / transform / push
flow in its input handler, so the hook can later slot between strip
and push without reintroducing the skb->cb context pattern.

Thanks,
Yuya

^ permalink raw reply

* Re: [PATCH v8 23/46] KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
From: Sean Christopherson @ 2026-06-23  1:22 UTC (permalink / raw)
  To: Yan Zhao
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <ajjc0hw8PjGw69e9@yzhao56-desk.sh.intel.com>

On Mon, Jun 22, 2026, Yan Zhao wrote:
> On Thu, Jun 18, 2026 at 05:32:00PM -0700, Ackerley Tng via B4 Relay wrote:
> > From: Ackerley Tng <ackerleytng@google.com>
> > 
> > Update tdx_gmem_post_populate() to handle cases where a source page is
> > not explicitly provided. Instead of returning -EOPNOTSUPP when src_page
> > is NULL, default to using the page associated with the destination PFN.
> > 
> > This change allows for in-place memory conversion where the data is
> > already present in the target PFN, ensuring the TDX module has a valid
> > source page reference for the TDH.MEM.PAGE.ADD operation.
> > 
> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >  Documentation/virt/kvm/x86/intel-tdx.rst |  4 ++++
> >  arch/x86/kvm/vmx/tdx.c                   | 11 ++++++++---
> >  2 files changed, 12 insertions(+), 3 deletions(-)
> > 
> > diff --git a/Documentation/virt/kvm/x86/intel-tdx.rst b/Documentation/virt/kvm/x86/intel-tdx.rst
> > index 6a222e9d09541..74357fe87f9ec 100644
> > --- a/Documentation/virt/kvm/x86/intel-tdx.rst
> > +++ b/Documentation/virt/kvm/x86/intel-tdx.rst
> > @@ -158,6 +158,10 @@ KVM_TDX_INIT_MEM_REGION
> >  Initialize @nr_pages TDX guest private memory starting from @gpa with userspace
> >  provided data from @source_addr. @source_addr must be PAGE_SIZE-aligned.
> >  
> > +If guest_memfd in-place conversion is enabled, pass NULL for @source_addr to
> > +initialize the memory region using memory contents already populated in
> > +guest_memfd memory.
> > +
> >  Note, before calling this sub command, memory attribute of the range
> >  [gpa, gpa + nr_pages] needs to be private.  Userspace can use
> >  KVM_SET_MEMORY_ATTRIBUTES to set the attribute.
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index ffe9d0db58c59..56d10333c61a7 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -3198,8 +3198,12 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> >  	if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
> >  		return -EIO;
> >  
> > -	if (!src_page)
> > -		return -EOPNOTSUPP;
> > +	if (!src_page) {
> > +		if (!gmem_in_place_conversion)
> When userspace turns on gmem_in_place_conversion while creating guest_memfd
> without the MMAP flag, the absence of src_page should still be treated as an
> error.

Why MMAP?  Shouldn't this be a general "if (!src_page && !up-to-date)"?  Just
because userspace _can_ mmap() the memory doesn't mean userspace _has_ mmap()'d
and written memory.  And when write() lands, MMAP wouldn't be necessary to
initialize the memory.

> Additionally, to properly enable in-place copying for the TDX initial memory
> region, userspace must not only specify source_addr to NULL, but also follow
> a specific sequence (where steps 1/2/3/7 are required only for in-place copy):
> 1. create guest_memfd with MMAP flag
> 2. mmap the guest_memfd.
> 3. convert the initial memory range to shared.
> 4. copy initial content to the source page.
> 5. convert the initial memory range to private
> 6. invoke ioctl KVM_TDX_INIT_MEM_REGION.
> 7. do not unmap the source backend.
> 
> So, would it be reasonable to introduce a dedicated flag that allows userspace
> to explicitly opt into the in-place copy functionality? e.g.,

Why?  It's userspace's responsibility to get the above right.  If userspace fails
to provide a src_page when it doesn't want in-place copy, that's a userspace bug.

^ permalink raw reply

* Re: [PATCH 2/2] cgroup/cpuset: Rebind/migrate mm only for threadgroup leader in cpuset_update_tasks_nodemask()
From: Ridong Chen @ 2026-06-23  1:22 UTC (permalink / raw)
  To: Waiman Long, Tejun Heo, Johannes Weiner, Michal Koutný,
	Jonathan Corbet, Shuah Khan
  Cc: cgroups, linux-kernel, linux-doc
In-Reply-To: <20260622224509.1927419-2-longman@redhat.com>



On 6/23/2026 6:45 AM, Waiman Long wrote:
> As reported by sashiko [1], cpuset_update_tasks_nodemask() will do
> mpol_rebind_mm() and possibly cpuset_migrate_mm() for all threads of
> a multithreaded process. Since commit 3df9ca0a2b8b ("cpuset: migrate
> memory only for threadgroup leaders"), cpuset_attach() had been updated
> to rebind and migrate memory only for threadgroup leaders to mark the
> group leader as the owner of the mm_struct.
> 
> To be consistent and avoid unnecessary performance overhead for heavily
> multithreaded processes, follow the cpuset_attach() example and perform
> memory rebind and migration only for threadgroup leaders.
> 
> Also add a paragraph in cgroup-v2.rst under cpuset.mems that the
> threadgroup leader is the memory owner of that threadgroup. Therefore
> the non-leading threads shouldn't be in other cgroups whose "cpuset.mems"
> doesn't fully overlap that of the group leader.
> 
> [1] https://sashiko.dev/#/patchset/20260621032816.1806773-1-longman%40redhat.com
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>   Documentation/admin-guide/cgroup-v2.rst | 7 +++++++
>   kernel/cgroup/cpuset.c                  | 4 ++++
>   2 files changed, 11 insertions(+)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 993446ab66d0..341037c7ec9d 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -2527,6 +2527,13 @@ Cpuset Interface Files
>   	a need to change "cpuset.mems" with active tasks, it shouldn't
>   	be done frequently.
>   
> +	For a multithreaded process, the threadgroup leader is
> +	considered the owner of the group's memory. Memory policy
> +	rebinding and migration will only happen with respect to the
> +	threadgroup leader. To avoid unexpected result, non-leading
> +	threads shouldn't be put into another cgroup whose "cpuset.mems"
> +	doesn't fully overlap that of the threadgroup leader.
> +
>     cpuset.mems.effective
>   	A read-only multiple values file which exists on all
>   	cpuset-enabled cgroups.
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index bc0207fd6e57..27bc7a466468 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -2659,6 +2659,10 @@ void cpuset_update_tasks_nodemask(struct cpuset *cs)
>   
>   		cpuset_change_task_nodemask(task, &newmems);
>   
> +		/* Rebind and migrate mm only for task group leader */
> +		if (task != task->group_leader)
> +			continue;
> +

Nit.

if (!thread_group_leader(task))
     continue;

>   		mm = get_task_mm(task);
>   		if (!mm)
>   			continue;

Reviewed-by: Ridong Chen <ridong.chen@linux.dev>

-- 
Best regards
Ridong


^ permalink raw reply

* Re: [PATCH v8 23/46] KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
From: Sean Christopherson @ 2026-06-23  1:24 UTC (permalink / raw)
  To: Fuad Tabba
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	willy, wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CA+EHjTyj-JdW8H0ii2j3dayqnT2s3VV+brSG++p335=FGd2GXg@mail.gmail.com>

On Fri, Jun 19, 2026, Fuad Tabba wrote:
> nit: why does it have Sean's SoB?

Heh, I had the same question at first.  It's because I tweaked the module param
name to gmem_in_place_conversion, and so updated this patch and sent that version
to Ackerley off-list.  Ackerley's SoB really should come last in this case, even
though it creates a somewhat weird SoB chain given the author.

^ permalink raw reply

* Re: [PATCH v8 01/46] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Sean Christopherson @ 2026-06-23  1:37 UTC (permalink / raw)
  To: Binbin Wu
  Cc: ackerleytng, aik, andrew.jones, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <aceb07e1-77bc-49b6-a932-5fd9b5a21727@linux.intel.com>

On Mon, Jun 22, 2026, Binbin Wu wrote:
> On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
> 
> [...]
> 
> >  
> > +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
> > +{
> > +	struct maple_tree *mt = &GMEM_I(inode)->attributes;
> > +	void *entry = mtree_load(mt, index);
> > +
> > +	return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
> 
> If the entry is unexpectedly missing, returning 0 means the attribute would
> be treated as shared.  And then in kvm_gmem_fault_user_mapping(), it would
> allow the userspace to fault in the folio.
> 
> Should gmem deny such edge case?

After several bugs this year where a WARN_ON_ONCE() fired, but was entirely
insufficient to prevent true badness, I'm definitely sensitive to making the "bad"
behavior as harmless as possible.

However, in this case I think we're just hosed.  If KVM treats the memory as
private, KVM will incorrectly do prepare(), incorrectly allow populate(), and
will cause missed invalidations (though I suppose __kvm_gmem_set_attributes()
"only" lies to userspace in that case).

That said, assuming SHARED is definitely odd for cases where guest_memfd *can't*
hold shared memory.  Ditto for assuming PRIVATE.  What if we instead fall back to
the "init" state, e.g.?

/*
 * Look up the attributes stored for @index in the inode's attributes
 * maple tree.
 *
 * A missing entry triggers a one-time warning. In that case the value
 * returned follows the guest_memfd's GUEST_MEMFD_FLAG_INIT_SHARED flag:
 * 0 when the flag is set, KVM_MEMORY_ATTRIBUTE_PRIVATE otherwise.
 */
static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
{
	void *entry = mtree_load(&GMEM_I(inode)->attributes, index);

	/* Normal path: an entry exists, decode and return its value. */
	if (!WARN_ON_ONCE(!entry))
		return xa_to_value(entry);

	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
		return 0;

	return KVM_MEMORY_ATTRIBUTE_PRIVATE;
}

^ permalink raw reply

* [PATCH v7 00/10] tracing/probes: Add more typecast features
From: Masami Hiramatsu (Google) @ 2026-06-23  1:44 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest

Hi,

Here is the 7th version of series to introduce more typecast features
to probe events. The previous version is here:

 https://lore.kernel.org/all/178201238795.570818.15573963115625446598.stgit@devnote2/

In this version, I added 2 new fix and cleanup patches and updated the
series according to Sashiko's review. [1/10] is a long-lived issue about
@+FOFFS, which was wrongly adding offset twice. [2/10] is a clean
up patch for renaming fetch_op name (good to dump it). 
This is applicable against probes/core branch on linux-trace tree.

Steve introduced BTF typecast feature for eprobe[1].
This series extends it and add more options:

1. Expanding BTF typecast to kprobe and fprobe.
   (currently only function entry/exit)

2. Introduce container_of-like typecast. This adds an "assigned
   member" option to the typecast.

   (STRUCT,MEMBER)VAR->ANOTHER_MEMBER

   This casts VAR to STRUCT type but treats VAR as the address
   of STRUCT.MEMBER. In C, it is:

   container_of(VAR, STRUCT, MEMBER)->ANOTHER_MEMBER

3. Support nested typecast, e.g.

   (STRUCT)((STRUCT2)VAR->MEMBER2)->MEMBER

   the nest level must be smaller than 3.

4. Add $current variable to point "current" task_struct.
   This is useful with typecast, e.g.

   (task_struct)$current->pid

5. per-cpu dereference support.

   Introduce this_cpu_read(VAR) and this_cpu_ptr(VAR) to
   access per-cpu data on the current CPU (accessing other CPU
   data is not stable, because it can be changed.)

   You can access the member of per-cpu data structure using
   typecast like:

   (STRUCT)this_cpu_ptr(VAR)->MEMBER

And added fetcharg dump feature (for debug) and updated test scripts
to test part of them.

Thanks,

---
base-commit: 3ec75d0067f30eb5e0730f033766d6ab2feca7ae

Masami Hiramatsu (Google) (10):
      tracing/probes: Fix double addition of offset for @+FOFFSET
      tracing/probes: Rename FETCH_OP_DATA to FETCH_OP_IMMSTR
      tracing/probes: Support dumping fetcharg program for debugging dynamic events
      tracing/probes: Support typecast for various probe events
      tracing/probes: Support nested typecast
      tracing/probes: Type casting always involves nested calls
      tracing/probes: Support field specifier option for typecast
      tracing/probes: Add $current variable support
      tracing/probes: Add this_cpu_read() and this_cpu_ptr() dereference method to fetcharg
      tracing/probes: Add a new testcase for BTF typecasts


 Documentation/trace/eprobetrace.rst                |    9 
 Documentation/trace/fprobetrace.rst                |   10 
 Documentation/trace/kprobetrace.rst                |   11 
 kernel/trace/Kconfig                               |   11 
 kernel/trace/trace.c                               |    8 
 kernel/trace/trace_eprobe.c                        |    2 
 kernel/trace/trace_fprobe.c                        |    2 
 kernel/trace/trace_kprobe.c                        |    2 
 kernel/trace/trace_probe.c                         |  582 ++++++++++++++++----
 kernel/trace/trace_probe.h                         |   98 ++-
 kernel/trace/trace_probe_tmpl.h                    |   27 +
 kernel/trace/trace_uprobe.c                        |    3 
 samples/trace_events/trace-events-sample.c         |   40 +
 samples/trace_events/trace-events-sample.h         |   34 +
 .../ftrace/test.d/dynevent/btf_probe_event.tc      |   51 ++
 .../ftrace/test.d/dynevent/fprobe_syntax_errors.tc |   11 
 .../ftrace/test.d/kprobe/kprobe_syntax_errors.tc   |   11 
 .../ftrace/test.d/kprobe/uprobe_syntax_errors.tc   |    5 
 18 files changed, 756 insertions(+), 161 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox