Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH 0/2] Fix to EOPNOTSUPP double conversion in ioctl_setflags()
From: Andrey Albershteyn @ 2025-10-08 12:44 UTC (permalink / raw)
  To: linux-api, linux-fsdevel, linux-kernel, linux-xfs
  Cc: Jan Kara, Jiri Slaby, Christian Brauner, Arnd Bergmann,
	Andrey Albershteyn

Revert the original double-conversion patch from ENOIOCTLCMD to EOPNOTSUPP for
vfs_fileattr_get and vfs_fileattr_set. Instead, convert ENOIOCTLCMD only
where necessary.

To: linux-api@vger.kernel.org
To: linux-fsdevel@vger.kernel.org
To: linux-kernel@vger.kernel.org
To: linux-xfs@vger.kernel.org,
Cc: "Jan Kara" <jack@suse.cz>
Cc: "Jiri Slaby" <jirislaby@kernel.org>
Cc: "Christian Brauner" <brauner@kernel.org>
Cc: "Arnd Bergmann" <arnd@arndb.de>

Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
Andrey Albershteyn (2):
      Revert "fs: make vfs_fileattr_[get|set] return -EOPNOTSUPP"
      fs: return EOPNOTSUPP from file_setattr/file_getattr syscalls

 fs/file_attr.c         | 16 ++++++----------
 fs/fuse/ioctl.c        |  4 ----
 fs/overlayfs/copy_up.c |  2 +-
 fs/overlayfs/inode.c   |  5 ++++-
 4 files changed, 11 insertions(+), 16 deletions(-)
---
base-commit: e5f0a698b34ed76002dc5cff3804a61c80233a7a
change-id: 20251007-eopnosupp-fix-d2f30fd7d873

Best regards,
--  
Andrey Albershteyn <aalbersh@kernel.org>


^ permalink raw reply

* [PATCH 1/2] Revert "fs: make vfs_fileattr_[get|set] return -EOPNOTSUPP"
From: Andrey Albershteyn @ 2025-10-08 12:44 UTC (permalink / raw)
  To: linux-api, linux-fsdevel, linux-kernel, linux-xfs
  Cc: Jan Kara, Jiri Slaby, Christian Brauner, Arnd Bergmann,
	Andrey Albershteyn
In-Reply-To: <20251008-eopnosupp-fix-v1-0-5990de009c9f@kernel.org>

This reverts commit 474b155adf3927d2c944423045757b54aa1ca4de.

This patch caused a regression in ioctl_setflags(). Underlying filesystems
use EOPNOTSUPP to indicate that a flag is not supported. This error
also gets converted in ioctl_setflags(). Therefore, for unsupported
flags the error changed from EOPNOTSUPP to ENOIOCTLCMD.

Link: https://lore.kernel.org/linux-xfs/a622643f-1585-40b0-9441-cf7ece176e83@kernel.org/
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/file_attr.c         | 12 ++----------
 fs/fuse/ioctl.c        |  4 ----
 fs/overlayfs/copy_up.c |  2 +-
 fs/overlayfs/inode.c   |  5 ++++-
 4 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/fs/file_attr.c b/fs/file_attr.c
index 12424d4945d0..460b2dd21a85 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -84,7 +84,7 @@ int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 	int error;
 
 	if (!inode->i_op->fileattr_get)
-		return -EOPNOTSUPP;
+		return -ENOIOCTLCMD;
 
 	error = security_inode_file_getattr(dentry, fa);
 	if (error)
@@ -270,7 +270,7 @@ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
 	int err;
 
 	if (!inode->i_op->fileattr_set)
-		return -EOPNOTSUPP;
+		return -ENOIOCTLCMD;
 
 	if (!inode_owner_or_capable(idmap, inode))
 		return -EPERM;
@@ -312,8 +312,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp)
 	int err;
 
 	err = vfs_fileattr_get(file->f_path.dentry, &fa);
-	if (err == -EOPNOTSUPP)
-		err = -ENOIOCTLCMD;
 	if (!err)
 		err = put_user(fa.flags, argp);
 	return err;
@@ -335,8 +333,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp)
 			fileattr_fill_flags(&fa, flags);
 			err = vfs_fileattr_set(idmap, dentry, &fa);
 			mnt_drop_write_file(file);
-			if (err == -EOPNOTSUPP)
-				err = -ENOIOCTLCMD;
 		}
 	}
 	return err;
@@ -349,8 +345,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp)
 	int err;
 
 	err = vfs_fileattr_get(file->f_path.dentry, &fa);
-	if (err == -EOPNOTSUPP)
-		err = -ENOIOCTLCMD;
 	if (!err)
 		err = copy_fsxattr_to_user(&fa, argp);
 
@@ -371,8 +365,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
 		if (!err) {
 			err = vfs_fileattr_set(idmap, dentry, &fa);
 			mnt_drop_write_file(file);
-			if (err == -EOPNOTSUPP)
-				err = -ENOIOCTLCMD;
 		}
 	}
 	return err;
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 57032eadca6c..fdc175e93f74 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -536,8 +536,6 @@ int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 cleanup:
 	fuse_priv_ioctl_cleanup(inode, ff);
 
-	if (err == -ENOTTY)
-		err = -EOPNOTSUPP;
 	return err;
 }
 
@@ -574,7 +572,5 @@ int fuse_fileattr_set(struct mnt_idmap *idmap,
 cleanup:
 	fuse_priv_ioctl_cleanup(inode, ff);
 
-	if (err == -ENOTTY)
-		err = -EOPNOTSUPP;
 	return err;
 }
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 27396fe63f6d..20c92ea58093 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -178,7 +178,7 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
 	err = ovl_real_fileattr_get(old, &oldfa);
 	if (err) {
 		/* Ntfs-3g returns -EINVAL for "no fileattr support" */
-		if (err == -EOPNOTSUPP || err == -EINVAL)
+		if (err == -ENOTTY || err == -EINVAL)
 			return 0;
 		pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
 			old->dentry, err);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index ecb9f2019395..d4722e1b83bc 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -720,7 +720,10 @@ int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa)
 	if (err)
 		return err;
 
-	return vfs_fileattr_get(realpath->dentry, fa);
+	err = vfs_fileattr_get(realpath->dentry, fa);
+	if (err == -ENOIOCTLCMD)
+		err = -ENOTTY;
+	return err;
 }
 
 int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)

-- 
2.51.0


^ permalink raw reply related

* [PATCH 2/2] fs: return EOPNOTSUPP from file_setattr/file_getattr syscalls
From: Andrey Albershteyn @ 2025-10-08 12:44 UTC (permalink / raw)
  To: linux-api, linux-fsdevel, linux-kernel, linux-xfs
  Cc: Jan Kara, Jiri Slaby, Christian Brauner, Arnd Bergmann,
	Andrey Albershteyn
In-Reply-To: <20251008-eopnosupp-fix-v1-0-5990de009c9f@kernel.org>

These syscalls call the vfs_fileattr_get/set functions, which return
ENOIOCTLCMD if the filesystem doesn't support setting file attributes on an
inode. For syscalls, EOPNOTSUPP is a more appropriate error to return.

Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/file_attr.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/file_attr.c b/fs/file_attr.c
index 460b2dd21a85..5e3e2aba97b5 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -416,6 +416,8 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
 	}
 
 	error = vfs_fileattr_get(filepath.dentry, &fa);
+	if (error == -ENOIOCTLCMD)
+		error = -EOPNOTSUPP;
 	if (error)
 		return error;
 
@@ -483,6 +485,8 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
 	if (!error) {
 		error = vfs_fileattr_set(mnt_idmap(filepath.mnt),
 					 filepath.dentry, &fa);
+		if (error == -ENOIOCTLCMD)
+			error = -EOPNOTSUPP;
 		mnt_drop_write(filepath.mnt);
 	}
 

-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH 2/2] fs: return EOPNOTSUPP from file_setattr/file_getattr syscalls
From: Arnd Bergmann @ 2025-10-08 13:23 UTC (permalink / raw)
  To: Andrey Albershteyn, linux-api, linux-fsdevel, linux-kernel,
	linux-xfs
  Cc: Jan Kara, Jiri Slaby, Christian Brauner, Andrey Albershteyn
In-Reply-To: <20251008-eopnosupp-fix-v1-2-5990de009c9f@kernel.org>

On Wed, Oct 8, 2025, at 14:44, Andrey Albershteyn wrote:
> These syscalls call to vfs_fileattr_get/set functions which return
> ENOIOCTLCMD if filesystem doesn't support setting file attribute on an
> inode. For syscalls EOPNOTSUPP would be more appropriate return error.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>

Reviewed-by: Arnd Bergmann <arnd@arndb.de>

^ permalink raw reply

* Re: [PATCH 2/2] fs: return EOPNOTSUPP from file_setattr/file_getattr syscalls
From: Jan Kara @ 2025-10-08 13:30 UTC (permalink / raw)
  To: Andrey Albershteyn
  Cc: linux-api, linux-fsdevel, linux-kernel, linux-xfs, Jan Kara,
	Jiri Slaby, Christian Brauner, Arnd Bergmann, Andrey Albershteyn
In-Reply-To: <20251008-eopnosupp-fix-v1-2-5990de009c9f@kernel.org>

On Wed 08-10-25 14:44:18, Andrey Albershteyn wrote:
> These syscalls call to vfs_fileattr_get/set functions which return
> ENOIOCTLCMD if filesystem doesn't support setting file attribute on an
> inode. For syscalls EOPNOTSUPP would be more appropriate return error.
> 
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/file_attr.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/fs/file_attr.c b/fs/file_attr.c
> index 460b2dd21a85..5e3e2aba97b5 100644
> --- a/fs/file_attr.c
> +++ b/fs/file_attr.c
> @@ -416,6 +416,8 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
>  	}
>  
>  	error = vfs_fileattr_get(filepath.dentry, &fa);
> +	if (error == -ENOIOCTLCMD)
> +		error = -EOPNOTSUPP;
>  	if (error)
>  		return error;
>  
> @@ -483,6 +485,8 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
>  	if (!error) {
>  		error = vfs_fileattr_set(mnt_idmap(filepath.mnt),
>  					 filepath.dentry, &fa);
> +		if (error == -ENOIOCTLCMD)
> +			error = -EOPNOTSUPP;
>  		mnt_drop_write(filepath.mnt);
>  	}
>  
> 
> -- 
> 2.51.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH] fs: Propagate FMODE_NOCMTIME flag to user-facing O_NOCMTIME
From: Andy Lutomirski @ 2025-10-08 15:22 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Pavel Emelyanov, linux-fsdevel, Raphael S . Carvalho, linux-api,
	linux-xfs
In-Reply-To: <aOSgXXzvuq5YDj7q@infradead.org>

On Mon, Oct 6, 2025 at 10:08 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Sat, Oct 04, 2025 at 09:08:05AM -0700, Andy Lutomirski wrote:
> > > Well, we'll need to look into that, including maybe non-blockin
> > > timestamp updates.
> > >
> >
> > It's been 12 years (!), but maybe it's time to reconsider this:
> >
> > https://lore.kernel.org/all/cover.1377193658.git.luto@amacapital.net/
>
> I don't see how that is relevant here.  Also writes through shared
> mmaps are problematic for so many reasons that I'm not sure we want
> to encourage people to use that more.
>

Because the same exact issue exists in the normal non-mmap write path,
and I can even quote you upthread :)

> Well, we'll need to look into that, including maybe non-blocking
> timestamp updates.

I assume the code path that inspired this thread in the first place is:

ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        ssize_t ret;

        ret = file_remove_privs(file);
        if (ret)
                return ret;

        ret = file_update_time(file);

and this has *exactly* the same problem as the shared-mmap write path:
it synchronously updates the time (well, synchronously enough that it
sometimes blocks), and it does so before updating the file contents
(although the window during which the timestamp is updated and the
contents are not is not as absurdly long as it is in the mmap case).

Now my series does not change any of this, but I'm thinking more of
the concept: instead of doing file/inode_update_time when a file is
logically written (in write_iter, page_mkwrite, etc), set a flag so
that the writeback code knows that the timestamp needs updating.
Thinking out loud, to handle both write_iter and mmap, there might
need to be two bits: one saying "the timestamp needs to be updated"
and another saying "the timestamp has been updated in the in-memory
inode, but the inode hasn't been dirtied yet".  And maybe the latter
is doable entirely within fs-specific code without any help from the
generic code, but it might still be nice to keep generic_update_time
usable for filesystems that want to do this.

--Andy

^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Pasha Tatashin @ 2025-10-08 16:40 UTC (permalink / raw)
  To: Samiullah Khawaja
  Cc: pratyush, jasonmiu, graf, changyuanl, rppt, dmatlack, rientjes,
	corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, zhangguopeng, linux, linux-kernel, linux-doc, linux-mm,
	gregkh, tglx, mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu,
	hughd, chrisl, steven.sistare
In-Reply-To: <CAAywjhSP=ugnSJOHPGmTUPGh82wt+qnaqZAqo99EfhF-XHD5Sg@mail.gmail.com>

On Wed, Oct 8, 2025 at 3:04 AM Samiullah Khawaja <skhawaja@google.com> wrote:
>
> On Tue, Oct 7, 2025 at 10:11 AM Pasha Tatashin
> <pasha.tatashin@soleen.com> wrote:
> >
> > On Sun, Sep 28, 2025 at 9:03 PM Pasha Tatashin
> > <pasha.tatashin@soleen.com> wrote:
> > >
> > > This series introduces the Live Update Orchestrator (LUO), a kernel
> > > subsystem designed to facilitate live kernel updates. LUO enables
> > > kexec-based reboots with minimal downtime, a critical capability for
> > > cloud environments where hypervisors must be updated without disrupting
> > > running virtual machines. By preserving the state of selected resources,
> > > such as file descriptors and memory, LUO allows workloads to resume
> > > seamlessly in the new kernel.
> > >
> > > The git branch for this series can be found at:
> > > https://github.com/googleprodkernel/linux-liveupdate/tree/luo/v4
> > >
> > > The patch series applies against linux-next tag: next-20250926
> > >
> > > While this series is showcased using memfd preservation, there is
> > > ongoing work to preserve devices:
> > > 1. IOMMU: https://lore.kernel.org/all/20250928190624.3735830-16-skhawaja@google.com
> > > 2. PCI: https://lore.kernel.org/all/20250916-luo-pci-v2-0-c494053c3c08@kernel.org
> > >
> > > =======================================================================
> > > Changelog since v3:
> > > (https://lore.kernel.org/all/20250807014442.3829950-1-pasha.tatashin@soleen.com):
> > >
> > > - The main architectural change in this version is introduction of
> > >   "sessions" to manage the lifecycle of preserved file descriptors.
> > >   In v3, session management was left to a single userspace agent. This
> > >   approach has been revised to improve robustness. Now, each session is
> > >   represented by a file descriptor (/dev/liveupdate). The lifecycle of
> > >   all preserved resources within a session is tied to this FD, ensuring
> > >   automatic cleanup by the kernel if the controlling userspace agent
> > >   crashes or exits unexpectedly.
> > >
> > > - The first three KHO fixes from the previous series have been merged
> > >   into Linus' tree.
> > >
> > > - Various bug fixes and refactorings, including correcting memory
> > >   unpreservation logic during a kho_abort() sequence.
> > >
> > > - Addressing all comments from reviewers.
> > >
> > > - Removing sysfs interface (/sys/kernel/liveupdate/state), the state
> > >   can now be queried  only via ioctl() API.
> > >
> > > =======================================================================
> >
> > Hi all,
> >
> > Following up on yesterday's Hypervisor Live Update meeting, we
> > discussed the requirements for the LUO to track dependencies,
> > particularly for IOMMU preservation and other stateful file
> > descriptors. This email summarizes the main design decisions and
> > outcomes from that discussion.
> >
> > For context, the notes from the previous meeting can be found here:
> > https://lore.kernel.org/all/365acb25-4b25-86a2-10b0-1df98703e287@google.com
> > The notes for yesterday's meeting are not yet available.
> >
> > The key outcomes are as follows:
> >
> > 1. User-Enforced Ordering
> > -------------------------
> > The responsibility for enforcing the correct order of operations will
> > lie with the userspace agent. If fd_A is a dependency for fd_B,
> > userspace must ensure that fd_A is preserved before fd_B. This same
> > ordering must be honored during the restoration phase after the reboot
> > (fd_A must be restored before fd_B). The kernel preserves the ordering.
> >
> > 2. Serialization in PRESERVE_FD
> > -------------------------------
> > To keep the global prepare() phase lightweight and predictable, the
> > consensus was to shift the heavy serialization work into the
> > PRESERVE_FD ioctl handler. This means that when userspace requests to
> > preserve a file, the file handler should perform the bulk of the
> > state-saving work immediately.
> >
> > The proposed sequence of operations reflects this shift:
> >
> > Shutdown Flow:
> > fd_preserve() (heavy serialization) -> prepare() (lightweight final
> > checks) -> Suspend VM -> reboot(KEXEC) -> freeze() (lightweight)
> >
> > Boot & Restore Flow:
> > fd_restore() (lightweight object creation) -> Resume VM -> Heavy
> > post-restore IOCTLs (e.g., hardware page table re-creation) ->
> > finish() (lightweight cleanup)
> >
> > This decision primarily serves as a guideline for file handler
> > implementations. For the LUO core, this implies minor API changes,
> > such as renaming can_preserve() to a more active preserve() and adding
> > a corresponding unpreserve() callback to be called during
> > UNPRESERVE_FD.
> >
> > 3. FD Data Query API
> > --------------------
> > We identified the need for a kernel API to allow subsystems to query
> > preserved FD data during the boot process, before userspace has
> > initiated the restore.
> >
> > The proposed API would allow a file handler to retrieve a list of all
> > its preserved FDs, including their session names, tokens, and the
> > private data payload.
> >
> > Proposed Data Structure:
> >
> > struct liveupdate_fd {
> >         char *session; /* session name */
> >         u64 token; /* Preserved FD token */
> >         u64 data; /* Private preserved data */
> > };
> >
> > Proposed Function:
> > liveupdate_fd_data_query(struct liveupdate_file_handler *h,
> >                          struct liveupdate_fd *fds, long *count);
> >
> > 4. New File-Lifecycle-Bound Global State
> > ----------------------------------------
> > A new mechanism for managing global state was proposed, designed to be
> > tied to the lifecycle of the preserved files themselves. This would
> > allow a file owner (e.g., the IOMMU subsystem) to save and retrieve
> > global state that is only relevant when one or more of its FDs are
> > being managed by LUO.
> >
> > The key characteristics of this new mechanism are:
> > The global state is optionally created on the first preserve() call
> > for a given file handler.
> > The state can be updated on subsequent preserve() calls.
> > The state is destroyed when the last corresponding file is unpreserved
> > or finished.
> > The data can be accessed during boot.
> >
> > I am thinking of an API like this.

Sami and I discussed this further, and we agree that the proposed API
will work. We also identified two additional requirements that were
not mentioned in my previous email:

1. Ordered Un-preservation
The un-preservation of file descriptors must also be ordered and must
occur in the reverse order of preservation. For example, if a user
preserves a memfd first and then an iommufd that depends on it, the
iommufd must be un-preserved before the memfd when the session is
closed or the FDs are explicitly un-preserved.

2. New API to Check Preservation Status
A new LUO API will be needed to check if a struct file is already
preserved within a session. This is needed for dependency validation.
The proposed function would look like this:

bool liveupdate_is_file_preserved(struct liveupdate_session *session,
struct file *file);

This will allow the file handler for one FD (e.g., iommufd) to verify
during its preserve() callback that its dependencies (e.g., the
backing memfd) have already been preserved in the same session.

Pasha

^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Jason Gunthorpe @ 2025-10-08 19:35 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: Samiullah Khawaja, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, parav, leonro, witu, hughd,
	chrisl, steven.sistare
In-Reply-To: <CA+CK2bAG+YAS7oSpdrZYDK0LU2mhfRuj2qTJtT-Hn8FLUbt=Dw@mail.gmail.com>

On Wed, Oct 08, 2025 at 12:40:34PM -0400, Pasha Tatashin wrote:
> 1. Ordered Un-preservation
> The un-preservation of file descriptors must also be ordered and must
> occur in the reverse order of preservation. For example, if a user
> preserves a memfd first and then an iommufd that depends on it, the
> iommufd must be un-preserved before the memfd when the session is
> closed or the FDs are explicitly un-preserved.

Why?

I imagined the first to unpreserve would restore the struct file * -
that would satisfy the order.

The ioctl version that is to get back a FD would recover the struct
file and fd_install it.

Meaning preserve side is retaining a database of labels to restored
struct file *'s.

As discussed unpreserve a FD does not imply unfreeze, which is the
opposite of how preserver works.

> 2. New API to Check Preservation Status
> A new LUO API will be needed to check if a struct file is already
> preserved within a session. This is needed for dependency validation.
> The proposed function would look like this:

This doesn't seem right, the API should be more like 'luo get
serialization handle for this file *'

If it hasn't been preserved then there won't be a handle, otherwise it
should return something to allow the unpreserving side to recover this
struct file *.

That's the general use case at least, there may be some narrower use
cases where the preserver throws away the handle.

Jason

^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Pasha Tatashin @ 2025-10-08 20:26 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Samiullah Khawaja, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, parav, leonro, witu, hughd,
	chrisl, steven.sistare
In-Reply-To: <20251008193551.GA3839422@nvidia.com>

On Wed, Oct 8, 2025 at 3:36 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Wed, Oct 08, 2025 at 12:40:34PM -0400, Pasha Tatashin wrote:
> > 1. Ordered Un-preservation
> > The un-preservation of file descriptors must also be ordered and must
> > occur in the reverse order of preservation. For example, if a user
> > preserves a memfd first and then an iommufd that depends on it, the
> > iommufd must be un-preserved before the memfd when the session is
> > closed or the FDs are explicitly un-preserved.
>
> Why?
>
> I imagined the first to unpreserve would restore the struct file * -
> that would satisfy the order.

In my description, "un-preserve" refers to the action of canceling a
preservation request in the outgoing kernel, before kexec ever
happens. It's the pre-reboot counterpart to the PRESERVE_FD ioctl,
used when a user decides not to go through with the live update for a
specific FD.

The terminology I am using:
preserve: Put FD into LUO in the outgoing kernel
unpreserve: Remove FD from LUO from the outgoing kernel
retrieve: Restore FD and return it to user in the next kernel

For the retrieval part, we are going to be using FIFO order, the same
as preserve.

> The ioctl version that is to get back a FD would recover the struct
> file and fd_install it.
>
> Meaning preserve side is retaining a database of labels to restored
> struct file *'s.
>
> As discussed unpreserve a FD does not imply unfreeze, which is the
> opposite of how preserver works.
>
> > 2. New API to Check Preservation Status
> > A new LUO API will be needed to check if a struct file is already
> > preserved within a session. This is needed for dependency validation.
> > The proposed function would look like this:
>
> This doesn't seem right, the API should be more like 'luo get
> serialization handle for this file *'

How about:

int liveupdate_find_token(struct liveupdate_session *session,
                          struct file *file, u64 *token);

And if needed:
int liveupdate_find_file(struct liveupdate_session *session,
                         u64 token, struct file **file);

Return: 0 on success, or -ENOENT if the file is not preserved.

Pasha

^ permalink raw reply

* Re: [PATCH] fs: Propagate FMODE_NOCMTIME flag to user-facing O_NOCMTIME
From: Dave Chinner @ 2025-10-08 21:27 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Christoph Hellwig, Pavel Emelyanov, linux-fsdevel,
	Raphael S . Carvalho, linux-api, linux-xfs
In-Reply-To: <CALCETrW3iQWQTdMbB52R4=GztfuFYvN_8p52H1fopdS8uExQWg@mail.gmail.com>

On Wed, Oct 08, 2025 at 08:22:35AM -0700, Andy Lutomirski wrote:
> On Mon, Oct 6, 2025 at 10:08 PM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > On Sat, Oct 04, 2025 at 09:08:05AM -0700, Andy Lutomirski wrote:
> > > > Well, we'll need to look into that, including maybe non-blockin
> > > > timestamp updates.
> > > >
> > >
> > > It's been 12 years (!), but maybe it's time to reconsider this:
> > >
> > > https://lore.kernel.org/all/cover.1377193658.git.luto@amacapital.net/
> >
> > I don't see how that is relevant here.  Also writes through shared
> > mmaps are problematic for so many reasons that I'm not sure we want
> > to encourage people to use that more.
> >
> 
> Because the same exact issue exists in the normal non-mmap write path,
> and I can even quote you upthread :)
> 
> > Well, we'll need to look into that, including maybe non-blockin
> timestamp updates.
> 
> I assume the code path that inspired this thread in the first place is:
> 
> ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
> {
>         struct file *file = iocb->ki_filp;
>         struct address_space *mapping = file->f_mapping;
>         struct inode *inode = mapping->host;
>         ssize_t ret;
> 
>         ret = file_remove_privs(file);
>         if (ret)
>                 return ret;
> 
>         ret = file_update_time(file);
> 
> and this has *exactly* the same problem as the shared-mmap write path:
> it synchronously updates the time (well, synchronously enough that it
> sometimes blocks),

You are conflating "synchronous update" with "blocking".

Avoiding the need for synchronous timestamp updates is exactly what
the lazytime mount option provides. i.e. lazytime degrades immediate
consistency requirements to eventual consistency similar to how the
default relatime behaviour defers atime updates for eventual
writeback.

IOWs, we've already largely addressed the synchronous c/mtime update
problem but what we haven't done is made timestamp updates
fully support non-blocking caller semantics. That's a separate
problem...

> and it does so before updating the file contents
> (although the window during which the timestamp is updated and the
> contents are not is not as absurdly long as it is in the mmap case).
> 
> Now my series does not change any of this, but I'm thinking more of
> the concept: instead of doing file/inode_update_time when a file is
> logically written (in write_iter, page_mkwrite, etc), set a flag so
> that the writeback code knows that the timestamp needs updating.

This is exactly what lazytime implements with the I_DIRTY_TIME flag.

During writeback, if the filesystem has to modify other metadata in
the inode (e.g. block allocation), the filesystem will piggyback the
persistent update of the dirty timestamps on that modification and
clear the I_DIRTY_TIME flag.

However, if the writeback operation is a pure overwrite, then there
is no metadata modification occurring and so we leave the inode
I_DIRTY_TIME dirty for a future metadata persistence operation to
clean them.

IOWs, with lazytime, writeback already persists timestamp updates
when appropriate for best performance.

> Thinking out loud, to handle both write_iter and mmap, there might
> need to be two bits: one saying "the timestamp needs to be updated"
> and another saying "the timestamp has been updated in the in-memory
> inode, but the inode hasn't been dirtied yet".

The flag that implements the latter is called I_DIRTY_TIME. We have
not implemented the former as that's a userspace visible change of
behaviour.

> And maybe the latter
> is doable entirely within fs-specific code without any help from the
> generic code, but it might still be nice to keep generic_update_time
> usable for filesystems that want to do this.

generic_update_time() already supports I_DIRTY_TIME semantics.

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply

* Re: [PATCH] fs: Propagate FMODE_NOCMTIME flag to user-facing O_NOCMTIME
From: Andy Lutomirski @ 2025-10-08 21:51 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, Pavel Emelyanov, linux-fsdevel,
	Raphael S . Carvalho, linux-api, linux-xfs
In-Reply-To: <aObXUBCtp4p83QzS@dread.disaster.area>

On Wed, Oct 8, 2025 at 2:27 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Wed, Oct 08, 2025 at 08:22:35AM -0700, Andy Lutomirski wrote:
> > On Mon, Oct 6, 2025 at 10:08 PM Christoph Hellwig <hch@infradead.org> wrote:
> > >
> > > On Sat, Oct 04, 2025 at 09:08:05AM -0700, Andy Lutomirski wrote:

>
> You are conflating "synchronous update" with "blocking".
>
> Avoiding the need for synchronous timestamp updates is exactly what
> the lazytime mount option provides. i.e. lazytime degrades immediate
> consistency requirements to eventual consistency similar to how the
> default relatime behaviour defers atime updates for eventual
> writeback.
>
> IOWs, we've already largely addressed the synchronous c/mtime update
> problem but what we haven't done is made timestamp updates
> fully support non-blocking caller semantics. That's a separate
> problem...

I'm probably missing something, but is this really different?  Either
the mtime update can block or it can't block.  I haven't dug all the
way into exactly what happens in __mark_inode_dirty(), but there is a
lot going on in there even in the I_DIRTY_TIME path.  And Pavel is
saying that AIO and mtime updates don't play along well.

>
> > and it does so before updating the file contents
> > (although the window during which the timestamp is updated and the
> > contents are not is not as absurdly long as it is in the mmap case).
> >
> > Now my series does not change any of this, but I'm thinking more of
> > the concept: instead of doing file/inode_update_time when a file is
> > logically written (in write_iter, page_mkwrite, etc), set a flag so
> > that the writeback code knows that the timestamp needs updating.
>
> This is exactly what lazytime implements with the I_DIRTY_FLAG.
>
> During writeback, if the filesystem has to modify other metadata in
> the inode (e.g. block allocation), the filesystem will piggyback the
> persistent update of the dirty timestamps on that modification and
> clear the I_DIRTY_TIME flag.
>
> However, if the writeback operation is a pure overwrite, then there
> is no metadata modifiction occuring and so we leave the inode
> I_DIRTY_TIME dirty for a future metadata persistence operation to
> clean them.
>
> IOWs, with lazytime, writeback already persists timestamp updates
> when appropriate for best performance.

I'm probably doing a bad job explaining myself.

In my series, I move (for page_mkwrite only) the mtime update,
*including dirtying the inode* to the writeback path, which makes it
fully non-blocking / asynchronous / whatever you want to call it at
the time that page_mkwrite is called.  More concretely, my suggestion
is to be a bit lazier than current lazytime and not dirty the inode
*at all* in write_iter, or at least not dirty it for the purpose of
timestamp updates.  Instead set a flag somewhere that it cannot be
forgotten about -- in my series, it's this patch:

https://lore.kernel.org/all/f2ac22142b4634b55ff6858d159b45dac96f81b6.1377193658.git.luto@amacapital.net/

and it's a single atomic bit in struct address_space.  The idea is
that there is approximately no additional overhead at the time that
the page cache is dirtied for cmtime-related inode dirtying and that
all such overhead is deferred to the writeback path when it's as
asynchronous as possible from the perspective of whatever user code
dirtied the page cache.  My page_set_cmtime() is completely lockless.

My series is far from perfect, but I did test it with real workloads
12-ish years ago, on overworked HDDs, with latencytop, and it worked.
Performance was vastly improved (using mmap, not write(), obviously).

>
> > Thinking out loud, to handle both write_iter and mmap, there might
> > need to be two bits: one saying "the timestamp needs to be updated"
> > and another saying "the timestamp has been updated in the in-memory
> > inode, but the inode hasn't been dirtied yet".
>
> The flag that implements the latter is called I_DIRTY_TIME. We have
> not implemented the former as that's a userspace visible change of
> behaviour.

Maybe that change should be done?  Or not -- it wouldn't be terribly
hard to have a pair of atomic timestamps in struct inode indicating
what timestamps we want to write the next time we get around to it.
(Concretely, page_set_cmtime() would get some new parameters to
specify actual times, and atomic compare exchange would be used to
update the underlying data structure, so it would remain lock-free but
not be wait-free.)

^ permalink raw reply

* Re: [PATCH v3 19/30] liveupdate: luo_sysfs: add sysfs state monitoring
From: yanjun.zhu @ 2025-10-09  1:07 UTC (permalink / raw)
  To: Pasha Tatashin, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <20250807014442.3829950-20-pasha.tatashin@soleen.com>

On 8/6/25 6:44 PM, Pasha Tatashin wrote:
> Introduce a sysfs interface for the Live Update Orchestrator
> under /sys/kernel/liveupdate/. This interface provides a way for
> userspace tools and scripts to monitor the current state of the LUO
> state machine.
> 
> The main feature is a read-only file, state, which displays the
> current LUO state as a string ("normal", "prepared", "frozen",
> "updated"). The interface uses sysfs_notify to allow userspace
> listeners (e.g., via poll) to be efficiently notified of state changes.
> 
> ABI documentation for this new sysfs interface is added in
> Documentation/ABI/testing/sysfs-kernel-liveupdate.
> 
> This read-only sysfs interface complements the main ioctl interface
> provided by /dev/liveupdate, which handles LUO control operations and
> resource management.
> 
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> ---
>   .../ABI/testing/sysfs-kernel-liveupdate       | 51 ++++++++++
>   kernel/liveupdate/Kconfig                     | 18 ++++
>   kernel/liveupdate/Makefile                    |  1 +
>   kernel/liveupdate/luo_core.c                  |  1 +
>   kernel/liveupdate/luo_internal.h              |  6 ++
>   kernel/liveupdate/luo_sysfs.c                 | 92 +++++++++++++++++++
>   6 files changed, 169 insertions(+)
>   create mode 100644 Documentation/ABI/testing/sysfs-kernel-liveupdate
>   create mode 100644 kernel/liveupdate/luo_sysfs.c
> 
> diff --git a/Documentation/ABI/testing/sysfs-kernel-liveupdate b/Documentation/ABI/testing/sysfs-kernel-liveupdate
> new file mode 100644
> index 000000000000..bb85cbae4943
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-kernel-liveupdate
> @@ -0,0 +1,51 @@
> +What:		/sys/kernel/liveupdate/
> +Date:		May 2025
> +KernelVersion:	6.16.0
> +Contact:	pasha.tatashin@soleen.com
> +Description:	Directory containing interfaces to query the live
> +		update orchestrator. Live update is the ability to reboot the
> +		host kernel (e.g., via kexec, without a full power cycle) while
> +		keeping specifically designated devices operational ("alive")
> +		across the transition. After the new kernel boots, these devices
> +		can be re-attached to their original workloads (e.g., virtual
> +		machines) with their state preserved. This is particularly
> +		useful, for example, for quick hypervisor updates without
> +		terminating running virtual machines.
> +
> +
> +What:		/sys/kernel/liveupdate/state
> +Date:		May 2025
> +KernelVersion:	6.16.0
> +Contact:	pasha.tatashin@soleen.com
> +Description:	Read-only file that displays the current state of the live
> +		update orchestrator as a string. Possible values are:
> +
> +		"normal"	No live update operation is in progress. This is
> +				the default operational state.
> +
> +		"prepared"	The live update preparation phase has completed
> +				successfully (e.g., triggered via the
> +				/dev/liveupdate event). Kernel subsystems have
> +				been notified via the %LIVEUPDATE_PREPARE
> +				event/callback and should have initiated state
> +				saving. User workloads (e.g., VMs) are generally
> +				still running, but some operations (like device
> +				unbinding or new DMA mappings) might be
> +				restricted. The system is ready for the reboot
> +				trigger.
> +
> +		"frozen"	The final reboot notification has been sent
> +				(e.g., triggered via the 'reboot()' syscall),
> +				corresponding to the %LIVEUPDATE_REBOOT kernel
> +				event. Subsystems have had their final chance to
> +				save state. User workloads must be suspended.
> +				The system is about to execute the reboot into
> +				the new kernel (imminent kexec). This state
> +				corresponds to the "blackout window".
> +
> +		"updated"	The system has successfully rebooted into the
> +				new kernel via live update. Restoration of
> +				preserved resources can now occur (typically via
> +				ioctl commands). The system is awaiting the
> +				final 'finish' signal after user space completes
> +				restoration tasks.
> diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
> index f6b0bde188d9..75a17ca8a592 100644
> --- a/kernel/liveupdate/Kconfig
> +++ b/kernel/liveupdate/Kconfig
> @@ -29,6 +29,24 @@ config LIVEUPDATE
>   
>   	  If unsure, say N.
>   
> +config LIVEUPDATE_SYSFS_API
> +	bool "Live Update sysfs monitoring interface"
> +	depends on SYSFS
> +	depends on LIVEUPDATE
> +	help
> +	  Enable a sysfs interface for the Live Update Orchestrator
> +	  at /sys/kernel/liveupdate/.
> +
> +	  This allows monitoring the LUO state ('normal', 'prepared',
> +	  'frozen', 'updated') via the read-only 'state' file.
> +
> +	  This interface complements the primary /dev/liveupdate ioctl
> +	  interface, which handles the full update process.
> +	  This sysfs API may be useful for scripting, or userspace monitoring
> +	  needed to coordinate application restarts and minimize downtime.
> +
> +	  If unsure, say N.
> +
>   config KEXEC_HANDOVER
>   	bool "kexec handover"
>   	depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
> diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
> index c67fa2797796..47f5d0378a75 100644
> --- a/kernel/liveupdate/Makefile
> +++ b/kernel/liveupdate/Makefile
> @@ -13,3 +13,4 @@ obj-$(CONFIG_KEXEC_HANDOVER)		+= kexec_handover.o
>   obj-$(CONFIG_KEXEC_HANDOVER_DEBUG)	+= kexec_handover_debug.o
>   
>   obj-$(CONFIG_LIVEUPDATE)		+= luo.o
> +obj-$(CONFIG_LIVEUPDATE_SYSFS_API)	+= luo_sysfs.o
> diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
> index 64d53b31d6d8..bd07ee859112 100644
> --- a/kernel/liveupdate/luo_core.c
> +++ b/kernel/liveupdate/luo_core.c
> @@ -100,6 +100,7 @@ static inline bool is_current_luo_state(enum liveupdate_state expected_state)
>   static void __luo_set_state(enum liveupdate_state state)
>   {
>   	WRITE_ONCE(luo_state, state);
> +	luo_sysfs_notify();
>   }
>   
>   static inline void luo_set_state(enum liveupdate_state state)
> diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
> index 01bd0d3b023b..9091ed04c606 100644
> --- a/kernel/liveupdate/luo_internal.h
> +++ b/kernel/liveupdate/luo_internal.h
> @@ -47,4 +47,10 @@ int luo_file_freeze(u64 token);
>   int luo_file_cancel(u64 token);
>   int luo_file_finish(u64 token);
>   
> +#ifdef CONFIG_LIVEUPDATE_SYSFS_API
> +void luo_sysfs_notify(void);
> +#else
> +static inline void luo_sysfs_notify(void) {}
> +#endif
> +
>   #endif /* _LINUX_LUO_INTERNAL_H */
> diff --git a/kernel/liveupdate/luo_sysfs.c b/kernel/liveupdate/luo_sysfs.c
> new file mode 100644
> index 000000000000..935946bb741b
> --- /dev/null
> +++ b/kernel/liveupdate/luo_sysfs.c
> @@ -0,0 +1,92 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright (c) 2025, Google LLC.
> + * Pasha Tatashin <pasha.tatashin@soleen.com>
> + */
> +
> +/**
> + * DOC: LUO sysfs interface
> + *
> + * Provides a sysfs interface at ``/sys/kernel/liveupdate/`` for monitoring LUO
> + * state.  Live update allows rebooting the kernel (via kexec) while preserving
> + * designated device state for attached workloads (e.g., VMs), useful for
> + * minimizing downtime during hypervisor updates.
> + *
> + * /sys/kernel/liveupdate/state
> + * ----------------------------
> + * - Permissions:  Read-only
> + * - Description:  Displays the current LUO state string.
> + * - Valid States:
> + *     @normal
> + *       Idle state.
> + *     @prepared
> + *       Preparation phase complete (triggered via '/dev/liveupdate'). Resources
> + *       checked, state saving initiated via %LIVEUPDATE_PREPARE event.
> + *       Workloads mostly running but may be restricted. Ready for reboot
> + *       trigger.
> + *     @frozen
> + *       Final reboot notification sent (triggered via 'reboot'). Corresponds to
> + *       %LIVEUPDATE_REBOOT event. Final state saving. Workloads must be
> + *       suspended. System about to kexec ("blackout window").
> + *     @updated
> + *       New kernel booted via live update. Awaiting 'finish' signal.
> + *
> + * Userspace Interaction & Blackout Window Reduction
> + * -------------------------------------------------
> + * Userspace monitors the ``state`` file to coordinate actions:
> + *   - Suspend workloads before @frozen state is entered.
> + *   - Initiate resource restoration upon entering @updated state.
> + *   - Resume workloads after restoration, minimizing downtime.
> + */
> +
> +#include <linux/kobject.h>
> +#include <linux/liveupdate.h>
> +#include <linux/sysfs.h>
> +#include "luo_internal.h"
> +
> +static bool luo_sysfs_initialized;
> +
> +#define LUO_DIR_NAME	"liveupdate"
> +
> +void luo_sysfs_notify(void)
> +{
> +	if (luo_sysfs_initialized)
> +		sysfs_notify(kernel_kobj, LUO_DIR_NAME, "state");
> +}
> +
> +/* Show the current live update state */
> +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
> +			  char *buf)
> +{
> +	return sysfs_emit(buf, "%s\n", luo_current_state_str());

Because the window of kernel live update is short, it is difficult to
count how many times the kernel has been live updated.

Is it possible to add a variable that counts the number of times the
kernel has been live updated?

For example, define a global variable of type atomic_t or u64 in the 
core module:

#include <linux/atomic.h>

static atomic_t klu_counter = ATOMIC_INIT(0);


Every time a live update completes successfully, increment the counter:

atomic_inc(&klu_counter);

Then export this value through /proc or /sys so that user space can
check it:

static ssize_t klu_counter_show(struct kobject *kobj, struct 
kobj_attribute *attr, char *buf)
{
     return sprintf(buf, "%d\n", atomic_read(&klu_counter));
}

Yanjun.Zhu


> +}
> +
> +static struct kobj_attribute state_attribute = __ATTR_RO(state);
> +
> +static struct attribute *luo_attrs[] = {
> +	&state_attribute.attr,
> +	NULL
> +};
> +
> +static struct attribute_group luo_attr_group = {
> +	.attrs = luo_attrs,
> +	.name = LUO_DIR_NAME,
> +};
> +
> +static int __init luo_init(void)
> +{
> +	int ret;
> +
> +	ret = sysfs_create_group(kernel_kobj, &luo_attr_group);
> +	if (ret) {
> +		pr_err("Failed to create group\n");
> +		return ret;
> +	}
> +
> +	luo_sysfs_initialized = true;
> +	pr_info("Initialized\n");
> +
> +	return 0;
> +}
> +subsys_initcall(luo_init);


^ permalink raw reply

* Re: [PATCH v3 19/30] liveupdate: luo_sysfs: add sysfs state monitoring
From: Greg KH @ 2025-10-09  5:20 UTC (permalink / raw)
  To: yanjun.zhu
  Cc: Pasha Tatashin, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, tglx, mingo, bp, dave.hansen, x86, hpa,
	rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <a27f9f8f-dc03-441b-8aa7-7daeff6c82ae@linux.dev>

On Wed, Oct 08, 2025 at 06:07:00PM -0700, yanjun.zhu wrote:
> > +#define LUO_DIR_NAME	"liveupdate"
> > +
> > +void luo_sysfs_notify(void)
> > +{
> > +	if (luo_sysfs_initialized)
> > +		sysfs_notify(kernel_kobj, LUO_DIR_NAME, "state");
> > +}
> > +
> > +/* Show the current live update state */
> > +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
> > +			  char *buf)
> > +{
> > +	return sysfs_emit(buf, "%s\n", luo_current_state_str());
> 
> Because the window of kernel live update is short, it is difficult to
> statistics how many times the kernel is live updated.
> 
> Is it possible to add a variable to statistics the times that the kernel is
> live updated?
> 
> For example, define a global variable of type atomic_t or u64 in the core
> module:
> 
> #include <linux/atomic.h>
> 
> static atomic_t klu_counter = ATOMIC_INIT(0);
> 
> 
> Every time a live update completes successfully, increment the counter:
> 
> atomic_inc(&klu_counter);
> 
> Then exporting this value through /proc or /sys so that user space can check
> it:
> 
> static ssize_t klu_counter_show(struct kobject *kobj, struct kobj_attribute
> *attr, char *buf)
> {
>     return sprintf(buf, "%d\n", atomic_read(&klu_counter));
> }

But the value can change right after you read it, so how do you "know"
it is up to date?

What exactly do you want to do with this type of information?  What are
you going to do with that information?

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH RESEND 00/62] initrd: remove classic initrd support
From: Askar Safin @ 2025-10-09  8:42 UTC (permalink / raw)
  To: Nicolas Schichan
  Cc: akpm, andy.shevchenko, axboe, brauner, cyphar, devicetree,
	ecurtin, email2tema, graf, gregkh, hca, hch, hsiangkao, initramfs,
	jack, julian.stecklina, kees, linux-acpi, linux-alpha, linux-api,
	linux-arch, linux-arm-kernel, linux-block, linux-csky, linux-doc,
	linux-efi, linux-ext4, linux-fsdevel, linux-hexagon, linux-kernel,
	linux-m68k, linux-mips, linux-openrisc, linux-parisc, linux-riscv,
	linux-s390, linux-sh, linux-snps-arc, linux-um, linuxppc-dev,
	loongarch, mcgrof, mingo, monstr, mzxreary, patches, rob,
	sparclinux, thomas.weissschuh, thorsten.blum, torvalds, tytso,
	viro, x86
In-Reply-To: <CAHNNwZC7gC7zaZGiSBhobSAb4m2O1BuoZ4r=SQBF-tCQyuAPvw@mail.gmail.com>

On Mon, Sep 22, 2025 at 5:29 PM Nicolas Schichan <nschichan@freebox.fr> wrote:
> > Then in September 2026 I will fully remove initrd.
>
> Is there a way to find some kind of middle ground here ?

I still plan to fully remove initrd in September 2026.
Maintainers will decide whether they will merge my patchset.
You may try to convince them.

> I can send a patch for that but first I need to sort out my SMTP
> issues from the other day.

If you still have mail issues, consider applying for @linux.dev email,
they are free for Linux devs ( https://linux.dev/ ).

Also, I just tried to test whether your use case is still supported in
mainline (i. e. uncompressed initrd with root=/dev/ram0).
It turned out that on modern kernels you need to enable
recently introduced CONFIG_BLK_DEV_WRITE_MOUNTED to
make this work.
So, make sure to enable this when upgrading kernel.

-- 
Askar Safin

^ permalink raw reply

* Re: [PATCH v3 19/30] liveupdate: luo_sysfs: add sysfs state monitoring
From: Pratyush Yadav @ 2025-10-09 10:58 UTC (permalink / raw)
  To: yanjun.zhu
  Cc: Pasha Tatashin, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <a27f9f8f-dc03-441b-8aa7-7daeff6c82ae@linux.dev>

On Wed, Oct 08 2025, yanjun.zhu wrote:

> On 8/6/25 6:44 PM, Pasha Tatashin wrote:
>> Introduce a sysfs interface for the Live Update Orchestrator
>> under /sys/kernel/liveupdate/. This interface provides a way for
>> userspace tools and scripts to monitor the current state of the LUO
>> state machine.
>> The main feature is a read-only file, state, which displays the
>> current LUO state as a string ("normal", "prepared", "frozen",
>> "updated"). The interface uses sysfs_notify to allow userspace
>> listeners (e.g., via poll) to be efficiently notified of state changes.
>> ABI documentation for this new sysfs interface is added in
>> Documentation/ABI/testing/sysfs-kernel-liveupdate.
>> This read-only sysfs interface complements the main ioctl interface
>> provided by /dev/liveupdate, which handles LUO control operations and
>> resource management.
>> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
[...]
>> +#include <linux/kobject.h>
>> +#include <linux/liveupdate.h>
>> +#include <linux/sysfs.h>
>> +#include "luo_internal.h"
>> +
>> +static bool luo_sysfs_initialized;
>> +
>> +#define LUO_DIR_NAME	"liveupdate"
>> +
>> +void luo_sysfs_notify(void)
>> +{
>> +	if (luo_sysfs_initialized)
>> +		sysfs_notify(kernel_kobj, LUO_DIR_NAME, "state");
>> +}
>> +
>> +/* Show the current live update state */
>> +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
>> +			  char *buf)
>> +{
>> +	return sysfs_emit(buf, "%s\n", luo_current_state_str());
>
> Because the window of kernel live update is short, it is difficult to statistics
> how many times the kernel is live updated.
>
> Is it possible to add a variable to statistics the times that the kernel is live
> updated?

The kernel doesn't do the live update on its own. The process is driven
and sequenced by userspace. So if you want to keep statistics, you
should do it from your userspace (luod maybe?). I don't see any need for
this in the kernel.

>
> For example, define a global variable of type atomic_t or u64 in the core
> module:
>
> #include <linux/atomic.h>
>
> static atomic_t klu_counter = ATOMIC_INIT(0);
>
>
> Every time a live update completes successfully, increment the counter:
>
> atomic_inc(&klu_counter);
>
> Then exporting this value through /proc or /sys so that user space can check it:
>
> static ssize_t klu_counter_show(struct kobject *kobj, struct kobj_attribute
> *attr, char *buf)
> {
>     return sprintf(buf, "%d\n", atomic_read(&klu_counter));
> }
>
> Yanjun.Zhu
[...]

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH RESEND 28/62] init: alpha, arc, arm, arm64, csky, m68k, microblaze, mips, nios2, openrisc, parisc, powerpc, s390, sh, sparc, um, x86, xtensa: rename initrd_{start,end} to virt_external_initramfs_{start,end}
From: Askar Safin @ 2025-10-09 11:19 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: linux-fsdevel, linux-kernel, Linus Torvalds, Greg Kroah-Hartman,
	Christian Brauner, Al Viro, Jan Kara, Christoph Hellwig,
	Jens Axboe, Andy Shevchenko, Aleksa Sarai, Thomas Weißschuh,
	Julian Stecklina, Gao Xiang, Art Nikpal, Andrew Morton,
	Alexander Graf, Rob Landley, Lennart Poettering, linux-arch,
	linux-alpha, linux-snps-arc, linux-arm-kernel, linux-csky,
	linux-hexagon, loongarch, linux-m68k, linux-mips, linux-openrisc,
	linux-parisc, linuxppc-dev, linux-riscv, linux-s390, linux-sh,
	sparclinux, linux-um, x86, Ingo Molnar, linux-block, initramfs,
	linux-api, linux-doc, linux-efi, linux-ext4,
	Theodore Y . Ts'o, linux-acpi, Michal Simek, devicetree,
	Luis Chamberlain, Kees Cook, Thorsten Blum, Heiko Carstens,
	patches
In-Reply-To: <20250913055851.GBaMUIGyF8VhpUsOZg@fat_crate.local>

On Sat, Sep 13, 2025 at 9:00 AM Borislav Petkov <bp@alien8.de> wrote:
> Ooh, now I see it - you have virtual and physical initramfs address things. We
> usually call those "va" and "pa". So
>
> initramfs_{va,pa}_{start,end}

Okay, I will call them external_initramfs_{va,pa}_{start,end}
(after I remove initrd, which will happen in a year)

"external" means "bootloader-supplied" as opposed to builtin initramfs.

-- 
Askar Safin

^ permalink raw reply

* Re: [PATCH v3 19/30] liveupdate: luo_sysfs: add sysfs state monitoring
From: Pasha Tatashin @ 2025-10-09 12:01 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: yanjun.zhu, jasonmiu, graf, changyuanl, rppt, dmatlack, rientjes,
	corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, zhangguopeng, linux, linux-kernel, linux-doc, linux-mm,
	gregkh, tglx, mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, lennart, brauner, linux-api, linux-fsdevel,
	saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <mafs0qzvcmje2.fsf@kernel.org>

> > Because the window of kernel live update is short, it is difficult to statistics
> > how many times the kernel is live updated.
> >
> > Is it possible to add a variable to statistics the times that the kernel is live
> > updated?
>
> The kernel doesn't do the live update on its own. The process is driven
> and sequenced by userspace. So if you want to keep statistics, you
> should do it from your userspace (luod maybe?). I don't see any need for
> this in the kernel.
>

One use case I can think of is including information in kdump or the
backtrace warning/panic messages about how many times this machine has
been live-updated. In the past, I've seen bugs (related to memory
corruption) that occurred only after several kexecs, not on the first
one. With live updates, especially while the code is being stabilized,
I imagine we might have a similar situation. For that reason, it could
be useful to have a count in the dmesg logs showing how many times
this machine has been live-updated. While this information is also
available in userspace, it would be simpler for kernel developers
triaging these issues if everything were in one place.

Pasha

^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Jason Gunthorpe @ 2025-10-09 14:48 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: Samiullah Khawaja, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, parav, leonro, witu, hughd,
	chrisl, steven.sistare
In-Reply-To: <CA+CK2bDs1JsRCNFXkdUhdu5V-KMJXVTgETSHPvCtXKjkpD79Sw@mail.gmail.com>

On Wed, Oct 08, 2025 at 04:26:39PM -0400, Pasha Tatashin wrote:
> On Wed, Oct 8, 2025 at 3:36 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Wed, Oct 08, 2025 at 12:40:34PM -0400, Pasha Tatashin wrote:
> > > 1. Ordered Un-preservation
> > > The un-preservation of file descriptors must also be ordered and must
> > > occur in the reverse order of preservation. For example, if a user
> > > preserves a memfd first and then an iommufd that depends on it, the
> > > iommufd must be un-preserved before the memfd when the session is
> > > closed or the FDs are explicitly un-preserved.
> >
> > Why?
> >
> > I imagined the first to unpreserve would restore the struct file * -
> > that would satisfy the order.
> 
> In my description, "un-preserve" refers to the action of canceling a
> preservation request in the outgoing kernel, before kexec ever
> happens. It's the pre-reboot counterpart to the PRESERVE_FD ioctl,
> used when a user decides not to go through with the live update for a
> specific FD.
> 
> The terminology I am using:
> preserve: Put FD into LUO in the outgoing kernel
> unpreserve: Remove FD from LUO from the outgoing kernel
> retrieve: Restore FD and return it to user in the next kernel

Ok

> For the retrieval part, we are going to be using FIFO order, the same
> as preserve.

This won't work. Retrieval is driven by early boot discovery ordering
and then by userspace. It will be in whatever order it wants. We need
to be able to do things like make the struct file * at the moment
something requests it.

> > This doesn't seem right, the API should be more like 'luo get
> > serialization handle for this file *'
> 
> How about:
> 
> int liveupdate_find_token(struct liveupdate_session *session,
>                           struct file *file, u64 *token);

This sort of thing should not be used on the preserve side..

> And if needed:
> int liveupdate_find_file(struct liveupdate_session *session,
>                          u64 token, struct file **file);
> 
> Return: 0 on success, or -ENOENT if the file is not preserved.

I would argue it should always cause a preservation...

But this is still backwards, what we need is something like

liveupdate_preserve_file(session, file, &token);
my_preserve_blob.file_token = token

[..]

file = liveupdate_retrieve_file(session, my_preserve_blob.file_token);

And these can run in any order, and be called multiple times.

Jason

^ permalink raw reply

* Re: [PATCH v3 19/30] liveupdate: luo_sysfs: add sysfs state monitoring
From: Jason Gunthorpe @ 2025-10-09 14:50 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: Pratyush Yadav, yanjun.zhu, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, parav, leonro, witu
In-Reply-To: <CA+CK2bCx=kTVORq9dRE2h3Z4QQ-ggxanY2tDPRy13_ARhc+TqA@mail.gmail.com>

On Thu, Oct 09, 2025 at 08:01:13AM -0400, Pasha Tatashin wrote:
> > > Because the window of kernel live update is short, it is difficult to statistics
> > > how many times the kernel is live updated.
> > >
> > > Is it possible to add a variable to statistics the times that the kernel is live
> > > updated?
> >
> > The kernel doesn't do the live update on its own. The process is driven
> > and sequenced by userspace. So if you want to keep statistics, you
> > should do it from your userspace (luod maybe?). I don't see any need for
> > this in the kernel.
> >
> 
> One use case I can think of is including information in kdump or the
> backtrace warning/panic messages about how many times this machine has
> been live-updated. In the past, I've seen bugs (related to memory
> corruption) that occurred only after several kexecs, not on the first
> one. 

That seems like a reasonable point, to do something like a taint where
this is recorded, visible and logged during an oops.

Jason

^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Pasha Tatashin @ 2025-10-09 15:01 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Samiullah Khawaja, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, parav, leonro, witu, hughd,
	chrisl, steven.sistare
In-Reply-To: <20251009144822.GD3839422@nvidia.com>

On Thu, Oct 9, 2025 at 10:48 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Wed, Oct 08, 2025 at 04:26:39PM -0400, Pasha Tatashin wrote:
> > On Wed, Oct 8, 2025 at 3:36 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Wed, Oct 08, 2025 at 12:40:34PM -0400, Pasha Tatashin wrote:
> > > > 1. Ordered Un-preservation
> > > > The un-preservation of file descriptors must also be ordered and must
> > > > occur in the reverse order of preservation. For example, if a user
> > > > preserves a memfd first and then an iommufd that depends on it, the
> > > > iommufd must be un-preserved before the memfd when the session is
> > > > closed or the FDs are explicitly un-preserved.
> > >
> > > Why?
> > >
> > > I imagined the first to unpreserve would restore the struct file * -
> > > that would satisfy the order.
> >
> > In my description, "un-preserve" refers to the action of canceling a
> > preservation request in the outgoing kernel, before kexec ever
> > happens. It's the pre-reboot counterpart to the PRESERVE_FD ioctl,
> > used when a user decides not to go through with the live update for a
> > specific FD.
> >
> > The terminology I am using:
> > preserve: Put FD into LUO in the outgoing kernel
> > unpreserve: Remove FD from LUO from the outgoing kernel
> > retrieve: Restore FD and return it to user in the next kernel
>
> Ok
>
> > For the retrieval part, we are going to be using FIFO order, the same
> > as preserve.
>
> This won't work. retrieval is driven by early boot discovery ordering
> and then by userspace. It will be in whatever order it wants. We need
> to be able to do things like make the struct file * at the moment
> something requests it..

I thought we wanted only the user to do "struct file" creation when
the user retrieves FD back. In this case we can enforce strict
ordering during retrieval. If "struct file" can be retrieved by
anything within the kernel, then that could be any kernel process
during boot, meaning that charging is not going to be properly applied
when kernel allocations are performed.

We specifically decided that while "struct file"s are going to be
created only by the user, the other subsystems can have early access
to the preserved file data, if they know how to parse it.

> > > This doesn't seem right, the API should be more like 'luo get
> > > serialization handle for this file *'
> >
> > How about:
> >
> > int liveupdate_find_token(struct liveupdate_session *session,
> >                           struct file *file, u64 *token);
>
> This sort of thing should not be used on the preserve side..
>
> > And if needed:
> > int liveupdate_find_file(struct liveupdate_session *session,
> >                          u64 token, struct file **file);
> >
> > Return: 0 on success, or -ENOENT if the file is not preserved.
>
> I would argue it should always cause a preservation...
>
> But this is still backwards, what we need is something like
>
> liveupdate_preserve_file(session, file, &token);
> my_preserve_blob.file_token = token

We cannot do that. The user should have already preserved that file
and provided us with a token to use; if that file was not preserved by
the user, it is a bug. With this proposal, we would have to generate a
token, and it was argued that the kernel should not do that.

> file = liveupdate_retrieve_file(session, my_preserve_blob.file_token);
>
> And these can run in any order, and be called multiple times.
>
> Jason

^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Pasha Tatashin @ 2025-10-09 15:03 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Samiullah Khawaja, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, parav, leonro, witu, hughd,
	chrisl, steven.sistare
In-Reply-To: <CA+CK2bC_m5GRxCa1szw1v24Ssq8EnCWp4e985RJ5RRCdhztQWg@mail.gmail.com>

On Thu, Oct 9, 2025 at 11:01 AM Pasha Tatashin
<pasha.tatashin@soleen.com> wrote:
>
> On Thu, Oct 9, 2025 at 10:48 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Wed, Oct 08, 2025 at 04:26:39PM -0400, Pasha Tatashin wrote:
> > > On Wed, Oct 8, 2025 at 3:36 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Wed, Oct 08, 2025 at 12:40:34PM -0400, Pasha Tatashin wrote:
> > > > > 1. Ordered Un-preservation
> > > > > The un-preservation of file descriptors must also be ordered and must
> > > > > occur in the reverse order of preservation. For example, if a user
> > > > > preserves a memfd first and then an iommufd that depends on it, the
> > > > > iommufd must be un-preserved before the memfd when the session is
> > > > > closed or the FDs are explicitly un-preserved.
> > > >
> > > > Why?
> > > >
> > > > I imagined the first to unpreserve would restore the struct file * -
> > > > that would satisfy the order.
> > >
> > > In my description, "un-preserve" refers to the action of canceling a
> > > preservation request in the outgoing kernel, before kexec ever
> > > happens. It's the pre-reboot counterpart to the PRESERVE_FD ioctl,
> > > used when a user decides not to go through with the live update for a
> > > specific FD.
> > >
> > > The terminology I am using:
> > > preserve: Put FD into LUO in the outgoing kernel
> > > unpreserve: Remove FD from LUO from the outgoing kernel
> > > retrieve: Restore FD and return it to user in the next kernel
> >
> > Ok
> >
> > > For the retrieval part, we are going to be using FIFO order, the same
> > > as preserve.
> >
> > This won't work. retrieval is driven by early boot discovery ordering
> > and then by userspace. It will be in whatever order it wants. We need
> > to be able to do things like make the struct file * at the moment
> > something requests it..
>
> I thought we wanted only the user to do "struct file" creation when
> the user retrieves FD back. In this case we can enforce strict
> ordering during retrieval. If "struct file" can be retrieved by
> anything within the kernel, then that could be any kernel process
> during boot, meaning that charging is not going to be properly applied
> when kernel allocations are performed.

There is a second reason: by the time we enter userspace and are
ready to retrieve FDs, we know that all file handlers that are to be
registered have registered. If we do that during boot within the
kernel, we can run into the problem where we are trying to retrieve
the data of a file handler that has not yet registered.

>
> We specifically decided that while "struct file"s are going to be
> created only by the user, the other subsystems can have early access
> to the preserved file data, if they know how to parse it.
>
> > > > This doesn't seem right, the API should be more like 'luo get
> > > > serialization handle for this file *'
> > >
> > > How about:
> > >
> > > int liveupdate_find_token(struct liveupdate_session *session,
> > >                           struct file *file, u64 *token);
> >
> > This sort of thing should not be used on the preserve side..
> >
> > > And if needed:
> > > int liveupdate_find_file(struct liveupdate_session *session,
> > >                          u64 token, struct file **file);
> > >
> > > Return: 0 on success, or -ENOENT if the file is not preserved.
> >
> > I would argue it should always cause a preservation...
> >
> > But this is still backwards, what we need is something like
> >
> > liveupdate_preserve_file(session, file, &token);
> > my_preserve_blob.file_token = token
>
> We cannot do that, the user should have already preserved that file
> and provided us with a token to use, if that file was not preserved by
> the user it is a bug. With this proposal, we would have to generate a
> token, and it was argued that the kernel should not do that.
>
> > file = liveupdate_retrieve_file(session, my_preserve_blob.file_token);
> >
> > And these can run in any order, and be called multiple times.
> >
> > Jason

^ permalink raw reply

* Re: [PATCH v3 19/30] liveupdate: luo_sysfs: add sysfs state monitoring
From: Zhu Yanjun @ 2025-10-09 15:34 UTC (permalink / raw)
  To: Pasha Tatashin, Pratyush Yadav
  Cc: jasonmiu, graf, changyuanl, rppt, dmatlack, rientjes, corbet,
	rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm,
	tj, yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, gregkh, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, jgg,
	parav, leonro, witu
In-Reply-To: <CA+CK2bCx=kTVORq9dRE2h3Z4QQ-ggxanY2tDPRy13_ARhc+TqA@mail.gmail.com>


在 2025/10/9 5:01, Pasha Tatashin 写道:
>>> Because the window of kernel live update is short, it is difficult to statistics
>>> how many times the kernel is live updated.
>>>
>>> Is it possible to add a variable to statistics the times that the kernel is live
>>> updated?
>> The kernel doesn't do the live update on its own. The process is driven
>> and sequenced by userspace. So if you want to keep statistics, you
>> should do it from your userspace (luod maybe?). I don't see any need for
>> this in the kernel.
>>
> One use case I can think of is including information in kdump or the
> backtrace warning/panic messages about how many times this machine has
> been live-updated. In the past, I've seen bugs (related to memory
> corruption) that occurred only after several kexecs, not on the first
> one. With live updates, especially while the code is being stabilized,
> I imagine we might have a similar situation. For that reason, it could
> be useful to have a count in the dmesg logs showing how many times
> this machine has been live-updated. While this information is also
> available in userspace, it would be simpler for kernel developers
> triaging these issues if everything were in one place.
I’m considering this issue from a system security perspective. After the 
kernel is automatically updated, user-space applications are usually 
unaware of the change. In one possible scenario, an attacker could 
replace the kernel with a compromised version, while user-space 
applications remain unaware of it — which poses a potential security risk.

To mitigate this, it would be useful to expose the number of kernel 
updates through a sysfs interface, so that we can detect whether the 
kernel has been updated and then collect information about the new 
kernel to check for possible security issues.

Of course, there are other ways to detect kernel updates — for example, 
by using ftrace to monitor functions involved in live kernel updates — 
but such approaches tend to have a higher performance overhead. In 
contrast, adding a simple update counter to track live kernel updates 
would provide similar monitoring capability with minimal overhead.

Yanjun.Zhu

>
> Pasha

-- 
Best Regards,
Yanjun.Zhu


^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Samiullah Khawaja @ 2025-10-09 16:46 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: Jason Gunthorpe, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, parav, leonro, witu, hughd,
	chrisl, steven.sistare
In-Reply-To: <CA+CK2bC_m5GRxCa1szw1v24Ssq8EnCWp4e985RJ5RRCdhztQWg@mail.gmail.com>

On Thu, Oct 9, 2025 at 8:02 AM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> On Thu, Oct 9, 2025 at 10:48 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Wed, Oct 08, 2025 at 04:26:39PM -0400, Pasha Tatashin wrote:
> > > On Wed, Oct 8, 2025 at 3:36 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Wed, Oct 08, 2025 at 12:40:34PM -0400, Pasha Tatashin wrote:
> > > > > 1. Ordered Un-preservation
> > > > > The un-preservation of file descriptors must also be ordered and must
> > > > > occur in the reverse order of preservation. For example, if a user
> > > > > preserves a memfd first and then an iommufd that depends on it, the
> > > > > iommufd must be un-preserved before the memfd when the session is
> > > > > closed or the FDs are explicitly un-preserved.
> > > >
> > > > Why?
> > > >
> > > > I imagined the first to unpreserve would restore the struct file * -
> > > > that would satisfy the order.
> > >
> > > In my description, "un-preserve" refers to the action of canceling a
> > > preservation request in the outgoing kernel, before kexec ever
> > > happens. It's the pre-reboot counterpart to the PRESERVE_FD ioctl,
> > > used when a user decides not to go through with the live update for a
> > > specific FD.
> > >
> > > The terminology I am using:
> > > preserve: Put FD into LUO in the outgoing kernel
> > > unpreserve: Remove FD from LUO from the outgoing kernel
> > > retrieve: Restore FD and return it to user in the next kernel
> >
> > Ok
> >
> > > For the retrieval part, we are going to be using FIFO order, the same
> > > as preserve.
> >
> > This won't work. retrieval is driven by early boot discovery ordering
> > and then by userspace. It will be in whatever order it wants. We need
> > to be able to do things like make the struct file * at the moment
> > something requests it..
>
> I thought we wanted only the user to do "struct file" creation when
> the user retrieves FD back. In this case we can enforce strict
> ordering during retrieval. If "struct file" can be retrieved by
> anything within the kernel, then that could be any kernel process
> during boot, meaning that charging is not going to be properly applied
> when kernel allocations are performed.
>
> We specifically decided that while "struct file"s are going to be
> created only by the user, the other subsystems can have early access
> to the preserved file data, if they know how to parse it.
>
> > > > This doesn't seem right, the API should be more like 'luo get
> > > > serialization handle for this file *'
> > >
> > > How about:
> > >
> > > int liveupdate_find_token(struct liveupdate_session *session,
> > >                           struct file *file, u64 *token);
> >
> > This sort of thing should not be used on the preserve side..
> >
> > > And if needed:
> > > int liveupdate_find_file(struct liveupdate_session *session,
> > >                          u64 token, struct file **file);
> > >
> > > Return: 0 on success, or -ENOENT if the file is not preserved.
> >
> > I would argue it should always cause a preservation...
> >
> > But this is still backwards, what we need is something like
> >
> > liveupdate_preserve_file(session, file, &token);
> > my_preserve_blob.file_token = token

Please clarify: do you still expect the user to register the
dependency FDs explicitly, with this API just triggering the
"prepare()" or "preserve()" callback so that the preservation order is
enforced/synchronized?
>
> We cannot do that, the user should have already preserved that file
> and provided us with a token to use, if that file was not preserved by
> the user it is a bug. With this proposal, we would have to generate a
> token, and it was argued that the kernel should not do that.

Agreed. Another thing that I was wondering about is how user space
would know that its FD was preserved as a dependency.

>
> > file = liveupdate_retrieve_file(session, my_preserve_blob.file_token);
> >
> > And these can run in any order, and be called multiple times.
> >
> > Jason

^ permalink raw reply

* Re: [PATCH v3 19/30] liveupdate: luo_sysfs: add sysfs state monitoring
From: Pasha Tatashin @ 2025-10-09 17:04 UTC (permalink / raw)
  To: Zhu Yanjun
  Cc: Pratyush Yadav, jasonmiu, graf, changyuanl, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, zhangguopeng, linux, linux-kernel, linux-doc, linux-mm,
	gregkh, tglx, mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, lennart, brauner, linux-api, linux-fsdevel,
	saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <dc71808c-c6a4-434a-aee9-b97601814c92@linux.dev>

On Thu, Oct 9, 2025 at 11:35 AM Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
>
>
> 在 2025/10/9 5:01, Pasha Tatashin 写道:
> >>> Because the window of kernel live update is short, it is difficult to statistics
> >>> how many times the kernel is live updated.
> >>>
> >>> Is it possible to add a variable to statistics the times that the kernel is live
> >>> updated?
> >> The kernel doesn't do the live update on its own. The process is driven
> >> and sequenced by userspace. So if you want to keep statistics, you
> >> should do it from your userspace (luod maybe?). I don't see any need for
> >> this in the kernel.
> >>
> > One use case I can think of is including information in kdump or the
> > backtrace warning/panic messages about how many times this machine has
> > been live-updated. In the past, I've seen bugs (related to memory
> > corruption) that occurred only after several kexecs, not on the first
> > one. With live updates, especially while the code is being stabilized,
> > I imagine we might have a similar situation. For that reason, it could
> > be useful to have a count in the dmesg logs showing how many times
> > this machine has been live-updated. While this information is also
> > available in userspace, it would be simpler for kernel developers
> > triaging these issues if everything were in one place.
> I’m considering this issue from a system security perspective. After the
> kernel is automatically updated, user-space applications are usually
> unaware of the change. In one possible scenario, an attacker could
> replace the kernel with a compromised version, while user-space
> applications remain unaware of it — which poses a potential security risk.
>
> To mitigate this, it would be useful to expose the number of kernel
> updates through a sysfs interface, so that we can detect whether the
> kernel has been updated and then collect information about the new
> kernel to check for possible security issues.
>
> Of course, there are other ways to detect kernel updates — for example,
> by using ftrace to monitor functions involved in live kernel updates —
> but such approaches tend to have a higher performance overhead. In
> contrast, adding a simple update counter to track live kernel updates
> would provide similar monitoring capability with minimal overhead.

Would a print during boot work for you, i.e. including the number when
we print that this kernel is live updating? Otherwise, we could export
this number in debugfs.

Pasha

^ permalink raw reply

* Re: [PATCH 2/2] fs: return EOPNOTSUPP from file_setattr/file_getattr syscalls
From: Darrick J. Wong @ 2025-10-09 17:20 UTC (permalink / raw)
  To: Andrey Albershteyn
  Cc: linux-api, linux-fsdevel, linux-kernel, linux-xfs, Jan Kara,
	Jiri Slaby, Christian Brauner, Arnd Bergmann, Andrey Albershteyn
In-Reply-To: <20251008-eopnosupp-fix-v1-2-5990de009c9f@kernel.org>

On Wed, Oct 08, 2025 at 02:44:18PM +0200, Andrey Albershteyn wrote:
> These syscalls call to vfs_fileattr_get/set functions which return
> ENOIOCTLCMD if filesystem doesn't support setting file attribute on an
> inode. For syscalls EOPNOTSUPP would be more appropriate return error.
> 
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
> ---
>  fs/file_attr.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/fs/file_attr.c b/fs/file_attr.c
> index 460b2dd21a85..5e3e2aba97b5 100644
> --- a/fs/file_attr.c
> +++ b/fs/file_attr.c
> @@ -416,6 +416,8 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
>  	}
>  
>  	error = vfs_fileattr_get(filepath.dentry, &fa);
> +	if (error == -ENOIOCTLCMD)

Hrm.  Back in 6.17, XFS would return ENOTTY if you called ->fileattr_get
on a special file:

int
xfs_fileattr_get(
	struct dentry		*dentry,
	struct file_kattr	*fa)
{
	struct xfs_inode	*ip = XFS_I(d_inode(dentry));

	if (d_is_special(dentry))
		return -ENOTTY;
	...
}

Given that there are other fileattr_[gs]et implementations out there
that might return ENOTTY (e.g. fuse servers and other externally
maintained filesystems), I think both syscall functions need to check
for that as well:

	if (error == -ENOIOCTLCMD || error == -ENOTTY)
		return -EOPNOTSUPP;

--D

> +		error = -EOPNOTSUPP;
>  	if (error)
>  		return error;
>  
> @@ -483,6 +485,8 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
>  	if (!error) {
>  		error = vfs_fileattr_set(mnt_idmap(filepath.mnt),
>  					 filepath.dentry, &fa);
> +		if (error == -ENOIOCTLCMD)
> +			error = -EOPNOTSUPP;
>  		mnt_drop_write(filepath.mnt);
>  	}
>  
> 
> -- 
> 2.51.0
> 
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox