Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [PATCH v6 03/20] kexec: call liveupdate_reboot() before kexec
From: Pratyush Yadav @ 2025-11-21 15:55 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: pratyush, jasonmiu, graf, rppt, dmatlack, rientjes, corbet,
	rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm,
	tj, yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, linux,
	linux-kernel, linux-doc, linux-mm, gregkh, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, jgg,
	parav, leonro, witu, hughd, skhawaja, chrisl
In-Reply-To: <20251115233409.768044-4-pasha.tatashin@soleen.com>

On Sat, Nov 15 2025, Pasha Tatashin wrote:

> Modify the kernel_kexec() to call liveupdate_reboot().
>
> This ensures that the Live Update Orchestrator is notified just
> before the kernel executes the kexec jump. The liveupdate_reboot()
> function triggers the final freeze event, allowing participating
> FDs to perform last-minute checks or state saving within the blackout
> window.
>
> If liveupdate_reboot() returns an error (indicating a failure during
> LUO finalization), the kexec operation is aborted to prevent proceeding
> with an inconsistent state. An error is returned to user.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>

Reviewed-by: Pratyush Yadav <pratyush@kernel.org>

[...]

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v6 04/20] liveupdate: luo_session: add sessions support
From: Pratyush Yadav @ 2025-11-21 16:32 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: pratyush, jasonmiu, graf, rppt, dmatlack, rientjes, corbet,
	rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm,
	tj, yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, linux,
	linux-kernel, linux-doc, linux-mm, gregkh, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, jgg,
	parav, leonro, witu, hughd, skhawaja, chrisl
In-Reply-To: <20251115233409.768044-5-pasha.tatashin@soleen.com>

On Sat, Nov 15 2025, Pasha Tatashin wrote:

> Introduce the concept of "Live Update Sessions" within the LUO framework.
> LUO sessions provide a mechanism to group and manage `struct file *`
> instances (representing file descriptors) that need to be preserved
> across a kexec-based live update.
>
> Each session is identified by a unique name and acts as a container
> for file objects whose state is critical to a userspace workload, such
> as a virtual machine or a high-performance database, aiming to maintain
> their functionality across a kernel transition.
>
> This groundwork establishes the framework for preserving file-backed
> state across kernel updates, with the actual file data preservation
> mechanisms to be implemented in subsequent patches.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
[...]
>  
>  #ifndef _LINUX_LIVEUPDATE_ABI_LUO_H
>  #define _LINUX_LIVEUPDATE_ABI_LUO_H
>  
> +#include <uapi/linux/liveupdate.h>
> +
>  /*
>   * The LUO FDT hooks all LUO state for sessions, fds, etc.
> - * In the root it allso carries "liveupdate-number" 64-bit property that
> + * In the root it also carries "liveupdate-number" 64-bit property that

Nit: This needs a bit of patch massaging. Patch 2 added the typo, and
this patch fixes it. It would be better to just update patch 2.

>   * corresponds to the number of live-updates performed on this machine.
>   */
>  #define LUO_FDT_SIZE		PAGE_SIZE
> @@ -51,4 +82,54 @@
>  #define LUO_FDT_COMPATIBLE	"luo-v1"
>  #define LUO_FDT_LIVEUPDATE_NUM	"liveupdate-number"
>  
> +/*
> + * LUO FDT session node
> + * LUO_FDT_SESSION_HEADER:  is a u64 physical address of struct
> + *                          luo_session_header_ser
> + */
> +#define LUO_FDT_SESSION_NODE_NAME	"luo-session"
> +#define LUO_FDT_SESSION_COMPATIBLE	"luo-session-v1"
> +#define LUO_FDT_SESSION_HEADER		"luo-session-header"
> +
> +/**
> + * struct luo_session_header_ser - Header for the serialized session data block.
> + * @pgcnt: The total size, in pages, of the entire preserved memory block
> + *         that this header describes.
> + * @count: The number of 'struct luo_session_ser' entries that immediately
> + *         follow this header in the memory block.
> + *
> + * This structure is located at the beginning of a contiguous block of
> + * physical memory preserved across the kexec. It provides the necessary
> + * metadata to interpret the array of session entries that follow.
> + */
> +struct luo_session_header_ser {
> +	u64 pgcnt;

Why do you need pgcnt here? Can't the size be inferred from count? And
since you use contiguous memory block, the folio will know its page
count anyway, right? The less we have in the ABI the better IMO.

Same for other structures below.

> +	u64 count;
> +} __packed;
> +
> +/**
> + * struct luo_session_ser - Represents the serialized metadata for a LUO session.
> + * @name:    The unique name of the session, copied from the `luo_session`
> + *           structure.
> + * @files:   The physical address of a contiguous memory block that holds
> + *           the serialized state of files.
> + * @pgcnt:   The number of pages occupied by the `files` memory block.
> + * @count:   The total number of files that were part of this session during
> + *           serialization. Used for iteration and validation during
> + *           restoration.
> + *
> + * This structure is used to package session-specific metadata for transfer
> + * between kernels via Kexec Handover. An array of these structures (one per
> + * session) is created and passed to the new kernel, allowing it to reconstruct
> + * the session context.
> + *
> + * If this structure is modified, LUO_FDT_SESSION_COMPATIBLE must be updated.
> + */
> +struct luo_session_ser {
> +	char name[LIVEUPDATE_SESSION_NAME_LENGTH];
> +	u64 files;
> +	u64 pgcnt;
> +	u64 count;
> +} __packed;
> +
>  #endif /* _LINUX_LIVEUPDATE_ABI_LUO_H */
[...]
> +/* Create a "struct file" for session */
> +static int luo_session_getfile(struct luo_session *session, struct file **filep)
> +{
> +	char name_buf[128];
> +	struct file *file;
> +
> +	guard(mutex)(&session->mutex);
> +	snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name);
> +	file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR);

Nit: You can return the file directly and get rid of filep.

> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	*filep = file;
> +
> +	return 0;
> +}
[...]
> +int __init luo_session_setup_outgoing(void *fdt_out)
> +{
> +	struct luo_session_header_ser *header_ser;
> +	u64 header_ser_pa;
> +	int err;
> +
> +	header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT);

Nit: The naming is a bit confusing here. At first glance I thought this
was just allocating the header, but it allocates the whole session
serialization buffer.

> +	if (IS_ERR(header_ser))
> +		return PTR_ERR(header_ser);
> +	header_ser_pa = virt_to_phys(header_ser);
> +
> +	err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME);
> +	err |= fdt_property_string(fdt_out, "compatible",
> +				   LUO_FDT_SESSION_COMPATIBLE);
> +	err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa,
> +			    sizeof(header_ser_pa));
> +	err |= fdt_end_node(fdt_out);
> +
> +	if (err)
> +		goto err_unpreserve;
> +
> +	header_ser->pgcnt = LUO_SESSION_PGCNT;
> +	INIT_LIST_HEAD(&luo_session_global.outgoing.list);
> +	init_rwsem(&luo_session_global.outgoing.rwsem);
> +	luo_session_global.outgoing.header_ser = header_ser;
> +	luo_session_global.outgoing.ser = (void *)(header_ser + 1);
> +	luo_session_global.outgoing.active = true;
> +
> +	return 0;
> +
> +err_unpreserve:
> +	kho_unpreserve_free(header_ser);
> +	return err;
> +}
[...]
> +int luo_session_deserialize(void)
> +{
> +	struct luo_session_header *sh = &luo_session_global.incoming;
> +	int err;
> +
> +	if (luo_session_is_deserialized())
> +		return 0;
> +
> +	luo_session_global.deserialized = true;
> +	if (!sh->active) {
> +		INIT_LIST_HEAD(&sh->list);
> +		init_rwsem(&sh->rwsem);

Nit: it would be a bit simpler if LUO init always initialized this. And
then luo_session_setup_incoming() can fill the list if it has any data.
Slight reduction in code duplication and mental load.

> +		return 0;
> +	}
> +
> +	for (int i = 0; i < sh->header_ser->count; i++) {
> +		struct luo_session *session;
> +
> +		session = luo_session_alloc(sh->ser[i].name);
> +		if (IS_ERR(session)) {
> +			pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
> +				sh->ser[i].name, session);
> +			return PTR_ERR(session);
> +		}
> +
> +		err = luo_session_insert(sh, session); 
> +		if (err) {
> +			luo_session_free(session);
> +			pr_warn("Failed to insert session [%s] %pe\n",
> +				session->name, ERR_PTR(err));
> +			return err;
> +		}
> +
> +		session->count = sh->ser[i].count;
> +		session->files = sh->ser[i].files ? phys_to_virt(sh->ser[i].files) : 0;
> +		session->pgcnt = sh->ser[i].pgcnt;
> +	}
> +
> +	kho_restore_free(sh->header_ser);
> +	sh->header_ser = NULL;
> +	sh->ser = NULL;
> +
> +	return 0;
> +}
[...]

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v6 05/20] liveupdate: luo_ioctl: add user interface
From: Pratyush Yadav @ 2025-11-21 16:45 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: pratyush, jasonmiu, graf, rppt, dmatlack, rientjes, corbet,
	rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm,
	tj, yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, linux,
	linux-kernel, linux-doc, linux-mm, gregkh, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, jgg,
	parav, leonro, witu, hughd, skhawaja, chrisl
In-Reply-To: <20251115233409.768044-6-pasha.tatashin@soleen.com>

On Sat, Nov 15 2025, Pasha Tatashin wrote:

> Introduce the user-space interface for the Live Update Orchestrator
> via ioctl commands, enabling external control over the live update
> process and management of preserved resources.
>
> The idea is that there is going to be a single userspace agent driving
> the live update, therefore, only a single process can ever hold this
> device opened at a time.
>
> The following ioctl commands are introduced:
>
> LIVEUPDATE_IOCTL_CREATE_SESSION
> Provides a way for userspace to create a named session for grouping file
> descriptors that need to be preserved. It returns a new file descriptor
> representing the session.
>
> LIVEUPDATE_IOCTL_RETRIEVE_SESSION
> Allows the userspace agent in the new kernel to reclaim a preserved
> session by its name, receiving a new file descriptor to manage the
> restored resources.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>

Reviewed-by: Pratyush Yadav <pratyush@kernel.org>

[...]

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v6 06/20] liveupdate: luo_file: implement file systems callbacks
From: Pratyush Yadav @ 2025-11-21 17:24 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: pratyush, jasonmiu, graf, rppt, dmatlack, rientjes, corbet,
	rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm,
	tj, yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, linux,
	linux-kernel, linux-doc, linux-mm, gregkh, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, jgg,
	parav, leonro, witu, hughd, skhawaja, chrisl
In-Reply-To: <20251115233409.768044-7-pasha.tatashin@soleen.com>

On Sat, Nov 15 2025, Pasha Tatashin wrote:

> This patch implements the core mechanism for managing preserved
> files throughout the live update lifecycle. It provides the logic to
> invoke the file handler callbacks (preserve, unpreserve, freeze,
> unfreeze, retrieve, and finish) at the appropriate stages.
>
> During the reboot phase, luo_file_freeze() serializes the final
> metadata for each file (handler compatible string, token, and data
> handle) into a memory region preserved by KHO. In the new kernel,
> luo_file_deserialize() reconstructs the in-memory file list from this
> data, preparing the session for retrieval.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
[...]
> +
> +static LIST_HEAD(luo_file_handler_list);
> +
> +/* 2 4K pages, give space for 128 files per session */
> +#define LUO_FILE_PGCNT		2ul
> +#define LUO_FILE_MAX							\
> +	((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser))
> +
> +/**
> + * struct luo_file - Represents a single preserved file instance.
> + * @fh:            Pointer to the &struct liveupdate_file_handler that manages
> + *                 this type of file.
> + * @file:          Pointer to the kernel's &struct file that is being preserved.
> + *                 This is NULL in the new kernel until the file is successfully
> + *                 retrieved.
> + * @serialized_data: The opaque u64 handle to the serialized state of the file.
> + *                 This handle is passed back to the handler's .freeze(),
> + *                 .retrieve(), and .finish() callbacks, allowing it to track
> + *                 and update its serialized state across phases.
> + * @retrieved:     A flag indicating whether a user/kernel in the new kernel has
> + *                 successfully called retrieve() on this file. This prevents
> + *                 multiple retrieval attempts.
> + * @mutex:         A mutex that protects the fields of this specific instance
> + *                 (e.g., @retrieved, @file), ensuring that operations like
> + *                 retrieving or finishing a file are atomic.
> + * @list:          The list_head linking this instance into its parent
> + *                 session's list of preserved files.
> + * @token:         The user-provided unique token used to identify this file.
> + *
> + * This structure is the core in-kernel representation of a single file being
> + * managed through a live update. An instance is created by luo_preserve_file()
> + * to link a 'struct file' to its corresponding handler, a user-provided token,
> + * and the serialized state handle returned by the handler's .preserve()
> + * operation.
> + *
> + * These instances are tracked in a per-session list. The @serialized_data
> + * field, which holds a handle to the file's serialized state, may be updated
> + * during the .freeze() callback before being serialized for the next kernel.
> + * After reboot, these structures are recreated by luo_file_deserialize() and
> + * are finally cleaned up by luo_file_finish().
> + */
> +struct luo_file {
> +	struct liveupdate_file_handler *fh;
> +	struct file *file;
> +	u64 serialized_data;
> +	bool retrieved;
> +	struct mutex mutex;
> +	struct list_head list;
> +	u64 token;
> +};
> +
> +static int luo_session_alloc_files_mem(struct luo_session *session)
> +{
> +	size_t size;
> +	void *mem;
> +
> +	if (session->files)
> +		return 0;
> +
> +	WARN_ON_ONCE(session->count);
> +
> +	size = LUO_FILE_PGCNT << PAGE_SHIFT;
> +	mem = kho_alloc_preserve(size);
> +	if (IS_ERR(mem))
> +		return PTR_ERR(mem);
> +
> +	session->files = mem;
> +	session->pgcnt = LUO_FILE_PGCNT;

I think this is a layering violation. luo_session should take care of
managing the session, including the memory it needs. luo_files should
take care of managing the file, including the memory it needs for _the
file_. I think proper layering will make the code a lot easier to grok
and modify later. When I want to see how sessions are handled, I can go
to luo_session.c. I won't have to poke into luo_files.c.

So I think luo_session_preserve_fd() should first make sure there is
memory available to store the file in the session, and only then call
luo_preserve_file().

> +
> +	return 0;
> +}
> +
> +static void luo_session_free_files_mem(struct luo_session *session)
> +{
> +	/* If session has files, no need to free preservation memory */
> +	if (session->count)
> +		return;
> +
> +	if (!session->files)
> +		return;
> +
> +	kho_unpreserve_free(session->files);
> +	session->files = NULL;
> +	session->pgcnt = 0;
> +}
> +
> +static bool luo_token_is_used(struct luo_session *session, u64 token)
> +{
> +	struct luo_file *iter;
> +
> +	list_for_each_entry(iter, &session->files_list, list) {
> +		if (iter->token == token)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +/**
> + * luo_preserve_file - Initiate the preservation of a file descriptor.
> + * @session: The session to which the preserved file will be added.
> + * @token:   A unique, user-provided identifier for the file.
> + * @fd:      The file descriptor to be preserved.
> + *
> + * This function orchestrates the first phase of preserving a file. Upon entry,
> + * it takes a reference to the 'struct file' via fget(), effectively making LUO
> + * a co-owner of the file. This reference is held until the file is either
> + * unpreserved or successfully finished in the next kernel, preventing the file
> + * from being prematurely destroyed.
> + *
> + * This function orchestrates the first phase of preserving a file. It performs
> + * the following steps:
> + *
> + * 1. Validates that the @token is not already in use within the session.
> + * 2. Ensures the session's memory for files serialization is allocated
> + *    (allocates if needed).
> + * 3. Iterates through registered handlers, calling can_preserve() to find one
> + *    compatible with the given @fd.
> + * 4. Calls the handler's .preserve() operation, which saves the file's state
> + *    and returns an opaque private data handle.
> + * 5. Adds the new instance to the session's internal list.
> + *
> + * On success, LUO takes a reference to the 'struct file' and considers it
> + * under its management until it is unpreserved or finished.
> + *
> + * In case of any failure, all intermediate allocations (file reference, memory
> + * for the 'luo_file' struct, etc.) are cleaned up before returning an error.
> + *
> + * Context: Can be called from an ioctl handler during normal system operation.
> + * Return: 0 on success. Returns a negative errno on failure:
> + *         -EEXIST if the token is already used.
> + *         -EBADF if the file descriptor is invalid.
> + *         -ENOSPC if the session is full.
> + *         -ENOENT if no compatible handler is found.
> + *         -ENOMEM on memory allocation failure.
> + *         Other errors might be returned by .preserve().
> + */
> +int luo_preserve_file(struct luo_session *session, u64 token, int fd)
> +{
> +	struct liveupdate_file_op_args args = {0};
> +	struct liveupdate_file_handler *fh;
> +	struct luo_file *luo_file;
> +	struct file *file;
> +	int err;
> +
> +	lockdep_assert_held(&session->mutex);
> +
> +	if (luo_token_is_used(session, token))
> +		return -EEXIST;
> +
> +	file = fget(fd);
> +	if (!file)
> +		return -EBADF;
> +
> +	err = luo_session_alloc_files_mem(session);
> +	if (err)
> +		goto  exit_err;
> +
> +	if (session->count == LUO_FILE_MAX) {
> +		err = -ENOSPC;
> +		goto exit_err;
> +	}

Similarly, luo_file has no business knowing the size of a session.
Checking session->count should also be done in
luo_session_preserve_fd(). luo_preserve_file() should never be called if
there is no space _in the session_ to accommodate the file.

> +
> +	err = -ENOENT;
> +	list_for_each_entry(fh, &luo_file_handler_list, list) {
> +		if (fh->ops->can_preserve(fh, file)) {
> +			err = 0;
> +			break;
> +		}
> +	}
> +
> +	/* err is still -ENOENT if no handler was found */
> +	if (err)
> +		goto exit_err;
> +
> +	luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
> +	if (!luo_file) {
> +		err = -ENOMEM;
> +		goto exit_err;
> +	}
> +
> +	luo_file->file = file;
> +	luo_file->fh = fh;
> +	luo_file->token = token;
> +	luo_file->retrieved = false;
> +	mutex_init(&luo_file->mutex);
> +
> +	args.handler = fh;
> +	args.session = (struct liveupdate_session *)session;
> +	args.file = file;
> +	err = fh->ops->preserve(&args);
> +	if (err) {
> +		mutex_destroy(&luo_file->mutex);
> +		kfree(luo_file);
> +		goto exit_err;
> +	} else {
> +		luo_file->serialized_data = args.serialized_data;
> +		list_add_tail(&luo_file->list, &session->files_list);
> +		session->count++;
> +	}
> +
> +	return 0;
> +
> +exit_err:
> +	fput(file);
> +	luo_session_free_files_mem(session);
> +
> +	return err;
> +}
> +
> +/**
> + * luo_file_unpreserve_files - Unpreserves all files from a session.
> + * @session: The session to be cleaned up.
> + *
> + * This function serves as the primary cleanup path for a session. It is
> + * invoked when the userspace agent closes the session's file descriptor.
> + *
> + * For each file, it performs the following cleanup actions:
> + *   1. Calls the handler's .unpreserve() callback to allow the handler to
> + *      release any resources it allocated.
> + *   2. Removes the file from the session's internal tracking list.
> + *   3. Releases the reference to the 'struct file' that was taken by
> + *      luo_preserve_file() via fput(), returning ownership.
> + *   4. Frees the memory associated with the internal 'struct luo_file'.
> + *
> + * After all individual files are unpreserved, it frees the contiguous memory
> + * block that was allocated to hold their serialization data.
> + */
> +void luo_file_unpreserve_files(struct luo_session *session)
> +{
> +	struct luo_file *luo_file;
> +
> +	lockdep_assert_held(&session->mutex);
> +
> +	while (!list_empty(&session->files_list)) {

Continuing with the layering thing, the list belongs to the session, so
it should traverse it. luo_session_release() should traverse the list
and call luo_file_unpreserve() on each file in the list. The body of
this loop becomes luo_file_unpreserve().

> +		struct liveupdate_file_op_args args = {0};
> +
> +		luo_file = list_last_entry(&session->files_list,
> +					   struct luo_file, list);
> +
> +		args.handler = luo_file->fh;
> +		args.session = (struct liveupdate_session *)session;
> +		args.file = luo_file->file;
> +		args.serialized_data = luo_file->serialized_data;
> +		luo_file->fh->ops->unpreserve(&args);
> +
> +		list_del(&luo_file->list);
> +		session->count--;

... and these two go into luo_session_release().

> +
> +		fput(luo_file->file);
> +		mutex_destroy(&luo_file->mutex);
> +		kfree(luo_file);
> +	}
> +
> +	luo_session_free_files_mem(session);
> +}
> +
> +static int luo_file_freeze_one(struct luo_session *session,
> +			       struct luo_file *luo_file)
> +{
> +	int err = 0;
> +
> +	guard(mutex)(&luo_file->mutex);
> +
> +	if (luo_file->fh->ops->freeze) {

Nit: "if (!luo_file->fh->ops->freeze) return 0;" would make this a tad bit
neater. You probably don't even need the mutex since ops are const.

> +		struct liveupdate_file_op_args args = {0};
> +
> +		args.handler = luo_file->fh;
> +		args.session = (struct liveupdate_session *)session;
> +		args.file = luo_file->file;
> +		args.serialized_data = luo_file->serialized_data;
> +
> +		err = luo_file->fh->ops->freeze(&args);
> +		if (!err)
> +			luo_file->serialized_data = args.serialized_data;

Then this can be:

	if (err)
		return err;

	luo_file->serialized_data = args.serialized_data;
	return 0;

> +	}
> +
> +	return err;
> +}
> +
> +static void luo_file_unfreeze_one(struct luo_session *session,
> +				  struct luo_file *luo_file)
> +{
> +	guard(mutex)(&luo_file->mutex);
> +
> +	if (luo_file->fh->ops->unfreeze) {

Same here.

> +		struct liveupdate_file_op_args args = {0};
> +
> +		args.handler = luo_file->fh;
> +		args.session = (struct liveupdate_session *)session;
> +		args.file = luo_file->file;
> +		args.serialized_data = luo_file->serialized_data;
> +
> +		luo_file->fh->ops->unfreeze(&args);
> +	}
> +
> +	luo_file->serialized_data = 0;

The file will also need to be unpreserved after unfreeze. Resetting the
data here is not the right thing, since unpreserve is responsible for
freeing things, and it won't have access to its data.

> +}
> +
> +static void __luo_file_unfreeze(struct luo_session *session,
> +				struct luo_file *failed_entry)
> +{
> +	struct list_head *files_list = &session->files_list;
> +	struct luo_file *luo_file;
> +
> +	list_for_each_entry(luo_file, files_list, list) {
> +		if (luo_file == failed_entry)
> +			break;
> +
> +		luo_file_unfreeze_one(session, luo_file);
> +	}
> +
> +	memset(session->files, 0, session->pgcnt << PAGE_SHIFT);
> +}
> +
> +/**
> + * luo_file_freeze - Freezes all preserved files and serializes their metadata.
> + * @session: The session whose files are to be frozen.
> + *
> + * This function is called from the reboot() syscall path, just before the
> + * kernel transitions to the new image via kexec. Its purpose is to perform the
> + * final preparation and serialization of all preserved files in the session.
> + *
> + * It iterates through each preserved file in FIFO order (the order of
> + * preservation) and performs two main actions:
> + *
> + * 1. Freezes the File: It calls the handler's .freeze() callback for each
> + *    file. This gives the handler a final opportunity to quiesce the device or
> + *    prepare its state for the upcoming reboot. The handler may update its
> + *    private data handle during this step.
> + *
> + * 2. Serializes Metadata: After a successful freeze, it copies the final file
> + *    metadata—the handler's compatible string, the user token, and the final
> + *    private data handle—into the pre-allocated contiguous memory buffer
> + *    (session->files) that will be handed over to the next kernel via KHO.
> + *
> + * Error Handling (Rollback):
> + * This function is atomic. If any handler's .freeze() operation fails, the
> + * entire live update is aborted. The __luo_file_unfreeze() helper is
> + * immediately called to invoke the .unfreeze() op on all files that were
> + * successfully frozen before the point of failure, rolling them back to a
> + * running state. The function then returns an error, causing the reboot()
> + * syscall to fail.
> + *
> + * Context: Called only from the liveupdate_reboot() path.
> + * Return: 0 on success, or a negative errno on failure.
> + */
> +int luo_file_freeze(struct luo_session *session)
> +{
> +	struct luo_file_ser *file_ser = session->files;
> +	struct luo_file *luo_file;
> +	int err;
> +	int i;
> +
> +	lockdep_assert_held(&session->mutex);
> +
> +	if (!session->count)
> +		return 0;

Same comment about layering here...

> +
> +	if (WARN_ON(!file_ser))
> +		return -EINVAL;
> +
> +	i = 0;
> +	list_for_each_entry(luo_file, &session->files_list, list) {
> +		err = luo_file_freeze_one(session, luo_file);
> +		if (err < 0) {
> +			pr_warn("Freeze failed for session[%s] token[%#0llx] handler[%s] err[%pe]\n",
> +				session->name, luo_file->token,
> +				luo_file->fh->compatible, ERR_PTR(err));
> +			goto exit_err;
> +		}
> +
> +		strscpy(file_ser[i].compatible, luo_file->fh->compatible,
> +			sizeof(file_ser[i].compatible));
> +		file_ser[i].data = luo_file->serialized_data;
> +		file_ser[i].token = luo_file->token;
> +		i++;
> +	}
> +
> +	return 0;
> +
> +exit_err:
> +	__luo_file_unfreeze(session, luo_file);
> +
> +	return err;
> +}
> +
> +/**
> + * luo_file_unfreeze - Unfreezes all files in a session.
> + * @session: The session whose files are to be unfrozen.
> + *
> + * This function rolls back the state of all files in a session after the freeze
> + * phase has begun but must be aborted. It is the counterpart to
> + * luo_file_freeze().
> + *
> + * It invokes the __luo_file_unfreeze() helper with a NULL argument, which
> + * signals the helper to iterate through all files in the session  and call
> + * their respective .unfreeze() handler callbacks.
> + *
> + * Context: This is called when the live update is aborted during
> + *          the reboot() syscall, after luo_file_freeze() has been called.
> + */
> +void luo_file_unfreeze(struct luo_session *session)
> +{
> +	lockdep_assert_held(&session->mutex);
> +
> +	if (!session->count)
> +		return;

... and here.

> +
> +	__luo_file_unfreeze(session, NULL);
> +}
> +
> +/**
> + * luo_retrieve_file - Restores a preserved file from a session by its token.
> + * @session: The session from which to retrieve the file.
> + * @token:   The unique token identifying the file to be restored.
> + * @filep:   Output parameter; on success, this is populated with a pointer
> + *           to the newly retrieved 'struct file'.
> + *
> + * This function is the primary mechanism for recreating a file in the new
> + * kernel after a live update. It searches the session's list of deserialized
> + * files for an entry matching the provided @token.
> + *
> + * The operation is idempotent: if a file has already been successfully
> + * retrieved, this function will simply return a pointer to the existing
> + * 'struct file' and report success without re-executing the retrieve
> + * operation. This is handled by checking the 'retrieved' flag under a lock.
> + *
> + * File retrieval can happen in any order; it is not bound by the order of
> + * preservation.
> + *
> + * Context: Can be called from an ioctl or other in-kernel code in the new
> + *          kernel.
> + * Return: 0 on success. Returns a negative errno on failure:
> + *         -ENOENT if no file with the matching token is found.
> + *         Any error code returned by the handler's .retrieve() op.
> + */
> +int luo_retrieve_file(struct luo_session *session, u64 token,
> +		      struct file **filep)
> +{
> +	struct liveupdate_file_op_args args = {0};
> +	struct luo_file *luo_file;
> +	int err;
> +
> +	lockdep_assert_held(&session->mutex);
> +
> +	if (list_empty(&session->files_list))
> +		return -ENOENT;

... and here.

> +
> +	list_for_each_entry(luo_file, &session->files_list, list) {
> +		if (luo_file->token == token)
> +			break;
> +	}
> +
> +	if (luo_file->token != token)
> +		return -ENOENT;
> +
> +	guard(mutex)(&luo_file->mutex);
> +	if (luo_file->retrieved) {
> +		/*
> +		 * Someone is asking for this file again, so get a reference
> +		 * for them.
> +		 */

Should we even allow this? Is there a use case?

> +		get_file(luo_file->file);
> +		*filep = luo_file->file;
> +		return 0;
> +	}
> +
> +	args.handler = luo_file->fh;
> +	args.session = (struct liveupdate_session *)session;
> +	args.serialized_data = luo_file->serialized_data;
> +	err = luo_file->fh->ops->retrieve(&args);
> +	if (!err) {
> +		luo_file->file = args.file;
> +
> +		/* Get reference so we can keep this file in LUO until finish */
> +		get_file(luo_file->file);
> +		*filep = luo_file->file;
> +		luo_file->retrieved = true;
> +	}
> +
> +	return err;
> +}
> +
> +static int luo_file_can_finish_one(struct luo_session *session,
> +				   struct luo_file *luo_file)
> +{
> +	bool can_finish = true;
> +
> +	guard(mutex)(&luo_file->mutex);
> +
> +	if (luo_file->fh->ops->can_finish) {

Same nitpick about doing "if (!luo_file->fh->ops->can_finish)".

> +		struct liveupdate_file_op_args args = {0};
> +
> +		args.handler = luo_file->fh;
> +		args.session = (struct liveupdate_session *)session;
> +		args.file = luo_file->file;
> +		args.serialized_data = luo_file->serialized_data;
> +		args.retrieved = luo_file->retrieved;
> +		can_finish = luo_file->fh->ops->can_finish(&args);
> +	}
> +
> +	return can_finish ? 0 : -EBUSY;
> +}
> +
> +static void luo_file_finish_one(struct luo_session *session,
> +				struct luo_file *luo_file)
> +{
> +	struct liveupdate_file_op_args args = {0};
> +
> +	guard(mutex)(&luo_file->mutex);
> +
> +	args.handler = luo_file->fh;
> +	args.session = (struct liveupdate_session *)session;
> +	args.file = luo_file->file;
> +	args.serialized_data = luo_file->serialized_data;
> +	args.retrieved = luo_file->retrieved;
> +
> +	luo_file->fh->ops->finish(&args);
> +}
> +
> +/**
> + * luo_file_finish - Completes the lifecycle for all files in a session.
> + * @session: The session to be finalized.
> + *
> + * This function orchestrates the final teardown of a live update session in the
> + * new kernel. It should be called after all necessary files have been
> + * retrieved and the userspace agent is ready to release the preserved state.
> + *
> + * The function iterates through all tracked files. For each file, it performs
> + * the following sequence of cleanup actions:
> + *
> + * 1. If file is not yet retrieved, retrieves it, and calls can_finish() on
> + *    every file in the session. If all can_finish return true, continue to
> + *    finish.
> + * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to
> + *    allow for final resource cleanup within the handler.
> + * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This
> + *    is the counterpart to the get_file() call in luo_retrieve_file().
> + * 4. Removes the 'struct luo_file' from the session's internal list.
> + * 5. Frees the memory for the 'struct luo_file' instance itself.
> + *
> + * After successfully finishing all individual files, it frees the
> + * contiguous memory block that was used to transfer the serialized metadata
> + * from the previous kernel.
> + *
> + * Error Handling (Atomic Failure):
> + * This operation is atomic. If any handler's .can_finish() op fails, the entire
> + * function aborts immediately and returns an error.
> + *
> + * Context: Can be called from an ioctl handler in the new kernel.
> + * Return: 0 on success, or a negative errno on failure.
> + */
> +int luo_file_finish(struct luo_session *session)
> +{
> +	struct list_head *files_list = &session->files_list;
> +	struct luo_file *luo_file;
> +	int err;
> +
> +	if (!session->count)
> +		return 0;

Layering comment again.

> +
> +	lockdep_assert_held(&session->mutex);
> +
> +	list_for_each_entry(luo_file, files_list, list) {
> +		err = luo_file_can_finish_one(session, luo_file);
> +		if (err)
> +			return err;
> +	}
> +
> +	while (!list_empty(&session->files_list)) {
> +		luo_file = list_last_entry(&session->files_list,
> +					   struct luo_file, list);
> +
> +		luo_file_finish_one(session, luo_file);
> +
> +		if (luo_file->file)
> +			fput(luo_file->file);
> +		list_del(&luo_file->list);
> +		session->count--;
> +		mutex_destroy(&luo_file->mutex);
> +		kfree(luo_file);
> +	}
> +
> +	if (session->files) {
> +		kho_restore_free(session->files);
> +		session->files = NULL;
> +		session->pgcnt = 0;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel.
> + * @session: The incoming session containing the serialized file data from KHO.
> + *
> + * This function is called during the early boot process of the new kernel. It
> + * takes the raw, contiguous memory block of 'struct luo_file_ser' entries,
> + * provided by the previous kernel, and transforms it back into a live,
> + * in-memory linked list of 'struct luo_file' instances.
> + *
> + * For each serialized entry, it performs the following steps:
> + *   1. Reads the 'compatible' string.
> + *   2. Searches the global list of registered file handlers for one that
> + *      matches the compatible string.
> + *   3. Allocates a new 'struct luo_file'.
> + *   4. Populates the new structure with the deserialized data (token, private
> + *      data handle) and links it to the found handler. The 'file' pointer is
> + *      initialized to NULL, as the file has not been retrieved yet.
> + *   5. Adds the new 'struct luo_file' to the session's files_list.
> + *
> + * This prepares the session for userspace, which can later call
> + * luo_retrieve_file() to restore the actual file descriptors.
> + *
> + * Context: Called from session deserialization.
> + */
> +int luo_file_deserialize(struct luo_session *session)
> +{
> +	struct luo_file_ser *file_ser;
> +	u64 i;
> +
> +	lockdep_assert_held(&session->mutex);
> +
> +	if (!session->files)
> +		return 0;

Layering again.

> +
> +	file_ser = session->files;
> +	for (i = 0; i < session->count; i++) {
> +		struct liveupdate_file_handler *fh;
> +		bool handler_found = false;
> +		struct luo_file *luo_file;
> +
> +		list_for_each_entry(fh, &luo_file_handler_list, list) {
> +			if (!strcmp(fh->compatible, file_ser[i].compatible)) {
> +				handler_found = true;
> +				break;
> +			}
> +		}
> +
> +		if (!handler_found) {
> +			pr_warn("No registered handler for compatible '%s'\n",
> +				file_ser[i].compatible);
> +			return -ENOENT;
> +		}
> +
> +		luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
> +		if (!luo_file)
> +			return -ENOMEM;
> +
> +		luo_file->fh = fh;
> +		luo_file->file = NULL;
> +		luo_file->serialized_data = file_ser[i].data;
> +		luo_file->token = file_ser[i].token;
> +		luo_file->retrieved = false;
> +		mutex_init(&luo_file->mutex);
> +		list_add_tail(&luo_file->list, &session->files_list);
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * liveupdate_register_file_handler - Register a file handler with LUO.
> + * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler.
> + * The caller must initialize this structure, including a unique
> + * 'compatible' string and a valid 'fh' callbacks. This function adds the
> + * handler to the global list of supported file handlers.
> + *
> + * Context: Typically called during module initialization for file types that
> + * support live update preservation.
> + *
> + * Return: 0 on success. Negative errno on failure.
> + */
> +int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
> +{
> +	static DEFINE_MUTEX(register_file_handler_lock);
> +	struct liveupdate_file_handler *fh_iter;
> +
> +	if (!liveupdate_enabled())
> +		return -EOPNOTSUPP;
> +
> +	/*
> +	 * Once sessions have been deserialized, file handlers cannot be
> +	 * registered, it is too late.
> +	 */
> +	if (WARN_ON(luo_session_is_deserialized()))
> +		return -EBUSY;
> +
> +	/* Sanity check that all required callbacks are set */
> +	if (!fh->ops->preserve || !fh->ops->unpreserve ||
> +	    !fh->ops->retrieve || !fh->ops->finish) {

Should check can_preserve here, right?

> +		return -EINVAL;
> +	}
> +
> +	guard(mutex)(&register_file_handler_lock);
> +	list_for_each_entry(fh_iter, &luo_file_handler_list, list) {
> +		if (!strcmp(fh_iter->compatible, fh->compatible)) {
> +			pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
> +			       fh->compatible);
> +			return -EEXIST;
> +		}
> +	}
> +
> +	if (!try_module_get(fh->ops->owner))
> +		return -EAGAIN;
> +
> +	INIT_LIST_HEAD(&fh->list);
> +	list_add_tail(&fh->list, &luo_file_handler_list);
> +
> +	return 0;
> +}
> +
> +/**
> + * liveupdate_get_token_outgoing - Get the token for a preserved file.
> + * @s:      The outgoing liveupdate session.
> + * @file:   The file object to search for.
> + * @tokenp: Output parameter for the found token.
> + *
> + * Searches the list of preserved files in an outgoing session for a matching
> + * file object. If found, the corresponding user-provided token is returned.
> + *
> + * This function is intended for in-kernel callers that need to correlate a
> + * file with its liveupdate token.
> + *
> + * Context: Can be called from any context that can acquire the session mutex.
> + * Return: 0 on success, -ENOENT if the file is not preserved in this session.
> + */
> +int liveupdate_get_token_outgoing(struct liveupdate_session *s,
> +				  struct file *file, u64 *tokenp)
> +{
> +	struct luo_session *session = (struct luo_session *)s;
> +	struct luo_file *luo_file;
> +	int err = -ENOENT;
> +
> +	list_for_each_entry(luo_file, &session->files_list, list) {
> +		if (luo_file->file == file) {
> +			if (tokenp)
> +				*tokenp = luo_file->token;
> +			err = 0;
> +			break;
> +		}
> +	}
> +
> +	return err;
> +}
> +
> +/**
> + * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use.
> + * @s:      The incoming liveupdate session (restored from the previous kernel).
> + * @token:  The unique token identifying the file to retrieve.
> + * @filep:  On success, this will be populated with a pointer to the retrieved
> + *          'struct file'.
> + *
> + * Provides a kernel-internal API for other subsystems to retrieve their
> + * preserved files after a live update. This function is a simple wrapper
> + * around luo_retrieve_file(), allowing callers to find a file by its token.
> + *
> + * The operation is idempotent; subsequent calls for the same token will return
> + * a pointer to the same 'struct file' object.
> + *
> + * The caller receives a pointer to the file with a reference incremented. The
> + * file's lifetime is managed by LUO and any userspace file
> + * descriptors. If the caller needs to hold a reference to the file beyond the
> + * immediate scope, it must call get_file() itself.
> + *
> + * Context: Can be called from any context in the new kernel that has a handle
> + *          to a restored session.
> + * Return: 0 on success. Returns -ENOENT if no file with the matching token is
> + *         found, or any other negative errno on failure.
> + */
> +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
> +				 struct file **filep)
> +{
> +	struct luo_session *session = (struct luo_session *)s;
> +
> +	return luo_retrieve_file(session, token, filep);
> +}
> diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
> index 5185ad37a8c1..1a36f2383123 100644
> --- a/kernel/liveupdate/luo_internal.h
> +++ b/kernel/liveupdate/luo_internal.h
> @@ -70,4 +70,13 @@ int luo_session_serialize(void);
>  int luo_session_deserialize(void);
>  bool luo_session_is_deserialized(void);
>  
> +int luo_preserve_file(struct luo_session *session, u64 token, int fd);
> +void luo_file_unpreserve_files(struct luo_session *session);
> +int luo_file_freeze(struct luo_session *session);
> +void luo_file_unfreeze(struct luo_session *session);
> +int luo_retrieve_file(struct luo_session *session, u64 token,
> +		      struct file **filep);
> +int luo_file_finish(struct luo_session *session);
> +int luo_file_deserialize(struct luo_session *session);
> +
>  #endif /* _LINUX_LUO_INTERNAL_H */

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v6 04/20] liveupdate: luo_session: add sessions support
From: Pasha Tatashin @ 2025-11-21 21:30 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: jasonmiu, graf, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, linux,
	linux-kernel, linux-doc, linux-mm, gregkh, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, jgg,
	parav, leonro, witu, hughd, skhawaja, chrisl
In-Reply-To: <mafs08qfz1h3c.fsf@kernel.org>

> >  /*
> >   * The LUO FDT hooks all LUO state for sessions, fds, etc.
> > - * In the root it allso carries "liveupdate-number" 64-bit property that
> > + * In the root it also carries "liveupdate-number" 64-bit property that
>
> Nit: This needs a bit of patch massaging. Patch 2 added the typo, and
> this patch fixes it. It would be better to just update patch 2.

Yeap, this is fixed.


> > + * This structure is located at the beginning of a contiguous block of
> > + * physical memory preserved across the kexec. It provides the necessary
> > + * metadata to interpret the array of session entries that follow.
> > + */
> > +struct luo_session_header_ser {
> > +     u64 pgcnt;
>
> Why do you need pgcnt here? Can't the size be inferred from count? And
> since you use contiguous memory block, the folio will know its page
> count anyway, right? The less we have in the ABI the better IMO.

Right, I had pgcnt because my allocators were using size as an
argument, but we removed that, so pgcnt can also be removed.

> Same for other structures below.
>
> > +     u64 count;
> > +} __packed;
> > +
> > +/**
> > + * struct luo_session_ser - Represents the serialized metadata for a LUO session.
> > + * @name:    The unique name of the session, copied from the `luo_session`
> > + *           structure.
> > + * @files:   The physical address of a contiguous memory block that holds
> > + *           the serialized state of files.
> > + * @pgcnt:   The number of pages occupied by the `files` memory block.
> > + * @count:   The total number of files that were part of this session during
> > + *           serialization. Used for iteration and validation during
> > + *           restoration.
> > + *
> > + * This structure is used to package session-specific metadata for transfer
> > + * between kernels via Kexec Handover. An array of these structures (one per
> > + * session) is created and passed to the new kernel, allowing it to reconstruct
> > + * the session context.
> > + *
> > + * If this structure is modified, LUO_SESSION_COMPATIBLE must be updated.
> > + */
> > +struct luo_session_ser {
> > +     char name[LIVEUPDATE_SESSION_NAME_LENGTH];
> > +     u64 files;
> > +     u64 pgcnt;
> > +     u64 count;
> > +} __packed;
> > +
> >  #endif /* _LINUX_LIVEUPDATE_ABI_LUO_H */
> [...]
> > +/* Create a "struct file" for session */
> > +static int luo_session_getfile(struct luo_session *session, struct file **filep)
> > +{
> > +     char name_buf[128];
> > +     struct file *file;
> > +
> > +     guard(mutex)(&session->mutex);
> > +     snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name);
> > +     file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR);
>
> Nit: You can return the file directly and get rid of filep.

I prefer returning error here.

>
> > +     if (IS_ERR(file))
> > +             return PTR_ERR(file);
> > +
> > +     *filep = file;
> > +
> > +     return 0;
> > +}
> [...]
> > +int __init luo_session_setup_outgoing(void *fdt_out)
> > +{
> > +     struct luo_session_header_ser *header_ser;
> > +     u64 header_ser_pa;
> > +     int err;
> > +
> > +     header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT);
>
> Nit: The naming is a bit confusing here. At first glance I thought this
> was just allocating the header, but it allocates the whole session
> serialization buffer.

I made it a little clearer by adding an "outgoing_buffer" local variable,
and then assigning header_ser to this local variable.

> > +     if (IS_ERR(header_ser))
> > +             return PTR_ERR(header_ser);
> > +     header_ser_pa = virt_to_phys(header_ser);
> > +
> > +     err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME);
> > +     err |= fdt_property_string(fdt_out, "compatible",
> > +                                LUO_FDT_SESSION_COMPATIBLE);
> > +     err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa,
> > +                         sizeof(header_ser_pa));
> > +     err |= fdt_end_node(fdt_out);
> > +
> > +     if (err)
> > +             goto err_unpreserve;
> > +
> > +     header_ser->pgcnt = LUO_SESSION_PGCNT;
> > +     INIT_LIST_HEAD(&luo_session_global.outgoing.list);
> > +     init_rwsem(&luo_session_global.outgoing.rwsem);
> > +     luo_session_global.outgoing.header_ser = header_ser;
> > +     luo_session_global.outgoing.ser = (void *)(header_ser + 1);
> > +     luo_session_global.outgoing.active = true;
> > +
> > +     return 0;
> > +
> > +err_unpreserve:
> > +     kho_unpreserve_free(header_ser);
> > +     return err;
> > +}
> [...]
> > +int luo_session_deserialize(void)
> > +{
> > +     struct luo_session_header *sh = &luo_session_global.incoming;
> > +     int err;
> > +
> > +     if (luo_session_is_deserialized())
> > +             return 0;
> > +
> > +     luo_session_global.deserialized = true;
> > +     if (!sh->active) {
> > +             INIT_LIST_HEAD(&sh->list);
> > +             init_rwsem(&sh->rwsem);
>
> Nit: it would be a bit simpler if LUO init always initialized this. And
> then luo_session_setup_incoming() can fill the list if it has any data.
> Slight reduction in code duplication and mental load.

These are now statically initialized.

>
> > +             return 0;
> > +     }
> > +
> > +     for (int i = 0; i < sh->header_ser->count; i++) {
> > +             struct luo_session *session;
> > +
> > +             session = luo_session_alloc(sh->ser[i].name);
> > +             if (IS_ERR(session)) {
> > +                     pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
> > +                             sh->ser[i].name, session);
> > +                     return PTR_ERR(session);
> > +             }
> > +
> > +             err = luo_session_insert(sh, session);
> > +             if (err) {
> > +                     luo_session_free(session);
> > +                     pr_warn("Failed to insert session [%s] %pe\n",
> > +                             session->name, ERR_PTR(err));
> > +                     return err;
> > +             }
> > +
> > +             session->count = sh->ser[i].count;
> > +             session->files = sh->ser[i].files ? phys_to_virt(sh->ser[i].files) : 0;
> > +             session->pgcnt = sh->ser[i].pgcnt;
> > +     }
> > +
> > +     kho_restore_free(sh->header_ser);
> > +     sh->header_ser = NULL;
> > +     sh->ser = NULL;
> > +
> > +     return 0;
> > +}
> [...]
>
> --
> Regards,
> Pratyush Yadav

Thanks!

Pasha

^ permalink raw reply

* [PATCH v6 1/9] futex: Use explicit sizes for compat_robust_list structs
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

There are two functions for handling robust lists during a task exit:
exit_robust_list() and compat_exit_robust_list(). The first one handles
either 64-bit or 32-bit lists, depending on the kernel bitness.
compat_exit_robust_list() exists only in 64-bit kernels that support
32-bit syscall entry points (also known as compat entry points).

The new syscall set_robust_list2() needs to handle both 64-bit and
32-bit robust lists, regardless of compat entry being enabled, so it
needs to have both functions always available.

In preparation for this, use explicit sizes for the struct members of
compat_robust_list and compat_robust_list_head. Rename the structs and
compat_exit_robust_list() to make clear which bitness they handle.

Keep exit_robust_list() as it is: used to handle the native bit size of
the kernel.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 include/linux/compat.h     | 13 +++----------
 include/linux/futex.h      |  2 +-
 include/linux/sched.h      |  2 +-
 include/uapi/linux/futex.h | 10 ++++++++++
 kernel/futex/core.c        | 20 ++++++++++----------
 kernel/futex/syscalls.c    |  8 ++++----
 6 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 56cebaff0c91..2c5a7f980182 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -385,15 +385,8 @@ struct compat_ifconf {
 	compat_caddr_t  ifcbuf;
 };
 
-struct compat_robust_list {
-	compat_uptr_t			next;
-};
-
-struct compat_robust_list_head {
-	struct compat_robust_list	list;
-	compat_long_t			futex_offset;
-	compat_uptr_t			list_op_pending;
-};
+struct robust_list32;
+struct robust_list_head32;
 
 #ifdef CONFIG_COMPAT_OLD_SIGACTION
 struct compat_old_sigaction {
@@ -672,7 +665,7 @@ asmlinkage long compat_sys_waitid(int, compat_pid_t,
 		struct compat_siginfo __user *, int,
 		struct compat_rusage __user *);
 asmlinkage long
-compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
+compat_sys_set_robust_list(struct robust_list_head32 __user *head,
 			   compat_size_t len);
 asmlinkage long
 compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 9e9750f04980..322851e4a703 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -66,7 +66,7 @@ static inline void futex_init_task(struct task_struct *tsk)
 {
 	tsk->robust_list = NULL;
 #ifdef CONFIG_COMPAT
-	tsk->compat_robust_list = NULL;
+	tsk->robust_list32 = NULL;
 #endif
 	INIT_LIST_HEAD(&tsk->pi_state_list);
 	tsk->pi_state_cache = NULL;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866..76cabfab5b73 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1332,7 +1332,7 @@ struct task_struct {
 #ifdef CONFIG_FUTEX
 	struct robust_list_head __user	*robust_list;
 #ifdef CONFIG_COMPAT
-	struct compat_robust_list_head __user *compat_robust_list;
+	struct robust_list_head32 __user *robust_list32;
 #endif
 	struct list_head		pi_state_list;
 	struct futex_pi_state		*pi_state_cache;
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 7e2744ec8933..86efb089893d 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -153,6 +153,16 @@ struct robust_list_head {
 	struct robust_list __user *list_op_pending;
 };
 
+struct robust_list32 {
+	__u32 next;
+};
+
+struct robust_list_head32 {
+	struct robust_list32	list;
+	__s32			futex_offset;
+	__u32			list_op_pending;
+};
+
 /*
  * Are there any waiters for this robust futex:
  */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 125804fbb5cb..c99d7baab24e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1227,7 +1227,7 @@ static void __user *futex_uaddr(struct robust_list __user *entry,
  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
  */
 static inline int
-compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+fetch_robust_entry32(compat_uptr_t *uentry, struct robust_list __user **entry,
 		   compat_uptr_t __user *head, unsigned int *pi)
 {
 	if (get_user(*uentry, head))
@@ -1245,9 +1245,9 @@ compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **ent
  *
  * We silently return on any sign of list-walking problem.
  */
-static void compat_exit_robust_list(struct task_struct *curr)
+static void exit_robust_list32(struct task_struct *curr)
 {
-	struct compat_robust_list_head __user *head = curr->compat_robust_list;
+	struct robust_list_head32 __user *head = curr->robust_list32;
 	struct robust_list __user *entry, *next_entry, *pending;
 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned int next_pi;
@@ -1259,7 +1259,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
 	 * Fetch the list head (which was registered earlier, via
 	 * sys_set_robust_list()):
 	 */
-	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+	if (fetch_robust_entry32(&uentry, &entry, &head->list.next, &pi))
 		return;
 	/*
 	 * Fetch the relative futex offset:
@@ -1270,7 +1270,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
 	 * Fetch any possibly pending lock-add first, and handle it
 	 * if it exists:
 	 */
-	if (compat_fetch_robust_entry(&upending, &pending,
+	if (fetch_robust_entry32(&upending, &pending,
 			       &head->list_op_pending, &pip))
 		return;
 
@@ -1280,8 +1280,8 @@ static void compat_exit_robust_list(struct task_struct *curr)
 		 * Fetch the next entry in the list before calling
 		 * handle_futex_death:
 		 */
-		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
-			(compat_uptr_t __user *)&entry->next, &next_pi);
+		rc = fetch_robust_entry32(&next_uentry, &next_entry,
+			(u32 __user *)&entry->next, &next_pi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
@@ -1413,9 +1413,9 @@ static void futex_cleanup(struct task_struct *tsk)
 	}
 
 #ifdef CONFIG_COMPAT
-	if (unlikely(tsk->compat_robust_list)) {
-		compat_exit_robust_list(tsk);
-		tsk->compat_robust_list = NULL;
+	if (unlikely(tsk->robust_list32)) {
+		exit_robust_list32(tsk);
+		tsk->robust_list32 = NULL;
 	}
 #endif
 
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 880c9bf2f315..1de8ff230d54 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -43,7 +43,7 @@ static inline void __user *futex_task_robust_list(struct task_struct *p, bool co
 {
 #ifdef CONFIG_COMPAT
 	if (compat)
-		return p->compat_robust_list;
+		return p->robust_list32;
 #endif
 	return p->robust_list;
 }
@@ -468,13 +468,13 @@ SYSCALL_DEFINE4(futex_requeue,
 
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(set_robust_list,
-		struct compat_robust_list_head __user *, head,
+		struct robust_list_head32 __user *, head,
 		compat_size_t, len)
 {
 	if (unlikely(len != sizeof(*head)))
 		return -EINVAL;
 
-	current->compat_robust_list = head;
+	current->robust_list32 = head;
 
 	return 0;
 }
@@ -483,7 +483,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 			compat_uptr_t __user *, head_ptr,
 			compat_size_t __user *, len_ptr)
 {
-	struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true);
+	struct robust_list_head32 __user *head = futex_get_robust_list_common(pid, true);
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);

-- 
2.52.0


^ permalink raw reply related

* [PATCH v6 0/9] futex: Create {set,get}_robust_list2() syscalls
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida

Hello,

This version is a complete rewrite of the syscall (thanks Thomas for the
suggestions!). 

 * Use case

The use-case for the new syscalls is detailed in the last patch version:

  https://lore.kernel.org/lkml/20250626-tonyk-robust_futex-v5-0-179194dbde8f@igalia.com

 * The syscall interface

Documented at patches 3/9 "futex: Create set_robust_list2() syscall" and
4/9 "futex: Create get_robust_list2() syscall".

 * Testing

I expanded the current robust list selftest to use the new interface,
and also ported the original syscall to use the new syscall internals,
and everything survived the tests.

 * Changelog

Changes from v5:
 - Complete interface rewrite, there are so many changes but the main
   ones are the following points
 - Array of robust lists now has a static size, allocated once during the
   first usage of the list
 - Now that the list of robust lists has a fixed size, I removed the
   logic of having a command for creating a new index on the list. To
   simplify things for everyone, userspace just needs to call
   set_robust_list2(head, 32-bit/64-bit type, index).
 - Created get_robust_list2()
 - The new code can be better integrated with the original interface
 - v5: https://lore.kernel.org/r/20250626-tonyk-robust_futex-v5-0-179194dbde8f@igalia.com

Feedback is very welcome!

---
André Almeida (9):
      futex: Use explicit sizes for compat_robust_list structs
      futex: Make exit_robust_list32() unconditionally available for 64-bit kernels
      futex: Create set_robust_list2() syscall
      futex: Create get_robust_list2() syscall
      futex: Wire up set_robust_list2 syscall
      futex: Wire up get_robust_list2 syscall
      selftests/futex: Expand for set_robust_list2()
      selftests/futex: Expand for get_robust_list2()
      futex: Use new robust list API internally

 arch/alpha/kernel/syscalls/syscall.tbl             |   2 +
 arch/arm/tools/syscall.tbl                         |   2 +
 arch/m68k/kernel/syscalls/syscall.tbl              |   2 +
 arch/microblaze/kernel/syscalls/syscall.tbl        |   2 +
 arch/mips/kernel/syscalls/syscall_n32.tbl          |   2 +
 arch/mips/kernel/syscalls/syscall_n64.tbl          |   2 +
 arch/mips/kernel/syscalls/syscall_o32.tbl          |   2 +
 arch/parisc/kernel/syscalls/syscall.tbl            |   2 +
 arch/powerpc/kernel/syscalls/syscall.tbl           |   2 +
 arch/s390/kernel/syscalls/syscall.tbl              |   2 +
 arch/sh/kernel/syscalls/syscall.tbl                |   2 +
 arch/sparc/kernel/syscalls/syscall.tbl             |   2 +
 arch/x86/entry/syscalls/syscall_32.tbl             |   2 +
 arch/x86/entry/syscalls/syscall_64.tbl             |   2 +
 arch/xtensa/kernel/syscalls/syscall.tbl            |   2 +
 include/linux/compat.h                             |  13 +-
 include/linux/futex.h                              |  30 +-
 include/linux/sched.h                              |   6 +-
 include/uapi/asm-generic/unistd.h                  |   7 +-
 include/uapi/linux/futex.h                         |  26 ++
 kernel/futex/core.c                                | 140 ++++--
 kernel/futex/syscalls.c                            | 134 +++++-
 kernel/sys_ni.c                                    |   2 +
 scripts/syscall.tbl                                |   1 +
 .../selftests/futex/functional/robust_list.c       | 504 +++++++++++++++++++--
 25 files changed, 788 insertions(+), 105 deletions(-)
---
base-commit: c42ba5a87bdccbca11403b7ca8bad1a57b833732
change-id: 20250225-tonyk-robust_futex-60adeedac695

Best regards,
-- 
André Almeida <andrealmeid@igalia.com>


^ permalink raw reply

* [PATCH v6 2/9] futex: Make exit_robust_list32() unconditionally available for 64-bit kernels
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

The new syscall set_robust_list2() needs to handle both 64-bit and
32-bit robust lists, but not every 64-bit platform has compat entry
points. Make exit_robust_list32() unconditionally available for 64-bit
kernels regardless of having a compat configuration.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 kernel/futex/core.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index c99d7baab24e..136639897ff9 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -31,7 +31,6 @@
  *  "The futexes are also cursed."
  *  "But they come in a choice of three flavours!"
  */
-#include <linux/compat.h>
 #include <linux/jhash.h>
 #include <linux/pagemap.h>
 #include <linux/debugfs.h>
@@ -1213,12 +1212,12 @@ static void exit_robust_list(struct task_struct *curr)
 	}
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_64BIT
 static void __user *futex_uaddr(struct robust_list __user *entry,
 				compat_long_t futex_offset)
 {
-	compat_uptr_t base = ptr_to_compat(entry);
-	void __user *uaddr = compat_ptr(base + futex_offset);
+	u32 base = (u32)(unsigned long)(entry);
+	void __user *uaddr = (void __user *)(unsigned long)(base + futex_offset);
 
 	return uaddr;
 }
@@ -1227,13 +1226,13 @@ static void __user *futex_uaddr(struct robust_list __user *entry,
  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
  */
 static inline int
-fetch_robust_entry32(compat_uptr_t *uentry, struct robust_list __user **entry,
-		   compat_uptr_t __user *head, unsigned int *pi)
+fetch_robust_entry32(u32 *uentry, struct robust_list __user **entry,
+		     u32 __user *head, unsigned int *pi)
 {
 	if (get_user(*uentry, head))
 		return -EFAULT;
 
-	*entry = compat_ptr((*uentry) & ~1);
+	*entry = (void __user *)(unsigned long)((*uentry) & ~1);
 	*pi = (unsigned int)(*uentry) & 1;
 
 	return 0;
@@ -1251,8 +1250,8 @@ static void exit_robust_list32(struct task_struct *curr)
 	struct robust_list __user *entry, *next_entry, *pending;
 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned int next_pi;
-	compat_uptr_t uentry, next_uentry, upending;
-	compat_long_t futex_offset;
+	u32 uentry, next_uentry, upending;
+	s32 futex_offset;
 	int rc;
 
 	/*
@@ -1412,7 +1411,7 @@ static void futex_cleanup(struct task_struct *tsk)
 		tsk->robust_list = NULL;
 	}
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_64BIT
 	if (unlikely(tsk->robust_list32)) {
 		exit_robust_list32(tsk);
 		tsk->robust_list32 = NULL;

-- 
2.52.0


^ permalink raw reply related

* [PATCH v6 3/9] futex: Create set_robust_list2() syscall
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

Emulators (like FEX-Emu, to run x86 apps on top of Aarch64) have two
special needs regarding robust lists: to be able to register more than
one robust list, one for the app being emulated and one for the emulator
itself; and to be able to walk 32-bit robust lists on a 64-bit platform
without compat entry points.

The current syscall allows for one robust list per task (on x86-64, it
can have two if compat is enabled) and on Aarch64 there's no way to
parse a 32-bit robust list. The current syscall cannot be expanded to
solve both needs, so create a new syscall, set_robust_list2() with the
following signature:

  sys_set_robust_list2(struct robust_list_head *head, unsigned int index,
		       unsigned int cmd, unsigned int flags)

The new syscall allows setting multiple lists per task, of either 64-bit
or 32-bit type.

 - `*head` is the same structure used in the current syscall.
 - `index` is the index of the list to be set with `head`.
 - `cmd` defines the operation to perform:
   - `FUTEX_ROBUST_LIST_CMD_SET_64` set a 64-bit robust list at `index`
   - `FUTEX_ROBUST_LIST_CMD_SET_32` set a 32-bit robust list at `index`
   - `FUTEX_ROBUST_LIST_CMD_LIST_LIMIT` get the limit of lists per task
 - `flags` is currently unused but can be used to expand the interface

Setting an index twice overwrites the last instance.

The array of lists is dynamically allocated on first use, but has a
fixed size determined by the kernel. 8 slots are more than enough to
cover the target use case and leave room for more use cases. The command
for getting the list limit allows userspace to check whether the kernel
ever expands this list. The first two slots are reserved for the kernel,
to store the robust_list_heads of the original syscall.

The array of lists is destroyed only during task exit.

The `FUTEX_ROBUST_LIST_CMD_SET_64` operation is only available for
64-bit kernels. In such kernels, lists created with
`FUTEX_ROBUST_LIST_CMD_SET_32` are marked with
`FUTEX_ROBUST_LIST_ENTRY_32BIT` and the kernel handles it with a special
function exit_robust_list32() to be able to walk in a list of 32-bit
pointers.

For 32-bit kernels, there's no special function available, as all user
lists and list-handling functions have the same bitness.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 include/linux/futex.h      |  26 +++++++++++
 include/linux/sched.h      |   1 +
 include/uapi/linux/futex.h |  16 +++++++
 kernel/futex/core.c        | 111 ++++++++++++++++++++++++++++++++++++++++++---
 kernel/futex/syscalls.c    |  41 +++++++++++++++++
 5 files changed, 188 insertions(+), 7 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 322851e4a703..3dba249bcd32 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/ktime.h>
 #include <linux/mm_types.h>
+#include <linux/compat.h>
 
 #include <uapi/linux/futex.h>
 
@@ -62,12 +63,35 @@ enum {
 	FUTEX_STATE_DEAD,
 };
 
+#define FUTEX_ROBUST_LIST_NATIVE_IDX	0
+#define FUTEX_ROBUST_LIST_COMPAT_IDX	1
+#define FUTEX_ROBUST_LIST2_IDX		2
+#define FUTEX_ROBUST_LISTS_PER_USER	8
+#define FUTEX_ROBUST_LIST2_MAX_IDX	(FUTEX_ROBUST_LIST2_IDX + FUTEX_ROBUST_LISTS_PER_USER)
+
+/*
+ * List entries without _32BIT flag are using the native machine size
+ */
+#define FUTEX_ROBUST_LIST_ENTRY_INUSE	0x1UL
+#define FUTEX_ROBUST_LIST_ENTRY_32BIT	0x2UL
+#define FUTEX_ROBUST_LIST_ENTRY_MASK	(~0x3UL)
+
+static inline bool futex_in_32bit_syscall(void)
+{
+#ifdef CONFIG_X86
+	return !IS_ENABLED(CONFIG_64BIT) || in_32bit_syscall();
+#else
+	return !IS_ENABLED(CONFIG_64BIT);
+#endif
+}
+
 static inline void futex_init_task(struct task_struct *tsk)
 {
 	tsk->robust_list = NULL;
 #ifdef CONFIG_COMPAT
 	tsk->robust_list32 = NULL;
 #endif
+	tsk->futex_robust_lists = NULL;
 	INIT_LIST_HEAD(&tsk->pi_state_list);
 	tsk->pi_state_cache = NULL;
 	tsk->futex_state = FUTEX_STATE_OK;
@@ -82,6 +106,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
 int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4);
 
+int futex_robust_list_set(uintptr_t head, enum robust_list2_cmd cmd, unsigned int index);
+
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 int futex_hash_allocate_default(void);
 void futex_hash_free(struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 76cabfab5b73..de2f3cbe4953 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1331,6 +1331,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_FUTEX
 	struct robust_list_head __user	*robust_list;
+	uintptr_t			*futex_robust_lists;
 #ifdef CONFIG_COMPAT
 	struct robust_list_head32 __user *robust_list32;
 #endif
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 86efb089893d..2ba5c0c3bb59 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -163,6 +163,22 @@ struct robust_list_head32 {
 	__u32			list_op_pending;
 };
 
+/*
+ * Commands for set_robust_list2 syscall
+ */
+enum robust_list2_cmd {
+	FUTEX_ROBUST_LIST_CMD_SET_64,
+	FUTEX_ROBUST_LIST_CMD_SET_32,
+	FUTEX_ROBUST_LIST_CMD_LIST_LIMIT,
+	FUTEX_ROBUST_LIST_CMD_USER_MAX,
+
+	/*
+	 * Kernel internal, rejected for user space
+	 */
+	FUTEX_ROBUST_LIST_SET_NATIVE = 128,
+	FUTEX_ROBUST_LIST_SET_COMPAT,
+};
+
 /*
  * Are there any waiters for this robust futex:
  */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 136639897ff9..14d8a7176367 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -71,6 +71,57 @@ struct futex_private_hash {
 	struct futex_hash_bucket queues[];
 };
 
+int futex_robust_list_set(uintptr_t head, enum robust_list2_cmd cmd,
+			  unsigned int index)
+{
+	uintptr_t entry = FUTEX_ROBUST_LIST_ENTRY_INUSE;
+	uintptr_t *rl = current->futex_robust_lists;
+
+	if (!rl) {
+		rl = kcalloc(FUTEX_ROBUST_LIST2_MAX_IDX, sizeof(*rl), GFP_KERNEL);
+		if (!rl)
+			return -ENOMEM;
+
+		scoped_guard(mutex, &current->futex_exit_mutex) {
+			/* check if another thread set the list before us */
+			if (current->futex_robust_lists) {
+				kfree(rl);
+				rl = current->futex_robust_lists;
+			} else {
+				current->futex_robust_lists = rl;
+			}
+		}
+
+	}
+
+	switch (cmd) {
+	case FUTEX_ROBUST_LIST_CMD_SET_64:
+		if (futex_in_32bit_syscall())
+			return -EINVAL;
+		break;
+	case FUTEX_ROBUST_LIST_CMD_SET_32:
+		entry |= FUTEX_ROBUST_LIST_ENTRY_32BIT;
+		break;
+	case FUTEX_ROBUST_LIST_SET_NATIVE:
+		index = FUTEX_ROBUST_LIST_NATIVE_IDX;
+		break;
+	case FUTEX_ROBUST_LIST_SET_COMPAT:
+		if (!IS_ENABLED(CONFIG_64BIT))
+			return -EINVAL;
+		index = FUTEX_ROBUST_LIST_COMPAT_IDX;
+		entry |= FUTEX_ROBUST_LIST_ENTRY_32BIT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	entry |= head;
+	scoped_guard(mutex, &current->futex_exit_mutex)
+	rl[index] = entry;
+
+	return 0;
+}
+
 /*
  * Fault injections for futexes.
  */
@@ -1150,9 +1201,8 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
-static void exit_robust_list(struct task_struct *curr)
+static void exit_robust_list(struct task_struct *curr, struct robust_list_head __user *head)
 {
-	struct robust_list_head __user *head = curr->robust_list;
 	struct robust_list __user *entry, *next_entry, *pending;
 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned int next_pi;
@@ -1244,9 +1294,8 @@ fetch_robust_entry32(u32 *uentry, struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
-static void exit_robust_list32(struct task_struct *curr)
+static void exit_robust_list32(struct task_struct *curr, struct robust_list_head32 __user *head)
 {
-	struct robust_list_head32 __user *head = curr->robust_list32;
 	struct robust_list __user *entry, *next_entry, *pending;
 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned int next_pi;
@@ -1311,7 +1360,15 @@ static void exit_robust_list32(struct task_struct *curr)
 		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
 	}
 }
-#endif
+
+#else
+
+static void exit_robust_list32(struct task_struct *curr, struct robust_list_head32 __user *head)
+{
+	pr_crit("32-bit kernel should never call %s", __func__);
+}
+
+#endif /* CONFIG_64BIT */
 
 #ifdef CONFIG_FUTEX_PI
 
@@ -1404,20 +1461,60 @@ static void exit_pi_state_list(struct task_struct *curr)
 static inline void exit_pi_state_list(struct task_struct *curr) { }
 #endif
 
+static void exit_robust_lists(struct task_struct *tsk)
+{
+	uintptr_t *rl = tsk->futex_robust_lists;
+
+	tsk->futex_robust_lists = NULL;
+
+	for (unsigned int idx = 0; idx < FUTEX_ROBUST_LIST2_MAX_IDX; idx++) {
+		uintptr_t entry = rl[idx];
+
+		if (!(entry & FUTEX_ROBUST_LIST_ENTRY_MASK))
+			continue;
+
+		/*
+		 * If the list type is the same as the kernel bitness, always
+		 * calls exit_robust_list(). exit_robust_list32() is only for
+		 * 32-bit lists in a 64-bit kernel.
+		 */
+		if (IS_ENABLED(CONFIG_64BIT) && (entry & FUTEX_ROBUST_LIST_ENTRY_32BIT)) {
+			struct robust_list_head32 __user *head;
+
+			entry &= FUTEX_ROBUST_LIST_ENTRY_MASK;
+
+			head = (__force struct robust_list_head32 __user *)entry;
+			exit_robust_list32(tsk, head);
+		} else {
+			struct robust_list_head __user *head;
+
+			entry &= FUTEX_ROBUST_LIST_ENTRY_MASK;
+
+			head = (__force struct robust_list_head __user *)entry;
+			exit_robust_list(tsk, head);
+		}
+	}
+
+	kfree(rl);
+}
+
 static void futex_cleanup(struct task_struct *tsk)
 {
 	if (unlikely(tsk->robust_list)) {
-		exit_robust_list(tsk);
+		exit_robust_list(tsk, tsk->robust_list);
 		tsk->robust_list = NULL;
 	}
 
 #ifdef CONFIG_64BIT
 	if (unlikely(tsk->robust_list32)) {
-		exit_robust_list32(tsk);
+		exit_robust_list32(tsk, tsk->robust_list32);
 		tsk->robust_list32 = NULL;
 	}
 #endif
 
+	if (unlikely(tsk->futex_robust_lists))
+		exit_robust_lists(tsk);
+
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
 }
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 1de8ff230d54..0b7fa88aa34c 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -109,6 +109,47 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 	return put_user(head, head_ptr);
 }
 
+SYSCALL_DEFINE4(set_robust_list2, struct robust_list_head *, head, unsigned int,
+		index, unsigned int, cmd, unsigned int, flags)
+{
+	uintptr_t entry = (__force uintptr_t)head;
+	size_t align = sizeof(u32);
+
+	if (flags)
+		return -EINVAL;
+
+	if (cmd >= FUTEX_ROBUST_LIST_CMD_USER_MAX)
+		return -EINVAL;
+
+	if (index >= FUTEX_ROBUST_LISTS_PER_USER)
+		return -EINVAL;
+
+	/*
+	 * The first two indexes are reserved for the kernel to be used with the
+	 * legacy syscall, so we hide them from userspace.
+	 *
+	 * We map [0, FUTEX_ROBUST_LISTS_PER_USER) to
+	 *  [FUTEX_ROBUST_LIST2_IDX, FUTEX_ROBUST_LIST2_MAX_IDX)
+	 */
+	index += FUTEX_ROBUST_LIST2_IDX;
+
+	switch (cmd) {
+	case FUTEX_ROBUST_LIST_CMD_SET_64:
+		if (futex_in_32bit_syscall())
+			return -EOPNOTSUPP;
+		align = sizeof(u64);
+		fallthrough;
+	case FUTEX_ROBUST_LIST_CMD_SET_32:
+		if (entry % align)
+			return -EINVAL;
+		return futex_robust_list_set(entry, cmd, index);
+	case FUTEX_ROBUST_LIST_CMD_LIST_LIMIT:
+		return FUTEX_ROBUST_LISTS_PER_USER;
+	}
+
+	return -EINVAL;
+}
+
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {

-- 
2.52.0


^ permalink raw reply related

* [PATCH v6 4/9] futex: Create get_robust_list2() syscall
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

As in the original robust list interface, to pair with
set_robust_list2(), create a get_robust_list2() syscall with the
following signature:

  get_robust_list2(int pid, void __user **head_ptr, unsigned int index,
		   unsigned int flags)

  - `pid` selects which task's list should be returned. If it is 0, the
    list of the calling task is returned.
  - `index` is the index of the list to get
  - `flags` is unused but can be used for expanding the interface

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
For some reason I wasn't able to use put_user() for 32-bit lists; it
kept corrupting the value, due to a wrong write size I believe.
copy_to_user() worked fine nonetheless.
---
 kernel/futex/syscalls.c | 61 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 4 deletions(-)

diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 0b7fa88aa34c..f730d16632fc 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -48,7 +48,7 @@ static inline void __user *futex_task_robust_list(struct task_struct *p, bool co
 	return p->robust_list;
 }
 
-static void __user *futex_get_robust_list_common(int pid, bool compat)
+static void __user *futex_get_robust_list_common(int pid, bool compat, int index)
 {
 	struct task_struct *p = current;
 	void __user *head;
@@ -75,7 +75,15 @@ static void __user *futex_get_robust_list_common(int pid, bool compat)
 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
 		goto err_unlock;
 
-	head = futex_task_robust_list(p, compat);
+	if (index >= 0) {
+		scoped_guard(mutex, &p->futex_exit_mutex) {
+			uintptr_t *rl = p->futex_robust_lists;
+
+			head = rl ? (void __user *) rl[index] : NULL;
+		}
+	} else {
+		head = futex_task_robust_list(p, compat);
+	}
 
 	up_read(&p->signal->exec_update_lock);
 	put_task_struct(p);
@@ -99,7 +107,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 		struct robust_list_head __user * __user *, head_ptr,
 		size_t __user *, len_ptr)
 {
-	struct robust_list_head __user *head = futex_get_robust_list_common(pid, false);
+	struct robust_list_head __user *head = futex_get_robust_list_common(pid, false, -1);
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
@@ -150,6 +158,51 @@ SYSCALL_DEFINE4(set_robust_list2, struct robust_list_head *, head, unsigned int,
 	return -EINVAL;
 }
 
+SYSCALL_DEFINE4(get_robust_list2, int, pid,
+		void __user * __user *, head_ptr,
+		unsigned int, index, unsigned int, flags)
+{
+	void __user *entry_ptr;
+	uintptr_t entry;
+
+	if (index >= FUTEX_ROBUST_LISTS_PER_USER)
+		return -EINVAL;
+
+	if (flags)
+		return -EINVAL;
+
+	/*
+	 * The first two indexes are reserved for the kernel to be used with the
+	 * legacy syscall, so we hide them from userspace.
+	 *
+	 * We map [0, FUTEX_ROBUST_LISTS_PER_USER) to
+	 *  [FUTEX_ROBUST_LIST2_IDX, FUTEX_ROBUST_LIST2_MAX_IDX)
+	 */
+	index += FUTEX_ROBUST_LIST2_IDX;
+
+	entry_ptr = futex_get_robust_list_common(pid, false, index);
+	if (IS_ERR(entry_ptr))
+		return PTR_ERR(entry_ptr);
+
+	entry = (uintptr_t) entry_ptr;
+
+	if (entry & FUTEX_ROBUST_LIST_ENTRY_32BIT) {
+		entry &= FUTEX_ROBUST_LIST_ENTRY_MASK;
+
+		if (copy_to_user(head_ptr, &entry, sizeof(u32)))
+			return -EFAULT;
+
+		return 0;
+	} else {
+		struct robust_list_head *head;
+
+		entry &= FUTEX_ROBUST_LIST_ENTRY_MASK;
+		head = (__force struct robust_list_head __user *)entry;
+
+		return put_user(head, head_ptr);
+	}
+}
+
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -524,7 +577,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 			compat_uptr_t __user *, head_ptr,
 			compat_size_t __user *, len_ptr)
 {
-	struct robust_list_head32 __user *head = futex_get_robust_list_common(pid, true);
+	struct robust_list_head32 __user *head = futex_get_robust_list_common(pid, true, -1);
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);

-- 
2.52.0


^ permalink raw reply related

* [PATCH v6 5/9] futex: Wire up set_robust_list2 syscall
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

Wire up the new set_robust_list2 syscall in all available architectures.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 arch/alpha/kernel/syscalls/syscall.tbl      | 1 +
 arch/arm/tools/syscall.tbl                  | 1 +
 arch/m68k/kernel/syscalls/syscall.tbl       | 1 +
 arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   | 1 +
 arch/parisc/kernel/syscalls/syscall.tbl     | 1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    | 1 +
 arch/s390/kernel/syscalls/syscall.tbl       | 1 +
 arch/sh/kernel/syscalls/syscall.tbl         | 1 +
 arch/sparc/kernel/syscalls/syscall.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_32.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl      | 1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     | 1 +
 include/uapi/asm-generic/unistd.h           | 5 ++++-
 kernel/sys_ni.c                             | 1 +
 scripts/syscall.tbl                         | 1 +
 18 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 16dca28ebf17..d0cb7b902cc6 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -509,3 +509,4 @@
 577	common	open_tree_attr			sys_open_tree_attr
 578	common	file_getattr			sys_file_getattr
 579	common	file_setattr			sys_file_setattr
+580	common	set_robust_list2		sys_set_robust_list2
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b07e699aaa3c..910e6e14ccf0 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -484,3 +484,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	set_robust_list2		sys_set_robust_list2
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index f41d38dfbf13..eee3f320483d 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -469,3 +469,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common  set_robust_list2		sys_set_robust_list2
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 580af574fe73..6c69d8ebbc38 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -475,3 +475,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	set_robust_list2		sys_set_robust_list2
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index d824ffe9a014..f70db3741b0e 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -408,3 +408,4 @@
 467	n32	open_tree_attr			sys_open_tree_attr
 468	n32	file_getattr			sys_file_getattr
 469	n32	file_setattr			sys_file_setattr
+470	n32	set_robust_list2		sys_set_robust_list2
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 7a7049c2c307..9480488f9495 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -384,3 +384,4 @@
 467	n64	open_tree_attr			sys_open_tree_attr
 468	n64	file_getattr			sys_file_getattr
 469	n64	file_setattr			sys_file_setattr
+470	n64	set_robust_list2		sys_set_robust_list2
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index d330274f0601..2761c9cd8946 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -457,3 +457,4 @@
 467	o32	open_tree_attr			sys_open_tree_attr
 468	o32	file_getattr			sys_file_getattr
 469	o32	file_setattr			sys_file_setattr
+470	o32	set_robust_list2		sys_set_robust_list2
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 88a788a7b18d..eb37fda5c48f 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -468,3 +468,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	set_robust_list2		sys_set_robust_list2
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index b453e80dfc00..472bebec449d 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -560,3 +560,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	set_robust_list2		sys_set_robust_list2
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 8a6744d658db..ba7fac304941 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -472,3 +472,4 @@
 467  common	open_tree_attr		sys_open_tree_attr		sys_open_tree_attr
 468  common	file_getattr		sys_file_getattr		sys_file_getattr
 469  common	file_setattr		sys_file_setattr		sys_file_setattr
+470  common	set_robust_list2	sys_set_robust_list2		sys_set_robust_list2
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 5e9c9eff5539..c05c94a742be 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	set_robust_list2		sys_set_robust_list2
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index ebb7d06d1044..3a59f3008325 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	set_robust_list2		sys_set_robust_list2
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4877e16da69a..e9d6e1a1d777 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -475,3 +475,4 @@
 467	i386	open_tree_attr		sys_open_tree_attr
 468	i386	file_getattr		sys_file_getattr
 469	i386	file_setattr		sys_file_setattr
+470	i386	set_robust_list2	sys_set_robust_list2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ced2a1deecd7..8fdcf090300d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -394,6 +394,7 @@
 467	common	open_tree_attr		sys_open_tree_attr
 468	common	file_getattr		sys_file_getattr
 469	common	file_setattr		sys_file_setattr
+470	common	set_robust_list2	sys_set_robust_list2
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 374e4cb788d8..d7bb6b9104dd 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	set_robust_list2		sys_set_robust_list2
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 04e0077fb4c9..44fc87287983 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -858,8 +858,11 @@ __SYSCALL(__NR_file_getattr, sys_file_getattr)
 #define __NR_file_setattr 469
 __SYSCALL(__NR_file_setattr, sys_file_setattr)
 
+#define __NR_set_robust_list2 470
+__SYSCALL(__NR_set_robust_list2, sys_set_robust_list2)
+
 #undef __NR_syscalls
-#define __NR_syscalls 470
+#define __NR_syscalls 471
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bf5d05c635ff..0ca2cfe69b11 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -172,6 +172,7 @@ COND_SYSCALL_COMPAT(fadvise64_64);
 COND_SYSCALL(lsm_get_self_attr);
 COND_SYSCALL(lsm_set_self_attr);
 COND_SYSCALL(lsm_list_modules);
+COND_SYSCALL(set_robust_list2);
 
 /* CONFIG_MMU only */
 COND_SYSCALL(swapon);
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index d1ae5e92c615..58c334aa8922 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -410,3 +410,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	set_robust_list2		sys_set_robust_list2

-- 
2.52.0


^ permalink raw reply related

* [PATCH v6 6/9] futex: Wire up get_robust_list2 syscall
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

Wire up the new get_robust_list2 syscall in all available architectures.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 arch/alpha/kernel/syscalls/syscall.tbl      | 1 +
 arch/arm/tools/syscall.tbl                  | 1 +
 arch/m68k/kernel/syscalls/syscall.tbl       | 1 +
 arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   | 1 +
 arch/parisc/kernel/syscalls/syscall.tbl     | 1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    | 1 +
 arch/s390/kernel/syscalls/syscall.tbl       | 1 +
 arch/sh/kernel/syscalls/syscall.tbl         | 1 +
 arch/sparc/kernel/syscalls/syscall.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_32.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl      | 1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     | 1 +
 include/uapi/asm-generic/unistd.h           | 4 +++-
 kernel/sys_ni.c                             | 1 +
 17 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index d0cb7b902cc6..b4a42beda6db 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -510,3 +510,4 @@
 578	common	file_getattr			sys_file_getattr
 579	common	file_setattr			sys_file_setattr
 580	common	set_robust_list2		sys_set_robust_list2
+581	common	get_robust_list2		sys_get_robust_list2
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 910e6e14ccf0..d4a4d8446cb0 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -485,3 +485,4 @@
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
 470	common	set_robust_list2		sys_set_robust_list2
+471	common	get_robust_list2		sys_get_robust_list2
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index eee3f320483d..c2f1c5a3313c 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -470,3 +470,4 @@
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
 470	common  set_robust_list2		sys_set_robust_list2
+471	common  get_robust_list2		sys_get_robust_list2
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 6c69d8ebbc38..1389dd194eec 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -476,3 +476,4 @@
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
 470	common	set_robust_list2		sys_set_robust_list2
+471	common	get_robust_list2		sys_get_robust_list2
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index f70db3741b0e..e149d2ddbc2f 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -409,3 +409,4 @@
 468	n32	file_getattr			sys_file_getattr
 469	n32	file_setattr			sys_file_setattr
 470	n32	set_robust_list2		sys_set_robust_list2
+471	n32	get_robust_list2		sys_get_robust_list2
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 9480488f9495..7ddddc89a751 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -385,3 +385,4 @@
 468	n64	file_getattr			sys_file_getattr
 469	n64	file_setattr			sys_file_setattr
 470	n64	set_robust_list2		sys_set_robust_list2
+471	n64	get_robust_list2		sys_get_robust_list2
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 2761c9cd8946..c0a5ebafed1a 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -458,3 +458,4 @@
 468	o32	file_getattr			sys_file_getattr
 469	o32	file_setattr			sys_file_setattr
 470	o32	set_robust_list2		sys_set_robust_list2
+471	o32	get_robust_list2		sys_get_robust_list2
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index eb37fda5c48f..4c6cb64ec113 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -469,3 +469,4 @@
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
 470	common	set_robust_list2		sys_set_robust_list2
+471	common	get_robust_list2		sys_get_robust_list2
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 472bebec449d..1475fa6b3ee3 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -561,3 +561,4 @@
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
 470	common	set_robust_list2		sys_set_robust_list2
+471	common	get_robust_list2		sys_get_robust_list2
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index ba7fac304941..b8161ee922ef 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
 468  common	file_getattr		sys_file_getattr		sys_file_getattr
 469  common	file_setattr		sys_file_setattr		sys_file_setattr
 470  common	set_robust_list2	sys_set_robust_list2		sys_set_robust_list2
+471  common	get_robust_list2	sys_get_robust_list2		sys_get_robust_list2
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index c05c94a742be..566baa152634 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -474,3 +474,4 @@
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
 470	common	set_robust_list2		sys_set_robust_list2
+471	common	get_robust_list2		sys_get_robust_list2
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 3a59f3008325..fb3844c17711 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -516,3 +516,4 @@
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
 470	common	set_robust_list2		sys_set_robust_list2
+471	common	get_robust_list2		sys_get_robust_list2
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index e9d6e1a1d777..0df93458ef37 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -476,3 +476,4 @@
 468	i386	file_getattr		sys_file_getattr
 469	i386	file_setattr		sys_file_setattr
 470	i386	set_robust_list2	sys_set_robust_list2
+471	i386	get_robust_list2	sys_get_robust_list2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 8fdcf090300d..e7fdcc3d6e52 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -395,6 +395,7 @@
 468	common	file_getattr		sys_file_getattr
 469	common	file_setattr		sys_file_setattr
 470	common	set_robust_list2	sys_set_robust_list2
+471	common	get_robust_list2	sys_get_robust_list2
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index d7bb6b9104dd..bd63dbc78c0e 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
 470	common	set_robust_list2		sys_set_robust_list2
+471	common	get_robust_list2		sys_get_robust_list2
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 44fc87287983..9539e893c9ac 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -860,9 +860,11 @@ __SYSCALL(__NR_file_setattr, sys_file_setattr)
 
 #define __NR_set_robust_list2 470
 __SYSCALL(__NR_set_robust_list2, sys_set_robust_list2)
+#define __NR_get_robust_list2 471
+__SYSCALL(__NR_get_robust_list2, sys_get_robust_list2)
 
 #undef __NR_syscalls
-#define __NR_syscalls 471
+#define __NR_syscalls 472
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0ca2cfe69b11..0a7f7634446c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -173,6 +173,7 @@ COND_SYSCALL(lsm_get_self_attr);
 COND_SYSCALL(lsm_set_self_attr);
 COND_SYSCALL(lsm_list_modules);
 COND_SYSCALL(set_robust_list2);
+COND_SYSCALL(get_robust_list2);
 
 /* CONFIG_MMU only */
 COND_SYSCALL(swapon);

-- 
2.52.0


^ permalink raw reply related

* [PATCH v6 7/9] selftests/futex: Expand for set_robust_list2()
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

Reuse the same selftest for the original set_robust_list() syscall for
the new set_robust_list2() syscall. Use kselftest variants feature to
run the relevant tests for both interfaces. Create new test cases for
checking invalid parameters, the ability to correctly set multiple lists
for the same task, and to use 32-bit lists in a 64-bit task.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 .../selftests/futex/functional/robust_list.c       | 409 +++++++++++++++++++--
 1 file changed, 387 insertions(+), 22 deletions(-)

diff --git a/tools/testing/selftests/futex/functional/robust_list.c b/tools/testing/selftests/futex/functional/robust_list.c
index e7d1254e18ca..bf47e9ab2951 100644
--- a/tools/testing/selftests/futex/functional/robust_list.c
+++ b/tools/testing/selftests/futex/functional/robust_list.c
@@ -42,6 +42,27 @@
 
 #define SLEEP_US 100
 
+#ifndef SYS_set_robust_list2
+# define SYS_set_robust_list2 470
+
+enum robust_list_cmd {
+	FUTEX_ROBUST_LIST_CMD_SET_64,
+	FUTEX_ROBUST_LIST_CMD_SET_32,
+	FUTEX_ROBUST_LIST_CMD_LIST_LIMIT,
+	FUTEX_ROBUST_LIST_CMD_USER_MAX,
+};
+
+struct robust_list32 {
+	uint32_t next;
+};
+
+struct robust_list_head32 {
+	struct robust_list32	list;
+	int32_t			futex_offset;
+	uint32_t		list_op_pending;
+};
+#endif
+
 static pthread_barrier_t barrier, barrier2;
 
 static int set_robust_list(struct robust_list_head *head, size_t len)
@@ -54,6 +75,58 @@ static int get_robust_list(int pid, struct robust_list_head **head, size_t *len_
 	return syscall(SYS_get_robust_list, pid, head, len_ptr);
 }
 
+static int set_robust_list2(struct robust_list_head *head, int index,
+			    enum robust_list_cmd cmd, unsigned int flags)
+{
+	return syscall(SYS_set_robust_list2, head, index, cmd, flags);
+}
+
+static bool robust_list2_support(void)
+{
+	int ret = set_robust_list2(0, 0, FUTEX_ROBUST_LIST_CMD_LIST_LIMIT, 0);
+
+	if (ret == -1 && errno == ENOSYS)
+		return false;
+
+	return true;
+}
+
+/*
+ * Return the set command according to the app bitness
+ */
+static int get_cmd_set(void)
+{
+	return sizeof(uintptr_t) == 8 ? FUTEX_ROBUST_LIST_CMD_SET_64 :
+	       FUTEX_ROBUST_LIST_CMD_SET_32;
+}
+
+FIXTURE(robust_api) {};
+
+FIXTURE_VARIANT(robust_api)
+{
+	bool robust2;
+};
+
+FIXTURE_SETUP(robust_api)
+{
+	if (!variant->robust2)
+		return;
+
+	ASSERT_NE(robust_list2_support(), false);
+}
+
+FIXTURE_TEARDOWN(robust_api) {}
+
+FIXTURE_VARIANT_ADD(robust_api, robust1)
+{
+	.robust2 = false,
+};
+
+FIXTURE_VARIANT_ADD(robust_api, robust2)
+{
+	.robust2 = true,
+};
+
 /*
  * Basic lock struct, contains just the futex word and the robust list element
  * Real implementations have also a *prev to easily walk in the list
@@ -61,6 +134,12 @@ static int get_robust_list(int pid, struct robust_list_head **head, size_t *len_
 struct lock_struct {
 	_Atomic(unsigned int)	futex;
 	struct robust_list	list;
+	bool			robust2;
+};
+
+struct lock_struct32 {
+	_Atomic(uint32_t)	futex;
+	struct robust_list32	list;
 };
 
 /*
@@ -89,20 +168,17 @@ static int create_child(int (*fn)(void *arg), void *arg)
 /*
  * Helper function to prepare and register a robust list
  */
-static int set_list(struct robust_list_head *head)
+static int set_list(struct robust_list_head *head, bool robust2, int index)
 {
-	int ret;
-
-	ret = set_robust_list(head, sizeof(*head));
-	if (ret)
-		return ret;
-
 	head->futex_offset = (size_t) offsetof(struct lock_struct, futex) -
 			     (size_t) offsetof(struct lock_struct, list);
 	head->list.next = &head->list;
 	head->list_op_pending = NULL;
 
-	return 0;
+	if (!robust2)
+		return set_robust_list(head, sizeof(*head));
+
+	return set_robust_list2(head, index, get_cmd_set(), 0);
 }
 
 /*
@@ -174,7 +250,7 @@ static int child_fn_lock(void *arg)
 	struct robust_list_head head;
 	int ret;
 
-	ret = set_list(&head);
+	ret = set_list(&head, lock->robust2, 0);
 	if (ret) {
 		ksft_test_result_fail("set_robust_list error\n");
 		return ret;
@@ -204,14 +280,16 @@ static int child_fn_lock(void *arg)
  * in the robust list and die. The parent thread will wait on this futex, and
  * should be waken up when the child exits.
  */
-TEST(test_robustness)
+TEST_F(robust_api, test_robustness)
 {
 	struct lock_struct lock = { .futex = 0 };
 	_Atomic(unsigned int) *futex = &lock.futex;
-	struct robust_list_head head;
 	int ret, pid, wstatus;
+	struct robust_list_head head;
 
-	ret = set_list(&head);
+	lock.robust2 = variant->robust2;
+
+	ret = set_list(&head, lock.robust2, 0);
 	ASSERT_EQ(ret, 0);
 
 	/*
@@ -270,6 +348,46 @@ TEST(test_set_robust_list_invalid_size)
 	ksft_test_result_pass("%s\n", __func__);
 }
 
+/*
+ * Test invalid parameters
+ */
+TEST(test_set_robust_list2_inval)
+{
+	struct robust_list_head head;
+	int ret, list_limit;
+
+	if (!robust_list2_support()) {
+		ksft_test_result_skip("robust_list2 not supported\n");
+		return;
+	}
+
+	/* Bad flag */
+	ret = set_robust_list2(&head, 0, get_cmd_set(), 999);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* Bad index */
+	list_limit = set_robust_list2(NULL, 0, FUTEX_ROBUST_LIST_CMD_LIST_LIMIT, 0);
+	ASSERT_GT(list_limit, 0);
+
+	ret = set_robust_list2(&head, -1, get_cmd_set(), 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	ret = set_robust_list2(&head, list_limit + 1, get_cmd_set(), 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* Bad command */
+	ret = set_robust_list2(&head, 0, FUTEX_ROBUST_LIST_CMD_USER_MAX, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	ret = set_robust_list2(&head, 0, -1, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
 /*
  * Test get_robust_list with pid = 0, getting the list of the running thread
  */
@@ -363,7 +481,7 @@ static int child_fn_lock_with_error(void *arg)
 	struct robust_list_head head;
 	int ret;
 
-	ret = set_list(&head);
+	ret = set_list(&head, false, 0);
 	if (ret) {
 		ksft_test_result_fail("set_robust_list error\n");
 		return -1;
@@ -388,14 +506,16 @@ static int child_fn_lock_with_error(void *arg)
  * earlier, just after setting list_op_pending and taking the lock, to test the
  * list_op_pending mechanism
  */
-TEST(test_set_list_op_pending)
+TEST_F(robust_api, test_set_list_op_pending)
 {
 	struct lock_struct lock = { .futex = 0 };
 	_Atomic(unsigned int) *futex = &lock.futex;
-	struct robust_list_head head;
 	int ret, wstatus;
+	struct robust_list_head head;
+
+	lock.robust2 = variant->robust2;
 
-	ret = set_list(&head);
+	ret = set_list(&head, lock.robust2, 0);
 	ASSERT_EQ(ret, 0);
 
 	ret = pthread_barrier_init(&barrier, NULL, 2);
@@ -429,7 +549,7 @@ static int child_lock_holder(void *arg)
 	struct robust_list_head head;
 	int i;
 
-	set_list(&head);
+	set_list(&head, locks[0].robust2, 0);
 
 	for (i = 0; i < CHILD_NR; i++) {
 		locks[i].futex = 0;
@@ -471,12 +591,19 @@ static int child_wait_lock(void *arg)
  * Test a robust list of more than one element. All the waiters should wake when
  * the holder dies
  */
-TEST(test_robust_list_multiple_elements)
+TEST_F(robust_api, test_robust_list_multiple_elements)
 {
 	struct lock_struct locks[CHILD_NR];
 	pid_t pids[CHILD_NR + 1];
 	int i, ret, wstatus;
 
+	if (!robust_list2_support()) {
+		ksft_test_result_skip("robust_list2 not supported\n");
+		return;
+	}
+
+	locks[0].robust2 = variant->robust2;
+
 	ret = pthread_barrier_init(&barrier, NULL, 2);
 	ASSERT_EQ(ret, 0);
 	ret = pthread_barrier_init(&barrier2, NULL, CHILD_NR + 1);
@@ -507,13 +634,98 @@ TEST(test_robust_list_multiple_elements)
 		ksft_test_result_pass("%s\n", __func__);
 }
 
+static int child_lock_holder_multiple_lists(void *arg)
+{
+	struct lock_struct *locks = arg;
+	struct robust_list_head *heads;
+	int i, list_limit;
+
+	list_limit = set_robust_list2(NULL, 0, FUTEX_ROBUST_LIST_CMD_LIST_LIMIT, 0);
+
+	heads = malloc(list_limit * sizeof(*heads));
+	if (!heads)
+		return -1;
+
+	for (i = 0; i < list_limit; i++) {
+		set_list(&heads[i], true, i);
+		locks[i].futex = 0;
+		mutex_lock(&locks[i], &heads[i], false);
+	}
+
+	pthread_barrier_wait(&barrier);
+	pthread_barrier_wait(&barrier2);
+
+	/* See comment at child_fn_lock() */
+	usleep(SLEEP_US);
+
+	return 0;
+}
+
+/*
+ * Similar to test_robust_list_multiple_elements, but instead of one list with
+ * several elements, create several lists with one element.
+ */
+TEST(test_robust_list_multiple_lists)
+{
+	int i, ret, wstatus, list_limit;
+	struct lock_struct *locks;
+	pid_t *pids;
+
+	if (!robust_list2_support()) {
+		ksft_test_result_skip("robust_list2 not supported\n");
+		return;
+	}
+
+	list_limit = set_robust_list2(NULL, 0, FUTEX_ROBUST_LIST_CMD_LIST_LIMIT, 0);
+	ASSERT_GT(list_limit, 1);
+
+	locks = malloc(list_limit * sizeof(*locks));
+	ASSERT_NE(locks, NULL);
+	/* pids[0] is the lock holder, pids[1..list_limit] are the waiters */
+	pids = malloc((list_limit + 1) * sizeof(*pids));
+	ASSERT_NE(pids, NULL);
+
+	ret = pthread_barrier_init(&barrier, NULL, 2);
+	ASSERT_EQ(ret, 0);
+	ret = pthread_barrier_init(&barrier2, NULL, list_limit + 1);
+	ASSERT_EQ(ret, 0);
+
+	pids[0] = create_child(&child_lock_holder_multiple_lists, locks);
+
+	/* Wait until the locker thread takes the locks */
+	pthread_barrier_wait(&barrier);
+
+	for (i = 0; i < list_limit; i++)
+		pids[i+1] = create_child(&child_wait_lock, &locks[i]);
+
+	/* Reap the holder and every waiter before the locks are freed */
+	ret = 0;
+
+	for (i = 0; i <= list_limit; i++) {
+		waitpid(pids[i], &wstatus, 0);
+		if (WEXITSTATUS(wstatus))
+			ret = -1;
+	}
+
+	pthread_barrier_destroy(&barrier);
+	pthread_barrier_destroy(&barrier2);
+
+	/* Pass only if no child has returned an error */
+	if (!ret)
+		ksft_test_result_pass("%s\n", __func__);
+
+	free(locks);
+	free(pids);
+}
+
 static int child_circular_list(void *arg)
 {
-	static struct robust_list_head head;
+	struct robust_list_head head;
 	struct lock_struct a, b, c;
+	bool robust2 = *(bool *) arg;
 	int ret;
 
-	ret = set_list(&head);
+	ret = set_list(&head, robust2, 0);
 	if (ret) {
 		ksft_test_result_fail("set_list error\n");
 		return -1;
@@ -536,11 +748,12 @@ static int child_circular_list(void *arg)
  * while processing it so it won't be trapped in an infinite loop while handling
  * a process exit
  */
-TEST(test_circular_list)
+TEST_F(robust_api, test_circular_list)
 {
 	int wstatus;
+	bool robust2 = variant->robust2;
 
-	create_child(child_circular_list, NULL);
+	create_child(child_circular_list, &robust2);
 
 	wait(&wstatus);
 
@@ -549,4 +762,156 @@ TEST(test_circular_list)
 		ksft_test_result_pass("%s\n", __func__);
 }
 
+/*
+ * 32-bit version of child_lock_holder.
+ */
+static int child_lock_holder32(void *arg)
+{
+	struct lock_struct32 *locks = arg;
+	struct robust_list_head32 *head;
+	pid_t tid = gettid();
+	int i, ret;
+
+	head = mmap((void *)0x10000, sizeof(*head), PROT_READ | PROT_WRITE,
+		    MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	/* mmap() reports failure as MAP_FAILED; the head must fit in 31 bits */
+	if (head == MAP_FAILED || (uintptr_t) head > 0x7FFFFFFF) {
+		ksft_test_result_fail("child_lock_holder32 error\n");
+		return -1;
+	}
+
+	head->futex_offset = (uint32_t) ((size_t) offsetof(struct lock_struct32, futex) -
+			     (size_t) offsetof(struct lock_struct32, list));
+	head->list.next = (uint32_t)(uintptr_t) &head->list;
+	head->list_op_pending = (uint32_t)(uintptr_t) NULL;
+
+	ret = set_robust_list2((struct robust_list_head *) head, 0,
+			       FUTEX_ROBUST_LIST_CMD_SET_32, 0);
+	if (ret) {
+		ksft_test_result_fail("set_robust_list2 error\n");
+		return -1;
+	}
+
+	/*
+	 * Take all the locks and insert them in the list
+	 */
+	for (i = 0; i < CHILD_NR; i++) {
+		struct robust_list32 *list = &head->list;
+
+		locks[i].futex = tid;
+
+		while (list->next != (uint32_t)(uintptr_t) &head->list)
+			list = (struct robust_list32 *)(uintptr_t) list->next;
+
+		list->next = (uint32_t)(uintptr_t) &locks[i].list;
+		locks[i].list.next = (uint32_t)(uintptr_t) &head->list;
+	}
+
+	pthread_barrier_wait(&barrier);
+	pthread_barrier_wait(&barrier2);
+
+	/* See comment at child_fn_lock() */
+	usleep(SLEEP_US);
+
+	/* Exit holding all the locks */
+	return 0;
+}
+
+static int child_wait_lock32(void *arg)
+{
+	struct lock_struct32 *lock = arg;
+	_Atomic(unsigned int) *futex;
+	struct timespec to;
+	pid_t tid;
+	int ret;
+
+	futex = &lock->futex;
+
+	pthread_barrier_wait(&barrier2);
+
+	to.tv_sec = FUTEX_TIMEOUT;
+	to.tv_nsec = 0;
+
+	tid = atomic_load(futex);
+
+	/* Kernel ignores futexes without the waiters flag */
+	tid |= FUTEX_WAITERS;
+	atomic_store(futex, tid);
+
+	ret = futex_wait((futex_t *) futex, tid, &to, 0);
+
+	if (ret) {
+		ksft_test_result_fail("futex_wait error\n");
+		return -1;
+	}
+
+	if (!(lock->futex & FUTEX_OWNER_DIED)) {
+		ksft_test_result_fail("futex not marked with FUTEX_OWNER_DIED\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Test to create a 32-bit robust list in a 64-bit kernel. Replicate
+ * test_robust_list_multiple_elements, but it's simplified: don't do all the
+ * mutex lock dance, just insert futexes in the list and check if the kernel
+ * correctly walks the list and wakes the threads
+ */
+TEST(test_32bit_lists)
+{
+	struct lock_struct32 *locks;
+	pid_t pids[CHILD_NR + 1];
+	int i, ret, wstatus;
+
+	if (sizeof(uintptr_t) != 8) {
+		ksft_test_result_skip("Test only for 64-bit\n");
+		return;
+	}
+
+	if (!robust_list2_support()) {
+		ksft_test_result_skip("robust_list2 not supported\n");
+		return;
+	}
+
+	locks = mmap((void *)0x20000, sizeof(*locks) * CHILD_NR,
+		     PROT_READ | PROT_WRITE, MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS,
+		     -1, 0);
+	/* mmap() reports failure as MAP_FAILED, not NULL */
+	ASSERT_NE(locks, MAP_FAILED);
+	ASSERT_LT((uintptr_t) locks, 0x7FFFFFFF);
+
+	ret = pthread_barrier_init(&barrier, NULL, 2);
+	ASSERT_EQ(ret, 0);
+	ret = pthread_barrier_init(&barrier2, NULL, CHILD_NR + 1);
+	ASSERT_EQ(ret, 0);
+
+	pids[0] = create_child(&child_lock_holder32, locks);
+
+	/* Wait until the locker thread takes the locks */
+	pthread_barrier_wait(&barrier);
+
+	for (i = 0; i < CHILD_NR; i++)
+		pids[i+1] = create_child(&child_wait_lock32, &locks[i]);
+
+	/* Reap the holder and every waiter before the locks are unmapped */
+	ret = 0;
+
+	for (i = 0; i <= CHILD_NR; i++) {
+		waitpid(pids[i], &wstatus, 0);
+		if (WEXITSTATUS(wstatus))
+			ret = -1;
+	}
+
+	pthread_barrier_destroy(&barrier);
+	pthread_barrier_destroy(&barrier2);
+
+	/* Pass only if no child has returned an error */
+	if (!ret)
+		ksft_test_result_pass("%s\n", __func__);
+
+	munmap(locks, sizeof(*locks) * CHILD_NR);
+}
+
 TEST_HARNESS_MAIN

-- 
2.52.0


^ permalink raw reply related

* [PATCH v6 8/9] selftests/futex: Expand for get_robust_list2()
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

Reuse the same selftest for the original get_robust_list() syscall for
the new get_robust_list2() syscall. Use kselftest variants feature to
run the relevant tests for both interfaces. Create a new test case to
get different lists from the same task.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 .../selftests/futex/functional/robust_list.c       | 95 ++++++++++++++++++----
 1 file changed, 81 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/futex/functional/robust_list.c b/tools/testing/selftests/futex/functional/robust_list.c
index bf47e9ab2951..e6b26d7b9502 100644
--- a/tools/testing/selftests/futex/functional/robust_list.c
+++ b/tools/testing/selftests/futex/functional/robust_list.c
@@ -44,6 +44,7 @@
 
 #ifndef SYS_set_robust_list2
 # define SYS_set_robust_list2 470
+# define SYS_get_robust_list2 471
 
 enum robust_list_cmd {
 	FUTEX_ROBUST_LIST_CMD_SET_64,
@@ -81,6 +82,12 @@ static int set_robust_list2(struct robust_list_head *head, int index,
 	return syscall(SYS_set_robust_list2, head, index, cmd, flags);
 }
 
+static int get_robust_list2(int pid, struct robust_list_head **head,
+			    unsigned int index, unsigned int flags)
+{
+	return syscall(SYS_get_robust_list2, pid, head, index, flags);
+}
+
 static bool robust_list2_support(void)
 {
 	int ret = set_robust_list2(0, 0, FUTEX_ROBUST_LIST_CMD_LIST_LIMIT, 0);
@@ -181,6 +188,23 @@ static int set_list(struct robust_list_head *head, bool robust2, int index)
 	return set_robust_list2(head, index, get_cmd_set(), 0);
 }
 
+static int get_list(pid_t pid, struct robust_list_head **head, bool robust2, int index)
+{
+	int ret;
+
+	if (!robust2) {
+		size_t len_ptr = 0; /* may be left unwritten if the syscall fails */
+
+		ret = get_robust_list(pid, head, &len_ptr);
+		if (!ret && sizeof(**head) != len_ptr) /* size only valid on success */
+			return -EINVAL;
+
+		return ret;
+	}
+
+	return get_robust_list2(pid, head, index, 0);
+}
+
 /*
  * A basic (and incomplete) mutex lock function with robustness
  */
@@ -391,37 +415,44 @@ TEST(test_set_robust_list2_inval)
 /*
  * Test get_robust_list with pid = 0, getting the list of the running thread
  */
-TEST(test_get_robust_list_self)
+TEST_F(robust_api, test_get_robust_list_self)
 {
 	struct robust_list_head head, head2, *get_head;
-	size_t head_size = sizeof(head), len_ptr;
+	bool robust2 = variant->robust2;
 	int ret;
 
-	ret = set_robust_list(&head, head_size);
+	ret = set_list(&head, robust2, 0);
 	ASSERT_EQ(ret, 0);
 
-	ret = get_robust_list(0, &get_head, &len_ptr);
+	ret = get_list(0, &get_head, robust2, 0);
 	ASSERT_EQ(ret, 0);
 	ASSERT_EQ(get_head, &head);
-	ASSERT_EQ(head_size, len_ptr);
 
-	ret = set_robust_list(&head2, head_size);
+	ret = set_list(&head2, robust2, 0);
 	ASSERT_EQ(ret, 0);
 
-	ret = get_robust_list(0, &get_head, &len_ptr);
+	ret = get_list(0, &get_head, robust2, 0);
 	ASSERT_EQ(ret, 0);
 	ASSERT_EQ(get_head, &head2);
-	ASSERT_EQ(head_size, len_ptr);
 
 	ksft_test_result_pass("%s\n", __func__);
 }
 
+struct child_arg_struct {
+	struct robust_list_head *head;
+	bool robust2;
+};
+
 static int child_list(void *arg)
 {
-	struct robust_list_head *head = arg;
+	struct child_arg_struct *child = arg;
+	struct robust_list_head *head;
+	bool robust2 = child->robust2;
 	int ret;
 
-	ret = set_robust_list(head, sizeof(*head));
+	head = child->head;
+
+	ret = set_list(head, robust2, 0);
 	if (ret) {
 		ksft_test_result_fail("set_robust_list error\n");
 		return -1;
@@ -444,23 +475,26 @@ static int child_list(void *arg)
  * parent
  *   2) the child thread still alive when we try to get the list from it
  */
-TEST(test_get_robust_list_child)
+TEST_F(robust_api, test_get_robust_list_child)
 {
 	struct robust_list_head head, *get_head;
+	bool robust2 = variant->robust2;
+	struct child_arg_struct child =
+		{.robust2 = robust2, .head = &head};
 	int ret, wstatus;
-	size_t len_ptr;
 	pid_t tid;
 
+
 	ret = pthread_barrier_init(&barrier, NULL, 2);
 	ret = pthread_barrier_init(&barrier2, NULL, 2);
 	ASSERT_EQ(ret, 0);
 
-	tid = create_child(&child_list, &head);
+	tid = create_child(&child_list, &child);
 	ASSERT_NE(tid, -1);
 
 	pthread_barrier_wait(&barrier);
 
-	ret = get_robust_list(tid, &get_head, &len_ptr);
+	ret = get_list(tid, &get_head, robust2, 0);
 	ASSERT_EQ(ret, 0);
 	ASSERT_EQ(&head, get_head);
 
@@ -914,4 +948,37 @@ TEST(test_32bit_lists)
 	munmap(locks, sizeof(*locks) * CHILD_NR);
 }
 
+/*
+ * Test setting and getting multiple list heads
+ */
+TEST(set_and_get_robust2)
+{
+	struct robust_list_head *head = NULL, *heads;
+	int i, list_limit, ret;
+
+	if (!robust_list2_support()) {
+		ksft_test_result_skip("robust_list2 not supported\n");
+		return;
+	}
+
+	list_limit = set_robust_list2(NULL, 0, FUTEX_ROBUST_LIST_CMD_LIST_LIMIT, 0);
+
+	heads = malloc(list_limit * sizeof(*heads));
+	ASSERT_NE(heads, NULL);
+
+	for (i = 0; i < list_limit; i++) {
+		ret = set_list(&heads[i], true, i);
+		ASSERT_EQ(ret, 0);
+	}
+
+	for (i = 0; i < list_limit; i++) {
+		ret = get_list(0, &head, true, i);
+		ASSERT_EQ(ret, 0);
+		ASSERT_EQ(head, &heads[i]);
+	}
+
+	free(heads);
+	ksft_test_result_pass("%s\n", __func__);
+}
+
 TEST_HARNESS_MAIN

-- 
2.52.0


^ permalink raw reply related

* [PATCH v6 9/9] futex: Use new robust list API internally
From: André Almeida @ 2025-11-22  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev,
	André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

The new robust list API internals can handle any kind of robust list, so
to simplify the code, reuse the same mechanisms for the original API and
when calling the original set syscall, set the head in the array of
lists. The first two indexes of the array of robust lists are reserved
for the original API lists, the native robust list and the compat robust
list.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 include/linux/futex.h   |  4 ----
 include/linux/sched.h   |  5 -----
 kernel/futex/core.c     | 12 ------------
 kernel/futex/syscalls.c | 52 ++++++++++++++++++++++++-------------------------
 4 files changed, 25 insertions(+), 48 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 3dba249bcd32..ce27f6307c60 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -87,10 +87,6 @@ static inline bool futex_in_32bit_syscall(void)
 
 static inline void futex_init_task(struct task_struct *tsk)
 {
-	tsk->robust_list = NULL;
-#ifdef CONFIG_COMPAT
-	tsk->robust_list32 = NULL;
-#endif
 	tsk->futex_robust_lists = NULL;
 	INIT_LIST_HEAD(&tsk->pi_state_list);
 	tsk->pi_state_cache = NULL;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index de2f3cbe4953..e0f28e7f0a2d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -75,7 +75,6 @@ struct pid_namespace;
 struct pipe_inode_info;
 struct rcu_node;
 struct reclaim_state;
-struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
@@ -1330,11 +1329,7 @@ struct task_struct {
 	u32				rmid;
 #endif
 #ifdef CONFIG_FUTEX
-	struct robust_list_head __user	*robust_list;
 	uintptr_t			*futex_robust_lists;
-#ifdef CONFIG_COMPAT
-	struct robust_list_head32 __user *robust_list32;
-#endif
 	struct list_head		pi_state_list;
 	struct futex_pi_state		*pi_state_cache;
 	struct mutex			futex_exit_mutex;
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 14d8a7176367..f91df175033d 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1500,18 +1500,6 @@ static void exit_robust_lists(struct task_struct *tsk)
 
 static void futex_cleanup(struct task_struct *tsk)
 {
-	if (unlikely(tsk->robust_list)) {
-		exit_robust_list(tsk, tsk->robust_list);
-		tsk->robust_list = NULL;
-	}
-
-#ifdef CONFIG_64BIT
-	if (unlikely(tsk->robust_list32)) {
-		exit_robust_list32(tsk, tsk->robust_list32);
-		tsk->robust_list32 = NULL;
-	}
-#endif
-
 	if (unlikely(tsk->futex_robust_lists))
 		exit_robust_lists(tsk);
 
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index f730d16632fc..2a44791db37a 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -28,32 +28,29 @@
 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
 		size_t, len)
 {
+	enum robust_list2_cmd cmd;
 	/*
 	 * The kernel knows only one size for now:
 	 */
 	if (unlikely(len != sizeof(*head)))
 		return -EINVAL;
 
-	current->robust_list = head;
+	cmd = IS_ENABLED(CONFIG_64BIT) ? FUTEX_ROBUST_LIST_CMD_SET_64 :
+	      FUTEX_ROBUST_LIST_CMD_SET_32;
 
-	return 0;
+	return futex_robust_list_set((uintptr_t) head, cmd,
+				     FUTEX_ROBUST_LIST_NATIVE_IDX);
 }
 
-static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
-{
-#ifdef CONFIG_COMPAT
-	if (compat)
-		return p->robust_list32;
-#endif
-	return p->robust_list;
-}
-
-static void __user *futex_get_robust_list_common(int pid, bool compat, int index)
+static void __user *futex_get_robust_list_common(int pid, unsigned int index)
 {
 	struct task_struct *p = current;
 	void __user *head;
 	int ret;
 
+	if (index >= FUTEX_ROBUST_LIST2_MAX_IDX)
+		return (void __user *)ERR_PTR(-EINVAL);
+
 	scoped_guard(rcu) {
 		if (pid) {
 			p = find_task_by_vpid(pid);
@@ -75,14 +72,10 @@ static void __user *futex_get_robust_list_common(int pid, bool compat, int index
 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
 		goto err_unlock;
 
-	if (index >= 0) {
-		scoped_guard(mutex, &p->futex_exit_mutex) {
-			uintptr_t *rl = p->futex_robust_lists;
+	scoped_guard(mutex, &p->futex_exit_mutex) {
+		uintptr_t *rl = p->futex_robust_lists;
 
-			head = rl ? (void __user *) rl[index] : NULL;
-		}
-	} else {
-		head = futex_task_robust_list(p, compat);
+		head = rl ? (void __user *) rl[index] : NULL;
 	}
 
 	up_read(&p->signal->exec_update_lock);
@@ -107,7 +100,11 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 		struct robust_list_head __user * __user *, head_ptr,
 		size_t __user *, len_ptr)
 {
-	struct robust_list_head __user *head = futex_get_robust_list_common(pid, false, -1);
+	struct robust_list_head __user *head =
+		futex_get_robust_list_common(pid, FUTEX_ROBUST_LIST_NATIVE_IDX);
+
+	head = (struct robust_list_head __user *)
+		((uintptr_t) head & FUTEX_ROBUST_LIST_ENTRY_MASK);
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
@@ -180,7 +177,7 @@ SYSCALL_DEFINE4(get_robust_list2, int, pid,
 	 */
 	index += FUTEX_ROBUST_LIST2_IDX;
 
-	entry_ptr = futex_get_robust_list_common(pid, false, index);
+	entry_ptr = futex_get_robust_list_common(pid, index);
 	if (IS_ERR(entry_ptr))
 		return PTR_ERR(entry_ptr);
 
@@ -568,22 +565,23 @@ COMPAT_SYSCALL_DEFINE2(set_robust_list,
 	if (unlikely(len != sizeof(*head)))
 		return -EINVAL;
 
-	current->robust_list32 = head;
-
-	return 0;
+	return futex_robust_list_set((uintptr_t) head, FUTEX_ROBUST_LIST_CMD_SET_32,
+				     FUTEX_ROBUST_LIST_COMPAT_IDX);
 }
 
 COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 			compat_uptr_t __user *, head_ptr,
 			compat_size_t __user *, len_ptr)
 {
-	struct robust_list_head32 __user *head = futex_get_robust_list_common(pid, true, -1);
+	struct robust_list_head32 __user *head =
+		futex_get_robust_list_common(pid, FUTEX_ROBUST_LIST_COMPAT_IDX);
 
-	if (IS_ERR(head))
-		return PTR_ERR(head);
+	head = (struct robust_list_head32 __user *)
+		((uintptr_t) head & FUTEX_ROBUST_LIST_ENTRY_MASK);
 
 	if (put_user(sizeof(*head), len_ptr))
 		return -EFAULT;
+
 	return put_user(ptr_to_compat(head), head_ptr);
 }
 #endif /* CONFIG_COMPAT */

-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH v6 6/9] futex: Wire up get_robust_list2 syscall
From: kernel test robot @ 2025-11-22  6:49 UTC (permalink / raw)
  To: André Almeida, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
	Darren Hart, Davidlohr Bueso, Arnd Bergmann,
	Sebastian Andrzej Siewior, Waiman Long, Ryan Houdek
  Cc: oe-kbuild-all, linux-kernel, linux-kselftest, linux-api,
	kernel-dev, André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-6-05fea005a0fd@igalia.com>

Hi André,

kernel test robot noticed the following build warnings:

[auto build test WARNING on c42ba5a87bdccbca11403b7ca8bad1a57b833732]

url:    https://github.com/intel-lab-lkp/linux/commits/Andr-Almeida/futex-Use-explicit-sizes-for-compat_robust_list-structs/20251122-135406
base:   c42ba5a87bdccbca11403b7ca8bad1a57b833732
patch link:    https://lore.kernel.org/r/20251122-tonyk-robust_futex-v6-6-05fea005a0fd%40igalia.com
patch subject: [PATCH v6 6/9] futex: Wire up get_robust_list2 syscall
config: arc-allnoconfig (https://download.01.org/0day-ci/archive/20251122/202511221454.rsysOoSt-lkp@intel.com/config)
compiler: arc-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251122/202511221454.rsysOoSt-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511221454.rsysOoSt-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> <stdin>:1627:2: warning: #warning syscall get_robust_list2 not implemented [-Wcpp]
--
>> <stdin>:1627:2: warning: #warning syscall get_robust_list2 not implemented [-Wcpp]

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH v6 6/9] futex: Wire up get_robust_list2 syscall
From: kernel test robot @ 2025-11-22  7:32 UTC (permalink / raw)
  To: André Almeida, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
	Darren Hart, Davidlohr Bueso, Arnd Bergmann,
	Sebastian Andrzej Siewior, Waiman Long, Ryan Houdek
  Cc: oe-kbuild-all, linux-kernel, linux-kselftest, linux-api,
	kernel-dev, André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-6-05fea005a0fd@igalia.com>

Hi André,

kernel test robot noticed the following build errors:

[auto build test ERROR on c42ba5a87bdccbca11403b7ca8bad1a57b833732]

url:    https://github.com/intel-lab-lkp/linux/commits/Andr-Almeida/futex-Use-explicit-sizes-for-compat_robust_list-structs/20251122-135406
base:   c42ba5a87bdccbca11403b7ca8bad1a57b833732
patch link:    https://lore.kernel.org/r/20251122-tonyk-robust_futex-v6-6-05fea005a0fd%40igalia.com
patch subject: [PATCH v6 6/9] futex: Wire up get_robust_list2 syscall
config: powerpc-allnoconfig (https://download.01.org/0day-ci/archive/20251122/202511221516.vYMzvSVO-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251122/202511221516.vYMzvSVO-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511221516.vYMzvSVO-lkp@intel.com/

All errors (new ones prefixed by >>):

>> error: arch/powerpc/kernel/syscalls/syscall.tbl: syscall table is not sorted or duplicates the same syscall number
   make[3]: *** [arch/powerpc/kernel/syscalls/Makefile:31: arch/powerpc/include/generated/asm/syscall_table_64.h] Error 1
   make[3]: *** Deleting file 'arch/powerpc/include/generated/asm/syscall_table_64.h'
>> error: arch/powerpc/kernel/syscalls/syscall.tbl: syscall table is not sorted or duplicates the same syscall number
   make[3]: *** [arch/powerpc/kernel/syscalls/Makefile:35: arch/powerpc/include/generated/asm/syscall_table_spu.h] Error 1
   make[3]: *** Deleting file 'arch/powerpc/include/generated/asm/syscall_table_spu.h'
>> error: arch/powerpc/kernel/syscalls/syscall.tbl: syscall table is not sorted or duplicates the same syscall number
   make[3]: *** [arch/powerpc/kernel/syscalls/Makefile:27: arch/powerpc/include/generated/asm/syscall_table_32.h] Error 1
   make[3]: *** Deleting file 'arch/powerpc/include/generated/asm/syscall_table_32.h'
   make[3]: Target 'all' not remade because of errors.
   make[2]: *** [arch/powerpc/Makefile:397: archheaders] Error 2
   make[2]: Target 'prepare' not remade because of errors.
   make[1]: *** [Makefile:248: __sub-make] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [Makefile:248: __sub-make] Error 2
   make: Target 'prepare' not remade because of errors.

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH v6 6/9] futex: Wire up get_robust_list2 syscall
From: kernel test robot @ 2025-11-22  7:44 UTC (permalink / raw)
  To: André Almeida, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
	Darren Hart, Davidlohr Bueso, Arnd Bergmann,
	Sebastian Andrzej Siewior, Waiman Long, Ryan Houdek
  Cc: llvm, oe-kbuild-all, linux-kernel, linux-kselftest, linux-api,
	kernel-dev, André Almeida
In-Reply-To: <20251122-tonyk-robust_futex-v6-6-05fea005a0fd@igalia.com>

Hi André,

kernel test robot noticed the following build warnings:

[auto build test WARNING on c42ba5a87bdccbca11403b7ca8bad1a57b833732]

url:    https://github.com/intel-lab-lkp/linux/commits/Andr-Almeida/futex-Use-explicit-sizes-for-compat_robust_list-structs/20251122-135406
base:   c42ba5a87bdccbca11403b7ca8bad1a57b833732
patch link:    https://lore.kernel.org/r/20251122-tonyk-robust_futex-v6-6-05fea005a0fd%40igalia.com
patch subject: [PATCH v6 6/9] futex: Wire up get_robust_list2 syscall
config: hexagon-allnoconfig (https://download.01.org/0day-ci/archive/20251122/202511221522.uRpEUT5O-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 9e9fe08b16ea2c4d9867fb4974edf2a3776d6ece)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251122/202511221522.uRpEUT5O-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511221522.uRpEUT5O-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> <stdin>:1627:2: warning: syscall get_robust_list2 not implemented [-W#warnings]
    1627 | #warning syscall get_robust_list2 not implemented
         |  ^
   1 warning generated.
--
>> <stdin>:1627:2: warning: syscall get_robust_list2 not implemented [-W#warnings]
    1627 | #warning syscall get_robust_list2 not implemented
         |  ^
   1 warning generated.

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH v6 0/9] futex: Create {set,get}_robust_list2() syscalls
From: André Almeida @ 2025-11-22 14:25 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Arnd Bergmann, Sebastian Andrzej Siewior,
	Waiman Long, Ryan Houdek
  Cc: linux-kernel, linux-kselftest, linux-api, kernel-dev
In-Reply-To: <20251122-tonyk-robust_futex-v6-0-05fea005a0fd@igalia.com>

Em 22/11/2025 02:50, André Almeida escreveu:
> Hello,
> 
> This version is a complete rewrite of the syscall (thanks Thomas for the
> suggestions!).
> 
>   * Use case
> 
> The use-case for the new syscalls is detailed in the last patch version:
> 
>    https://lore.kernel.org/lkml/20250626-tonyk-robust_futex-v5-0-179194dbde8f@igalia.com
> 
>   * The syscall interface
> 
> Documented at patches 3/9 "futex: Create set_robust_list2() syscall" and
> 4/9 "futex: Create get_robust_list2() syscall".
> 
>   * Testing
> 
> I expanded the current robust list selftest to use the new interface,
> and also ported the original syscall to use the new syscall internals,
> and everything survived the tests.
> 
>   * Changelog
> 
> Changes from v5:
>   - Complete interface rewrite, there are so many changes but the main
>     ones are the following points
>   - Array of robust lists now has a static size, allocated once during the
>     first usage of the list
>   - Now that the list of robust lists have a fixed size, I removed the
>     logic of having a command for creating a new index on the list. To
>     simplify things for everyone, userspace just need to call
>     set_robust_list2(head, 32-bit/64-bit type, index).

Actually, this won't work well. The application that calls 
set_robust_list2() may be using glibc, which will also want to call 
set_robust_list2(), and there's no way to know which lists are being used 
by each library. I will re-add the create/modify logic for the next version.

^ permalink raw reply

* [PATCH v7 00/22] Live Update Orchestrator
From: Pasha Tatashin @ 2025-11-22 22:23 UTC (permalink / raw)
  To: pratyush, jasonmiu, graf, pasha.tatashin, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, linux, linux-kernel, linux-doc, linux-mm, gregkh, tglx,
	mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu,
	hughd, skhawaja, chrisl

This series introduces the Live Update Orchestrator, a kernel subsystem
designed to facilitate live kernel updates using a kexec-based reboot.
This capability is critical for cloud environments, allowing hypervisors
to be updated with minimal downtime for running virtual machines. LUO
achieves this by preserving the state of selected resources, such as
memory, devices and their dependencies, across the kernel transition.

As a key feature, this series includes support for preserving memfd file
descriptors, which allows critical in-memory data, such as guest RAM or
any other large memory region, to be maintained in RAM across the kexec
reboot.

The other series that use LUO, are VFIO [1], IOMMU [2], and PCI [3]
preservations.

Github repo of this series [4].

The core of LUO is a framework for managing the lifecycle of preserved
resources through a userspace-driven interface. Key features include:

- Session Management
  Userspace agent (i.e. luod [5]) creates named sessions, each
  represented by a file descriptor (via centralized agent that controls
  /dev/liveupdate). The lifecycle of all preserved resources within a
  session is tied to this FD, ensuring automatic kernel cleanup if the
  controlling userspace agent crashes or exits unexpectedly.

- File Preservation
  A handler-based framework allows specific file types (demonstrated
  here with memfd) to be preserved. Handlers manage the serialization,
  restoration, and lifecycle of their specific file types.

- File-Lifecycle-Bound State
  A new mechanism for managing shared global state whose lifecycle is
  tied to the preservation of one or more files. This is crucial for
  subsystems like IOMMU or HugeTLB, where multiple file descriptors may
  depend on a single, shared underlying resource that must be preserved
  only once.

- KHO Integration
  LUO drives the Kexec Handover framework programmatically to pass its
  serialized metadata to the next kernel. The LUO state is finalized and
  added to the kexec image just before the reboot is triggered. In the
  future this step will also be removed once stateless KHO is
  merged [6].

- Userspace Interface
  Control is provided via ioctl commands on /dev/liveupdate for creating
  and retrieving sessions, as well as on session file descriptors for
  managing individual files.

- Testing
  The series includes a set of selftests, including userspace API
  validation, kexec-based lifecycle tests for various session and file
  scenarios, and a new in-kernel test module to validate the FLB logic.

Changelog since v6 [7]
- Collected Reviewed-by tags from Mike Rapoport,
  Pratyush Yadav, and Zhu Yanjun. Addressed all outstanding comments.
- Moved ABI headers from include/linux/liveupdate/abi/ to
  include/linux/kho/abi/ to align with other future users of KHO and KHO
  itself.
- Separated internal APIs to allow kernel subsystems to preserve file
  objects programmatically.
- Introduced struct luo_file_set to manage groups of preserved files,
  decoupling this logic from the luo_session structure. This simplifies
  internal management and serialization.
- Implemented luo_session_quiesce() and luo_session_resume() mechanisms.
  These ensure that file handlers and FLBs can be safely unregistered
  (liveupdate_unregister_file_handler, liveupdate_unregister_flb) by
  preventing new operations while unregistration is in progress.
- Added a comprehensive test orchestration framework. This includes a
  custom init process (init.c) and scripts (luo_test.sh, run.sh) to
  automate kexec testing within QEMU environments across x86_64 and
  arm64.
 
[1] https://lore.kernel.org/all/20251018000713.677779-1-vipinsh@google.com/
[2] https://lore.kernel.org/linux-iommu/20250928190624.3735830-1-skhawaja@google.com
[3] https://lore.kernel.org/linux-pci/20250916-luo-pci-v2-0-c494053c3c08@kernel.org
[4] https://github.com/googleprodkernel/linux-liveupdate/tree/luo/v7
[5] https://tinyurl.com/luoddesign
[6] https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com
[7] https://lore.kernel.org/all/20251115233409.768044-1-pasha.tatashin@soleen.com

Pasha Tatashin (16):
  liveupdate: luo_core: Live Update Orchestrator
  liveupdate: luo_core: integrate with KHO
  kexec: call liveupdate_reboot() before kexec
  liveupdate: luo_session: add sessions support
  liveupdate: luo_core: add user interface
  liveupdate: luo_file: implement file systems callbacks
  liveupdate: luo_session: Add ioctls for file preservation
  docs: add luo documentation
  MAINTAINERS: add liveupdate entry
  selftests/liveupdate: Add userspace API selftests
  selftests/liveupdate: Add kexec-based selftest for
  selftests/liveupdate: Add kexec test for multiple and empty sessions
  selftests/liveupdate: add test infrastructure and scripts
  liveupdate: luo_file: Add internal APIs for file preservation
  liveupdate: luo_flb: Introduce File-Lifecycle-Bound global state
  tests/liveupdate: Add in-kernel liveupdate test

Pratyush Yadav (6):
  mm: shmem: use SHMEM_F_* flags instead of VM_* flags
  mm: shmem: allow freezing inode mapping
  mm: shmem: export some functions to internal.h
  liveupdate: luo_file: add private argument to store runtime state
  mm: memfd_luo: allow preserving memfd
  docs: add documentation for memfd preservation via LUO

 Documentation/core-api/index.rst              |   1 +
 Documentation/core-api/liveupdate.rst         |  71 ++
 Documentation/mm/index.rst                    |   1 +
 Documentation/mm/memfd_preservation.rst       |  23 +
 Documentation/userspace-api/index.rst         |   1 +
 .../userspace-api/ioctl/ioctl-number.rst      |   2 +
 Documentation/userspace-api/liveupdate.rst    |  20 +
 MAINTAINERS                                   |  16 +
 include/linux/kho/abi/luo.h                   | 243 +++++
 include/linux/kho/abi/memfd.h                 |  77 ++
 include/linux/liveupdate.h                    | 311 ++++++
 include/linux/shmem_fs.h                      |  23 +
 include/uapi/linux/liveupdate.h               | 216 ++++
 kernel/kexec_core.c                           |   5 +
 kernel/liveupdate/Kconfig                     |  27 +
 kernel/liveupdate/Makefile                    |   8 +
 kernel/liveupdate/luo_core.c                  | 454 ++++++++
 kernel/liveupdate/luo_file.c                  | 987 ++++++++++++++++++
 kernel/liveupdate/luo_flb.c                   | 701 +++++++++++++
 kernel/liveupdate/luo_internal.h              | 141 +++
 kernel/liveupdate/luo_session.c               | 645 ++++++++++++
 lib/Kconfig.debug                             |  23 +
 lib/tests/Makefile                            |   1 +
 lib/tests/liveupdate.c                        | 160 +++
 mm/Makefile                                   |   1 +
 mm/internal.h                                 |   6 +
 mm/memfd_luo.c                                | 517 +++++++++
 mm/shmem.c                                    |  57 +-
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/liveupdate/.gitignore |   9 +
 tools/testing/selftests/liveupdate/Makefile   |  34 +
 tools/testing/selftests/liveupdate/config     |  11 +
 .../testing/selftests/liveupdate/do_kexec.sh  |  16 +
 tools/testing/selftests/liveupdate/init.c     | 174 +++
 .../testing/selftests/liveupdate/liveupdate.c | 348 ++++++
 .../selftests/liveupdate/luo_kexec_simple.c   |  89 ++
 .../selftests/liveupdate/luo_multi_session.c  | 162 +++
 .../testing/selftests/liveupdate/luo_test.sh  | 296 ++++++
 .../selftests/liveupdate/luo_test_utils.c     | 266 +++++
 .../selftests/liveupdate/luo_test_utils.h     |  44 +
 tools/testing/selftests/liveupdate/run.sh     |  68 ++
 41 files changed, 6235 insertions(+), 21 deletions(-)
 create mode 100644 Documentation/core-api/liveupdate.rst
 create mode 100644 Documentation/mm/memfd_preservation.rst
 create mode 100644 Documentation/userspace-api/liveupdate.rst
 create mode 100644 include/linux/kho/abi/luo.h
 create mode 100644 include/linux/kho/abi/memfd.h
 create mode 100644 include/linux/liveupdate.h
 create mode 100644 include/uapi/linux/liveupdate.h
 create mode 100644 kernel/liveupdate/luo_core.c
 create mode 100644 kernel/liveupdate/luo_file.c
 create mode 100644 kernel/liveupdate/luo_flb.c
 create mode 100644 kernel/liveupdate/luo_internal.h
 create mode 100644 kernel/liveupdate/luo_session.c
 create mode 100644 lib/tests/liveupdate.c
 create mode 100644 mm/memfd_luo.c
 create mode 100644 tools/testing/selftests/liveupdate/.gitignore
 create mode 100644 tools/testing/selftests/liveupdate/Makefile
 create mode 100644 tools/testing/selftests/liveupdate/config
 create mode 100755 tools/testing/selftests/liveupdate/do_kexec.sh
 create mode 100644 tools/testing/selftests/liveupdate/init.c
 create mode 100644 tools/testing/selftests/liveupdate/liveupdate.c
 create mode 100644 tools/testing/selftests/liveupdate/luo_kexec_simple.c
 create mode 100644 tools/testing/selftests/liveupdate/luo_multi_session.c
 create mode 100755 tools/testing/selftests/liveupdate/luo_test.sh
 create mode 100644 tools/testing/selftests/liveupdate/luo_test_utils.c
 create mode 100644 tools/testing/selftests/liveupdate/luo_test_utils.h
 create mode 100755 tools/testing/selftests/liveupdate/run.sh


base-commit: 2cb7e27ffe3e3e1d8a837026462ebca22cba3b4f
-- 
2.52.0.rc2.455.g230fcf2819-goog


^ permalink raw reply

* [PATCH v7 01/22] liveupdate: luo_core: Live Update Orchestrator
From: Pasha Tatashin @ 2025-11-22 22:23 UTC (permalink / raw)
  To: pratyush, jasonmiu, graf, pasha.tatashin, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, linux, linux-kernel, linux-doc, linux-mm, gregkh, tglx,
	mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu,
	hughd, skhawaja, chrisl
In-Reply-To: <20251122222351.1059049-1-pasha.tatashin@soleen.com>

Introduce LUO, a mechanism intended to facilitate kernel updates while
keeping designated devices operational across the transition (e.g., via
kexec). The primary use case is updating hypervisors with minimal
disruption to running virtual machines. For the userspace side of a hypervisor
update we have copyless migration. LUO is for updating the kernel.

This initial patch lays the groundwork for the LUO subsystem.

Further functionality, including the implementation of state transition
logic, integration with KHO, and hooks for subsystems and file
descriptors, will be added in subsequent patches.

Create a character device at /dev/liveupdate.

A new uAPI header, <uapi/linux/liveupdate.h>, will define the necessary
structures. The magic number for IOCTL is registered in
Documentation/userspace-api/ioctl/ioctl-number.rst.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
---
 .../userspace-api/ioctl/ioctl-number.rst      |   2 +
 include/linux/liveupdate.h                    |  35 ++++++
 include/uapi/linux/liveupdate.h               |  46 ++++++++
 kernel/liveupdate/Kconfig                     |  27 +++++
 kernel/liveupdate/Makefile                    |   5 +
 kernel/liveupdate/luo_core.c                  | 111 ++++++++++++++++++
 6 files changed, 226 insertions(+)
 create mode 100644 include/linux/liveupdate.h
 create mode 100644 include/uapi/linux/liveupdate.h
 create mode 100644 kernel/liveupdate/luo_core.c

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 7c527a01d1cf..7232b3544cec 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -385,6 +385,8 @@ Code  Seq#    Include File                                             Comments
 0xB8  01-02  uapi/misc/mrvl_cn10k_dpi.h                                Marvell CN10K DPI driver
 0xB8  all    uapi/linux/mshv.h                                         Microsoft Hyper-V /dev/mshv driver
                                                                        <mailto:linux-hyperv@vger.kernel.org>
+0xBA  00-0F  uapi/linux/liveupdate.h                                   Pasha Tatashin
+                                                                       <mailto:pasha.tatashin@soleen.com>
 0xC0  00-0F  linux/usb/iowarrior.h
 0xCA  00-0F  uapi/misc/cxl.h                                           Dead since 6.15
 0xCA  10-2F  uapi/misc/ocxl.h
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
new file mode 100644
index 000000000000..c6a1d6bd90cb
--- /dev/null
+++ b/include/linux/liveupdate.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+#ifndef _LINUX_LIVEUPDATE_H
+#define _LINUX_LIVEUPDATE_H
+
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_LIVEUPDATE
+
+/* Return true if live update orchestrator is enabled */
+bool liveupdate_enabled(void);
+
+/* Called during kexec to tell LUO that the system has entered reboot */
+int liveupdate_reboot(void);
+
+#else /* CONFIG_LIVEUPDATE */
+
+static inline bool liveupdate_enabled(void)
+{
+	return false;
+}
+
+static inline int liveupdate_reboot(void)
+{
+	return 0;
+}
+
+#endif /* CONFIG_LIVEUPDATE */
+#endif /* _LINUX_LIVEUPDATE_H */
diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
new file mode 100644
index 000000000000..df34c1642c4d
--- /dev/null
+++ b/include/uapi/linux/liveupdate.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/*
+ * Userspace interface for /dev/liveupdate
+ * Live Update Orchestrator
+ *
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _UAPI_LIVEUPDATE_H
+#define _UAPI_LIVEUPDATE_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * DOC: General ioctl format
+ *
+ * The ioctl interface follows a general format to allow for extensibility. Each
+ * ioctl is passed in a structure pointer as the argument providing the size of
+ * the structure in the first u32. The kernel checks that any structure space
+ * beyond what it understands is 0. This allows userspace to use the backward
+ * compatible portion while consistently using the newer, larger, structures.
+ *
+ * ioctls use a standard meaning for common errnos:
+ *
+ *  - ENOTTY: The IOCTL number itself is not supported at all
+ *  - E2BIG: The IOCTL number is supported, but the provided structure has
+ *    non-zero in a part the kernel does not understand.
+ *  - EOPNOTSUPP: The IOCTL number is supported, and the structure is
+ *    understood, however a known field has a value the kernel does not
+ *    understand or support.
+ *  - EINVAL: Everything about the IOCTL was understood, but a field is not
+ *    correct.
+ *  - ENOENT: A provided token does not exist.
+ *  - ENOMEM: Out of memory.
+ *  - EOVERFLOW: Mathematics overflowed.
+ *
+ * As well as additional errnos, within specific ioctls.
+ */
+
+/* The ioctl type, documented in ioctl-number.rst */
+#define LIVEUPDATE_IOCTL_TYPE		0xBA
+
+#endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
index a973a54447de..90857dccb359 100644
--- a/kernel/liveupdate/Kconfig
+++ b/kernel/liveupdate/Kconfig
@@ -1,4 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (c) 2025, Google LLC.
+# Pasha Tatashin <pasha.tatashin@soleen.com>
+#
+# Live Update Orchestrator
+#
 
 menu "Live Update and Kexec HandOver"
 	depends on !DEFERRED_STRUCT_PAGE_INIT
@@ -51,4 +57,25 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT
 	  The default behavior can still be overridden at boot time by
 	  passing 'kho=off'.
 
+config LIVEUPDATE
+	bool "Live Update Orchestrator"
+	depends on KEXEC_HANDOVER
+	help
+	  Enable the Live Update Orchestrator. Live Update is a mechanism,
+	  typically based on kexec, that allows the kernel to be updated
+	  while keeping selected devices operational across the transition.
+	  These devices are intended to be reclaimed by the new kernel and
+	  re-attached to their original workload without requiring a device
+	  reset.
+
+	  Ability to handover a device from current to the next kernel depends
+	  on specific support within device drivers and related kernel
+	  subsystems.
+
+	  This feature primarily targets virtual machine hosts to quickly update
+	  the kernel hypervisor with minimal disruption to the running virtual
+	  machines.
+
+	  If unsure, say N.
+
 endmenu
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index f52ce1ebcf86..08954c1770c4 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -1,5 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
 
+luo-y :=								\
+		luo_core.o
+
 obj-$(CONFIG_KEXEC_HANDOVER)		+= kexec_handover.o
 obj-$(CONFIG_KEXEC_HANDOVER_DEBUG)	+= kexec_handover_debug.o
 obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS)	+= kexec_handover_debugfs.o
+
+obj-$(CONFIG_LIVEUPDATE)		+= luo.o
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
new file mode 100644
index 000000000000..30ad8836360b
--- /dev/null
+++ b/kernel/liveupdate/luo_core.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator (LUO)
+ *
+ * Live Update is a specialized, kexec-based reboot process that allows a
+ * running kernel to be updated from one version to another while preserving
+ * the state of selected resources and keeping designated hardware devices
+ * operational. For these devices, DMA activity may continue throughout the
+ * kernel transition.
+ *
+ * While the primary use case driving this work is supporting live updates of
+ * the Linux kernel when it is used as a hypervisor in cloud environments, the
+ * LUO framework itself is designed to be workload-agnostic. Live Update
+ * facilitates a full kernel version upgrade for any type of system.
+ *
+ * For example, a non-hypervisor system running an in-memory cache like
+ * memcached with many gigabytes of data can use LUO. The userspace service
+ * can place its cache into a memfd, have its state preserved by LUO, and
+ * restore it immediately after the kernel kexec.
+ *
+ * Whether the system is running virtual machines, containers, a
+ * high-performance database, or networking services, LUO's primary goal is to
+ * enable a full kernel update by preserving critical userspace state and
+ * keeping essential devices operational.
+ *
+ * The core of LUO is a mechanism that tracks the progress of a live update,
+ * along with a callback API that allows other kernel subsystems to participate
+ * in the process. Example subsystems that can hook into LUO include: kvm,
+ * iommu, interrupts, vfio, participating filesystems, and memory management.
+ *
+ * LUO uses Kexec Handover to transfer memory state from the current kernel to
+ * the next kernel. For more details see
+ * Documentation/core-api/kho/concepts.rst.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kobject.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+
+static struct {
+	bool enabled;
+} luo_global;
+
+static int __init early_liveupdate_param(char *buf)
+{
+	return kstrtobool(buf, &luo_global.enabled);
+}
+early_param("liveupdate", early_liveupdate_param);
+
+/* Public Functions */
+
+/**
+ * liveupdate_reboot() - Kernel reboot notifier for live update final
+ * serialization.
+ *
+ * This function is invoked directly from the reboot() syscall pathway
+ * if kexec is in progress.
+ *
+ * If any callback fails, this function aborts KHO, undoes the freeze()
+ * callbacks, and returns an error.
+ */
+int liveupdate_reboot(void)
+{
+	return 0;
+}
+
+/**
+ * liveupdate_enabled - Check if the live update feature is enabled.
+ *
+ * This function returns the state of the live update feature flag, which
+ * can be controlled via the ``liveupdate`` kernel command-line parameter.
+ *
+ * @return true if live update is enabled, false otherwise.
+ */
+bool liveupdate_enabled(void)
+{
+	return luo_global.enabled;
+}
+
+struct luo_device_state {
+	struct miscdevice miscdev;
+};
+
+static const struct file_operations luo_fops = {
+	.owner		= THIS_MODULE,
+};
+
+static struct luo_device_state luo_dev = {
+	.miscdev = {
+		.minor = MISC_DYNAMIC_MINOR,
+		.name  = "liveupdate",
+		.fops  = &luo_fops,
+	},
+};
+
+static int __init liveupdate_ioctl_init(void)
+{
+	if (!liveupdate_enabled())
+		return 0;
+
+	return misc_register(&luo_dev.miscdev);
+}
+late_initcall(liveupdate_ioctl_init);
-- 
2.52.0.rc2.455.g230fcf2819-goog


^ permalink raw reply related

* [PATCH v7 02/22] liveupdate: luo_core: integrate with KHO
From: Pasha Tatashin @ 2025-11-22 22:23 UTC (permalink / raw)
  To: pratyush, jasonmiu, graf, pasha.tatashin, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, linux, linux-kernel, linux-doc, linux-mm, gregkh, tglx,
	mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu,
	hughd, skhawaja, chrisl
In-Reply-To: <20251122222351.1059049-1-pasha.tatashin@soleen.com>

Integrate the LUO with the KHO framework to enable passing LUO state
across a kexec reboot.

This patch implements the lifecycle integration with KHO:

1. Incoming State: During early boot (`early_initcall`), LUO checks if
   KHO is active. If so, it retrieves the "LUO" subtree, verifies the
   "luo-v1" compatibility string, and reads the `liveupdate-number` to
   track the update count.

2. Outgoing State: During late initialization (`late_initcall`), LUO
   allocates a new FDT for the next kernel, populates it with the basic
   header (compatible string and incremented update number), and
   registers it with KHO (`kho_add_subtree`).

3. Finalization: The `liveupdate_reboot()` notifier is updated to invoke
   `kho_finalize()`. This ensures that all memory segments marked for
   preservation are properly serialized before the kexec jump.

LUO now depends on `CONFIG_KEXEC_HANDOVER`.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
 include/linux/kho/abi/luo.h      |  54 +++++++++++
 kernel/liveupdate/luo_core.c     | 154 ++++++++++++++++++++++++++++++-
 kernel/liveupdate/luo_internal.h |  22 +++++
 3 files changed, 229 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/kho/abi/luo.h
 create mode 100644 kernel/liveupdate/luo_internal.h

diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h
new file mode 100644
index 000000000000..8523b3ff82d1
--- /dev/null
+++ b/include/linux/kho/abi/luo.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator ABI
+ *
+ * This header defines the stable Application Binary Interface used by the
+ * Live Update Orchestrator to pass state from a pre-update kernel to a
+ * post-update kernel. The ABI is built upon the Kexec HandOver framework
+ * and uses a Flattened Device Tree to describe the preserved data.
+ *
+ * This interface is a contract. Any modification to the FDT structure, node
+ * properties, compatible strings, or the layout of the `__packed` serialization
+ * structures defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the relevant `_COMPATIBLE` string to
+ * prevent a new kernel from misinterpreting data from an old kernel.
+ *
+ * FDT Structure Overview:
+ *   The entire LUO state is encapsulated within a single KHO entry named "LUO".
+ *   This entry contains an FDT with the following layout:
+ *
+ *   .. code-block:: none
+ *
+ *     / {
+ *         compatible = "luo-v1";
+ *         liveupdate-number = <...>;
+ *     };
+ *
+ * Main LUO Node (/):
+ *
+ *   - compatible: "luo-v1"
+ *     Identifies the overall LUO ABI version.
+ *   - liveupdate-number: u64
+ *     A counter tracking the number of successful live updates performed.
+ */
+
+#ifndef _LINUX_KHO_ABI_LUO_H
+#define _LINUX_KHO_ABI_LUO_H
+
+/*
+ * The LUO FDT hooks all LUO state for sessions, fds, etc.
+ * In the root it also carries "liveupdate-number" 64-bit property that
+ * corresponds to the number of live-updates performed on this machine.
+ */
+#define LUO_FDT_SIZE		PAGE_SIZE
+#define LUO_FDT_KHO_ENTRY_NAME	"LUO"
+#define LUO_FDT_COMPATIBLE	"luo-v1"
+#define LUO_FDT_LIVEUPDATE_NUM	"liveupdate-number"
+
+#endif /* _LINUX_KHO_ABI_LUO_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 30ad8836360b..9f9fe9a81b29 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -41,12 +41,26 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
 #include <linux/kobject.h>
+#include <linux/libfdt.h>
 #include <linux/liveupdate.h>
 #include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+#include "kexec_handover_internal.h"
+#include "luo_internal.h"
 
 static struct {
 	bool enabled;
+	void *fdt_out;
+	void *fdt_in;
+	u64 liveupdate_num;
 } luo_global;
 
 static int __init early_liveupdate_param(char *buf)
@@ -55,6 +69,129 @@ static int __init early_liveupdate_param(char *buf)
 }
 early_param("liveupdate", early_liveupdate_param);
 
+static int __init luo_early_startup(void)
+{
+	phys_addr_t fdt_phys;
+	int err, ln_size;
+	const void *ptr;
+
+	if (!kho_is_enabled()) {
+		if (liveupdate_enabled())
+			pr_warn("Disabling liveupdate because KHO is disabled\n");
+		luo_global.enabled = false;
+		return 0;
+	}
+
+	/* Retrieve LUO subtree, and verify its format. */
+	err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys);
+	if (err) {
+		if (err != -ENOENT) {
+			pr_err("failed to retrieve FDT '%s' from KHO: %pe\n",
+			       LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err));
+			return err;
+		}
+
+		return 0;
+	}
+
+	luo_global.fdt_in = phys_to_virt(fdt_phys);
+	err = fdt_node_check_compatible(luo_global.fdt_in, 0,
+					LUO_FDT_COMPATIBLE);
+	if (err) {
+		pr_err("FDT '%s' is incompatible with '%s' [%d]\n",
+		       LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err);
+
+		return -EINVAL;
+	}
+
+	ln_size = 0;
+	ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM,
+			  &ln_size);
+	if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) {
+		pr_err("Unable to get live update number '%s' [%d]\n",
+		       LUO_FDT_LIVEUPDATE_NUM, ln_size);
+
+		return -EINVAL;
+	}
+
+	luo_global.liveupdate_num = get_unaligned((u64 *)ptr);
+	pr_info("Retrieved live update data, liveupdate number: %lld\n",
+		luo_global.liveupdate_num);
+
+	return 0;
+}
+
+static int __init liveupdate_early_init(void)
+{
+	int err;
+
+	err = luo_early_startup();
+	if (err) {
+		luo_global.enabled = false;
+		luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n",
+				 ERR_PTR(err));
+	}
+
+	return err;
+}
+early_initcall(liveupdate_early_init);
+
+/* Called during boot to create outgoing LUO fdt tree */
+static int __init luo_fdt_setup(void)
+{
+	const u64 ln = luo_global.liveupdate_num + 1;
+	void *fdt_out;
+	int err;
+
+	fdt_out = kho_alloc_preserve(LUO_FDT_SIZE);
+	if (IS_ERR(fdt_out)) {
+		pr_err("failed to allocate/preserve FDT memory\n");
+		return PTR_ERR(fdt_out);
+	}
+
+	err = fdt_create(fdt_out, LUO_FDT_SIZE);
+	err |= fdt_finish_reservemap(fdt_out);
+	err |= fdt_begin_node(fdt_out, "");
+	err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
+	err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+	err |= fdt_end_node(fdt_out);
+	err |= fdt_finish(fdt_out);
+	if (err)
+		goto exit_free;
+
+	err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out);
+	if (err)
+		goto exit_free;
+	luo_global.fdt_out = fdt_out;
+
+	return 0;
+
+exit_free:
+	kho_unpreserve_free(fdt_out);
+	pr_err("failed to prepare LUO FDT: %d\n", err);
+
+	return err;
+}
+
+/*
+ * late initcall because it initializes the outgoing tree that is needed only
+ * once userspace starts using /dev/liveupdate.
+ */
+static int __init luo_late_startup(void)
+{
+	int err;
+
+	if (!liveupdate_enabled())
+		return 0;
+
+	err = luo_fdt_setup();
+	if (err)
+		luo_global.enabled = false;
+
+	return err;
+}
+late_initcall(luo_late_startup);
+
 /* Public Functions */
 
 /**
@@ -69,7 +206,22 @@ early_param("liveupdate", early_liveupdate_param);
  */
 int liveupdate_reboot(void)
 {
-	return 0;
+	int err;
+
+	if (!liveupdate_enabled())
+		return 0;
+
+	err = kho_finalize();
+	if (err) {
+		pr_err("kho_finalize failed %d\n", err);
+		/*
+		 * kho_finalize() may return libfdt errors; to avoid passing
+		 * unknown errors to userspace, change this to -EAGAIN.
+		 */
+		err = -EAGAIN;
+	}
+
+	return err;
 }
 
 /**
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
new file mode 100644
index 000000000000..8612687b2000
--- /dev/null
+++ b/kernel/liveupdate/luo_internal.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _LINUX_LUO_INTERNAL_H
+#define _LINUX_LUO_INTERNAL_H
+
+#include <linux/liveupdate.h>
+
+/*
+ * Handles a deserialization failure: devices and memory are in an
+ * unpredictable state.
+ *
+ * Continuing the boot process after a failure is dangerous because it could
+ * lead to leaks of private data.
+ */
+#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
+
+#endif /* _LINUX_LUO_INTERNAL_H */
-- 
2.52.0.rc2.455.g230fcf2819-goog


^ permalink raw reply related

* [PATCH v7 03/22] kexec: call liveupdate_reboot() before kexec
From: Pasha Tatashin @ 2025-11-22 22:23 UTC (permalink / raw)
  To: pratyush, jasonmiu, graf, pasha.tatashin, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, linux, linux-kernel, linux-doc, linux-mm, gregkh, tglx,
	mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu,
	hughd, skhawaja, chrisl
In-Reply-To: <20251122222351.1059049-1-pasha.tatashin@soleen.com>

Modify kernel_kexec() to call liveupdate_reboot().

This ensures that the Live Update Orchestrator is notified just
before the kernel executes the kexec jump. The liveupdate_reboot()
function triggers the final freeze event, allowing participating
FDs to perform last-minute checks or state saving within the blackout
window.

If liveupdate_reboot() returns an error (indicating a failure during
LUO finalization), the kexec operation is aborted to prevent proceeding
with an inconsistent state. An error is returned to the user.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
---
 kernel/kexec_core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index a8890dd03a1d..3122235c225b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -15,6 +15,7 @@
 #include <linux/kexec.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
+#include <linux/liveupdate.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
 #include <linux/reboot.h>
@@ -1145,6 +1146,10 @@ int kernel_kexec(void)
 		goto Unlock;
 	}
 
+	error = liveupdate_reboot();
+	if (error)
+		goto Unlock;
+
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
 		/*
-- 
2.52.0.rc2.455.g230fcf2819-goog


^ permalink raw reply related

* [PATCH v7 04/22] liveupdate: luo_session: add sessions support
From: Pasha Tatashin @ 2025-11-22 22:23 UTC (permalink / raw)
  To: pratyush, jasonmiu, graf, pasha.tatashin, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, linux, linux-kernel, linux-doc, linux-mm, gregkh, tglx,
	mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu,
	hughd, skhawaja, chrisl
In-Reply-To: <20251122222351.1059049-1-pasha.tatashin@soleen.com>

Introduce the concept of "Live Update Sessions" within the LUO framework.
LUO sessions provide a mechanism to group and manage `struct file *`
instances (representing file descriptors) that need to be preserved
across a kexec-based live update.

Each session is identified by a unique name and acts as a container
for file objects whose state is critical to a userspace workload, such
as a virtual machine or a high-performance database, aiming to maintain
their functionality across a kernel transition.

This groundwork establishes the framework for preserving file-backed
state across kernel updates, with the actual file data preservation
mechanisms to be implemented in subsequent patches.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
 include/linux/kho/abi/luo.h      |  71 +++++
 include/uapi/linux/liveupdate.h  |   3 +
 kernel/liveupdate/Makefile       |   3 +-
 kernel/liveupdate/luo_core.c     |   9 +
 kernel/liveupdate/luo_internal.h |  29 ++
 kernel/liveupdate/luo_session.c  | 462 +++++++++++++++++++++++++++++++
 6 files changed, 576 insertions(+), 1 deletion(-)
 create mode 100644 kernel/liveupdate/luo_session.c

diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h
index 8523b3ff82d1..a2d2940eca6b 100644
--- a/include/linux/kho/abi/luo.h
+++ b/include/linux/kho/abi/luo.h
@@ -28,6 +28,11 @@
  *     / {
  *         compatible = "luo-v1";
  *         liveupdate-number = <...>;
+ *
+ *         luo-session {
+ *             compatible = "luo-session-v1";
+ *             luo-session-header = <phys_addr_of_session_header_ser>;
+ *         };
  *     };
  *
  * Main LUO Node (/):
@@ -36,11 +41,37 @@
  *     Identifies the overall LUO ABI version.
  *   - liveupdate-number: u64
  *     A counter tracking the number of successful live updates performed.
+ *
+ * Session Node (luo-session):
+ *   This node describes all preserved user-space sessions.
+ *
+ *   - compatible: "luo-session-v1"
+ *     Identifies the session ABI version.
+ *   - luo-session-header: u64
+ *     The physical address of a `struct luo_session_header_ser`. This structure
+ *     is the header for a contiguous block of memory containing an array of
+ *     `struct luo_session_ser`, one for each preserved session.
+ *
+ * Serialization Structures:
+ *   The FDT properties point to memory regions containing arrays of simple,
+ *   `__packed` structures. These structures contain the actual preserved state.
+ *
+ *   - struct luo_session_header_ser:
+ *     Header for the session array. Contains the number of
+ *     `struct luo_session_ser` entries that immediately follow it in the
+ *     preserved memory block.
+ *
+ *   - struct luo_session_ser:
+ *     Metadata for a single session, including its name and a physical pointer
+ *     to another preserved memory block containing an array of
+ *     `struct luo_file_ser` for all files in that session.
  */
 
 #ifndef _LINUX_KHO_ABI_LUO_H
 #define _LINUX_KHO_ABI_LUO_H
 
+#include <uapi/linux/liveupdate.h>
+
 /*
  * The LUO FDT hooks all LUO state for sessions, fds, etc.
  * In the root it also carries "liveupdate-number" 64-bit property that
@@ -51,4 +82,44 @@
 #define LUO_FDT_COMPATIBLE	"luo-v1"
 #define LUO_FDT_LIVEUPDATE_NUM	"liveupdate-number"
 
+/*
+ * LUO FDT session node
+ * LUO_FDT_SESSION_HEADER:  is a u64 physical address of struct
+ *                          luo_session_header_ser
+ */
+#define LUO_FDT_SESSION_NODE_NAME	"luo-session"
+#define LUO_FDT_SESSION_COMPATIBLE	"luo-session-v1"
+#define LUO_FDT_SESSION_HEADER		"luo-session-header"
+
+/**
+ * struct luo_session_header_ser - Header for the serialized session data block.
+ * @count: The number of `struct luo_session_ser` entries that immediately
+ *         follow this header in the memory block.
+ *
+ * This structure is located at the beginning of a contiguous block of
+ * physical memory preserved across the kexec. It provides the necessary
+ * metadata to interpret the array of session entries that follow.
+ *
+ * If this structure is modified, `LUO_FDT_SESSION_COMPATIBLE` must be updated.
+ */
+struct luo_session_header_ser {
+	u64 count;
+} __packed;
+
+/**
+ * struct luo_session_ser - Represents the serialized metadata for a LUO session.
+ * @name:         The unique name of the session, provided by the userspace at
+ *                the time of session creation.
+ *
+ * This structure is used to package session-specific metadata for transfer
+ * between kernels via Kexec Handover. An array of these structures (one per
+ * session) is created and passed to the new kernel, allowing it to reconstruct
+ * the session context.
+ *
+ * If this structure is modified, `LUO_FDT_SESSION_COMPATIBLE` must be updated.
+ */
+struct luo_session_ser {
+	char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+} __packed;
+
 #endif /* _LINUX_KHO_ABI_LUO_H */
diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
index df34c1642c4d..40578ae19668 100644
--- a/include/uapi/linux/liveupdate.h
+++ b/include/uapi/linux/liveupdate.h
@@ -43,4 +43,7 @@
 /* The ioctl type, documented in ioctl-number.rst */
 #define LIVEUPDATE_IOCTL_TYPE		0xBA
 
+/* The maximum length of session name including null termination */
+#define LIVEUPDATE_SESSION_NAME_LENGTH 64
+
 #endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index 08954c1770c4..6af93caa58cf 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
 luo-y :=								\
-		luo_core.o
+		luo_core.o						\
+		luo_session.o
 
 obj-$(CONFIG_KEXEC_HANDOVER)		+= kexec_handover.o
 obj-$(CONFIG_KEXEC_HANDOVER_DEBUG)	+= kexec_handover_debug.o
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 9f9fe9a81b29..a0f7788cd003 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -118,6 +118,10 @@ static int __init luo_early_startup(void)
 	pr_info("Retrieved live update data, liveupdate number: %lld\n",
 		luo_global.liveupdate_num);
 
+	err = luo_session_setup_incoming(luo_global.fdt_in);
+	if (err)
+		return err;
+
 	return 0;
 }
 
@@ -154,6 +158,7 @@ static int __init luo_fdt_setup(void)
 	err |= fdt_begin_node(fdt_out, "");
 	err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
 	err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+	err |= luo_session_setup_outgoing(fdt_out);
 	err |= fdt_end_node(fdt_out);
 	err |= fdt_finish(fdt_out);
 	if (err)
@@ -211,6 +216,10 @@ int liveupdate_reboot(void)
 	if (!liveupdate_enabled())
 		return 0;
 
+	err = luo_session_serialize();
+	if (err)
+		return err;
+
 	err = kho_finalize();
 	if (err) {
 		pr_err("kho_finalize failed %d\n", err);
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 8612687b2000..05ae91695ec6 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -19,4 +19,33 @@
  */
 #define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
 
+/**
+ * struct luo_session - Represents an active or incoming Live Update session.
+ * @name:       A unique name for this session, used for identification and
+ *              retrieval.
+ * @ser:        Pointer to the serialized data for this session.
+ * @list:       A list_head member used to link this session into a global list
+ *              of either outgoing (to be preserved) or incoming (restored from
+ *              previous kernel) sessions.
+ * @retrieved:  A boolean flag indicating whether this session has been
+ *              retrieved by a consumer in the new kernel.
+ * @mutex:      protects fields in the luo_session.
+ */
+struct luo_session {
+	char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+	struct luo_session_ser *ser;
+	struct list_head list;
+	bool retrieved;
+	struct mutex mutex;
+};
+
+int luo_session_create(const char *name, struct file **filep);
+int luo_session_retrieve(const char *name, struct file **filep);
+int __init luo_session_setup_outgoing(void *fdt);
+int __init luo_session_setup_incoming(void *fdt);
+int luo_session_serialize(void);
+int luo_session_deserialize(void);
+bool luo_session_quiesce(void);
+void luo_session_resume(void);
+
 #endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
new file mode 100644
index 000000000000..ee363d1b2b10
--- /dev/null
+++ b/kernel/liveupdate/luo_session.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO Sessions
+ *
+ * LUO Sessions provide the core mechanism for grouping and managing `struct
+ * file *` instances that need to be preserved across a kexec-based live
+ * update. Each session acts as a named container for a set of file objects,
+ * allowing a userspace agent to manage the lifecycle of resources critical to a
+ * workload.
+ *
+ * Core Concepts:
+ *
+ * - Named Containers: Sessions are identified by a unique, user-provided name,
+ *   which is used for both creation in the current kernel and retrieval in the
+ *   next kernel.
+ *
+ * - Userspace Interface: Session management is driven from userspace via
+ *   ioctls on /dev/liveupdate.
+ *
+ * - Serialization: Session metadata is preserved using the KHO framework. When
+ *   a live update is triggered via kexec, an array of `struct luo_session_ser`
+ *   is populated and placed in a preserved memory region. An FDT node is also
+ *   created, containing the physical address of the header that precedes this
+ *   array; the header holds the count of sessions.
+ *
+ * Session Lifecycle:
+ *
+ * 1.  Creation: A userspace agent calls `luo_session_create()` to create a
+ *     new, empty session and receives a file descriptor for it.
+ *
+ * 2.  Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is
+ *     made, `luo_session_serialize()` is called. It iterates through all
+ *     active sessions and writes their metadata into a memory area preserved
+ *     by KHO.
+ *
+ * 3.  Deserialization (in new kernel): After kexec, `luo_session_deserialize()`
+ *     runs, reading the serialized data and creating a list of `struct
+ *     luo_session` objects representing the preserved sessions.
+ *
+ * 4.  Retrieval: A userspace agent in the new kernel can then call
+ *     `luo_session_retrieve()` with a session name to get a new file
+ *     descriptor and access the preserved state.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/liveupdate.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+/* 16 4K pages, give space for 744 sessions */
+#define LUO_SESSION_PGCNT	16ul
+#define LUO_SESSION_MAX		(((LUO_SESSION_PGCNT << PAGE_SHIFT) -	\
+		sizeof(struct luo_session_header_ser)) /		\
+		sizeof(struct luo_session_ser))
+
+/**
+ * struct luo_session_header - Header struct for managing LUO sessions.
+ * @count:      The number of sessions currently tracked in the @list.
+ * @list:       The head of the linked list of `struct luo_session` instances.
+ * @rwsem:      A read-write semaphore providing synchronized access to the
+ *              session list and other fields in this structure.
+ * @header_ser: The header data of serialization array.
+ * @ser:        The serialized session data (an array of
+ *              `struct luo_session_ser`).
+ * @active:     Set to true when first initialized. If previous kernel did not
+ *              send session data, active stays false for incoming.
+ */
+struct luo_session_header {
+	long count;
+	struct list_head list;
+	struct rw_semaphore rwsem;
+	struct luo_session_header_ser *header_ser;
+	struct luo_session_ser *ser;
+	bool active;
+};
+
+/**
+ * struct luo_session_global - Global container for managing LUO sessions.
+ * @incoming:     The sessions passed from the previous kernel.
+ * @outgoing:     The sessions that are going to be passed to the next kernel.
+ */
+struct luo_session_global {
+	struct luo_session_header incoming;
+	struct luo_session_header outgoing;
+};
+
+static struct luo_session_global luo_session_global = {
+	.incoming = {
+		.list = LIST_HEAD_INIT(luo_session_global.incoming.list),
+		.rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem),
+	},
+	.outgoing = {
+		.list = LIST_HEAD_INIT(luo_session_global.outgoing.list),
+		.rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem),
+	},
+};
+
+static struct luo_session *luo_session_alloc(const char *name)
+{
+	struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL);
+
+	if (!session)
+		return ERR_PTR(-ENOMEM);
+
+	strscpy(session->name, name, sizeof(session->name));
+	INIT_LIST_HEAD(&session->list);
+	mutex_init(&session->mutex);
+
+	return session;
+}
+
+static void luo_session_free(struct luo_session *session)
+{
+	mutex_destroy(&session->mutex);
+	kfree(session);
+}
+
+static int luo_session_insert(struct luo_session_header *sh,
+			      struct luo_session *session)
+{
+	struct luo_session *it;
+
+	guard(rwsem_write)(&sh->rwsem);
+
+	/*
+	 * For outgoing we should make sure there is room in serialization array
+	 * for new session.
+	 */
+	if (sh == &luo_session_global.outgoing) {
+		if (sh->count == LUO_SESSION_MAX)
+			return -ENOMEM;
+	}
+
+	/*
+	 * For a small number of sessions this loop won't hurt performance,
+	 * but if we ever start using a lot of sessions, this might
+	 * become a bottleneck during deserialization, as it would
+	 * cause O(n*n) complexity.
+	 */
+	list_for_each_entry(it, &sh->list, list) {
+		if (!strncmp(it->name, session->name, sizeof(it->name)))
+			return -EEXIST;
+	}
+	list_add_tail(&session->list, &sh->list);
+	sh->count++;
+
+	return 0;
+}
+
+static void luo_session_remove(struct luo_session_header *sh,
+			       struct luo_session *session)
+{
+	guard(rwsem_write)(&sh->rwsem);
+	list_del(&session->list);
+	sh->count--;
+}
+
+static int luo_session_release(struct inode *inodep, struct file *filep)
+{
+	struct luo_session *session = filep->private_data;
+	struct luo_session_header *sh;
+
+	/* If retrieved is set, it means this session is from incoming list */
+	if (session->retrieved)
+		sh = &luo_session_global.incoming;
+	else
+		sh = &luo_session_global.outgoing;
+
+	luo_session_remove(sh, session);
+	luo_session_free(session);
+
+	return 0;
+}
+
+static const struct file_operations luo_session_fops = {
+	.owner = THIS_MODULE,
+	.release = luo_session_release,
+};
+
+/* Create a "struct file" for session */
+static int luo_session_getfile(struct luo_session *session, struct file **filep)
+{
+	char name_buf[128];
+	struct file *file;
+
+	lockdep_assert_held(&session->mutex);
+	snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name);
+	file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	*filep = file;
+
+	return 0;
+}
+
+int luo_session_create(const char *name, struct file **filep)
+{
+	struct luo_session *session;
+	int err;
+
+	session = luo_session_alloc(name);
+	if (IS_ERR(session))
+		return PTR_ERR(session);
+
+	err = luo_session_insert(&luo_session_global.outgoing, session);
+	if (err)
+		goto err_free;
+
+	scoped_guard(mutex, &session->mutex)
+		err = luo_session_getfile(session, filep);
+	if (err)
+		goto err_remove;
+
+	return 0;
+
+err_remove:
+	luo_session_remove(&luo_session_global.outgoing, session);
+err_free:
+	luo_session_free(session);
+
+	return err;
+}
+
+int luo_session_retrieve(const char *name, struct file **filep)
+{
+	struct luo_session_header *sh = &luo_session_global.incoming;
+	struct luo_session *session = NULL;
+	struct luo_session *it;
+	int err;
+
+	scoped_guard(rwsem_read, &sh->rwsem) {
+		list_for_each_entry(it, &sh->list, list) {
+			if (!strncmp(it->name, name, sizeof(it->name))) {
+				session = it;
+				break;
+			}
+		}
+	}
+
+	if (!session)
+		return -ENOENT;
+
+	guard(mutex)(&session->mutex);
+	if (session->retrieved)
+		return -EINVAL;
+
+	err = luo_session_getfile(session, filep);
+	if (!err)
+		session->retrieved = true;
+
+	return err;
+}
+
+int __init luo_session_setup_outgoing(void *fdt_out)
+{
+	struct luo_session_header_ser *header_ser;
+	void *outgoing_buffer;
+	u64 header_ser_pa;
+	int err;
+
+	outgoing_buffer = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT);
+	if (IS_ERR(outgoing_buffer))
+		return PTR_ERR(header_ser);
+	header_ser = outgoing_buffer;
+	header_ser_pa = virt_to_phys(header_ser);
+
+	err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME);
+	err |= fdt_property_string(fdt_out, "compatible",
+				   LUO_FDT_SESSION_COMPATIBLE);
+	err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa,
+			    sizeof(header_ser_pa));
+	err |= fdt_end_node(fdt_out);
+
+	if (err)
+		goto err_unpreserve;
+
+	luo_session_global.outgoing.header_ser = header_ser;
+	luo_session_global.outgoing.ser = (void *)(header_ser + 1);
+	luo_session_global.outgoing.active = true;
+
+	return 0;
+
+err_unpreserve:
+	kho_unpreserve_free(header_ser);
+	return err;
+}
+
+int __init luo_session_setup_incoming(void *fdt_in)
+{
+	struct luo_session_header_ser *header_ser;
+	int err, header_size, offset;
+	u64 header_ser_pa;
+	const void *ptr;
+
+	offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME);
+	if (offset < 0) {
+		pr_err("Unable to get session node: [%s]\n",
+		       LUO_FDT_SESSION_NODE_NAME);
+		return -EINVAL;
+	}
+
+	err = fdt_node_check_compatible(fdt_in, offset,
+					LUO_FDT_SESSION_COMPATIBLE);
+	if (err) {
+		pr_err("Session node incompatible [%s]\n",
+		       LUO_FDT_SESSION_COMPATIBLE);
+		return -EINVAL;
+	}
+
+	header_size = 0;
+	ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size);
+	if (!ptr || header_size != sizeof(u64)) {
+		pr_err("Unable to get session header '%s' [%d]\n",
+		       LUO_FDT_SESSION_HEADER, header_size);
+		return -EINVAL;
+	}
+
+	header_ser_pa = get_unaligned((u64 *)ptr);
+	header_ser = phys_to_virt(header_ser_pa);
+
+	luo_session_global.incoming.header_ser = header_ser;
+	luo_session_global.incoming.ser = (void *)(header_ser + 1);
+	luo_session_global.incoming.active = true;
+
+	return 0;
+}
+
+int luo_session_deserialize(void)
+{
+	struct luo_session_header *sh = &luo_session_global.incoming;
+	static bool is_deserialized;
+	static int err;
+
+	/* If already deserialized, always return the same error code */
+	if (is_deserialized)
+		return err;
+
+	is_deserialized = true;
+	if (!sh->active)
+		return 0;
+
+	/*
+	 * Note on error handling:
+	 *
+	 * If deserialization fails (e.g., allocation failure or corrupt data),
+	 * we intentionally skip cleanup of sessions that were already restored.
+	 *
+	 * A partial failure leaves the preserved state inconsistent.
+	 * Implementing a safe "undo" to unwind complex dependencies (sessions,
+	 * files, hardware state) is error-prone and provides little value, as
+	 * the system is effectively in a broken state.
+	 *
+	 * We treat these resources as leaked. The expected recovery path is for
+	 * userspace to detect the failure and trigger a reboot, which will
+	 * reliably reset devices and reclaim memory.
+	 */
+	for (int i = 0; i < sh->header_ser->count; i++) {
+		struct luo_session *session;
+
+		session = luo_session_alloc(sh->ser[i].name);
+		if (IS_ERR(session)) {
+			pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
+				sh->ser[i].name, session);
+			return PTR_ERR(session);
+		}
+
+		err = luo_session_insert(sh, session);
+		if (err) {
+			luo_session_free(session);
+			pr_warn("Failed to insert session [%s] %pe\n",
+				session->name, ERR_PTR(err));
+			return err;
+		}
+	}
+
+	kho_restore_free(sh->header_ser);
+	sh->header_ser = NULL;
+	sh->ser = NULL;
+
+	return 0;
+}
+
+int luo_session_serialize(void)
+{
+	struct luo_session_header *sh = &luo_session_global.outgoing;
+	struct luo_session *session;
+	int i = 0;
+
+	guard(rwsem_write)(&sh->rwsem);
+	list_for_each_entry(session, &sh->list, list) {
+		strscpy(sh->ser[i].name, session->name,
+			sizeof(sh->ser[i].name));
+		i++;
+	}
+	sh->header_ser->count = sh->count;
+
+	return 0;
+}
+
+/**
+ * luo_session_quiesce - Ensure no active sessions exist and lock session lists.
+ *
+ * Acquires exclusive write locks on both incoming and outgoing session lists.
+ * It then validates no sessions exist in either list.
+ *
+ * This mechanism is used during file handler un/registration to ensure that no
+ * sessions are currently using the handler, and no new sessions can be created
+ * while un/registration is in progress.
+ *
+ * Return:
+ * true  - System is quiescent (0 sessions) and locked.
+ * false - Active sessions exist. The locks are released internally.
+ */
+bool luo_session_quiesce(void)
+{
+	down_write(&luo_session_global.incoming.rwsem);
+	down_write(&luo_session_global.outgoing.rwsem);
+
+	if (luo_session_global.incoming.count ||
+	    luo_session_global.outgoing.count) {
+		up_write(&luo_session_global.outgoing.rwsem);
+		up_write(&luo_session_global.incoming.rwsem);
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * luo_session_resume - Unlock session lists and resume normal activity.
+ *
+ * Releases the exclusive locks acquired by a successful call to
+ * luo_session_quiesce().
+ */
+void luo_session_resume(void)
+{
+	up_write(&luo_session_global.outgoing.rwsem);
+	up_write(&luo_session_global.incoming.rwsem);
+}
-- 
2.52.0.rc2.455.g230fcf2819-goog


^ permalink raw reply related

* [PATCH v7 05/22] liveupdate: luo_core: add user interface
From: Pasha Tatashin @ 2025-11-22 22:23 UTC (permalink / raw)
  To: pratyush, jasonmiu, graf, pasha.tatashin, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, linux, linux-kernel, linux-doc, linux-mm, gregkh, tglx,
	mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu,
	hughd, skhawaja, chrisl
In-Reply-To: <20251122222351.1059049-1-pasha.tatashin@soleen.com>

Introduce the user-space interface for the Live Update Orchestrator
via ioctl commands, enabling external control over the live update
process and management of preserved resources.

The idea is that there is going to be a single userspace agent driving
the live update; therefore, only a single process can hold this
device open at a time.

The following ioctl commands are introduced:

LIVEUPDATE_IOCTL_CREATE_SESSION
Provides a way for userspace to create a named session for grouping file
descriptors that need to be preserved. It returns a new file descriptor
representing the session.

LIVEUPDATE_IOCTL_RETRIEVE_SESSION
Allows the userspace agent in the new kernel to reclaim a preserved
session by its name, receiving a new file descriptor to manage the
restored resources.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
 include/uapi/linux/liveupdate.h  |  64 +++++++++++
 kernel/liveupdate/luo_core.c     | 179 ++++++++++++++++++++++++++++++-
 kernel/liveupdate/luo_internal.h |  21 ++++
 3 files changed, 263 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
index 40578ae19668..1183cf984b5f 100644
--- a/include/uapi/linux/liveupdate.h
+++ b/include/uapi/linux/liveupdate.h
@@ -46,4 +46,68 @@
 /* The maximum length of session name including null termination */
 #define LIVEUPDATE_SESSION_NAME_LENGTH 64
 
+/* The /dev/liveupdate ioctl commands */
+enum {
+	LIVEUPDATE_CMD_BASE = 0x00,
+	LIVEUPDATE_CMD_CREATE_SESSION = LIVEUPDATE_CMD_BASE,
+	LIVEUPDATE_CMD_RETRIEVE_SESSION = 0x01,
+};
+
+/**
+ * struct liveupdate_ioctl_create_session - ioctl(LIVEUPDATE_IOCTL_CREATE_SESSION)
+ * @size:	Input; sizeof(struct liveupdate_ioctl_create_session)
+ * @fd:		Output; The new file descriptor for the created session.
+ * @name:	Input; A null-terminated string for the session name, max
+ *		length %LIVEUPDATE_SESSION_NAME_LENGTH including termination
+ *		character.
+ *
+ * Creates a new live update session for managing preserved resources.
+ * This ioctl can only be called on the main /dev/liveupdate device.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+struct liveupdate_ioctl_create_session {
+	__u32		size;
+	__s32		fd;
+	__u8		name[LIVEUPDATE_SESSION_NAME_LENGTH];
+};
+
+#define LIVEUPDATE_IOCTL_CREATE_SESSION					\
+	_IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_CREATE_SESSION)
+
+/**
+ * struct liveupdate_ioctl_retrieve_session - ioctl(LIVEUPDATE_IOCTL_RETRIEVE_SESSION)
+ * @size:    Input; sizeof(struct liveupdate_ioctl_retrieve_session)
+ * @fd:      Output; The new file descriptor for the retrieved session.
+ * @name:    Input; A null-terminated string identifying the session to retrieve.
+ *           The name must exactly match the name used when the session was
+ *           created in the previous kernel.
+ *
+ * Retrieves a handle (a new file descriptor) for a preserved session by its
+ * name. This is the primary mechanism for a userspace agent to regain control
+ * of its preserved resources after a live update.
+ *
+ * The userspace application provides the null-terminated `name` of a session
+ * it created before the live update. If a preserved session with a matching
+ * name is found, the kernel instantiates it and returns a new file descriptor
+ * in the `fd` field. This new session FD can then be used for all file-specific
+ * operations, such as restoring individual file descriptors with
+ * LIVEUPDATE_SESSION_RETRIEVE_FD.
+ *
+ * It is the responsibility of the userspace application to know the names of
+ * the sessions it needs to retrieve. If no session with the given name is
+ * found, the ioctl will fail with -ENOENT.
+ *
+ * This ioctl can only be called on the main /dev/liveupdate device when the
+ * system is in the LIVEUPDATE_STATE_UPDATED state.
+ */
+struct liveupdate_ioctl_retrieve_session {
+	__u32		size;
+	__s32		fd;
+	__u8		name[LIVEUPDATE_SESSION_NAME_LENGTH];
+};
+
+#define LIVEUPDATE_IOCTL_RETRIEVE_SESSION \
+	_IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_RETRIEVE_SESSION)
+
 #endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index a0f7788cd003..bc90954252a3 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -41,7 +41,13 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
 #include <linux/kexec_handover.h>
 #include <linux/kho/abi/luo.h>
 #include <linux/kobject.h>
@@ -52,7 +58,6 @@
 #include <linux/sizes.h>
 #include <linux/string.h>
 #include <linux/unaligned.h>
-
 #include "kexec_handover_internal.h"
 #include "luo_internal.h"
 
@@ -246,12 +251,183 @@ bool liveupdate_enabled(void)
 	return luo_global.enabled;
 }
 
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a character device, typically found at ``/dev/liveupdate``,
+ * which allows a userspace agent to manage the LUO state machine and its
+ * associated resources, such as preservable file descriptors.
+ *
+ * To ensure that the state machine is controlled by a single entity, access
+ * to this device is exclusive: only one process is permitted to have
+ * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will
+ * fail with -EBUSY until the first process closes its file descriptor.
+ * This singleton model simplifies state management by preventing conflicting
+ * commands from multiple userspace agents.
+ */
+
 struct luo_device_state {
 	struct miscdevice miscdev;
+	atomic_t in_use;
+};
+
+static int luo_ioctl_create_session(struct luo_ucmd *ucmd)
+{
+	struct liveupdate_ioctl_create_session *argp = ucmd->cmd;
+	struct file *file;
+	int err;
+
+	argp->fd = get_unused_fd_flags(O_CLOEXEC);
+	if (argp->fd < 0)
+		return argp->fd;
+
+	err = luo_session_create(argp->name, &file);
+	if (err)
+		goto err_put_fd;
+
+	err = luo_ucmd_respond(ucmd, sizeof(*argp));
+	if (err)
+		goto err_put_file;
+
+	fd_install(argp->fd, file);
+
+	return 0;
+
+err_put_file:
+	fput(file);
+err_put_fd:
+	put_unused_fd(argp->fd);
+
+	return err;
+}
+
+static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd)
+{
+	struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd;
+	struct file *file;
+	int err;
+
+	argp->fd = get_unused_fd_flags(O_CLOEXEC);
+	if (argp->fd < 0)
+		return argp->fd;
+
+	err = luo_session_retrieve(argp->name, &file);
+	if (err < 0)
+		goto err_put_fd;
+
+	err = luo_ucmd_respond(ucmd, sizeof(*argp));
+	if (err)
+		goto err_put_file;
+
+	fd_install(argp->fd, file);
+
+	return 0;
+
+err_put_file:
+	fput(file);
+err_put_fd:
+	put_unused_fd(argp->fd);
+
+	return err;
+}
+
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+	struct luo_device_state *ldev = container_of(filep->private_data,
+						     struct luo_device_state,
+						     miscdev);
+
+	if (atomic_cmpxchg(&ldev->in_use, 0, 1))
+		return -EBUSY;
+
+	/* Always return -EIO to user if deserialization fails */
+	if (luo_session_deserialize()) {
+		atomic_set(&ldev->in_use, 0);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int luo_release(struct inode *inodep, struct file *filep)
+{
+	struct luo_device_state *ldev = container_of(filep->private_data,
+						     struct luo_device_state,
+						     miscdev);
+	atomic_set(&ldev->in_use, 0);
+
+	return 0;
+}
+
+union ucmd_buffer {
+	struct liveupdate_ioctl_create_session create;
+	struct liveupdate_ioctl_retrieve_session retrieve;
+};
+
+struct luo_ioctl_op {
+	unsigned int size;
+	unsigned int min_size;
+	unsigned int ioctl_num;
+	int (*execute)(struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last)                                  \
+	[_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = {                            \
+		.size = sizeof(_struct) +                                      \
+			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) <          \
+					  sizeof(_struct)),                    \
+		.min_size = offsetofend(_struct, _last),                       \
+		.ioctl_num = _ioctl,                                           \
+		.execute = _fn,                                                \
+	}
+
+static const struct luo_ioctl_op luo_ioctl_ops[] = {
+	IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session,
+		 struct liveupdate_ioctl_create_session, name),
+	IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session,
+		 struct liveupdate_ioctl_retrieve_session, name),
 };
 
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	const struct luo_ioctl_op *op;
+	struct luo_ucmd ucmd = {};
+	union ucmd_buffer buf;
+	unsigned int nr;
+	int err;
+
+	nr = _IOC_NR(cmd);
+	if (nr < LIVEUPDATE_CMD_BASE ||
+	    (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) {
+		return -EINVAL;
+	}
+
+	ucmd.ubuffer = (void __user *)arg;
+	err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+	if (err)
+		return err;
+
+	op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE];
+	if (op->ioctl_num != cmd)
+		return -ENOIOCTLCMD;
+	if (ucmd.user_size < op->min_size)
+		return -EINVAL;
+
+	ucmd.cmd = &buf;
+	err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+				    ucmd.user_size);
+	if (err)
+		return err;
+
+	return op->execute(&ucmd);
+}
+
 static const struct file_operations luo_fops = {
 	.owner		= THIS_MODULE,
+	.open		= luo_open,
+	.release	= luo_release,
+	.unlocked_ioctl	= luo_ioctl,
 };
 
 static struct luo_device_state luo_dev = {
@@ -260,6 +436,7 @@ static struct luo_device_state luo_dev = {
 		.name  = "liveupdate",
 		.fops  = &luo_fops,
 	},
+	.in_use = ATOMIC_INIT(0),
 };
 
 static int __init liveupdate_ioctl_init(void)
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 05ae91695ec6..1292ac47eef8 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -9,6 +9,27 @@
 #define _LINUX_LUO_INTERNAL_H
 
 #include <linux/liveupdate.h>
+#include <linux/uaccess.h>
+
+struct luo_ucmd {
+	void __user *ubuffer;
+	u32 user_size;
+	void *cmd;
+};
+
+static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
+				   size_t kernel_cmd_size)
+{
+	/*
+	 * Copy the minimum of what the user provided and what we actually
+	 * have.
+	 */
+	if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
+			 min_t(size_t, ucmd->user_size, kernel_cmd_size))) {
+		return -EFAULT;
+	}
+	return 0;
+}
 
 /*
  * Handles a deserialization failure: devices and memory is in unpredictable
-- 
2.52.0.rc2.455.g230fcf2819-goog


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox