linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Pratyush Yadav <pratyush@kernel.org>
To: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: pratyush@kernel.org,  jasonmiu@google.com,  graf@amazon.com,
	changyuanl@google.com,  rppt@kernel.org,  dmatlack@google.com,
	rientjes@google.com,  corbet@lwn.net,  rdunlap@infradead.org,
	ilpo.jarvinen@linux.intel.com,  kanie@linux.alibaba.com,
	ojeda@kernel.org,  aliceryhl@google.com,  masahiroy@kernel.org,
	akpm@linux-foundation.org,  tj@kernel.org,
	 yoann.congal@smile.fr, mmaurer@google.com,
	 roman.gushchin@linux.dev,  chenridong@huawei.com,
	axboe@kernel.dk,  mark.rutland@arm.com,  jannh@google.com,
	vincent.guittot@linaro.org,  hannes@cmpxchg.org,
	dan.j.williams@intel.com,  david@redhat.com,
	 joel.granados@kernel.org, rostedt@goodmis.org,
	 anna.schumaker@oracle.com,  song@kernel.org,
	zhangguopeng@kylinos.cn,  linux@weissschuh.net,
	linux-kernel@vger.kernel.org,  linux-doc@vger.kernel.org,
	linux-mm@kvack.org,  gregkh@linuxfoundation.org,
	 tglx@linutronix.de, mingo@redhat.com,  bp@alien8.de,
	 dave.hansen@linux.intel.com, x86@kernel.org,  hpa@zytor.com,
	 rafael@kernel.org,  dakr@kernel.org,
	bartosz.golaszewski@linaro.org,  cw00.choi@samsung.com,
	myungjoo.ham@samsung.com,  yesanishhere@gmail.com,
	Jonathan.Cameron@huawei.com,  quic_zijuhu@quicinc.com,
	aleksander.lobakin@intel.com,  ira.weiny@intel.com,
	andriy.shevchenko@linux.intel.com,  leon@kernel.org,
	 lukas@wunner.de, bhelgaas@google.com,  wagi@kernel.org,
	 djeffery@redhat.com, stuart.w.hayes@gmail.com
Subject: Re: [RFC v2 08/16] luo: luo_files: add infrastructure for FDs
Date: Thu, 05 Jun 2025 17:56:43 +0200	[thread overview]
Message-ID: <mafs034cetc5g.fsf@kernel.org> (raw)
In-Reply-To: <20250515182322.117840-9-pasha.tatashin@soleen.com>

On Thu, May 15 2025, Pasha Tatashin wrote:

> Introduce the framework within LUO to support preserving specific types
> of file descriptors across a live update transition. This allows
> stateful FDs (like memfds or vfio FDs used by VMs) to be recreated in
> the new kernel.
>
> Note: The core logic for iterating through the luo_files_list and
> invoking the handler callbacks (prepare, freeze, cancel, finish)
> within luo_do_files_*_calls, as well as managing the u64 data
> persistence via the FDT for individual files, is currently implemented
> as stubs in this patch. This patch sets up the registration, FDT layout,
> and retrieval framework.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> ---
>  drivers/misc/liveupdate/Makefile       |   1 +
>  drivers/misc/liveupdate/luo_core.c     |  19 +
>  drivers/misc/liveupdate/luo_files.c    | 563 +++++++++++++++++++++++++
>  drivers/misc/liveupdate/luo_internal.h |  11 +
>  include/linux/liveupdate.h             |  62 +++
>  5 files changed, 656 insertions(+)
>  create mode 100644 drivers/misc/liveupdate/luo_files.c
>
> diff --git a/drivers/misc/liveupdate/Makefile b/drivers/misc/liveupdate/Makefile
> index df1c9709ba4f..b4cdd162574f 100644
> --- a/drivers/misc/liveupdate/Makefile
> +++ b/drivers/misc/liveupdate/Makefile
> @@ -1,3 +1,4 @@
>  # SPDX-License-Identifier: GPL-2.0
>  obj-y					+= luo_core.o
> +obj-y					+= luo_files.o
>  obj-y					+= luo_subsystems.o
[...]
> diff --git a/drivers/misc/liveupdate/luo_files.c b/drivers/misc/liveupdate/luo_files.c
> new file mode 100644
> index 000000000000..953fc40db3d7
> --- /dev/null
> +++ b/drivers/misc/liveupdate/luo_files.c
> @@ -0,0 +1,563 @@
[...]
> +struct luo_file {
> +	struct liveupdate_filesystem *fs;
> +	struct file *file;
> +	u64 private_data;
> +	bool reclaimed;
> +	enum liveupdate_state state;
> +	struct mutex mutex;
> +};
> +
> +/**
> + * luo_files_startup - Validates the LUO file-descriptors FDT node at startup.
> + * @fdt: Pointer to the LUO FDT blob passed from the previous kernel.
> + *
> + * This __init function checks the existence and validity of the
> + * '/file-descriptors' node in the FDT. This node is considered mandatory. It

Why is it mandatory? Can't a user just preserve some subsystems, and no
FDs?

> + * calls panic() if the node is missing, inaccessible, or invalid (e.g., missing
> + * compatible, wrong compatible string), indicating a critical configuration
> + * error for LUO.
> + */
> +void __init luo_files_startup(void *fdt)
> +{
> +	int ret, node_offset;
> +
> +	node_offset = fdt_subnode_offset(fdt, 0, LUO_FILES_NODE_NAME);
> +	if (node_offset < 0)
> +		panic("Failed to find /file-descriptors node\n");
> +
> +	ret = fdt_node_check_compatible(fdt, node_offset,
> +					LUO_FILES_COMPATIBLE);
> +	if (ret) {
> +		panic("FDT '%s' is incompatible with '%s' [%d]\n",
> +		      LUO_FILES_NODE_NAME, LUO_FILES_COMPATIBLE, ret);
> +	}
> +	luo_fdt_in = fdt;
> +}
> +
> +static void luo_files_recreate_luo_files_xa_in(void)
> +{
> +	int parent_node_offset, file_node_offset;
> +	const char *node_name, *fdt_compat_str;
> +	struct liveupdate_filesystem *fs;
> +	struct luo_file *luo_file;
> +	const void *data_ptr;
> +	int ret = 0;
> +
> +	if (luo_files_xa_in_recreated || !luo_fdt_in)
> +		return;
> +
> +	/* Take write in order to gurantee that we re-create list once */

Typo: s/gurantee/guarantee

> +	down_write(&luo_filesystems_list_rwsem);
> +	if (luo_files_xa_in_recreated)
> +		goto exit_unlock;
> +
> +	parent_node_offset = fdt_subnode_offset(luo_fdt_in, 0,
> +						LUO_FILES_NODE_NAME);
> +
> +	fdt_for_each_subnode(file_node_offset, luo_fdt_in, parent_node_offset) {
> +		bool handler_found = false;
> +		u64 token;
> +
> +		node_name = fdt_get_name(luo_fdt_in, file_node_offset, NULL);
> +		if (!node_name) {
> +			panic("Skipping FDT subnode at offset %d: Cannot get name\n",
> +			      file_node_offset);

Should failure to parse a specific FD really be a panic? Wouldn't it be
better to continue and let userspace decide if it can live with the FD
missing?

> +		}
> +
> +		ret = kstrtou64(node_name, 0, &token);
> +		if (ret < 0) {
> +			panic("Skipping FDT node '%s': Failed to parse token\n",
> +			      node_name);
> +		}
> +
> +		fdt_compat_str = fdt_getprop(luo_fdt_in, file_node_offset,
> +					     "compatible", NULL);
> +		if (!fdt_compat_str) {
> +			panic("Skipping FDT node '%s': Missing 'compatible' property\n",
> +			      node_name);
> +		}
> +
> +		data_ptr = fdt_getprop(luo_fdt_in, file_node_offset, "data",
> +				       NULL);
> +		if (!data_ptr) {
> +			panic("Can't recover property 'data' for FDT node '%s'\n",
> +			      node_name);
> +		}
> +
> +		list_for_each_entry(fs, &luo_filesystems_list, list) {
> +			if (!strcmp(fs->compatible, fdt_compat_str)) {
> +				handler_found = true;
> +				break;
> +			}
> +		}
> +
> +		if (!handler_found) {
> +			panic("Skipping FDT node '%s': No registered handler for compatible '%s'\n",
> +			      node_name, fdt_compat_str);

Thinking out loud here: this means that by the time of first retrieval,
all file systems must be registered. Since this is called from
luo_do_files_finish_calls() or luo_retrieve_file(), it will come from
userspace, so all built in modules would be initialized by then. But
some loadable module might not be. I don't see much of a use case for
loadable modules to participate in LUO, so I don't think it should be a
problem.

> +		}
> +
> +		luo_file = kmalloc(sizeof(*luo_file),
> +				   GFP_KERNEL | __GFP_NOFAIL);
> +		luo_file->fs = fs;
> +		luo_file->file = NULL;
> +		memcpy(&luo_file->private_data, data_ptr, sizeof(u64));

Why not make sure data_ptr is exactly sizeof(u64) when we parse it, and
then simply do luo_file->private_data = (u64)*data_ptr ?

Because if the previous kernel wrote more than a u64 in data, then
something is broken and we should catch that error anyway.

> +		luo_file->reclaimed = false;
> +		mutex_init(&luo_file->mutex);
> +		luo_file->state = LIVEUPDATE_STATE_UPDATED;
> +		ret = xa_err(xa_store(&luo_files_xa_in, token, luo_file,
> +				      GFP_KERNEL | __GFP_NOFAIL));

Should you also check if something is already at token's slot, in case
previous kernel generated wrong tokens or FDT is broken?

> +		if (ret < 0) {
> +			panic("Failed to store luo_file for token %llu in XArray: %d\n",
> +			      token, ret);
> +		}
> +	}
> +	luo_files_xa_in_recreated = true;
> +
> +exit_unlock:
> +	up_write(&luo_filesystems_list_rwsem);
> +}
> +
[...]
> diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
> index 7a130680b5f2..7afe0aac5ce4 100644
> --- a/include/linux/liveupdate.h
> +++ b/include/linux/liveupdate.h
> @@ -86,6 +86,55 @@ enum liveupdate_state  {
>  	LIVEUPDATE_STATE_UPDATED = 3,
>  };
>  
> +/* Forward declaration needed if definition isn't included */
> +struct file;
> +
> +/**
> + * struct liveupdate_filesystem - Represents a handler for a live-updatable
> + * filesystem/file type.
> + * @prepare:       Optional. Saves state for a specific file instance (@file,
> + *                 @arg) before update, potentially returning value via @data.
> + *                 Returns 0 on success, negative errno on failure.
> + * @freeze:        Optional. Performs final actions just before kernel
> + *                 transition, potentially reading/updating the handle via
> + *                 @data.
> + *                 Returns 0 on success, negative errno on failure.
> + * @cancel:        Optional. Cleans up state/resources if update is aborted
> + *                 after prepare/freeze succeeded, using the @data handle (by
> + *                 value) from the successful prepare. Returns void.
> + * @finish:        Optional. Performs final cleanup in the new kernel using the
> + *                 preserved @data handle (by value). Returns void.
> + * @retrieve:      Retrieve the preserved file. Must be called before finish.
> + * @can_preserve:  callback to determine if @file with associated context (@arg)
> + *                 can be preserved by this handler.
> + *                 Return bool (true if preservable, false otherwise).
> + * @compatible:    The compatibility string (e.g., "memfd-v1", "vfiofd-v1")
> + *                 that uniquely identifies the filesystem or file type this
> + *                 handler supports. This is matched against the compatible
> + *                 string associated with individual &struct liveupdate_file
> + *                 instances.
> + * @arg:           An opaque pointer to implementation-specific context data
> + *                 associated with this filesystem handler registration.
> + * @list:          used for linking this handler instance into a global list of
> + *                 registered filesystem handlers.
> + *
> + * Modules that want to support live update for specific file types should
> + * register an instance of this structure. LUO uses this registration to
> + * determine if a given file can be preserved and to find the appropriate
> + * operations to manage its state across the update.
> + */
> +struct liveupdate_filesystem {
> +	int (*prepare)(struct file *file, void *arg, u64 *data);
> +	int (*freeze)(struct file *file, void *arg, u64 *data);
> +	void (*cancel)(struct file *file, void *arg, u64 data);
> +	void (*finish)(struct file *file, void *arg, u64 data, bool reclaimed);
> +	int (*retrieve)(void *arg, u64 data, struct file **file);
> +	bool (*can_preserve)(struct file *file, void *arg);
> +	const char *compatible;
> +	void *arg;

What is the use for this arg? I would expect one file type/system to
register one set of handlers. So they can keep their arg in a global in
their code. I don't see why a per-filesystem arg is needed.

What I do think is needed is a per-file arg. Each callback gets 'data',
which is the serialized data, but there is no place to store runtime
state, like some flags or serialization metadata. Sure, you could make
place for it somewhere in the inode, but I think it would be a lot
cleaner to be able to store it in struct luo_file.

So perhaps rename private_data in struct luo_file to say
serialized_data, and have a field called "private" that filesystems can
use for their runtime state?

Same suggestion for subsystems as well.

> +	struct list_head list;
> +};
> +
>  /**
>   * struct liveupdate_subsystem - Represents a subsystem participating in LUO
>   * @prepare:      Optional. Called during LUO prepare phase. Should perform
[...]

-- 
Regards,
Pratyush Yadav

  parent reply	other threads:[~2025-06-05 15:56 UTC|newest]

Thread overview: 103+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-15 18:23 [RFC v2 00/16] Live Update Orchestrator Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 01/16] kho: make debugfs interface optional Pasha Tatashin
2025-06-04 16:03   ` Pratyush Yadav
2025-06-06 16:12     ` Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 02/16] kho: allow to drive kho from within kernel Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 03/16] kho: add kho_unpreserve_folio/phys Pasha Tatashin
2025-06-04 15:00   ` Pratyush Yadav
2025-06-06 16:22     ` Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 04/16] luo: luo_core: Live Update Orchestrator Pasha Tatashin
2025-05-26  6:31   ` Mike Rapoport
2025-05-30  5:00     ` Pasha Tatashin
2025-06-04 15:17   ` Pratyush Yadav
2025-06-07 17:11     ` Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 05/16] luo: luo_core: integrate with KHO Pasha Tatashin
2025-05-26  7:18   ` Mike Rapoport
2025-06-07 17:50     ` Pasha Tatashin
2025-06-09  2:14       ` Pasha Tatashin
2025-06-04 16:00   ` Pratyush Yadav
2025-06-07 23:30     ` Pasha Tatashin
2025-06-13 14:58       ` Pratyush Yadav
2025-06-17 15:23         ` Jason Gunthorpe
2025-06-17 19:32           ` Pasha Tatashin
2025-06-18 13:11             ` Pratyush Yadav
2025-06-18 14:48               ` Pasha Tatashin
2025-06-18 16:40                 ` Mike Rapoport
2025-06-18 17:00                   ` Pasha Tatashin
2025-06-18 17:43                     ` Pasha Tatashin
2025-06-19 12:00                       ` Mike Rapoport
2025-06-19 14:22                         ` Pasha Tatashin
2025-06-20 15:28                           ` Pratyush Yadav
2025-06-20 16:03                             ` Pasha Tatashin
2025-06-24 16:12                               ` Pratyush Yadav
2025-06-24 16:55                                 ` Pasha Tatashin
2025-06-24 18:31                                 ` Jason Gunthorpe
2025-06-23  7:32                       ` Mike Rapoport
2025-06-23 11:29                         ` Pasha Tatashin
2025-06-25 13:46                           ` Mike Rapoport
2025-05-15 18:23 ` [RFC v2 06/16] luo: luo_subsystems: add subsystem registration Pasha Tatashin
2025-05-26  7:31   ` Mike Rapoport
2025-06-07 23:42     ` Pasha Tatashin
2025-05-28 19:12   ` David Matlack
2025-06-07 23:58     ` Pasha Tatashin
2025-06-04 16:30   ` Pratyush Yadav
2025-06-08  0:04     ` Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 07/16] luo: luo_subsystems: implement subsystem callbacks Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 08/16] luo: luo_files: add infrastructure for FDs Pasha Tatashin
2025-05-15 23:15   ` James Houghton
2025-05-23 18:09     ` Pasha Tatashin
2025-05-26  7:55   ` Mike Rapoport
2025-06-05 11:56     ` Pratyush Yadav
2025-06-08 13:13     ` Pasha Tatashin
2025-06-05 15:56   ` Pratyush Yadav [this message]
2025-06-08 13:37     ` Pasha Tatashin
2025-06-13 15:27       ` Pratyush Yadav
2025-06-15 18:02         ` Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 09/16] luo: luo_files: implement file systems callbacks Pasha Tatashin
2025-06-05 16:03   ` Pratyush Yadav
2025-06-08 13:49     ` Pasha Tatashin
2025-06-13 15:18       ` Pratyush Yadav
2025-06-13 20:26         ` Pasha Tatashin
2025-06-16 10:43           ` Pratyush Yadav
2025-06-16 14:57             ` Pasha Tatashin
2025-06-18 13:16               ` Pratyush Yadav
2025-05-15 18:23 ` [RFC v2 10/16] luo: luo_ioctl: add ioctl interface Pasha Tatashin
2025-05-26  8:42   ` Mike Rapoport
2025-06-08 15:08     ` Pasha Tatashin
2025-05-28 20:29   ` David Matlack
2025-06-08 16:32     ` Pasha Tatashin
2025-06-05 16:15   ` Pratyush Yadav
2025-06-08 16:35     ` Pasha Tatashin
2025-06-24  9:50   ` Christian Brauner
2025-06-24 14:27     ` Pasha Tatashin
2025-06-25  9:36       ` Christian Brauner
2025-06-25 16:12         ` David Matlack
2025-06-26 15:42           ` Pratyush Yadav
2025-06-26 16:24             ` David Matlack
2025-07-14 14:56               ` Pratyush Yadav
2025-07-17 16:17                 ` David Matlack
2025-07-23 14:51                   ` Pratyush Yadav
2025-07-06 14:33             ` Mike Rapoport
2025-07-07 12:56               ` Jason Gunthorpe
2025-06-25 16:58         ` pasha.tatashin
2025-07-06 14:24     ` Mike Rapoport
2025-07-09 21:27       ` Pratyush Yadav
2025-07-10  7:26         ` Mike Rapoport
2025-07-14 14:34           ` Jason Gunthorpe
2025-07-16  9:43             ` Greg KH
2025-05-15 18:23 ` [RFC v2 11/16] luo: luo_sysfs: add sysfs state monitoring Pasha Tatashin
2025-06-05 16:20   ` Pratyush Yadav
2025-06-08 16:36     ` Pasha Tatashin
2025-06-13 15:13       ` Pratyush Yadav
2025-05-15 18:23 ` [RFC v2 12/16] reboot: call liveupdate_reboot() before kexec Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 13/16] luo: add selftests for subsystems un/registration Pasha Tatashin
2025-05-26  8:52   ` Mike Rapoport
2025-06-08 16:47     ` Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 14/16] selftests/liveupdate: add subsystem/state tests Pasha Tatashin
2025-05-15 18:23 ` [RFC v2 15/16] docs: add luo documentation Pasha Tatashin
2025-05-26  9:00   ` Mike Rapoport
2025-05-15 18:23 ` [RFC v2 16/16] MAINTAINERS: add liveupdate entry Pasha Tatashin
2025-05-20  7:25 ` [RFC v2 00/16] Live Update Orchestrator Mike Rapoport
2025-05-23 18:07   ` Pasha Tatashin
2025-05-26  6:32 ` Mike Rapoport
     [not found] <CA+Sjx0XP2A+rsA95NVwX7czQGNREXcFT=psedA7fawwKJb5rkw@mail.gmail.com>
2025-06-08  0:07 ` [RFC v2 08/16] luo: luo_files: add infrastructure for FDs Pasha Tatashin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=mafs034cetc5g.fsf@kernel.org \
    --to=pratyush@kernel.org \
    --cc=Jonathan.Cameron@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=aleksander.lobakin@intel.com \
    --cc=aliceryhl@google.com \
    --cc=andriy.shevchenko@linux.intel.com \
    --cc=anna.schumaker@oracle.com \
    --cc=axboe@kernel.dk \
    --cc=bartosz.golaszewski@linaro.org \
    --cc=bhelgaas@google.com \
    --cc=bp@alien8.de \
    --cc=changyuanl@google.com \
    --cc=chenridong@huawei.com \
    --cc=corbet@lwn.net \
    --cc=cw00.choi@samsung.com \
    --cc=dakr@kernel.org \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=djeffery@redhat.com \
    --cc=dmatlack@google.com \
    --cc=graf@amazon.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=hannes@cmpxchg.org \
    --cc=hpa@zytor.com \
    --cc=ilpo.jarvinen@linux.intel.com \
    --cc=ira.weiny@intel.com \
    --cc=jannh@google.com \
    --cc=jasonmiu@google.com \
    --cc=joel.granados@kernel.org \
    --cc=kanie@linux.alibaba.com \
    --cc=leon@kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux@weissschuh.net \
    --cc=lukas@wunner.de \
    --cc=mark.rutland@arm.com \
    --cc=masahiroy@kernel.org \
    --cc=mingo@redhat.com \
    --cc=mmaurer@google.com \
    --cc=myungjoo.ham@samsung.com \
    --cc=ojeda@kernel.org \
    --cc=pasha.tatashin@soleen.com \
    --cc=quic_zijuhu@quicinc.com \
    --cc=rafael@kernel.org \
    --cc=rdunlap@infradead.org \
    --cc=rientjes@google.com \
    --cc=roman.gushchin@linux.dev \
    --cc=rostedt@goodmis.org \
    --cc=rppt@kernel.org \
    --cc=song@kernel.org \
    --cc=stuart.w.hayes@gmail.com \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    --cc=vincent.guittot@linaro.org \
    --cc=wagi@kernel.org \
    --cc=x86@kernel.org \
    --cc=yesanishhere@gmail.com \
    --cc=yoann.congal@smile.fr \
    --cc=zhangguopeng@kylinos.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).