From: Pratyush Yadav <ptyadav@amazon.de>
To: <linux-kernel@vger.kernel.org>
Cc: Pratyush Yadav <ptyadav@amazon.de>,
Jonathan Corbet <corbet@lwn.net>,
"Eric Biederman" <ebiederm@xmission.com>,
Arnd Bergmann <arnd@arndb.de>,
"Greg Kroah-Hartman" <gregkh@linuxfoundation.org>,
Alexander Viro <viro@zeniv.linux.org.uk>,
Christian Brauner <brauner@kernel.org>, Jan Kara <jack@suse.cz>,
Hugh Dickins <hughd@google.com>, Alexander Graf <graf@amazon.com>,
Benjamin Herrenschmidt <benh@kernel.crashing.org>,
"David Woodhouse" <dwmw2@infradead.org>,
James Gowans <jgowans@amazon.com>,
"Mike Rapoport" <rppt@kernel.org>,
Paolo Bonzini <pbonzini@redhat.com>,
"Pasha Tatashin" <tatashin@google.com>,
Anthony Yznaga <anthony.yznaga@oracle.com>,
Dave Hansen <dave.hansen@intel.com>,
David Hildenbrand <david@redhat.com>,
Jason Gunthorpe <jgg@nvidia.com>,
Matthew Wilcox <willy@infradead.org>,
"Wei Yang" <richard.weiyang@gmail.com>,
Andrew Morton <akpm@linux-foundation.org>,
<linux-fsdevel@vger.kernel.org>, <linux-doc@vger.kernel.org>,
<linux-mm@kvack.org>, <kexec@lists.infradead.org>
Subject: [RFC PATCH 5/5] mm/memfd: allow preserving FD over FDBOX + KHO
Date: Fri, 7 Mar 2025 00:57:39 +0000 [thread overview]
Message-ID: <20250307005830.65293-6-ptyadav@amazon.de> (raw)
In-Reply-To: <20250307005830.65293-1-ptyadav@amazon.de>
For applications with a large amount of in-memory state that takes time to
rebuild, rebooting to pick up a kernel upgrade can be very expensive. FDBox
allows preserving file descriptors over kexec using KHO. Combining that
with memfd gives those applications reboot-persistent memory that they
can use to quickly save and reconstruct their state.
While a memfd can be backed by either hugetlbfs or shmem, this patch only
adds support for shmem. Allow saving and restoring shmem-backed memfds
over FDBOX + KHO.
The memfd FDT node itself does not contain much information. It just
creates a subnode and passes it over to shmem to do its thing. Similar
behaviour is followed on the restore side.
Since there are now two paths of getting a shmem file, refactor the file
setup into its own function called memfd_setup_file(). It sets up the
file flags, mode, etc., and sets fdbox ops if enabled.
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
---
mm/memfd.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 116 insertions(+), 12 deletions(-)
diff --git a/mm/memfd.c b/mm/memfd.c
index 37f7be57c2f50..1c32e66197f6d 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -7,6 +7,8 @@
* This file is released under the GPL.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
@@ -19,8 +21,12 @@
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
+#include <linux/fdbox.h>
+#include <linux/libfdt.h>
#include <uapi/linux/memfd.h>
+static const struct fdbox_file_ops memfd_fdbox_fops;
+
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
* so reuse a tag which we firmly believe is never set or cleared on tmpfs
@@ -418,21 +424,10 @@ static char *alloc_name(const char __user *uname)
return ERR_PTR(error);
}
-static struct file *alloc_file(const char *name, unsigned int flags)
+static void memfd_setup_file(struct file *file, unsigned int flags)
{
unsigned int *file_seals;
- struct file *file;
- if (flags & MFD_HUGETLB) {
- file = hugetlb_file_setup(name, 0, VM_NORESERVE,
- HUGETLB_ANONHUGE_INODE,
- (flags >> MFD_HUGE_SHIFT) &
- MFD_HUGE_MASK);
- } else {
- file = shmem_file_setup(name, 0, VM_NORESERVE);
- }
- if (IS_ERR(file))
- return file;
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
@@ -452,6 +447,27 @@ static struct file *alloc_file(const char *name, unsigned int flags)
*file_seals &= ~F_SEAL_SEAL;
}
+#if defined(CONFIG_FDBOX) && defined(CONFIG_KEXEC_HANDOVER)
+ file->f_fdbox_op = &memfd_fdbox_fops;
+#endif
+}
+
+/* Create a memfd's backing file (hugetlbfs or shmem) and run the common setup. */
+static struct file *alloc_file(const char *name, unsigned int flags)
+{
+	struct file *file;
+
+	if (!(flags & MFD_HUGETLB)) {
+		file = shmem_file_setup(name, 0, VM_NORESERVE);
+	} else {
+		file = hugetlb_file_setup(name, 0, VM_NORESERVE,
+				HUGETLB_ANONHUGE_INODE,
+				(flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK);
+	}
+	if (IS_ERR(file))
+		return file;
+
+	memfd_setup_file(file, flags);
 	return file;
 }
@@ -493,3 +509,91 @@ SYSCALL_DEFINE2(memfd_create,
kfree(name);
return error;
}
+
+#if defined(CONFIG_FDBOX) && defined(CONFIG_KEXEC_HANDOVER)
+static const char memfd_fdbox_compatible[] = "fdbox,memfd-v1";
+
+/*
+ * Rebuild a memfd from its KHO FDT node: check the compatible string, require
+ * exactly one storage subnode and let shmem recover it. NULL on any failure.
+ */
+static struct file *memfd_fdbox_kho_recover(const void *fdt, int offset)
+{
+	struct file *memfd;
+	int storage;
+
+	if (fdt_node_check_compatible(fdt, offset, memfd_fdbox_compatible)) {
+		pr_err("kho: invalid compatible\n");
+		return NULL;
+	}
+
+	storage = fdt_first_subnode(fdt, offset);
+	if (storage < 0) {
+		pr_err("kho: no subnode for underlying storage found!\n");
+		return NULL;
+	}
+	/* A sibling after the first subnode means the node is malformed. */
+	if (fdt_next_subnode(fdt, storage) >= 0) {
+		pr_err("kho: too many subnodes. Expected only 1.\n");
+		return NULL;
+	}
+
+	if (!is_node_shmem(fdt, storage))
+		return NULL;
+
+	memfd = shmem_fdbox_kho_recover(fdt, storage);
+	if (memfd)
+		memfd_setup_file(memfd, 0);
+	return memfd;
+}
+
+/*
+ * Write the memfd "compatible" property plus one unnamed subnode that the
+ * backing store fills in. libfdt failures become -EINVAL (libfdt codes are
+ * not errnos); a non-shmem file gets -EOPNOTSUPP.
+ */
+static int memfd_fdbox_kho_write(struct fdbox_fd *box_fd, void *fdt)
+{
+	int ret;
+
+	/* TODO: Track seals on the file as well. */
+	if (fdt_property(fdt, "compatible", memfd_fdbox_compatible,
+			 sizeof(memfd_fdbox_compatible)) ||
+	    fdt_begin_node(fdt, "")) {
+		pr_err("kho: failed to set up memfd node\n");
+		return -EINVAL;
+	}
+
+	/* TODO: HugeTLB support. */
+	if (!shmem_file(box_fd->file))
+		return -EOPNOTSUPP;
+
+	ret = shmem_fdbox_kho_write(box_fd, fdt);
+	if (ret)
+		return ret;
+
+	if (fdt_end_node(fdt)) {
+		pr_err("kho: failed to end memfd node!\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct fdbox_file_ops memfd_fdbox_fops = {
+	.kho_write = memfd_fdbox_kho_write,	/* writes the memfd FDT node */
+};
+
+/* Register the memfd recover handler with fdbox; a failure is only logged. */
+static int __init memfd_fdbox_init(void)
+{
+	int err = fdbox_register_handler(memfd_fdbox_compatible,
+					 memfd_fdbox_kho_recover);
+
+	if (err)
+		pr_err("Could not register fdbox handler: %d\n", err);
+
+	return 0;
+}
+late_initcall(memfd_fdbox_init);
+#endif /* CONFIG_FDBOX && CONFIG_KEXEC_HANDOVER */
--
2.47.1
prev parent reply other threads:[~2025-03-07 0:58 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-03-07 0:57 [RFC PATCH 0/5] Introduce FDBox, and preserve memfd with shmem over KHO Pratyush Yadav
2025-03-07 0:57 ` [RFC PATCH 1/5] misc: introduce FDBox Pratyush Yadav
2025-03-07 6:03 ` Greg Kroah-Hartman
2025-03-07 9:31 ` Christian Brauner
2025-03-07 13:19 ` Christian Brauner
2025-03-07 15:14 ` Jason Gunthorpe
2025-03-08 11:09 ` Christian Brauner
2025-03-17 16:46 ` Jason Gunthorpe
2025-03-08 0:10 ` Pratyush Yadav
2025-03-09 12:03 ` Christian Brauner
2025-03-17 16:59 ` Jason Gunthorpe
2025-03-18 14:25 ` Christian Brauner
2025-03-18 14:57 ` Jason Gunthorpe
2025-03-18 23:02 ` Pratyush Yadav
2025-03-18 23:27 ` Jason Gunthorpe
2025-03-19 13:35 ` Pratyush Yadav
2025-03-20 12:14 ` Jason Gunthorpe
2025-03-26 22:40 ` Pratyush Yadav
2025-03-31 15:38 ` Jason Gunthorpe
2025-03-07 0:57 ` [RFC PATCH 2/5] misc: add documentation for FDBox Pratyush Yadav
2025-03-07 2:19 ` Randy Dunlap
2025-03-07 15:03 ` Pratyush Yadav
2025-03-07 14:22 ` Jonathan Corbet
2025-03-07 14:51 ` Pratyush Yadav
2025-03-07 15:25 ` Jonathan Corbet
2025-03-07 23:28 ` Pratyush Yadav
2025-03-07 0:57 ` [RFC PATCH 3/5] mm: shmem: allow callers to specify operations to shmem_undo_range Pratyush Yadav
2025-03-07 0:57 ` [RFC PATCH 4/5] mm: shmem: allow preserving file over FDBOX + KHO Pratyush Yadav
2025-03-07 0:57 ` Pratyush Yadav [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250307005830.65293-6-ptyadav@amazon.de \
--to=ptyadav@amazon.de \
--cc=akpm@linux-foundation.org \
--cc=anthony.yznaga@oracle.com \
--cc=arnd@arndb.de \
--cc=benh@kernel.crashing.org \
--cc=brauner@kernel.org \
--cc=corbet@lwn.net \
--cc=dave.hansen@intel.com \
--cc=david@redhat.com \
--cc=dwmw2@infradead.org \
--cc=ebiederm@xmission.com \
--cc=graf@amazon.com \
--cc=gregkh@linuxfoundation.org \
--cc=hughd@google.com \
--cc=jack@suse.cz \
--cc=jgg@nvidia.com \
--cc=jgowans@amazon.com \
--cc=kexec@lists.infradead.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=pbonzini@redhat.com \
--cc=richard.weiyang@gmail.com \
--cc=rppt@kernel.org \
--cc=tatashin@google.com \
--cc=viro@zeniv.linux.org.uk \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).