From: James Gowans <jgowans@amazon.com>
To: <linux-kernel@vger.kernel.org>
Cc: James Gowans <jgowans@amazon.com>,
Sean Christopherson <seanjc@google.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Alexander Viro <viro@zeniv.linux.org.uk>,
Steve Sistare <steven.sistare@oracle.com>,
Christian Brauner <brauner@kernel.org>, Jan Kara <jack@suse.cz>,
"Anthony Yznaga" <anthony.yznaga@oracle.com>,
Mike Rapoport <rppt@kernel.org>,
"Andrew Morton" <akpm@linux-foundation.org>, <linux-mm@kvack.org>,
Jason Gunthorpe <jgg@ziepe.ca>, <linux-fsdevel@vger.kernel.org>,
Usama Arif <usama.arif@bytedance.com>, <kvm@vger.kernel.org>,
Alexander Graf <graf@amazon.com>,
David Woodhouse <dwmw@amazon.co.uk>,
Paul Durrant <pdurrant@amazon.co.uk>,
Nicolas Saenz Julienne <nsaenz@amazon.es>
Subject: [PATCH 07/10] guestmemfs: Persist filesystem metadata via KHO
Date: Mon, 5 Aug 2024 11:32:42 +0200 [thread overview]
Message-ID: <20240805093245.889357-8-jgowans@amazon.com> (raw)
In-Reply-To: <20240805093245.889357-1-jgowans@amazon.com>
Filesystem metadata consists of: physical memory extents, superblock,
inodes block and allocation bitmap. Here serialisation and
deserialisation of all of these is done via the KHO framework.
A serialisation callback is added which is run when KHO activate is
triggered. This creates the device tree blob for the metadata and marks
the memory as persistent via struct kho_mem(s).
When the filesystem is mounted it attempts to re-hydrate metadata from
KHO. Only if this fails (on first boot, for example) does it allocate
fresh metadata pages.
The private data struct is switched from holding a reference to the
persistent superblock to now referencing the regular struct super_block.
This is necessary for the serialisation code. Better would be to be able
to define callback private data, if that were possible.
Signed-off-by: James Gowans <jgowans@amazon.com>
---
fs/guestmemfs/Makefile | 2 +
fs/guestmemfs/guestmemfs.c | 72 ++++++---
fs/guestmemfs/guestmemfs.h | 8 +
fs/guestmemfs/serialise.c | 296 +++++++++++++++++++++++++++++++++++++
4 files changed, 355 insertions(+), 23 deletions(-)
create mode 100644 fs/guestmemfs/serialise.c
diff --git a/fs/guestmemfs/Makefile b/fs/guestmemfs/Makefile
index e93e43ba274b..8b95cac34564 100644
--- a/fs/guestmemfs/Makefile
+++ b/fs/guestmemfs/Makefile
@@ -4,3 +4,5 @@
#
obj-y += guestmemfs.o inode.o dir.o allocator.o file.o
+
+obj-$(CONFIG_KEXEC_KHO) += serialise.o
diff --git a/fs/guestmemfs/guestmemfs.c b/fs/guestmemfs/guestmemfs.c
index 38f20ad25286..cf47e5100504 100644
--- a/fs/guestmemfs/guestmemfs.c
+++ b/fs/guestmemfs/guestmemfs.c
@@ -3,6 +3,7 @@
#include "guestmemfs.h"
#include <linux/dcache.h>
#include <linux/fs.h>
+#include <linux/kexec.h>
#include <linux/module.h>
#include <linux/fs_context.h>
#include <linux/io.h>
@@ -10,7 +11,7 @@
#include <linux/statfs.h>
phys_addr_t guestmemfs_base, guestmemfs_size;
-struct guestmemfs_sb *psb;
+struct super_block *guestmemfs_sb;
static int statfs(struct dentry *root, struct kstatfs *buf)
{
@@ -33,26 +34,39 @@ static int guestmemfs_fill_super(struct super_block *sb, struct fs_context *fc)
struct inode *inode;
struct dentry *dentry;
- psb = kzalloc(sizeof(*psb), GFP_KERNEL);
- psb->inodes = kzalloc(2 << 20, GFP_KERNEL);
- if (!psb->inodes)
- return -ENOMEM;
- psb->allocator_bitmap = kzalloc(1 << 20, GFP_KERNEL);
- if (!psb->allocator_bitmap)
- return -ENOMEM;
-
/*
* Keep a reference to the persistent super block in the
* ephemeral super block.
*/
- sb->s_fs_info = psb;
- spin_lock_init(&psb->allocation_lock);
- guestmemfs_initialise_inode_store(sb);
- guestmemfs_zero_allocations(sb);
- guestmemfs_get_persisted_inode(sb, 1)->flags = GUESTMEMFS_INODE_FLAG_DIR;
- strscpy(guestmemfs_get_persisted_inode(sb, 1)->filename, ".",
- GUESTMEMFS_FILENAME_LEN);
- psb->next_free_ino = 2;
+ sb->s_fs_info = guestmemfs_restore_from_kho();
+
+ if (GUESTMEMFS_PSB(sb)) {
+ pr_info("Restored super block from KHO\n");
+ } else {
+ struct guestmemfs_sb *psb;
+
+ pr_info("Did not restore from KHO - allocating free\n");
+ psb = kzalloc(sizeof(*psb), GFP_KERNEL);
+ psb->inodes = kzalloc(2 << 20, GFP_KERNEL);
+ if (!psb->inodes)
+ return -ENOMEM;
+ psb->allocator_bitmap = kzalloc(1 << 20, GFP_KERNEL);
+ if (!psb->allocator_bitmap)
+ return -ENOMEM;
+ sb->s_fs_info = psb;
+ spin_lock_init(&psb->allocation_lock);
+ guestmemfs_initialise_inode_store(sb);
+ guestmemfs_zero_allocations(sb);
+ guestmemfs_get_persisted_inode(sb, 1)->flags = GUESTMEMFS_INODE_FLAG_DIR;
+ strscpy(guestmemfs_get_persisted_inode(sb, 1)->filename, ".",
+ GUESTMEMFS_FILENAME_LEN);
+ GUESTMEMFS_PSB(sb)->next_free_ino = 2;
+ }
+ /*
+ * Keep a reference to this sb; the serialise callback needs it
+ * and has no other way to get it.
+ */
+ guestmemfs_sb = sb;
sb->s_op = &guestmemfs_super_ops;
@@ -98,11 +112,18 @@ static struct file_system_type guestmemfs_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
+
+static struct notifier_block trace_kho_nb = {
+ .notifier_call = guestmemfs_serialise_to_kho,
+};
+
static int __init guestmemfs_init(void)
{
int ret;
ret = register_filesystem(&guestmemfs_fs_type);
+ if (IS_ENABLED(CONFIG_FTRACE_KHO))
+ register_kho_notifier(&trace_kho_nb);
return ret;
}
@@ -120,13 +141,18 @@ early_param("guestmemfs", parse_guestmemfs_extents);
void __init guestmemfs_reserve_mem(void)
{
- guestmemfs_base = memblock_phys_alloc(guestmemfs_size, 4 << 10);
- if (guestmemfs_base) {
- memblock_reserved_mark_noinit(guestmemfs_base, guestmemfs_size);
- memblock_mark_nomap(guestmemfs_base, guestmemfs_size);
- } else {
- pr_warn("Failed to alloc %llu bytes for guestmemfs\n", guestmemfs_size);
+ if (guestmemfs_size) {
+ guestmemfs_base = memblock_phys_alloc(guestmemfs_size, 4 << 10);
+
+ if (guestmemfs_base) {
+ memblock_reserved_mark_noinit(guestmemfs_base, guestmemfs_size);
+ memblock_mark_nomap(guestmemfs_base, guestmemfs_size);
+ pr_debug("guestmemfs reserved base=%llu from memblocks\n", guestmemfs_base);
+ } else {
+ pr_warn("Failed to alloc %llu bytes for guestmemfs\n", guestmemfs_size);
+ }
}
+
}
MODULE_ALIAS_FS("guestmemfs");
diff --git a/fs/guestmemfs/guestmemfs.h b/fs/guestmemfs/guestmemfs.h
index 0f2788ce740e..263d995b75ed 100644
--- a/fs/guestmemfs/guestmemfs.h
+++ b/fs/guestmemfs/guestmemfs.h
@@ -10,11 +10,14 @@
/* Units of bytes */
extern phys_addr_t guestmemfs_base, guestmemfs_size;
+extern struct super_block *guestmemfs_sb;
struct guestmemfs_sb {
/* Inode number */
unsigned long next_free_ino;
unsigned long allocated_inodes;
+
+ /* Ephemeral fields - must be updated on deserialise */
struct guestmemfs_inode *inodes;
void *allocator_bitmap;
spinlock_t allocation_lock;
@@ -46,6 +49,11 @@ long guestmemfs_alloc_block(struct super_block *sb);
struct inode *guestmemfs_inode_get(struct super_block *sb, unsigned long ino);
struct guestmemfs_inode *guestmemfs_get_persisted_inode(struct super_block *sb, int ino);
+int guestmemfs_serialise_to_kho(struct notifier_block *self,
+ unsigned long cmd,
+ void *v);
+struct guestmemfs_sb *guestmemfs_restore_from_kho(void);
+
extern const struct file_operations guestmemfs_dir_fops;
extern const struct file_operations guestmemfs_file_fops;
extern const struct inode_operations guestmemfs_file_inode_operations;
diff --git a/fs/guestmemfs/serialise.c b/fs/guestmemfs/serialise.c
new file mode 100644
index 000000000000..eb70d496a3eb
--- /dev/null
+++ b/fs/guestmemfs/serialise.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "guestmemfs.h"
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+
+/*
+ * Responsible for serialisation and deserialisation of filesystem metadata
+ * to and from KHO to survive kexec. The deserialisation logic needs to mirror
+ * serialisation, so putting them in the same file.
+ *
+ * The format of the device tree structure is:
+ *
+ * /guestmemfs
+ * compatible = "guestmemfs-v1"
+ * fs_mem {
+ * mem = [ ... ]
+ * };
+ * superblock {
+ * mem = [
+ * persistent super block,
+ * inodes,
+ * allocator_bitmap,
+ * ]
+ * };
+ * mappings_blocks {
+ * mem = [ ... ]
+ * };
+ * // For every mappings_blocks mem entry, which inode it belongs to.
+ * mappings_to_inode {
+ * num_inodes,
+ * mem = [ ... ],
+ * }
+ */
+
+/*
+ * Write the "superblock" node: a single "mem" property holding three
+ * struct kho_mem descriptors, in order: the persistent super block, the
+ * inode store and the allocator bitmap.
+ *
+ * Returns the OR of the libfdt return codes (zero only if every call
+ * succeeded).
+ */
+static int serialise_superblock(struct super_block *sb, void *fdt)
+{
+	struct guestmemfs_sb *persisted = sb->s_fs_info;
+	struct kho_mem regions[3];
+	int rc;
+
+	/* Region 0: the persistent super block structure itself. */
+	regions[0].addr = virt_to_phys(persisted);
+	regions[0].len = sizeof(*persisted);
+
+	/* Region 1: the inode store (allocated as 2 << 20 bytes at mount). */
+	regions[1].addr = virt_to_phys(persisted->inodes);
+	regions[1].len = 2 << 20;
+
+	/* Region 2: the allocator bitmap (allocated as 1 << 20 bytes at mount). */
+	regions[2].addr = virt_to_phys(persisted->allocator_bitmap);
+	regions[2].len = 1 << 20;
+
+	rc = fdt_begin_node(fdt, "superblock");
+	rc |= fdt_property(fdt, "mem", regions, sizeof(regions));
+	rc |= fdt_end_node(fdt);
+
+	return rc;
+}
+
+/*
+ * Serialise the per-file mappings blocks.
+ *
+ * Walks the inode store and, for every inode flagged as a file, records its
+ * mappings block as a struct kho_mem in the "mappings_blocks" node and the
+ * owning inode index at the same position in a one-page table. That table is
+ * itself handed over through the "mappings_to_inode" node, whose "num_inodes"
+ * property holds the number of entries written to both arrays.
+ *
+ * Returns 0 on success, -ENOMEM or -ENOSPC if the tables cannot be allocated
+ * or are too small, otherwise the OR of the libfdt return codes.
+ */
+static int serialise_mappings_blocks(struct super_block *sb, void *fdt)
+{
+	struct kho_mem *mappings_mems;
+	struct kho_mem mappings_to_inode_mem;
+	struct guestmemfs_sb *psb = sb->s_fs_info;
+	int inode_idx;
+	int num_inodes = PMD_SIZE / sizeof(struct guestmemfs_inode);
+	struct guestmemfs_inode *inode;
+	int err = 0;
+	int *mappings_to_inode;
+	/* Number of entries emitted; serialised as "num_inodes". */
+	unsigned long num_mappings = 0;
+	/*
+	 * Both arrays are filled in lock-step, so the smaller of the two
+	 * capacities bounds how many entries may be written.
+	 */
+	unsigned long max_mappings = PAGE_SIZE / sizeof(*mappings_to_inode);
+
+	if (psb->allocated_inodes < max_mappings)
+		max_mappings = psb->allocated_inodes;
+
+	mappings_to_inode = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!mappings_to_inode)
+		return -ENOMEM;
+
+	mappings_mems = kcalloc(psb->allocated_inodes, sizeof(*mappings_mems), GFP_KERNEL);
+	if (!mappings_mems) {
+		kfree(mappings_to_inode);
+		return -ENOMEM;
+	}
+
+	for (inode_idx = 1; inode_idx < num_inodes; ++inode_idx) {
+		inode = guestmemfs_get_persisted_inode(sb, inode_idx);
+		if (!(inode->flags & GUESTMEMFS_INODE_FLAG_FILE))
+			continue;
+
+		/* Refuse to write past the end of either table. */
+		if (num_mappings >= max_mappings) {
+			pr_err("Too many file inodes to serialise (limit %lu)\n",
+			       max_mappings);
+			kfree(mappings_mems);
+			kfree(mappings_to_inode);
+			return -ENOSPC;
+		}
+
+		mappings_mems[num_mappings].addr = virt_to_phys(inode->mappings);
+		mappings_mems[num_mappings].len = PAGE_SIZE;
+		mappings_to_inode[num_mappings] = inode_idx;
+		num_mappings++;
+	}
+
+	err |= fdt_begin_node(fdt, "mappings_blocks");
+	err |= fdt_property(fdt, "mem", mappings_mems,
+			    sizeof(struct kho_mem) * num_mappings);
+	err |= fdt_end_node(fdt);
+
+	/* libfdt copies property data into the blob; the scratch array is done. */
+	kfree(mappings_mems);
+
+	/*
+	 * The index table is not freed here: its physical address is published
+	 * in the FDT so that the next kernel can claim it.
+	 */
+	err |= fdt_begin_node(fdt, "mappings_to_inode");
+	mappings_to_inode_mem.addr = virt_to_phys(mappings_to_inode);
+	mappings_to_inode_mem.len = PAGE_SIZE;
+	err |= fdt_property(fdt, "mem", &mappings_to_inode_mem,
+			    sizeof(mappings_to_inode_mem));
+	/*
+	 * Publish the number of entries actually written; the deserialiser
+	 * iterates over exactly this many slots of both arrays.
+	 */
+	err |= fdt_property(fdt, "num_inodes", &num_mappings,
+			    sizeof(num_mappings));
+
+	err |= fdt_end_node(fdt);
+
+	return err;
+}
+
+/*
+ * KHO notifier callback: serialise guestmemfs metadata into the KHO FDT.
+ *
+ * @self: the registered notifier block (unused).
+ * @cmd:  KEXEC_KHO_DUMP to serialise, KEXEC_KHO_ABORT to roll back.
+ * @v:    the FDT being built.
+ *
+ * Returns NOTIFY_DONE on success or when there is nothing to do, and
+ * NOTIFY_BAD for an unknown command or if any part of the serialisation
+ * failed.
+ */
+int guestmemfs_serialise_to_kho(struct notifier_block *self,
+				unsigned long cmd,
+				void *v)
+{
+	static const char compatible[] = "guestmemfs-v1";
+	struct kho_mem mem;
+	void *fdt = v;
+	int err = 0;
+
+	switch (cmd) {
+	case KEXEC_KHO_ABORT:
+		/* No rollback action needed. */
+		return NOTIFY_DONE;
+	case KEXEC_KHO_DUMP:
+		/* Handled below */
+		break;
+	default:
+		return NOTIFY_BAD;
+	}
+
+	/*
+	 * guestmemfs_sb is only set once the filesystem has been mounted,
+	 * whereas this notifier is registered at init time. Without a super
+	 * block there is no metadata to serialise.
+	 */
+	if (!guestmemfs_sb) {
+		pr_info("guestmemfs not mounted - nothing to serialise via KHO\n");
+		return NOTIFY_DONE;
+	}
+
+	err |= fdt_begin_node(fdt, "guestmemfs");
+	err |= fdt_property(fdt, "compatible", compatible, sizeof(compatible));
+
+	/* The filesystem's physical extent, flagged so it is not initialised. */
+	err |= fdt_begin_node(fdt, "fs_mem");
+	mem.addr = guestmemfs_base | KHO_MEM_ADDR_FLAG_NOINIT;
+	mem.len = guestmemfs_size;
+	err |= fdt_property(fdt, "mem", &mem, sizeof(mem));
+	err |= fdt_end_node(fdt);
+
+	err |= serialise_superblock(guestmemfs_sb, fdt);
+	err |= serialise_mappings_blocks(guestmemfs_sb, fdt);
+
+	err |= fdt_end_node(fdt);
+
+	pr_info("Serialised extends [0x%llx + 0x%llx] via KHO: %i\n",
+		guestmemfs_base, guestmemfs_size, err);
+
+	/* Notifier callbacks report NOTIFY_* codes, not raw error values. */
+	return err ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+/*
+ * Restore the persistent super block from the "superblock" node under
+ * @root_off in @fdt.
+ *
+ * The node's "mem" property must hold exactly three struct kho_mem entries:
+ * the super block, the inode store and the allocator bitmap. The pointer
+ * fields and the lock inside the restored super block are ephemeral and are
+ * re-established here.
+ *
+ * Returns the restored super block, or NULL if the node is missing,
+ * malformed, or its memory could not be claimed.
+ */
+static struct guestmemfs_sb *deserialise_superblock(const void *fdt, int root_off)
+{
+	const struct kho_mem *mem;
+	int mem_len = 0;
+	struct guestmemfs_sb *old_sb;
+	int off;
+
+	off = fdt_subnode_offset(fdt, root_off, "superblock");
+	if (off < 0) {
+		pr_err("No superblock available in KHO\n");
+		return NULL;
+	}
+
+	mem = fdt_getprop(fdt, off, "mem", &mem_len);
+	if (!mem || mem_len != 3 * sizeof(struct kho_mem)) {
+		pr_err("Incorrect mem_len; got %i\n", mem_len);
+		return NULL;
+	}
+
+	old_sb = kho_claim_mem(mem);
+	if (!old_sb) {
+		pr_err("Unable to claim persistent super block\n");
+		return NULL;
+	}
+
+	/* Pointers from the previous kernel are stale; replace them. */
+	old_sb->inodes = kho_claim_mem(mem + 1);
+	old_sb->allocator_bitmap = kho_claim_mem(mem + 2);
+	if (!old_sb->inodes || !old_sb->allocator_bitmap) {
+		pr_err("Unable to claim inode store or allocator bitmap\n");
+		return NULL;
+	}
+
+	/* The lock is ephemeral state too: start from a freshly initialised one. */
+	spin_lock_init(&old_sb->allocation_lock);
+
+	return old_sb;
+}
+
+/*
+ * Re-attach the persisted mappings blocks to their inodes.
+ *
+ * Reads the "mappings_blocks" node (an array of struct kho_mem, one per
+ * persisted mappings block) and the "mappings_to_inode" node (a table giving
+ * the inode index that owns each entry of that array), both under @root_off
+ * in @fdt, and stores each claimed block in the matching inode of @sb.
+ *
+ * Returns 0 on success or -EINVAL if a node or property is missing,
+ * malformed, or inconsistent with the others.
+ */
+static int deserialise_mappings_blocks(const void *fdt, int root_off,
+				       struct guestmemfs_sb *sb)
+{
+	const unsigned long max_ino = PMD_SIZE / sizeof(struct guestmemfs_inode);
+	const struct kho_mem *mappings_blocks_mems;
+	const struct kho_mem *mappings_to_inode_mem;
+	const unsigned long *num_inodes;
+	unsigned long num_blocks;
+	unsigned long mappings_block;
+	int *mappings_to_inode;
+	int blocks_len = 0;
+	int len = 0;
+	int off;
+
+	/*
+	 * Array of struct kho_mem - one for each persisted mappings
+	 * block. Its length determines how many blocks can be restored.
+	 */
+	off = fdt_subnode_offset(fdt, root_off, "mappings_blocks");
+	if (off < 0) {
+		pr_warn("No mappings_blocks available in KHO\n");
+		return -EINVAL;
+	}
+	mappings_blocks_mems = fdt_getprop(fdt, off, "mem", &blocks_len);
+	if (!mappings_blocks_mems || blocks_len < 0 ||
+	    blocks_len % sizeof(*mappings_blocks_mems)) {
+		pr_warn("Invalid mappings_blocks mem len: %i\n", blocks_len);
+		return -EINVAL;
+	}
+	num_blocks = blocks_len / sizeof(*mappings_blocks_mems);
+
+	/*
+	 * Array specifying which inode a specific index into the
+	 * mappings_blocks kho_mem array corresponds to. num_inodes
+	 * is the count recorded by the serialiser.
+	 */
+	off = fdt_subnode_offset(fdt, root_off, "mappings_to_inode");
+	if (off < 0) {
+		pr_warn("No mappings_to_inode available in KHO\n");
+		return -EINVAL;
+	}
+	num_inodes = fdt_getprop(fdt, off, "num_inodes", &len);
+	/* Compare against the size of the value, not of the pointer to it. */
+	if (!num_inodes || len != sizeof(*num_inodes)) {
+		pr_warn("Invalid num_inodes len: %i\n", len);
+		return -EINVAL;
+	}
+	mappings_to_inode_mem = fdt_getprop(fdt, off, "mem", &len);
+	if (!mappings_to_inode_mem || len != sizeof(*mappings_to_inode_mem)) {
+		pr_warn("Invalid mappings_to_inode_mem len: %i\n", len);
+		return -EINVAL;
+	}
+
+	/*
+	 * Only entries present in both arrays can be restored: there must not
+	 * be more blocks than the recorded count, nor more than the index
+	 * table has room for.
+	 */
+	if (num_blocks > *num_inodes ||
+	    num_blocks > mappings_to_inode_mem->len / sizeof(*mappings_to_inode)) {
+		pr_warn("Inconsistent mappings count: %lu blocks, num_inodes %lu\n",
+			num_blocks, *num_inodes);
+		return -EINVAL;
+	}
+
+	mappings_to_inode = kho_claim_mem(mappings_to_inode_mem);
+	if (!mappings_to_inode) {
+		pr_warn("Unable to claim mappings_to_inode table\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Re-assign the mappings blocks to the inodes. Each entry of
+	 * mappings_to_inode specifies which inode the mappings block at the
+	 * same index belongs to.
+	 */
+	for (mappings_block = 0; mappings_block < num_blocks; ++mappings_block) {
+		int inode = mappings_to_inode[mappings_block];
+
+		/* The serialiser only emits inode indexes from 1 upwards. */
+		if (inode < 1 || inode >= max_ino) {
+			pr_warn("Invalid inode index %i for mappings block %lu\n",
+				inode, mappings_block);
+			return -EINVAL;
+		}
+
+		sb->inodes[inode].mappings = kho_claim_mem(&mappings_blocks_mems[mappings_block]);
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the filesystem's physical extent from the "fs_mem" node under
+ * @root_off in @fdt, setting guestmemfs_base (with the KHO address flags
+ * masked off) and guestmemfs_size.
+ *
+ * Returns 0 on success or -EINVAL if the node or a well-formed "mem"
+ * property is missing.
+ */
+static int deserialise_fs_mem(const void *fdt, int root_off)
+{
+	/* Offset into the KHO DT */
+	int off;
+	int len = 0;
+	const struct kho_mem *mem;
+
+	off = fdt_subnode_offset(fdt, root_off, "fs_mem");
+	if (off < 0) {
+		pr_info("No fs_mem available in KHO\n");
+		return -EINVAL;
+	}
+
+	mem = fdt_getprop(fdt, off, "mem", &len);
+	if (!mem || len != sizeof(*mem)) {
+		pr_err("KHO did not contain a guestmemfs base address and size\n");
+		return -EINVAL;
+	}
+
+	guestmemfs_base = mem->addr & ~KHO_MEM_ADDR_FLAG_MASK;
+	guestmemfs_size = mem->len;
+
+	/*
+	 * Nothing here can fail past this point; the previous check of an
+	 * uninitialised error variable has been removed.
+	 */
+	pr_info("Reclaimed [%llx + %llx] via KHO\n", guestmemfs_base, guestmemfs_size);
+	return 0;
+}
+/*
+ * Attempt to restore the persistent super block and all filesystem
+ * metadata from the KHO device tree.
+ *
+ * Looks up the "/guestmemfs" node, checks that it is compatible with
+ * "guestmemfs-v1", then restores the super block, the mappings blocks and
+ * the filesystem memory extent, in that order.
+ *
+ * Returns the restored super block, or NULL if there is no KHO data or any
+ * step fails; the caller then allocates fresh metadata.
+ */
+struct guestmemfs_sb *guestmemfs_restore_from_kho(void)
+{
+	const void *fdt = kho_get_fdt();
+	struct guestmemfs_sb *old_sb;
+	int err;
+	/* Offset into the KHO DT */
+	int off;
+
+	if (!fdt) {
+		pr_err("Unable to get KHO DT after KHO boot?\n");
+		return NULL;
+	}
+
+	off = fdt_path_offset(fdt, "/guestmemfs");
+	pr_info("guestmemfs offset: %i\n", off);
+
+	/*
+	 * fdt_path_offset() reports failure with a negative error code;
+	 * offset 0 is the root node, so "!off" does not detect a missing node.
+	 */
+	if (off < 0) {
+		pr_info("No guestmemfs data available in KHO\n");
+		return NULL;
+	}
+	err = fdt_node_check_compatible(fdt, off, "guestmemfs-v1");
+	if (err) {
+		pr_err("Existing KHO superblock format is not compatible with this kernel\n");
+		return NULL;
+	}
+
+	old_sb = deserialise_superblock(fdt, off);
+	if (!old_sb) {
+		pr_warn("Failed to restore superblock\n");
+		return NULL;
+	}
+
+	err = deserialise_mappings_blocks(fdt, off, old_sb);
+	if (err) {
+		pr_warn("Failed to restore mappings blocks\n");
+		return NULL;
+	}
+
+	err = deserialise_fs_mem(fdt, off);
+	if (err) {
+		pr_warn("Failed to restore filesystem memory extents\n");
+		return NULL;
+	}
+
+	return old_sb;
+}
--
2.34.1
next prev parent reply other threads:[~2024-08-05 9:35 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-05 9:32 [PATCH 00/10] Introduce guestmemfs: persistent in-memory filesystem James Gowans
2024-08-05 9:32 ` [PATCH 01/10] guestmemfs: Introduce filesystem skeleton James Gowans
2024-08-05 10:20 ` Christian Brauner
2024-08-05 9:32 ` [PATCH 02/10] guestmemfs: add inode store, files and dirs James Gowans
2024-08-05 9:32 ` [PATCH 03/10] guestmemfs: add persistent data block allocator James Gowans
2024-08-05 9:32 ` [PATCH 04/10] guestmemfs: support file truncation James Gowans
2024-08-05 9:32 ` [PATCH 05/10] guestmemfs: add file mmap callback James Gowans
2024-10-29 23:05 ` Elliot Berman
2024-10-30 22:18 ` Frank van der Linden
2024-11-01 12:55 ` Gowans, James
2024-10-31 15:30 ` Gowans, James
2024-10-31 16:06 ` Jason Gunthorpe
2024-11-01 13:01 ` Gowans, James
2024-11-01 13:42 ` Jason Gunthorpe
2024-11-02 8:24 ` Gowans, James
2024-11-04 11:11 ` Mike Rapoport
2024-11-04 14:39 ` Jason Gunthorpe
2024-11-04 10:49 ` Mike Rapoport
2024-08-05 9:32 ` [PATCH 06/10] kexec/kho: Add addr flag to not initialise memory James Gowans
2024-08-05 9:32 ` James Gowans [this message]
2024-08-05 9:32 ` [PATCH 08/10] guestmemfs: Block modifications when serialised James Gowans
2024-08-05 9:32 ` [PATCH 09/10] guestmemfs: Add documentation and usage instructions James Gowans
2024-08-05 9:32 ` [PATCH 10/10] MAINTAINERS: Add maintainers for guestmemfs James Gowans
2024-08-05 14:32 ` [PATCH 00/10] Introduce guestmemfs: persistent in-memory filesystem Theodore Ts'o
2024-08-05 14:41 ` Paolo Bonzini
2024-08-05 19:47 ` Gowans, James
2024-08-05 19:53 ` Gowans, James
2024-08-05 20:01 ` Jan Kara
2024-08-05 23:29 ` Jason Gunthorpe
2024-08-06 8:26 ` Gowans, James
2024-08-06 8:12 ` Gowans, James
2024-08-06 13:43 ` David Hildenbrand
2024-08-07 23:45 ` David Matlack
2024-10-17 4:53 ` Vishal Annapurve
2024-11-01 12:53 ` Gowans, James
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240805093245.889357-8-jgowans@amazon.com \
--to=jgowans@amazon.com \
--cc=akpm@linux-foundation.org \
--cc=anthony.yznaga@oracle.com \
--cc=brauner@kernel.org \
--cc=dwmw@amazon.co.uk \
--cc=graf@amazon.com \
--cc=jack@suse.cz \
--cc=jgg@ziepe.ca \
--cc=kvm@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=nsaenz@amazon.es \
--cc=pbonzini@redhat.com \
--cc=pdurrant@amazon.co.uk \
--cc=rppt@kernel.org \
--cc=seanjc@google.com \
--cc=steven.sistare@oracle.com \
--cc=usama.arif@bytedance.com \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.