From: James Gowans <jgowans@amazon.com>
To: <linux-kernel@vger.kernel.org>
Cc: Eric Biederman <ebiederm@xmission.com>,
<kexec@lists.infradead.org>, "Joerg Roedel" <joro@8bytes.org>,
Will Deacon <will@kernel.org>, <iommu@lists.linux.dev>,
Alexander Viro <viro@zeniv.linux.org.uk>,
"Christian Brauner" <brauner@kernel.org>,
<linux-fsdevel@vger.kernel.org>,
Paolo Bonzini <pbonzini@redhat.com>,
Sean Christopherson <seanjc@google.com>, <kvm@vger.kernel.org>,
Andrew Morton <akpm@linux-foundation.org>, <linux-mm@kvack.org>,
Alexander Graf <graf@amazon.com>,
David Woodhouse <dwmw@amazon.co.uk>,
"Jan H . Schoenherr" <jschoenh@amazon.de>,
Usama Arif <usama.arif@bytedance.com>,
Anthony Yznaga <anthony.yznaga@oracle.com>,
Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>,
<madvenka@linux.microsoft.com>, <steven.sistare@oracle.com>,
<yuleixzhang@tencent.com>
Subject: [RFC 12/18] pkernfs: Add IOMMU domain pgtables file
Date: Mon, 5 Feb 2024 12:01:57 +0000 [thread overview]
Message-ID: <20240205120203.60312-13-jgowans@amazon.com> (raw)
In-Reply-To: <20240205120203.60312-1-jgowans@amazon.com>
Similar to the IOMMU root pgtables file which was added in a previous
commit, now support a file type for IOMMU domain pgtables in the IOMMU
directory. These domain pgtable files only need to be usable after the
system has booted up, for example by QEMU creating one of these files
and using it to back the IOMMU pgtables for a persistent VM. As such, the
filesystem abstraction can be better maintained here, as the kernel code
doesn't need to reach "behind" the filesystem abstraction like it does
for the root pgtables.
A new inode type is created for domain pgtable files, and the IOMMU
directory gets inode_operation callbacks to support creating and
deleting these files in it.
Note: there is a use-after-free risk here too: if the domain pgtable
file is truncated while it's in use for IOMMU pgtables, then freed memory
could still be mapped into the IOMMU. To mitigate this, there should be a
mechanism to "freeze" the files once they've been given to the IOMMU.
---
fs/pkernfs/inode.c | 9 +++++--
fs/pkernfs/iommu.c | 55 +++++++++++++++++++++++++++++++++++++++--
fs/pkernfs/pkernfs.h | 4 +++
include/linux/pkernfs.h | 1 +
4 files changed, 65 insertions(+), 4 deletions(-)
diff --git a/fs/pkernfs/inode.c b/fs/pkernfs/inode.c
index 1d712e0a82a1..35842cd61002 100644
--- a/fs/pkernfs/inode.c
+++ b/fs/pkernfs/inode.c
@@ -35,7 +35,11 @@ struct inode *pkernfs_inode_get(struct super_block *sb, unsigned long ino)
inode->i_op = &pkernfs_iommu_dir_inode_operations;
inode->i_fop = &pkernfs_dir_fops;
inode->i_mode = S_IFDIR;
- } else if (pkernfs_inode->flags | PKERNFS_INODE_FLAG_IOMMU_ROOT_PGTABLES) {
+ } else if (pkernfs_inode->flags & PKERNFS_INODE_FLAG_IOMMU_ROOT_PGTABLES) {
+ inode->i_fop = &pkernfs_file_fops;
+ inode->i_mode = S_IFREG;
+ } else if (pkernfs_inode->flags & PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES) {
+ inode->i_fop = &pkernfs_file_fops;
inode->i_mode = S_IFREG;
}
@@ -175,6 +179,7 @@ const struct inode_operations pkernfs_dir_inode_operations = {
};
const struct inode_operations pkernfs_iommu_dir_inode_operations = {
+ .create = pkernfs_create_iommu_pgtables,
.lookup = pkernfs_lookup,
+ .unlink = pkernfs_unlink,
};
-
diff --git a/fs/pkernfs/iommu.c b/fs/pkernfs/iommu.c
index 5bce8146d7bb..f14e76013e85 100644
--- a/fs/pkernfs/iommu.c
+++ b/fs/pkernfs/iommu.c
@@ -4,6 +4,27 @@
#include <linux/io.h>
+void pkernfs_alloc_iommu_domain_pgtables(struct file *ppts, struct pkernfs_region *pkernfs_region)
+{
+ struct pkernfs_inode *pkernfs_inode;
+ unsigned long *mappings_block_vaddr;
+ unsigned long inode_idx;
+
+ /*
+ * For a pkernfs region block, the "mappings_block" field is still
+ * just a block index, but that block doesn't actually contain mappings
+ * it contains the pkernfs_region data
+ */
+
+ inode_idx = ppts->f_inode->i_ino;
+ pkernfs_inode = pkernfs_get_persisted_inode(NULL, inode_idx);
+
+ mappings_block_vaddr = (unsigned long *)pkernfs_addr_for_block(NULL,
+ pkernfs_inode->mappings_block);
+ set_bit(0, mappings_block_vaddr);
+ pkernfs_region->vaddr = mappings_block_vaddr;
+ pkernfs_region->paddr = pkernfs_base + (pkernfs_inode->mappings_block * (2 << 20));
+}
void pkernfs_alloc_iommu_root_pgtables(struct pkernfs_region *pkernfs_region)
{
unsigned long *mappings_block_vaddr;
@@ -63,9 +84,8 @@ void pkernfs_alloc_iommu_root_pgtables(struct pkernfs_region *pkernfs_region)
* just a block index, but that block doesn't actually contain mappings
* it contains the pkernfs_region data
*/
-
mappings_block_vaddr = (unsigned long *)pkernfs_addr_for_block(NULL,
- iommu_pgtables->mappings_block);
+ iommu_pgtables->mappings_block);
set_bit(0, mappings_block_vaddr);
pkernfs_region->vaddr = mappings_block_vaddr;
pkernfs_region->paddr = pkernfs_base + (iommu_pgtables->mappings_block * PMD_SIZE);
@@ -88,6 +108,29 @@ void pkernfs_alloc_iommu_root_pgtables(struct pkernfs_region *pkernfs_region)
(iommu_pgtables->mappings_block * PMD_SIZE);
}
+int pkernfs_create_iommu_pgtables(struct mnt_idmap *id, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ unsigned long free_inode;
+ struct pkernfs_inode *pkernfs_inode;
+ struct inode *vfs_inode;
+
+ free_inode = pkernfs_allocate_inode(dir->i_sb);
+ if (free_inode <= 0)
+ return -ENOMEM;
+
+ pkernfs_inode = pkernfs_get_persisted_inode(dir->i_sb, free_inode);
+ pkernfs_inode->sibling_ino = pkernfs_get_persisted_inode(dir->i_sb, dir->i_ino)->child_ino;
+ pkernfs_get_persisted_inode(dir->i_sb, dir->i_ino)->child_ino = free_inode;
+ strscpy(pkernfs_inode->filename, dentry->d_name.name, PKERNFS_FILENAME_LEN);
+ pkernfs_inode->flags = PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES;
+ pkernfs_inode->mappings_block = pkernfs_alloc_block(dir->i_sb);
+ memset(pkernfs_addr_for_block(dir->i_sb, pkernfs_inode->mappings_block), 0, (2 << 20));
+ vfs_inode = pkernfs_inode_get(dir->i_sb, free_inode);
+ d_add(dentry, vfs_inode);
+ return 0;
+}
+
void *pkernfs_region_paddr_to_vaddr(struct pkernfs_region *region, unsigned long paddr)
{
if (WARN_ON(paddr >= region->paddr + region->bytes))
@@ -96,3 +139,11 @@ void *pkernfs_region_paddr_to_vaddr(struct pkernfs_region *region, unsigned long
return NULL;
return region->vaddr + (paddr - region->paddr);
}
+
+bool pkernfs_is_iommu_domain_pgtables(struct file *f)
+{
+ return f &&
+ pkernfs_get_persisted_inode(f->f_inode->i_sb, f->f_inode->i_ino)->flags &
+ PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES;
+}
+
diff --git a/fs/pkernfs/pkernfs.h b/fs/pkernfs/pkernfs.h
index e1b7ae3fe7f1..9bea827f8b40 100644
--- a/fs/pkernfs/pkernfs.h
+++ b/fs/pkernfs/pkernfs.h
@@ -21,6 +21,7 @@ struct pkernfs_sb {
#define PKERNFS_INODE_FLAG_DIR (1 << 1)
#define PKERNFS_INODE_FLAG_IOMMU_DIR (1 << 2)
#define PKERNFS_INODE_FLAG_IOMMU_ROOT_PGTABLES (1 << 3)
+#define PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES (1 << 4)
struct pkernfs_inode {
int flags;
/*
@@ -50,8 +51,11 @@ void *pkernfs_addr_for_block(struct super_block *sb, int block_idx);
unsigned long pkernfs_allocate_inode(struct super_block *sb);
struct pkernfs_inode *pkernfs_get_persisted_inode(struct super_block *sb, int ino);
+int pkernfs_create_iommu_pgtables(struct mnt_idmap *id, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl);
extern const struct file_operations pkernfs_dir_fops;
extern const struct file_operations pkernfs_file_fops;
extern const struct inode_operations pkernfs_file_inode_operations;
extern const struct inode_operations pkernfs_iommu_dir_inode_operations;
+extern const struct inode_operations pkernfs_iommu_domain_pgtables_inode_operations;
diff --git a/include/linux/pkernfs.h b/include/linux/pkernfs.h
index 0110e4784109..4ca923ee0d82 100644
--- a/include/linux/pkernfs.h
+++ b/include/linux/pkernfs.h
@@ -33,4 +33,5 @@ void pkernfs_alloc_page_from_region(struct pkernfs_region *pkernfs_region,
void **vaddr, unsigned long *paddr);
void *pkernfs_region_paddr_to_vaddr(struct pkernfs_region *region, unsigned long paddr);
+bool pkernfs_is_iommu_domain_pgtables(struct file *f);
#endif /* _LINUX_PKERNFS_H */
--
2.40.1
next prev parent reply other threads:[~2024-02-05 12:05 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-02-05 12:01 [RFC 00/18] Pkernfs: Support persistence for live update James Gowans
2024-02-05 12:01 ` [RFC 01/18] pkernfs: Introduce filesystem skeleton James Gowans
2024-02-05 12:01 ` [RFC 02/18] pkernfs: Add persistent inodes hooked into directies James Gowans
2024-02-05 12:01 ` [RFC 03/18] pkernfs: Define an allocator for persistent pages James Gowans
2024-02-05 12:01 ` [RFC 04/18] pkernfs: support file truncation James Gowans
2024-02-05 12:01 ` [RFC 05/18] pkernfs: add file mmap callback James Gowans
2024-02-05 23:34 ` Dave Chinner
2024-02-05 12:01 ` [RFC 06/18] init: Add liveupdate cmdline param James Gowans
2024-02-05 12:01 ` [RFC 07/18] pkernfs: Add file type for IOMMU root pgtables James Gowans
2024-02-05 12:01 ` [RFC 08/18] iommu: Add allocator for pgtables from persistent region James Gowans
2024-02-05 12:01 ` [RFC 09/18] intel-iommu: Use pkernfs for root/context pgtable pages James Gowans
2024-02-05 12:01 ` [RFC 10/18] iommu/intel: zap context table entries on kexec James Gowans
2024-02-05 12:01 ` [RFC 11/18] dma-iommu: Always enable deferred attaches for liveupdate James Gowans
2024-02-05 17:45 ` Jason Gunthorpe
2024-02-05 12:01 ` James Gowans [this message]
2024-02-05 12:01 ` [RFC 13/18] vfio: add ioctl to define persistent pgtables on container James Gowans
2024-02-05 17:08 ` Jason Gunthorpe
2024-02-05 12:01 ` [RFC 14/18] intel-iommu: Allocate domain pgtable pages from pkernfs James Gowans
2024-02-05 17:12 ` Jason Gunthorpe
2024-02-05 12:02 ` [RFC 15/18] pkernfs: register device memory for IOMMU domain pgtables James Gowans
2024-02-05 12:02 ` [RFC 16/18] vfio: support not mapping IOMMU pgtables on live-update James Gowans
2024-02-05 12:02 ` [RFC 17/18] pci: Don't clear bus master is persistence enabled James Gowans
2024-02-05 12:02 ` [RFC 18/18] vfio-pci: Assume device working after liveupdate James Gowans
2024-02-05 17:10 ` [RFC 00/18] Pkernfs: Support persistence for live update Alex Williamson
2024-02-07 14:56 ` Gowans, James
2024-02-07 15:28 ` Jason Gunthorpe
2024-02-05 17:42 ` Jason Gunthorpe
2024-02-07 14:45 ` Gowans, James
2024-02-07 15:22 ` Jason Gunthorpe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240205120203.60312-13-jgowans@amazon.com \
--to=jgowans@amazon.com \
--cc=akpm@linux-foundation.org \
--cc=anthony.yznaga@oracle.com \
--cc=brauner@kernel.org \
--cc=dwmw@amazon.co.uk \
--cc=ebiederm@xmission.com \
--cc=graf@amazon.com \
--cc=iommu@lists.linux.dev \
--cc=joro@8bytes.org \
--cc=jschoenh@amazon.de \
--cc=kexec@lists.infradead.org \
--cc=kvm@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=madvenka@linux.microsoft.com \
--cc=pbonzini@redhat.com \
--cc=seanjc@google.com \
--cc=skinsburskii@linux.microsoft.com \
--cc=steven.sistare@oracle.com \
--cc=usama.arif@bytedance.com \
--cc=viro@zeniv.linux.org.uk \
--cc=will@kernel.org \
--cc=yuleixzhang@tencent.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox