From: Logan Gunthorpe <logang@deltatee.com>
To: linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org,
	linux-block@vger.kernel.org, linux-pci@vger.kernel.org,
	linux-mm@kvack.org
Cc: "Christoph Hellwig" <hch@lst.de>,
	"Greg Kroah-Hartman" <gregkh@linuxfoundation.org>,
	"Dan Williams" <dan.j.williams@intel.com>,
	"Jason Gunthorpe" <jgg@ziepe.ca>,
	"Christian König" <christian.koenig@amd.com>,
	"John Hubbard" <jhubbard@nvidia.com>,
	"Don Dutile" <ddutile@redhat.com>,
	"Matthew Wilcox" <willy@infradead.org>,
	"Daniel Vetter" <daniel.vetter@ffwll.ch>,
	"Minturn Dave B" <dave.b.minturn@intel.com>,
	"Jason Ekstrand" <jason@jlekstrand.net>,
	"Dave Hansen" <dave.hansen@linux.intel.com>,
	"Xiong Jianxin" <jianxin.xiong@intel.com>,
	"Bjorn Helgaas" <helgaas@kernel.org>,
	"Ira Weiny" <ira.weiny@intel.com>,
	"Robin Murphy" <robin.murphy@arm.com>,
	"Martin Oliveira" <martin.oliveira@eideticom.com>,
	"Chaitanya Kulkarni" <ckulkarnilinux@gmail.com>,
	"Ralph Campbell" <rcampbell@nvidia.com>,
	"Stephen Bates" <sbates@raithlin.com>,
	"Logan Gunthorpe" <logang@deltatee.com>
Subject: [PATCH v9 7/8] PCI/P2PDMA: Allow userspace VMA allocations through sysfs
Date: Thu, 25 Aug 2022 09:24:24 -0600	[thread overview]
Message-ID: <20220825152425.6296-8-logang@deltatee.com> (raw)
In-Reply-To: <20220825152425.6296-1-logang@deltatee.com>
Create a sysfs bin attribute called "allocate" under the existing
"p2pmem" group. The only allowable operation on this file is the mmap()
call.
When mmap() is called on this attribute, the kernel allocates a chunk of
memory from the genalloc and inserts the pages into the VMA. The
dev_pagemap .page_free callback will indicate when these pages are no
longer used and they will be put back into the genalloc.
On device unbind, remove the sysfs file before the memremap_pages are
cleaned up. This ensures unmap_mapping_range() is called on the files
inode and no new mappings can be created.
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/pci/p2pdma.c | 124 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 4496a7c5c478..a6ed6bbca214 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(published);
 
+static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *attr, struct vm_area_struct *vma)
+{
+	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+	size_t len = vma->vm_end - vma->vm_start;
+	struct pci_p2pdma *p2pdma;
+	struct percpu_ref *ref;
+	unsigned long vaddr;
+	void *kaddr;
+	int ret;
+
+	/* prevent private mappings from being established */
+	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+		pci_info_ratelimited(pdev,
+				     "%s: fail, attempted private mapping\n",
+				     current->comm);
+		return -EINVAL;
+	}
+
+	if (vma->vm_pgoff) {
+		pci_info_ratelimited(pdev,
+				     "%s: fail, attempted mapping with non-zero offset\n",
+				     current->comm);
+		return -EINVAL;
+	}
+
+	rcu_read_lock();
+	p2pdma = rcu_dereference(pdev->p2pdma);
+	if (!p2pdma) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
+	if (!kaddr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * vm_insert_page() can sleep, so a reference is taken to mapping
+	 * such that rcu_read_unlock() can be done before inserting the
+	 * pages
+	 */
+	if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
+		ret = -ENODEV;
+		goto out_free_mem;
+	}
+	rcu_read_unlock();
+
+	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
+		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
+		if (ret) {
+			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
+			return ret;
+		}
+		percpu_ref_get(ref);
+		put_page(virt_to_page(kaddr));
+		kaddr += PAGE_SIZE;
+		len -= PAGE_SIZE;
+	}
+
+	percpu_ref_put(ref);
+
+	return 0;
+out_free_mem:
+	gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+static struct bin_attribute p2pmem_alloc_attr = {
+	.attr = { .name = "allocate", .mode = 0660 },
+	.mmap = p2pmem_alloc_mmap,
+	/*
+	 * Some places where we want to call mmap (ie. python) will check
+	 * that the file size is greater than the mmap size before allowing
+	 * the mmap to continue. To work around this, just set the size
+	 * to be very large.
+	 */
+	.size = SZ_1T,
+};
+
 static struct attribute *p2pmem_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_available.attr,
@@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = {
 	NULL,
 };
 
+static struct bin_attribute *p2pmem_bin_attrs[] = {
+	&p2pmem_alloc_attr,
+	NULL,
+};
+
 static const struct attribute_group p2pmem_group = {
 	.attrs = p2pmem_attrs,
+	.bin_attrs = p2pmem_bin_attrs,
 	.name = "p2pmem",
 };
 
+static void p2pdma_page_free(struct page *page)
+{
+	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
+	struct percpu_ref *ref;
+
+	gen_pool_free_owner(pgmap->provider->p2pdma->pool,
+			    (uintptr_t)page_to_virt(page), PAGE_SIZE,
+			    (void **)&ref);
+	percpu_ref_put(ref);
+}
+
+static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
+	.page_free = p2pdma_page_free,
+};
+
 static void pci_p2pdma_release(void *data)
 {
 	struct pci_dev *pdev = data;
@@ -152,6 +257,19 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
 	return error;
 }
 
+static void pci_p2pdma_unmap_mappings(void *data)
+{
+	struct pci_dev *pdev = data;
+
+	/*
+	 * Removing the alloc attribute from sysfs will call
+	 * unmap_mapping_range() on the inode, teardown any existing userspace
+	 * mappings and prevent new ones from being created.
+	 */
+	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
+				     p2pmem_group.name);
+}
+
 /**
  * pci_p2pdma_add_resource - add memory for use as p2p memory
  * @pdev: the device to add the memory to
@@ -198,6 +316,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	pgmap->range.end = pgmap->range.start + size - 1;
 	pgmap->nr_range = 1;
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
+	pgmap->ops = &p2pdma_pgmap_ops;
 
 	p2p_pgmap->provider = pdev;
 	p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
@@ -209,6 +328,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 		goto pgmap_free;
 	}
 
+	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
+					 pdev);
+	if (error)
+		goto pages_free;
+
 	p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
 	error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
 			pci_bus_address(pdev, bar) + offset,
-- 
2.30.2
next prev parent reply	other threads:[~2022-08-25 15:24 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-25 15:24 [PATCH v9 0/8] Userspace P2PDMA with O_DIRECT NVMe devices Logan Gunthorpe
2022-08-25 15:24 ` [PATCH v9 1/8] mm: introduce FOLL_PCI_P2PDMA to gate getting PCI P2PDMA pages Logan Gunthorpe
2022-09-05 22:27   ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 2/8] iov_iter: introduce iov_iter_get_pages_[alloc_]flags() Logan Gunthorpe
2022-09-05 14:33   ` Christoph Hellwig
2022-09-05 23:21   ` John Hubbard
2022-09-06 16:52     ` Logan Gunthorpe
2022-08-25 15:24 ` [PATCH v9 3/8] block: add check when merging zone device pages Logan Gunthorpe
2022-09-05 14:34   ` Christoph Hellwig
2022-09-05 23:58   ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 4/8] lib/scatterlist: " Logan Gunthorpe
2022-09-05 14:34   ` Christoph Hellwig
2022-09-06  0:21   ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 5/8] block: set FOLL_PCI_P2PDMA in __bio_iov_iter_get_pages() Logan Gunthorpe
2022-09-05 14:36   ` Christoph Hellwig
2022-09-06  0:48   ` John Hubbard
2022-08-25 15:24 ` [PATCH v9 6/8] block: set FOLL_PCI_P2PDMA in bio_map_user_iov() Logan Gunthorpe
2022-09-05 14:36   ` Christoph Hellwig
2022-09-06  0:54   ` John Hubbard
2022-08-25 15:24 ` Logan Gunthorpe [this message]
2022-09-01 16:20   ` [PATCH v9 7/8] PCI/P2PDMA: Allow userspace VMA allocations through sysfs Greg Kroah-Hartman
2022-09-01 16:32     ` Logan Gunthorpe
2022-09-01 16:42       ` Greg Kroah-Hartman
2022-09-01 18:14         ` Logan Gunthorpe
2022-09-01 18:36           ` Greg Kroah-Hartman
2022-09-01 19:16             ` Logan Gunthorpe
2022-09-02  5:53               ` Greg Kroah-Hartman
2022-09-02 18:46                 ` Logan Gunthorpe
2022-09-20  6:46                   ` Christoph Hellwig
2022-09-22  8:38                     ` Greg Kroah-Hartman
2022-09-22 14:58                       ` Logan Gunthorpe
2022-08-25 15:24 ` [PATCH v9 8/8] ABI: sysfs-bus-pci: add documentation for p2pmem allocate Logan Gunthorpe
2022-09-01 16:18   ` Greg Kroah-Hartman
2022-09-01 16:33     ` Logan Gunthorpe
2022-09-06  1:03   ` John Hubbard
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox
  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):
  git send-email \
    --in-reply-to=20220825152425.6296-8-logang@deltatee.com \
    --to=logang@deltatee.com \
    --cc=christian.koenig@amd.com \
    --cc=ckulkarnilinux@gmail.com \
    --cc=dan.j.williams@intel.com \
    --cc=daniel.vetter@ffwll.ch \
    --cc=dave.b.minturn@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=ddutile@redhat.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=hch@lst.de \
    --cc=helgaas@kernel.org \
    --cc=ira.weiny@intel.com \
    --cc=jason@jlekstrand.net \
    --cc=jgg@ziepe.ca \
    --cc=jhubbard@nvidia.com \
    --cc=jianxin.xiong@intel.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=martin.oliveira@eideticom.com \
    --cc=rcampbell@nvidia.com \
    --cc=robin.murphy@arm.com \
    --cc=sbates@raithlin.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY
  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).