From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>,
Gavin Shan <gwshan@linux.vnet.ibm.com>,
Alexander Graf <agraf@suse.de>,
Alex Williamson <alex.williamson@redhat.com>,
Alexander Gordeev <agordeev@redhat.com>,
Paul Mackerras <paulus@samba.org>,
linux-kernel@vger.kernel.org
Subject: [PATCH v3 14/24] vfio: powerpc/spapr: Register memory
Date: Thu, 29 Jan 2015 20:21:55 +1100 [thread overview]
Message-ID: <1422523325-1389-15-git-send-email-aik@ozlabs.ru> (raw)
In-Reply-To: <1422523325-1389-1-git-send-email-aik@ozlabs.ru>
The existing implementation accounts the whole DMA window in
the locked_vm counter which is going to be even worse with multiple
containers and huge DMA windows.
This introduces 2 ioctls to register/unregister DMA memory. They
receive the user space address and size of the memory region which
needs to be pinned/unpinned and counted in locked_vm.
If any memory region was registered, all subsequent DMA map requests
must address already pinned memory. If no memory was registered,
then the amount of memory required for a single default DMA window will be
accounted when the container is enabled and every map/unmap will pin/unpin
a page.
Dynamic DMA window and in-kernel acceleration will require memory to
be registered in order to work.
The accounting is done per VFIO container. When the support of
multiple groups per container is added, we will have accurate locked_vm
accounting.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
drivers/vfio/vfio_iommu_spapr_tce.c | 333 ++++++++++++++++++++++++++++++++----
include/uapi/linux/vfio.h | 29 ++++
2 files changed, 331 insertions(+), 31 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 8256275..d0987ae 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -86,8 +86,169 @@ struct tce_container {
struct mutex lock;
struct iommu_group *grp;
bool enabled;
+ struct list_head mem_list;
};
+struct tce_memory { /* One registered (pinned) chunk of user memory */
+ struct list_head next; /* Link in tce_container::mem_list */
+ struct rcu_head rcu; /* Used to free the chunk via call_rcu_sched() */
+ __u64 vaddr; /* User space start address of the chunk */
+ __u64 size; /* Chunk size in bytes */
+ __u64 pfns[]; /* One PFN per system page; 0 means "not pinned" */
+};
+
+/*
+ * Releases the page references recorded in mem->pfns[] for the first
+ * @size bytes of the chunk and clears the corresponding slots.
+ * Slots holding 0 are treated as "nothing pinned" and skipped.
+ * @container and @vaddr are not used by the body.
+ */
+static void tce_unpin_pages(struct tce_container *container,
+		struct tce_memory *mem, __u64 vaddr, __u64 size)
+{
+	__u64 offset = 0;
+
+	while (offset < size) {
+		__u64 *slot = &mem->pfns[offset >> PAGE_SHIFT];
+		struct page *pg;
+
+		offset += PAGE_SIZE;
+
+		if (!*slot)
+			continue;
+
+		pg = pfn_to_page(*slot);
+		if (pg) {
+			put_page(pg);
+			*slot = 0;
+		}
+	}
+}
+
+/* RCU callback: frees the tce_memory that embeds @head as its rcu member */
+static void release_tce_memory(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tce_memory, rcu));
+}
+
+/*
+ * Drops one registered chunk: unpins its pages, gives its pages back to
+ * the locked_vm accounting, unlinks it from the container list and frees
+ * it from an RCU callback.
+ *
+ * NOTE(review): the list readers in this file use rcu_read_lock() while
+ * the free is deferred with call_rcu_sched(); confirm the two RCU
+ * flavours are meant to be paired here.
+ */
+static void tce_do_unregister_pages(struct tce_container *container,
+		struct tce_memory *mem)
+{
+	tce_unpin_pages(container, mem, mem->vaddr, mem->size);
+	/*
+	 * mem->size is in bytes whereas registration accounted
+	 * size >> PAGE_SHIFT pages, so convert before decrementing.
+	 */
+	decrement_locked_vm(mem->size >> PAGE_SHIFT);
+	list_del_rcu(&mem->next);
+	call_rcu_sched(&mem->rcu, release_tce_memory);
+}
+
+/*
+ * Unregisters the chunk whose start address and size match @vaddr and
+ * @size exactly.
+ *
+ * Returns 0 on success, -EBUSY if the container is enabled, -EINVAL if
+ * @vaddr or @size is not page aligned, -ENOENT if no such chunk exists.
+ */
+static long tce_unregister_pages(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	struct tce_memory *cur, *found = NULL;
+
+	if (container->enabled)
+		return -EBUSY;
+
+	if (((vaddr | size) & ~PAGE_MASK) != 0)
+		return -EINVAL;
+
+	list_for_each_entry(cur, &container->mem_list, next) {
+		if ((cur->vaddr != vaddr) || (cur->size != size))
+			continue;
+
+		found = cur;
+		break;
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	tce_do_unregister_pages(container, found);
+
+	return 0;
+}
+
+/*
+ * Pins, for writing, every system page of [@vaddr, @vaddr + @size) and
+ * records its PFN in mem->pfns[].
+ *
+ * Returns 0 on success. If any page cannot be pinned, the pages pinned
+ * so far are released again and -EFAULT is returned.
+ */
+static long tce_pin_pages(struct tce_container *container,
+		struct tce_memory *mem, __u64 vaddr, __u64 size)
+{
+	__u64 done;
+
+	for (done = 0; done < size; done += PAGE_SIZE) {
+		struct page *pg = NULL;
+		int got = get_user_pages_fast(vaddr + done, 1 /* pages */,
+				1 /* iswrite */, &pg);
+
+		if (got == 1) {
+			mem->pfns[done >> PAGE_SHIFT] = page_to_pfn(pg);
+			continue;
+		}
+
+		/* Roll back everything pinned before the failing page */
+		tce_unpin_pages(container, mem, vaddr, done);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * Registers [@vaddr, @vaddr + @size) with the container: accounts the
+ * pages in locked_vm, pins them and adds the chunk to
+ * container->mem_list.
+ *
+ * The list is modified here, so the caller must serialize against other
+ * updaters; the ioctl handler in this patch calls this with
+ * container->lock held. No rcu_read_lock() is taken because both
+ * kzalloc(GFP_KERNEL) and page pinning may sleep.
+ *
+ * Returns 0 on success, -EBUSY if the container is enabled or the range
+ * overlaps a registered chunk, -EINVAL for an empty, unaligned or
+ * wrapping range, -ENOMEM if the chunk cannot be allocated, or the error
+ * from try_increment_locked_vm()/tce_pin_pages().
+ */
+static long tce_register_pages(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	long ret;
+	struct tce_memory *mem;
+	__u64 npages = size >> PAGE_SHIFT;
+
+	if (container->enabled)
+		return -EBUSY;
+
+	/*
+	 * An empty chunk would pin nothing yet still make the container
+	 * look preregistered, so reject it along with bad alignment and
+	 * address wrap-around.
+	 */
+	if (!size || (vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
+			((vaddr + size) < vaddr))
+		return -EINVAL;
+
+	/* Any overlap with registered chunks? */
+	list_for_each_entry(mem, &container->mem_list, next) {
+		if ((mem->vaddr < (vaddr + size)) &&
+				(vaddr < (mem->vaddr + mem->size)))
+			return -EBUSY;
+	}
+
+	/* Nothing has been accounted yet, so fail without decrementing */
+	ret = try_increment_locked_vm(npages);
+	if (ret)
+		return ret;
+
+	mem = kzalloc(sizeof(*mem) + npages * sizeof(mem->pfns[0]),
+			GFP_KERNEL);
+	if (!mem) {
+		ret = -ENOMEM;
+		goto unaccount_exit;
+	}
+
+	ret = tce_pin_pages(container, mem, vaddr, size);
+	if (ret)
+		goto free_exit;
+
+	mem->vaddr = vaddr;
+	mem->size = size;
+
+	list_add_rcu(&mem->next, &container->mem_list);
+
+	return 0;
+
+free_exit:
+	kfree(mem);
+
+unaccount_exit:
+	decrement_locked_vm(npages);
+
+	return ret;
+}
+
+/* Tells whether at least one memory chunk is registered with @container */
+static inline bool tce_preregistered(struct tce_container *container)
+{
+	if (list_empty(&container->mem_list))
+		return false;
+
+	return true;
+}
+
+/*
+ * Checks whether [@vaddr, @vaddr + @size) lies entirely inside a single
+ * registered chunk of @container.
+ *
+ * Returns true if such a chunk exists, false otherwise (including when
+ * the requested range wraps around the address space).
+ */
+static bool tce_pinned(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	struct tce_memory *mem;
+	bool ret = false;
+
+	/*
+	 * A wrapped end address would compare as "small" and could pass
+	 * the containment test below for a range that is not pinned.
+	 */
+	if ((vaddr + size) < vaddr)
+		return false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(mem, &container->mem_list, next) {
+		if ((mem->vaddr <= vaddr) &&
+				(vaddr + size <= mem->vaddr + mem->size)) {
+			ret = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
static bool tce_check_page_size(struct page *page, unsigned page_shift)
{
unsigned shift;
@@ -166,14 +327,16 @@ static int tce_iommu_enable(struct tce_container *container)
* as this information is only available from KVM and VFIO is
* KVM agnostic.
*/
- iommu = iommu_group_get_iommudata(container->grp);
- if (!iommu)
- return -EFAULT;
+ if (!tce_preregistered(container)) {
+ iommu = iommu_group_get_iommudata(container->grp);
+ if (!iommu)
+ return -EFAULT;
- tbl = &iommu->tables[0];
- ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(tbl));
- if (ret)
- return ret;
+ tbl = &iommu->tables[0];
+ ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(tbl));
+ if (ret)
+ return ret;
+ }
container->enabled = true;
@@ -193,12 +356,14 @@ static void tce_iommu_disable(struct tce_container *container)
if (!container->grp || !current->mm)
return;
- iommu = iommu_group_get_iommudata(container->grp);
- if (!iommu)
- return;
+ if (!tce_preregistered(container)) {
+ iommu = iommu_group_get_iommudata(container->grp);
+ if (!iommu)
+ return;
- tbl = &iommu->tables[0];
- decrement_locked_vm(IOMMU_TABLE_PAGES(tbl));
+ tbl = &iommu->tables[0];
+ decrement_locked_vm(IOMMU_TABLE_PAGES(tbl));
+ }
}
static void *tce_iommu_open(unsigned long arg)
@@ -215,6 +380,7 @@ static void *tce_iommu_open(unsigned long arg)
return ERR_PTR(-ENOMEM);
mutex_init(&container->lock);
+ INIT_LIST_HEAD_RCU(&container->mem_list);
return container;
}
@@ -222,6 +388,7 @@ static void *tce_iommu_open(unsigned long arg)
static void tce_iommu_release(void *iommu_data)
{
struct tce_container *container = iommu_data;
+ struct tce_memory *mem, *memtmp;
WARN_ON(container->grp);
tce_iommu_disable(container);
@@ -229,14 +396,19 @@ static void tce_iommu_release(void *iommu_data)
if (container->grp)
tce_iommu_detach_group(iommu_data, container->grp);
+ list_for_each_entry_safe(mem, memtmp, &container->mem_list, next)
+ tce_do_unregister_pages(container, mem);
+
mutex_destroy(&container->lock);
kfree(container);
}
-static void tce_iommu_unuse_page(unsigned long oldtce)
+static void tce_iommu_unuse_page(struct tce_container *container,
+ unsigned long oldtce)
{
struct page *page;
+ bool do_put = !tce_preregistered(container);
if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
return;
@@ -245,7 +417,8 @@ static void tce_iommu_unuse_page(unsigned long oldtce)
if (oldtce & TCE_PCI_WRITE)
SetPageDirty(page);
- put_page(page);
+ if (do_put)
+ put_page(page);
}
static int tce_iommu_clear(struct tce_container *container,
@@ -261,7 +434,7 @@ static int tce_iommu_clear(struct tce_container *container,
if (ret)
continue;
- tce_iommu_unuse_page(oldtce);
+ tce_iommu_unuse_page(container, oldtce);
}
return 0;
@@ -279,42 +452,91 @@ static enum dma_data_direction tce_iommu_direction(unsigned long tce)
return DMA_NONE;
}
+/*
+ * Translates the user address carried in @tce into a kernel virtual
+ * address using the PFNs cached at registration time, without pinning.
+ *
+ * @page_shift is the IOMMU page shift; the backing page must pass
+ * tce_check_page_size() for it.
+ *
+ * Returns the kernel virtual address, or -1 if the address is not inside
+ * any registered chunk or the page size check fails.
+ */
+static unsigned long tce_get_hva_cached(struct tce_container *container,
+		unsigned page_shift, unsigned long tce)
+{
+	struct tce_memory *mem;
+	unsigned long hva = -1;
+
+	/* Strip the permission bits to get the plain user address */
+	tce &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	rcu_read_lock();
+	list_for_each_entry_rcu(mem, &container->mem_list, next) {
+		unsigned long idx, hpa;
+
+		if ((tce < mem->vaddr) || (tce >= (mem->vaddr + mem->size)))
+			continue;
+
+		idx = (tce - mem->vaddr) >> PAGE_SHIFT;
+
+		if (!tce_check_page_size(pfn_to_page(mem->pfns[idx]),
+				page_shift))
+			break;
+
+		/*
+		 * Keep the IOMMU-page-aligned offset inside the system
+		 * page, as tce_get_hva() does; otherwise every IOMMU page
+		 * smaller than PAGE_SIZE would map the start of the page.
+		 */
+		hpa = (mem->pfns[idx] << PAGE_SHIFT) |
+			(tce & ~((1UL << page_shift) - 1) & ~PAGE_MASK);
+		hva = (unsigned long) __va(hpa);
+		break;
+	}
+	rcu_read_unlock();
+
+	return hva;
+}
+
+/*
+ * Pins the user page addressed by @tce and returns the kernel virtual
+ * address of the IOMMU page inside it. The page is pinned for writing
+ * unless the TCE direction is DMA_TO_DEVICE.
+ *
+ * Returns -1 if the page cannot be pinned or fails the page size check
+ * (in which case the reference is dropped again). On success the page
+ * reference is kept. @container is not used by the body.
+ */
+static unsigned long tce_get_hva(struct tce_container *container,
+		unsigned page_shift, unsigned long tce)
+{
+	struct page *pg = NULL;
+	unsigned long subpage_off;
+	int write = tce_iommu_direction(tce) != DMA_TO_DEVICE;
+
+	if (unlikely(get_user_pages_fast(tce & PAGE_MASK, 1, write, &pg) != 1))
+		return -1;
+
+	if (!tce_check_page_size(pg, page_shift)) {
+		put_page(pg);
+		return -1;
+	}
+
+	/* Offset of the IOMMU page within the (possibly larger) system page */
+	subpage_off = tce & ~((1ULL << page_shift) - 1) & ~PAGE_MASK;
+
+	return (unsigned long) page_address(pg) + subpage_off;
+}
+
static long tce_iommu_build(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long tce, unsigned long pages)
{
long i, ret = 0;
- struct page *page = NULL;
unsigned long hva, oldtce;
enum dma_data_direction direction = tce_iommu_direction(tce);
+ bool do_put = false;
for (i = 0; i < pages; ++i) {
- ret = get_user_pages_fast(tce & PAGE_MASK, 1,
- direction != DMA_TO_DEVICE, &page);
- if (unlikely(ret != 1)) {
- ret = -EFAULT;
- break;
+ hva = tce_get_hva_cached(container, tbl->it_page_shift, tce);
+ if (hva == -1) {
+ do_put = true;
+ WARN_ON_ONCE(1);
+ hva = tce_get_hva(container, tbl->it_page_shift, tce);
}
- if (!tce_check_page_size(page, tbl->it_page_shift)) {
- ret = -EFAULT;
- break;
- }
-
- hva = (unsigned long) page_address(page) +
- (tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK);
oldtce = 0;
-
ret = iommu_tce_xchg(tbl, entry + i, hva, &oldtce, direction);
if (ret) {
- put_page(page);
+ if (do_put)
+ put_page(pfn_to_page(__pa(hva) >> PAGE_SHIFT));
pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
__func__, entry << tbl->it_page_shift,
tce, ret);
break;
}
- tce_iommu_unuse_page(oldtce);
+ if (do_put)
+ put_page(pfn_to_page(__pa(hva) >> PAGE_SHIFT));
+
+ tce_iommu_unuse_page(container, oldtce);
+
tce += IOMMU_PAGE_SIZE(tbl);
}
@@ -416,6 +638,11 @@ static long tce_iommu_ioctl(void *iommu_data,
if (ret)
return ret;
+ /* If any memory is pinned, only allow pages from that region */
+ if (tce_preregistered(container) &&
+ !tce_pinned(container, param.vaddr, param.size))
+ return -EPERM;
+
ret = tce_iommu_build(container, tbl,
param.iova >> tbl->it_page_shift,
tce, param.size >> tbl->it_page_shift);
@@ -464,6 +691,50 @@ static long tce_iommu_ioctl(void *iommu_data,
return ret;
}
+	/* Pin and account a chunk of user memory for later DMA mapping */
+	case VFIO_IOMMU_REGISTER_MEMORY: {
+		struct vfio_iommu_type1_register_memory param;
+
+		minsz = offsetofend(struct vfio_iommu_type1_register_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		/* The chunk list is only modified under container->lock */
+		mutex_lock(&container->lock);
+		ret = tce_register_pages(container, param.vaddr, param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
+	/* Unpin and unaccount a previously registered chunk */
+	case VFIO_IOMMU_UNREGISTER_MEMORY: {
+		struct vfio_iommu_type1_unregister_memory param;
+
+		minsz = offsetofend(struct vfio_iommu_type1_unregister_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		/*
+		 * Report -EBUSY/-EINVAL/-ENOENT from the helper to user
+		 * space instead of claiming success unconditionally.
+		 */
+		mutex_lock(&container->lock);
+		ret = tce_unregister_pages(container, param.vaddr, param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
case VFIO_IOMMU_ENABLE:
mutex_lock(&container->lock);
ret = tce_iommu_enable(container);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 29715d2..2bb0c9b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -437,6 +437,35 @@ struct vfio_iommu_type1_dma_unmap {
#define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15)
#define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
+/**
+ * VFIO_IOMMU_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_type1_register_memory)
+ *
+ * Registers user space memory where DMA is allowed. It pins
+ * user pages and does the locked memory accounting so
+ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
+ * get simpler.
+ *
+ * In the sPAPR TCE implementation added by this patch, vaddr and size
+ * must be aligned to the system page size, the range must not overlap
+ * an already registered one and the container must not be enabled.
+ */
+struct vfio_iommu_type1_register_memory {
+ __u32 argsz; /* Must cover the structure up to and including size */
+ __u32 flags; /* No flags are defined; must be 0 */
+ __u64 vaddr; /* Process virtual address */
+ __u64 size; /* Size of mapping (bytes) */
+};
+#define VFIO_IOMMU_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/**
+ * VFIO_IOMMU_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_type1_unregister_memory)
+ *
+ * Unregisters user space memory registered with VFIO_IOMMU_REGISTER_MEMORY.
+ *
+ * In the sPAPR TCE implementation added by this patch, vaddr and size
+ * must be page aligned and must match a registered region exactly, and
+ * the container must not be enabled.
+ */
+struct vfio_iommu_type1_unregister_memory {
+ __u32 argsz; /* Must cover the structure up to and including size */
+ __u32 flags; /* No flags are defined; must be 0 */
+ __u64 vaddr; /* Process virtual address */
+ __u64 size; /* Size of mapping (bytes) */
+};
+#define VFIO_IOMMU_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
+
/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
/*
--
2.0.0
next prev parent reply other threads:[~2015-01-29 9:22 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-01-29 9:21 [PATCH v3 00/24] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 01/24] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 02/24] vfio: powerpc/iommu: Check that TCE page size is equal to it_page_size Alexey Kardashevskiy
2015-02-02 21:45 ` Alex Williamson
2015-01-29 9:21 ` [PATCH v3 03/24] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 04/24] vfio: powerpc/spapr: Use it_page_size Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 05/24] vfio: powerpc/spapr: Move locked_vm accounting to helpers Alexey Kardashevskiy
2015-02-03 0:12 ` Alex Williamson
2015-01-29 9:21 ` [PATCH v3 06/24] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 07/24] powerpc/iommu: Introduce iommu_table_alloc() helper Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 08/24] powerpc/spapr: vfio: Switch from iommu_table to new powerpc_iommu Alexey Kardashevskiy
2015-02-03 0:12 ` Alex Williamson
2015-02-04 13:32 ` Alexander Graf
2015-02-05 4:58 ` Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 09/24] powerpc/iommu: Fix IOMMU ownership control functions Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 10/24] powerpc/powernv/ioda2: Rework IOMMU ownership control Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 11/24] powerpc/powernv/ioda/ioda2: Rework tce_build()/tce_free() Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 12/24] powerpc/iommu/powernv: Release replaced TCE Alexey Kardashevskiy
2015-02-04 6:08 ` Paul Mackerras
2015-02-05 4:57 ` Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 13/24] powerpc/pseries/lpar: Enable VFIO Alexey Kardashevskiy
2015-01-29 9:21 ` Alexey Kardashevskiy [this message]
2015-02-03 0:11 ` [PATCH v3 14/24] vfio: powerpc/spapr: Register memory Alex Williamson
2015-02-03 5:51 ` Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 15/24] poweppc/powernv/ioda2: Rework iommu_table creation Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 16/24] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_create_table Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 17/24] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window Alexey Kardashevskiy
2015-01-29 9:21 ` [PATCH v3 18/24] powerpc/iommu: Split iommu_free_table into 2 helpers Alexey Kardashevskiy
2015-01-29 9:22 ` [PATCH v3 19/24] powerpc/powernv: Implement multilevel TCE tables Alexey Kardashevskiy
2015-01-29 9:22 ` [PATCH v3 20/24] powerpc/powernv: Change prototypes to receive iommu Alexey Kardashevskiy
2015-01-29 9:22 ` [PATCH v3 21/24] powerpc/powernv/ioda: Define and implement DMA table/window management callbacks Alexey Kardashevskiy
2015-01-29 9:22 ` [PATCH v3 22/24] powerpc/iommu: Get rid of ownership helpers Alexey Kardashevskiy
2015-01-29 9:22 ` [PATCH v3 23/24] vfio/spapr: Enable multiple groups in a container Alexey Kardashevskiy
2015-01-29 9:22 ` [PATCH v3 24/24] vfio: powerpc/spapr: Support Dynamic DMA windows Alexey Kardashevskiy
2015-02-03 2:53 ` Alex Williamson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1422523325-1389-15-git-send-email-aik@ozlabs.ru \
--to=aik@ozlabs.ru \
--cc=agordeev@redhat.com \
--cc=agraf@suse.de \
--cc=alex.williamson@redhat.com \
--cc=gwshan@linux.vnet.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=paulus@samba.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).