From: Kirti Wankhede <kwankhede@nvidia.com>
To: <alex.williamson@redhat.com>, <pbonzini@redhat.com>,
<kraxel@redhat.com>, <cjia@nvidia.com>
Cc: <qemu-devel@nongnu.org>, <kvm@vger.kernel.org>,
<kevin.tian@intel.com>, <jike.song@intel.com>,
<bjsdjshi@linux.vnet.ibm.com>, <linux-kernel@vger.kernel.org>,
Kirti Wankhede <kwankhede@nvidia.com>
Subject: [PATCH v11 09/22] vfio iommu type1: Add task structure to vfio_dma
Date: Sat, 5 Nov 2016 02:40:43 +0530
Message-ID: <1478293856-8191-10-git-send-email-kwankhede@nvidia.com>
In-Reply-To: <1478293856-8191-1-git-send-email-kwankhede@nvidia.com>
Add a task structure pointer to vfio_dma.

Add an address space structure. Each vfio_dma structure points to the
address space of the task that mapped it. A list of address spaces is
maintained in the vfio_iommu structure.

On a DMA_MAP call, if the caller's address space already exists in the
address space list, the new vfio_dma points to it. If it doesn't exist,
allocate an address space structure, save the mm pointer in it, and
point the vfio_dma at it.
Two tasks can share the same address space, so the address space
structure has to be kept separate from the task pointer in vfio_dma;
each vfio_dma keeps a pointer to its corresponding address space.
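For reference, the vfio_addr_space lifecycle across map and unmap,
condensed from the diff below (locking and error handling omitted):

	/* DMA_MAP: find or allocate the caller's address space entry */
	mm = get_task_mm(current);
	addr_space = vfio_find_addr_space(iommu, mm);
	if (addr_space) {
		atomic_inc(&addr_space->ref_count);
		mmput(mm);	/* existing entry already holds an mm reference */
	} else {
		addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
		addr_space->mm = mm;	/* keep the reference from get_task_mm() */
		atomic_set(&addr_space->ref_count, 1);
		list_add(&addr_space->next, &iommu->addr_space_list);
	}

	/* vfio_remove_dma(): drop the reference, free on the last unmap */
	if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
		mmput(dma->addr_space->mm);
		put_task_struct(dma->task);
		list_del(&dma->addr_space->next);
		kfree(dma->addr_space);
	}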
During DMA_UNMAP, the task that mapped the range, or any other task
that shares the same address space, is allowed to unmap it; otherwise
the unmap fails.
QEMU, for example, maps a few iova ranges initially, then spawns
threads, and a child thread calls DMA_UNMAP on a previously mapped
iova. Since the thread shares the same address space, the DMA_UNMAP
succeeds.
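For illustration only, a minimal userspace sketch of that scenario;
the container fd setup and the iova/size values are placeholders, not
part of this patch:

	#include <pthread.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int container;	/* VFIO type1 container fd, set up elsewhere */

	static void *unmap_thread(void *arg)
	{
		struct vfio_iommu_type1_dma_unmap unmap;

		memset(&unmap, 0, sizeof(unmap));
		unmap.argsz = sizeof(unmap);
		unmap.iova = 0x100000;	/* iova mapped by the main thread */
		unmap.size = 0x10000;
		/* threads share the mapping task's mm, so the check passes */
		ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
		return NULL;
	}

	static void map_then_unmap_from_thread(void *buf)
	{
		struct vfio_iommu_type1_dma_map map;
		pthread_t thread;

		memset(&map, 0, sizeof(map));
		map.argsz = sizeof(map);
		map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
		map.vaddr = (__u64)(unsigned long)buf;
		map.iova = 0x100000;
		map.size = 0x10000;
		ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

		pthread_create(&thread, NULL, unmap_thread, NULL);
		pthread_join(thread, NULL);
	}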
Later changes use this address space structure to track pages pinned
by an external user.
Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I7600f1bea6b384fd589fa72421ccf031bcfd9ac5
---
drivers/vfio/vfio_iommu_type1.c | 182 +++++++++++++++++++++++++++++-----------
1 file changed, 134 insertions(+), 48 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 422c8d198abb..8d64528dcc22 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -55,12 +55,20 @@ MODULE_PARM_DESC(disable_hugepages,
struct vfio_iommu {
struct list_head domain_list;
+ struct list_head addr_space_list;
struct mutex lock;
struct rb_root dma_list;
bool v2;
bool nesting;
};
+/* address space */
+struct vfio_addr_space {
+ struct mm_struct *mm;
+ struct list_head next;
+ atomic_t ref_count;
+};
+
struct vfio_domain {
struct iommu_domain *domain;
struct list_head next;
@@ -75,6 +83,9 @@ struct vfio_dma {
unsigned long vaddr; /* Process virtual addr */
size_t size; /* Map size (bytes) */
int prot; /* IOMMU_READ/WRITE */
+ struct vfio_addr_space *addr_space;
+ struct task_struct *task;
+ bool mlock_cap;
};
struct vfio_group {
@@ -130,6 +141,18 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
rb_erase(&old->node, &iommu->dma_list);
}
+static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
+ struct mm_struct *mm)
+{
+ struct vfio_addr_space *as;
+
+ list_for_each_entry(as, &iommu->addr_space_list, next) {
+ if (as->mm == mm)
+ return as;
+ }
+ return NULL;
+}
+
struct vwork {
struct mm_struct *mm;
long npage;
@@ -273,24 +296,24 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
* the iommu can only map chunks of consecutive pfns anyway, so get the
* first page and all consecutive pages with the same locking.
*/
-static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
- int prot, unsigned long *pfn_base)
+static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
+ long npage, int prot,
+ unsigned long *pfn_base)
{
- unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- bool lock_cap = capable(CAP_IPC_LOCK);
+ struct task_struct *task = dma->task;
+ unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ bool lock_cap = dma->mlock_cap;
+ struct mm_struct *mm = dma->addr_space->mm;
long ret, i;
bool rsvd;
- if (!current->mm)
- return -ENODEV;
-
- ret = vaddr_get_pfn(current->mm, vaddr, prot, pfn_base);
+ ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
if (ret)
return ret;
rsvd = is_invalid_reserved_pfn(*pfn_base);
- if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
+ if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
put_pfn(*pfn_base, prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
@@ -299,7 +322,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
if (unlikely(disable_hugepages)) {
if (!rsvd)
- vfio_lock_acct(current->mm, 1);
+ vfio_lock_acct(mm, 1);
return 1;
}
@@ -307,7 +330,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
unsigned long pfn = 0;
- ret = vaddr_get_pfn(current->mm, vaddr, prot, &pfn);
+ ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
if (ret)
break;
@@ -318,7 +341,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
}
if (!rsvd && !lock_cap &&
- current->mm->locked_vm + i + 1 > limit) {
+ mm->locked_vm + i + 1 > limit) {
put_pfn(pfn, prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
@@ -327,13 +350,13 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
}
if (!rsvd)
- vfio_lock_acct(current->mm, i);
+ vfio_lock_acct(mm, i);
return i;
}
-static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
- int prot, bool do_accounting)
+static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
+ long npage, int prot, bool do_accounting)
{
unsigned long unlocked = 0;
long i;
@@ -342,7 +365,7 @@ static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
unlocked += put_pfn(pfn++, prot);
if (do_accounting)
- vfio_lock_acct(current->mm, -unlocked);
+ vfio_lock_acct(dma->addr_space->mm, -unlocked);
return unlocked;
}
@@ -396,7 +419,7 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
if (WARN_ON(!unmapped))
break;
- unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
+ unlocked += __vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
unmapped >> PAGE_SHIFT,
dma->prot, false);
iova += unmapped;
@@ -404,13 +427,20 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
cond_resched();
}
- vfio_lock_acct(current->mm, -unlocked);
+ vfio_lock_acct(dma->addr_space->mm, -unlocked);
}
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
vfio_unmap_unpin(iommu, dma);
vfio_unlink_dma(iommu, dma);
+
+ if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
+ mmput(dma->addr_space->mm);
+ put_task_struct(dma->task);
+ list_del(&dma->addr_space->next);
+ kfree(dma->addr_space);
+ }
kfree(dma);
}
@@ -506,6 +536,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
if (!iommu->v2 && unmap->iova > dma->iova)
break;
+ /*
+ * Task with same address space who mapped this iova range is
+ * allowed to unmap the iova range.
+ */
+ if (dma->task->mm != current->mm)
+ break;
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
@@ -572,17 +608,58 @@ unwind:
return ret;
}
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+ size_t map_size)
+{
+ dma_addr_t iova = dma->iova;
+ unsigned long vaddr = dma->vaddr;
+ size_t size = map_size;
+ long npage;
+ unsigned long pfn;
+ int ret = 0;
+
+ while (size) {
+ /* Pin a contiguous chunk of memory */
+ npage = __vfio_pin_pages_remote(dma, vaddr + dma->size,
+ size >> PAGE_SHIFT, dma->prot,
+ &pfn);
+ if (npage <= 0) {
+ WARN_ON(!npage);
+ ret = (int)npage;
+ break;
+ }
+
+ /* Map it! */
+ ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
+ dma->prot);
+ if (ret) {
+ __vfio_unpin_pages_remote(dma, pfn, npage, dma->prot,
+ true);
+ break;
+ }
+
+ size -= npage << PAGE_SHIFT;
+ dma->size += npage << PAGE_SHIFT;
+ }
+
+ if (ret)
+ vfio_remove_dma(iommu, dma);
+
+ return ret;
+}
+
static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
dma_addr_t iova = map->iova;
unsigned long vaddr = map->vaddr;
size_t size = map->size;
- long npage;
int ret = 0, prot = 0;
uint64_t mask;
struct vfio_dma *dma;
- unsigned long pfn;
+ struct vfio_addr_space *addr_space;
+ struct mm_struct *mm;
+ bool free_addr_space_on_err = false;
/* Verify that none of our __u64 fields overflow */
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
mutex_lock(&iommu->lock);
if (vfio_find_dma(iommu, iova, size)) {
- mutex_unlock(&iommu->lock);
- return -EEXIST;
+ ret = -EEXIST;
+ goto do_map_err;
+ }
+
+ mm = get_task_mm(current);
+ if (!mm) {
+ ret = -ENODEV;
+ goto do_map_err;
+ }
+
+ addr_space = vfio_find_addr_space(iommu, mm);
+ if (addr_space) {
+ atomic_inc(&addr_space->ref_count);
+ mmput(mm);
+ } else {
+ addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
+ if (!addr_space) {
+ ret = -ENOMEM;
+ goto do_map_err;
+ }
+ addr_space->mm = mm;
+ atomic_set(&addr_space->ref_count, 1);
+ list_add(&addr_space->next, &iommu->addr_space_list);
+ free_addr_space_on_err = true;
}
dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
- mutex_unlock(&iommu->lock);
- return -ENOMEM;
+ if (free_addr_space_on_err) {
+ mmput(mm);
+ list_del(&addr_space->next);
+ kfree(addr_space);
+ }
+ ret = -ENOMEM;
+ goto do_map_err;
}
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
+ dma->addr_space = addr_space;
+ get_task_struct(current);
+ dma->task = current;
+ dma->mlock_cap = capable(CAP_IPC_LOCK);
/* Insert zero-sized and grow as we map chunks of it */
vfio_link_dma(iommu, dma);
- while (size) {
- /* Pin a contiguous chunk of memory */
- npage = __vfio_pin_pages_remote(vaddr + dma->size,
- size >> PAGE_SHIFT, prot, &pfn);
- if (npage <= 0) {
- WARN_ON(!npage);
- ret = (int)npage;
- break;
- }
-
- /* Map it! */
- ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
- if (ret) {
- __vfio_unpin_pages_remote(pfn, npage, prot, true);
- break;
- }
-
- size -= npage << PAGE_SHIFT;
- dma->size += npage << PAGE_SHIFT;
- }
-
- if (ret)
- vfio_remove_dma(iommu, dma);
-
+ ret = vfio_pin_map_dma(iommu, dma, size);
+do_map_err:
mutex_unlock(&iommu->lock);
return ret;
}
--
2.7.0