From: Jike Song <jike.song@intel.com>
To: Kirti Wankhede <kwankhede@nvidia.com>
Cc: alex.williamson@redhat.com, pbonzini@redhat.com,
kraxel@redhat.com, cjia@nvidia.com, qemu-devel@nongnu.org,
kvm@vger.kernel.org, kevin.tian@intel.com,
bjsdjshi@linux.vnet.ibm.com, linux-kernel@vger.kernel.org
Subject: Re: [Qemu-devel] [PATCH v10 09/19] vfio iommu type1: Add support for mediated devices
Date: Wed, 02 Nov 2016 21:29:24 +0800 [thread overview]
Message-ID: <5819EA34.1020607@intel.com> (raw)
In-Reply-To: <1477517366-27871-10-git-send-email-kwankhede@nvidia.com>
On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
>
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
>
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - When iommu_group of mediated devices is attached, task structure is
> cached which is used later to pin pages and page accounting.
> - It keeps track of pinned pages for mediated domain. This data is used to
> verify unpinning request and to unpin remaining pages while detaching, if
> there are any.
> - Used existing mechanism for page accounting. If iommu capable domain
> exist in the container then all pages are already pinned and accounted.
> Accouting for mdev device is only done if there is no iommu capable
> domain in the container.
> - Page accouting is updated on hot plug and unplug mdev device and pass
> through device.
>
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
> exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
> exist
>
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
> drivers/vfio/vfio_iommu_type1.c | 646 +++++++++++++++++++++++++++++++++++-----
> 1 file changed, 571 insertions(+), 75 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 861ac2a1b0c3..5add11a147e1 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
> #include <linux/uaccess.h>
> #include <linux/vfio.h>
> #include <linux/workqueue.h>
> +#include <linux/mdev.h>
>
> #define DRIVER_VERSION "0.2"
> #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
> @@ -55,18 +56,26 @@ MODULE_PARM_DESC(disable_hugepages,
>
> struct vfio_iommu {
> struct list_head domain_list;
> + struct vfio_domain *external_domain; /* domain for external user */
> struct mutex lock;
> struct rb_root dma_list;
> bool v2;
> bool nesting;
> };
>
> +struct external_addr_space {
> + struct task_struct *task;
> + struct rb_root pfn_list; /* pinned Host pfn list */
> + struct mutex pfn_list_lock; /* mutex for pfn_list */
> +};
> +
> struct vfio_domain {
> - struct iommu_domain *domain;
> - struct list_head next;
> - struct list_head group_list;
> - int prot; /* IOMMU_CACHE */
> - bool fgsp; /* Fine-grained super pages */
> + struct iommu_domain *domain;
> + struct list_head next;
> + struct list_head group_list;
> + struct external_addr_space *external_addr_space;
> + int prot; /* IOMMU_CACHE */
> + bool fgsp; /* Fine-grained super pages */
> };
>
> struct vfio_dma {
> @@ -75,6 +84,7 @@ struct vfio_dma {
> unsigned long vaddr; /* Process virtual addr */
> size_t size; /* Map size (bytes) */
> int prot; /* IOMMU_READ/WRITE */
> + bool iommu_mapped;
> };
>
> struct vfio_group {
> @@ -83,6 +93,21 @@ struct vfio_group {
> };
>
> /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> + struct rb_node node;
> + unsigned long vaddr; /* virtual addr */
> + dma_addr_t iova; /* IOVA */
> + unsigned long pfn; /* Host pfn */
> + int prot;
> + atomic_t ref_count;
> +};
> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
> + (!list_empty(&iommu->domain_list))
> +
> +/*
> * This code handles mapping and unmapping of user data buffers
> * into DMA'ble space using the IOMMU
> */
> @@ -130,6 +155,101 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> rb_erase(&old->node, &iommu->dma_list);
> }
>
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> + unsigned long pfn)
> +{
> + struct rb_node *node;
> + struct vfio_pfn *vpfn;
> +
> + node = domain->external_addr_space->pfn_list.rb_node;
> +
> + while (node) {
> + vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> + if (pfn < vpfn->pfn)
> + node = node->rb_left;
> + else if (pfn > vpfn->pfn)
> + node = node->rb_right;
> + else
> + return vpfn;
> + }
> +
> + return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> + struct rb_node **link, *parent = NULL;
> + struct vfio_pfn *vpfn;
> +
> + link = &domain->external_addr_space->pfn_list.rb_node;
> + while (*link) {
> + parent = *link;
> + vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> + if (new->pfn < vpfn->pfn)
> + link = &(*link)->rb_left;
> + else
> + link = &(*link)->rb_right;
> + }
> +
> + rb_link_node(&new->node, parent, link);
> + rb_insert_color(&new->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> + rb_erase(&old->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> + dma_addr_t iova, unsigned long pfn, int prot)
> +{
> + struct vfio_pfn *vpfn;
> +
> + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> + if (!vpfn)
> + return -ENOMEM;
> +
> + vpfn->vaddr = vaddr;
> + vpfn->iova = iova;
> + vpfn->pfn = pfn;
> + vpfn->prot = prot;
> + atomic_set(&vpfn->ref_count, 1);
> + vfio_link_pfn(domain, vpfn);
> + return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> + struct vfio_pfn *vpfn)
> +{
> + vfio_unlink_pfn(domain, vpfn);
> + kfree(vpfn);
> +}
> +
> +static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
> +{
> + struct vfio_pfn *p;
> + struct vfio_domain *domain = iommu->external_domain;
> + int ret = 1;
> +
> + if (!domain)
> + return 1;
> +
> + mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> + p = vfio_find_pfn(domain, pfn);
> + if (p)
> + ret = 0;
> +
> + mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> + return ret;
> +}
> +
> struct vwork {
> struct mm_struct *mm;
> long npage;
> @@ -270,12 +390,13 @@ static int vaddr_get_pfn(struct mm_struct *remote_mm, unsigned long vaddr,
> * the iommu can only map chunks of consecutive pfns anyway, so get the
> * first page and all consecutive pages with the same locking.
> */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> - int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
> + unsigned long vaddr, long npage,
> + int prot, unsigned long *pfn_base)
> {
> unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> bool lock_cap = capable(CAP_IPC_LOCK);
> - long ret, i;
> + long ret, i, lock_acct = 0;
> bool rsvd;
>
> if (!current->mm)
> @@ -285,9 +406,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> if (ret)
> return ret;
>
> + lock_acct = vfio_pfn_account(iommu, *pfn_base);
> +
> rsvd = is_invalid_reserved_pfn(*pfn_base);
>
> - if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> + if (!rsvd && !lock_cap && current->mm->locked_vm + lock_acct > limit) {
> put_pfn(*pfn_base, prot);
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> limit << PAGE_SHIFT);
> @@ -314,8 +437,10 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> break;
> }
>
> + lock_acct += vfio_pfn_account(iommu, pfn);
> +
> if (!rsvd && !lock_cap &&
> - current->mm->locked_vm + i + 1 > limit) {
> + current->mm->locked_vm + lock_acct > limit) {
> put_pfn(pfn, prot);
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> __func__, limit << PAGE_SHIFT);
> @@ -329,14 +454,19 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> return i;
> }
>
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> - int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
> + unsigned long pfn, long npage, int prot,
> + bool do_accounting)
> {
> - unsigned long unlocked = 0;
> + unsigned long unlocked = 0, unlock_acct = 0;
> long i;
>
> - for (i = 0; i < npage; i++)
> + for (i = 0; i < npage; i++) {
> + if (do_accounting)
> + unlock_acct += vfio_pfn_account(iommu, pfn);
> +
> unlocked += put_pfn(pfn++, prot);
> + }
>
> if (do_accounting)
> vfio_lock_acct(current, -unlocked);
> @@ -344,14 +474,208 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
> return unlocked;
> }
>
> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static long __vfio_pin_page_external(struct vfio_domain *domain,
> + unsigned long vaddr, int prot,
> + unsigned long *pfn_base,
> + bool do_accounting)
> +{
> + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> + bool lock_cap = capable(CAP_IPC_LOCK);
> + long ret;
> + bool rsvd;
> + struct task_struct *task = domain->external_addr_space->task;
> +
> + if (!task->mm)
> + return -ENODEV;
> +
> + ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
Hi Kirti,
In your implementation of vaddr_get_pfn, as long as the 1st argument
is not NULL, you will call get_user_pages_remote other than get_user_pages_fast.
But still there is a possibility here @task is actually @current, thereby
task->mm == current->mm. That should not be rare, but probably happen
at most times: the QEMU process emulates MMIO read/write, and call
vfio_pin_pages in its process context.
If I read correctly, current implementation enforces a locked GUP, no
matter which process context it is in.
--
Thanks,
Jike
next prev parent reply other threads:[~2016-11-02 13:33 UTC|newest]
Thread overview: 58+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-10-26 21:29 [Qemu-devel] [PATCH v10 00/19] Add Mediated device support Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 01/19] vfio: Mediated device Core driver Kirti Wankhede
2016-10-29 4:30 ` Jike Song
2016-10-29 10:06 ` Kirti Wankhede
2016-10-29 18:11 ` Jike Song
2016-11-02 7:59 ` Kirti Wankhede
2016-11-02 10:31 ` Jike Song
2016-11-01 3:08 ` Jike Song
2016-11-01 3:44 ` Alex Williamson
2016-11-01 5:28 ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 02/19] vfio: VFIO based driver for Mediated devices Kirti Wankhede
2016-11-02 10:39 ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 03/19] vfio: Rearrange functions to get vfio_group from dev Kirti Wankhede
2016-11-02 10:41 ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 04/19] vfio: Common function to increment container_users Kirti Wankhede
2016-11-02 11:34 ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 05/19] vfio iommu: Added pin and unpin callback functions to vfio_iommu_driver_ops Kirti Wankhede
2016-11-01 8:07 ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 06/19] vfio iommu type1: Update arguments of vfio_lock_acct Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 07/19] vfio iommu type1: Update argument of vaddr_get_pfn() Kirti Wankhede
2016-10-27 12:11 ` Jike Song
2016-10-27 12:24 ` Kirti Wankhede
2016-10-28 6:01 ` Jike Song
2016-11-02 8:06 ` Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 08/19] vfio iommu type1: Add find_iommu_group() function Kirti Wankhede
2016-11-02 14:13 ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 09/19] vfio iommu type1: Add support for mediated devices Kirti Wankhede
2016-10-27 23:01 ` Alex Williamson
2016-11-02 13:29 ` Jike Song [this message]
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 10/19] vfio iommu: Add blocking notifier to notify DMA_UNMAP Kirti Wankhede
2016-10-28 7:33 ` Jike Song
2016-10-28 12:38 ` Kirti Wankhede
2016-10-28 12:40 ` Alex Williamson
2016-10-28 20:02 ` Kirti Wankhede
2016-10-28 20:33 ` Alex Williamson
2016-10-29 10:37 ` Kirti Wankhede
2016-10-29 14:03 ` Alex Williamson
2016-11-01 3:45 ` Dong Jia Shi
2016-11-01 7:47 ` Kirti Wankhede
2016-11-01 8:33 ` Dong Jia Shi
2016-10-31 3:50 ` Jike Song
2016-10-31 5:59 ` Kirti Wankhede
2016-10-31 6:05 ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 11/19] vfio: Introduce common function to add capabilities Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 12/19] vfio_pci: Update vfio_pci to use vfio_info_add_capability() Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 13/19] vfio: Introduce vfio_set_irqs_validate_and_prepare() Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 14/19] vfio_pci: Updated to use vfio_set_irqs_validate_and_prepare() Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 15/19] vfio_platform: " Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 16/19] vfio: Define device_api strings Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 17/19] docs: Add Documentation for Mediated devices Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 18/19] docs: Sysfs ABI for mediated device framework Kirti Wankhede
2016-10-31 7:19 ` Jike Song
2016-11-02 7:55 ` Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 19/19] docs: Sample driver to demonstrate how to use Mediated " Kirti Wankhede
2016-10-27 14:29 ` Jonathan Corbet
2016-11-01 8:32 ` [Qemu-devel] [PATCH v10 00/19] Add Mediated device support Jike Song
2016-11-01 15:24 ` Gerd Hoffmann
2016-11-02 1:01 ` Jike Song
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5819EA34.1020607@intel.com \
--to=jike.song@intel.com \
--cc=alex.williamson@redhat.com \
--cc=bjsdjshi@linux.vnet.ibm.com \
--cc=cjia@nvidia.com \
--cc=kevin.tian@intel.com \
--cc=kraxel@redhat.com \
--cc=kvm@vger.kernel.org \
--cc=kwankhede@nvidia.com \
--cc=linux-kernel@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).