qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Jike Song <jike.song@intel.com>
To: Kirti Wankhede <kwankhede@nvidia.com>
Cc: alex.williamson@redhat.com, pbonzini@redhat.com,
	kraxel@redhat.com, cjia@nvidia.com, qemu-devel@nongnu.org,
	kvm@vger.kernel.org, kevin.tian@intel.com,
	bjsdjshi@linux.vnet.ibm.com, linux-kernel@vger.kernel.org
Subject: Re: [Qemu-devel] [PATCH v10 09/19] vfio iommu type1: Add support for mediated devices
Date: Wed, 02 Nov 2016 21:29:24 +0800	[thread overview]
Message-ID: <5819EA34.1020607@intel.com> (raw)
In-Reply-To: <1477517366-27871-10-git-send-email-kwankhede@nvidia.com>

On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - When iommu_group of mediated devices is attached, task structure is
>   cached which is used later to pin pages and page accounting.
> - It keeps track of pinned pages for mediated domain. This data is used to
>   verify unpinning request and to unpin remaining pages while detaching, if
>   there are any.
> - Used existing mechanism for page accounting. If iommu capable domain
>   exist in the container then all pages are already pinned and accounted.
>   Accounting for mdev device is only done if there is no iommu capable
>   domain in the container.
> - Page accounting is updated on hot plug and unplug mdev device and pass
>   through device.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>   exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>   exist
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
>  drivers/vfio/vfio_iommu_type1.c | 646 +++++++++++++++++++++++++++++++++++-----
>  1 file changed, 571 insertions(+), 75 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 861ac2a1b0c3..5add11a147e1 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
> +#include <linux/mdev.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -55,18 +56,26 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
>  	bool			nesting;
>  };
>  
> +struct external_addr_space {
> +	struct task_struct	*task;
> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
> +};
> +
>  struct vfio_domain {
> -	struct iommu_domain	*domain;
> -	struct list_head	next;
> -	struct list_head	group_list;
> -	int			prot;		/* IOMMU_CACHE */
> -	bool			fgsp;		/* Fine-grained super pages */
> +	struct iommu_domain		*domain;
> +	struct list_head		next;
> +	struct list_head		group_list;
> +	struct external_addr_space	*external_addr_space;
> +	int				prot;	/* IOMMU_CACHE */
> +	bool				fgsp;	/* Fine-grained super pages */
>  };
>  
>  struct vfio_dma {
> @@ -75,6 +84,7 @@ struct vfio_dma {
>  	unsigned long		vaddr;		/* Process virtual addr */
>  	size_t			size;		/* Map size (bytes) */
>  	int			prot;		/* IOMMU_READ/WRITE */
> +	bool			iommu_mapped;
>  };
>  
>  struct vfio_group {
> @@ -83,6 +93,21 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node		node;
> +	unsigned long		vaddr;		/* virtual addr */
> +	dma_addr_t		iova;		/* IOVA */
> +	unsigned long		pfn;		/* Host pfn */
> +	int			prot;
> +	atomic_t		ref_count;
> +};
> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> +					(!list_empty(&iommu->domain_list))
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +155,101 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> +				      unsigned long pfn)
> +{
> +	struct rb_node *node;
> +	struct vfio_pfn *vpfn;
> +
> +	node = domain->external_addr_space->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (pfn < vpfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn > vpfn->pfn)
> +			node = node->rb_right;
> +		else
> +			return vpfn;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	link = &domain->external_addr_space->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->pfn < vpfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> +				dma_addr_t iova, unsigned long pfn, int prot)
> +{
> +	struct vfio_pfn *vpfn;
> +
> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +	if (!vpfn)
> +		return -ENOMEM;
> +
> +	vpfn->vaddr = vaddr;
> +	vpfn->iova = iova;
> +	vpfn->pfn = pfn;
> +	vpfn->prot = prot;
> +	atomic_set(&vpfn->ref_count, 1);
> +	vfio_link_pfn(domain, vpfn);
> +	return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> +				      struct vfio_pfn *vpfn)
> +{
> +	vfio_unlink_pfn(domain, vpfn);
> +	kfree(vpfn);
> +}
> +
> +static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
> +{
> +	struct vfio_pfn *p;
> +	struct vfio_domain *domain = iommu->external_domain;
> +	int ret = 1;
> +
> +	if (!domain)
> +		return 1;
> +
> +	mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> +	p = vfio_find_pfn(domain, pfn);
> +	if (p)
> +		ret = 0;
> +
> +	mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> +	return ret;
> +}
> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -270,12 +390,13 @@ static int vaddr_get_pfn(struct mm_struct *remote_mm, unsigned long vaddr,
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -			   int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
> +				    unsigned long vaddr, long npage,
> +				    int prot, unsigned long *pfn_base)
>  {
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
> -	long ret, i;
> +	long ret, i, lock_acct = 0;
>  	bool rsvd;
>  
>  	if (!current->mm)
> @@ -285,9 +406,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	if (ret)
>  		return ret;
>  
> +	lock_acct = vfio_pfn_account(iommu, *pfn_base);
> +
>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>  
> -	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> +	if (!rsvd && !lock_cap && current->mm->locked_vm + lock_acct > limit) {
>  		put_pfn(*pfn_base, prot);
>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>  			limit << PAGE_SHIFT);
> @@ -314,8 +437,10 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  			break;
>  		}
>  
> +		lock_acct += vfio_pfn_account(iommu, pfn);
> +
>  		if (!rsvd && !lock_cap &&
> -		    current->mm->locked_vm + i + 1 > limit) {
> +		    current->mm->locked_vm + lock_acct > limit) {
>  			put_pfn(pfn, prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
> @@ -329,14 +454,19 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	return i;
>  }
>  
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -			     int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
> +				      unsigned long pfn, long npage, int prot,
> +				      bool do_accounting)
>  {
> -	unsigned long unlocked = 0;
> +	unsigned long unlocked = 0, unlock_acct = 0;
>  	long i;
>  
> -	for (i = 0; i < npage; i++)
> +	for (i = 0; i < npage; i++) {
> +		if (do_accounting)
> +			unlock_acct += vfio_pfn_account(iommu, pfn);
> +
>  		unlocked += put_pfn(pfn++, prot);
> +	}
>  
>  	if (do_accounting)
>  		vfio_lock_acct(current, -unlocked);
> @@ -344,14 +474,208 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
>  	return unlocked;
>  }
>  
> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static long __vfio_pin_page_external(struct vfio_domain *domain,
> +				     unsigned long vaddr, int prot,
> +				     unsigned long *pfn_base,
> +				     bool do_accounting)
> +{
> +	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	bool lock_cap = capable(CAP_IPC_LOCK);
> +	long ret;
> +	bool rsvd;
> +	struct task_struct *task = domain->external_addr_space->task;
> +
> +	if (!task->mm)
> +		return -ENODEV;
> +
> +	ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);

Hi Kirti,

In your implementation of vaddr_get_pfn, as long as the 1st argument
is not NULL, you will call get_user_pages_remote rather than get_user_pages_fast.

But still there is a possibility that @task here is actually @current, so that
task->mm == current->mm.  That should not be rare, but will probably happen
most of the time: the QEMU process emulates MMIO read/write, and calls
vfio_pin_pages in its own process context.

If I read correctly, the current implementation enforces a locked GUP, no
matter which process context it is in.

--
Thanks,
Jike

  parent reply	other threads:[~2016-11-02 13:33 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-26 21:29 [Qemu-devel] [PATCH v10 00/19] Add Mediated device support Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 01/19] vfio: Mediated device Core driver Kirti Wankhede
2016-10-29  4:30   ` Jike Song
2016-10-29 10:06     ` Kirti Wankhede
2016-10-29 18:11       ` Jike Song
2016-11-02  7:59         ` Kirti Wankhede
2016-11-02 10:31           ` Jike Song
2016-11-01  3:08   ` Jike Song
2016-11-01  3:44     ` Alex Williamson
2016-11-01  5:28       ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 02/19] vfio: VFIO based driver for Mediated devices Kirti Wankhede
2016-11-02 10:39   ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 03/19] vfio: Rearrange functions to get vfio_group from dev Kirti Wankhede
2016-11-02 10:41   ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 04/19] vfio: Common function to increment container_users Kirti Wankhede
2016-11-02 11:34   ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 05/19] vfio iommu: Added pin and unpin callback functions to vfio_iommu_driver_ops Kirti Wankhede
2016-11-01  8:07   ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 06/19] vfio iommu type1: Update arguments of vfio_lock_acct Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 07/19] vfio iommu type1: Update argument of vaddr_get_pfn() Kirti Wankhede
2016-10-27 12:11   ` Jike Song
2016-10-27 12:24     ` Kirti Wankhede
2016-10-28  6:01       ` Jike Song
2016-11-02  8:06         ` Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 08/19] vfio iommu type1: Add find_iommu_group() function Kirti Wankhede
2016-11-02 14:13   ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 09/19] vfio iommu type1: Add support for mediated devices Kirti Wankhede
2016-10-27 23:01   ` Alex Williamson
2016-11-02 13:29   ` Jike Song [this message]
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 10/19] vfio iommu: Add blocking notifier to notify DMA_UNMAP Kirti Wankhede
2016-10-28  7:33   ` Jike Song
2016-10-28 12:38     ` Kirti Wankhede
2016-10-28 12:40     ` Alex Williamson
2016-10-28 20:02       ` Kirti Wankhede
2016-10-28 20:33         ` Alex Williamson
2016-10-29 10:37           ` Kirti Wankhede
2016-10-29 14:03             ` Alex Williamson
2016-11-01  3:45               ` Dong Jia Shi
2016-11-01  7:47                 ` Kirti Wankhede
2016-11-01  8:33                   ` Dong Jia Shi
2016-10-31  3:50   ` Jike Song
2016-10-31  5:59     ` Kirti Wankhede
2016-10-31  6:05       ` Jike Song
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 11/19] vfio: Introduce common function to add capabilities Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 12/19] vfio_pci: Update vfio_pci to use vfio_info_add_capability() Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 13/19] vfio: Introduce vfio_set_irqs_validate_and_prepare() Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 14/19] vfio_pci: Updated to use vfio_set_irqs_validate_and_prepare() Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 15/19] vfio_platform: " Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 16/19] vfio: Define device_api strings Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 17/19] docs: Add Documentation for Mediated devices Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 18/19] docs: Sysfs ABI for mediated device framework Kirti Wankhede
2016-10-31  7:19   ` Jike Song
2016-11-02  7:55     ` Kirti Wankhede
2016-10-26 21:29 ` [Qemu-devel] [PATCH v10 19/19] docs: Sample driver to demonstrate how to use Mediated " Kirti Wankhede
2016-10-27 14:29   ` Jonathan Corbet
2016-11-01  8:32 ` [Qemu-devel] [PATCH v10 00/19] Add Mediated device support Jike Song
2016-11-01 15:24   ` Gerd Hoffmann
2016-11-02  1:01     ` Jike Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5819EA34.1020607@intel.com \
    --to=jike.song@intel.com \
    --cc=alex.williamson@redhat.com \
    --cc=bjsdjshi@linux.vnet.ibm.com \
    --cc=cjia@nvidia.com \
    --cc=kevin.tian@intel.com \
    --cc=kraxel@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=kwankhede@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).