From: Yury Norov <yury.norov@gmail.com>
To: Nicolin Chen <nicolinc@nvidia.com>
Cc: will@kernel.org, robin.murphy@arm.com, jgg@nvidia.com,
	kevin.tian@intel.com, tglx@linutronix.de, maz@kernel.org,
	alex.williamson@redhat.com, joro@8bytes.org, shuah@kernel.org,
	reinette.chatre@intel.com, eric.auger@redhat.com,
	yebin10@huawei.com, apatel@ventanamicro.com,
	shivamurthy.shastri@linutronix.de, bhelgaas@google.com,
	anna-maria@linutronix.de, nipun.gupta@amd.com,
	iommu@lists.linux.dev, linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, kvm@vger.kernel.org,
	linux-kselftest@vger.kernel.org, patches@lists.linux.dev,
	jean-philippe@linaro.org, mdf@kernel.org, mshavit@google.com,
	shameerali.kolothum.thodi@huawei.com, smostafa@google.com,
	ddutile@redhat.com
Subject: Re: [PATCH RFCv2 07/13] iommufd: Implement sw_msi support natively
Date: Tue, 14 Jan 2025 23:21:13 -0500
Message-ID: <Z4c3ueuDgM7YqElp@thinkpad>
In-Reply-To: <f70451cf4274bc5955824efe9f98ec7dfdd10927.1736550979.git.nicolinc@nvidia.com>

On Fri, Jan 10, 2025 at 07:32:23PM -0800, Nicolin Chen wrote:
> From: Jason Gunthorpe <jgg@nvidia.com>
> 
> iommufd has a model where the iommu_domain can be changed while the VFIO
> device is attached. In this case the MSI should continue to work. This
> corner case has not worked because the dma-iommu implementation of sw_msi
> is tied to a single domain.
> 
> Implement the sw_msi mapping directly and use a global per-fd table to
> associate assigned iova to the MSI pages. This allows the MSI pages to
> loaded into a domain before it is attached ensuring that MSI is not

s/loaded/be loaded/ ?

> disrupted.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> [nicolinc: set sw_msi pointer in nested hwpt allocators]
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/iommufd/iommufd_private.h |  23 +++-
>  drivers/iommu/iommufd/device.c          | 158 ++++++++++++++++++++----
>  drivers/iommu/iommufd/hw_pagetable.c    |   3 +
>  drivers/iommu/iommufd/main.c            |   9 ++
>  4 files changed, 170 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 063c0a42f54f..3e83bbb5912c 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -19,6 +19,22 @@ struct iommu_group;
>  struct iommu_option;
>  struct iommufd_device;
>  
> +struct iommufd_sw_msi_map {
> +	struct list_head sw_msi_item;
> +	phys_addr_t sw_msi_start;
> +	phys_addr_t msi_addr;
> +	unsigned int pgoff;
> +	unsigned int id;
> +};
> +
> +/* Bitmap of struct iommufd_sw_msi_map::id */
> +struct iommufd_sw_msi_maps {
> +	DECLARE_BITMAP(bitmap, 64);
> +};
> +
> +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
> +		   phys_addr_t msi_addr);
> +
>  struct iommufd_ctx {
>  	struct file *file;
>  	struct xarray objects;
> @@ -26,6 +42,10 @@ struct iommufd_ctx {
>  	wait_queue_head_t destroy_wait;
>  	struct rw_semaphore ioas_creation_lock;
>  
> +	struct mutex sw_msi_lock;
> +	struct list_head sw_msi_list;
> +	unsigned int sw_msi_id;
> +
>  	u8 account_mode;
>  	/* Compatibility with VFIO no iommu */
>  	u8 no_iommu_mode;
> @@ -283,10 +303,10 @@ struct iommufd_hwpt_paging {
>  	struct iommufd_ioas *ioas;
>  	bool auto_domain : 1;
>  	bool enforce_cache_coherency : 1;
> -	bool msi_cookie : 1;
>  	bool nest_parent : 1;
>  	/* Head at iommufd_ioas::hwpt_list */
>  	struct list_head hwpt_item;
> +	struct iommufd_sw_msi_maps present_sw_msi;
>  };
>  
>  struct iommufd_hwpt_nested {
> @@ -383,6 +403,7 @@ struct iommufd_group {
>  	struct iommu_group *group;
>  	struct iommufd_hw_pagetable *hwpt;
>  	struct list_head device_list;
> +	struct iommufd_sw_msi_maps required_sw_msi;
>  	phys_addr_t sw_msi_start;
>  };
>  
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index 38b31b652147..f75b3c23cd41 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -5,6 +5,7 @@
>  #include <linux/iommufd.h>
>  #include <linux/slab.h>
>  #include <uapi/linux/iommufd.h>
> +#include <linux/msi.h>
>  
>  #include "../iommu-priv.h"
>  #include "io_pagetable.h"
> @@ -293,36 +294,149 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
>  }
>  EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD");
>  
> +/*
> + * Get an iommufd_sw_msi_map for the msi physical address requested by the irq
> + * layer. The mapping to IOVA is global to the iommufd file descriptor, every
> + * domain that is attached to a device using the same MSI parameters will use
> + * the same IOVA.
> + */
> +static struct iommufd_sw_msi_map *
> +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr,
> +		       phys_addr_t sw_msi_start)
> +{
> +	struct iommufd_sw_msi_map *cur;
> +	unsigned int max_pgoff = 0;
> +
> +	lockdep_assert_held(&ictx->sw_msi_lock);
> +
> +	list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
> +		if (cur->sw_msi_start != sw_msi_start)
> +			continue;
> +		max_pgoff = max(max_pgoff, cur->pgoff + 1);
> +		if (cur->msi_addr == msi_addr)
> +			return cur;
> +	}
> +
> +	if (ictx->sw_msi_id >=
> +	    BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap))
> +		return ERR_PTR(-EOVERFLOW);
> +
> +	cur = kzalloc(sizeof(*cur), GFP_KERNEL);
> +	if (!cur)
> +		return ERR_PTR(-ENOMEM);
> +	cur->sw_msi_start = sw_msi_start;
> +	cur->msi_addr = msi_addr;
> +	cur->pgoff = max_pgoff;
> +	cur->id = ictx->sw_msi_id++;
> +	list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list);
> +	return cur;
> +}
> +
> +static int iommufd_sw_msi_install(struct iommufd_ctx *ictx,
> +				  struct iommufd_hwpt_paging *hwpt_paging,
> +				  struct iommufd_sw_msi_map *msi_map)
> +{
> +	unsigned long iova;
> +
> +	lockdep_assert_held(&ictx->sw_msi_lock);
> +
> +	iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
> +	if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) {
> +		int rc;
> +
> +		rc = iommu_map(hwpt_paging->common.domain, iova,
> +			       msi_map->msi_addr, PAGE_SIZE,
> +			       IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO,
> +			       GFP_KERNEL_ACCOUNT);
> +		if (rc)
> +			return rc;
> +		set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap);
> +	}
> +	return 0;
> +}

So, does sw_msi_lock protect the present_sw_msi bitmap? If so, you
should use non-atomic __set_bit(). If not, you'd do something like:

        if (test_and_set_bit(...))
                return 0;

        rc = iommu_map(...);
        if (rc)
                clear_bit(...);

        return rc;

Now it looks like a series of atomic accesses, but the sequence as a
whole is not atomic, and that is misleading...
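
If the lock does protect the bitmap, a minimal sketch of the install
path (my reading of the patch, using the non-atomic helpers so the
locking is explicit):

	lockdep_assert_held(&ictx->sw_msi_lock);

	/* Already mapped into this domain? Nothing to do. */
	if (__test_and_set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap))
		return 0;

	rc = iommu_map(hwpt_paging->common.domain, iova,
		       msi_map->msi_addr, PAGE_SIZE,
		       IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO,
		       GFP_KERNEL_ACCOUNT);
	if (rc)
		/* Undo the speculative bit on mapping failure */
		__clear_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap);
	return rc;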

> +
> +/*
> + * Called by the irq code if the platform translates the MSI address through the
> + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will
> + * allocate an fd-global iova for the physical page that is the same on all
> + * domains and devices.
> + */
> +#ifdef CONFIG_IRQ_MSI_IOMMU
> +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
> +		   phys_addr_t msi_addr)
> +{
> +	struct device *dev = msi_desc_to_dev(desc);
> +	struct iommu_attach_handle *raw_handle;
> +	struct iommufd_hwpt_paging *hwpt_paging;
> +	struct iommufd_attach_handle *handle;
> +	struct iommufd_sw_msi_map *msi_map;
> +	struct iommufd_ctx *ictx;
> +	unsigned long iova;
> +	int rc;
> +
> +	raw_handle =
> +		iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0);

Nit: no need to break the line.
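
i.e. the call fits on one line within the kernel's 100-column limit:

	raw_handle = iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0);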

> +	if (!raw_handle)
> +		return 0;
> +	hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt);
> +
> +	handle = to_iommufd_handle(raw_handle);
> +	/* No IOMMU_RESV_SW_MSI means no change to the msi_msg */
> +	if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX)
> +		return 0;
> +
> +	ictx = handle->idev->ictx;
> +	guard(mutex)(&ictx->sw_msi_lock);
> +	/*
> +	 * The input msi_addr is the exact byte offset of the MSI doorbell; we
> +	 * assume the caller has checked that it is contained within an MMIO
> +	 * region that is secure to map at PAGE_SIZE.
> +	 */
> +	msi_map = iommufd_sw_msi_get_map(handle->idev->ictx,
> +					 msi_addr & PAGE_MASK,
> +					 handle->idev->igroup->sw_msi_start);
> +	if (IS_ERR(msi_map))
> +		return PTR_ERR(msi_map);
> +
> +	rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map);
> +	if (rc)
> +		return rc;
> +	set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap);

Same here. I guess sw_msi_lock protects required_sw_msi.bitmap,
right?
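
If so, the non-atomic variant both documents and relies on that, e.g.:

	__set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap);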

Thanks,
Yury

> +
> +	iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
> +	msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
> +	return 0;
> +}
> +#endif
> +
> +/*
> + * FIXME: when a domain is removed any ids that are not in the union of
> + * all still attached devices should be removed.
> + */
> +
>  static int iommufd_group_setup_msi(struct iommufd_group *igroup,
>  				   struct iommufd_hwpt_paging *hwpt_paging)
>  {
> -	phys_addr_t sw_msi_start = igroup->sw_msi_start;
> -	int rc;
> +	struct iommufd_ctx *ictx = igroup->ictx;
> +	struct iommufd_sw_msi_map *cur;
> +
> +	if (igroup->sw_msi_start == PHYS_ADDR_MAX)
> +		return 0;
>  
>  	/*
> -	 * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to
> -	 * call iommu_get_msi_cookie() on its behalf. This is necessary to setup
> -	 * the MSI window so iommu_dma_prepare_msi() can install pages into our
> -	 * domain after request_irq(). If it is not done interrupts will not
> -	 * work on this domain.
> -	 *
> -	 * FIXME: This is conceptually broken for iommufd since we want to allow
> -	 * userspace to change the domains, eg switch from an identity IOAS to a
> -	 * DMA IOAS. There is currently no way to create a MSI window that
> -	 * matches what the IRQ layer actually expects in a newly created
> -	 * domain.
> +	 * Install all the MSI pages the device has been using into the domain
>  	 */
> -	if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
> -		rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
> -					  sw_msi_start);
> +	guard(mutex)(&ictx->sw_msi_lock);
> +	list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
> +		int rc;
> +
> +		if (cur->sw_msi_start != igroup->sw_msi_start ||
> +		    !test_bit(cur->id, igroup->required_sw_msi.bitmap))
> +			continue;
> +
> +		rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur);
>  		if (rc)
>  			return rc;
> -
> -		/*
> -		 * iommu_get_msi_cookie() can only be called once per domain,
> -		 * it returns -EBUSY on later calls.
> -		 */
> -		hwpt_paging->msi_cookie = true;
>  	}
>  	return 0;
>  }
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index f7c0d7b214b6..538484eecb3b 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
>  			goto out_abort;
>  		}
>  	}
> +	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
>  
>  	/*
>  	 * Set the coherency mode before we do iopt_table_add_domain() as some
> @@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
>  		goto out_abort;
>  	}
>  	hwpt->domain->owner = ops;
> +	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
>  
>  	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
>  		rc = -EINVAL;
> @@ -303,6 +305,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
>  		goto out_abort;
>  	}
>  	hwpt->domain->owner = viommu->iommu_dev->ops;
> +	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
>  
>  	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
>  		rc = -EINVAL;
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 97c5e3567d33..7cc9497b7193 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -227,6 +227,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
>  	xa_init(&ictx->groups);
>  	ictx->file = filp;
>  	init_waitqueue_head(&ictx->destroy_wait);
> +	mutex_init(&ictx->sw_msi_lock);
> +	INIT_LIST_HEAD(&ictx->sw_msi_list);
>  	filp->private_data = ictx;
>  	return 0;
>  }
> @@ -234,6 +236,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
>  static int iommufd_fops_release(struct inode *inode, struct file *filp)
>  {
>  	struct iommufd_ctx *ictx = filp->private_data;
> +	struct iommufd_sw_msi_map *next;
> +	struct iommufd_sw_msi_map *cur;
>  	struct iommufd_object *obj;
>  
>  	/*
> @@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
>  			break;
>  	}
>  	WARN_ON(!xa_empty(&ictx->groups));
> +
> +	mutex_destroy(&ictx->sw_msi_lock);
> +	list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item)
> +		kfree(cur);
> +
>  	kfree(ictx);
>  	return 0;
>  }
> -- 
> 2.43.0
