All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yury Norov <yury.norov@gmail.com>
To: Nicolin Chen <nicolinc@nvidia.com>
Cc: will@kernel.org, robin.murphy@arm.com, jgg@nvidia.com,
	kevin.tian@intel.com, tglx@linutronix.de, maz@kernel.org,
	alex.williamson@redhat.com, joro@8bytes.org, shuah@kernel.org,
	reinette.chatre@intel.com, eric.auger@redhat.com,
	yebin10@huawei.com, apatel@ventanamicro.com,
	shivamurthy.shastri@linutronix.de, bhelgaas@google.com,
	anna-maria@linutronix.de, nipun.gupta@amd.com,
	iommu@lists.linux.dev, linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, kvm@vger.kernel.org,
	linux-kselftest@vger.kernel.org, patches@lists.linux.dev,
	jean-philippe@linaro.org, mdf@kernel.org, mshavit@google.com,
	shameerali.kolothum.thodi@huawei.com, smostafa@google.com,
	ddutile@redhat.com
Subject: Re: [PATCH RFCv2 07/13] iommufd: Implement sw_msi support natively
Date: Tue, 14 Jan 2025 23:21:13 -0500	[thread overview]
Message-ID: <Z4c3ueuDgM7YqElp@thinkpad> (raw)
In-Reply-To: <f70451cf4274bc5955824efe9f98ec7dfdd10927.1736550979.git.nicolinc@nvidia.com>

On Fri, Jan 10, 2025 at 07:32:23PM -0800, Nicolin Chen wrote:
> From: Jason Gunthorpe <jgg@nvidia.com>
> 
> iommufd has a model where the iommu_domain can be changed while the VFIO
> device is attached. In this case the MSI should continue to work. This
> corner case has not worked because the dma-iommu implementation of sw_msi
> is tied to a single domain.
> 
> Implement the sw_msi mapping directly and use a global per-fd table to
> associate assigned iova to the MSI pages. This allows the MSI pages to
> loaded into a domain before it is attached ensuring that MSI is not

s/loaded/be loaded/ ?

> disrupted.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> [nicolinc: set sw_msi pointer in nested hwpt allocators]
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/iommufd/iommufd_private.h |  23 +++-
>  drivers/iommu/iommufd/device.c          | 158 ++++++++++++++++++++----
>  drivers/iommu/iommufd/hw_pagetable.c    |   3 +
>  drivers/iommu/iommufd/main.c            |   9 ++
>  4 files changed, 170 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 063c0a42f54f..3e83bbb5912c 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -19,6 +19,22 @@ struct iommu_group;
>  struct iommu_option;
>  struct iommufd_device;
>  
> +struct iommufd_sw_msi_map {
> +	struct list_head sw_msi_item;
> +	phys_addr_t sw_msi_start;
> +	phys_addr_t msi_addr;
> +	unsigned int pgoff;
> +	unsigned int id;
> +};
> +
> +/* Bitmap of struct iommufd_sw_msi_map::id */
> +struct iommufd_sw_msi_maps {
> +	DECLARE_BITMAP(bitmap, 64);
> +};
> +
> +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
> +		   phys_addr_t msi_addr);
> +
>  struct iommufd_ctx {
>  	struct file *file;
>  	struct xarray objects;
> @@ -26,6 +42,10 @@ struct iommufd_ctx {
>  	wait_queue_head_t destroy_wait;
>  	struct rw_semaphore ioas_creation_lock;
>  
> +	struct mutex sw_msi_lock;
> +	struct list_head sw_msi_list;
> +	unsigned int sw_msi_id;
> +
>  	u8 account_mode;
>  	/* Compatibility with VFIO no iommu */
>  	u8 no_iommu_mode;
> @@ -283,10 +303,10 @@ struct iommufd_hwpt_paging {
>  	struct iommufd_ioas *ioas;
>  	bool auto_domain : 1;
>  	bool enforce_cache_coherency : 1;
> -	bool msi_cookie : 1;
>  	bool nest_parent : 1;
>  	/* Head at iommufd_ioas::hwpt_list */
>  	struct list_head hwpt_item;
> +	struct iommufd_sw_msi_maps present_sw_msi;
>  };
>  
>  struct iommufd_hwpt_nested {
> @@ -383,6 +403,7 @@ struct iommufd_group {
>  	struct iommu_group *group;
>  	struct iommufd_hw_pagetable *hwpt;
>  	struct list_head device_list;
> +	struct iommufd_sw_msi_maps required_sw_msi;
>  	phys_addr_t sw_msi_start;
>  };
>  
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index 38b31b652147..f75b3c23cd41 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -5,6 +5,7 @@
>  #include <linux/iommufd.h>
>  #include <linux/slab.h>
>  #include <uapi/linux/iommufd.h>
> +#include <linux/msi.h>
>  
>  #include "../iommu-priv.h"
>  #include "io_pagetable.h"
> @@ -293,36 +294,149 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
>  }
>  EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD");
>  
> +/*
> + * Get a iommufd_sw_msi_map for the msi physical address requested by the irq
> + * layer. The mapping to IOVA is global to the iommufd file descriptor, every
> + * domain that is attached to a device using the same MSI parameters will use
> + * the same IOVA.
> + */
> +static struct iommufd_sw_msi_map *
> +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr,
> +		       phys_addr_t sw_msi_start)
> +{
> +	struct iommufd_sw_msi_map *cur;
> +	unsigned int max_pgoff = 0;
> +
> +	lockdep_assert_held(&ictx->sw_msi_lock);
> +
> +	list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
> +		if (cur->sw_msi_start != sw_msi_start)
> +			continue;
> +		max_pgoff = max(max_pgoff, cur->pgoff + 1);
> +		if (cur->msi_addr == msi_addr)
> +			return cur;
> +	}
> +
> +	if (ictx->sw_msi_id >=
> +	    BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap))
> +		return ERR_PTR(-EOVERFLOW);
> +
> +	cur = kzalloc(sizeof(*cur), GFP_KERNEL);
> +	if (!cur)
> +		cur = ERR_PTR(-ENOMEM);
> +	cur->sw_msi_start = sw_msi_start;
> +	cur->msi_addr = msi_addr;
> +	cur->pgoff = max_pgoff;
> +	cur->id = ictx->sw_msi_id++;
> +	list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list);
> +	return cur;
> +}
> +
> +static int iommufd_sw_msi_install(struct iommufd_ctx *ictx,
> +				  struct iommufd_hwpt_paging *hwpt_paging,
> +				  struct iommufd_sw_msi_map *msi_map)
> +{
> +	unsigned long iova;
> +
> +	lockdep_assert_held(&ictx->sw_msi_lock);
> +
> +	iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
> +	if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) {
> +		int rc;
> +
> +		rc = iommu_map(hwpt_paging->common.domain, iova,
> +			       msi_map->msi_addr, PAGE_SIZE,
> +			       IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO,
> +			       GFP_KERNEL_ACCOUNT);
> +		if (rc)
> +			return rc;
> +		set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap);
> +	}
> +	return 0;
> +}

So, does sw_msi_lock protect the present_sw_msi bitmap? If so, you
should use non-atomic __set_bit(). If not, you'd do something like:

        if (test_and_set_bit(...))
                return 0;

        rc = iommu_map(...);
        if (rc)
                clear_bit(...);

        return rc

Now it looks like a series of atomic accesses, which is not atomic, and it
misleads...

> +
> +/*
> + * Called by the irq code if the platform translates the MSI address through the
> + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will
> + * allocate a fd global iova for the physical page that is the same on all
> + * domains and devices.
> + */
> +#ifdef CONFIG_IRQ_MSI_IOMMU
> +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
> +		   phys_addr_t msi_addr)
> +{
> +	struct device *dev = msi_desc_to_dev(desc);
> +	struct iommu_attach_handle *raw_handle;
> +	struct iommufd_hwpt_paging *hwpt_paging;
> +	struct iommufd_attach_handle *handle;
> +	struct iommufd_sw_msi_map *msi_map;
> +	struct iommufd_ctx *ictx;
> +	unsigned long iova;
> +	int rc;
> +
> +	raw_handle =
> +		iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0);

Nit: no need to break the line.

> +	if (!raw_handle)
> +		return 0;
> +	hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt);
> +
> +	handle = to_iommufd_handle(raw_handle);
> +	/* No IOMMU_RESV_SW_MSI means no change to the msi_msg */
> +	if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX)
> +		return 0;
> +
> +	ictx = handle->idev->ictx;
> +	guard(mutex)(&ictx->sw_msi_lock);
> +	/*
> +	 * The input msi_addr is the exact byte offset of the MSI doorbell, we
> +	 * assume the caller has checked that it is contained with a MMIO region
> +	 * that is secure to map at PAGE_SIZE.
> +	 */
> +	msi_map = iommufd_sw_msi_get_map(handle->idev->ictx,
> +					 msi_addr & PAGE_MASK,
> +					 handle->idev->igroup->sw_msi_start);
> +	if (IS_ERR(msi_map))
> +		return PTR_ERR(msi_map);
> +
> +	rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map);
> +	if (rc)
> +		return rc;
> +	set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap);

Same here. I guess, sw_msi_lock protects required_sw_msi.bitmap,
right?

Thanks,
Yury

> +
> +	iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
> +	msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
> +	return 0;
> +}
> +#endif
> +
> +/*
> + * FIXME: when a domain is removed any ids that are not in the union of
> + * all still attached devices should be removed.
> + */
> +
>  static int iommufd_group_setup_msi(struct iommufd_group *igroup,
>  				   struct iommufd_hwpt_paging *hwpt_paging)
>  {
> -	phys_addr_t sw_msi_start = igroup->sw_msi_start;
> -	int rc;
> +	struct iommufd_ctx *ictx = igroup->ictx;
> +	struct iommufd_sw_msi_map *cur;
> +
> +	if (igroup->sw_msi_start == PHYS_ADDR_MAX)
> +		return 0;
>  
>  	/*
> -	 * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to
> -	 * call iommu_get_msi_cookie() on its behalf. This is necessary to setup
> -	 * the MSI window so iommu_dma_prepare_msi() can install pages into our
> -	 * domain after request_irq(). If it is not done interrupts will not
> -	 * work on this domain.
> -	 *
> -	 * FIXME: This is conceptually broken for iommufd since we want to allow
> -	 * userspace to change the domains, eg switch from an identity IOAS to a
> -	 * DMA IOAS. There is currently no way to create a MSI window that
> -	 * matches what the IRQ layer actually expects in a newly created
> -	 * domain.
> +	 * Install all the MSI pages the device has been using into the domain
>  	 */
> -	if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
> -		rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
> -					  sw_msi_start);
> +	guard(mutex)(&ictx->sw_msi_lock);
> +	list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
> +		int rc;
> +
> +		if (cur->sw_msi_start != igroup->sw_msi_start ||
> +		    !test_bit(cur->id, igroup->required_sw_msi.bitmap))
> +			continue;
> +
> +		rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur);
>  		if (rc)
>  			return rc;
> -
> -		/*
> -		 * iommu_get_msi_cookie() can only be called once per domain,
> -		 * it returns -EBUSY on later calls.
> -		 */
> -		hwpt_paging->msi_cookie = true;
>  	}
>  	return 0;
>  }
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index f7c0d7b214b6..538484eecb3b 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
>  			goto out_abort;
>  		}
>  	}
> +	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
>  
>  	/*
>  	 * Set the coherency mode before we do iopt_table_add_domain() as some
> @@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
>  		goto out_abort;
>  	}
>  	hwpt->domain->owner = ops;
> +	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
>  
>  	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
>  		rc = -EINVAL;
> @@ -303,6 +305,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
>  		goto out_abort;
>  	}
>  	hwpt->domain->owner = viommu->iommu_dev->ops;
> +	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
>  
>  	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
>  		rc = -EINVAL;
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 97c5e3567d33..7cc9497b7193 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -227,6 +227,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
>  	xa_init(&ictx->groups);
>  	ictx->file = filp;
>  	init_waitqueue_head(&ictx->destroy_wait);
> +	mutex_init(&ictx->sw_msi_lock);
> +	INIT_LIST_HEAD(&ictx->sw_msi_list);
>  	filp->private_data = ictx;
>  	return 0;
>  }
> @@ -234,6 +236,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
>  static int iommufd_fops_release(struct inode *inode, struct file *filp)
>  {
>  	struct iommufd_ctx *ictx = filp->private_data;
> +	struct iommufd_sw_msi_map *next;
> +	struct iommufd_sw_msi_map *cur;
>  	struct iommufd_object *obj;
>  
>  	/*
> @@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
>  			break;
>  	}
>  	WARN_ON(!xa_empty(&ictx->groups));
> +
> +	mutex_destroy(&ictx->sw_msi_lock);
> +	list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item)
> +		kfree(cur);
> +
>  	kfree(ictx);
>  	return 0;
>  }
> -- 
> 2.43.0

  reply	other threads:[~2025-01-15  4:21 UTC|newest]

Thread overview: 65+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-01-11  3:32 [PATCH RFCv2 00/13] iommu: Add MSI mapping support with nested SMMU Nicolin Chen
2025-01-11  3:32 ` [PATCH RFCv2 01/13] genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie Nicolin Chen
2025-01-23 17:10   ` Eric Auger
2025-01-23 18:48     ` Jason Gunthorpe
2025-01-29 12:11       ` Eric Auger
2025-01-11  3:32 ` [PATCH RFCv2 02/13] genirq/msi: Rename iommu_dma_compose_msi_msg() to msi_msg_set_msi_addr() Nicolin Chen
2025-01-23 17:10   ` Eric Auger
2025-01-23 18:50     ` Jason Gunthorpe
2025-01-29 10:44       ` Eric Auger
2025-01-11  3:32 ` [PATCH RFCv2 03/13] iommu: Make iommu_dma_prepare_msi() into a generic operation Nicolin Chen
2025-01-23 17:10   ` Eric Auger
2025-01-23 18:16     ` Jason Gunthorpe
2025-01-29 12:29       ` Eric Auger
2025-01-11  3:32 ` [PATCH RFCv2 04/13] irqchip: Have CONFIG_IRQ_MSI_IOMMU be selected by the irqchips that need it Nicolin Chen
2025-01-12  2:46   ` kernel test robot
2025-01-11  3:32 ` [PATCH RFCv2 05/13] iommu: Turn fault_data to iommufd private pointer Nicolin Chen
2025-01-23  9:54   ` Tian, Kevin
2025-01-23 13:25     ` Jason Gunthorpe
2025-01-29 12:40   ` Eric Auger
2025-02-03 17:48     ` Nicolin Chen
2025-01-11  3:32 ` [PATCH RFCv2 06/13] iommufd: Make attach_handle generic Nicolin Chen
2025-01-18  8:23   ` Yi Liu
2025-01-18 20:32     ` Nicolin Chen
2025-01-19 10:40       ` Yi Liu
2025-01-20  5:54         ` Nicolin Chen
2025-01-24 13:31           ` Yi Liu
2025-01-20 14:20       ` Jason Gunthorpe
2025-01-29 13:14   ` Eric Auger
2025-02-03 18:08     ` Nicolin Chen
2025-01-11  3:32 ` [PATCH RFCv2 07/13] iommufd: Implement sw_msi support natively Nicolin Chen
2025-01-15  4:21   ` Yury Norov [this message]
2025-01-16 20:21     ` Jason Gunthorpe
2025-01-23 19:30   ` Jason Gunthorpe
2025-01-11  3:32 ` [PATCH RFCv2 08/13] iommu: Turn iova_cookie to dma-iommu private pointer Nicolin Chen
2025-01-13 16:40   ` Jason Gunthorpe
2025-01-11  3:32 ` [PATCH RFCv2 09/13] iommufd: Add IOMMU_OPTION_SW_MSI_START/SIZE ioctls Nicolin Chen
2025-01-23 10:07   ` Tian, Kevin
2025-02-03 18:36     ` Nicolin Chen
2025-01-29 13:44   ` Eric Auger
2025-01-29 14:58     ` Jason Gunthorpe
2025-01-29 17:23       ` Eric Auger
2025-01-29 17:39         ` Jason Gunthorpe
2025-01-29 17:49           ` Eric Auger
2025-01-29 20:15             ` Jason Gunthorpe
2025-02-07  4:26       ` Nicolin Chen
2025-02-07 14:30         ` Jason Gunthorpe
2025-02-07 15:28           ` Jason Gunthorpe
2025-02-07 18:59             ` Nicolin Chen
2025-02-09 18:09               ` Jason Gunthorpe
2025-01-11  3:32 ` [PATCH RFCv2 10/13] iommufd/selftes: Add coverage for IOMMU_OPTION_SW_MSI_START/SIZE Nicolin Chen
2025-01-11  3:32 ` [PATCH RFCv2 11/13] iommufd/device: Allow setting IOVAs for MSI(x) vectors Nicolin Chen
2025-01-11  3:32 ` [PATCH RFCv2 12/13] vfio-iommufd: Provide another layer of msi_iova helpers Nicolin Chen
2025-01-11  3:32 ` [PATCH RFCv2 13/13] vfio/pci: Allow preset MSI IOVAs via VFIO_IRQ_SET_ACTION_PREPARE Nicolin Chen
2025-01-23  9:06 ` [PATCH RFCv2 00/13] iommu: Add MSI mapping support with nested SMMU Shameerali Kolothum Thodi
2025-01-23 13:24   ` Jason Gunthorpe
2025-01-29 14:54     ` Eric Auger
2025-01-29 15:04       ` Jason Gunthorpe
2025-01-29 17:46         ` Eric Auger
2025-01-29 20:13           ` Jason Gunthorpe
2025-02-04 12:55             ` Eric Auger
2025-02-04 13:02               ` Jason Gunthorpe
2025-02-05 22:49 ` Jacob Pan
2025-02-05 22:56   ` Nicolin Chen
2025-02-07 14:34 ` Jason Gunthorpe
2025-02-07 14:42   ` Thomas Gleixner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Z4c3ueuDgM7YqElp@thinkpad \
    --to=yury.norov@gmail.com \
    --cc=alex.williamson@redhat.com \
    --cc=anna-maria@linutronix.de \
    --cc=apatel@ventanamicro.com \
    --cc=bhelgaas@google.com \
    --cc=ddutile@redhat.com \
    --cc=eric.auger@redhat.com \
    --cc=iommu@lists.linux.dev \
    --cc=jean-philippe@linaro.org \
    --cc=jgg@nvidia.com \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=maz@kernel.org \
    --cc=mdf@kernel.org \
    --cc=mshavit@google.com \
    --cc=nicolinc@nvidia.com \
    --cc=nipun.gupta@amd.com \
    --cc=patches@lists.linux.dev \
    --cc=reinette.chatre@intel.com \
    --cc=robin.murphy@arm.com \
    --cc=shameerali.kolothum.thodi@huawei.com \
    --cc=shivamurthy.shastri@linutronix.de \
    --cc=shuah@kernel.org \
    --cc=smostafa@google.com \
    --cc=tglx@linutronix.de \
    --cc=will@kernel.org \
    --cc=yebin10@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.