All of lore.kernel.org
 help / color / mirror / Atom feed
From: Pranjal Shrivastava <praan@google.com>
To: Nicolin Chen <nicolinc@nvidia.com>
Cc: will@kernel.org, jean-philippe@linaro.org, robin.murphy@arm.com,
	joro@8bytes.org, jgg@nvidia.com, balbirs@nvidia.com,
	miko.lenczewski@arm.com, peterz@infradead.org,
	kevin.tian@intel.com, linux-arm-kernel@lists.infradead.org,
	iommu@lists.linux.dev, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v9 6/7] iommu/arm-smmu-v3: Add arm_smmu_invs based arm_smmu_domain_inv_range()
Date: Fri, 23 Jan 2026 09:48:37 +0000	[thread overview]
Message-ID: <aXND9YJTwm6om689@google.com> (raw)
In-Reply-To: <06999367d001283744fd98eb7c1823afd516ce84.1766174731.git.nicolinc@nvidia.com>

On Fri, Dec 19, 2025 at 12:11:28PM -0800, Nicolin Chen wrote:
> Each smmu_domain now has an arm_smmu_invs that specifies the invalidation
> steps to perform after any change the IOPTEs. This includes supports for
> basic ASID/VMID, the special case for nesting, and ATC invalidations.
> 
> Introduce a new arm_smmu_domain_inv helper iterating smmu_domain->invs to
> convert the invalidation array to commands. Any invalidation request with
> no size specified means an entire flush over a range based one.
> 
> Take advantage of the sorted array to compatible batch operations together
> to the same SMMU. For instance, ATC invaliations for multiple SIDs can be
> pushed as a batch.
> 
> ATC invalidations must be completed before the driver disables ATS. Or the
> device is permitted to ignore any racing invalidation that would cause an
> SMMU timeout. The sequencing is done with a rwlock where holding the write
> side of the rwlock means that there are no outstanding ATC invalidations.
> If ATS is not used the rwlock is ignored, similar to the existing code.
> 
> Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |   9 +
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 258 +++++++++++++++++++-
>  2 files changed, 254 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index f8dc96476c43..c3fee7f14480 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -1086,6 +1086,15 @@ void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
>  int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>  			    unsigned long iova, size_t size);
>  
> +void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain,
> +			       unsigned long iova, size_t size,
> +			       unsigned int granule, bool leaf);
> +
> +static inline void arm_smmu_domain_inv(struct arm_smmu_domain *smmu_domain)
> +{
> +	arm_smmu_domain_inv_range(smmu_domain, 0, 0, 0, false);
> +}
> +
>  void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
>  			      struct arm_smmu_cmdq *cmdq);
>  int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index fb45359680d2..6e1082e6d164 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -2516,23 +2516,19 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>  	arm_smmu_atc_inv_domain(smmu_domain, 0, 0);
>  }
>  
> -static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
> -				     unsigned long iova, size_t size,
> -				     size_t granule,
> -				     struct arm_smmu_domain *smmu_domain)
> +static void arm_smmu_cmdq_batch_add_range(struct arm_smmu_device *smmu,
> +					  struct arm_smmu_cmdq_batch *cmds,
> +					  struct arm_smmu_cmdq_ent *cmd,
> +					  unsigned long iova, size_t size,
> +					  size_t granule, size_t pgsize)
>  {
> -	struct arm_smmu_device *smmu = smmu_domain->smmu;
> -	unsigned long end = iova + size, num_pages = 0, tg = 0;
> +	unsigned long end = iova + size, num_pages = 0, tg = pgsize;
>  	size_t inv_range = granule;
> -	struct arm_smmu_cmdq_batch cmds;
>  
>  	if (!size)
>  		return;
>  
>  	if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> -		/* Get the leaf page size */
> -		tg = __ffs(smmu_domain->domain.pgsize_bitmap);
> -
>  		num_pages = size >> tg;
>  
>  		/* Convert page size of 12,14,16 (log2) to 1,2,3 */
> @@ -2552,8 +2548,6 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
>  			num_pages++;
>  	}
>  
> -	arm_smmu_cmdq_batch_init(smmu, &cmds, cmd);
> -
>  	while (iova < end) {
>  		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>  			/*
> @@ -2581,9 +2575,26 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
>  		}
>  
>  		cmd->tlbi.addr = iova;
> -		arm_smmu_cmdq_batch_add(smmu, &cmds, cmd);
> +		arm_smmu_cmdq_batch_add(smmu, cmds, cmd);
>  		iova += inv_range;
>  	}
> +}
> +
> +static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
> +				     unsigned long iova, size_t size,
> +				     size_t granule,
> +				     struct arm_smmu_domain *smmu_domain)
> +{
> +	struct arm_smmu_device *smmu = smmu_domain->smmu;
> +	struct arm_smmu_cmdq_batch cmds;
> +	size_t pgsize;
> +
> +	/* Get the leaf page size */
> +	pgsize = __ffs(smmu_domain->domain.pgsize_bitmap);
> +
> +	arm_smmu_cmdq_batch_init(smmu, &cmds, cmd);
> +	arm_smmu_cmdq_batch_add_range(smmu, &cmds, cmd, iova, size, granule,
> +				      pgsize);
>  	arm_smmu_cmdq_batch_submit(smmu, &cmds);
>  }
>  
> @@ -2639,6 +2650,193 @@ void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
>  	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
>  }
>  
> +static bool arm_smmu_inv_size_too_big(struct arm_smmu_device *smmu, size_t size,
> +				      size_t granule)
> +{
> +	size_t max_tlbi_ops;
> +
> +	/* 0 size means invalidate all */
> +	if (!size || size == SIZE_MAX)
> +		return true;
> +
> +	if (smmu->features & ARM_SMMU_FEAT_RANGE_INV)
> +		return false;
> +
> +	/*
> +	 * Borrowed from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h,
> +	 * this is used as a threshold to replace "size_opcode" commands with a
> +	 * single "nsize_opcode" command, when SMMU doesn't implement the range
> +	 * invalidation feature, where there can be too many per-granule TLBIs,
> +	 * resulting in a soft lockup.
> +	 */
> +	max_tlbi_ops = 1 << (ilog2(granule) - 3);
> +	return size >= max_tlbi_ops * granule;
> +}
> +
> +/* Used by non INV_TYPE_ATS* invalidations */
> +static void arm_smmu_inv_to_cmdq_batch(struct arm_smmu_inv *inv,
> +				       struct arm_smmu_cmdq_batch *cmds,
> +				       struct arm_smmu_cmdq_ent *cmd,
> +				       unsigned long iova, size_t size,
> +				       unsigned int granule)
> +{
> +	if (arm_smmu_inv_size_too_big(inv->smmu, size, granule)) {
> +		cmd->opcode = inv->nsize_opcode;
> +		arm_smmu_cmdq_batch_add(inv->smmu, cmds, cmd);
> +		return;
> +	}
> +
> +	cmd->opcode = inv->size_opcode;
> +	arm_smmu_cmdq_batch_add_range(inv->smmu, cmds, cmd, iova, size, granule,
> +				      inv->pgsize);
> +}
> +
> +static inline bool arm_smmu_invs_end_batch(struct arm_smmu_inv *cur,
> +					   struct arm_smmu_inv *next)
> +{
> +	/* Changing smmu means changing command queue */
> +	if (cur->smmu != next->smmu)
> +		return true;
> +	/* The batch for S2 TLBI must be done before nested S1 ASIDs */
> +	if (cur->type != INV_TYPE_S2_VMID_S1_CLEAR &&
> +	    next->type == INV_TYPE_S2_VMID_S1_CLEAR)
> +		return true;
> +	/* ATS must be after a sync of the S1/S2 invalidations */
> +	if (!arm_smmu_inv_is_ats(cur) && arm_smmu_inv_is_ats(next))
> +		return true;
> +	return false;
> +}
> +
> +static void __arm_smmu_domain_inv_range(struct arm_smmu_invs *invs,
> +					unsigned long iova, size_t size,
> +					unsigned int granule, bool leaf)
> +{
> +	struct arm_smmu_cmdq_batch cmds = {};
> +	struct arm_smmu_inv *cur;
> +	struct arm_smmu_inv *end;
> +
> +	cur = invs->inv;
> +	end = cur + READ_ONCE(invs->num_invs);
> +	/* Skip any leading entry marked as a trash */
> +	for (; cur != end; cur++)
> +		if (refcount_read(&cur->users))
> +			break;
> +	while (cur != end) {
> +		struct arm_smmu_device *smmu = cur->smmu;
> +		struct arm_smmu_cmdq_ent cmd = {
> +			/*
> +			 * Pick size_opcode to run arm_smmu_get_cmdq(). This can
> +			 * be changed to nsize_opcode, which would result in the
> +			 * same CMDQ pointer.
> +			 */
> +			.opcode = cur->size_opcode,
> +		};
> +		struct arm_smmu_inv *next;
> +
> +		if (!cmds.num)
> +			arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd);
> +
> +		switch (cur->type) {
> +		case INV_TYPE_S1_ASID:
> +			cmd.tlbi.asid = cur->id;
> +			cmd.tlbi.leaf = leaf;
> +			arm_smmu_inv_to_cmdq_batch(cur, &cmds, &cmd, iova, size,
> +						   granule);
> +			break;
> +		case INV_TYPE_S2_VMID:
> +			cmd.tlbi.vmid = cur->id;
> +			cmd.tlbi.leaf = leaf;
> +			arm_smmu_inv_to_cmdq_batch(cur, &cmds, &cmd, iova, size,
> +						   granule);
> +			break;
> +		case INV_TYPE_S2_VMID_S1_CLEAR:
> +			/* CMDQ_OP_TLBI_S12_VMALL already flushed S1 entries */
> +			if (arm_smmu_inv_size_too_big(cur->smmu, size, granule))
> +				continue;
> +			cmd.tlbi.vmid = cur->id;
> +			arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
> +			break;
> +		case INV_TYPE_ATS:
> +			arm_smmu_atc_inv_to_cmd(cur->ssid, iova, size, &cmd);
> +			cmd.atc.sid = cur->id;
> +			arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
> +			break;
> +		case INV_TYPE_ATS_FULL:
> +			arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
> +			cmd.atc.sid = cur->id;
> +			arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
> +			break;
> +		default:
> +			WARN_ON_ONCE(1);
> +			continue;
> +		}
> +
> +		/* Skip any trash entry in-between */
> +		for (next = cur + 1; next != end; next++)
> +			if (refcount_read(&next->users))
> +				break;
> +
> +		if (cmds.num &&
> +		    (next == end || arm_smmu_invs_end_batch(cur, next))) {
> +			arm_smmu_cmdq_batch_submit(smmu, &cmds);
> +			cmds.num = 0;
> +		}
> +		cur = next;
> +	}
> +}
> +
> +void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain,
> +			       unsigned long iova, size_t size,
> +			       unsigned int granule, bool leaf)
> +{
> +	struct arm_smmu_invs *invs;
> +
> +	/*
> +	 * An invalidation request must follow some IOPTE change and then load
> +	 * an invalidation array. In the meantime, a domain attachment mutates
> +	 * the array and then stores an STE/CD asking SMMU HW to acquire those
> +	 * changed IOPTEs. In other word, these two are interdependent and can
> +	 * race.
> +	 *
> +	 * In a race, the RCU design (with its underlying memory barriers) can
> +	 * ensure the invalidation array to always get updated before loaded.
> +	 *
> +	 * smp_mb() is used here, paired with the smp_mb() following the array
> +	 * update in a concurrent attach, to ensure:
> +	 *  - HW sees the new IOPTEs if it walks after STE installation
> +	 *  - Invalidation thread sees the updated array with the new ASID.
> +	 *
> +	 *  [CPU0]                        | [CPU1]
> +	 *                                |
> +	 *  change IOPTEs and TLB flush:  |
> +	 *  arm_smmu_domain_inv_range() { | arm_smmu_install_new_domain_invs {
> +	 *    ...                         |   rcu_assign_pointer(new_invs);
> +	 *    smp_mb(); // ensure IOPTEs  |   smp_mb(); // ensure new_invs
> +	 *    ...                         |   kfree_rcu(old_invs, rcu);
> +	 *    // load invalidation array  | }
> +	 *    invs = rcu_dereference();   | arm_smmu_install_ste_for_dev {
> +	 *                                |   STE = TTB0 // read new IOPTEs
> +	 */
> +	smp_mb();
> +
> +	rcu_read_lock();
> +	invs = rcu_dereference(smmu_domain->invs);
> +
> +	/*
> +	 * Avoid locking unless ATS is being used. No ATC invalidation can be
> +	 * going on after a domain is detached.
> +	 */
> +	if (invs->has_ats) {
> +		read_lock(&invs->rwlock);

Shouldn't these be read_lock_irqsave for all rwlock variants here? 
Invalidations might happen in IRQ context as well..

> +		__arm_smmu_domain_inv_range(invs, iova, size, granule, leaf);
> +		read_unlock(&invs->rwlock);
> +	} else {
> +		__arm_smmu_domain_inv_range(invs, iova, size, granule, leaf);
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
>  static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
>  					 unsigned long iova, size_t granule,
>  					 void *cookie)
> @@ -3285,6 +3483,23 @@ arm_smmu_install_new_domain_invs(struct arm_smmu_attach_state *state)
>  		return;
>  
>  	rcu_assign_pointer(*invst->invs_ptr, invst->new_invs);
> +	/*
> +	 * We are committed to updating the STE. Ensure the invalidation array
> +	 * is visible to concurrent map/unmap threads, and acquire any racing
> +	 * IOPTE updates.
> +	 *
> +	 *  [CPU0]                        | [CPU1]
> +	 *                                |
> +	 *  change IOPTEs and TLB flush:  |
> +	 *  arm_smmu_domain_inv_range() { | arm_smmu_install_new_domain_invs {
> +	 *    ...                         |   rcu_assign_pointer(new_invs);
> +	 *    smp_mb(); // ensure IOPTEs  |   smp_mb(); // ensure new_invs
> +	 *    ...                         |   kfree_rcu(old_invs, rcu);
> +	 *    // load invalidation array  | }
> +	 *    invs = rcu_dereference();   | arm_smmu_install_ste_for_dev {
> +	 *                                |   STE = TTB0 // read new IOPTEs
> +	 */
> +	smp_mb();
>  	kfree_rcu(invst->old_invs, rcu);
>  }
>  
> @@ -3334,6 +3549,23 @@ arm_smmu_install_old_domain_invs(struct arm_smmu_attach_state *state)
>  		return;
>  
>  	rcu_assign_pointer(*invst->invs_ptr, new_invs);
> +	/*
> +	 * We are committed to updating the STE. Ensure the invalidation array
> +	 * is visible to concurrent map/unmap threads, and acquire any racing
> +	 * IOPTE updates.
> +	 *
> +	 *  [CPU0]                        | [CPU1]
> +	 *                                |
> +	 *  change IOPTEs and TLB flush:  |
> +	 *  arm_smmu_domain_inv_range() { | arm_smmu_install_old_domain_invs {
> +	 *    ...                         |   rcu_assign_pointer(new_invs);
> +	 *    smp_mb(); // ensure IOPTEs  |   smp_mb(); // ensure new_invs
> +	 *    ...                         |   kfree_rcu(old_invs, rcu);
> +	 *    // load invalidation array  | }
> +	 *    invs = rcu_dereference();   | arm_smmu_install_ste_for_dev {
> +	 *                                |   STE = TTB0 // read new IOPTEs
> +	 */
> +	smp_mb();
>  	kfree_rcu(old_invs, rcu);
>  }
>

For INV_TYPE_S1_ASID, the new code loops and checks size_too_big 
via arm_smmu_inv_to_cmdq_batch. However, for INV_TYPE_ATS, it issues
a single command for the entire range. While this matches the current
driver, are we confident arm_smmu_atc_inv_to_cmd handles all massive
sizes correctly without needing a similar loop or "too big" fallback?

Thanks,
Praan


  reply	other threads:[~2026-01-23  9:48 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-19 20:11 [PATCH v9 0/7] iommu/arm-smmu-v3: Introduce an RCU-protected invalidation array Nicolin Chen
2025-12-19 20:11 ` [PATCH v9 1/7] iommu/arm-smmu-v3: Explicitly set smmu_domain->stage for SVA Nicolin Chen
2026-01-23  9:49   ` Pranjal Shrivastava
2025-12-19 20:11 ` [PATCH v9 2/7] iommu/arm-smmu-v3: Add an inline arm_smmu_domain_free() Nicolin Chen
2026-01-23  9:50   ` Pranjal Shrivastava
2025-12-19 20:11 ` [PATCH v9 3/7] iommu/arm-smmu-v3: Introduce a per-domain arm_smmu_invs array Nicolin Chen
2026-01-23  9:53   ` Pranjal Shrivastava
2026-01-23 17:03   ` Will Deacon
2026-01-23 17:35     ` Nicolin Chen
2026-01-23 17:51       ` Will Deacon
2026-01-23 17:56         ` Nicolin Chen
2026-01-23 19:16           ` Jason Gunthorpe
2026-01-23 19:18             ` Nicolin Chen
2026-01-26 14:54             ` Will Deacon
2026-01-26 15:21               ` Jason Gunthorpe
2025-12-19 20:11 ` [PATCH v9 4/7] iommu/arm-smmu-v3: Pre-allocate a per-master invalidation array Nicolin Chen
2026-01-23  9:54   ` Pranjal Shrivastava
2025-12-19 20:11 ` [PATCH v9 5/7] iommu/arm-smmu-v3: Populate smmu_domain->invs when attaching masters Nicolin Chen
2025-12-19 20:11 ` [PATCH v9 6/7] iommu/arm-smmu-v3: Add arm_smmu_invs based arm_smmu_domain_inv_range() Nicolin Chen
2026-01-23  9:48   ` Pranjal Shrivastava [this message]
2026-01-23 13:56     ` Jason Gunthorpe
2026-01-27 16:38     ` Nicolin Chen
2026-01-27 17:08       ` Jason Gunthorpe
2026-01-27 18:07         ` Nicolin Chen
2026-01-27 18:23           ` Jason Gunthorpe
2026-01-27 18:37             ` Nicolin Chen
2026-01-27 19:19               ` Jason Gunthorpe
2026-01-27 20:14                 ` Nicolin Chen
2026-01-28  0:05                   ` Jason Gunthorpe
2026-01-23 17:05   ` Will Deacon
2026-01-23 17:10     ` Will Deacon
2026-01-23 17:43       ` Nicolin Chen
2026-01-23 20:03       ` Jason Gunthorpe
2026-01-26 13:01         ` Will Deacon
2026-01-26 15:20           ` Jason Gunthorpe
2026-01-26 16:02             ` Will Deacon
2026-01-26 16:09               ` Jason Gunthorpe
2026-01-26 18:56                 ` Will Deacon
2026-01-27  3:14                   ` Nicolin Chen
2026-01-26 17:50             ` Nicolin Chen
2025-12-19 20:11 ` [PATCH v9 7/7] iommu/arm-smmu-v3: Perform per-domain invalidations using arm_smmu_invs Nicolin Chen
2026-01-23 17:07   ` Will Deacon
2026-01-23 17:47     ` Nicolin Chen
2026-01-23 19:59     ` Jason Gunthorpe
2026-01-19 17:10 ` [PATCH v9 0/7] iommu/arm-smmu-v3: Introduce an RCU-protected invalidation array Nicolin Chen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aXND9YJTwm6om689@google.com \
    --to=praan@google.com \
    --cc=balbirs@nvidia.com \
    --cc=iommu@lists.linux.dev \
    --cc=jean-philippe@linaro.org \
    --cc=jgg@nvidia.com \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=miko.lenczewski@arm.com \
    --cc=nicolinc@nvidia.com \
    --cc=peterz@infradead.org \
    --cc=robin.murphy@arm.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.