public inbox for linux-arm-kernel@lists.infradead.org
 help / color / mirror / Atom feed
* [PATCH] iommu/arm-smmu-v3: Allocate cmdq_batch on the heap
@ 2026-03-11  9:44 Cheng-Yang Chou
  2026-03-11 14:22 ` Pranjal Shrivastava
  0 siblings, 1 reply; 6+ messages in thread
From: Cheng-Yang Chou @ 2026-03-11  9:44 UTC (permalink / raw)
  To: will; +Cc: robin.murphy, linux-arm-kernel, iommu, jserv, Cheng-Yang Chou

The arm_smmu_cmdq_batch structure is large and was being allocated on
the stack in four call sites, causing stack frame sizes to exceed the
1024-byte limit:

- arm_smmu_atc_inv_domain: 1120 bytes
- arm_smmu_atc_inv_master: 1088 bytes
- arm_smmu_sync_cd: 1088 bytes
- __arm_smmu_tlb_inv_range: 1072 bytes

Move these allocations to the heap using kmalloc_obj() and kfree() to
eliminate the -Wframe-larger-than=1024 warnings and prevent potential
stack overflows.

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 66 +++++++++++++++------
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 4d00d796f078..734546dc6a78 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1281,7 +1281,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
 			     int ssid, bool leaf)
 {
 	size_t i;
-	struct arm_smmu_cmdq_batch cmds;
+	struct arm_smmu_cmdq_batch *cmds;
 	struct arm_smmu_device *smmu = master->smmu;
 	struct arm_smmu_cmdq_ent cmd = {
 		.opcode	= CMDQ_OP_CFGI_CD,
@@ -1291,13 +1291,23 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
 		},
 	};
 
-	arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd);
+	cmds = kmalloc_obj(*cmds);
+	if (!cmds) {
+		struct arm_smmu_cmdq_ent cmd_all = { .opcode = CMDQ_OP_CFGI_ALL };
+
+		WARN_ONCE(1, "arm-smmu-v3: failed to allocate cmdq_batch, falling back to full CD invalidation\n");
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd_all);
+		return;
+	}
+
+	arm_smmu_cmdq_batch_init(smmu, cmds, &cmd);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.cfgi.sid = master->streams[i].id;
-		arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
+		arm_smmu_cmdq_batch_add(smmu, cmds, &cmd);
 	}
 
-	arm_smmu_cmdq_batch_submit(smmu, &cmds);
+	arm_smmu_cmdq_batch_submit(smmu, cmds);
+	kfree(cmds);
 }
 
 static void arm_smmu_write_cd_l1_desc(struct arm_smmu_cdtab_l1 *dst,
@@ -2225,31 +2235,37 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
 static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
 				   ioasid_t ssid)
 {
-	int i;
+	int i, ret;
 	struct arm_smmu_cmdq_ent cmd;
-	struct arm_smmu_cmdq_batch cmds;
+	struct arm_smmu_cmdq_batch *cmds;
 
 	arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd);
 
-	arm_smmu_cmdq_batch_init(master->smmu, &cmds, &cmd);
+	cmds = kmalloc_obj(*cmds);
+	if (!cmds)
+		return -ENOMEM;
+
+	arm_smmu_cmdq_batch_init(master->smmu, cmds, &cmd);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.atc.sid = master->streams[i].id;
-		arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
+		arm_smmu_cmdq_batch_add(master->smmu, cmds, &cmd);
 	}
 
-	return arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
+	ret = arm_smmu_cmdq_batch_submit(master->smmu, cmds);
+	kfree(cmds);
+	return ret;
 }
 
 int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
 			    unsigned long iova, size_t size)
 {
 	struct arm_smmu_master_domain *master_domain;
-	int i;
+	int i, ret;
 	unsigned long flags;
 	struct arm_smmu_cmdq_ent cmd = {
 		.opcode = CMDQ_OP_ATC_INV,
 	};
-	struct arm_smmu_cmdq_batch cmds;
+	struct arm_smmu_cmdq_batch *cmds;
 
 	if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
 		return 0;
@@ -2271,7 +2287,11 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
 	if (!atomic_read(&smmu_domain->nr_ats_masters))
 		return 0;
 
-	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, &cmd);
+	cmds = kmalloc_obj(*cmds);
+	if (!cmds)
+		return -ENOMEM;
+
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, cmds, &cmd);
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	list_for_each_entry(master_domain, &smmu_domain->devices,
@@ -2294,12 +2314,14 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
 
 		for (i = 0; i < master->num_streams; i++) {
 			cmd.atc.sid = master->streams[i].id;
-			arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
+			arm_smmu_cmdq_batch_add(smmu_domain->smmu, cmds, &cmd);
 		}
 	}
 	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 
-	return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
+	ret = arm_smmu_cmdq_batch_submit(smmu_domain->smmu, cmds);
+	kfree(cmds);
+	return ret;
 }
 
 /* IO_PGTABLE API */
@@ -2334,7 +2356,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
 	unsigned long end = iova + size, num_pages = 0, tg = 0;
 	size_t inv_range = granule;
-	struct arm_smmu_cmdq_batch cmds;
+	struct arm_smmu_cmdq_batch *cmds;
 
 	if (!size)
 		return;
@@ -2362,7 +2384,14 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 			num_pages++;
 	}
 
-	arm_smmu_cmdq_batch_init(smmu, &cmds, cmd);
+	cmds = kmalloc_obj(*cmds);
+	if (!cmds) {
+		WARN_ONCE(1, "arm-smmu-v3: failed to allocate cmdq_batch, falling back to full TLB invalidation\n");
+		arm_smmu_tlb_inv_context(smmu_domain);
+		return;
+	}
+
+	arm_smmu_cmdq_batch_init(smmu, cmds, cmd);
 
 	while (iova < end) {
 		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
@@ -2391,10 +2420,11 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 		}
 
 		cmd->tlbi.addr = iova;
-		arm_smmu_cmdq_batch_add(smmu, &cmds, cmd);
+		arm_smmu_cmdq_batch_add(smmu, cmds, cmd);
 		iova += inv_range;
 	}
-	arm_smmu_cmdq_batch_submit(smmu, &cmds);
+	arm_smmu_cmdq_batch_submit(smmu, cmds);
+	kfree(cmds);
 }
 
 static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
-- 
2.48.1



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] iommu/arm-smmu-v3: Allocate cmdq_batch on the heap
  2026-03-11  9:44 [PATCH] iommu/arm-smmu-v3: Allocate cmdq_batch on the heap Cheng-Yang Chou
@ 2026-03-11 14:22 ` Pranjal Shrivastava
  2026-03-12 18:24   ` Cheng-Yang Chou
  2026-03-17 13:38   ` Robin Murphy
  0 siblings, 2 replies; 6+ messages in thread
From: Pranjal Shrivastava @ 2026-03-11 14:22 UTC (permalink / raw)
  To: Cheng-Yang Chou; +Cc: will, robin.murphy, linux-arm-kernel, iommu, jserv

On Wed, Mar 11, 2026 at 05:44:44PM +0800, Cheng-Yang Chou wrote:
> The arm_smmu_cmdq_batch structure is large and was being allocated on
> the stack in four call sites, causing stack frame sizes to exceed the
> 1024-byte limit:
> 
> - arm_smmu_atc_inv_domain: 1120 bytes
> - arm_smmu_atc_inv_master: 1088 bytes
> - arm_smmu_sync_cd: 1088 bytes
> - __arm_smmu_tlb_inv_range: 1072 bytes
> 
> Move these allocations to the heap using kmalloc_obj() and kfree() to
> eliminate the -Wframe-larger-than=1024 warnings and prevent potential
> stack overflows.
> 

Thanks for the patch. I agree that we should address these warnings, but
moving these allocations to the heap via kmalloc_obj() in the fast path
is problematic. Introducing heap allocation adds unnecessary latency and
potential for allocation failure in hot paths.

So, yes, we are using a lot of stack but we're using it to do good
things.. 

IMO, if we really want to address these, then instead of kmalloc we could
potentially consider some pre-allocated per-CPU buffers (that's a lot of
additional book-keeping though) to keep the data off the stack, or
something similar that follows a simple rule: the fast path must be
deterministic -- no SLAB allocations and no new failure points.

The last thing we'd want is a graphics driver's shrinker calling
dma-unmaps when the system is already under heavy memory pressure, with
kmalloc leading to a circular dependency or an allocation failure
exactly when the system needs to perform the unmap the most.

Thanks,
Praan

> Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 66 +++++++++++++++------
>  1 file changed, 48 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 4d00d796f078..734546dc6a78 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1281,7 +1281,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
>  			     int ssid, bool leaf)
>  {
>  	size_t i;
> -	struct arm_smmu_cmdq_batch cmds;
> +	struct arm_smmu_cmdq_batch *cmds;
>  	struct arm_smmu_device *smmu = master->smmu;
>  	struct arm_smmu_cmdq_ent cmd = {
>  		.opcode	= CMDQ_OP_CFGI_CD,
> @@ -1291,13 +1291,23 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
>  		},
>  	};
>  
> -	arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd);
> +	cmds = kmalloc_obj(*cmds);
> +	if (!cmds) {
> +		struct arm_smmu_cmdq_ent cmd_all = { .opcode = CMDQ_OP_CFGI_ALL };
> +
> +		WARN_ONCE(1, "arm-smmu-v3: failed to allocate cmdq_batch, falling back to full CD invalidation\n");
> +		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd_all);
> +		return;
> +	}
> +
> +	arm_smmu_cmdq_batch_init(smmu, cmds, &cmd);
>  	for (i = 0; i < master->num_streams; i++) {
>  		cmd.cfgi.sid = master->streams[i].id;
> -		arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
> +		arm_smmu_cmdq_batch_add(smmu, cmds, &cmd);
>  	}
>  
> -	arm_smmu_cmdq_batch_submit(smmu, &cmds);
> +	arm_smmu_cmdq_batch_submit(smmu, cmds);
> +	kfree(cmds);
>  }
>  
>  static void arm_smmu_write_cd_l1_desc(struct arm_smmu_cdtab_l1 *dst,
> @@ -2225,31 +2235,37 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
>  static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
>  				   ioasid_t ssid)
>  {
> -	int i;
> +	int i, ret;
>  	struct arm_smmu_cmdq_ent cmd;
> -	struct arm_smmu_cmdq_batch cmds;
> +	struct arm_smmu_cmdq_batch *cmds;
>  
>  	arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd);
>  
> -	arm_smmu_cmdq_batch_init(master->smmu, &cmds, &cmd);
> +	cmds = kmalloc_obj(*cmds);
> +	if (!cmds)
> +		return -ENOMEM;
> +
> +	arm_smmu_cmdq_batch_init(master->smmu, cmds, &cmd);
>  	for (i = 0; i < master->num_streams; i++) {
>  		cmd.atc.sid = master->streams[i].id;
> -		arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
> +		arm_smmu_cmdq_batch_add(master->smmu, cmds, &cmd);
>  	}
>  
> -	return arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
> +	ret = arm_smmu_cmdq_batch_submit(master->smmu, cmds);
> +	kfree(cmds);
> +	return ret;
>  }
>  
>  int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>  			    unsigned long iova, size_t size)
>  {
>  	struct arm_smmu_master_domain *master_domain;
> -	int i;
> +	int i, ret;
>  	unsigned long flags;
>  	struct arm_smmu_cmdq_ent cmd = {
>  		.opcode = CMDQ_OP_ATC_INV,
>  	};
> -	struct arm_smmu_cmdq_batch cmds;
> +	struct arm_smmu_cmdq_batch *cmds;
>  
>  	if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
>  		return 0;
> @@ -2271,7 +2287,11 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>  	if (!atomic_read(&smmu_domain->nr_ats_masters))
>  		return 0;
>  
> -	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, &cmd);
> +	cmds = kmalloc_obj(*cmds);
> +	if (!cmds)
> +		return -ENOMEM;
> +
> +	arm_smmu_cmdq_batch_init(smmu_domain->smmu, cmds, &cmd);
>  
>  	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
>  	list_for_each_entry(master_domain, &smmu_domain->devices,
> @@ -2294,12 +2314,14 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>  
>  		for (i = 0; i < master->num_streams; i++) {
>  			cmd.atc.sid = master->streams[i].id;
> -			arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
> +			arm_smmu_cmdq_batch_add(smmu_domain->smmu, cmds, &cmd);
>  		}
>  	}
>  	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>  
> -	return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
> +	ret = arm_smmu_cmdq_batch_submit(smmu_domain->smmu, cmds);
> +	kfree(cmds);
> +	return ret;
>  }
>  
>  /* IO_PGTABLE API */
> @@ -2334,7 +2356,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
>  	struct arm_smmu_device *smmu = smmu_domain->smmu;
>  	unsigned long end = iova + size, num_pages = 0, tg = 0;
>  	size_t inv_range = granule;
> -	struct arm_smmu_cmdq_batch cmds;
> +	struct arm_smmu_cmdq_batch *cmds;
>  
>  	if (!size)
>  		return;
> @@ -2362,7 +2384,14 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
>  			num_pages++;
>  	}
>  
> -	arm_smmu_cmdq_batch_init(smmu, &cmds, cmd);
> +	cmds = kmalloc_obj(*cmds);
> +	if (!cmds) {
> +		WARN_ONCE(1, "arm-smmu-v3: failed to allocate cmdq_batch, falling back to full TLB invalidation\n");
> +		arm_smmu_tlb_inv_context(smmu_domain);
> +		return;
> +	}
> +
> +	arm_smmu_cmdq_batch_init(smmu, cmds, cmd);
>  
>  	while (iova < end) {
>  		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> @@ -2391,10 +2420,11 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
>  		}
>  
>  		cmd->tlbi.addr = iova;
> -		arm_smmu_cmdq_batch_add(smmu, &cmds, cmd);
> +		arm_smmu_cmdq_batch_add(smmu, cmds, cmd);
>  		iova += inv_range;
>  	}
> -	arm_smmu_cmdq_batch_submit(smmu, &cmds);
> +	arm_smmu_cmdq_batch_submit(smmu, cmds);
> +	kfree(cmds);
>  }
>  
>  static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
> -- 
> 2.48.1
> 
> 


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] iommu/arm-smmu-v3: Allocate cmdq_batch on the heap
  2026-03-11 14:22 ` Pranjal Shrivastava
@ 2026-03-12 18:24   ` Cheng-Yang Chou
  2026-03-12 22:50     ` Nicolin Chen
  2026-03-17 13:38   ` Robin Murphy
  1 sibling, 1 reply; 6+ messages in thread
From: Cheng-Yang Chou @ 2026-03-12 18:24 UTC (permalink / raw)
  To: Pranjal Shrivastava; +Cc: will, robin.murphy, linux-arm-kernel, iommu, jserv

On Wed, Mar 11, 2026 at 02:22:50PM +0000, Pranjal Shrivastava wrote:
> On Wed, Mar 11, 2026 at 05:44:44PM +0800, Cheng-Yang Chou wrote:
> > The arm_smmu_cmdq_batch structure is large and was being allocated on
> > the stack in four call sites, causing stack frame sizes to exceed the
> > 1024-byte limit:
> > 
> > - arm_smmu_atc_inv_domain: 1120 bytes
> > - arm_smmu_atc_inv_master: 1088 bytes
> > - arm_smmu_sync_cd: 1088 bytes
> > - __arm_smmu_tlb_inv_range: 1072 bytes
> > 
> > Move these allocations to the heap using kmalloc_obj() and kfree() to
> > eliminate the -Wframe-larger-than=1024 warnings and prevent potential
> > stack overflows.
> > 
> 
> Thanks for the patch. I agree that we should address these warnings, but
> moving these allocations to the heap via kmalloc_obj() in the fast path
> is problematic. Introducing heap allocation adds unnecessary latency and
> potential for allocation failure in hot paths.
> 
> So, yes, we are using a lot of stack but we're using it to do good
> things.. 
> 
> IMO, if we really want to address these, instead of kmalloc, we could
> potentially consider some pre-allocated per-CPU buffers (that's a lot of
> additional book-keeping though) to keep the data off the stack or
> something similar following a simple rule: The fast path must be 
> deterministic- no SLAB allocations and no introducing new failure points
> 
> The last thing we'd want is a graphic driver's shrinker calling
> dma-unmaps when the system is already under heavy memory pressure and 
> calling kmalloc leading to a circular dependency or allocation failure
> exactly when the system needs to perform the unmap the most.
> 
> Thanks,
> Praan

Hi Praan,

Thanks for the feedback.
I agree that kmalloc() is unsuitable for the SMMU fast path due to
potential deadlocks and the need for determinism.

To resolve the stack warnings, I'm considering using per-CPU buffers in v2. 
Does this direction sound reasonable, or would you prefer to keep it as-is
to avoid the added complexity?

-- 
Thanks,
Cheng-Yang


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] iommu/arm-smmu-v3: Allocate cmdq_batch on the heap
  2026-03-12 18:24   ` Cheng-Yang Chou
@ 2026-03-12 22:50     ` Nicolin Chen
  2026-03-13  0:06       ` Pranjal Shrivastava
  0 siblings, 1 reply; 6+ messages in thread
From: Nicolin Chen @ 2026-03-12 22:50 UTC (permalink / raw)
  To: Cheng-Yang Chou
  Cc: Pranjal Shrivastava, will, robin.murphy, linux-arm-kernel, iommu,
	jserv

On Fri, Mar 13, 2026 at 02:24:09AM +0800, Cheng-Yang Chou wrote:
> On Wed, Mar 11, 2026 at 02:22:50PM +0000, Pranjal Shrivastava wrote:
> > IMO, if we really want to address these, instead of kmalloc, we could
> > potentially consider some pre-allocated per-CPU buffers (that's a lot of
> > additional book-keeping though) to keep the data off the stack or
> > something similar following a simple rule: The fast path must be 
> > deterministic- no SLAB allocations and no introducing new failure points

> To resolve the stack warnings, I'm considering using per-CPU buffers in v2. 
> Does this direction sound reasonable, or would you prefer to keep it as-is
> to avoid the added complexity?

I don't think per-CPU buffers would work here either..

arm_smmu_atc_inv_master() is used in a preemptible context, while
arm_smmu_atc_inv_domain() can be called from an irq context.

Think of a !SMP case for simplification: we only have one per-CPU
buffer, which is not enough if an IRQ preempts the task context.

Maybe having a smaller backup array on the stack that can be used
when the heap allocation fails? Still, I don't see how to address
it elegantly without losing some of the performance optimization.

Nicolin


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] iommu/arm-smmu-v3: Allocate cmdq_batch on the heap
  2026-03-12 22:50     ` Nicolin Chen
@ 2026-03-13  0:06       ` Pranjal Shrivastava
  0 siblings, 0 replies; 6+ messages in thread
From: Pranjal Shrivastava @ 2026-03-13  0:06 UTC (permalink / raw)
  To: Nicolin Chen
  Cc: Cheng-Yang Chou, will, robin.murphy, linux-arm-kernel, iommu,
	jserv

On Thu, Mar 12, 2026 at 03:50:19PM -0700, Nicolin Chen wrote:
> On Fri, Mar 13, 2026 at 02:24:09AM +0800, Cheng-Yang Chou wrote:
> > On Wed, Mar 11, 2026 at 02:22:50PM +0000, Pranjal Shrivastava wrote:
> > > IMO, if we really want to address these, instead of kmalloc, we could
> > > potentially consider some pre-allocated per-CPU buffers (that's a lot of
> > > additional book-keeping though) to keep the data off the stack or
> > > something similar following a simple rule: The fast path must be 
> > > deterministic- no SLAB allocations and no introducing new failure points
> 
> > To resolve the stack warnings, I'm considering using per-CPU buffers in v2. 
> > Does this direction sound reasonable, or would you prefer to keep it as-is
> > to avoid the added complexity?
> 
> I don't think per-CPU buffers would work here either..
> 
> arm_smmu_atc_inv_master() is used in a preemptible context, while
> arm_smmu_atc_inv_domain() can be called from an irq context.
> 
> Think of a !SMP case for simplification: we only have one per-CPU
> buffer, which is not enough if an IRQ preempts the task context.

+1

> 
> Maybe having a smaller backup array on the stack that can be used
> when the heap allocation fails? Still, I don't see how to address
> it elegantly without losing some of the performance optimization.
> 

A backup array is no good either, IMO: stack sizes are fixed at compile
time, so the compiler will still count those bytes against the 1024-byte
limit regardless of whether the heap allocation succeeds or fails. If
the limit changes tomorrow, we'll have to adjust the "backup array size".
Furthermore, for deep call chains a 'smaller' array can still be the
straw that breaks the boundary.

As for a pre-allocated global buffer, the synchronization and bookkeeping
required to safely handle re-entrancy between task and IRQ contexts would
essentially require writing a custom allocator inside the driver.

Falling back to different code paths based on transient heap availability
also introduces non-deterministic behavior in a critical path, which must
remain reliable when the system is under pressure.

I'm still open to suggestions in case we're able to come up with a
solution that keeps the unmap paths equally performant and reliable..

Thanks,
Praan


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] iommu/arm-smmu-v3: Allocate cmdq_batch on the heap
  2026-03-11 14:22 ` Pranjal Shrivastava
  2026-03-12 18:24   ` Cheng-Yang Chou
@ 2026-03-17 13:38   ` Robin Murphy
  1 sibling, 0 replies; 6+ messages in thread
From: Robin Murphy @ 2026-03-17 13:38 UTC (permalink / raw)
  To: Pranjal Shrivastava, Cheng-Yang Chou; +Cc: will, linux-arm-kernel, iommu, jserv

On 2026-03-11 2:22 pm, Pranjal Shrivastava wrote:
> On Wed, Mar 11, 2026 at 05:44:44PM +0800, Cheng-Yang Chou wrote:
>> The arm_smmu_cmdq_batch structure is large and was being allocated on
>> the stack in four call sites, causing stack frame sizes to exceed the
>> 1024-byte limit:
>>
>> - arm_smmu_atc_inv_domain: 1120 bytes
>> - arm_smmu_atc_inv_master: 1088 bytes
>> - arm_smmu_sync_cd: 1088 bytes
>> - __arm_smmu_tlb_inv_range: 1072 bytes
>>
>> Move these allocations to the heap using kmalloc_obj() and kfree() to
>> eliminate the -Wframe-larger-than=1024 warnings and prevent potential
>> stack overflows.

Pro tip: you can also eliminate the warning by setting CONFIG_FRAME_WARN 
to a larger number, or to 0. The default is 2048, so you're already only 
getting a warning because you've gone out of your way to ask for a 
warning. The smaller the number you choose, the more warnings you'll 
get, but does that alone justify "fixing" them?

It's certainly plausible that we could get to issuing invalidation 
commands at the bottom of a relatively long callchain through 
subsystem/client driver/DMA API code, but have you observed a stack 
overflow in practice? It's not like these functions are re-entrant, or 
calling out to unknown external code, so while ~1KB is admittedly 
reasonably big, we can at least reason that there's never going to be 
much more *beyond* that (basically just whatever 
arm_smmu_cmdq_issue_cmdlist() uses).

> Thanks for the patch. I agree that we should address these warnings, but
> moving these allocations to the heap via kmalloc_obj() in the fast path
> is problematic. Introducing heap allocation adds unnecessary latency and
> potential for allocation failure in hot paths.
> 
> So, yes, we are using a lot of stack but we're using it to do good
> things..
> 
> IMO, if we really want to address these, instead of kmalloc, we could
> potentially consider some pre-allocated per-CPU buffers (that's a lot of
> additional book-keeping though) to keep the data off the stack or
> something similar following a simple rule: The fast path must be
> deterministic- no SLAB allocations and no introducing new failure points
> 
> The last thing we'd want is a graphic driver's shrinker calling
> dma-unmaps when the system is already under heavy memory pressure and
> calling kmalloc leading to a circular dependency or allocation failure
> exactly when the system needs to perform the unmap the most.

ISTR it's worse than that, and in fact we must not even attempt to 
allocate in a reclaim path at all, or it risks deadlock. So since the 
SMMU driver cannot realistically know the context of *why* it's being 
asked to unmap/invalidate something, I'm not sure it can ever be assumed 
to be safe.

Thanks,
Robin.

> 
> Thanks,
> Praan
> 
>> Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
>> ---
>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 66 +++++++++++++++------
>>   1 file changed, 48 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 4d00d796f078..734546dc6a78 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -1281,7 +1281,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
>>   			     int ssid, bool leaf)
>>   {
>>   	size_t i;
>> -	struct arm_smmu_cmdq_batch cmds;
>> +	struct arm_smmu_cmdq_batch *cmds;
>>   	struct arm_smmu_device *smmu = master->smmu;
>>   	struct arm_smmu_cmdq_ent cmd = {
>>   		.opcode	= CMDQ_OP_CFGI_CD,
>> @@ -1291,13 +1291,23 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
>>   		},
>>   	};
>>   
>> -	arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd);
>> +	cmds = kmalloc_obj(*cmds);
>> +	if (!cmds) {
>> +		struct arm_smmu_cmdq_ent cmd_all = { .opcode = CMDQ_OP_CFGI_ALL };
>> +
>> +		WARN_ONCE(1, "arm-smmu-v3: failed to allocate cmdq_batch, falling back to full CD invalidation\n");
>> +		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd_all);
>> +		return;
>> +	}
>> +
>> +	arm_smmu_cmdq_batch_init(smmu, cmds, &cmd);
>>   	for (i = 0; i < master->num_streams; i++) {
>>   		cmd.cfgi.sid = master->streams[i].id;
>> -		arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
>> +		arm_smmu_cmdq_batch_add(smmu, cmds, &cmd);
>>   	}
>>   
>> -	arm_smmu_cmdq_batch_submit(smmu, &cmds);
>> +	arm_smmu_cmdq_batch_submit(smmu, cmds);
>> +	kfree(cmds);
>>   }
>>   
>>   static void arm_smmu_write_cd_l1_desc(struct arm_smmu_cdtab_l1 *dst,
>> @@ -2225,31 +2235,37 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
>>   static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
>>   				   ioasid_t ssid)
>>   {
>> -	int i;
>> +	int i, ret;
>>   	struct arm_smmu_cmdq_ent cmd;
>> -	struct arm_smmu_cmdq_batch cmds;
>> +	struct arm_smmu_cmdq_batch *cmds;
>>   
>>   	arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd);
>>   
>> -	arm_smmu_cmdq_batch_init(master->smmu, &cmds, &cmd);
>> +	cmds = kmalloc_obj(*cmds);
>> +	if (!cmds)
>> +		return -ENOMEM;
>> +
>> +	arm_smmu_cmdq_batch_init(master->smmu, cmds, &cmd);
>>   	for (i = 0; i < master->num_streams; i++) {
>>   		cmd.atc.sid = master->streams[i].id;
>> -		arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
>> +		arm_smmu_cmdq_batch_add(master->smmu, cmds, &cmd);
>>   	}
>>   
>> -	return arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
>> +	ret = arm_smmu_cmdq_batch_submit(master->smmu, cmds);
>> +	kfree(cmds);
>> +	return ret;
>>   }
>>   
>>   int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>>   			    unsigned long iova, size_t size)
>>   {
>>   	struct arm_smmu_master_domain *master_domain;
>> -	int i;
>> +	int i, ret;
>>   	unsigned long flags;
>>   	struct arm_smmu_cmdq_ent cmd = {
>>   		.opcode = CMDQ_OP_ATC_INV,
>>   	};
>> -	struct arm_smmu_cmdq_batch cmds;
>> +	struct arm_smmu_cmdq_batch *cmds;
>>   
>>   	if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
>>   		return 0;
>> @@ -2271,7 +2287,11 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>>   	if (!atomic_read(&smmu_domain->nr_ats_masters))
>>   		return 0;
>>   
>> -	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, &cmd);
>> +	cmds = kmalloc_obj(*cmds);
>> +	if (!cmds)
>> +		return -ENOMEM;
>> +
>> +	arm_smmu_cmdq_batch_init(smmu_domain->smmu, cmds, &cmd);
>>   
>>   	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
>>   	list_for_each_entry(master_domain, &smmu_domain->devices,
>> @@ -2294,12 +2314,14 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>>   
>>   		for (i = 0; i < master->num_streams; i++) {
>>   			cmd.atc.sid = master->streams[i].id;
>> -			arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
>> +			arm_smmu_cmdq_batch_add(smmu_domain->smmu, cmds, &cmd);
>>   		}
>>   	}
>>   	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>>   
>> -	return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
>> +	ret = arm_smmu_cmdq_batch_submit(smmu_domain->smmu, cmds);
>> +	kfree(cmds);
>> +	return ret;
>>   }
>>   
>>   /* IO_PGTABLE API */
>> @@ -2334,7 +2356,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
>>   	struct arm_smmu_device *smmu = smmu_domain->smmu;
>>   	unsigned long end = iova + size, num_pages = 0, tg = 0;
>>   	size_t inv_range = granule;
>> -	struct arm_smmu_cmdq_batch cmds;
>> +	struct arm_smmu_cmdq_batch *cmds;
>>   
>>   	if (!size)
>>   		return;
>> @@ -2362,7 +2384,14 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
>>   			num_pages++;
>>   	}
>>   
>> -	arm_smmu_cmdq_batch_init(smmu, &cmds, cmd);
>> +	cmds = kmalloc_obj(*cmds);
>> +	if (!cmds) {
>> +		WARN_ONCE(1, "arm-smmu-v3: failed to allocate cmdq_batch, falling back to full TLB invalidation\n");
>> +		arm_smmu_tlb_inv_context(smmu_domain);
>> +		return;
>> +	}
>> +
>> +	arm_smmu_cmdq_batch_init(smmu, cmds, cmd);
>>   
>>   	while (iova < end) {
>>   		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>> @@ -2391,10 +2420,11 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
>>   		}
>>   
>>   		cmd->tlbi.addr = iova;
>> -		arm_smmu_cmdq_batch_add(smmu, &cmds, cmd);
>> +		arm_smmu_cmdq_batch_add(smmu, cmds, cmd);
>>   		iova += inv_range;
>>   	}
>> -	arm_smmu_cmdq_batch_submit(smmu, &cmds);
>> +	arm_smmu_cmdq_batch_submit(smmu, cmds);
>> +	kfree(cmds);
>>   }
>>   
>>   static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
>> -- 
>> 2.48.1
>>
>>



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2026-03-17 13:38 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-11  9:44 [PATCH] iommu/arm-smmu-v3: Allocate cmdq_batch on the heap Cheng-Yang Chou
2026-03-11 14:22 ` Pranjal Shrivastava
2026-03-12 18:24   ` Cheng-Yang Chou
2026-03-12 22:50     ` Nicolin Chen
2026-03-13  0:06       ` Pranjal Shrivastava
2026-03-17 13:38   ` Robin Murphy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox