Linux-ARM-Kernel Archive on lore.kernel.org

Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 07/17] media: rockchip: rga: adjust get_version to return the version
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Adjust get_version to return the version instead of directly updating it
in the rockchip_rga structure. This is done in preparation for
multi-core support to check that cores with the same compatible share the
same version.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga-hw.c  | 10 +++++++---
 drivers/media/platform/rockchip/rga/rga.c     |  2 +-
 drivers/media/platform/rockchip/rga/rga.h     |  2 +-
 drivers/media/platform/rockchip/rga/rga3-hw.c |  8 +++++---
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga-hw.c b/drivers/media/platform/rockchip/rga/rga-hw.c
index 4d7b0a03820a1..190104f3b2954 100644
--- a/drivers/media/platform/rockchip/rga/rga-hw.c
+++ b/drivers/media/platform/rockchip/rga/rga-hw.c
@@ -474,10 +474,14 @@ static bool rga_handle_irq(struct rockchip_rga *rga)
 	return intr & RGA_INT_COMMAND_FINISHED;
 }
 
-static void rga_get_version(struct rockchip_rga *rga)
+static struct rockchip_rga_version rga_get_version(struct rockchip_rga *rga)
 {
-	rga->version.major = (rga_read(rga, RGA_VERSION_INFO) >> 24) & 0xFF;
-	rga->version.minor = (rga_read(rga, RGA_VERSION_INFO) >> 20) & 0x0F;
+	u32 version = rga_read(rga, RGA_VERSION_INFO);
+
+	return (struct rockchip_rga_version) {
+		.major = (version >> 24) & 0xFF,
+		.minor = (version >> 20) & 0x0F,
+	};
 }
 
 static struct rga_fmt formats[] = {
diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 0eff558d7f133..b8edd3596c919 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -864,7 +864,7 @@ static int rga_probe(struct platform_device *pdev)
 	if (ret < 0)
 		goto rel_m2m;
 
-	rga->hw->get_version(rga);
+	rga->version = rga->hw->get_version(rga);
 
 	v4l2_info(&rga->v4l2_dev, "HW Version: 0x%02x.%02x\n",
 		  rga->version.major, rga->version.minor);
diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
index 0e62337f8dd38..0e854cdf739f4 100644
--- a/drivers/media/platform/rockchip/rga/rga.h
+++ b/drivers/media/platform/rockchip/rga/rga.h
@@ -158,7 +158,7 @@ struct rga_hw {
 	void (*start)(struct rockchip_rga *rga,
 		      struct rga_vb_buffer *src, struct rga_vb_buffer *dst);
 	bool (*handle_irq)(struct rockchip_rga *rga);
-	void (*get_version)(struct rockchip_rga *rga);
+	struct rockchip_rga_version (*get_version)(struct rockchip_rga *rga);
 	void *(*adjust_and_map_format)(struct rga_ctx *ctx,
 				       struct v4l2_pix_format_mplane *format,
 				       bool is_output);
diff --git a/drivers/media/platform/rockchip/rga/rga3-hw.c b/drivers/media/platform/rockchip/rga/rga3-hw.c
index 72741e1faccff..3469523a5ecad 100644
--- a/drivers/media/platform/rockchip/rga/rga3-hw.c
+++ b/drivers/media/platform/rockchip/rga/rga3-hw.c
@@ -299,12 +299,14 @@ static bool rga3_handle_irq(struct rockchip_rga *rga)
 	return FIELD_GET(RGA3_INT_FRM_DONE, intr);
 }
 
-static void rga3_get_version(struct rockchip_rga *rga)
+static struct rockchip_rga_version rga3_get_version(struct rockchip_rga *rga)
 {
 	u32 version = rga_read(rga, RGA3_VERSION_NUM);
 
-	rga->version.major = FIELD_GET(RGA3_VERSION_NUM_MAJOR, version);
-	rga->version.minor = FIELD_GET(RGA3_VERSION_NUM_MINOR, version);
+	return (struct rockchip_rga_version) {
+		.major = FIELD_GET(RGA3_VERSION_NUM_MAJOR, version),
+		.minor = FIELD_GET(RGA3_VERSION_NUM_MINOR, version),
+	};
 }
 
 static struct rga3_fmt rga3_formats[] = {

-- 
2.54.0



^ permalink raw reply related

* [PATCH 15/17] media: rockchip: rga: schedule jobs to multiple cores
From: Sven Püschel @ 2026-06-05 22:07 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Schedule jobs to multiple cores to utilize all RGA cores. To avoid race
conditions when selecting the next free core, a dedicated spinlock is added.

Note that this doesn't increase the max frame rate of a single
stream, as a context will wait for the job to finish before starting
the next device_run call.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 22 +++++++++++++++++++---
 drivers/media/platform/rockchip/rga/rga.h |  1 +
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 9cebb461b3fd2..f00b7f99f2521 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -38,15 +38,31 @@ static void device_run(void *prv)
 {
 	struct rga_ctx *ctx = prv;
 	struct rockchip_rga *rga = ctx->rga;
-	struct rga_core *core = rga->cores[0];
+	struct rga_core *core = NULL;
 	struct vb2_v4l2_buffer *src, *dst;
 	unsigned long flags;
 	int ret;
+	unsigned int i;
+
+	spin_lock_irqsave(&rga->cores_lock, flags);
+	for (i = 0; i < rga->num_cores; i++) {
+		if (!rga->cores[i]->curr) {
+			core = rga->cores[i];
+			core->curr = ctx;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&rga->cores_lock, flags);
+
+	WARN_ONCE(!core, "No free core although max parallel jobs matches the core count!\n");
+	if (!core)
+		return;
 
 	ret = pm_runtime_resume_and_get(core->dev);
 	if (ret < 0) {
 		v4l2_m2m_buf_done_and_job_finish(rga->m2m_dev, ctx->fh.m2m_ctx,
 						 VB2_BUF_STATE_ERROR);
+		core->curr = NULL;
 		return;
 	}
 
@@ -58,8 +74,6 @@ static void device_run(void *prv)
 	}
 	spin_unlock_irqrestore(&rga->ctrl_lock, flags);
 
-	core->curr = ctx;
-
 	src = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx);
 	src->sequence = ctx->osequence++;
 
@@ -946,6 +960,7 @@ static int rga_bind(struct device *dev)
 		ret = PTR_ERR(rga->m2m_dev);
 		goto rel_vdev;
 	}
+	v4l2_m2m_set_max_parallel_jobs(rga->m2m_dev, rga->num_cores);
 
 	ret = video_register_device(vfd, VFL_TYPE_VIDEO, -1);
 	if (ret) {
@@ -1021,6 +1036,7 @@ static int rga_probe(struct platform_device *pdev)
 		return dev_err_probe(dev, -ENODEV, "failed to get match data\n");
 
 	spin_lock_init(&rga->ctrl_lock);
+	spin_lock_init(&rga->cores_lock);
 	mutex_init(&rga->mutex);
 
 	dev_set_drvdata(dev, rga);
diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
index 6237436b984eb..c0dfacdb6f212 100644
--- a/drivers/media/platform/rockchip/rga/rga.h
+++ b/drivers/media/platform/rockchip/rga/rga.h
@@ -85,6 +85,7 @@ struct rockchip_rga {
 	struct mutex mutex;
 	/* ctrl parm lock */
 	spinlock_t ctrl_lock;
+	spinlock_t cores_lock;
 
 	const struct rga_hw *hw;
 

-- 
2.54.0



^ permalink raw reply related

* Re: [PATCH 3/4] arm64: wire SDEI NMI into the hardlockup watchdog
From: Doug Anderson @ 2026-06-05 22:08 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Catalin Marinas, Will Deacon, James Morse, Mark Rutland,
	Marc Zyngier, Petr Mladek, Thomas Gleixner, Andrew Morton,
	Baoquan He, Puranjay Mohan, Usama Arif, Breno Leitao,
	Julien Thierry, Lecopzer Chen, Sumit Garg, kernel-team, kexec,
	linux-arm-kernel, linux-kernel
In-Reply-To: <aiM69AZXtGduS4VY@thinkstation>

Hi,

On Fri, Jun 5, 2026 at 2:12 PM Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> On Fri, Jun 05, 2026 at 01:03:05PM -0700, Doug Anderson wrote:
> > Hi,
> >
> > On Wed, Jun 3, 2026 at 7:36 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
> > >
> > > From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
> > >
> > > Select HAVE_HARDLOCKUP_DETECTOR_ARCH so the framework takes its backend
> > > from this driver. A per-CPU hrtimer checks its buddy's heartbeat and
> > > signals event 0 at a stalled CPU, which runs watchdog_hardlockup_check()
> > > NMI-like.
> > >
> > > The source is chosen at boot: SDEI if firmware provides it, otherwise a
> > > perf-NMI counter (pseudo-NMI) fallback -- one image covers both.
> > >
> > > Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
> > > ---
> > >  arch/arm64/Kconfig          |   1 +
> > >  drivers/firmware/Kconfig    |   3 +
> > >  drivers/firmware/sdei_nmi.c | 247 +++++++++++++++++++++++++++++++++++-
> > >  3 files changed, 248 insertions(+), 3 deletions(-)
> >
> > I'm a little confused about this patch. We already have a buddy
> > hardlockup detector using the hrtimer, and it's even been improved
> > recently to trigger in a smaller time bound. It looks as if you're
> > duplicating bits of the perf and buddy detector here?
> >
> > I don't think you need this patch at all. The existing buddy detector
> > + patches #1 and #2 in your series should be sufficient.
>
> You're mostly right.
>
> Buddy + #2 covers the console case (the remote branch triggers the
> culprit's backtrace, which #2 makes deliverable), and #4 gets the wedged
> CPU's registers into the vmcore.
>
> The one thing this patch adds that a config can't is boot-time source
> selection: PERF-compiled kernels have no detector on a pseudo_nmi=0
> boot, and PREFER_BUDDY costs the pseudo-NMI machines perf
> self-detection. But that's arguably out of scope for the patchset.
>
> I'll drop this patch in v2 and run PREFER_BUDDY here. If a runtime
> perf->buddy fallback ever materializes, the gap closes entirely.

Sure. If you're interested in trying to make perf vs. buddy coexist,
that should be done in a platform-agnostic way. Feel free to post
patches for that. I know we discussed this previously. Ah, here they
are:

https://lore.kernel.org/r/20250916145122.416128-1-wangjinchao600@gmail.com

I think those got bikeshedded to death and nobody cared enough to keep pushing.

FWIW, my belief is that the buddy detector is superior in every way
except that it can't detect when all CPUs lock up simultaneously.
...though I wonder if a nicer way to solve the "all CPUs locked up" case is
to just NMI-enable the "bark" interrupt of a hardware watchdog timer.
That ought to be quite easy...

-Doug


^ permalink raw reply

* [PATCH 10/17] media: rockchip: rga: move rockchip_rga allocation to master probe
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Move the rockchip_rga struct allocation to the master component probe
function in preparation for enabling all cores. This also adjusts the
allocation to use the actual number of cores found in the device tree
instead of being fixed to one core.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 32 ++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 178f45b8da940..11912bf5b6906 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -741,7 +741,7 @@ static int rga_parse_dt(struct rga_core *core)
 static int rga_core_bind(struct device *dev, struct device *master, void *data)
 {
 	struct platform_device *pdev = to_platform_device(dev);
-	struct rockchip_rga *rga;
+	struct rockchip_rga *rga = data;
 	struct rga_core *core;
 	struct video_device *vfd;
 	int ret = 0;
@@ -750,17 +750,6 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 	if (!pdev->dev.of_node)
 		return -ENODEV;
 
-	rga = devm_kzalloc(&pdev->dev, sizeof(*rga) + 1 * sizeof(*rga->cores), GFP_KERNEL);
-	if (!rga)
-		return -ENOMEM;
-
-	rga->hw = of_device_get_match_data(&pdev->dev);
-	if (!rga->hw)
-		return dev_err_probe(&pdev->dev, -ENODEV, "failed to get match data\n");
-
-	spin_lock_init(&rga->ctrl_lock);
-	mutex_init(&rga->mutex);
-
 	core = devm_kzalloc(&pdev->dev, sizeof(*core), GFP_KERNEL);
 	core->rga = rga;
 	core->dev = &pdev->dev;
@@ -947,9 +936,10 @@ static struct platform_driver rga_core_pdrv = {
 
 static int rga_bind(struct device *dev)
 {
+	struct rockchip_rga *rga = dev_get_drvdata(dev);
 	int ret;
 
-	ret = component_bind_all(dev, NULL);
+	ret = component_bind_all(dev, rga);
 	if (ret) {
 		dev_err(dev, "component bind failed\n");
 		return ret;
@@ -974,6 +964,8 @@ static int rga_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct component_match *match = NULL;
 	struct device_node *core_node;
+	struct rockchip_rga *rga;
+	u8 num_cores = 0;
 
 	if (!match_desc)
 		return dev_err_probe(dev, -ENODEV, "missing platform data\n");
@@ -985,6 +977,7 @@ static int rga_probe(struct platform_device *pdev)
 		of_node_get(core_node);
 		component_match_add_release(dev, &match, component_release_of,
 					    component_compare_of, core_node);
+		num_cores++;
 
 		/*
 		 * As multi core is not implemented yet,
@@ -1000,6 +993,19 @@ static int rga_probe(struct platform_device *pdev)
 			dev, -ENODEV,
 			"no matching available component devices found\n");
 
+	rga = devm_kzalloc(dev, sizeof(*rga) + num_cores * sizeof(*rga->cores), GFP_KERNEL);
+	if (!rga)
+		return -ENOMEM;
+
+	rga->hw = match_desc->data;
+	if (!rga->hw)
+		return dev_err_probe(dev, -ENODEV, "failed to get match data\n");
+
+	spin_lock_init(&rga->ctrl_lock);
+	mutex_init(&rga->mutex);
+
+	dev_set_drvdata(dev, rga);
+
 	return component_master_add_with_match(dev, &rga_master_ops, match);
 }
 

-- 
2.54.0



^ permalink raw reply related

* [PATCH 08/17] media: rockchip: rga: add rga_core structure
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Add a rga_core structure to separate the core specific data from the
m2m device. This is done in preparation for multi-core support, where
multiple identical cores are exposed as a single m2m device to the
user-space.

Allocation related calls are explicitly done on the first core, as the
scheduling decisions will be made on demand after the buffers have been
allocated and filled.

In preparation for storing the rockchip_rga struct on a dedicated master
platform device, the rga_core struct is allocated on its own and only a
pointer is saved in the rockchip_rga struct.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga-buf.c |   4 +-
 drivers/media/platform/rockchip/rga/rga-hw.c  |  32 ++++----
 drivers/media/platform/rockchip/rga/rga.c     | 104 ++++++++++++++------------
 drivers/media/platform/rockchip/rga/rga.h     |  39 +++++-----
 drivers/media/platform/rockchip/rga/rga3-hw.c |  24 +++---
 5 files changed, 108 insertions(+), 95 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga-buf.c b/drivers/media/platform/rockchip/rga/rga-buf.c
index 3f7c3c68e0cb8..47a8d5a4500a3 100644
--- a/drivers/media/platform/rockchip/rga/rga-buf.c
+++ b/drivers/media/platform/rockchip/rga/rga-buf.c
@@ -93,7 +93,7 @@ static int rga_buf_init(struct vb2_buffer *vb)
 	n_desc = DIV_ROUND_UP(size, PAGE_SIZE);
 
 	rbuf->n_desc = n_desc;
-	rbuf->dma_desc = dma_alloc_coherent(rga->dev,
+	rbuf->dma_desc = dma_alloc_coherent(rga->cores[0]->dev,
 					    rbuf->n_desc * sizeof(*rbuf->dma_desc),
 					    &rbuf->dma_desc_pa, GFP_KERNEL);
 	if (!rbuf->dma_desc)
@@ -191,7 +191,7 @@ static void rga_buf_cleanup(struct vb2_buffer *vb)
 	if (!rga_has_internal_iommu(rga))
 		return;
 
-	dma_free_coherent(rga->dev, rbuf->n_desc * sizeof(*rbuf->dma_desc),
+	dma_free_coherent(rga->cores[0]->dev, rbuf->n_desc * sizeof(*rbuf->dma_desc),
 			  rbuf->dma_desc, rbuf->dma_desc_pa);
 }
 
diff --git a/drivers/media/platform/rockchip/rga/rga-hw.c b/drivers/media/platform/rockchip/rga/rga-hw.c
index 190104f3b2954..9a5da4e1716ca 100644
--- a/drivers/media/platform/rockchip/rga/rga-hw.c
+++ b/drivers/media/platform/rockchip/rga/rga-hw.c
@@ -417,10 +417,10 @@ static void rga_cmd_set_mode(struct rga_ctx *ctx)
 	dest[(RGA_MODE_CTRL - RGA_MODE_BASE_REG) >> 2] = mode.val;
 }
 
-static void rga_cmd_set(struct rga_ctx *ctx,
+static void rga_cmd_set(struct rga_core *core,
 			struct rga_vb_buffer *src, struct rga_vb_buffer *dst)
 {
-	struct rockchip_rga *rga = ctx->rga;
+	struct rga_ctx *ctx = core->curr;
 
 	rga_cmd_set_src_addr(ctx, src->dma_desc_pa);
 	/*
@@ -434,10 +434,10 @@ static void rga_cmd_set(struct rga_ctx *ctx,
 	rga_cmd_set_src_info(ctx, &src->dma_addrs);
 	rga_cmd_set_dst_info(ctx, &dst->dma_addrs);
 
-	rga_write(rga, RGA_CMD_BASE, ctx->cmdbuf_phy);
+	rga_write(core, RGA_CMD_BASE, ctx->cmdbuf_phy);
 
 	/* sync CMD buf for RGA */
-	dma_sync_single_for_device(rga->dev, ctx->cmdbuf_phy,
+	dma_sync_single_for_device(core->rga->cores[0]->dev, ctx->cmdbuf_phy,
 				   PAGE_SIZE, DMA_BIDIRECTIONAL);
 }
 
@@ -447,36 +447,34 @@ static void rga_hw_setup_cmdbuf(struct rga_ctx *ctx)
 	rga_cmd_set_trans_info(ctx);
 }
 
-static void rga_hw_start(struct rockchip_rga *rga,
+static void rga_hw_start(struct rga_core *core,
 			 struct rga_vb_buffer *src,  struct rga_vb_buffer *dst)
 {
-	struct rga_ctx *ctx = rga->curr;
-
-	rga_cmd_set(ctx, src, dst);
+	rga_cmd_set(core, src, dst);
 
-	rga_write(rga, RGA_SYS_CTRL, 0x00);
+	rga_write(core, RGA_SYS_CTRL, 0x00);
 
-	rga_write(rga, RGA_SYS_CTRL, 0x22);
+	rga_write(core, RGA_SYS_CTRL, 0x22);
 
-	rga_write(rga, RGA_INT, 0x600);
+	rga_write(core, RGA_INT, 0x600);
 
-	rga_write(rga, RGA_CMD_CTRL, 0x1);
+	rga_write(core, RGA_CMD_CTRL, 0x1);
 }
 
-static bool rga_handle_irq(struct rockchip_rga *rga)
+static bool rga_handle_irq(struct rga_core *core)
 {
 	int intr;
 
-	intr = rga_read(rga, RGA_INT) & 0xf;
+	intr = rga_read(core, RGA_INT) & 0xf;
 
-	rga_mod(rga, RGA_INT, intr << 4, 0xf << 4);
+	rga_mod(core, RGA_INT, intr << 4, 0xf << 4);
 
 	return intr & RGA_INT_COMMAND_FINISHED;
 }
 
-static struct rockchip_rga_version rga_get_version(struct rockchip_rga *rga)
+static struct rockchip_rga_version rga_get_version(struct rga_core *core)
 {
-	u32 version = rga_read(rga, RGA_VERSION_INFO);
+	u32 version = rga_read(core, RGA_VERSION_INFO);
 
 	return (struct rockchip_rga_version) {
 		.major = (version >> 24) & 0xFF,
diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index b8edd3596c919..15d095a1d1973 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -11,6 +11,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_platform.h>
 #include <linux/pm_runtime.h>
 #include <linux/reset.h>
 #include <linux/sched.h>
@@ -35,11 +36,12 @@ static void device_run(void *prv)
 {
 	struct rga_ctx *ctx = prv;
 	struct rockchip_rga *rga = ctx->rga;
+	struct rga_core *core = rga->cores[0];
 	struct vb2_v4l2_buffer *src, *dst;
 	unsigned long flags;
 	int ret;
 
-	ret = pm_runtime_resume_and_get(rga->dev);
+	ret = pm_runtime_resume_and_get(core->dev);
 	if (ret < 0) {
 		v4l2_m2m_buf_done_and_job_finish(rga->m2m_dev, ctx->fh.m2m_ctx,
 						 VB2_BUF_STATE_ERROR);
@@ -54,27 +56,28 @@ static void device_run(void *prv)
 	}
 	spin_unlock_irqrestore(&rga->ctrl_lock, flags);
 
-	rga->curr = ctx;
+	core->curr = ctx;
 
 	src = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx);
 	src->sequence = ctx->osequence++;
 
 	dst = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx);
 
-	rga->hw->start(rga, vb_to_rga(src), vb_to_rga(dst));
+	rga->hw->start(core, vb_to_rga(src), vb_to_rga(dst));
 }
 
 static irqreturn_t rga_isr(int irq, void *prv)
 {
-	struct rockchip_rga *rga = prv;
+	struct rga_core *core = prv;
+	struct rockchip_rga *rga = core->rga;
 
-	if (rga->hw->handle_irq(rga)) {
+	if (rga->hw->handle_irq(core)) {
 		struct vb2_v4l2_buffer *src, *dst;
-		struct rga_ctx *ctx = rga->curr;
+		struct rga_ctx *ctx = core->curr;
 
 		WARN_ON(!ctx);
 
-		rga->curr = NULL;
+		core->curr = NULL;
 
 		src = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx);
 		dst = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx);
@@ -90,7 +93,7 @@ static irqreturn_t rga_isr(int irq, void *prv)
 		v4l2_m2m_buf_done(dst, VB2_BUF_STATE_DONE);
 		v4l2_m2m_job_finish(rga->m2m_dev, ctx->fh.m2m_ctx);
 
-		pm_runtime_put_autosuspend(rga->dev);
+		pm_runtime_put_autosuspend(core->dev);
 	}
 
 	return IRQ_HANDLED;
@@ -118,7 +121,7 @@ queue_init(void *priv, struct vb2_queue *src_vq, struct vb2_queue *dst_vq)
 	src_vq->buf_struct_size = sizeof(struct rga_vb_buffer);
 	src_vq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY;
 	src_vq->lock = &ctx->rga->mutex;
-	src_vq->dev = ctx->rga->v4l2_dev.dev;
+	src_vq->dev = ctx->rga->cores[0]->dev;
 
 	ret = vb2_queue_init(src_vq);
 	if (ret)
@@ -136,7 +139,7 @@ queue_init(void *priv, struct vb2_queue *src_vq, struct vb2_queue *dst_vq)
 	dst_vq->buf_struct_size = sizeof(struct rga_vb_buffer);
 	dst_vq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY;
 	dst_vq->lock = &ctx->rga->mutex;
-	dst_vq->dev = ctx->rga->v4l2_dev.dev;
+	dst_vq->dev = ctx->rga->cores[0]->dev;
 
 	return vb2_queue_init(dst_vq);
 }
@@ -275,7 +278,7 @@ static int rga_open(struct file *file)
 		return -ENOMEM;
 
 	/* Create CMD buffer */
-	ctx->cmdbuf_virt = dma_alloc_attrs(rga->dev, rga->hw->cmdbuf_size,
+	ctx->cmdbuf_virt = dma_alloc_attrs(rga->cores[0]->dev, rga->hw->cmdbuf_size,
 					   &ctx->cmdbuf_phy, GFP_KERNEL,
 					   DMA_ATTR_WRITE_COMBINE);
 	if (!ctx->cmdbuf_virt) {
@@ -322,7 +325,7 @@ static int rga_open(struct file *file)
 unlock_mutex:
 	mutex_unlock(&rga->mutex);
 rel_cmdbuf:
-	dma_free_attrs(rga->dev, rga->hw->cmdbuf_size, ctx->cmdbuf_virt,
+	dma_free_attrs(rga->cores[0]->dev, rga->hw->cmdbuf_size, ctx->cmdbuf_virt,
 		       ctx->cmdbuf_phy, DMA_ATTR_WRITE_COMBINE);
 rel_ctx:
 	kfree(ctx);
@@ -342,7 +345,7 @@ static int rga_release(struct file *file)
 	v4l2_fh_del(&ctx->fh, file);
 	v4l2_fh_exit(&ctx->fh);
 
-	dma_free_attrs(rga->dev, rga->hw->cmdbuf_size, ctx->cmdbuf_virt,
+	dma_free_attrs(rga->cores[0]->dev, rga->hw->cmdbuf_size, ctx->cmdbuf_virt,
 		       ctx->cmdbuf_phy, DMA_ATTR_WRITE_COMBINE);
 
 	kfree(ctx);
@@ -689,26 +692,26 @@ static const struct video_device rga_videodev = {
 	.device_caps = V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_STREAMING,
 };
 
-static int rga_parse_dt(struct rockchip_rga *rga)
+static int rga_parse_dt(struct rga_core *core)
 {
 	struct reset_control *core_rst, *axi_rst, *ahb_rst;
 	int ret;
 
-	core_rst = devm_reset_control_get(rga->dev, "core");
+	core_rst = devm_reset_control_get(core->dev, "core");
 	if (IS_ERR(core_rst)) {
-		dev_err(rga->dev, "failed to get core reset controller\n");
+		dev_err(core->dev, "failed to get core reset controller\n");
 		return PTR_ERR(core_rst);
 	}
 
-	axi_rst = devm_reset_control_get(rga->dev, "axi");
+	axi_rst = devm_reset_control_get(core->dev, "axi");
 	if (IS_ERR(axi_rst)) {
-		dev_err(rga->dev, "failed to get axi reset controller\n");
+		dev_err(core->dev, "failed to get axi reset controller\n");
 		return PTR_ERR(axi_rst);
 	}
 
-	ahb_rst = devm_reset_control_get(rga->dev, "ahb");
+	ahb_rst = devm_reset_control_get(core->dev, "ahb");
 	if (IS_ERR(ahb_rst)) {
-		dev_err(rga->dev, "failed to get ahb reset controller\n");
+		dev_err(core->dev, "failed to get ahb reset controller\n");
 		return PTR_ERR(ahb_rst);
 	}
 
@@ -724,12 +727,12 @@ static int rga_parse_dt(struct rockchip_rga *rga)
 	udelay(1);
 	reset_control_deassert(ahb_rst);
 
-	ret = devm_clk_bulk_get_all(rga->dev, &rga->clks);
+	ret = devm_clk_bulk_get_all(core->dev, &core->clks);
 	if (ret < 0) {
-		dev_err(rga->dev, "failed to get clocks\n");
+		dev_err(core->dev, "failed to get clocks\n");
 		return ret;
 	}
-	rga->num_clks = ret;
+	core->num_clks = ret;
 
 	return 0;
 }
@@ -780,6 +783,7 @@ static int rga_disable_multicore(struct device *dev)
 static int rga_probe(struct platform_device *pdev)
 {
 	struct rockchip_rga *rga;
+	struct rga_core *core;
 	struct video_device *vfd;
 	int ret = 0;
 	int irq;
@@ -791,7 +795,7 @@ static int rga_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	rga = devm_kzalloc(&pdev->dev, sizeof(*rga), GFP_KERNEL);
+	rga = devm_kzalloc(&pdev->dev, sizeof(*rga) + 1 * sizeof(*rga->cores), GFP_KERNEL);
 	if (!rga)
 		return -ENOMEM;
 
@@ -799,20 +803,25 @@ static int rga_probe(struct platform_device *pdev)
 	if (!rga->hw)
 		return dev_err_probe(&pdev->dev, -ENODEV, "failed to get match data\n");
 
-	rga->dev = &pdev->dev;
 	spin_lock_init(&rga->ctrl_lock);
 	mutex_init(&rga->mutex);
 
-	ret = rga_parse_dt(rga);
+	core = devm_kzalloc(&pdev->dev, sizeof(*core), GFP_KERNEL);
+	core->rga = rga;
+	core->dev = &pdev->dev;
+
+	rga->cores[0] = core;
+
+	ret = rga_parse_dt(core);
 	if (ret)
 		return dev_err_probe(&pdev->dev, ret, "Unable to parse OF data\n");
 
-	pm_runtime_set_autosuspend_delay(rga->dev, 50);
-	pm_runtime_enable(rga->dev);
+	pm_runtime_set_autosuspend_delay(core->dev, 50);
+	pm_runtime_enable(core->dev);
 
-	rga->regs = devm_platform_ioremap_resource(pdev, 0);
-	if (IS_ERR(rga->regs)) {
-		ret = PTR_ERR(rga->regs);
+	core->regs = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(core->regs)) {
+		ret = PTR_ERR(core->regs);
 		goto err_put_clk;
 	}
 
@@ -822,17 +831,17 @@ static int rga_probe(struct platform_device *pdev)
 		goto err_put_clk;
 	}
 
-	ret = devm_request_irq(rga->dev, irq, rga_isr,
+	ret = devm_request_irq(core->dev, irq, rga_isr,
 			       rga_has_internal_iommu(rga) ? 0 : IRQF_SHARED,
-			       dev_name(rga->dev), rga);
+			       dev_name(core->dev), core);
 	if (ret < 0) {
-		dev_err(rga->dev, "failed to request irq\n");
+		dev_err(core->dev, "failed to request irq\n");
 		goto err_put_clk;
 	}
 
-	ret = dma_set_mask_and_coherent(rga->dev, DMA_BIT_MASK(32));
+	ret = dma_set_mask_and_coherent(core->dev, DMA_BIT_MASK(32));
 	if (ret) {
-		dev_err(rga->dev, "32-bit DMA not supported");
+		dev_err(core->dev, "32-bit DMA not supported");
 		goto err_put_clk;
 	}
 
@@ -852,7 +861,7 @@ static int rga_probe(struct platform_device *pdev)
 	video_set_drvdata(vfd, rga);
 	rga->vfd = vfd;
 
-	platform_set_drvdata(pdev, rga);
+	platform_set_drvdata(pdev, core);
 	rga->m2m_dev = v4l2_m2m_init(&rga_m2m_ops);
 	if (IS_ERR(rga->m2m_dev)) {
 		v4l2_err(&rga->v4l2_dev, "Failed to init mem2mem device\n");
@@ -860,16 +869,16 @@ static int rga_probe(struct platform_device *pdev)
 		goto rel_vdev;
 	}
 
-	ret = pm_runtime_resume_and_get(rga->dev);
+	ret = pm_runtime_resume_and_get(core->dev);
 	if (ret < 0)
 		goto rel_m2m;
 
-	rga->version = rga->hw->get_version(rga);
+	rga->version = rga->hw->get_version(core);
 
 	v4l2_info(&rga->v4l2_dev, "HW Version: 0x%02x.%02x\n",
 		  rga->version.major, rga->version.minor);
 
-	pm_runtime_put(rga->dev);
+	pm_runtime_put(core->dev);
 
 	ret = video_register_device(vfd, VFL_TYPE_VIDEO, -1);
 	if (ret) {
@@ -889,14 +898,15 @@ static int rga_probe(struct platform_device *pdev)
 unreg_v4l2_dev:
 	v4l2_device_unregister(&rga->v4l2_dev);
 err_put_clk:
-	pm_runtime_disable(rga->dev);
+	pm_runtime_disable(core->dev);
 
 	return ret;
 }
 
 static void rga_remove(struct platform_device *pdev)
 {
-	struct rockchip_rga *rga = platform_get_drvdata(pdev);
+	struct rga_core *core = platform_get_drvdata(pdev);
+	struct rockchip_rga *rga = core->rga;
 
 	v4l2_info(&rga->v4l2_dev, "Removing\n");
 
@@ -904,23 +914,23 @@ static void rga_remove(struct platform_device *pdev)
 	video_unregister_device(rga->vfd);
 	v4l2_device_unregister(&rga->v4l2_dev);
 
-	pm_runtime_disable(rga->dev);
+	pm_runtime_disable(core->dev);
 }
 
 static int __maybe_unused rga_runtime_suspend(struct device *dev)
 {
-	struct rockchip_rga *rga = dev_get_drvdata(dev);
+	struct rga_core *core = dev_get_drvdata(dev);
 
-	clk_bulk_disable_unprepare(rga->num_clks, rga->clks);
+	clk_bulk_disable_unprepare(core->num_clks, core->clks);
 
 	return 0;
 }
 
 static int __maybe_unused rga_runtime_resume(struct device *dev)
 {
-	struct rockchip_rga *rga = dev_get_drvdata(dev);
+	struct rga_core *core = dev_get_drvdata(dev);
 
-	return clk_bulk_prepare_enable(rga->num_clks, rga->clks);
+	return clk_bulk_prepare_enable(core->num_clks, core->clks);
 }
 
 static const struct dev_pm_ops rga_pm = {
diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
index 0e854cdf739f4..fcf1ef7d2029f 100644
--- a/drivers/media/platform/rockchip/rga/rga.h
+++ b/drivers/media/platform/rockchip/rga/rga.h
@@ -14,7 +14,6 @@
 #include <media/v4l2-device.h>
 
 #define RGA_NAME "rockchip-rga"
-
 #define DEFAULT_WIDTH 100
 #define DEFAULT_HEIGHT 100
 
@@ -36,6 +35,16 @@ struct rockchip_rga_version {
 	u32 minor;
 };
 
+struct rga_core {
+	struct device *dev;
+	void __iomem *regs;
+	struct clk_bulk_data *clks;
+	int num_clks;
+
+	struct rockchip_rga *rga;
+	struct rga_ctx *curr;
+};
+
 struct rga_ctx {
 	struct v4l2_fh fh;
 	struct rockchip_rga *rga;
@@ -70,10 +79,6 @@ struct rockchip_rga {
 	struct v4l2_m2m_dev *m2m_dev;
 	struct video_device *vfd;
 
-	struct device *dev;
-	void __iomem *regs;
-	struct clk_bulk_data *clks;
-	int num_clks;
 	struct rockchip_rga_version version;
 
 	/* vfd lock */
@@ -81,9 +86,9 @@ struct rockchip_rga {
 	/* ctrl parm lock */
 	spinlock_t ctrl_lock;
 
-	struct rga_ctx *curr;
-
 	const struct rga_hw *hw;
+
+	struct rga_core *cores[];
 };
 
 struct rga_addrs {
@@ -119,22 +124,22 @@ int rga_check_scaling(const struct rga_hw *hw, const struct v4l2_rect *crop_in,
 extern const struct vb2_ops rga_qops;
 
 /* RGA Hardware */
-static inline void rga_write(struct rockchip_rga *rga, u32 reg, u32 value)
+static inline void rga_write(struct rga_core *core, u32 reg, u32 value)
 {
-	writel(value, rga->regs + reg);
+	writel(value, core->regs + reg);
 };
 
-static inline u32 rga_read(struct rockchip_rga *rga, u32 reg)
+static inline u32 rga_read(struct rga_core *core, u32 reg)
 {
-	return readl(rga->regs + reg);
+	return readl(core->regs + reg);
 };
 
-static inline void rga_mod(struct rockchip_rga *rga, u32 reg, u32 val, u32 mask)
+static inline void rga_mod(struct rga_core *core, u32 reg, u32 val, u32 mask)
 {
-	u32 temp = rga_read(rga, reg) & ~(mask);
+	u32 temp = rga_read(core, reg) & ~(mask);
 
 	temp |= val & mask;
-	rga_write(rga, reg, temp);
+	rga_write(core, reg, temp);
 };
 
 #define RGA_FEATURE_FLIP	BIT(0)
@@ -155,10 +160,10 @@ struct rga_hw {
 	 * Requires that the cmdbuf is already zeroed.
 	 */
 	void (*setup_cmdbuf)(struct rga_ctx *ctx);
-	void (*start)(struct rockchip_rga *rga,
+	void (*start)(struct rga_core *core,
 		      struct rga_vb_buffer *src, struct rga_vb_buffer *dst);
-	bool (*handle_irq)(struct rockchip_rga *rga);
-	struct rockchip_rga_version (*get_version)(struct rockchip_rga *rga);
+	bool (*handle_irq)(struct rga_core *core);
+	struct rockchip_rga_version (*get_version)(struct rga_core *core);
 	void *(*adjust_and_map_format)(struct rga_ctx *ctx,
 				       struct v4l2_pix_format_mplane *format,
 				       bool is_output);
diff --git a/drivers/media/platform/rockchip/rga/rga3-hw.c b/drivers/media/platform/rockchip/rga/rga3-hw.c
index 3469523a5ecad..f7e4bc8c6ff21 100644
--- a/drivers/media/platform/rockchip/rga/rga3-hw.c
+++ b/drivers/media/platform/rockchip/rga/rga3-hw.c
@@ -266,42 +266,42 @@ static void rga3_hw_setup_cmdbuf(struct rga_ctx *ctx)
 	rga3_cmd_set_wr_format(ctx);
 }
 
-static void rga3_hw_start(struct rockchip_rga *rga,
+static void rga3_hw_start(struct rga_core *core,
 			  struct rga_vb_buffer *src, struct rga_vb_buffer *dst)
 {
-	struct rga_ctx *ctx = rga->curr;
+	struct rga_ctx *ctx = core->curr;
 
 	rga3_cmd_set_win0_addr(ctx, &src->dma_addrs);
 	rga3_cmd_set_wr_addr(ctx, &dst->dma_addrs);
 
-	rga_write(rga, RGA3_CMD_ADDR, ctx->cmdbuf_phy);
+	rga_write(core, RGA3_CMD_ADDR, ctx->cmdbuf_phy);
 
 	/* sync CMD buf for RGA */
-	dma_sync_single_for_device(rga->dev, ctx->cmdbuf_phy,
+	dma_sync_single_for_device(core->rga->cores[0]->dev, ctx->cmdbuf_phy,
 				   PAGE_SIZE, DMA_BIDIRECTIONAL);
 
 	/* set to master mode and start the conversion */
-	rga_write(rga, RGA3_SYS_CTRL,
+	rga_write(core, RGA3_SYS_CTRL,
 		  FIELD_PREP(RGA3_CMD_MODE, RGA3_CMD_MODE_MASTER));
-	rga_write(rga, RGA3_INT_EN, FIELD_PREP(RGA3_INT_FRM_DONE, 1));
-	rga_write(rga, RGA3_CMD_CTRL,
+	rga_write(core, RGA3_INT_EN, FIELD_PREP(RGA3_INT_FRM_DONE, 1));
+	rga_write(core, RGA3_CMD_CTRL,
 		  FIELD_PREP(RGA3_CMD_LINE_START_PULSE, 1));
 }
 
-static bool rga3_handle_irq(struct rockchip_rga *rga)
+static bool rga3_handle_irq(struct rga_core *core)
 {
 	u32 intr;
 
-	intr = rga_read(rga, RGA3_INT_RAW);
+	intr = rga_read(core, RGA3_INT_RAW);
 	/* clear all interrupts */
-	rga_write(rga, RGA3_INT_CLR, intr);
+	rga_write(core, RGA3_INT_CLR, intr);
 
 	return FIELD_GET(RGA3_INT_FRM_DONE, intr);
 }
 
-static struct rockchip_rga_version rga3_get_version(struct rockchip_rga *rga)
+static struct rockchip_rga_version rga3_get_version(struct rga_core *core)
 {
-	u32 version = rga_read(rga, RGA3_VERSION_NUM);
+	u32 version = rga_read(core, RGA3_VERSION_NUM);
 
 	return (struct rockchip_rga_version) {
 		.major = FIELD_GET(RGA3_VERSION_NUM_MAJOR, version),

-- 
2.54.0



^ permalink raw reply related

* [PATCH 05/17] media: v4l2-mem2mem: support running multiple jobs in parallel
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Add support for running multiple jobs in parallel for SoCs containing
multiple identical devices. An example is the Rockchip RK3588 SoC,
which contains two identical RGA3 devices. Therefore it is desirable to
have the kernel schedule the work across all available devices and only
expose one video device to userspace.

Previously the curr_ctx member of a v4l2_m2m_dev was used to track the
currently running context. But the currently running context will always
be at the head of the job_queue. As the TRANS_RUNNING flag can be used to
check if the queue head is already running, the curr_ctx member can be
completely dropped.

To avoid queueing too many parallel jobs, the
v4l2_m2m_set_max_parallel_jobs method is added. It allows a driver
to set the number of parallel jobs and avoids calling device_run when
the given number of jobs is already running. This is set to 1 by default
to prevent parallel job runs. Drivers with the need and support for
scheduling jobs can adjust this value accordingly.

Note that this change doesn't allow a context to be used multiple times
in parallel. So a single stream won't be able to utilize multiple devices
at once, but N streams can utilize up to N devices. This is caused by the
fact that a context is not added multiple times to the job_queue and also
holds the job_flags that indicate whether it is currently running.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/v4l2-core/v4l2-mem2mem.c | 89 ++++++++++++++++++++++------------
 include/media/v4l2-mem2mem.h           |  3 ++
 2 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index a65cbb124cfe0..14ac9c85803d1 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -84,16 +84,15 @@ static const char * const m2m_entity_name[] = {
  *			v4l2_m2m_unregister_media_controller().
  * @intf_devnode:	&struct media_intf devnode pointer with the interface
  *			with controls the M2M device.
- * @curr_ctx:		currently running instance
  * @job_queue:		instances queued to run
  * @job_spinlock:	protects job_queue
  * @job_work:		worker to run queued jobs.
  * @job_queue_flags:	flags of the queue status, %QUEUE_PAUSED.
+ * @max_parallel_jobs:	max job_queue instances number marked as running
  * @m2m_ops:		driver callbacks
  * @kref:		device reference count
  */
 struct v4l2_m2m_dev {
-	struct v4l2_m2m_ctx	*curr_ctx;
 #ifdef CONFIG_MEDIA_CONTROLLER
 	struct media_entity	*source;
 	struct media_pad	source_pad;
@@ -108,6 +107,7 @@ struct v4l2_m2m_dev {
 	spinlock_t		job_spinlock;
 	struct work_struct	job_work;
 	unsigned long		job_queue_flags;
+	u32			max_parallel_jobs;
 
 	const struct v4l2_m2m_ops *m2m_ops;
 
@@ -123,6 +123,12 @@ static struct v4l2_m2m_queue_ctx *get_queue_ctx(struct v4l2_m2m_ctx *m2m_ctx,
 		return &m2m_ctx->cap_q_ctx;
 }
 
+void v4l2_m2m_set_max_parallel_jobs(struct v4l2_m2m_dev *m2m_dev,
+				    u32 max_parallel_jobs)
+{
+	m2m_dev->max_parallel_jobs = max_parallel_jobs;
+}
+
 struct vb2_queue *v4l2_m2m_get_vq(struct v4l2_m2m_ctx *m2m_ctx,
 				       enum v4l2_buf_type type)
 {
@@ -229,14 +235,22 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_buf_remove_by_idx);
 void *v4l2_m2m_get_curr_priv(struct v4l2_m2m_dev *m2m_dev)
 {
 	unsigned long flags;
-	void *ret = NULL;
+	struct v4l2_m2m_ctx *first_ctx;
 
 	spin_lock_irqsave(&m2m_dev->job_spinlock, flags);
-	if (m2m_dev->curr_ctx)
-		ret = m2m_dev->curr_ctx->priv;
+	if (list_empty(&m2m_dev->job_queue)) {
+		spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
+		return NULL;
+	}
+
+	first_ctx = list_first_entry(&m2m_dev->job_queue,
+				     struct v4l2_m2m_ctx, queue);
 	spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
 
-	return ret;
+	if (first_ctx->job_flags & TRANS_RUNNING)
+		return first_ctx->priv;
+	else
+		return NULL;
 }
 EXPORT_SYMBOL(v4l2_m2m_get_curr_priv);
 
@@ -252,13 +266,11 @@ EXPORT_SYMBOL(v4l2_m2m_get_curr_priv);
 static void v4l2_m2m_try_run(struct v4l2_m2m_dev *m2m_dev)
 {
 	unsigned long flags;
+	struct v4l2_m2m_ctx *ctx;
+	struct v4l2_m2m_ctx *chosen_ctx = NULL;
+	u32 running_jobs = 0;
 
 	spin_lock_irqsave(&m2m_dev->job_spinlock, flags);
-	if (NULL != m2m_dev->curr_ctx) {
-		spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
-		dprintk("Another instance is running, won't run now\n");
-		return;
-	}
 
 	if (list_empty(&m2m_dev->job_queue)) {
 		spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
@@ -272,13 +284,30 @@ static void v4l2_m2m_try_run(struct v4l2_m2m_dev *m2m_dev)
 		return;
 	}
 
-	m2m_dev->curr_ctx = list_first_entry(&m2m_dev->job_queue,
-				   struct v4l2_m2m_ctx, queue);
-	m2m_dev->curr_ctx->job_flags |= TRANS_RUNNING;
+	list_for_each_entry(ctx, &m2m_dev->job_queue, queue) {
+		if (!(ctx->job_flags & TRANS_RUNNING)) {
+			chosen_ctx = ctx;
+			break;
+		}
+
+		running_jobs++;
+	}
+	if (running_jobs >= m2m_dev->max_parallel_jobs) {
+		spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
+		dprintk("Maximum number of parallel jobs reached\n");
+		return;
+	}
+	if (!chosen_ctx) {
+		spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
+		dprintk("All jobs already running\n");
+		return;
+	}
+
+	chosen_ctx->job_flags |= TRANS_RUNNING;
 	spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
 
-	dprintk("Running job on m2m_ctx: %p\n", m2m_dev->curr_ctx);
-	m2m_dev->m2m_ops->device_run(m2m_dev->curr_ctx->priv);
+	dprintk("Running job on m2m_ctx: %p\n", chosen_ctx);
+	m2m_dev->m2m_ops->device_run(chosen_ctx->priv);
 }
 
 /*
@@ -469,15 +498,14 @@ static void v4l2_m2m_schedule_next_job(struct v4l2_m2m_dev *m2m_dev,
 static bool _v4l2_m2m_job_finish(struct v4l2_m2m_dev *m2m_dev,
 				 struct v4l2_m2m_ctx *m2m_ctx)
 {
-	if (!m2m_dev->curr_ctx || m2m_dev->curr_ctx != m2m_ctx) {
+	if (!m2m_ctx || !(m2m_ctx->job_flags & TRANS_RUNNING)) {
 		dprintk("Called by an instance not currently running\n");
 		return false;
 	}
 
-	list_del(&m2m_dev->curr_ctx->queue);
-	m2m_dev->curr_ctx->job_flags &= ~(TRANS_QUEUED | TRANS_RUNNING);
-	wake_up(&m2m_dev->curr_ctx->finished);
-	m2m_dev->curr_ctx = NULL;
+	list_del(&m2m_ctx->queue);
+	m2m_ctx->job_flags &= ~(TRANS_QUEUED | TRANS_RUNNING);
+	wake_up(&m2m_ctx->finished);
 	return true;
 }
 
@@ -544,16 +572,19 @@ EXPORT_SYMBOL(v4l2_m2m_buf_done_and_job_finish);
 void v4l2_m2m_suspend(struct v4l2_m2m_dev *m2m_dev)
 {
 	unsigned long flags;
-	struct v4l2_m2m_ctx *curr_ctx;
+	struct v4l2_m2m_ctx *ctx;
+	struct v4l2_m2m_ctx *ctx_safe;
 
 	spin_lock_irqsave(&m2m_dev->job_spinlock, flags);
 	m2m_dev->job_queue_flags |= QUEUE_PAUSED;
-	curr_ctx = m2m_dev->curr_ctx;
 	spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
 
-	if (curr_ctx)
-		wait_event(curr_ctx->finished,
-			   !(curr_ctx->job_flags & TRANS_RUNNING));
+	list_for_each_entry_safe(ctx, ctx_safe, &m2m_dev->job_queue, queue) {
+		if (!(ctx->job_flags & TRANS_RUNNING))
+			break;
+
+		wait_event(ctx->finished, !(ctx->job_flags & TRANS_RUNNING));
+	}
 }
 EXPORT_SYMBOL(v4l2_m2m_suspend);
 
@@ -896,10 +927,8 @@ int v4l2_m2m_streamoff(struct file *file, struct v4l2_m2m_ctx *m2m_ctx,
 	q_ctx->num_rdy = 0;
 	spin_unlock_irqrestore(&q_ctx->rdy_spinlock, flags);
 
-	if (m2m_dev->curr_ctx == m2m_ctx) {
-		m2m_dev->curr_ctx = NULL;
+	if (m2m_ctx->job_flags & TRANS_RUNNING)
 		wake_up(&m2m_ctx->finished);
-	}
 	spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags_job);
 
 	return 0;
@@ -1194,12 +1223,12 @@ struct v4l2_m2m_dev *v4l2_m2m_init(const struct v4l2_m2m_ops *m2m_ops)
 	if (!m2m_dev)
 		return ERR_PTR(-ENOMEM);
 
-	m2m_dev->curr_ctx = NULL;
 	m2m_dev->m2m_ops = m2m_ops;
 	INIT_LIST_HEAD(&m2m_dev->job_queue);
 	spin_lock_init(&m2m_dev->job_spinlock);
 	INIT_WORK(&m2m_dev->job_work, v4l2_m2m_device_run_work);
 	kref_init(&m2m_dev->kref);
+	m2m_dev->max_parallel_jobs = 1;
 
 	return m2m_dev;
 }
diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h
index 31de25d792b98..e6177d0eaf637 100644
--- a/include/media/v4l2-mem2mem.h
+++ b/include/media/v4l2-mem2mem.h
@@ -594,6 +594,9 @@ static inline void v4l2_m2m_set_dst_buffered(struct v4l2_m2m_ctx *m2m_ctx,
 	m2m_ctx->cap_q_ctx.buffered = buffered;
 }
 
+void v4l2_m2m_set_max_parallel_jobs(struct v4l2_m2m_dev *m2m_dev,
+				    u32 max_parallel_jobs);
+
 /**
  * v4l2_m2m_ctx_release() - release m2m context
  *

-- 
2.54.0



^ permalink raw reply related

* [PATCH 01/17] media: rockchip: rga: zero cmdbuf in shared code
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Zero the command buffer (cmdbuf) in the shared code instead of the
individual RGA2/RGA3 implementations. Besides centralizing the memset
operation this also uses the cmdbuf_size member for the memset size,
which is also used as the size for the actual allocation.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga-hw.c  | 2 --
 drivers/media/platform/rockchip/rga/rga.c     | 1 +
 drivers/media/platform/rockchip/rga/rga.h     | 3 +++
 drivers/media/platform/rockchip/rga/rga3-hw.c | 2 --
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga-hw.c b/drivers/media/platform/rockchip/rga/rga-hw.c
index be1bc8ddbd03b..4d7b0a03820a1 100644
--- a/drivers/media/platform/rockchip/rga/rga-hw.c
+++ b/drivers/media/platform/rockchip/rga/rga-hw.c
@@ -443,8 +443,6 @@ static void rga_cmd_set(struct rga_ctx *ctx,
 
 static void rga_hw_setup_cmdbuf(struct rga_ctx *ctx)
 {
-	memset(ctx->cmdbuf_virt, 0, RGA_CMDBUF_SIZE);
-
 	rga_cmd_set_mode(ctx);
 	rga_cmd_set_trans_info(ctx);
 }
diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index b3cb6bf8eb863..bd0afd33affe4 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -41,6 +41,7 @@ static void device_run(void *prv)
 	spin_lock_irqsave(&rga->ctrl_lock, flags);
 	if (ctx->cmdbuf_dirty) {
 		ctx->cmdbuf_dirty = false;
+		memset(ctx->cmdbuf_virt, 0, rga->hw->cmdbuf_size);
 		rga->hw->setup_cmdbuf(ctx);
 	}
 	spin_unlock_irqrestore(&rga->ctrl_lock, flags);
diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
index bd431534d0d39..2b4f5694375a4 100644
--- a/drivers/media/platform/rockchip/rga/rga.h
+++ b/drivers/media/platform/rockchip/rga/rga.h
@@ -152,6 +152,9 @@ struct rga_hw {
 	u8 stride_alignment;
 	u8 features;
 
+	/*
+	 * Requires that the cmdbuf is already zeroed.
+	 */
 	void (*setup_cmdbuf)(struct rga_ctx *ctx);
 	void (*start)(struct rockchip_rga *rga,
 		      struct rga_vb_buffer *src, struct rga_vb_buffer *dst);
diff --git a/drivers/media/platform/rockchip/rga/rga3-hw.c b/drivers/media/platform/rockchip/rga/rga3-hw.c
index ca1c268303dd4..72741e1faccff 100644
--- a/drivers/media/platform/rockchip/rga/rga3-hw.c
+++ b/drivers/media/platform/rockchip/rga/rga3-hw.c
@@ -261,8 +261,6 @@ static void rga3_cmd_set_wr_format(struct rga_ctx *ctx)
 
 static void rga3_hw_setup_cmdbuf(struct rga_ctx *ctx)
 {
-	memset(ctx->cmdbuf_virt, 0, RGA3_CMDBUF_SIZE);
-
 	rga3_cmd_set_win0_format(ctx);
 	rga3_cmd_set_trans_info(ctx);
 	rga3_cmd_set_wr_format(ctx);

-- 
2.54.0



^ permalink raw reply related

* [PATCH 09/17] media: rockchip: rga: use components to manage multiple cores
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Use component helpers to manage multiple cores and aggregate them into a
central master device. This gives us a dedicated master device and
ensures that all cores (components) are properly set up before creating
the video device.

This commit only sets up a basic component device. The
rga_disable_multicore function is removed; instead, only the first core
is added to the master device. To avoid the secondary core creating an
additional video device, the whole core probe implementation is moved to
the bind method, which is only called when the core is bound to a master
device.

The implementation is based on the etnaviv gpu driver, which also groups
multiple gpu cores under a single etnaviv master device.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 256 +++++++++++++++++++++++-------
 1 file changed, 202 insertions(+), 54 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 15d095a1d1973..178f45b8da940 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/clk.h>
+#include <linux/component.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/fs.h>
@@ -737,51 +738,9 @@ static int rga_parse_dt(struct rga_core *core)
 	return 0;
 }
 
-/*
- * Some SoCs, like RK3588 have multiple identical RGA3 cores, but the
- * kernel is currently missing support for multi-core handling. Exposing
- * separate devices for each core to userspace is bad, since that does
- * not allow scheduling tasks properly (and creates ABI). With this workaround
- * the driver will only probe for the first core and early exit for the other
- * cores. Once the driver gains multi-core support, the same technique
- * for detecting the main core can be used to cluster all cores together.
- */
-static int rga_disable_multicore(struct device *dev)
-{
-	struct device_node *node = NULL;
-	const char *compatible;
-	bool is_main_core;
-	int ret;
-
-	/* Intentionally ignores the fallback strings */
-	ret = of_property_read_string(dev->of_node, "compatible", &compatible);
-	if (ret)
-		return ret;
-
-	/* The first compatible and available node found is considered the main core */
-	do {
-		node = of_find_compatible_node(node, NULL, compatible);
-		if (of_device_is_available(node))
-			break;
-	} while (node);
-
-	if (!node)
-		return -EINVAL;
-
-	is_main_core = (dev->of_node == node);
-
-	of_node_put(node);
-
-	if (!is_main_core) {
-		dev_info(dev, "missing multi-core support, ignoring this instance\n");
-		return -ENODEV;
-	}
-
-	return 0;
-}
-
-static int rga_probe(struct platform_device *pdev)
+static int rga_core_bind(struct device *dev, struct device *master, void *data)
 {
+	struct platform_device *pdev = to_platform_device(dev);
 	struct rockchip_rga *rga;
 	struct rga_core *core;
 	struct video_device *vfd;
@@ -791,10 +750,6 @@ static int rga_probe(struct platform_device *pdev)
 	if (!pdev->dev.of_node)
 		return -ENODEV;
 
-	ret = rga_disable_multicore(&pdev->dev);
-	if (ret)
-		return ret;
-
 	rga = devm_kzalloc(&pdev->dev, sizeof(*rga) + 1 * sizeof(*rga->cores), GFP_KERNEL);
 	if (!rga)
 		return -ENOMEM;
@@ -903,9 +858,10 @@ static int rga_probe(struct platform_device *pdev)
 	return ret;
 }
 
-static void rga_remove(struct platform_device *pdev)
+static void rga_core_unbind(struct device *dev, struct device *master,
+			    void *data)
 {
-	struct rga_core *core = platform_get_drvdata(pdev);
+	struct rga_core *core = dev_get_drvdata(dev);
 	struct rockchip_rga *rga = core->rga;
 
 	v4l2_info(&rga->v4l2_dev, "Removing\n");
@@ -917,6 +873,29 @@ static void rga_remove(struct platform_device *pdev)
 	pm_runtime_disable(core->dev);
 }
 
+static const struct component_ops rga_core_ops = {
+	.bind = rga_core_bind,
+	.unbind = rga_core_unbind,
+};
+
+static int rga_core_probe(struct platform_device *pdev)
+{
+	int ret = 0;
+
+	ret = component_add(&pdev->dev, &rga_core_ops);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "failed to register component: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void rga_core_remove(struct platform_device *pdev)
+{
+	component_del(&pdev->dev, &rga_core_ops);
+}
+
 static int __maybe_unused rga_runtime_suspend(struct device *dev)
 {
 	struct rga_core *core = dev_get_drvdata(dev);
@@ -933,7 +912,7 @@ static int __maybe_unused rga_runtime_resume(struct device *dev)
 	return clk_bulk_prepare_enable(core->num_clks, core->clks);
 }
 
-static const struct dev_pm_ops rga_pm = {
+static const struct dev_pm_ops rga_core_pm = {
 	SET_RUNTIME_PM_OPS(rga_runtime_suspend,
 			   rga_runtime_resume, NULL)
 };
@@ -956,17 +935,186 @@ static const struct of_device_id rockchip_rga_match[] = {
 
 MODULE_DEVICE_TABLE(of, rockchip_rga_match);
 
+static struct platform_driver rga_core_pdrv = {
+	.probe = rga_core_probe,
+	.remove = rga_core_remove,
+	.driver = {
+		.name = RGA_NAME "-core",
+		.pm = &rga_core_pm,
+		.of_match_table = rockchip_rga_match,
+	},
+};
+
+static int rga_bind(struct device *dev)
+{
+	int ret;
+
+	ret = component_bind_all(dev, NULL);
+	if (ret) {
+		dev_err(dev, "component bind failed\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void rga_unbind(struct device *dev)
+{
+	component_unbind_all(dev, NULL);
+}
+
+struct component_master_ops rga_master_ops = {
+	.bind = rga_bind,
+	.unbind = rga_unbind,
+};
+
+static int rga_probe(struct platform_device *pdev)
+{
+	const struct of_device_id *match_desc = pdev->dev.platform_data;
+	struct device *dev = &pdev->dev;
+	struct component_match *match = NULL;
+	struct device_node *core_node;
+
+	if (!match_desc)
+		return dev_err_probe(dev, -ENODEV, "missing platform data\n");
+
+	for_each_compatible_node(core_node, NULL, match_desc->compatible) {
+		if (!of_device_is_available(core_node))
+			continue;
+
+		of_node_get(core_node);
+		component_match_add_release(dev, &match, component_release_of,
+					    component_compare_of, core_node);
+
+		/*
+		 * As multi core is not implemented yet,
+		 * break out of the loop to only have one core per rockchip_rga struct.
+		 * Also put the node, which otherwise would've been done by the loop iteration.
+		 */
+		of_node_put(core_node);
+		break;
+	}
+
+	if (!match)
+		return dev_err_probe(
+			dev, -ENODEV,
+			"no matching available component devices found\n");
+
+	return component_master_add_with_match(dev, &rga_master_ops, match);
+}
+
+static void rga_remove(struct platform_device *pdev)
+{
+	component_master_del(&pdev->dev, &rga_master_ops);
+}
+
 static struct platform_driver rga_pdrv = {
 	.probe = rga_probe,
 	.remove = rga_remove,
 	.driver = {
 		.name = RGA_NAME,
-		.pm = &rga_pm,
-		.of_match_table = rockchip_rga_match,
 	},
 };
 
-module_platform_driver(rga_pdrv);
+static bool rga_of_has_available_node(const char *compat)
+{
+	struct device_node *node;
+
+	for_each_compatible_node(node, NULL, compat) {
+		if (of_device_is_available(node)) {
+			of_node_put(node);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static int rga_create_platform_device(struct platform_device **ppdev,
+				      const struct of_device_id *match)
+{
+	struct platform_device *pdev;
+	int ret;
+
+	pdev = platform_device_alloc(match->compatible, PLATFORM_DEVID_NONE);
+	if (!pdev)
+		return -ENOMEM;
+
+	ret = platform_device_add_data(pdev, match, sizeof(*match));
+	if (ret)
+		goto free_platform_device;
+
+	ret = platform_device_add(pdev);
+	if (ret)
+		goto free_platform_device;
+
+	ret = device_driver_attach(&rga_pdrv.driver, &pdev->dev);
+	if (ret)
+		goto del_platform_device;
+
+	*ppdev = pdev;
+
+	return 0;
+
+del_platform_device:
+	platform_device_del(pdev);
+free_platform_device:
+	platform_device_put(pdev);
+	return ret;
+}
+
+static struct platform_device *master_pdevs[ARRAY_SIZE(rockchip_rga_match) - 1];
+
+static int __init rga_init(void)
+{
+	int ret;
+	unsigned int i;
+
+	ret = platform_driver_register(&rga_core_pdrv);
+	if (ret != 0)
+		return ret;
+
+	ret = platform_driver_register(&rga_pdrv);
+	if (ret != 0)
+		goto unregister_core_driver;
+
+	for (i = 0; i < ARRAY_SIZE(master_pdevs); i++) {
+		if (!rga_of_has_available_node(
+			    rockchip_rga_match[i].compatible))
+			continue;
+
+		ret = rga_create_platform_device(&master_pdevs[i],
+						 &rockchip_rga_match[i]);
+		if (ret)
+			goto unregister_platform_devices;
+	}
+
+	return 0;
+
+unregister_platform_devices:
+	for (i = 0; i < ARRAY_SIZE(master_pdevs); i++) {
+		platform_device_unregister(master_pdevs[i]);
+		master_pdevs[i] = NULL;
+	}
+	platform_driver_unregister(&rga_pdrv);
+unregister_core_driver:
+	platform_driver_unregister(&rga_core_pdrv);
+	return ret;
+}
+module_init(rga_init);
+
+static void __exit rga_exit(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(master_pdevs); i++) {
+		platform_device_unregister(master_pdevs[i]);
+		master_pdevs[i] = NULL;
+	}
+	platform_driver_unregister(&rga_pdrv);
+	platform_driver_unregister(&rga_core_pdrv);
+}
+module_exit(rga_exit);
 
 MODULE_AUTHOR("Jacob Chen <jacob-chen@iotwrt.com>");
 MODULE_DESCRIPTION("Rockchip Raster 2d Graphic Acceleration Unit");

-- 
2.54.0



^ permalink raw reply related

* [PATCH 04/17] media: rockchip: rga: removed unused regmap member
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

The grf member variable is never accessed or written by the RGA driver.
Therefore drop it from the rockchip_rga struct.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
index 2b4f5694375a4..0e62337f8dd38 100644
--- a/drivers/media/platform/rockchip/rga/rga.h
+++ b/drivers/media/platform/rockchip/rga/rga.h
@@ -71,7 +71,6 @@ struct rockchip_rga {
 	struct video_device *vfd;
 
 	struct device *dev;
-	struct regmap *grf;
 	void __iomem *regs;
 	struct clk_bulk_data *clks;
 	int num_clks;

-- 
2.54.0



^ permalink raw reply related

* [PATCH 11/17] media: rockchip: rga: move video device to the master
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Move the video device allocation and registration to the master
component bind function in preparation for binding multiple cores
to the master. Moving it to the master bind function allows the v4l2
device to be registered only when all cores have been successfully
bound to the master device. This also causes the video device to be
bound to the master platform device instead of a specific core.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 96 ++++++++++++++++---------------
 1 file changed, 50 insertions(+), 46 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 11912bf5b6906..952377ae467f5 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -743,7 +743,6 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 	struct platform_device *pdev = to_platform_device(dev);
 	struct rockchip_rga *rga = data;
 	struct rga_core *core;
-	struct video_device *vfd;
 	int ret = 0;
 	int irq;
 
@@ -789,33 +788,11 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 		goto err_put_clk;
 	}
 
-	ret = v4l2_device_register(&pdev->dev, &rga->v4l2_dev);
-	if (ret)
-		goto err_put_clk;
-	vfd = video_device_alloc();
-	if (!vfd) {
-		v4l2_err(&rga->v4l2_dev, "Failed to allocate video device\n");
-		ret = -ENOMEM;
-		goto unreg_v4l2_dev;
-	}
-	*vfd = rga_videodev;
-	vfd->lock = &rga->mutex;
-	vfd->v4l2_dev = &rga->v4l2_dev;
-
-	video_set_drvdata(vfd, rga);
-	rga->vfd = vfd;
-
 	platform_set_drvdata(pdev, core);
-	rga->m2m_dev = v4l2_m2m_init(&rga_m2m_ops);
-	if (IS_ERR(rga->m2m_dev)) {
-		v4l2_err(&rga->v4l2_dev, "Failed to init mem2mem device\n");
-		ret = PTR_ERR(rga->m2m_dev);
-		goto rel_vdev;
-	}
 
 	ret = pm_runtime_resume_and_get(core->dev);
 	if (ret < 0)
-		goto rel_m2m;
+		goto err_put_clk;
 
 	rga->version = rga->hw->get_version(core);
 
@@ -824,23 +801,8 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 
 	pm_runtime_put(core->dev);
 
-	ret = video_register_device(vfd, VFL_TYPE_VIDEO, -1);
-	if (ret) {
-		v4l2_err(&rga->v4l2_dev, "Failed to register video device\n");
-		goto rel_m2m;
-	}
-
-	v4l2_info(&rga->v4l2_dev, "Registered %s as /dev/%s\n",
-		  vfd->name, video_device_node_name(vfd));
-
 	return 0;
 
-rel_m2m:
-	v4l2_m2m_release(rga->m2m_dev);
-rel_vdev:
-	video_device_release(vfd);
-unreg_v4l2_dev:
-	v4l2_device_unregister(&rga->v4l2_dev);
 err_put_clk:
 	pm_runtime_disable(core->dev);
 
@@ -851,13 +813,6 @@ static void rga_core_unbind(struct device *dev, struct device *master,
 			    void *data)
 {
 	struct rga_core *core = dev_get_drvdata(dev);
-	struct rockchip_rga *rga = core->rga;
-
-	v4l2_info(&rga->v4l2_dev, "Removing\n");
-
-	v4l2_m2m_release(rga->m2m_dev);
-	video_unregister_device(rga->vfd);
-	v4l2_device_unregister(&rga->v4l2_dev);
 
 	pm_runtime_disable(core->dev);
 }
@@ -937,6 +892,7 @@ static struct platform_driver rga_core_pdrv = {
 static int rga_bind(struct device *dev)
 {
 	struct rockchip_rga *rga = dev_get_drvdata(dev);
+	struct video_device *vfd;
 	int ret;
 
 	ret = component_bind_all(dev, rga);
@@ -945,11 +901,59 @@ static int rga_bind(struct device *dev)
 		return ret;
 	}
 
+	ret = v4l2_device_register(dev, &rga->v4l2_dev);
+	if (ret)
+		return ret;
+	vfd = video_device_alloc();
+	if (!vfd) {
+		v4l2_err(&rga->v4l2_dev, "Failed to allocate video device\n");
+		ret = -ENOMEM;
+		goto unreg_v4l2_dev;
+	}
+	*vfd = rga_videodev;
+	vfd->lock = &rga->mutex;
+	vfd->v4l2_dev = &rga->v4l2_dev;
+
+	video_set_drvdata(vfd, rga);
+	rga->vfd = vfd;
+
+	rga->m2m_dev = v4l2_m2m_init(&rga_m2m_ops);
+	if (IS_ERR(rga->m2m_dev)) {
+		v4l2_err(&rga->v4l2_dev, "Failed to init mem2mem device\n");
+		ret = PTR_ERR(rga->m2m_dev);
+		goto rel_vdev;
+	}
+
+	ret = video_register_device(vfd, VFL_TYPE_VIDEO, -1);
+	if (ret) {
+		v4l2_err(&rga->v4l2_dev, "Failed to register video device\n");
+		goto rel_m2m;
+	}
+
+	v4l2_info(&rga->v4l2_dev, "Registered %s as /dev/%s\n",
+		  vfd->name, video_device_node_name(vfd));
+
 	return 0;
+
+rel_m2m:
+	v4l2_m2m_release(rga->m2m_dev);
+rel_vdev:
+	video_device_release(vfd);
+unreg_v4l2_dev:
+	v4l2_device_unregister(&rga->v4l2_dev);
+	return ret;
 }
 
 static void rga_unbind(struct device *dev)
 {
+	struct rockchip_rga *rga = dev_get_drvdata(dev);
+
+	v4l2_info(&rga->v4l2_dev, "Removing\n");
+
+	v4l2_m2m_release(rga->m2m_dev);
+	video_unregister_device(rga->vfd);
+	v4l2_device_unregister(&rga->v4l2_dev);
+
 	component_unbind_all(dev, NULL);
 }
 

-- 
2.54.0



^ permalink raw reply related

* [PATCH 14/17] media: rockchip: rga: put all cores into first core iommu domain
From: Sven Püschel @ 2026-06-05 22:07 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Put all cores into the iommu domain of the first core so that buffers
can be used by any core. All buffers accessed by the hardware are
allocated on the first core, as the scheduling to a specific core is
done after the allocation. Therefore put all cores into the same domain
to have the same iommu mapping on all cores.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 6add6c510c127..9cebb461b3fd2 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -10,6 +10,7 @@
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/interrupt.h>
+#include <linux/iommu.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
@@ -757,6 +758,19 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 		  version.major, version.minor);
 
 	if (rga->num_cores) {
+		/* Attach to the first cores iommu */
+		struct iommu_domain *domain = iommu_get_domain_for_dev(rga->cores[0]->dev);
+
+		if (IS_ERR(domain)) {
+			dev_err(core->dev, "Couldn't get domain of the first core\n");
+			return PTR_ERR(domain);
+		}
+		ret = iommu_attach_device(domain, core->dev);
+		if (ret) {
+			dev_err(core->dev, "Couldn't attach to the domain of the first core\n");
+			return ret;
+		}
+
 		/* we are not the first core, expect that we have the same version */
 		if (rga->version.major != version.major || rga->version.minor != version.minor)
 			v4l2_warn(&rga->v4l2_dev, "Detected multi-core setup with different core versions!\n");

-- 
2.54.0



^ permalink raw reply related

* [PATCH 12/17] media: rockchip: rga: move core initialization from bind to probe
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Move the core initialization from the core binding function to the core
probing function. This better matches the actual sequence, where the
core probe initializes most things and the bind function just binds the
core to the actual rga struct from the master device.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 83 ++++++++++++++++---------------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 952377ae467f5..0413b8518dfc8 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -740,21 +740,49 @@ static int rga_parse_dt(struct rga_core *core)
 
 static int rga_core_bind(struct device *dev, struct device *master, void *data)
 {
-	struct platform_device *pdev = to_platform_device(dev);
 	struct rockchip_rga *rga = data;
+	struct rga_core *core = dev_get_drvdata(dev);
+	int ret = 0;
+
+	core->rga = rga;
+
+	ret = pm_runtime_resume_and_get(core->dev);
+	if (ret < 0)
+		return ret;
+
+	rga->version = rga->hw->get_version(core);
+
+	v4l2_info(&rga->v4l2_dev, "HW Version: 0x%02x.%02x\n",
+		  rga->version.major, rga->version.minor);
+
+	pm_runtime_put(core->dev);
+
+	rga->cores[0] = core;
+
+	return 0;
+}
+
+static const struct component_ops rga_core_ops = {
+	.bind = rga_core_bind,
+};
+
+static int rga_core_probe(struct platform_device *pdev)
+{
 	struct rga_core *core;
+	const struct rga_hw *hw;
 	int ret = 0;
 	int irq;
 
 	if (!pdev->dev.of_node)
 		return -ENODEV;
 
+	hw = of_device_get_match_data(&pdev->dev);
+	if (!hw)
+		return dev_err_probe(&pdev->dev, -ENODEV, "failed to get match data\n");
+
 	core = devm_kzalloc(&pdev->dev, sizeof(*core), GFP_KERNEL);
-	core->rga = rga;
 	core->dev = &pdev->dev;
 
-	rga->cores[0] = core;
-
 	ret = rga_parse_dt(core);
 	if (ret)
 		return dev_err_probe(&pdev->dev, ret, "Unable to parse OF data\n");
@@ -775,7 +803,7 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 	}
 
 	ret = devm_request_irq(core->dev, irq, rga_isr,
-			       rga_has_internal_iommu(rga) ? 0 : IRQF_SHARED,
+			       hw->has_internal_iommu ? 0 : IRQF_SHARED,
 			       dev_name(core->dev), core);
 	if (ret < 0) {
 		dev_err(core->dev, "failed to request irq\n");
@@ -790,42 +818,6 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 
 	platform_set_drvdata(pdev, core);
 
-	ret = pm_runtime_resume_and_get(core->dev);
-	if (ret < 0)
-		goto err_put_clk;
-
-	rga->version = rga->hw->get_version(core);
-
-	v4l2_info(&rga->v4l2_dev, "HW Version: 0x%02x.%02x\n",
-		  rga->version.major, rga->version.minor);
-
-	pm_runtime_put(core->dev);
-
-	return 0;
-
-err_put_clk:
-	pm_runtime_disable(core->dev);
-
-	return ret;
-}
-
-static void rga_core_unbind(struct device *dev, struct device *master,
-			    void *data)
-{
-	struct rga_core *core = dev_get_drvdata(dev);
-
-	pm_runtime_disable(core->dev);
-}
-
-static const struct component_ops rga_core_ops = {
-	.bind = rga_core_bind,
-	.unbind = rga_core_unbind,
-};
-
-static int rga_core_probe(struct platform_device *pdev)
-{
-	int ret = 0;
-
 	ret = component_add(&pdev->dev, &rga_core_ops);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to register component: %d", ret);
@@ -833,11 +825,20 @@ static int rga_core_probe(struct platform_device *pdev)
 	}
 
 	return 0;
+
+err_put_clk:
+	pm_runtime_disable(core->dev);
+
+	return ret;
 }
 
 static void rga_core_remove(struct platform_device *pdev)
 {
+	struct rga_core *core = platform_get_drvdata(pdev);
+
 	component_del(&pdev->dev, &rga_core_ops);
+
+	pm_runtime_disable(core->dev);
 }
 
 static int __maybe_unused rga_runtime_suspend(struct device *dev)

-- 
2.54.0



^ permalink raw reply related

* [PATCH 13/17] media: rockchip: rga: bind all cores to the master
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Bind all core components to the master component. Previously only the
first core has been added to the master device to avoid creating
multiple video devices. As the video device creation has been moved to
the master component, it allows us to bind all cores without creating
additional video devices.

We expect all cores to report the same version number, as we only
add cores with the same compatible value. This is important, as we
set up the command buffer before actually scheduling the work to a
specific core. Therefore adjusting command buffers depending on the
version register only works when all cores have the same value.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 22 +++++++++++-----------
 drivers/media/platform/rockchip/rga/rga.h |  1 +
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 0413b8518dfc8..6add6c510c127 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -742,6 +742,7 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 {
 	struct rockchip_rga *rga = data;
 	struct rga_core *core = dev_get_drvdata(dev);
+	struct rockchip_rga_version version;
 	int ret = 0;
 
 	core->rga = rga;
@@ -750,14 +751,21 @@ static int rga_core_bind(struct device *dev, struct device *master, void *data)
 	if (ret < 0)
 		return ret;
 
-	rga->version = rga->hw->get_version(core);
+	version = rga->hw->get_version(core);
 
 	v4l2_info(&rga->v4l2_dev, "HW Version: 0x%02x.%02x\n",
-		  rga->version.major, rga->version.minor);
+		  version.major, version.minor);
+
+	if (rga->num_cores) {
+		/* we are not the first core, expect that we have the same version */
+		if (rga->version.major != version.major || rga->version.minor != version.minor)
+			v4l2_warn(&rga->v4l2_dev, "Detected multi-core setup with different core versions!\n");
+	} else
+		rga->version = version;
 
 	pm_runtime_put(core->dev);
 
-	rga->cores[0] = core;
+	rga->cores[rga->num_cores++] = core;
 
 	return 0;
 }
@@ -983,14 +991,6 @@ static int rga_probe(struct platform_device *pdev)
 		component_match_add_release(dev, &match, component_release_of,
 					    component_compare_of, core_node);
 		num_cores++;
-
-		/*
-		 * As multi core is not implemented yet,
-		 * break out of the loop to only have one core per rockchip_rga struct.
-		 * Also put the node, which otherwise would've been done by the loop iteration.
-		 */
-		of_node_put(core_node);
-		break;
 	}
 
 	if (!match)
diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
index fcf1ef7d2029f..6237436b984eb 100644
--- a/drivers/media/platform/rockchip/rga/rga.h
+++ b/drivers/media/platform/rockchip/rga/rga.h
@@ -88,6 +88,7 @@ struct rockchip_rga {
 
 	const struct rga_hw *hw;
 
+	u8 num_cores;
 	struct rga_core *cores[];
 };
 

-- 
2.54.0



^ permalink raw reply related

* [PATCH 03/17] media: rockchip: rga: move early return into if condition in vidioc_enum_fmt
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Instead of doing an early return when we don't have a capture device,
merge the condition with the following if condition. This improves
readability, as the condition now explicitly contains a check for a
capture device instead of returning when we don't have a capture device.

Also use the V4L2_TYPE_IS_CAPTURE helper and improve the comment.

The early-return if statement was copied from the vivid driver's
vivid_enum_fmt_vid function.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index efe5541078214..8c03422d669cf 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -372,12 +372,14 @@ static int vidioc_enum_fmt(struct file *file, void *priv, struct v4l2_fmtdesc *f
 	if (ret != 0)
 		return ret;
 
-	if (f->type != V4L2_BUF_TYPE_VIDEO_CAPTURE &&
-	    f->type != V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE)
-		return 0;
-
-	/* allow changing the quantization and xfer func for YUV formats */
-	if (v4l2_is_format_yuv(v4l2_format_info(f->pixelformat)))
+	/*
+	 * Allow changing the quantization and ycbcr_enc func for YUV formats
+	 * on the capture side for RGB -> YUV conversions.
+	 *
+	 * These flags are only relevant for capture devices.
+	 */
+	if (V4L2_TYPE_IS_CAPTURE(f->type) &&
+	    v4l2_is_format_yuv(v4l2_format_info(f->pixelformat)))
 		f->flags |= V4L2_FMT_FLAG_CSC_QUANTIZATION |
 			    V4L2_FMT_FLAG_CSC_YCBCR_ENC;
 

-- 
2.54.0



^ permalink raw reply related

* [PATCH 02/17] media: rockchip: rga: add comment about pixel alignment for YUV formats
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel
In-Reply-To: <20260606-spu-rga3multicore-v1-0-3ec2b15675f7@pengutronix.de>

Add a comment to clarify the use of fixed step_height values for all YUV
formats. While the commit introducing the change already explains the
reasoning, add an explicit comment to improve the visibility of the
reasoning.

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
 drivers/media/platform/rockchip/rga/rga.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index bd0afd33affe4..efe5541078214 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -414,6 +414,16 @@ static int vidioc_try_fmt(struct file *file, void *priv, struct v4l2_format *f)
 		.step_height = 1,
 	};
 
+	/*
+	 * Technically 4:2:2 YUV formats don't need a step_height of 2.
+	 * But for the RGA3 this is explicitly documented in  section 5.6.3
+	 * of the RK3588 TRM Part 2.
+	 * And the RGA2 vendor driver also checks that the height (and width)
+	 * is aligned to 2 when a YUV format is used.
+	 *
+	 * Therefore be safe and always align width and height to 2
+	 * when a YUV format is used.
+	 */
 	if (v4l2_is_format_yuv(v4l2_format_info(pix_fmt->pixelformat))) {
 		frmsize.step_width = 2;
 		frmsize.step_height = 2;

-- 
2.54.0



^ permalink raw reply related

* [PATCH 00/17] media: rockchip: rga: Add multi-core support
From: Sven Püschel @ 2026-06-05 22:06 UTC (permalink / raw)
  To: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Philipp Zabel
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	kernel, Detlev Casanova, Michael Tretter, Sven Püschel,
	Simon Xue, Joerg Roedel

Add multi-core support to the RGA (Raster Graphic Accelerator) driver
for Rockchip SoCs. This works by scheduling the given work to multiple
identical RGA cores. Previously other identical cores were discarded
while probing with -ENODEV to avoid exposing multiple video devices for
identical cores and breaking the ABI when adding an in-kernel scheduling.

This series targets the RK3588 SoC, which has one RGA2-Enhance core
and two RGA3 cores (see [1] for an overview of the different RGA cores).
The slimmed down RK3576 SoC also features two RGA2-Pro
(also described as RGA2.5) cores, but is currently not supported by
the driver. Tests are done on a Radxa Rock 5T SBC.

The scheduling is done only on a context level, which causes no
increased performance for a single stream (which uses only one mem2mem
context). Therefore at least N parallel streams are necessary to utilize
N cores. This avoids the more complex buffer handling required to avoid
mixing the frame ordering when one core is slightly faster than the
other (e.g. due to memory transfer timings or different clocks).

While the work is based on Detlev Casanova's multi-core series for the
rkvdec driver [2], it differs in two major aspects:

(1) It doesn't directly call v4l2_m2m_job_finish to mark the current job
as finished in the device_run callback. Detlev used this to trick the
m2m framework to directly schedule the next job. This looked like a
dirty hack and had me running into some of its pitfalls (e.g. the
difference between the v4l2_m2m_buf_done and the newly introduced
v4l2_m2m_buf_done_manual function).
Instead I've dropped the current curr_ctx member of the v4l2_m2m_dev
struct and added a max_parallel_jobs member to specify the maximum
number of parallel jobs. This allows the driver to set its maximum
number of parallel jobs with the newly introduced
v4l2_m2m_set_max_parallel_jobs function. The RGA driver uses it to set
its number of parallel jobs to its number of available cores. The m2m
framework then schedules the first N jobs on its job queue to the
device_run callback instead of only one.

(2) Instead of attaching an identical RGA core on probe to the first
probed RGA core instance, use component helpers to add all cores as
components to a virtual platform device. This has the advantage of only
creating the video device after all cores have been probed successfully
and tearing it down if one core is being removed (e.g. by the sysfs),
which otherwise could lead to nasty memory bugs. The implementation is
based on the driver of the etnaviv gpu. As the virtual platform device
doesn't have an iommu, we still allocate all relevant drives on the first
core, which shares its iommu domain with all other cores.

v4l2-compliance results:
    v4l2-compliance 1.32.0, 64 bits, 64-bit time_t
    ...
    	Card type        : rga2
    ...
    Total for rockchip-rga device /dev/video0: 48, Succeeded: 48, Failed: 0, Warnings: 0

    v4l2-compliance 1.32.0, 64 bits, 64-bit time_t
    ...
    	Card type        : rga3
    ...
    Total for rockchip-rga device /dev/video1: 48, Succeeded: 48, Failed: 0, Warnings: 0

The DTS and iommu changes at the end are picked out of other next trees
to provide an easy way to actually test the changes with an RGA3 on a
rk3588 SoC. They'll be dropped when they get into media/next.

Patch 1-3 address review comments from my last RGA3 patch series
Patch 4 additional driver cleanup
Patch 5 implements support for parallel jobs in the m2m framework
Patch 6-8 add multi core preparations to the driver
Patch 9-13 rework the driver to use component helpers
Patch 14 puts all cores into the same iommu domain
Patch 15 enables the multi-core support
Patch 16-17 just pick patches required for testing

[1] https://codeberg.org/airockchip/librga/src/branch/main/docs/Rockchip_Developer_Guide_RGA_EN.md#design-index
[2] https://lore.kernel.org/linux-media/20260409-rkvdec-multicore-v1-0-62b316abf0f7@collabora.com/

Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
---
Simon Xue (1):
      iommu/rockchip: disable fetch dte time limit

Sven Püschel (16):
      media: rockchip: rga: zero cmdbuf in shared code
      media: rockchip: rga: add comment about pixel alignment for YUV formats
      media: rockchip: rga: move early return into if condition in vidioc_enum_fmt
      media: rockchip: rga: removed unused regmap member
      media: v4l2-mem2mem: support running multiple jobs in parallel
      media: rockchip: rga:  move power handling to device_run
      media: rockchip: rga: adjust get_version to return the version
      media: rockchip: rga: add rga_core structure
      media: rockchip: rga: use components to manage multiple cores
      media: rockchip: rga: move rockchip_rga allocation to master probe
      media: rockchip: rga: move video device to the master
      media: rockchip: rga: move core initialization from bind to probe
      media: rockchip: rga: bind all cores to the master
      media: rockchip: rga: put all cores into first core iommu domain
      media: rockchip: rga: schedule jobs to multiple cores
      arm64: dts: rockchip: add rga3 dt nodes to rk3588

 arch/arm64/boot/dts/rockchip/rk3588-base.dtsi |  44 +++
 drivers/iommu/rockchip-iommu.c                |   8 +
 drivers/media/platform/rockchip/rga/rga-buf.c |  16 +-
 drivers/media/platform/rockchip/rga/rga-hw.c  |  40 +-
 drivers/media/platform/rockchip/rga/rga.c     | 501 +++++++++++++++++++-------
 drivers/media/platform/rockchip/rga/rga.h     |  45 ++-
 drivers/media/platform/rockchip/rga/rga3-hw.c |  32 +-
 drivers/media/v4l2-core/v4l2-mem2mem.c        |  89 +++--
 include/media/v4l2-mem2mem.h                  |   3 +
 9 files changed, 541 insertions(+), 237 deletions(-)
---
base-commit: 6a75e3d4f6428b90f398354212e3a2e0172851d6
change-id: 20260602-spu-rga3multicore-ae8c8caf01e9

Best regards,
--  
Sven Püschel <s.pueschel@pengutronix.de>

^ permalink raw reply

* Re: [PATCH v4 18/24] iommu/arm-smmu-v3: Introduce master->ats_broken flag
From: Nicolin Chen @ 2026-06-05 21:56 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Will Deacon, Robin Murphy, Joerg Roedel, Bjorn Helgaas,
	Rafael J . Wysocki, Len Brown, Pranjal Shrivastava, Mostafa Saleh,
	Lu Baolu, Kevin Tian, linux-arm-kernel, iommu, linux-kernel,
	linux-acpi, linux-pci, vsethi, Shuai Xue
In-Reply-To: <20260605194259.GE1962447@nvidia.com>

Thanks for the reply.

This is indeed a very complex and sophisticated topic..

On Fri, Jun 05, 2026 at 04:42:59PM -0300, Jason Gunthorpe wrote:
> I don't see any of these options as appealing. We have to maintain a
> few key invariants, and I think it cannot be done without a way to
> find all the domains that are using the STE.
> 
> One way or another you have to be using the invs list rw locks to
> synchronize the EATS state changes.
> 
> It is okayish to be sloppy when turning EATS off, but when turning it
> back on we do need to cycle through every invs list and toggle its
> lock to ensure that the invalidations are synchronized before
> EATS=enable happens.

I think the core guarantees that "cycle through every invs list"
happens: a PCI reset calls reset_prepare() blocking all the RID
and PASID domains and removing ATS entries from every invs list,
and then calls reset_done() that re-attach RID/PASID domains so
freshly new ATS entries will be installed before EATS=enable.

So, I think the enable path is not an issue, though the disable
path or the invalidation path would need "a way to find all the
domains that are using the STE".

> Given you must have a way to go from STE -> master -> all invs lists
> I'm not sure either option really makes such a large difference.
> 
> If so then adjusting the invs to disable the ATS is pretty simple, run
> over the xarray and set them all off. Yes you could find the master
> through a SID lookup with some locking adjustment.
> > 
> > (1) Per-invs marker: INV_TYPE_ATS_BROKEN + master_domains
> >     disable_ats() in the timeout path walks master->master_domains
> >     and flips matching ATS invs entries to the BROKEN type.
> > 
> >   + invs walker is free (one case label in the existing type switch).
> >   + No lock or pointer deref in the invs walker.
> >   + No master pointer stored in invs; no lifetime concern.
> > 
> >   - disable_ats() walks every (master, domain) and marks each invs
> >     set; the list needs locking usable from atomic.
> 
> This doesn't seem so bad

Yea, the only thing is that the disable path has to deal with a
complexity from going through a per-device domain list. Maybe it
can reuse iommu_group->pasid_array by taking xa_lock?

> > (2) Per-master flag + streams_lock
> >     invs walker resolves SID -> master via streams_lock and reads
> >     master->ats_broken.
> > 
> >   + Single source of truth on the master.
> >   + disable_ats() is one WRITE_ONCE.
> >   + atc_inv_master early-skips via one READ_ONCE.
> >   + attach gates ats_enabled on the flag; a concurrent quarantine
> >     race can be closed by a short post-attach re-check in commit()
> >   + No master pointer in invs; no lifetime concern.
> > 
> >   - invs walker pays streams_lock + rb_find(SID) per ATS entry on
> >     every invalidation. Measurable on ATS-heavy workloads.
> 
> Doesn't consider how to enable

The enable side is core-driven: when reset_done() re-attaches
the device from blocked_domain back to its RID/PASID domains,
the new attach_dev callback (old_domain == blocked_domain) can
clear the per-master flag. If the device is still broken, then
arm_smmu_atc_inv_master() at the end of attach_commit() times
out and re-triggers quarantine.

The flaw lives in the invalidation path as it must translate
every SID to master using streams_lock + rb_find(SID) per ATS
entry, which makes it much less attractive.

> > (3) Per-master flag + inv->master pointer (v4)
> >     invs entry carries a master pointer; the invs walker reads
> >     cur->master->ats_broken directly.
> > 
> >   + invs walker is one READ_ONCE through a cached pointer.
> >   + disable_ats is one WRITE_ONCE.
> >   + atc_inv_master early-skip via one READ_ONCE.
> >   + attach gate + post-attach re-check, same as (2).
> > 
> >   - invs holds a master ptr, so release_device must synchronize_rcu()
> >     before freeing the master to drain walkers under rcu_read_lock().
> >     We dropped this from v4 for that reason.
> 
> synchronize_rcu is not right because you have to have gone through the
> rwlock so there can be no readers.

Ah, I think you are right! When release_device() is invoked, the
device must be already in the release (blocked) domain. So there
should be no domain->invs in the system holding its ATS entries.
And the enable part would work as (2).

In this case, (3) seems the best? It's fast on every aspect.

And I think it would fit how we plan to generalize the invs design:

struct inv {
	struct arm_smmu_device *smmu; // => struct iommu_device *iommu;
	struct arm_smmu_master *master; // => void *priv;
					//    (dev->iommu->priv)

Thanks
Nicolin

^ permalink raw reply

* Re: [PATCH 4/4] arm64: route crash_smp_send_stop() last resort through SDEI
From: Kiryl Shutsemau @ 2026-06-05 21:46 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Catalin Marinas, Will Deacon, James Morse, Mark Rutland,
	Marc Zyngier, Petr Mladek, Thomas Gleixner, Andrew Morton,
	Baoquan He, Puranjay Mohan, Usama Arif, Breno Leitao,
	Julien Thierry, Lecopzer Chen, Sumit Garg, kernel-team, kexec,
	linux-arm-kernel, linux-kernel
In-Reply-To: <CAD=FV=X5++c-6Wd6babajiPbn07cfPcG0uW3ZeepznXgSVO2+w@mail.gmail.com>

On Fri, Jun 05, 2026 at 01:42:57PM -0700, Doug Anderson wrote:
> > +       sdei_nmi_crash_smp_send_stop();
> 
> It feels weird to me that you're adding SDEI for "crash stop" but not
> for regular "stop". It feels like you should modify smp_send_stop() to
> fall back to SDEI if sending the NMI failed, instead of adding this
> separate path.

Fair. A wedged CPU ignores the reboot-path stop just the same, and the
escalation logic already lives in smp.c, so I'll restructure in v2.

One thing to sort out there: this patch parks the stopped CPU inside
its SDEI handler without completing the event, which is fine for the
crash case (nothing expects the CPU back before reset), but a generic
stop path probably wants SDEI_EVENT_COMPLETE_AND_RESUME into a parking
stub instead, so that e.g. a regular kexec can bring all CPUs back up
in the new kernel. I'll look into that as part of the rework.

> > +               cpu_park_loop();
> > +               /* unreachable */
> 
> Any chance we could avoid duplicating stuff from ipi_cpu_crash_stop()?

Yes -- falls out of the above. I will look into this.

Maybe pull the save/offline/park body into a shared helper that both the
IPI handler and the SDEI handler call.

> > +bool sdei_nmi_crash_smp_send_stop(void)
> > +{
> > +       unsigned int this_cpu, cpu, remaining;
> > +       unsigned long timeout;
> > +       cpumask_t mask;
> 
> The above will probably get you a yell. Putting "cpumask_t" on the
> stack is a no-no since it can be quite large under certain CONFIG
> options. This is why it's nearly always defined as "static".

Doh! Will make it static in v2 -- safe here since the path is serialized
by the crash_stop guard.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

^ permalink raw reply

* Re: [PATCH 2/4] drivers/firmware: add SDEI cross-CPU NMI service for arm64
From: Kiryl Shutsemau @ 2026-06-05 21:29 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Catalin Marinas, Will Deacon, James Morse, Mark Rutland,
	Marc Zyngier, Petr Mladek, Thomas Gleixner, Andrew Morton,
	Baoquan He, Puranjay Mohan, Usama Arif, Breno Leitao,
	Julien Thierry, Lecopzer Chen, Sumit Garg, kernel-team, kexec,
	linux-arm-kernel, linux-kernel
In-Reply-To: <CAD=FV=XMqFVnri1aVGbFJhN6Ts3SeJUzEZrfN0Pqp9WeOzE=OA@mail.gmail.com>

On Fri, Jun 05, 2026 at 01:54:00PM -0700, Doug Anderson wrote:
> Hi,
> 
> On Wed, Jun 3, 2026 at 7:36 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
> >
> > @@ -928,11 +929,19 @@ static void arm64_backtrace_ipi(cpumask_t *mask)
> >  void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
> >  {
> >         /*
> > +        * Prefer the SDEI cross-CPU NMI provider when active: firmware
> > +        * dispatches the event out of EL3 and reaches CPUs that have
> > +        * interrupts locally masked, without the per-IRQ-mask cost that
> > +        * pseudo-NMI pays for the same reach. The plain IPI path below
> > +        * can't reach such a CPU unless pseudo-NMI is enabled.
> > +        *
> >          * NOTE: though nmi_trigger_cpumask_backtrace() has "nmi_" in the name,
> >          * nothing about it truly needs to be implemented using an NMI, it's
> >          * just that it's _allowed_ to work with NMIs. If ipi_should_be_nmi()
> >          * returned false our backtrace attempt will just use a regular IPI.
> >          */
> > +       if (sdei_nmi_trigger_cpumask_backtrace(mask, exclude_cpu))
> > +               return;
> >         nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi);
> 
> nit: instead of one comment block, I would have broken it up in two. Like:
> 
> /*
>  * Prefer the SDEI ...
>  */
> if (sdei_nmi_trigger_cpumask_backtrace(mask, exclude_cpu))
>   return;
> 
> /*
>  * NOTE: though ...
>  */
> nmi_trigger_cpumask_backtrace(...);

Makes sense.

> >  }
> >
> > diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
> > index bbd2155d8483..6501087ff90d 100644
> > --- a/drivers/firmware/Kconfig
> > +++ b/drivers/firmware/Kconfig
> > @@ -36,6 +36,25 @@ config ARM_SDE_INTERFACE
> >           standard for registering callbacks from the platform firmware
> >           into the OS. This is typically used to implement RAS notifications.
> >
> > +config ARM_SDEI_NMI
> > +       bool "SDEI-based cross-CPU NMI service (arm64)"
> > +       depends on ARM64 && ARM_SDE_INTERFACE
> > +       help
> > +         Provides SDEI-based cross-CPU NMI delivery for hooks that need
> > +         to reach interrupt-masked CPUs on silicon that lacks FEAT_NMI:
> > +
> > +           - arch_trigger_cpumask_backtrace()  (sysrq-l, RCU stalls,
> > +             hardlockup_all_cpu_backtrace, soft-lockup secondary dumps,
> > +             hung-task auxiliary dumps)
> > +
> > +         The driver registers a handler for the SDEI software-signalled
> > +         event (event 0) and reaches a target CPU by signalling it with
> > +         SDEI_EVENT_SIGNAL. Firmware delivers the event out of EL3
> > +         regardless of the target's PSTATE.DAIF -- forced delivery into a
> > +         CPU wedged with interrupts locally masked.
> > +
> > +         If unsure, say N.
> 
> Is there some downside to this? It seems like anyone who has the SDE
> interface would want this. Not sure why you'd suggest people say "N".

No real downside -- without the software-signalled event the driver
stays inert, and there is no cost until an event actually fires.

The "say N" is caution, not a technical limit: so far this has run on
QEMU (TF-A) and one hardware platform, and the interesting paths depend
on each vendor's SDEI implementation at EL3. I'm not sure vendors would
care to run SDEI_EVENT_SIGNAL validation. Maybe we want to see more
data points first?

But maybe I am too cautious. Happy to flip the recommendation (or add
default y) in v2 if that's the consensus.

> Other than the nit, this looks reasoanble to me, though I'm a complete
> noob when it comes to SDEI...
> 
> Reviewed-by: Douglas Anderson <dianders@chromium.org>

Thanks!

-- 
  Kiryl Shutsemau / Kirill A. Shutemov


^ permalink raw reply

* Re: [PATCH 3/4] arm64: wire SDEI NMI into the hardlockup watchdog
From: Kiryl Shutsemau @ 2026-06-05 21:11 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Catalin Marinas, Will Deacon, James Morse, Mark Rutland,
	Marc Zyngier, Petr Mladek, Thomas Gleixner, Andrew Morton,
	Baoquan He, Puranjay Mohan, Usama Arif, Breno Leitao,
	Julien Thierry, Lecopzer Chen, Sumit Garg, kernel-team, kexec,
	linux-arm-kernel, linux-kernel
In-Reply-To: <CAD=FV=U4eJ__dQc1e8CGgj5sMDNrD1MgEEy9Cgj9M5n-WmYAXA@mail.gmail.com>

On Fri, Jun 05, 2026 at 01:03:05PM -0700, Doug Anderson wrote:
> Hi,
> 
> On Wed, Jun 3, 2026 at 7:36 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
> >
> > From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
> >
> > Select HAVE_HARDLOCKUP_DETECTOR_ARCH so the framework takes its backend
> > from this driver. A per-CPU hrtimer checks its buddy's heartbeat and
> > signals event 0 at a stalled CPU, which runs watchdog_hardlockup_check()
> > NMI-like.
> >
> > The source is chosen at boot: SDEI if firmware provides it, otherwise a
> > perf-NMI counter (pseudo-NMI) fallback -- one image covers both.
> >
> > Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
> > ---
> >  arch/arm64/Kconfig          |   1 +
> >  drivers/firmware/Kconfig    |   3 +
> >  drivers/firmware/sdei_nmi.c | 247 +++++++++++++++++++++++++++++++++++-
> >  3 files changed, 248 insertions(+), 3 deletions(-)
> 
> I'm a little confused about this patch. We already have a buddy
> hardlockup detector using the hrtimer, and it's even been improved
> recently to trigger in a smaller time bound. It looks as if you're
> duplicating bits of the perf and buddy detector here?
> 
> I don't think you need this patch at all. The existing buddy detector
> + patches #1 and #2 in your series should be sufficient.

You're mostly right.

Buddy + #2 covers the console case (the remote branch triggers the
culprit's backtrace, which #2 makes deliverable), and #4 gets the wedged
CPU's registers into the vmcore.

The one thing this patch adds that a config can't is boot-time source
selection: PERF-compiled kernels have no detector on a pseudo_nmi=0
boot, and PREFER_BUDDY costs the pseudo-NMI machines perf
self-detection. But that's arguably out of scope for the patchset.

I'll drop this patch in v2 and run PREFER_BUDDY here. If a runtime
perf->buddy fallback ever materializes, the gap closes entirely.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov


^ permalink raw reply

* Re: [PATCH bpf-next v2 8/8] selftests/bpf: add tests to validate KASAN on JIT programs
From: Alexis Lothoré @ 2026-06-05 20:55 UTC (permalink / raw)
  To: Yonghong Song, Alexis Lothoré, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Song Liu, Jiri Olsa,
	John Fastabend, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Shuah Khan, Maxime Coquelin,
	Alexandre Torgue, Ihor Solodrai
  Cc: ebpf, Bastien Curutchet, Thomas Petazzoni, bpf, linux-kernel,
	linux-kselftest, linux-stm32, linux-arm-kernel
In-Reply-To: <f73d0971-0544-4a92-bde7-b2fbfcdaf28b@linux.dev>

On Fri Jun 5, 2026 at 7:20 PM CEST, Yonghong Song wrote:

[...]

>> Are you seeing any kasan report when you manually check your kernel
>> logs, or not at all ? If not at all, are you using the "CI" defconfig ?
>
> I do see one report:
>
> [   79.503059] ==================================================================
> [   79.503715] BUG: KASAN: slab-use-after-free in bpf_prog_bb753b2ee1f69aa0_st_not_on_stack+0x115/0x160
> [   79.503715] Write of size 1 at addr ff11000117210a20 by task test_progs/2153
>                                                                                                                                                                 
> [   79.503715] CPU: 6 UID: 0 PID: 2153 Comm: test_progs Tainted: G           OE       7.1.0-rc5-gd552a156c2fa #1926 PREEMPT(full)
> [   79.503715] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
> [   79.503715] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
> [   79.503715] Call Trace:
> [   79.503715]  <TASK>
> [   79.503715]  dump_stack_lvl+0x6d/0xa0
> [   79.503715]  print_address_description+0x77/0x200
> [   79.503715]  print_report+0x58/0x70
> [   79.503715]  ? bpf_prog_bb753b2ee1f69aa0_st_not_on_stack+0x115/0x160
> [   79.503715]  kasan_report+0xa2/0xe0
> [   79.503715]  ? bpf_prog_bb753b2ee1f69aa0_st_not_on_stack+0x115/0x160
> [   79.503715]  ? bpf_test_run+0x208/0x770
> [   79.503715]  bpf_prog_bb753b2ee1f69aa0_st_not_on_stack+0x115/0x160
> [   79.503715]  bpf_test_run+0x472/0x770
> [   79.503715]  ? srso_alias_return_thunk+0x5/0xfbef5
> [   79.503715]  ? __lock_acquire+0xe4a/0x2a10
> [   79.503715]  ? __pfx___css_rstat_updated+0x10/0x10
> [   79.503715]  ? __lock_acquire+0xe4a/0x2a10
> [   79.503715]  ? __pfx_bpf_test_run+0x10/0x10
> [   79.503715]  ? srso_alias_return_thunk+0x5/0xfbef5
> [   79.503715]  ? lock_acquire+0xfd/0x2b0
> [   79.503715]  ? srso_alias_return_thunk+0x5/0xfbef5
> [   79.503715]  ? srso_alias_return_thunk+0x5/0xfbef5
> [   79.503715]  ? rcu_is_watching+0x1f/0xa0
> [   79.503715]  ? srso_alias_return_thunk+0x5/0xfbef5
> [   79.503715]  ? __kasan_krealloc+0xe9/0x110
> [   79.503715]  ? eth_type_trans+0x4b9/0x5f0
> [   79.503715]  bpf_prog_test_run_skb+0xddf/0x22f0
> [   79.503715]  ? __fget_files+0x29/0x350
> [   79.503715]  ? srso_alias_return_thunk+0x5/0xfbef5
> [   79.503715]  ? __fget_files+0x29/0x350
> [   79.503715]  bpf_prog_test_run+0x1cc/0x2d0
> [   79.503715]  __sys_bpf+0x740/0xa30
> [   79.503715]  ? __pfx___sys_bpf+0x10/0x10
> [   79.503715]  ? _prb_read_valid+0x334/0x770
> [   79.503715]  ? handle_mm_fault+0x91b/0xc00
> [   79.503715]  __x64_sys_bpf+0xba/0xd0
> [   79.503715]  do_syscall_64+0xee/0x400
> [   79.503715]  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   79.503715]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   79.503715] RIP: 0033:0x7f92d8cfe1ad
> [   79.503715] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 8
> [   79.503715] RSP: 002b:00007ffe4237fee8 EFLAGS: 00000206 ORIG_RAX: 0000000000000141
> [   79.503715] RAX: ffffffffffffffda RBX: 00007ffe423807b8 RCX: 00007f92d8cfe1ad
> [   79.503715] RDX: 0000000000000050 RSI: 00007ffe4237ff70 RDI: 000000000000000a
> [   79.503715] RBP: 00007ffe4237ff10 R08: 0000000000000000 R09: 0000000000000050
> [   79.503715] R10: 0000000000000064 R11: 0000000000000206 R12: 0000000000000000
> [   79.503715] R13: 00007ffe423807d8 R14: 00007f92d8eb9000 R15: 00005585778dd150
> [   79.503715]  </TASK>
>
> [   79.503715] Allocated by task 2153:
> [   79.503715]  kasan_save_track+0x2f/0x70
> [   79.503715]  __kasan_kmalloc+0x72/0x90
> [   79.503715]  __kmalloc_node_noprof+0x34c/0x730
> [   79.503715]  bpf_map_area_alloc+0x4a/0x110
> [   79.503715]  array_map_alloc+0x19e/0x580
> [   79.503715]  map_create+0x8b2/0x1500
> [   79.503715]  __sys_bpf+0x7ea/0xa30
> [   79.503715]  __x64_sys_bpf+0xba/0xd0
> [   79.503715]  do_syscall_64+0xee/0x400
> [   79.503715]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> [   79.503715] The buggy address belongs to the object at ff11000117210800
>                  which belongs to the cache kmalloc-cg-1k of size 1024
> [   79.503715] The buggy address is located 0 bytes to the right of
>                  freed 544-byte region [ff11000117210800, ff11000117210a20)
>
> [   79.503715] The buggy address belongs to the physical page:
> [   79.503715] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x117210
> [   79.503715] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
> [   79.503715] memcg:ff11000117210411
> [   79.503715] flags: 0x200000000000040(head|node=0|zone=2)
> [   79.503715] page_type: f5(slab)
> [   79.503715] raw: 0200000000000040 ff11000100072000 dead000000000100 dead000000000122
> [   79.503715] raw: 0000000000000000 0000080000100010 00000000f5000000 ff11000117210411
> [   79.503715] head: 0200000000000040 ff11000100072000 dead000000000100 dead000000000122
> [   79.503715] head: 0000000000000000 0000080000100010 00000000f5000000 ff11000117210411
> [   79.503715] head: 0200000000000003 fffffffffffffe01 00000000ffffffff 00000000ffffffff
> [   79.503715] head: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000008
> [   79.503715] page dumped because: kasan: bad access detected
>
> [   79.503715] Memory state around the buggy address:
> [   79.503715]  ff11000117210900: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> [   79.503715]  ff11000117210980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> [   79.503715] >ff11000117210a00: 00 00 00 00 fb fb fc fc fc fc fc fc fc fc fc fc
> [   79.503715]                                ^
> [   79.503715]  ff11000117210a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> [   79.503715]  ff11000117210b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> [   79.503715] ==================================================================
>
>
> But when I am running another same test './test_progs -t kasan', there is no kasan reports.

Ok, I guess you are missing kasan_multi_shot on your kernel command
line: without this option, only the first report is generated, and
KASAN does not emit any additional reports until you restart your kernel.
Could you please try adding it and running the tests again?

Thanks,

Alexis

>>
>>    cat tools/testing/selftests/bpf/{config,config.vm,config.x86_64} > .config && make olddefconfig
>>
>> If not, would you mind sharing your defconfig ?
>
> Attached.
>
>>
>> Thanks,
>>
>> Alexis




-- 
Alexis Lothoré, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com



^ permalink raw reply

* Re: [PATCH 2/4] drivers/firmware: add SDEI cross-CPU NMI service for arm64
From: Doug Anderson @ 2026-06-05 20:54 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Catalin Marinas, Will Deacon, James Morse, Mark Rutland,
	Marc Zyngier, Petr Mladek, Thomas Gleixner, Andrew Morton,
	Baoquan He, Puranjay Mohan, Usama Arif, Breno Leitao,
	Julien Thierry, Lecopzer Chen, Sumit Garg, kernel-team, kexec,
	linux-arm-kernel, linux-kernel, Kiryl Shutsemau (Meta)
In-Reply-To: <145b9e98b12a7d314fc4a203075f65c3a0c3a913.1780496779.git.kas@kernel.org>

Hi,

On Wed, Jun 3, 2026 at 7:36 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> @@ -928,11 +929,19 @@ static void arm64_backtrace_ipi(cpumask_t *mask)
>  void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
>  {
>         /*
> +        * Prefer the SDEI cross-CPU NMI provider when active: firmware
> +        * dispatches the event out of EL3 and reaches CPUs that have
> +        * interrupts locally masked, without the per-IRQ-mask cost that
> +        * pseudo-NMI pays for the same reach. The plain IPI path below
> +        * can't reach such a CPU unless pseudo-NMI is enabled.
> +        *
>          * NOTE: though nmi_trigger_cpumask_backtrace() has "nmi_" in the name,
>          * nothing about it truly needs to be implemented using an NMI, it's
>          * just that it's _allowed_ to work with NMIs. If ipi_should_be_nmi()
>          * returned false our backtrace attempt will just use a regular IPI.
>          */
> +       if (sdei_nmi_trigger_cpumask_backtrace(mask, exclude_cpu))
> +               return;
>         nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi);

nit: instead of one comment block, I would have broken it up in two. Like:

/*
 * Prefer the SDEI ...
 */
if (sdei_nmi_trigger_cpumask_backtrace(mask, exclude_cpu))
  return;

/*
 * NOTE: though ...
 */
nmi_trigger_cpumask_backtrace(...);



>  }
>
> diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
> index bbd2155d8483..6501087ff90d 100644
> --- a/drivers/firmware/Kconfig
> +++ b/drivers/firmware/Kconfig
> @@ -36,6 +36,25 @@ config ARM_SDE_INTERFACE
>           standard for registering callbacks from the platform firmware
>           into the OS. This is typically used to implement RAS notifications.
>
> +config ARM_SDEI_NMI
> +       bool "SDEI-based cross-CPU NMI service (arm64)"
> +       depends on ARM64 && ARM_SDE_INTERFACE
> +       help
> +         Provides SDEI-based cross-CPU NMI delivery for hooks that need
> +         to reach interrupt-masked CPUs on silicon that lacks FEAT_NMI:
> +
> +           - arch_trigger_cpumask_backtrace()  (sysrq-l, RCU stalls,
> +             hardlockup_all_cpu_backtrace, soft-lockup secondary dumps,
> +             hung-task auxiliary dumps)
> +
> +         The driver registers a handler for the SDEI software-signalled
> +         event (event 0) and reaches a target CPU by signalling it with
> +         SDEI_EVENT_SIGNAL. Firmware delivers the event out of EL3
> +         regardless of the target's PSTATE.DAIF -- forced delivery into a
> +         CPU wedged with interrupts locally masked.
> +
> +         If unsure, say N.

Is there some downside to this? It seems like anyone who has the SDE
interface would want this. Not sure why you'd suggest people say "N".

Other than the nit, this looks reasonable to me, though I'm a complete
noob when it comes to SDEI...

Reviewed-by: Douglas Anderson <dianders@chromium.org>


^ permalink raw reply

* Re: [PATCH 1/4] firmware: arm_sdei: add SDEI_EVENT_SIGNAL support
From: Doug Anderson @ 2026-06-05 20:46 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Catalin Marinas, Will Deacon, James Morse, Mark Rutland,
	Marc Zyngier, Petr Mladek, Thomas Gleixner, Andrew Morton,
	Baoquan He, Puranjay Mohan, Usama Arif, Breno Leitao,
	Julien Thierry, Lecopzer Chen, Sumit Garg, kernel-team, kexec,
	linux-arm-kernel, linux-kernel, Kiryl Shutsemau (Meta)
In-Reply-To: <ba8074cdb9ca5a471162cbc15f775c1567a3992a.1780496779.git.kas@kernel.org>

Hi,

On Wed, Jun 3, 2026 at 7:36 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
>
> Add sdei_event_signal(), a thin wrapper over the SDEI_EVENT_SIGNAL call
> (DEN0054) that makes the software-signalled event (event 0) pending on a
> target PE -- delivered NMI-like even when that PE has interrupts masked.
> It takes no locks, so it is safe to call from NMI / crash context.
>
> Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
> ---
>  drivers/firmware/arm_sdei.c   | 12 ++++++++++++
>  include/linux/arm_sdei.h      |  6 ++++++
>  include/uapi/linux/arm_sdei.h |  1 +
>  3 files changed, 19 insertions(+)

I'd never looked at SDEI before this (so my review is probably not
terribly strong), but this looks reasonable to me.

Reviewed-by: Douglas Anderson <dianders@chromium.org>


^ permalink raw reply

* Re: [PATCH 4/4] arm64: route crash_smp_send_stop() last resort through SDEI
From: Doug Anderson @ 2026-06-05 20:42 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Catalin Marinas, Will Deacon, James Morse, Mark Rutland,
	Marc Zyngier, Petr Mladek, Thomas Gleixner, Andrew Morton,
	Baoquan He, Puranjay Mohan, Usama Arif, Breno Leitao,
	Julien Thierry, Lecopzer Chen, Sumit Garg, kernel-team, kexec,
	linux-arm-kernel, linux-kernel, Kiryl Shutsemau (Meta)
In-Reply-To: <54cb99db3c981dc39eb3031aff5caeaadb09e8b9.1780496779.git.kas@kernel.org>

Hi,

On Wed, Jun 3, 2026 at 7:36 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> @@ -1288,8 +1288,32 @@ void crash_smp_send_stop(void)
>                 return;
>         crash_stop = 1;
>
> +       /*
> +        * Stop the normal way first: IPI_CPU_STOP escalating to a pseudo-NMI
> +        * IPI. Every CPU that responds saves its state via crash_save_cpu()
> +        * and parks in cpu_park_loop() with its online bit cleared -- the
> +        * standard kdump stop, identical to a kernel without SDEI. Crucially
> +        * those CPUs stay in a clean, potentially-reusable state.
> +        */
>         smp_send_stop();
>
> +       /*
> +        * Whatever is still online didn't respond -- typically a CPU wedged
> +        * with interrupts masked. The plain IPI can't reach it, and a fleet
> +        * that declines the pseudo-NMI hot-path cost has no NMI IPI to
> +        * escalate to. Hit only the survivors with the SDEI cross-CPU NMI
> +        * (no-op if SDEI isn't active, or if everything already stopped):
> +        * firmware delivers out of EL3 regardless of PSTATE.DAIF, and the
> +        * handler captures crash_save_cpu() state from the wedged context
> +        * before parking the CPU.
> +        *
> +        * SDEI is deliberately last: an SDEI-stopped CPU never completes its
> +        * event (it parks inside the handler, so EL3 retains its dispatch
> +        * slot until reset), which is strictly less recoverable than a normal
> +        * stop. We pay that only for CPUs that left no other way to reach them.
> +        */
> +       sdei_nmi_crash_smp_send_stop();

It feels weird to me that you're adding SDEI for "crash stop" but not
for regular "stop". It feels like you should modify smp_send_stop() to
fall back to SDEI if sending the NMI failed, instead of adding this
separate path.


>  static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
>  {
> +       int cpu = smp_processor_id();
> +
> +       if (READ_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_requested))) {
> +               WRITE_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_requested), 0);
> +
> +               /*
> +                * Capture the wedged context for kdump while pt_regs still
> +                * points at the interrupted PC. This is the main motivation
> +                * for using SDEI here: the plain IPI stop path can't reach an
> +                * interrupt-masked CPU (and the fleet declines pseudo-NMI to
> +                * keep the IRQ-mask hot path cheap), so crash_save_cpu() for
> +                * that CPU would otherwise record nothing useful.
> +                */
> +               crash_save_cpu(regs, cpu);
> +               set_cpu_online(cpu, false);
> +
> +               /* publish the crash state/offline before the requester sees the ack */
> +               smp_wmb();
> +               WRITE_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_acked), 1);
> +
> +               /*
> +                * Park forever from within the SDEI handler. We deliberately
> +                * do NOT issue SDEI_EVENT_COMPLETE: the framework's return
> +                * path restores firmware's saved interrupted context, which
> +                * would land the CPU back wherever it was running (often
> +                * do_idle, which then notices cpu_is_offline=true and BUGs
> +                * at cpuhp_report_idle_dead). Returning the modified pt_regs
> +                * doesn't help -- arch/arm64/kernel/sdei.c::do_sdei_event
> +                * only honours a PC override via its IRQ-state heuristic
> +                * and otherwise hands EL3 its own saved-context slot back.
> +                *
> +                * Trade-off: EL3 firmware retains ~one saved-context slot
> +                * per parked CPU until the next hardware reset (~hundreds of
> +                * bytes per CPU). The CPU itself is parked in cpu_park_loop
> +                * exactly as if IPI_CPU_STOP had stopped it; recoverability
> +                * is unchanged versus the existing path (neither is
> +                * recoverable without hardware reset, since PSCI sees the
> +                * CPU as ALREADY_ON in both cases).
> +                */
> +               cpu_park_loop();
> +               /* unreachable */

Any chance we could avoid duplicating stuff from ipi_cpu_crash_stop()?


> +bool sdei_nmi_crash_smp_send_stop(void)
> +{
> +       unsigned int this_cpu, cpu, remaining;
> +       unsigned long timeout;
> +       cpumask_t mask;

The above will probably get you a yell. Putting "cpumask_t" on the
stack is a no-no since it can be quite large under certain CONFIG
options. This is why it's nearly always defined as "static".

-Doug


^ permalink raw reply

* [PATCH 2/3] soc: samsung: exynos-pmu: fix use-after-free of interrupt generator node
From: Alexey Klimov @ 2026-06-05 20:18 UTC (permalink / raw)
  To: Krzysztof Kozlowski, Alim Akhtar, Peter Griffin
  Cc: Sam Protsenko, linux-samsung-soc, linux-arm-kernel, linux-kernel,
	stable, Sashiko
In-Reply-To: <20260605-exynos-pmu-cpuhp-idle-fixes-v1-0-0cd05c81a82d@linaro.org>

setup_cpuhp_and_cpuidle() parses the device tree node for the
interrupt generation block via of_parse_phandle() and drops its
reference with of_node_put() immediately after fetching the resource
address. However, the intr_gen_node pointer is later passed to
of_syscon_register_regmap(), after that reference has already been dropped.

Fix this by moving the of_node_put() invocation to after the
of_syscon_register_regmap() call, and adding it to correct error paths.

Reported-by: Sashiko <sashiko-bot@kernel.org>
Closes: https://sashiko.dev/#/patchset/20260513-exynos850-cpuhotplug-v4-0-54fec5f65362@linaro.org?part=3
Fixes: 78b72897a5c8 ("soc: samsung: exynos-pmu: Enable CPU Idle for gs101")
Cc: stable@vger.kernel.org
Signed-off-by: Alexey Klimov <alexey.klimov@linaro.org>
---
 drivers/soc/samsung/exynos-pmu.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/soc/samsung/exynos-pmu.c b/drivers/soc/samsung/exynos-pmu.c
index 6e635872247a..9636287f6794 100644
--- a/drivers/soc/samsung/exynos-pmu.c
+++ b/drivers/soc/samsung/exynos-pmu.c
@@ -428,23 +428,30 @@ static int setup_cpuhp_and_cpuidle(struct device *dev)
 	 * syscon provided regmap.
 	 */
 	ret = of_address_to_resource(intr_gen_node, 0, &intrgen_res);
-	of_node_put(intr_gen_node);
+	if (ret) {
+		of_node_put(intr_gen_node);
+		return ret;
+	}
 
 	virt_addr = devm_ioremap(dev, intrgen_res.start,
 				 resource_size(&intrgen_res));
-	if (!virt_addr)
+	if (!virt_addr) {
+		of_node_put(intr_gen_node);
 		return -ENOMEM;
+	}
 
 	pmu_context->pmuintrgen = devm_regmap_init_mmio(dev, virt_addr,
 							&regmap_pmu_intr);
 	if (IS_ERR(pmu_context->pmuintrgen)) {
 		dev_err(dev, "failed to initialize pmu-intr-gen regmap\n");
+		of_node_put(intr_gen_node);
 		return PTR_ERR(pmu_context->pmuintrgen);
 	}
 
 	/* register custom mmio regmap with syscon */
 	ret = of_syscon_register_regmap(intr_gen_node,
 					pmu_context->pmuintrgen);
+	of_node_put(intr_gen_node);
 	if (ret)
 		return ret;
 

-- 
2.51.0



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox