Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [RESEND v3 1/3] KVM: arm64: Reset page order in pKVM hyp_pool
From: Vincent Donnefort @ 2026-05-21 14:36 UTC (permalink / raw)
  To: maz, oliver.upton, joey.gouly, suzuki.poulose, yuzenghui,
	catalin.marinas, will
  Cc: linux-arm-kernel, kvmarm, kernel-team, qperret, tabba,
	Vincent Donnefort, Sashiko
In-Reply-To: <20260521143626.1005660-1-vdonnefort@google.com>

When a VM fails to initialise after its stage-2 hyp_pool has been
initialised, that stage-2 must be torn down entirely. This requires
resetting both the refcount and the order of its pages back to 0.

Currently, reclaim_pgtable_pages() implicitly resets the page order by
allocating the entire pool with order-0 granularity. However, in the VM
initialisation error path, the addresses of the donated memory (the PGD)
are already known, making it unnecessary to iterate over all pages in
the pool.

Since the vmemmap page order is a hyp_pool-specific field, leaving a
non-zero order on hyp_pool destruction is harmless until another pool
attempts to admit the page. Instead of resetting this field during
destruction, reset it during pool initialization in hyp_pool_init().

For 'external' pages, we can't trust the order either as they bypass
hyp_pool_init(). Since we never coalesce them, enforce order-0 to ensure
safe insertion into the pool.

This leaves no vmemmap order users outside of hyp_pool.

Fixes: 256b4668cd89 ("KVM: arm64: Introduce separate hypercalls for pKVM VM reservation and initialization")
Reported-by: Sashiko <sashiko-bot@kernel.org>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 25f04629014e..fa447d400b71 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -217,7 +217,6 @@ static void *guest_s2_zalloc_page(void *mc)
 	memset(addr, 0, PAGE_SIZE);
 	p = hyp_virt_to_page(addr);
 	p->refcount = 1;
-	p->order = 0;
 
 	return addr;
 }
@@ -322,7 +321,6 @@ void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
 	while (addr) {
 		page = hyp_virt_to_page(addr);
 		page->refcount = 0;
-		page->order = 0;
 		push_hyp_memcache(mc, addr, hyp_virt_to_phys);
 		WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
 		addr = hyp_alloc_pages(&vm->pool, 0);
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index a1eb27a1a747..57f86aa0f82f 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -94,13 +94,22 @@ static void __hyp_attach_page(struct hyp_pool *pool,
 			      struct hyp_page *p)
 {
 	phys_addr_t phys = hyp_page_to_phys(p);
-	u8 order = p->order;
 	struct hyp_page *buddy;
+	bool coalesce = true;
+	u8 order = p->order;
 
-	memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
+	/*
+	 * 'external' pages are never coalesced and their ->order field
+	 * untrusted as they bypass hyp_pool_init(). Enforce order-0.
+	 */
+	if (phys < pool->range_start || phys >= pool->range_end) {
+		order = 0;
+		coalesce = false;
+	}
+
+	memset(hyp_page_to_virt(p), 0, PAGE_SIZE << order);
 
-	/* Skip coalescing for 'external' pages being freed into the pool. */
-	if (phys < pool->range_start || phys >= pool->range_end)
+	if (!coalesce)
 		goto insert;
 
 	/*
@@ -237,8 +246,10 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
 
 	/* Init the vmemmap portion */
 	p = hyp_phys_to_page(phys);
-	for (i = 0; i < nr_pages; i++)
+	for (i = 0; i < nr_pages; i++) {
 		hyp_set_page_refcounted(&p[i]);
+		p[i].order = 0;
+	}
 
 	/* Attach the unused pages to the buddy tree */
 	for (i = reserved_pages; i < nr_pages; i++)
-- 
2.54.0.746.g67dd491aae-goog



^ permalink raw reply related

* Re: [PATCH v5 5/8] firmware: raspberrypi: register nvmem driver
From: Thomas Weißschuh @ 2026-05-21 14:38 UTC (permalink / raw)
  To: Gregor Herburger
  Cc: Rob Herring, Krzysztof Kozlowski, Conor Dooley, Florian Fainelli,
	Ray Jui, Scott Branden, Broadcom internal kernel review list,
	Eric Anholt, Stefan Wahren, Srinivas Kandagatla, Kees Cook,
	Gustavo A. R. Silva, devicetree, linux-rpi-kernel,
	linux-arm-kernel, linux-kernel, linux-hardening
In-Reply-To: <20260520-rpi-otp-driver-v5-5-b26e5908eeac@linutronix.de>

On Wed, May 20, 2026 at 04:27:57PM +0200, Gregor Herburger wrote:
> The Raspberry Pi firmware exposes two regions with otp registers. The
> first region called "customer otp" is available on all Raspberry Pi
> models. The second is only available on the Raspberry Pi 5 (bcm2712).
> 
> Signed-off-by: Gregor Herburger <gregor.herburger@linutronix.de>

Reviewed-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>

> ---
>  drivers/firmware/raspberrypi.c             | 59 +++++++++++++++++++++++++++++-
>  include/soc/bcm2835/raspberrypi-firmware.h |  5 +++
>  2 files changed, 63 insertions(+), 1 deletion(-)

(...)

> +static void rpi_register_otp_driver(struct device *dev)
> +{
> +	struct rpi_firmware *fw = dev_get_drvdata(dev);
> +
> +	rpi_otp_customer = platform_device_register_data(dev, "raspberrypi-otp",
> +							 PLATFORM_DEVID_AUTO,
> +							 &rpi_otp_customer_data,
> +							 sizeof(rpi_otp_customer_data));
> +
> +	if (IS_ERR(rpi_otp_customer))
> +		dev_err(dev, "Failed to register customer OTP device: %ld\n",
> +			PTR_ERR(rpi_otp_customer));

These should use %pe which will nicely format the error.
This also differs from the other subdevice registrations which do not
print a message on error.

> +
> +	if (fw->soc == RPI_FIRMWARE_SOC_BCM2712) {
> +		rpi_otp_private = platform_device_register_data(dev, "raspberrypi-otp",
> +								PLATFORM_DEVID_AUTO,
> +								&rpi_otp_private_data,
> +								sizeof(rpi_otp_private_data));
> +
> +		if (IS_ERR(rpi_otp_private))
> +			dev_err(dev, "Failed to register private OTP device: %ld\n",
> +				PTR_ERR(rpi_otp_private));
> +	}
> +}

(...)


^ permalink raw reply

* Re: [PATCH v7 16/28] media: rockchip: rga: reuse cmdbuf contents
From: Michael Tretter @ 2026-05-21 14:39 UTC (permalink / raw)
  To: Sven Püschel
  Cc: Jacob Chen, Ezequiel Garcia, Mauro Carvalho Chehab,
	Heiko Stuebner, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Hans Verkuil, linux-media, linux-rockchip, linux-arm-kernel,
	linux-kernel, devicetree, kernel, nicolas, sebastian.reichel,
	p.zabel
In-Reply-To: <20260521-spu-rga3-v7-16-3f33e8c7145f@pengutronix.de>

On Thu, 21 May 2026 00:44:21 +0200, Sven Püschel wrote:
> Reuse the command buffer contents instead of completely writing it
> for every frame. Therefore we only need to replace the source and
> destination addresses for each frame. This reduces the amount of CPU
> and memory operations done in each frame. A new cmdbuf_dirty flag notes
> if the cmdbuf has to be rewritten on the next frame.
> 
> The initial idea of initializing the cmdbuf on streamon broke the
> ability to update controls while streaming (e.g. mirroring).
> 
> Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>
> 
> ---
> Changes in v6:
> - Reworked the commit to not setup the cmdbuf on streamon but rather
>   re-initialize it on the next frame when something changed.
> - Sashiko flagged the cmdbuf setup at streamon:
>   https://sashiko.dev/#/patchset/20260428-spu-rga3-v5-0-eb7f5d019d86%40pengutronix.de?part=17
> - Dropped Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
>   due to the reworked patch and commit message contents
> 
> Changes in v5:
> - Don't set the flipping and rotation values at streamon and preventing
>   the userspace from changing them at runtime
> ---
>  drivers/media/platform/rockchip/rga/rga-hw.c | 13 +++++++++----
>  drivers/media/platform/rockchip/rga/rga.c    | 11 +++++++++--
>  drivers/media/platform/rockchip/rga/rga.h    |  2 ++
>  3 files changed, 20 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/media/platform/rockchip/rga/rga-hw.c b/drivers/media/platform/rockchip/rga/rga-hw.c
> index dac3cb6aa17d3..567d39e58d33f 100644
> --- a/drivers/media/platform/rockchip/rga/rga-hw.c
> +++ b/drivers/media/platform/rockchip/rga/rga-hw.c
> @@ -417,8 +417,6 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  {
>  	struct rockchip_rga *rga = ctx->rga;
>  
> -	memset(ctx->cmdbuf_virt, 0, RGA_CMDBUF_SIZE);
> -
>  	rga_cmd_set_src_addr(ctx, src->dma_desc_pa);
>  	/*
>  	 * Due to hardware bug,
> @@ -427,11 +425,9 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  	rga_cmd_set_src1_addr(ctx, dst->dma_desc_pa);
>  
>  	rga_cmd_set_dst_addr(ctx, dst->dma_desc_pa);
> -	rga_cmd_set_mode(ctx);
>  
>  	rga_cmd_set_src_info(ctx, &src->offset);
>  	rga_cmd_set_dst_info(ctx, &dst->offset);
> -	rga_cmd_set_trans_info(ctx);
>  
>  	rga_write(rga, RGA_CMD_BASE, ctx->cmdbuf_phy);
>  
> @@ -440,6 +436,14 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  				   PAGE_SIZE, DMA_BIDIRECTIONAL);
>  }
>  
> +static void rga_hw_setup_cmdbuf(struct rga_ctx *ctx)
> +{
> +	memset(ctx->cmdbuf_virt, 0, RGA_CMDBUF_SIZE);
> +
> +	rga_cmd_set_mode(ctx);
> +	rga_cmd_set_trans_info(ctx);
> +}
> +
>  static void rga_hw_start(struct rockchip_rga *rga,
>  			 struct rga_vb_buffer *src,  struct rga_vb_buffer *dst)
>  {
> @@ -582,6 +586,7 @@ const struct rga_hw rga2_hw = {
>  	.max_height = MAX_HEIGHT,
>  	.stride_alignment = 4,
>  
> +	.setup_cmdbuf = rga_hw_setup_cmdbuf,
>  	.start = rga_hw_start,
>  	.handle_irq = rga_handle_irq,
>  	.get_version = rga_get_version,
> diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
> index d080cb672740b..394b14b9469df 100644
> --- a/drivers/media/platform/rockchip/rga/rga.c
> +++ b/drivers/media/platform/rockchip/rga/rga.c
> @@ -38,6 +38,11 @@ static void device_run(void *prv)
>  	unsigned long flags;
>  
>  	spin_lock_irqsave(&rga->ctrl_lock, flags);
> +	if (ctx->cmdbuf_dirty) {
> +		ctx->cmdbuf_dirty = false;
> +		rga->hw->setup_cmdbuf(ctx);
> +	}
> +	spin_unlock_irqrestore(&rga->ctrl_lock, flags);
>  
>  	rga->curr = ctx;
>  
> @@ -47,8 +52,6 @@ static void device_run(void *prv)
>  	dst = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx);
>  
>  	rga->hw->start(rga, vb_to_rga(src), vb_to_rga(dst));
> -
> -	spin_unlock_irqrestore(&rga->ctrl_lock, flags);

I guess that the unlock can be moved, because relevant content of the
cmdbuf is now only changed in the setup_cmdbuf() function and changes done
by rga_cmd_set() don't have to be synchronized.

It would be helpful if this is explained in the commit message or even
moved to a separate patch.

Michael

>  }
>  
>  static irqreturn_t rga_isr(int irq, void *prv)
> @@ -141,6 +144,7 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  		ctx->fill_color = ctrl->val;
>  		break;
>  	}
> +	ctx->cmdbuf_dirty = true;
>  	spin_unlock_irqrestore(&ctx->rga->ctrl_lock, flags);
>  	return 0;
>  }
> @@ -228,6 +232,7 @@ static int rga_open(struct file *file)
>  		ret = -ENOMEM;
>  		goto rel_ctx;
>  	}
> +	ctx->cmdbuf_dirty = true;
>  
>  	ctx->rga = rga;
>  	/* Set default formats */
> @@ -448,6 +453,7 @@ static int vidioc_s_fmt(struct file *file, void *priv, struct v4l2_format *f)
>  	frm->crop.height = pix_fmt->height;
>  
>  	frm->pix = *pix_fmt;
> +	ctx->cmdbuf_dirty = true;
>  
>  	v4l2_dbg(debug, 1, &rga->v4l2_dev,
>  		 "[%s] fmt - %p4cc %dx%d (stride %d, sizeimage %d)\n",
> @@ -564,6 +570,7 @@ static int vidioc_s_selection(struct file *file, void *priv,
>  	}
>  
>  	f->crop = s->r;
> +	ctx->cmdbuf_dirty = true;
>  
>  	return ret;
>  }
> diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
> index 38518146910a6..5360f092fecf0 100644
> --- a/drivers/media/platform/rockchip/rga/rga.h
> +++ b/drivers/media/platform/rockchip/rga/rga.h
> @@ -55,6 +55,7 @@ struct rga_ctx {
>  
>  	void *cmdbuf_virt;
>  	dma_addr_t cmdbuf_phy;
> +	bool cmdbuf_dirty;
>  
>  	int osequence;
>  	int csequence;
> @@ -152,6 +153,7 @@ struct rga_hw {
>  	u32 max_width, max_height;
>  	u8 stride_alignment;
>  
> +	void (*setup_cmdbuf)(struct rga_ctx *ctx);
>  	void (*start)(struct rockchip_rga *rga,
>  		      struct rga_vb_buffer *src, struct rga_vb_buffer *dst);
>  	bool (*handle_irq)(struct rockchip_rga *rga);
> 
> -- 
> 2.54.0
> 
> 


^ permalink raw reply

* Re: [PATCH v14 20/28] drm/rockchip: dw_hdmi_qp: Implement "color format" DRM property
From: Daniel Stone @ 2026-05-21 14:40 UTC (permalink / raw)
  To: Nicolas Frattaroli
  Cc: Harry Wentland, Leo Li, Rodrigo Siqueira, Alex Deucher,
	Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	Andrzej Hajda, Neil Armstrong, Robert Foss, Laurent Pinchart,
	Jonas Karlman, Jernej Skrabec, Sandy Huang, Heiko Stübner,
	Andy Yan, Jani Nikula, Rodrigo Vivi, Joonas Lahtinen,
	Tvrtko Ursulin, Dmitry Baryshkov, Sascha Hauer, Rob Herring,
	Jonathan Corbet, Shuah Khan, kernel, amd-gfx, dri-devel,
	linux-kernel, linux-arm-kernel, linux-rockchip, intel-gfx,
	intel-xe, linux-doc, wayland-devel, Cristian Ciocaltea
In-Reply-To: <20260423-color-format-v14-20-449a419ccbd4@collabora.com>

Hi,

On Thu, 23 Apr 2026 at 20:06, Nicolas Frattaroli
<nicolas.frattaroli@collabora.com> wrote:
> +       bridge = drm_bridge_chain_get_first_bridge(encoder);
> +       if (!bridge)
> +               return 0;
> +
> +       bstate = drm_atomic_get_bridge_state(conn_state->state, bridge);
> +       if (!bstate)
> +               return 0;

IS_ERR() + PTR_ERR()

Cheers,
Daniel


^ permalink raw reply

* Re: [PATCH v3 0/3] Fix __pkvm_init_vm error path
From: Marc Zyngier @ 2026-05-21 14:44 UTC (permalink / raw)
  To: Vincent Donnefort
  Cc: oliver.upton, joey.gouly, suzuki.poulose, yuzenghui,
	catalin.marinas, will, linux-arm-kernel, kvmarm, kernel-team,
	qperret, tabba
In-Reply-To: <20260521143318.1002172-1-vdonnefort@google.com>

On Thu, 21 May 2026 15:33:15 +0100,
Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> Sashiko reported a potential refcount leak in the unlikely case where
> insert_vm_table_entry fails.
> 
> While at it, I have added a fail-safe to __pkvm_hyp_donate_host to ensure this
> function doesn't allow leaking refcounted pages.
> 
> Changes since v3:
>  
>   * Enforce order-0 for external pages, making the vmemmap ->order field
>     completely private to hyp_pool.

You sent 3 versions of a single series in a matter of hours. How about
taking a step back and working out the problems before posting?

Please? It's not like my inbox is not overflowing already...

	M.

-- 
Without deviation from the norm, progress is not possible.


^ permalink raw reply

* Re: [PATCH 5/6] firmware: samsung: acpm: Add TMU protocol support
From: Tudor Ambarus @ 2026-05-21 14:49 UTC (permalink / raw)
  To: Alexey Klimov, Peter Griffin
  Cc: Krzysztof Kozlowski, Michael Turquette, Stephen Boyd, Lee Jones,
	Alim Akhtar, Sylwester Nawrocki, Chanwoo Choi, André Draszik,
	linux-kernel, linux-samsung-soc, linux-arm-kernel, linux-clk,
	jyescas, kernel-team, Krzysztof Kozlowski
In-Reply-To: <DIOE94AX265Y.4QU8ZF520FUV@linaro.org>

Hi, Alexey,

On 5/21/26 4:37 PM, Alexey Klimov wrote:
> Peter, I agree we shouldn't bother about hypothetical SoCs. However,

It's standard kernel philosophy to not add code that __might__ be useful
later. Please consider adding that print when you submit support for
e850.

Cheers,
ta


^ permalink raw reply

* RE: [PATCH v3] i2c: imx: mark I2C adapter when hardware is powered down
From: Carlos Song (OSS) @ 2026-05-21 14:49 UTC (permalink / raw)
  To: Mukesh Savaliya, Carlos Song (OSS), o.rempel@pengutronix.de,
	kernel@pengutronix.de, andi.shyti@kernel.org, Frank Li,
	s.hauer@pengutronix.de, festevam@gmail.com, Carlos Song,
	Bough Chen
  Cc: linux-i2c@vger.kernel.org, imx@lists.linux.dev,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, stable@vger.kernel.org
In-Reply-To: <315029cc-f04c-4dad-a746-f5d3e7245cdc@oss.qualcomm.com>



> -----Original Message-----
> From: Mukesh Savaliya <mukesh.savaliya@oss.qualcomm.com>
> Sent: Thursday, May 21, 2026 8:40 PM
> To: Carlos Song (OSS) <carlos.song@oss.nxp.com>; Mukesh Savaliya
> <mukesh.savaliya@oss.qualcomm.com>; o.rempel@pengutronix.de;
> kernel@pengutronix.de; andi.shyti@kernel.org; Frank Li <frank.li@nxp.com>;
> s.hauer@pengutronix.de; festevam@gmail.com; Carlos Song
> <carlos.song@nxp.com>; Bough Chen <haibo.chen@nxp.com>
> Cc: linux-i2c@vger.kernel.org; imx@lists.linux.dev;
> linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org;
> stable@vger.kernel.org
> Subject: Re: [PATCH v3] i2c: imx: mark I2C adapter when hardware is powered
> down
> 
> 
> 
> On 5/21/2026 5:32 PM, Carlos Song (OSS) wrote:
> >
> >
> >> -----Original Message-----
> >> From: Mukesh Savaliya <mukesh.savaliya@oss.qualcomm.com>
> >> Sent: Thursday, May 21, 2026 7:14 PM
> >> To: Carlos Song (OSS) <carlos.song@oss.nxp.com>; Mukesh Savaliya
> >> <mukesh.savaliya@oss.qualcomm.com>; o.rempel@pengutronix.de;
> >> kernel@pengutronix.de; andi.shyti@kernel.org; Frank Li
> >> <frank.li@nxp.com>; s.hauer@pengutronix.de; festevam@gmail.com;
> >> Carlos Song <carlos.song@nxp.com>; Bough Chen <haibo.chen@nxp.com>
> >> Cc: linux-i2c@vger.kernel.org; imx@lists.linux.dev;
> >> linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org;
> >> stable@vger.kernel.org
> >> Subject: Re: [PATCH v3] i2c: imx: mark I2C adapter when hardware is
> >> powered down
> >>
> >>
> >> On 5/21/2026 4:21 PM, Carlos Song (OSS) wrote:
> >>
> >> [...]
> >>
> >>>>>> -----Original Message-----
> >>>>>> From: Mukesh Savaliya <mukesh.savaliya@oss.qualcomm.com>
> >>>>>> Sent: Thursday, May 21, 2026 3:40 PM
> >>>>>> To: Carlos Song (OSS) <carlos.song@oss.nxp.com>;
> >>>>>> o.rempel@pengutronix.de; kernel@pengutronix.de;
> >>>>>> andi.shyti@kernel.org; Frank Li <frank.li@nxp.com>;
> >>>>>> s.hauer@pengutronix.de; festevam@gmail.com; Carlos Song
> >>>>>> <carlos.song@nxp.com>; Bough Chen <haibo.chen@nxp.com>
> >>>>>> Cc: linux-i2c@vger.kernel.org; imx@lists.linux.dev;
> >>>>>> linux-arm-kernel@lists.infradead.org;
> >>>>>> linux-kernel@vger.kernel.org; stable@vger.kernel.org
> >>>>>> Subject: Re: [PATCH v3] i2c: imx: mark I2C adapter when hardware
> >>>>>> is powered down
> >>>>>>
> >>>>>> Hi Carlos,
> >>>>>>
> >>>>>> On 5/20/2026 3:45 PM, Carlos Song (OSS) wrote:
> >>>>>>> From: Carlos Song <carlos.song@nxp.com>
> >>>>>>>
> >>>>>>> Mark the I2C adapter as suspended during system suspend to block
> >>>>>>> further transfers, and resume it on system resume. This prevents
> >>>>>>> potential hangs when the hardware is powered down but clients
> >>>>>>> still attempt
> >>>>>> I2C transfers.
> >>>>>>>
> >>>> what was the reason of this hang ? I was thinking you don't have
> >>>> interrupts working when client requested transfer but adapter was
> >>>> suspended. Please correct me if wrong.
> >>>>
> >>>> And it would be good to mention the actual problem and why/how it
> >> occurred.
> >>>>>> Code changes looks fine to me but have comment on commit log.
> >>>>>>
> >>>>>> It seems, you are adding support of _noirq() callbacks to allow
> >>>>>> transfers during suspend/resume noirq phase of PM.
> >>>>>>
> >>>>>> Would it make sense if you can write "Replace system PM callbacks
> >>>>>> with noirq PM callbacks" OR "Allow transfers during _noirq phase
> >>>>>> of the PM ops" instead of "mark I2C adapter when hardware is
> >>>>>> powered
> >>>> down" ?
> >>>>>>
> >>>>>
> >>>>> Hi,
> >>>>>
> >>>>> Thank you for your comments!
> >>>>>
> >>>>> But this patch is not added to support noirq PM callbacks or
> >>>>> transfer in noirq
> >>>> phase.
> >>>>>
> >>>> Okay, may be actual problem description can help me.
> >>>>> In fact, this fix is to mark the I2C adapter as suspended during
> >>>>> system noirq suspend to block further transfers, and resume it on
> >>>>> system noirq resume. This is to prohibit I2C device calling the
> >>>>> I2C controller after the system noirq suspend and before noirq
> >>>>> resume, because at
> >>>> this time the I2C instance is powered off or the clock is disabled
> >>>> ... So I want to keep current commit. How do you think?
> >>>> completely Makes sense. Please help add how this problem occurred
> >>>> and
> >> why ?
> >>>> So the change/fix will be good to understand against it.
> >>>
> >>> Hi,
> >>>
> >>> In some I.MX platform, some I2C devices will keep a work queue all
> >>> time, the work queue will trigger I2C xfer every once in a while,
> >>> but the work
> >> queue shouldn't be free in system suspend.
> >>>
> >>
> >> work queue has transfers queued even if system is suspended ? IMO,
> >> the client i2c devices should not let system go to suspend.
> >>
> >
> > Hi Mukesh,
> >
> > Thank you for the detailed discussion.
> >
> > Yes, I totally agree that I2C client drivers should ideally stop
> > issuing transfers when the system is suspending.
> >
> > However, in practice there are many different I2C clients, and not all
> > of them strictly adhere to this requirement. Some clients may still
> > trigger transfers through workqueues or deferred contexts during the
> > suspend/resume window.
> >
> > Therefore, adding this protection at the I2C controller side helps to
> > avoid unexpected accesses when the hardware resources are unavailable,
> > making the system more robust.
> >
> 
> Agreed !
> 
> >>> Within a very short time window, possibly from noirq_suspend to the
> >>> system actually being suspended, or possibly from the system
> >>> starting to resume to before noirq_resume, this work queue will
> >>> trigger an I2C transfer, and at this time the I2C controller's clk
> >>> and pinctrl have not yet been restored, reading and
> >>
> >> Right, this kind of explains the problem to me. I think you are
> >> trying to serve i2c transfers when your resources(clk, pinctrl) are
> >> not turned ON and also interrupt remains disabled. And that's why you
> >> need to add
> >> _noirq() PM callbacks support along with IRQF_NO_SUSPEND |
> >> IRQF_EARLY_RESUME flags.
> >>
> >>> writing I2C registers causes the system to hang. This patch ensures all
> >>> I2C operations are performed in a safe hardware state.
> >>>
> >>> Is it better if I add these comment to patch commit log?
> >>>>>
> >> if my latest comments makes sense against the issue, you may write
> >> accordingly. if i am wrong, then your explanation makes sense. Cause
> >> of the hang needs to be clearly mentioned in the commit log in your next
> patch.
> >>
> >
> > Based on our discussion, I have updated the commit log as below:
> >
> > On some i.MX platforms, certain I2C client drivers keep a periodic
> > workqueue which continues to trigger I2C transfers.
> >
> > During system suspend/resume, there exists a time window between:
> >    - noirq_suspend and full suspend
> >    - resume start and noirq_resume
> 
> - noirq_resume and resume start [Just opposite ?]
> 

Sorry, the expression is ambiguous.

I will update the commit log to:

During system suspend/resume, there exists a time window between:
  - suspend_noirq and the system entering suspend
  - the system starting to resume and resume_noirq

Does this look good to you?

> >
> > In this window, the I2C controller resources such as clock and pinctrl
> > may already be disabled or not yet restored.
> >
> > If a workqueue triggers an I2C transfer in this period, the driver
> > attempts to access I2C registers while the hardware resources are
> > unavailable, which may lead to system hang.
> >
> > Mark the I2C adapter as suspended during noirq suspend and block new
> > transfers until resume, ensuring that I2C transfers are only issued
> > when hardware resources are available.
> >
> > Does this look good to you?
> >
> Looks good, Thanks !
> 
> >>>>
> >>>
> >


^ permalink raw reply

* [PATCH v2 00/39] KVM: arm64: Add GICv5 IRS support
From: Sascha Bischoff @ 2026-05-21 14:49 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes

Hi all,

This series builds on the initial vGICv5 support [1] and adds support
for the GICv5 IRS, as described by the GICv5 (EAC0) specification [2].
With this, a GICv5 guest is no longer restricted to PPIs, and can
make use of SPIs and LPIs as well.

With SPIs and LPIs available, this series makes it possible to boot a
full Linux guest on the Arm FVP model [3], using the setup described
in Lorenzo's GICv5 Linux software enablement guide [4]. In addition,
GICv5 IPIs are typically implemented as LPIs, so LPI support is what
makes guests with more than one vCPU possible.

The corresponding kvmtool changes are available separately [5]. With
these changes, `--irqchip=gicv5` works now. The `--irqchip=gicv5-its`
configuration does not work with this series, as ITS/MSI support is not
included here.

The IRS is created as part of the vgic-v5 device, and is not a
separate KVM device. This is because implementing the CPU interface
alone would limit a guest to a single vCPU and PPIs only. Hence, the
IRS is bundled into the VGIC, much like the distributor for GICv3.

The series adds the host-side plumbing needed for KVM to interact with
the physical IRS, including VM and VPE table management, VPE residency,
and VPE doorbells. It also adds an emulated IRS MMIO interface for the
guest, together with the UAPI needed for userspace to configure the IRS
address and to save/restore the IRS and IST state.

The main pieces are:

* Host IRS capability discovery and configuration frame plumbing.

* Allocation and management of the GICv5 VM table, VPE table, VMTEs,
  VPEs, and guest IST backing storage.

* VPE residency and doorbell handling, allowing the IRS to select SPIs
  and LPIs for resident VPEs and wake non-resident VPEs when required.

* Emulation of the GICv5 IRS MMIO CONFIG_FRAME for guests.
  Virtualisation, MPAM, MEC, SWERR are not supported and are RAZ/WI.

* SPI injection support using GIC VDPEND, together with tracking of
  in-flight SPIs so that deactivation notifiers can still be handled.

* Save/restore support for GICv5 EL1 system registers, IRS MMIO state,
  and guest ISTs.

* Documentation for the new VGICv5 IRS userspace interfaces and the
  required save/restore ordering.

Please pay close attention to the save/restore interface and ordering,
in particular to make sure that no guest state has been omitted.

LPIs are explicitly driven by the guest through the IRS/IST state. This
series does not add direct LPI injection support, and does not add MSI
support. It also does not add GICv5 ITS emulation.

These changes are based on v7.1-rc4. They presume the existence of the
still-under-review changes in [6]. I've pushed the full set of changes
to a branch at [7].

Changes since v1 [8]:

* Added native GICv5 KVM setup without a maintenance IRQ, plus ACPI KVM
  probing for GICv5 hosts.

* Completed and moved IRS MMIO register definitions earlier in the
  series.

* Reworked IRS capability discovery around the irqchip-provided config
  frame.

* Kept GICv5-host vCPU limits per VGIC model: native vGICv5 uses IRS
  VPE capacity, while legacy vGICv3 keeps its fixed limit.

* Consolidated VMTE, VPE, IST, and doorbell lifecycle handling.

* Reworked VPE residency and doorbell programming around WFI and the
  effective priority mask.

* Tightened GICv5 SPI sizing and routing, including the 32-SPI minimum
  and SPI/irqfd validation.

* Hardened in-flight SPI handling, including irqsave IRS MMIO locking
  and allocation-free state foldback.

* Reworked IRS/IST save/restore semantics and documented the migration
  ordering.

Thanks for taking the time to look at these changes. Apologies for the
size of the series, but this is a rather hard set to disentangle and
review separately!

Thanks,
Sascha

[1] https://lore.kernel.org/all/20260319154937.3619520-1-sascha.bischoff@arm.com/
[2] https://developer.arm.com/documentation/aes0070/latest
[3] https://developer.arm.com/documentation/108086/latest
[4] https://linaro.atlassian.net/wiki/x/CQAF-wY
[5] https://lore.kernel.org/all/20260116182606.61856-1-sascha.bischoff@arm.com/
[6] https://lore.kernel.org/all/20260520091949.542365-1-maz@kernel.org/
[7] https://gitlab.arm.com/linux-arm/linux-sb/-/tree/gicv5_kvm_irs_support_v2
[8] https://lore.kernel.org/all/20260427160547.3129448-1-sascha.bischoff@arm.com/

Sascha Bischoff (39):
  irqchip/gic-v5: Allow KVM setup without a maintenance IRQ
  irqchip/gic-v5: Provide OF IRS config frame attrs to KVM
  irqchip/gic-v5: Setup gic_kvm_info on ACPI hosts
  KVM: arm64: gic-v5: Define remaining IRS MMIO registers
  arm64/sysreg: Add GICv5 GIC VDPEND and VDRCFG encodings
  arm64/sysreg: Update ICC_CR0_EL1 with LINK and LINK_IDLE fields
  KVM: arm64: gic-v5: Extract host IRS caps from IRS config frame
  KVM: arm64: gic-v5: Add VPE doorbell domain
  KVM: arm64: gic-v5: Create & manage VM and VPE tables
  KVM: arm64: gic-v5: Introduce guest IST alloc and management
  KVM: arm64: gic-v5: Implement VMT/vIST IRS MMIO Ops
  KVM: arm64: gic-v5: Keep GICv5 vCPU limit model-specific
  KVM: arm64: gic-v5: Implement VPE IRS MMIO Ops
  KVM: arm64: gic-v5: Set up VMTEs and VPE doorbells
  KVM: arm64: gic-v5: Add resident/non-resident hyp calls
  KVM: arm64: gic-v5: Request doorbells when VPEs enter WFI
  KVM: arm64: gic-v5: Introduce struct vgic_v5_irs and IRS base address
  KVM: arm64: gic-v5: Add IRS IODEV support to MMIO handlers
  KVM: arm64: gic-v5: Add KVM_VGIC_V5_ADDR_TYPE_IRS to UAPI
  KVM: arm64: gic-v5: Add GICv5 IRS IODEV and MMIO emulation
  KVM: arm64: gic-v5: Initialise per-VM IRS state
  KVM: arm64: gic-v5: Register the IRS IODEV
  KVM: arm64: gic-v5: Set IRICHPPIDIS based on IRS enable state
  KVM: arm64: selftests: Update vGICv5 selftest to set IRS address
  KVM: arm64: gic-v5: Introduce SPI AP list
  KVM: arm64: gic-v5: Add GIC VDPEND and GIC VDRCFG hyp calls
  KVM: arm64: gic-v5: Track SPI state for in-flight SPIs
  KVM: arm64: gic: Introduce set_pending_state() to irq_op
  KVM: arm64: gic-v5: Support SPI injection
  Documentation: KVM: Extend VGICv5 docs for KVM_VGIC_V5_ADDR_TYPE_IRS
  KVM: arm64: gic-v5: Add GICv5 SPI injection to irqfd
  KVM: arm64: gic-v5: Mask per-vcpu PPI state in
    vgic_v5_finalize_ppi_state()
  KVM: arm64: gic-v5: Add GICv5 EL1 sysreg userspace accessors
  KVM: arm64: gic-v5: Handle userspace accesses to IRS MMIO region
  KVM: arm64: gic-v5: Implement save/restore mechanisms for ISTs
  Documentation: KVM: Document KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS for
    VGICv5
  Documentation: KVM: Add KVM_DEV_ARM_VGIC_GRP_IRS_REGS to VGICv5 docs
  Documentation: KVM: Add docs for KVM_DEV_ARM_VGIC_GRP_IST
  Documentation: KVM: Add the VGICv5 IRS save/restore sequences

 .../virt/kvm/devices/arm-vgic-v5.rst          |  237 ++-
 arch/arm64/include/asm/kvm_asm.h              |    4 +
 arch/arm64/include/asm/kvm_hyp.h              |    4 +
 arch/arm64/include/asm/sysreg.h               |   18 +-
 arch/arm64/include/uapi/asm/kvm.h             |    7 +
 arch/arm64/kvm/Makefile                       |    3 +-
 arch/arm64/kvm/arm.c                          |    2 +-
 arch/arm64/kvm/hyp/nvhe/hyp-main.c            |   34 +
 arch/arm64/kvm/hyp/vgic-v5-sr.c               |   55 +
 arch/arm64/kvm/sys_regs.c                     |    6 +-
 arch/arm64/kvm/vgic-sys-reg-v5.c              |  519 +++++
 arch/arm64/kvm/vgic/vgic-init.c               |  112 +-
 arch/arm64/kvm/vgic/vgic-irqfd.c              |   20 +-
 arch/arm64/kvm/vgic/vgic-irs-v5.c             | 1056 ++++++++++
 arch/arm64/kvm/vgic/vgic-kvm-device.c         |  252 ++-
 arch/arm64/kvm/vgic/vgic-mmio.c               |    6 +
 arch/arm64/kvm/vgic/vgic-mmio.h               |    2 +
 arch/arm64/kvm/vgic/vgic-v5-tables.c          | 1831 +++++++++++++++++
 arch/arm64/kvm/vgic/vgic-v5-tables.h          |  113 +
 arch/arm64/kvm/vgic/vgic-v5.c                 | 1191 ++++++++++-
 arch/arm64/kvm/vgic/vgic.c                    |   39 +-
 arch/arm64/kvm/vgic/vgic.h                    |   21 +-
 arch/arm64/tools/sysreg                       |    4 +-
 drivers/irqchip/irq-gic-v5-irs.c              |   19 +-
 drivers/irqchip/irq-gic-v5.c                  |  117 +-
 include/kvm/arm_vgic.h                        |  165 +-
 include/linux/irqchip/arm-gic-v5.h            |  220 +-
 include/linux/irqchip/arm-vgic-info.h         |    5 +
 tools/arch/arm64/include/uapi/asm/kvm.h       |    7 +
 tools/testing/selftests/kvm/arm64/vgic_v5.c   |    6 +
 .../selftests/kvm/include/arm64/gic_v5.h      |    3 +
 31 files changed, 5928 insertions(+), 150 deletions(-)
 create mode 100644 arch/arm64/kvm/vgic-sys-reg-v5.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-irs-v5.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-v5-tables.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-v5-tables.h

-- 
2.34.1

^ permalink raw reply

* [PATCH v2 01/39] irqchip/gic-v5: Allow KVM setup without a maintenance IRQ
From: Sascha Bischoff @ 2026-05-21 14:49 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

GICv5 does not require a virtual CPU interface maintenance interrupt
for native GCIE operation. The interrupt is only needed when
FEAT_GCIE_LEGACY is present, as the legacy GICv3 interface still
relies on maintenance IRQ delivery.

Stop rejecting KVM setup solely because the maintenance interrupt is
absent. Parse the interrupt if present, but if none is described and
the system does not advertise FEAT_GCIE_LEGACY, tell KVM that no
maintenance interrupt is required.

This lets native GICv5 KVM support be registered on systems that do
not provide a maintenance interrupt, while requiring a maintenance
interrupt for GICv3-capable systems.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 drivers/irqchip/irq-gic-v5.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index e9d1795235a66..600726b5c0a46 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -1141,12 +1141,19 @@ static void __init gic_of_setup_kvm_info(struct device_node *node)
 	gic_v5_kvm_info.type = GIC_V5;
 
 	/* GIC Virtual CPU interface maintenance interrupt */
-	gic_v5_kvm_info.no_maint_irq_mask = false;
 	gic_v5_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
-	if (!gic_v5_kvm_info.maint_irq) {
-		pr_warn("cannot find GICv5 virtual CPU interface maintenance interrupt\n");
-		return;
-	}
+
+	/*
+	 * We require an MI if we have legacy support, but don't, otherwise.
+	 * Given that there's an existing flag to convey that an MI isn't
+	 * needed, we (ab)use it to tell KVM that the MI isn't needed if we
+	 * don't support legacy.
+	 *
+	 * The check for ARM64_HAS_GICV5_LEGACY explicitly doesn't use
+	 * cpus_have_final_cap() here as we run too early.
+	 */
+	if (!cpus_have_cap(ARM64_HAS_GICV5_LEGACY) && !gic_v5_kvm_info.maint_irq)
+		gic_v5_kvm_info.no_maint_irq_mask = true;
 
 	vgic_set_kvm_info(&gic_v5_kvm_info);
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 02/39] irqchip/gic-v5: Provide OF IRS config frame attrs to KVM
From: Sascha Bischoff @ 2026-05-21 14:49 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

KVM needs to interact with the host IRS in order to, for example, make
VMs or VPEs valid. There are two potential approaches here. Either the
host irqchip driver can provide an interface, or KVM can interact
directly with the host IRS. The latter of these two is chosen as the
set of MMIO registers that KVM needs to interact with is orthogonal to
the set used by the host irqchip driver (with the exception of some of
the read-only IRS_IDRx registers).

Pass KVM a pointer to an IRS config frame - the config frame belonging
to ANY IRS is fine as long as one IRS's config frame is used
consistently - in struct gic_kvm_info. Additionally, include a flag
telling KVM whether the IRS is coherent or non-coherent in order to
make sure that KVM can do the correct cache state management, if
required.

Only OF (Device Tree) is supported with this change. ACPI is not.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 drivers/irqchip/irq-gic-v5-irs.c      | 7 +++++--
 drivers/irqchip/irq-gic-v5.c          | 5 +++++
 include/linux/irqchip/arm-gic-v5.h    | 3 +++
 include/linux/irqchip/arm-vgic-info.h | 5 +++++
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c
index f3fce0b1e25d9..607e066821b52 100644
--- a/drivers/irqchip/irq-gic-v5-irs.c
+++ b/drivers/irqchip/irq-gic-v5-irs.c
@@ -21,8 +21,6 @@
  */
 #define LPI_ID_BITS_LINEAR		12
 
-#define IRS_FLAGS_NON_COHERENT		BIT(0)
-
 static DEFINE_PER_CPU_READ_MOSTLY(struct gicv5_irs_chip_data *, per_cpu_irs_data);
 static LIST_HEAD(irs_nodes);
 
@@ -50,6 +48,11 @@ static void irs_writeq_relaxed(struct gicv5_irs_chip_data *irs_data,
 	writeq_relaxed(val, irs_data->irs_base + reg_offset);
 }
 
+struct gicv5_irs_chip_data *gicv5_irs_get_chip_data(void)
+{
+	return per_cpu(per_cpu_irs_data, 0);
+}
+
 /*
  * The polling wait (in gicv5_wait_for_op_s_atomic()) on a GIC register
  * provides the memory barriers (through MMIO accessors)
diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index 600726b5c0a46..707deabbf2f63 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -1128,6 +1128,8 @@ static struct gic_kvm_info gic_v5_kvm_info __initdata;
 
 static void __init gic_of_setup_kvm_info(struct device_node *node)
 {
+	struct gicv5_irs_chip_data *irs_data = gicv5_irs_get_chip_data();
+
 	/*
 	 * If we don't have native GICv5 virtualisation support, then
 	 * we also don't have FEAT_GCIE_LEGACY - the architecture
@@ -1140,6 +1142,9 @@ static void __init gic_of_setup_kvm_info(struct device_node *node)
 
 	gic_v5_kvm_info.type = GIC_V5;
 
+	gic_v5_kvm_info.gicv5_irs.base = irs_data->irs_base;
+	gic_v5_kvm_info.gicv5_irs.non_coherent = !!(irs_data->flags & IRS_FLAGS_NON_COHERENT);
+
 	/* GIC Virtual CPU interface maintenance interrupt */
 	gic_v5_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
 
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index f78787e654f4c..681c5c51207d6 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -330,6 +330,8 @@ struct gicv5_irs_chip_data {
 	raw_spinlock_t		spi_config_lock;
 };
 
+#define IRS_FLAGS_NON_COHERENT		BIT(0)
+
 static inline int gicv5_wait_for_op_s_atomic(void __iomem *addr, u32 offset,
 					     const char *reg_s, u32 mask,
 					     u32 *val)
@@ -377,6 +379,7 @@ void __init gicv5_free_lpi_domain(void);
 
 int gicv5_irs_of_probe(struct device_node *parent);
 int gicv5_irs_acpi_probe(void);
+struct gicv5_irs_chip_data *gicv5_irs_get_chip_data(void);
 void gicv5_irs_remove(void);
 int gicv5_irs_enable(void);
 void gicv5_irs_its_probe(void);
diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h
index 67d9d960273b9..f05370e2debf4 100644
--- a/include/linux/irqchip/arm-vgic-info.h
+++ b/include/linux/irqchip/arm-vgic-info.h
@@ -38,6 +38,11 @@ struct gic_kvm_info {
 	bool		has_v4_1;
 	/* Deactivation impared, subpar stuff */
 	bool		no_hw_deactivation;
+	/* GICv5 IRS base */
+	struct {
+		void __iomem	*base;
+		bool		non_coherent;
+	}		gicv5_irs;
 };
 
 #ifdef CONFIG_KVM
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v2 1/1] arm64: dts: imx8mq-evk: Enable MIPI CSI and dual OV5640 cameras
From: Frank Li @ 2026-05-21 14:50 UTC (permalink / raw)
  To: Robby Cai
  Cc: robh, krzk+dt, conor+dt, s.hauer, festevam,
	sebastian.krzyszkowiak, kernel, devicetree, imx, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260521114952.GA215762@shlinux88>

On Thu, May 21, 2026 at 07:49:52PM +0800, Robby Cai wrote:
> On Wed, May 20, 2026 at 02:52:24PM -0400, Frank Li wrote:
> > On Wed, May 20, 2026 at 02:54:52PM +0800, Robby Cai wrote:
> > > On Fri, May 15, 2026 at 10:01:47AM -0400, Frank Li wrote:
> > > > On Fri, May 15, 2026 at 07:11:43PM +0800, Robby Cai wrote:
> > > > > Enable the MIPI CSI bridges and corresponding CSI-2 host interfaces
> > > > > on the i.MX8MQ EVK, and add two OV5640 camera sensors.
> > > > >
> > > > > The sensors are connected via I2C1 and I2C2, each with proper
> > > > > endpoint descriptions to form complete media pipelines.
> > > > >
> > > > > The resulting pipelines are:
> > > > >
> > > > >   - OV5640 (I2C2) -> MIPI CSI1 -> CSI1 bridge
> > > > >   - OV5640 (I2C1) -> MIPI CSI2 -> CSI2 bridge
> > > > >
> > > > > Both pipelines have been validated on the i.MX8MQ EVK using the
> > > > > upstream OV5640 driver.
> > > > >
> > > > > Both OV5640 sensors share a single reset GPIO on this board,
> > > > > which prevents independent hardware reset when both cameras
> > > > > are enabled. As a result, the reset line is kept deasserted
> > > > > via a GPIO hog, and sensor reset is performed via software.
> > > >
> > > > Does reset_control_get_shared() resolve this problem?
> > > >
> > >
> > > No, reset_control_get_shared() does not really solve this issue.
> > >
> > > The problem here is not about software coordination, but about the
> > > hardware topology: both sensors are physically tied to the same reset
> > > line. This means any reset operation will always affect both devices
> > > simultaneously, regardless of how the reset framework is used.
> >
> > The reset framework resolves this problem. It is quite common for many devices
> > to share one reset pin.
>
> okay, I'll try to switch to use this approach in next revision.
>
> Some devices require coordinated RESET and PWDN sequencing, but in this
> case the device can be properly initialized with RESET held inactive and
> controlled solely via the PWDN signal, which makes this approach viable.

PWDN should go through the regulator interface.

>
> >
> > >
> > > While reset_control_get_shared() introduces reference counting to avoid
> > > unintended assertions, it does not allow independent reset control.
> > > In particular:
> > >
> > >   - A reset operation (assert) will still impact both sensors.
> >
> > Yes, but only when the first device toggles the reset signal. The second device does nothing.
> >
> > >   - It does not solve the requirement for per-device hardware reset.
> >
> > It is hardware limitation.
> >
> > >
> > > Therefore, using a shared reset control does not provide true isolation
> > > between the two OV5640 instances.
> >
> > It is not isolation. Just don't allow second device to toggle reset pin.
> >
> > >
> > > Keeping the reset line permanently deasserted (e.g. via GPIO hog) and
> > > handling initialization through software/power sequencing is a valid
> > > and practical solution for this hardware design.
> >
> > If an I2C GPIO expander is used, the expander driver may probe after the
> > sensor driver, so the reset may happen after the sensor driver has probed.
>
>
> Just to clarify, the reset GPIO in this design is provided by the SoC GPIO
> controller (gpio1), not an external I2C GPIO expander.

That is just a special case. You are touching the ov5640 driver code, so you
need to consider the more general case.

Frank
>
> Therefore, the "late reset" issue you mentioned does not apply here.
>
> Regards,
> Robby
> >
> > Frank
> > >
> > > This matches the intention of the upstream changes as well, where GPIO-
> > > based resets are treated as simple control signals rather than fully
> > > isolated reset domains.
> > >
> > > In practice, using a shared reset here can even introduce subtle
> > > interference between the two cameras during probe or power cycling,
> > > so it is safer to avoid using reset for runtime control entirely.
> > >
> > > Regards,
> > > Robby
> > >


^ permalink raw reply

* Re: [PATCH v14 04/44] arm64: RMI: Add SMC definitions for calling the RMM
From: Suzuki K Poulose @ 2026-05-21 14:50 UTC (permalink / raw)
  To: Marc Zyngier, Steven Price
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Zenghui Yu, linux-arm-kernel, linux-kernel,
	Joey Gouly, Alexandru Elisei, Christoffer Dall, Fuad Tabba,
	linux-coco, Ganapatrao Kulkarni, Gavin Shan, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <86ecj5vsu4.wl-maz@kernel.org>

On 21/05/2026 13:40, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:12 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> The RMM (Realm Management Monitor) provides functionality that can be
>> accessed by SMC calls from the host.
>>
>> The SMC definitions are based on DEN0137[1] version 2.0-bet1
>>
>> [1] https://developer.arm.com/documentation/den0137/2-0bet1/
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v13:
>>   * Updated to RMM spec v2.0-bet1
>> Changes since v12:
>>   * Updated to RMM spec v2.0-bet0
>> Changes since v9:
>>   * Corrected size of 'ripas_value' in struct rec_exit. The spec states
>>     this is an 8-bit type with padding afterwards (rather than a u64).
>> Changes since v8:
>>   * Added RMI_PERMITTED_GICV3_HCR_BITS to define which bits the RMM
>>     permits to be modified.
>> Changes since v6:
>>   * Renamed REC_ENTER_xxx defines to include 'FLAG' to make it obvious
>>     these are flag values.
>> Changes since v5:
>>   * Sorted the SMC #defines by value.
>>   * Renamed SMI_RxI_CALL to SMI_RMI_CALL since the macro is only used for
>>     RMI calls.
>>   * Renamed REC_GIC_NUM_LRS to REC_MAX_GIC_NUM_LRS since the actual
>>     number of available list registers could be lower.
>>   * Provided a define for the reserved fields of FeatureRegister0.
>>   * Fix inconsistent names for padding fields.
>> Changes since v4:
>>   * Update to point to final released RMM spec.
>>   * Minor rearrangements.
>> Changes since v3:
>>   * Update to match RMM spec v1.0-rel0-rc1.
>> Changes since v2:
>>   * Fix specification link.
>>   * Rename rec_entry->rec_enter to match spec.
>>   * Fix size of pmu_ovf_status to match spec.
>> ---
>>   arch/arm64/include/asm/rmi_smc.h | 448 +++++++++++++++++++++++++++++++
>>   1 file changed, 448 insertions(+)
>>   create mode 100644 arch/arm64/include/asm/rmi_smc.h
>>
>> diff --git a/arch/arm64/include/asm/rmi_smc.h b/arch/arm64/include/asm/rmi_smc.h
>> new file mode 100644
>> index 000000000000..a09b7a631fef
>> --- /dev/null
>> +++ b/arch/arm64/include/asm/rmi_smc.h
>> @@ -0,0 +1,448 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * Copyright (C) 2023-2026 ARM Ltd.
>> + *
>> + * The values and structures in this file are from the Realm Management Monitor
>> + * specification (DEN0137) version 2.0-bet1:
>> + * https://developer.arm.com/documentation/den0137/2-0bet1/
> 
> How long is this spec going to be available on the ARM web site, which
> has a tendency of being reorganised every other week? And there is
> already a beta2.
> 
>> + */
>> +
>> +#ifndef __ASM_RMI_SMC_H
>> +#define __ASM_RMI_SMC_H
>> +
>> +#include <linux/arm-smccc.h>
>> +
>> +#define SMC_RMI_CALL(func)				\
>> +	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,		\
>> +			   ARM_SMCCC_SMC_64,		\
>> +			   ARM_SMCCC_OWNER_STANDARD,	\
>> +			   (func))
>> +
>> +#define SMC_RMI_VERSION				SMC_RMI_CALL(0x0150)
>> +
>> +#define SMC_RMI_RTT_DATA_MAP_INIT		SMC_RMI_CALL(0x0153)
>> +
>> +#define SMC_RMI_REALM_ACTIVATE			SMC_RMI_CALL(0x0157)
>> +#define SMC_RMI_REALM_CREATE			SMC_RMI_CALL(0x0158)
>> +#define SMC_RMI_REALM_DESTROY			SMC_RMI_CALL(0x0159)
>> +#define SMC_RMI_REC_CREATE			SMC_RMI_CALL(0x015a)
>> +#define SMC_RMI_REC_DESTROY			SMC_RMI_CALL(0x015b)
>> +#define SMC_RMI_REC_ENTER			SMC_RMI_CALL(0x015c)
>> +#define SMC_RMI_RTT_CREATE			SMC_RMI_CALL(0x015d)
>> +#define SMC_RMI_RTT_DESTROY			SMC_RMI_CALL(0x015e)
>> +
>> +#define SMC_RMI_RTT_READ_ENTRY			SMC_RMI_CALL(0x0161)
>> +
>> +#define SMC_RMI_RTT_DEV_VALIDATE		SMC_RMI_CALL(0x0163)
>> +#define SMC_RMI_PSCI_COMPLETE			SMC_RMI_CALL(0x0164)
>> +#define SMC_RMI_FEATURES			SMC_RMI_CALL(0x0165)
>> +#define SMC_RMI_RTT_FOLD			SMC_RMI_CALL(0x0166)
>> +
>> +#define SMC_RMI_RTT_INIT_RIPAS			SMC_RMI_CALL(0x0168)
>> +#define SMC_RMI_RTT_SET_RIPAS			SMC_RMI_CALL(0x0169)
>> +#define SMC_RMI_VSMMU_CREATE			SMC_RMI_CALL(0x016a)
>> +#define SMC_RMI_VSMMU_DESTROY			SMC_RMI_CALL(0x016b)
>> +#define SMC_RMI_RMM_CONFIG_SET			SMC_RMI_CALL(0x016e)
>> +#define SMC_RMI_PSMMU_IRQ_NOTIFY		SMC_RMI_CALL(0x016f)
>> +
>> +#define SMC_RMI_PDEV_ABORT			SMC_RMI_CALL(0x0174)
>> +#define SMC_RMI_PDEV_COMMUNICATE		SMC_RMI_CALL(0x0175)
>> +#define SMC_RMI_PDEV_CREATE			SMC_RMI_CALL(0x0176)
>> +#define SMC_RMI_PDEV_DESTROY			SMC_RMI_CALL(0x0177)
>> +#define SMC_RMI_PDEV_GET_STATE			SMC_RMI_CALL(0x0178)
>> +
>> +#define SMC_RMI_PDEV_STREAM_KEY_REFRESH		SMC_RMI_CALL(0x017a)
>> +#define SMC_RMI_PDEV_SET_PUBKEY			SMC_RMI_CALL(0x017b)
>> +#define SMC_RMI_PDEV_STOP			SMC_RMI_CALL(0x017c)
>> +#define SMC_RMI_RTT_AUX_CREATE			SMC_RMI_CALL(0x017d)
>> +#define SMC_RMI_RTT_AUX_DESTROY			SMC_RMI_CALL(0x017e)
>> +#define SMC_RMI_RTT_AUX_FOLD			SMC_RMI_CALL(0x017f)
>> +
>> +#define SMC_RMI_VDEV_ABORT			SMC_RMI_CALL(0x0185)
>> +#define SMC_RMI_VDEV_COMMUNICATE		SMC_RMI_CALL(0x0186)
>> +#define SMC_RMI_VDEV_CREATE			SMC_RMI_CALL(0x0187)
>> +#define SMC_RMI_VDEV_DESTROY			SMC_RMI_CALL(0x0188)
>> +#define SMC_RMI_VDEV_GET_STATE			SMC_RMI_CALL(0x0189)
>> +#define SMC_RMI_VDEV_UNLOCK			SMC_RMI_CALL(0x018a)
>> +#define SMC_RMI_RTT_SET_S2AP			SMC_RMI_CALL(0x018b)
>> +#define SMC_RMI_VDEV_COMPLETE			SMC_RMI_CALL(0x018e)
>> +
>> +#define SMC_RMI_VDEV_GET_INTERFACE_REPORT	SMC_RMI_CALL(0x01d0)
>> +#define SMC_RMI_VDEV_GET_MEASUREMENTS		SMC_RMI_CALL(0x01d1)
>> +#define SMC_RMI_VDEV_LOCK			SMC_RMI_CALL(0x01d2)
>> +#define SMC_RMI_VDEV_START			SMC_RMI_CALL(0x01d3)
>> +
>> +#define SMC_RMI_VSMMU_EVENT_NOTIFY		SMC_RMI_CALL(0x01d6)
>> +#define SMC_RMI_PSMMU_ACTIVATE			SMC_RMI_CALL(0x01d7)
>> +#define SMC_RMI_PSMMU_DEACTIVATE		SMC_RMI_CALL(0x01d8)
>> +
>> +#define SMC_RMI_PSMMU_ST_L2_CREATE		SMC_RMI_CALL(0x01db)
>> +#define SMC_RMI_PSMMU_ST_L2_DESTROY		SMC_RMI_CALL(0x01dc)
>> +#define SMC_RMI_DPT_L0_CREATE			SMC_RMI_CALL(0x01dd)
>> +#define SMC_RMI_DPT_L0_DESTROY			SMC_RMI_CALL(0x01de)
>> +#define SMC_RMI_DPT_L1_CREATE			SMC_RMI_CALL(0x01df)
>> +#define SMC_RMI_DPT_L1_DESTROY			SMC_RMI_CALL(0x01e0)
>> +#define SMC_RMI_GRANULE_TRACKING_GET		SMC_RMI_CALL(0x01e1)
>> +
>> +#define SMC_RMI_GRANULE_TRACKING_SET		SMC_RMI_CALL(0x01e3)
>> +
>> +#define SMC_RMI_RMM_CONFIG_GET			SMC_RMI_CALL(0x01ec)
>> +
>> +#define SMC_RMI_RMM_STATE_GET			SMC_RMI_CALL(0x01ee)
>> +
>> +#define SMC_RMI_PSMMU_EVENT_CONSUME		SMC_RMI_CALL(0x01f0)
>> +#define SMC_RMI_GRANULE_RANGE_DELEGATE		SMC_RMI_CALL(0x01f1)
>> +#define SMC_RMI_GRANULE_RANGE_UNDELEGATE	SMC_RMI_CALL(0x01f2)
>> +#define SMC_RMI_GPT_L1_CREATE			SMC_RMI_CALL(0x01f3)
>> +#define SMC_RMI_GPT_L1_DESTROY			SMC_RMI_CALL(0x01f4)
>> +#define SMC_RMI_RTT_DATA_MAP			SMC_RMI_CALL(0x01f5)
>> +#define SMC_RMI_RTT_DATA_UNMAP			SMC_RMI_CALL(0x01f6)
>> +#define SMC_RMI_RTT_DEV_MAP			SMC_RMI_CALL(0x01f7)
>> +#define SMC_RMI_RTT_DEV_UNMAP			SMC_RMI_CALL(0x01f8)
>> +#define SMC_RMI_RTT_ARCH_DEV_MAP		SMC_RMI_CALL(0x01f9)
>> +#define SMC_RMI_RTT_ARCH_DEV_UNMAP		SMC_RMI_CALL(0x01fa)
>> +#define SMC_RMI_RTT_UNPROT_MAP			SMC_RMI_CALL(0x01fb)
>> +#define SMC_RMI_RTT_UNPROT_UNMAP		SMC_RMI_CALL(0x01fc)
>> +#define SMC_RMI_RTT_AUX_PROT_MAP		SMC_RMI_CALL(0x01fd)
>> +#define SMC_RMI_RTT_AUX_PROT_UNMAP		SMC_RMI_CALL(0x01fe)
>> +#define SMC_RMI_RTT_AUX_UNPROT_MAP		SMC_RMI_CALL(0x01ff)
>> +#define SMC_RMI_RTT_AUX_UNPROT_UNMAP		SMC_RMI_CALL(0x0200)
>> +#define SMC_RMI_REALM_TERMINATE			SMC_RMI_CALL(0x0201)
>> +#define SMC_RMI_RMM_ACTIVATE			SMC_RMI_CALL(0x0202)
>> +#define SMC_RMI_OP_CONTINUE			SMC_RMI_CALL(0x0203)
>> +#define SMC_RMI_PDEV_STREAM_CONNECT		SMC_RMI_CALL(0x0204)
>> +#define SMC_RMI_PDEV_STREAM_DISCONNECT		SMC_RMI_CALL(0x0205)
>> +#define SMC_RMI_PDEV_STREAM_COMPLETE		SMC_RMI_CALL(0x0206)
>> +#define SMC_RMI_PDEV_STREAM_KEY_PURGE		SMC_RMI_CALL(0x0207)
>> +#define SMC_RMI_OP_MEM_DONATE			SMC_RMI_CALL(0x0208)
>> +#define SMC_RMI_OP_MEM_RECLAIM			SMC_RMI_CALL(0x0209)
>> +#define SMC_RMI_OP_CANCEL			SMC_RMI_CALL(0x020a)
>> +#define SMC_RMI_VSMMU_FEATURES			SMC_RMI_CALL(0x020b)
>> +#define SMC_RMI_VSMMU_CMD_GET			SMC_RMI_CALL(0x020c)
>> +#define SMC_RMI_VSMMU_CMD_COMPLETE		SMC_RMI_CALL(0x020d)
>> +#define SMC_RMI_PSMMU_INFO			SMC_RMI_CALL(0x020e)
>> +
>> +#define RMI_ABI_MAJOR_VERSION	2
>> +#define RMI_ABI_MINOR_VERSION	0
>> +
>> +#define RMI_ABI_VERSION_GET_MAJOR(version) ((version) >> 16)
>> +#define RMI_ABI_VERSION_GET_MINOR(version) ((version) & 0xFFFF)
>> +#define RMI_ABI_VERSION(major, minor)      (((major) << 16) | (minor))
>> +
>> +#define RMI_UNASSIGNED			0
>> +#define RMI_ASSIGNED			1
>> +#define RMI_TABLE			2
>> +
>> +#define RMI_RETURN_STATUS(ret)		((ret) & 0xFF)
>> +#define RMI_RETURN_INDEX(ret)		(((ret) >> 8) & 0xFF)
>> +#define RMI_RETURN_MEMREQ(ret)		(((ret) >> 8) & 0x3)
>> +#define RMI_RETURN_CAN_CANCEL(ret)	(((ret) >> 10) & 0x1)
> 
> Use FIELD_GET() and specify masks that define the actual fields.
> 
>> +
>> +#define RMI_SUCCESS			0
>> +#define RMI_ERROR_INPUT			1
>> +#define RMI_ERROR_REALM			2
>> +#define RMI_ERROR_REC			3
>> +#define RMI_ERROR_RTT			4
>> +#define RMI_ERROR_NOT_SUPPORTED		5
>> +#define RMI_ERROR_DEVICE		6
>> +#define RMI_ERROR_RTT_AUX		7
>> +#define RMI_ERROR_PSMMU_ST		8
>> +#define RMI_ERROR_DPT			9
>> +#define RMI_BUSY			10
>> +#define RMI_ERROR_GLOBAL		11
>> +#define RMI_ERROR_TRACKING		12
>> +#define RMI_INCOMPLETE			13
>> +#define RMI_BLOCKED			14
>> +#define RMI_ERROR_GPT			15
>> +#define RMI_ERROR_GRANULE		16
>> +
>> +#define RMI_OP_MEM_REQ_NONE		0
>> +#define RMI_OP_MEM_REQ_DONATE		1
>> +#define RMI_OP_MEM_REQ_RECLAIM		2
>> +
>> +#define RMI_DONATE_SIZE(req)		((req) & 0x3)
>> +#define RMI_DONATE_COUNT_MASK		GENMASK(15, 2)
>> +#define RMI_DONATE_COUNT(req)		(((req) & RMI_DONATE_COUNT_MASK) >> 2)
>> +#define RMI_DONATE_CONTIG(req)		(!!((req) & BIT(16)))
>> +#define RMI_DONATE_STATE(req)		(!!((req) & BIT(17)))
> 
> FIELD_GET().
> 
>> +
>> +#define RMI_OP_MEM_DELEGATED		0
>> +#define RMI_OP_MEM_UNDELEGATED		1
>> +
>> +#define RMI_ADDR_TYPE_NONE		0
>> +#define RMI_ADDR_TYPE_SINGLE		1
>> +#define RMI_ADDR_TYPE_LIST		2
>> +
>> +#define RMI_ADDR_RANGE_SIZE_MASK	GENMASK(1, 0)
>> +#define RMI_ADDR_RANGE_COUNT_MASK	GENMASK(PAGE_SHIFT - 1, 2)
>> +#define RMI_ADDR_RANGE_ADDR_MASK	(PAGE_MASK & GENMASK(51, 0))
>> +#define RMI_ADDR_RANGE_STATE_MASK	BIT(63)
>> +
>> +#define RMI_ADDR_RANGE_SIZE(ar)		(FIELD_GET(RMI_ADDR_RANGE_SIZE_MASK, \
>> +						   (ar)))
>> +#define RMI_ADDR_RANGE_COUNT(ar)	(FIELD_GET(RMI_ADDR_RANGE_COUNT_MASK, \
>> +						   (ar)))
>> +#define RMI_ADDR_RANGE_ADDR(ar)		((ar) & RMI_ADDR_RANGE_ADDR_MASK)
>> +#define RMI_ADDR_RANGE_STATE(ar)	(FIELD_GET(RMI_ADDR_RANGE_STATE_MASK, \
>> +						   (ar)))
>> +
>> +enum rmi_ripas {
>> +	RMI_EMPTY = 0,
>> +	RMI_RAM = 1,
>> +	RMI_DESTROYED = 2,
>> +	RMI_DEV = 3,
>> +};
>> +
>> +#define RMI_NO_MEASURE_CONTENT	0
>> +#define RMI_MEASURE_CONTENT	1
>> +
>> +#define RMI_FEATURE_REGISTER_0_S2SZ		GENMASK(7, 0)
>> +#define RMI_FEATURE_REGISTER_0_LPA2		BIT(8)
>> +#define RMI_FEATURE_REGISTER_0_SVE		BIT(9)
>> +#define RMI_FEATURE_REGISTER_0_SVE_VL		GENMASK(13, 10)
>> +#define RMI_FEATURE_REGISTER_0_NUM_BPS		GENMASK(19, 14)
>> +#define RMI_FEATURE_REGISTER_0_NUM_WPS		GENMASK(25, 20)
>> +#define RMI_FEATURE_REGISTER_0_PMU		BIT(26)
>> +#define RMI_FEATURE_REGISTER_0_PMU_NUM_CTRS	GENMASK(31, 27)
>> +
>> +#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_4KB	BIT(0)
>> +#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_16KB	BIT(1)
>> +#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_64KB	BIT(2)
>> +#define RMI_FEATURE_REGISTER_1_HASH_SHA_256	BIT(3)
>> +#define RMI_FEATURE_REGISTER_1_HASH_SHA_384	BIT(4)
>> +#define RMI_FEATURE_REGISTER_1_HASH_SHA_512	BIT(5)
>> +#define RMI_FEATURE_REGISTER_1_MAX_RECS_ORDER	GENMASK(9, 6)
>> +#define RMI_FEATURE_REGISTER_1_L0GPTSZ		GENMASK(13, 10)
>> +#define RMI_FEATURE_REGISTER_1_PPS		GENMASK(16, 14)
>> +
>> +#define RMI_FEATURE_REGISTER_2_DA		BIT(0)
>> +#define RMI_FEATURE_REGISTER_2_DA_COH		BIT(1)
>> +#define RMI_FEATURE_REGISTER_2_VSMMU		BIT(2)
>> +#define RMI_FEATURE_REGISTER_2_ATS		BIT(3)
>> +#define RMI_FEATURE_REGISTER_2_MAX_VDEVS_ORDER	GENMASK(7, 4)
>> +#define RMI_FEATURE_REGISTER_2_VDEV_KROU	BIT(8)
>> +#define RMI_FEATURE_REGISTER_2_NON_TEE_STREAM	BIT(9)
>> +
>> +#define RMI_FEATURE_REGISTER_3_MAX_NUM_AUX_PLANES	GENMASK(3, 0)
>> +#define RMI_FEATURE_REGISTER_3_RTT_PLAN			GENMASK(5, 4)
>> +#define RMI_FEATURE_REGISTER_3_RTT_S2AP_INDIRECT	BIT(6)
>> +
>> +#define RMI_FEATURE_REGISTER_4_MEC_COUNT		GENMASK(63, 0)
>> +
>> +#define RMI_MEM_CATEGORY_CONVENTIONAL		0
>> +#define RMI_MEM_CATEGORY_DEV_NCOH		1
>> +#define RMI_MEM_CATEGORY_DEV_COH		2
>> +
>> +#define RMI_TRACKING_RESERVED			0
>> +#define RMI_TRACKING_NONE			1
>> +#define RMI_TRACKING_FINE			2
>> +#define RMI_TRACKING_COARSE			3
>> +
>> +#define RMI_GRANULE_SIZE_4KB	0
>> +#define RMI_GRANULE_SIZE_16KB	1
>> +#define RMI_GRANULE_SIZE_64KB	2
>> +
>> +/*
>> + * Note many of these fields are smaller than u64 but all fields have u64
>> + * alignment, so use u64 to ensure correct alignment.
>> + */
>> +struct rmm_config {
>> +	union { /* 0x0 */
>> +		struct {
>> +			u64 tracking_region_size;
>> +			u64 rmi_granule_size;
>> +		};
>> +		u8 sizer[0x1000];
> 
> SZ_4K?
> 
>> +	};
>> +};
>> +
>> +#define RMI_REALM_PARAM_FLAG_LPA2		BIT(0)
>> +#define RMI_REALM_PARAM_FLAG_SVE		BIT(1)
>> +#define RMI_REALM_PARAM_FLAG_PMU		BIT(2)
>> +
>> +struct realm_params {
>> +	union { /* 0x0 */
>> +		struct {
>> +			u64 flags;
>> +			u64 s2sz;
>> +			u64 sve_vl;
>> +			u64 num_bps;
>> +			u64 num_wps;
>> +			u64 pmu_num_ctrs;
>> +			u64 hash_algo;
>> +			u64 num_aux_planes;
>> +		};
>> +		u8 padding0[0x400];
> 
> SZ_1K? And similarly all over the shop?

Agreed with the comments above.

> 
> I haven't checked the details of the encodings (life is too short),
> but I wonder how much of this exists as an MRS and could be
> automatically generated?

Good point. This is something that we can check and get back to you.

Thanks
Suzuki


> 
> Thanks,
> 
> 	M.
> 



^ permalink raw reply

* [PATCH v2 03/39] irqchip/gic-v5: Setup gic_kvm_info on ACPI hosts
From: Sascha Bischoff @ 2026-05-21 14:50 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

Device-tree based GICv5 probing already passes the IRS details and
maintenance interrupt to KVM, but the ACPI path only initialises the
irqchip and installs the ACPI IRQ model. As a result, KVM never sees
the GICv5 host information required to probe the vGIC on ACPI systems.

Add the ACPI equivalent of the DT KVM setup. Parse the MADT GICC
entries for the maintenance interrupt, require all relevant entries to
agree, register the interrupt as a GICv5 PPI-encoded GSI, and pass the
resulting IRQ together with the IRS base and coherency information to
KVM. Native GICv5 does not require a maintenance interrupt unless the
legacy GICv3-compatible CPU interface is present, so preserve the
existing no-maintenance-IRQ handling for that case.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 drivers/irqchip/irq-gic-v5.c | 103 +++++++++++++++++++++++++++++++++--
 1 file changed, 98 insertions(+), 5 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index 707deabbf2f63..ccd1ec69a6ab2 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -1126,7 +1126,7 @@ static void gicv5_set_cpuif_idbits(void)
 #ifdef CONFIG_KVM
 static struct gic_kvm_info gic_v5_kvm_info __initdata;
 
-static void __init gic_of_setup_kvm_info(struct device_node *node)
+static void __init gic_setup_kvm_info(unsigned int maint_irq)
 {
 	struct gicv5_irs_chip_data *irs_data = gicv5_irs_get_chip_data();
 
@@ -1140,13 +1140,14 @@ static void __init gic_of_setup_kvm_info(struct device_node *node)
 		return;
 	}
 
-	gic_v5_kvm_info.type = GIC_V5;
+	if (WARN_ON(!irs_data))
+		return;
 
+	gic_v5_kvm_info.type = GIC_V5;
 	gic_v5_kvm_info.gicv5_irs.base = irs_data->irs_base;
 	gic_v5_kvm_info.gicv5_irs.non_coherent = !!(irs_data->flags & IRS_FLAGS_NON_COHERENT);
-
-	/* GIC Virtual CPU interface maintenance interrupt */
-	gic_v5_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+	gic_v5_kvm_info.maint_irq = maint_irq;
+	gic_v5_kvm_info.no_maint_irq_mask = false;
 
 	/*
 	 * We require an MI if we have legacy support, but don't, otherwise.
@@ -1162,10 +1163,101 @@ static void __init gic_of_setup_kvm_info(struct device_node *node)
 
 	vgic_set_kvm_info(&gic_v5_kvm_info);
 }
+
+static void __init gic_of_setup_kvm_info(struct device_node *node)
+{
+	/* GIC Virtual CPU interface maintenance interrupt */
+	gic_setup_kvm_info(irq_of_parse_and_map(node, 0));
+}
+
+#ifdef CONFIG_ACPI
+struct gicv5_acpi_kvm_info {
+	u32 maint_irq;
+	int maint_irq_mode;
+};
+
+static struct gicv5_acpi_kvm_info acpi_v5_kvm_info __initdata;
+
+static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *header,
+						const unsigned long end)
+{
+	struct acpi_madt_generic_interrupt *gicc =
+		(struct acpi_madt_generic_interrupt *)header;
+	static int first_madt = true;
+	int maint_irq_mode;
+
+	if (!(gicc->flags &
+	      (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE)))
+		return 0;
+
+	maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+			 ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+
+	if (first_madt) {
+		first_madt = false;
+
+		acpi_v5_kvm_info.maint_irq = gicc->vgic_interrupt;
+		acpi_v5_kvm_info.maint_irq_mode = maint_irq_mode;
+		return 0;
+	}
+
+	/* The maintenance interrupt must be the same for every GICC entry. */
+	if (acpi_v5_kvm_info.maint_irq != gicc->vgic_interrupt ||
+	    acpi_v5_kvm_info.maint_irq_mode != maint_irq_mode)
+		return -EINVAL;
+
+	return 0;
+}
+
+static bool __init gic_acpi_collect_virt_info(void)
+{
+	int count;
+
+	acpi_v5_kvm_info.maint_irq = 0;
+	acpi_v5_kvm_info.maint_irq_mode = 0;
+
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+				      gic_acpi_parse_virt_madt_gicc, 0);
+
+	return count > 0;
+}
+
+static void __init gic_acpi_setup_kvm_info(void)
+{
+	unsigned int maint_irq = 0;
+	int irq;
+
+	if (!gic_acpi_collect_virt_info()) {
+		pr_warn("Unable to get hardware information used for virtualization\n");
+		return;
+	}
+
+	if (acpi_v5_kvm_info.maint_irq) {
+		u32 gsi = FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI) |
+			  FIELD_PREP(GICV5_HWIRQ_ID, acpi_v5_kvm_info.maint_irq);
+
+		irq = acpi_register_gsi(NULL, gsi,
+					acpi_v5_kvm_info.maint_irq_mode,
+					ACPI_ACTIVE_HIGH);
+		if (irq <= 0)
+			return;
+
+		maint_irq = irq;
+	}
+
+	gic_setup_kvm_info(maint_irq);
+}
+#endif
 #else
 static inline void __init gic_of_setup_kvm_info(struct device_node *node)
 {
 }
+
+#ifdef CONFIG_ACPI
+static inline void __init gic_acpi_setup_kvm_info(void)
+{
+}
+#endif
 #endif // CONFIG_KVM
 
 static int __init gicv5_init_common(struct fwnode_handle *parent_domain)
@@ -1264,6 +1356,7 @@ static int __init gic_acpi_init(union acpi_subtable_headers *header, const unsig
 		goto out_irs;
 
 	acpi_set_irq_model(ACPI_IRQ_MODEL_GIC_V5, gic_v5_get_gsi_domain_id);
+	gic_acpi_setup_kvm_info();
 
 	return 0;
 
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v10 19/30] KVM: arm64: Provide assembly for SME register access
From: Mark Rutland @ 2026-05-21 14:51 UTC (permalink / raw)
  To: Mark Brown, Oliver Upton, Marc Zyngier
  Cc: Joey Gouly, Catalin Marinas, Suzuki K Poulose, Will Deacon,
	Paolo Bonzini, Jonathan Corbet, Shuah Khan, Dave Martin,
	Fuad Tabba, Ben Horgan, linux-arm-kernel, kvmarm, linux-kernel,
	kvm, linux-doc, linux-kselftest, Peter Maydell, Eric Auger
In-Reply-To: <20260306-kvm-arm64-sme-v10-19-43f7683a0fb7@kernel.org>

On Fri, Mar 06, 2026 at 05:01:11PM +0000, Mark Brown wrote:
> Provide versions of the SME state save and restore functions for the
> hypervisor to allow it to restore ZA and ZT for guests.
> 
> Signed-off-by: Mark Brown <broonie@kernel.org>
> ---
>  arch/arm64/include/asm/kvm_hyp.h |  2 ++
>  arch/arm64/kvm/hyp/fpsimd.S      | 23 +++++++++++++++++++++++
>  2 files changed, 25 insertions(+)

While this specific instance is simple enough, I don't think we should
continue to duplicate the low level save/restore routines between the
main kernel and KVM hyp code.

I've sent a series that avoids the need for this (and cleans up some
other bits):

  https://lore.kernel.org/linux-arm-kernel/20260521132556.584676-1-mark.rutland@arm.com/

Assuming Marc and Oliver are on board, I'd prefer that we do that
cleanup first, and build the KVM SME support atop.

Mark.

> 
> diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
> index 0317790dd3b7..9b1354d1122c 100644
> --- a/arch/arm64/include/asm/kvm_hyp.h
> +++ b/arch/arm64/include/asm/kvm_hyp.h
> @@ -116,6 +116,8 @@ void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
>  void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
>  void __sve_save_state(void *sve_pffr, u32 *fpsr, int save_ffr);
>  void __sve_restore_state(void *sve_pffr, u32 *fpsr, int restore_ffr);
> +void __sme_save_state(void const *state, bool save_zt);
> +void __sme_restore_state(void const *state, bool restore_zt);
>  
>  u64 __guest_enter(struct kvm_vcpu *vcpu);
>  
> diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S
> index 6e16cbfc5df2..18b7a666016c 100644
> --- a/arch/arm64/kvm/hyp/fpsimd.S
> +++ b/arch/arm64/kvm/hyp/fpsimd.S
> @@ -29,3 +29,26 @@ SYM_FUNC_START(__sve_save_state)
>  	sve_save 0, x1, x2, 3
>  	ret
>  SYM_FUNC_END(__sve_save_state)
> +
> +SYM_FUNC_START(__sme_save_state)
> +	// Caller needs to ensure SMCR updates are visible
> +	_sme_rdsvl	2, 1		// x2 = VL/8
> +	sme_save_za 0, x2, 12		// Leaves x0 pointing to the end of ZA
> +
> +	cbz	x1, 1f
> +	_str_zt 0
> +1:
> +	ret
> +SYM_FUNC_END(__sme_save_state)
> +
> +SYM_FUNC_START(__sme_restore_state)
> +	// Caller needs to ensure SMCR updates are visible
> +	_sme_rdsvl	2, 1		// x2 = VL/8
> +	sme_load_za	0, x2, 12	// Leaves x0 pointing to end of ZA
> +
> +	cbz	x1, 1f
> +	_ldr_zt 0
> +
> +1:
> +	ret
> +SYM_FUNC_END(__sme_restore_state)
> 
> -- 
> 2.47.3
> 


^ permalink raw reply

* [PATCH v2 04/39] KVM: arm64: gic-v5: Define remaining IRS MMIO registers
From: Sascha Bischoff @ 2026-05-21 14:50 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

Complete the set of defined IRS MMIO registers in the GICv5 header
file. Up until now, the set of defined IRS MMIO registers has been
driven by code requirements. However, in order to properly emulate the
IRS MMIO interface in KVM, the full set of IRS MMIO registers needs to
be added.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 include/linux/irqchip/arm-gic-v5.h | 203 +++++++++++++++++++++++++++--
 1 file changed, 194 insertions(+), 9 deletions(-)

diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index 681c5c51207d6..dd7da568ee8b8 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -62,20 +62,34 @@
 #define GICV5_OUTER_SHARE		0b10
 #define GICV5_INNER_SHARE		0b11
 
+#define GICV5_AIDR_COMPONENT_IRS	0b00
+#define GICV5_AIDR_COMPONENT_ITS	0b01
+#define GICV5_AIDR_COMPONENT_IWB	0b10
+
+#define GICV5_AIDR_ARCH_MAJ_REV_V5	0
+#define GICV5_AIDR_ARCH_MIN_REV_V0	0
+
 /*
  * IRS registers and tables structures
  */
 #define GICV5_IRS_IDR0			0x0000
 #define GICV5_IRS_IDR1			0x0004
 #define GICV5_IRS_IDR2			0x0008
+#define GICV5_IRS_IDR3			0x000c
+#define GICV5_IRS_IDR4			0x0010
 #define GICV5_IRS_IDR5			0x0014
 #define GICV5_IRS_IDR6			0x0018
 #define GICV5_IRS_IDR7			0x001c
+#define GICV5_IRS_IIDR			0x0040
+#define GICV5_IRS_AIDR			0x0044
 #define GICV5_IRS_CR0			0x0080
 #define GICV5_IRS_CR1			0x0084
 #define GICV5_IRS_SYNCR			0x00c0
 #define GICV5_IRS_SYNC_STATUSR		0x00c4
+#define GICV5_IRS_SPI_VMR		0x0100
 #define GICV5_IRS_SPI_SELR		0x0108
+#define GICV5_IRS_SPI_DOMAINR		0x010c
+#define GICV5_IRS_SPI_RESAMPLER		0x0110
 #define GICV5_IRS_SPI_CFGR		0x0114
 #define GICV5_IRS_SPI_STATUSR		0x0118
 #define GICV5_IRS_PE_SELR		0x0140
@@ -85,11 +99,51 @@
 #define GICV5_IRS_IST_CFGR		0x0190
 #define GICV5_IRS_IST_STATUSR		0x0194
 #define GICV5_IRS_MAP_L2_ISTR		0x01c0
-
+#define GICV5_IRS_VMT_BASER		0x0200
+#define GICV5_IRS_VMT_CFGR		0x0210
+#define GICV5_IRS_VMT_STATUSR		0x0214
+#define GICV5_IRS_VPE_SELR		0x0240
+#define GICV5_IRS_VPE_DBR		0x0248
+#define GICV5_IRS_VPE_HPPIR		0x0250
+#define GICV5_IRS_VPE_CR0		0x0258
+#define GICV5_IRS_VPE_STATUSR		0x025c
+#define GICV5_IRS_VM_DBR		0x0280
+#define GICV5_IRS_VM_SELR		0x0288
+#define GICV5_IRS_VM_STATUSR		0x028c
+#define GICV5_IRS_VMAP_L2_VMTR		0x02c0
+#define GICV5_IRS_VMAP_VMR		0x02c8
+#define GICV5_IRS_VMAP_VISTR		0x02d0
+#define GICV5_IRS_VMAP_L2_VISTR		0x02d8
+#define GICV5_IRS_VMAP_VPER		0x02e0
+#define GICV5_IRS_SAVE_VMR		0x0300
+#define GICV5_IRS_SAVE_VM_STATUSR	0x0308
+#define GICV5_IRS_MEC_IDR		0x0340
+#define GICV5_IRS_MEC_MECID_R		0x0344
+#define GICV5_IRS_MPAM_IDR		0x0380
+#define GICV5_IRS_MPAM_PARTID_R		0x0384
+#define GICV5_IRS_SWERR_STATUSR		0x03c0
+#define GICV5_IRS_SWERR_SYNDROMER0	0x03c8
+#define GICV5_IRS_SWERR_SYNDROMER1	0x03d0
+
+#define GICV5_IRS_IDR0_IRSID		GENMASK(31, 16)
+#define GICV5_IRS_IDR0_SWE		BIT(12)
+#define GICV5_IRS_IDR0_MPAM		BIT(11)
+#define GICV5_IRS_IDR0_MEC		BIT(10)
+#define GICV5_IRS_IDR0_SETLPI		BIT(9)
+#define GICV5_IRS_IDR0_VIRT_ONE_N	BIT(8)
+#define GICV5_IRS_IDR0_ONE_N		BIT(7)
 #define GICV5_IRS_IDR0_VIRT		BIT(6)
+#define GICV5_IRS_IDR0_PA_RANGE		GENMASK(5, 2)
+#define GICV5_IRS_IDR0_INT_DOM		GENMASK(1, 0)
+
+#define GICV5_IRS_IDR0_INT_DOM_SECURE		0b00
+#define GICV5_IRS_IDR0_INT_DOM_NON_SECURE	0b01
+#define GICV5_IRS_IDR0_INT_DOM_EL3		0b10
+#define GICV5_IRS_IDR0_INT_DOM_REALM		0b11
 
 #define GICV5_IRS_IDR1_PRIORITY_BITS	GENMASK(22, 20)
 #define GICV5_IRS_IDR1_IAFFID_BITS	GENMASK(19, 16)
+#define GICV5_IRS_IDR1_PE_CNT		GENMASK(15, 0)
 
 #define GICV5_IRS_IDR1_PRIORITY_BITS_1BITS	0b000
 #define GICV5_IRS_IDR1_PRIORITY_BITS_2BITS	0b001
@@ -105,13 +159,30 @@
 #define GICV5_IRS_IDR2_LPI		BIT(5)
 #define GICV5_IRS_IDR2_ID_BITS		GENMASK(4, 0)
 
+#define GICV5_IRS_IST_L2SZ_SUPPORT_4KB(r)	FIELD_GET(BIT(11), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_16KB(r)	FIELD_GET(BIT(12), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_64KB(r)	FIELD_GET(BIT(13), (r))
+
+#define GICV5_IRS_IDR3_VMT_LEVELS	BIT(10)
+#define GICV5_IRS_IDR3_VM_ID_BITS	GENMASK(9, 5)
+#define GICV5_IRS_IDR3_VMD_SZ		GENMASK(4, 1)
+#define GICV5_IRS_IDR3_VMD		BIT(0)
+
+#define GICV5_IRS_IDR4_VPE_ID_BITS	GENMASK(9, 6)
+#define GICV5_IRS_IDR4_VPED_SZ		GENMASK(5, 0)
+
 #define GICV5_IRS_IDR5_SPI_RANGE	GENMASK(24, 0)
 #define GICV5_IRS_IDR6_SPI_IRS_RANGE	GENMASK(24, 0)
 #define GICV5_IRS_IDR7_SPI_BASE		GENMASK(23, 0)
 
-#define GICV5_IRS_IST_L2SZ_SUPPORT_4KB(r)	FIELD_GET(BIT(11), (r))
-#define GICV5_IRS_IST_L2SZ_SUPPORT_16KB(r)	FIELD_GET(BIT(12), (r))
-#define GICV5_IRS_IST_L2SZ_SUPPORT_64KB(r)	FIELD_GET(BIT(13), (r))
+#define GICV5_IRS_IIDR_PRODUCT_ID	GENMASK(31, 20)
+#define GICV5_IRS_IIDR_VARIANT		GENMASK(19, 16)
+#define GICV5_IRS_IIDR_REVISION		GENMASK(15, 12)
+#define GICV5_IRS_IIDR_IMPLEMENTER	GENMASK(11, 0)
+
+#define GICV5_IRS_AIDR_COMPONENT	GENMASK(11, 8)
+#define GICV5_IRS_AIDR_ARCHMAJORREV	GENMASK(7, 4)
+#define GICV5_IRS_AIDR_ARCHMINORREV	GENMASK(3, 0)
 
 #define GICV5_IRS_CR0_IDLE		BIT(1)
 #define GICV5_IRS_CR0_IRSEN		BIT(0)
@@ -134,21 +205,39 @@
 
 #define GICV5_IRS_SYNC_STATUSR_IDLE	BIT(0)
 
-#define GICV5_IRS_SPI_STATUSR_V		BIT(1)
-#define GICV5_IRS_SPI_STATUSR_IDLE	BIT(0)
+#define GICV5_IRS_SPI_VMR_VIRT		BIT_ULL(63)
+#define GICV5_IRS_SPI_VMR_VM_ID		GENMASK_ULL(15, 0)
 
 #define GICV5_IRS_SPI_SELR_ID		GENMASK(23, 0)
 
+#define GICV5_IRS_SPI_DOMAINR_DOMAIN	GENMASK(1, 0)
+
+#define GICV5_IRS_SPI_DOMAINR_DOMAIN_SECURE	0b00
+#define GICV5_IRS_SPI_DOMAINR_DOMAIN_NON_SECURE	0b01
+#define GICV5_IRS_SPI_DOMAINR_DOMAIN_EL3	0b10
+#define GICV5_IRS_SPI_DOMAINR_DOMAIN_REALM	0b11
+
+#define GICV5_IRS_SPI_RESAMPLER_ID	GENMASK(23, 0)
+
 #define GICV5_IRS_SPI_CFGR_TM		BIT(0)
 
+#define GICV5_IRS_SPI_CFGR_TM_EDGE	0b0
+#define GICV5_IRS_SPI_CFGR_TM_LEVEL	0b1
+
+#define GICV5_IRS_SPI_STATUSR_V		BIT(1)
+#define GICV5_IRS_SPI_STATUSR_IDLE	BIT(0)
+
 #define GICV5_IRS_PE_SELR_IAFFID	GENMASK(15, 0)
 
+#define GICV5_IRS_PE_STATUSR_ONLINE	BIT(2)
 #define GICV5_IRS_PE_STATUSR_V		BIT(1)
 #define GICV5_IRS_PE_STATUSR_IDLE	BIT(0)
 
 #define GICV5_IRS_PE_CR0_DPS		BIT(0)
 
-#define GICV5_IRS_IST_STATUSR_IDLE	BIT(0)
+#define GICV5_IRS_IST_BASER_ADDR_MASK	GENMASK_ULL(55, 6)
+#define GICV5_IRS_IST_BASER_VALID	BIT_ULL(0)
+#define GICV5_IRS_IST_BASER_ADDR_SHIFT	6ULL
 
 #define GICV5_IRS_IST_CFGR_STRUCTURE	BIT(16)
 #define GICV5_IRS_IST_CFGR_ISTSZ	GENMASK(8, 7)
@@ -166,15 +255,111 @@
 #define GICV5_IRS_IST_CFGR_L2SZ_16K	0b01
 #define GICV5_IRS_IST_CFGR_L2SZ_64K	0b10
 
-#define GICV5_IRS_IST_BASER_ADDR_MASK	GENMASK_ULL(55, 6)
-#define GICV5_IRS_IST_BASER_VALID	BIT_ULL(0)
+#define GICV5_IRS_IST_STATUSR_IDLE	BIT(0)
 
 #define GICV5_IRS_MAP_L2_ISTR_ID	GENMASK(23, 0)
 
+#define GICV5_IRS_VMT_BASER_ADDR	GENMASK_ULL(55, 3)
+#define GICV5_IRS_VMT_BASER_ADDR_SHIFT	3ULL
+#define GICV5_IRS_VMT_BASER_VALID	BIT_ULL(0)
+
+#define GICV5_IRS_VMT_CFGR_STRUCTURE_TWO_LEVEL	0b1
+#define GICV5_IRS_VMT_CFGR_STRUCTURE_LINEAR	0b0
+
+#define GICV5_IRS_VMT_CFGR_STRUCTURE	BIT(16)
+#define GICV5_IRS_VMT_CFGR_VM_ID_BITS	GENMASK(4, 0)
+
+#define GICV5_IRS_VMT_STATUSR_IDLE	BIT(0)
+
+#define GICV5_IRS_VPE_SELR_S		BIT_ULL(63)
+#define GICV5_IRS_VPE_SELR_VPE_ID	GENMASK_ULL(47, 32)
+#define GICV5_IRS_VPE_SELR_VM_ID	GENMASK_ULL(15, 0)
+
+#define GICV5_IRS_VPE_DBR_DBV		BIT_ULL(63)
+#define GICV5_IRS_VPE_DBR_REQ_DB	BIT_ULL(62)
+#define GICV5_IRS_VPE_DBR_DBPM		GENMASK_ULL(36, 32)
+#define GICV5_IRS_VPE_DBR_INTID	GENMASK_ULL(23, 0)
+
+#define GICV5_IRS_VPE_HPPIR_HPPIV	BIT_ULL(32)
+#define GICV5_IRS_VPE_HPPIR_TYPE	GENMASK_ULL(31, 29)
+#define GICV5_IRS_VPE_HPPIR_ID		GENMASK_ULL(23, 0)
+
+#define GICV5_IRS_VPE_CR0_DPS		BIT(0)
+
+#define GICV5_IRS_VPE_STATUSR_V		BIT(1)
+#define GICV5_IRS_VPE_STATUSR_IDLE	BIT(0)
+
+#define GICV5_IRS_VM_DBR_EN		BIT_ULL(63)
+#define GICV5_IRS_VM_DBR_VPE_ID		GENMASK_ULL(15, 0)
+
+#define GICV5_IRS_VM_SELR_VM_ID		GENMASK(15, 0)
+
+#define GICV5_IRS_VM_STATUSR_V		BIT(1)
+#define GICV5_IRS_VM_STATUSR_IDLE	BIT(0)
+
+#define GICV5_IRS_VMAP_L2_VMTR_M	BIT_ULL(63)
+#define GICV5_IRS_VMAP_L2_VMTR_VM_ID	GENMASK_ULL(15, 0)
+
+#define GICV5_IRS_VMAP_VMR_M		BIT_ULL(63)
+#define GICV5_IRS_VMAP_VMR_U		BIT_ULL(62)
+#define GICV5_IRS_VMAP_VMR_VM_ID	GENMASK_ULL(15, 0)
+
+#define GICV5_IRS_VMAP_VISTR_M		BIT_ULL(63)
+#define GICV5_IRS_VMAP_VISTR_U		BIT_ULL(62)
+#define GICV5_IRS_VMAP_VISTR_VM_ID	GENMASK_ULL(47, 32)
+#define GICV5_IRS_VMAP_VISTR_TYPE	GENMASK_ULL(31, 29)
+
+#define GICV5_IRS_VMAP_L2_VISTR_M	BIT_ULL(63)
+#define GICV5_IRS_VMAP_L2_VISTR_VM_ID	GENMASK_ULL(47, 32)
+#define GICV5_IRS_VMAP_L2_VISTR_TYPE	GENMASK_ULL(31, 29)
+#define GICV5_IRS_VMAP_L2_VISTR_ID	GENMASK_ULL(23, 0)
+
+#define GICV5_IRS_VMAP_VPER_M		BIT_ULL(63)
+#define GICV5_IRS_VMAP_VPER_VM_ID	GENMASK_ULL(47, 32)
+#define GICV5_IRS_VMAP_VPER_VPE_ID	GENMASK_ULL(15, 0)
+
+#define GICV5_IRS_SAVE_VMR_VM_ID	GENMASK_ULL(15, 0)
+#define GICV5_IRS_SAVE_VMR_Q		BIT_ULL(62)
+#define GICV5_IRS_SAVE_VMR_S		BIT_ULL(63)
+
+#define GICV5_IRS_SAVE_VM_STATUSR_IDLE	BIT(0)
+#define GICV5_IRS_SAVE_VM_STATUSR_Q	BIT(1)
+
+#define GICV5_IRS_MEC_IDR_MECIDSIZE	GENMASK(3, 0)
+
+#define GICV5_IRS_MEC_MECID_R_MECID	GENMASK(15, 0)
+
+#define GICV5_IRS_MPAM_IDR_HAS_MPAM_SP	BIT(24)
+#define GICV5_IRS_MPAM_IDR_PMG_MAX	GENMASK(23, 16)
+#define GICV5_IRS_MPAM_IDR_PARTID_MAX	GENMASK(15, 0)
+
+#define GICV5_IRS_MPAM_PARTID_R_IDLE	BIT(31)
+#define GICV5_IRS_MPAM_PARTID_R_MPAM_SP	GENMASK(25, 24)
+#define GICV5_IRS_MPAM_PARTID_R_PMG	GENMASK(23, 16)
+#define GICV5_IRS_MPAM_PARTID_R_PARTID	GENMASK(15, 0)
+
+#define GICV5_IRS_SWERR_STATUSR_IMP_EC	GENMASK_ULL(31, 24)
+#define GICV5_IRS_SWERR_STATUSR_EC	GENMASK_ULL(23, 16)
+#define GICV5_IRS_SWERR_STATUSR_OF	BIT_ULL(3)
+#define GICV5_IRS_SWERR_STATUSR_S1V	BIT_ULL(2)
+#define GICV5_IRS_SWERR_STATUSR_S0V	BIT_ULL(1)
+#define GICV5_IRS_SWERR_STATUSR_V	BIT_ULL(0)
+
+#define GICV5_IRS_SWERR_SYNDROMER0_VIRTUAL	BIT_ULL(63)
+#define GICV5_IRS_SWERR_SYNDROMER0_TYPE		GENMASK_ULL(62, 60)
+#define GICV5_IRS_SWERR_SYNDROMER0_ID		GENMASK_ULL(55, 32)
+#define GICV5_IRS_SWERR_SYNDROMER0_VM_ID	GENMASK_ULL(15, 0)
+
+#define GICV5_IRS_SWERR_SYNDROMER1_ADDR	GENMASK_ULL(55, 3)
+
 #define GICV5_ISTL1E_VALID		BIT_ULL(0)
+#define GICV5_IRS_ISTL1E_SIZE		8UL
 
 #define GICV5_ISTL1E_L2_ADDR_MASK	GENMASK_ULL(55, 12)
 
+#define GICV5_IRS_SETLPIR		0x0000
+#define GICV5_IRS_SETLPIR_ID		GENMASK(23, 0)
+
 /*
  * ITS registers and tables structures
  */
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 05/39] arm64/sysreg: Add GICv5 GIC VDPEND and VDRCFG encodings
From: Sascha Bischoff @ 2026-05-21 14:50 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

Add the encodings for the GIC VDPEND and GIC VDRCFG system
instructions. These operate on the virtual interrupt domain, and are
used to make interrupts pending for a VM and to read back the
configuration of a VM's interrupts.

This is part of enabling GICv5 KVM support, and is required for
injection of SPIs and LPIs, and querying the state of in-flight SPIs
to detect their deactivation.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 arch/arm64/include/asm/sysreg.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 7aa08d59d4944..40ff7d25d37b0 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1040,7 +1040,7 @@
 #define GCS_CAP(x)	((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \
 					       GCS_CAP_VALID_TOKEN)
 /*
- * Definitions for GICv5 instructions
+ * Definitions for GICv5 instructions for the Current Domain
  */
 #define GICV5_OP_GIC_CDAFF		sys_insn(1, 0, 12, 1, 3)
 #define GICV5_OP_GIC_CDDI		sys_insn(1, 0, 12, 2, 0)
@@ -1105,6 +1105,22 @@
 #define GICV5_GICR_CDNMIA_TYPE_MASK	GENMASK_ULL(31, 29)
 #define GICV5_GICR_CDNMIA_ID_MASK	GENMASK_ULL(23, 0)
 
+/*
+ * Definitions for GICv5 instructions for the Virtual Domain
+ */
+#define GICV5_OP_GIC_VDPEND		sys_insn(1, 4, 12, 1, 4)
+#define GICV5_OP_GIC_VDRCFG		sys_insn(1, 4, 12, 1, 5)
+
+/* Shift and mask definitions for GIC VDPEND */
+#define GICV5_GIC_VDPEND_PENDING_MASK	BIT_ULL(63)
+#define GICV5_GIC_VDPEND_VM_MASK	GENMASK_ULL(47, 32)
+#define GICV5_GIC_VDPEND_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_VDPEND_ID_MASK	GENMASK_ULL(23, 0)
+
+/* Shift and mask definitions for GIC VDRCFG */
+#define GICV5_GIC_VDRCFG_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_VDRCFG_ID_MASK	GENMASK_ULL(23, 0)
+
 #define gicr_insn(insn)			read_sysreg_s(GICV5_OP_GICR_##insn)
 #define gic_insn(v, insn)		write_sysreg_s(v, GICV5_OP_GIC_##insn)
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 06/39] arm64/sysreg: Update ICC_CR0_EL1 with LINK and LINK_IDLE fields
From: Sascha Bischoff @ 2026-05-21 14:51 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

These fields have been added to the architecture since this register
was added to the generator, and were hence missing.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 arch/arm64/tools/sysreg | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 6c3ff14e561e6..57ab09404267c 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -3736,7 +3736,9 @@ Sysreg	ICC_CR0_EL1	3	1	12	0	1
 Res0	63:39
 Field	38	PID
 Field	37:32	IPPT
-Res0	31:1
+Res0	31:3
+Field	2	LINK_IDLE
+Field	1	LINK
 Field	0	EN
 EndSysreg
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 07/39] KVM: arm64: gic-v5: Extract host IRS caps from IRS config frame
From: Sascha Bischoff @ 2026-05-21 14:51 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

The host irqchip driver provides KVM with a pointer to an IRS's config
frame, which allows KVM to directly interact with the host's IRS. The
MMIO registers in the config frame are used to configure VMs (in
addition to them being used by the host). The IRS's config frame also
includes a set of ID registers which describe the capabilities that
the IRS has.

Stash the pointer to the config frame, and extract the VM capabilities
(from IRS_IDR3 & IRS_IDR4), as well as the IST
capabilities/requirements (IRS_IDR2) from the IRS.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 arch/arm64/kvm/vgic/vgic-v5.c | 46 +++++++++++++++++++++++++++++++++--
 include/kvm/arm_vgic.h        | 26 ++++++++++++++++++++
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index d4789ff3e7402..3f7b132110114 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -11,6 +11,7 @@
 #include "vgic.h"
 
 #define ppi_caps	kvm_vgic_global_state.vgic_v5_ppi_caps
+#define irs_caps	kvm_vgic_global_state.vgic_v5_irs_caps
 
 /*
  * Not all PPIs are guaranteed to be implemented for GICv5. Deterermine which
@@ -34,6 +35,45 @@ static void vgic_v5_get_implemented_ppis(void)
 	__assign_bit(GICV5_ARCH_PPI_PMUIRQ, ppi_caps.impl_ppi_mask, system_supports_pmuv3());
 }
 
+static u32 irs_readl_relaxed(const u32 reg_offset)
+{
+	return readl_relaxed(irs_caps.irs_base + reg_offset);
+}
+
+static void vgic_v5_irs_extract_vm_caps(const struct gic_kvm_info *info)
+{
+	u64 idr;
+
+	irs_caps.irs_base = info->gicv5_irs.base;
+	irs_caps.non_coherent = info->gicv5_irs.non_coherent;
+
+	idr = irs_readl_relaxed(GICV5_IRS_IDR2);
+
+	/* We skip the LPI field as it only applies to physical LPIs */
+	irs_caps.ist_id_bits = FIELD_GET(GICV5_IRS_IDR2_ID_BITS, idr);
+	irs_caps.min_lpi_id_bits = FIELD_GET(GICV5_IRS_IDR2_MIN_LPI_ID_BITS, idr);
+	irs_caps.ist_levels = (idr & GICV5_IRS_IDR2_IST_LEVELS);
+	irs_caps.ist_l2sz = FIELD_GET(GICV5_IRS_IDR2_IST_L2SZ, idr);
+	irs_caps.istmd = (idr & GICV5_IRS_IDR2_ISTMD);
+	irs_caps.istmd_sz = FIELD_GET(GICV5_IRS_IDR2_ISTMD_SZ, idr);
+
+	idr = irs_readl_relaxed(GICV5_IRS_IDR3);
+
+	irs_caps.max_vms = BIT(FIELD_GET(GICV5_IRS_IDR3_VM_ID_BITS, idr));
+	irs_caps.two_level_vmt_support = (idr & GICV5_IRS_IDR3_VMT_LEVELS);
+
+	if (idr & GICV5_IRS_IDR3_VMD)
+		irs_caps.vmd_size = BIT(FIELD_GET(GICV5_IRS_IDR3_VMD_SZ, idr));
+	else
+		irs_caps.vmd_size = 0;
+
+	idr = irs_readl_relaxed(GICV5_IRS_IDR4);
+
+	irs_caps.vped_size = BIT(FIELD_GET(GICV5_IRS_IDR4_VPED_SZ, idr));
+	/* Field stores VPE_ID_BITS - 1 */
+	irs_caps.max_vpes = BIT(FIELD_GET(GICV5_IRS_IDR4_VPE_ID_BITS, idr) + 1);
+}
+
 /*
  * Probe for a vGICv5 compatible interrupt controller, returning 0 on success.
  */
@@ -61,10 +101,12 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 		goto skip_v5;
 	}
 
-	kvm_vgic_global_state.max_gic_vcpus = VGIC_V5_MAX_CPUS;
-
+	vgic_v5_irs_extract_vm_caps(info);
 	vgic_v5_get_implemented_ppis();
 
+	kvm_vgic_global_state.max_gic_vcpus = min(irs_caps.max_vpes,
+						  VGIC_V5_MAX_CPUS);
+
 	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V5);
 	if (ret) {
 		kvm_err("Cannot register GICv5 KVM device.\n");
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index fe49fb56dc3c9..8d65a18fefb80 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -182,6 +182,32 @@ struct vgic_global {
 	struct {
 		DECLARE_BITMAP(impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
 	} vgic_v5_ppi_caps;
+
+	/* GICv5 IRS capabilities */
+	struct {
+		/* Base address of the host IRS's CONFIG_FRAME */
+		void __iomem	*irs_base;
+
+		/* IST Caps */
+		u8		ist_id_bits;
+		bool		ist_levels;
+		u8		ist_l2sz;
+		bool		istmd;
+		u8		istmd_sz;
+
+		/* LPI only */
+		u8		min_lpi_id_bits;
+
+		/* VM Table, VPE Table */
+		bool		two_level_vmt_support;
+		u32		max_vms;
+		u32		max_vpes;
+		u16		vmd_size;
+		u16		vped_size;
+
+		/* Is the IRS coherent with us, or not? */
+		bool		non_coherent;
+	} vgic_v5_irs_caps;
 };
 
 extern struct vgic_global kvm_vgic_global_state;
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 08/39] KVM: arm64: gic-v5: Add VPE doorbell domain
From: Sascha Bischoff @ 2026-05-21 14:51 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

VPE doorbells allow the GICv5 hardware to notify KVM when an SPI or LPI
can be signalled to a non-resident VPE. This provides the mechanism used
to wake blocked vcpus once the hardware determines that the interrupt is
eligible to be delivered.

Add support for a per-VM VPE doorbell irq domain. The domain is created
under the GICv5 LPI domain, with one doorbell allocated per VPE. Store
the allocated doorbell base in the VM's GICv5 state so that later
patches can request per-vcpu doorbell IRQs and use them for IRS
commands and wakeups.

Add the per-VPE doorbell state to the GICv5 CPU interface state. The
doorbell IRQ number is populated when the IRQs are requested, and the
db_fired state is used by later patches once doorbell delivery is wired
up.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 arch/arm64/kvm/vgic/vgic-init.c    |  18 ++--
 arch/arm64/kvm/vgic/vgic-v5.c      | 137 +++++++++++++++++++++++++++++
 arch/arm64/kvm/vgic/vgic.h         |   1 +
 include/kvm/arm_vgic.h             |   4 +
 include/linux/irqchip/arm-gic-v5.h |   2 +
 5 files changed, 156 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 907057881b26a..625d352756fcf 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -492,16 +492,22 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
 	dist->nr_spis = 0;
 	dist->vgic_dist_base = VGIC_ADDR_UNDEF;
 
-	if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+	switch (dist->vgic_model) {
+	case KVM_DEV_TYPE_ARM_VGIC_V2:
+		dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V3:
 		list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
 			vgic_v3_free_redist_region(kvm, rdreg);
 		INIT_LIST_HEAD(&dist->rd_regions);
-	} else {
-		dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
-	}
 
-	if (vgic_supports_direct_irqs(kvm))
-		vgic_v4_teardown(kvm);
+		if (vgic_supports_direct_irqs(kvm))
+			vgic_v4_teardown(kvm);
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V5:
+		vgic_v5_teardown(kvm);
+		break;
+	}
 
 	xa_destroy(&dist->lpi_xa);
 }
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 3f7b132110114..52924408ca990 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -7,6 +7,7 @@
 
 #include <linux/bitops.h>
 #include <linux/irqchip/arm-vgic-info.h>
+#include <linux/irqdomain.h>
 
 #include "vgic.h"
 
@@ -152,6 +153,132 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	return 0;
 }
 
+/*
+ * This set of irq_chip functions is specific for doorbells.
+ */
+static const struct irq_chip vgic_v5_db_irq_chip = {
+	.name = "GICv5-DB",
+	.irq_mask = irq_chip_mask_parent,
+	.irq_unmask = irq_chip_unmask_parent,
+	.irq_eoi = irq_chip_eoi_parent,
+	.irq_set_affinity = irq_chip_set_affinity_parent,
+	.irq_get_irqchip_state = irq_chip_get_parent_state,
+	.irq_set_irqchip_state = irq_chip_set_parent_state,
+	.flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE |
+		 IRQCHIP_MASK_ON_SUSPEND,
+};
+
+static void vgic_v5_irq_db_domain_free(struct irq_domain *domain,
+				       unsigned int virq, unsigned int nr_irqs)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_data *d = irq_domain_get_irq_data(domain, virq + i);
+
+		irq_set_handler(virq + i, NULL);
+		irq_domain_reset_irq_data(d);
+	}
+
+	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+static int vgic_v5_irq_db_domain_alloc(struct irq_domain *domain,
+				       unsigned int virq, unsigned int nr_irqs,
+				       void *arg)
+{
+	const struct irq_chip *chip = &vgic_v5_db_irq_chip;
+	struct vgic_v5_vm *vm = arg;
+	struct irq_data *irqd;
+	int ret;
+
+	if (!vm) {
+		kvm_err("invalid parameter for doorbell irq allocation\n");
+		return -EINVAL;
+	}
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, NULL);
+	if (ret)
+		return ret;
+
+	for (int i = 0; i < nr_irqs; i++) {
+		irq_domain_set_hwirq_and_chip(domain, virq + i, i, chip,
+					      domain->host_data);
+		irqd = irq_desc_get_irq_data(irq_to_desc(virq + i));
+		irqd_set_single_target(irqd);
+	}
+
+	return 0;
+}
+
+static const struct irq_domain_ops vgic_v5_irq_db_domain_ops = {
+	.alloc = vgic_v5_irq_db_domain_alloc,
+	.free = vgic_v5_irq_db_domain_free,
+};
+
+static int vgic_v5_create_per_vm_domain(struct kvm *kvm)
+{
+	struct vgic_v5_vm *vm = &kvm->arch.vgic.gicv5_vm;
+	int nr_vcpus = atomic_read(&kvm->online_vcpus);
+	int id = task_pid_nr(current);
+	int ret, db_virq = 0;
+
+	if (!gicv5_global_data.lpi_domain) {
+		kvm_err("LPI domain uninitialized, can't set up KVM Doorbells\n");
+		return -ENODEV;
+	}
+
+	vm->fwnode = irq_domain_alloc_named_id_fwnode("GICv5-vpe-db", id);
+
+	/*
+	 * KVM per-VM VPE DB domain; child of LPI domain; only ever handles
+	 * doorbells. We know how many doorbells we have, and therefore we
+	 * create a linear domain.
+	 */
+	vm->domain = irq_domain_create_hierarchy(gicv5_global_data.lpi_domain,
+						 0, nr_vcpus, vm->fwnode,
+						 &vgic_v5_irq_db_domain_ops, vm);
+	if (WARN_ON(!vm->domain)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	db_virq = irq_domain_alloc_irqs(vm->domain, nr_vcpus, NUMA_NO_NODE, vm);
+	if (db_virq <= 0) {
+		ret = db_virq;
+		goto err;
+	}
+
+	kvm->arch.vgic.gicv5_vm.vpe_db_base = db_virq;
+
+	return 0;
+
+err:
+	if (db_virq > 0)
+		irq_domain_free_irqs(db_virq, nr_vcpus);
+	if (vm->domain)
+		irq_domain_remove(vm->domain);
+	if (vm->fwnode)
+		irq_domain_free_fwnode(vm->fwnode);
+
+	kvm->arch.vgic.gicv5_vm.vpe_db_base = 0;
+	vm->domain = NULL;
+	vm->fwnode = NULL;
+
+	return ret;
+}
+
+static void vgic_v5_teardown_per_vm_domain(struct vgic_v5_vm *vm)
+{
+	if (!vm->domain)
+		return;
+
+	irq_domain_remove(vm->domain);
+	irq_domain_free_fwnode(vm->fwnode);
+	vm->domain = NULL;
+	vm->fwnode = NULL;
+}
+
 void vgic_v5_reset(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -167,10 +294,16 @@ void vgic_v5_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.vgic_cpu.num_pri_bits = 5;
 }
 
+void vgic_v5_teardown(struct kvm *kvm)
+{
+	vgic_v5_teardown_per_vm_domain(&kvm->arch.vgic.gicv5_vm);
+}
+
 int vgic_v5_init(struct kvm *kvm)
 {
 	struct kvm_vcpu *vcpu;
 	unsigned long idx;
+	int ret;
 
 	if (vgic_initialized(kvm))
 		return 0;
@@ -182,6 +315,10 @@ int vgic_v5_init(struct kvm *kvm)
 		}
 	}
 
+	ret = vgic_v5_create_per_vm_domain(kvm);
+	if (ret)
+		return ret;
+
 	/* We only allow userspace to drive the SW_PPI, if it is implemented. */
 	bitmap_zero(kvm->arch.vgic.gicv5_vm.userspace_ppis,
 		    VGIC_V5_NR_PRIVATE_IRQS);
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index f45f7e3ec4d6e..f2f5fdc3211d7 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -366,6 +366,7 @@ void vgic_debug_destroy(struct kvm *kvm);
 int vgic_v5_probe(const struct gic_kvm_info *info);
 void vgic_v5_reset(struct kvm_vcpu *vcpu);
 int vgic_v5_init(struct kvm *kvm);
+void vgic_v5_teardown(struct kvm *kvm);
 int vgic_v5_map_resources(struct kvm *kvm);
 void vgic_v5_set_ppi_ops(struct kvm_vcpu *vcpu, u32 vintid);
 bool vgic_v5_has_pending_ppi(struct kvm_vcpu *vcpu);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 8d65a18fefb80..bff2b7c896d55 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -392,6 +392,10 @@ struct vgic_v5_vm {
 	 * convenient way to do that).
 	 */
 	DECLARE_BITMAP(vgic_ppi_hmr, VGIC_V5_NR_PRIVATE_IRQS);
+
+	struct fwnode_handle	*fwnode;
+	struct irq_domain	*domain;
+	int			vpe_db_base;
 };
 
 struct vgic_dist {
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index dd7da568ee8b8..1702b57527dee 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -577,6 +577,8 @@ void gicv5_irs_syncr(void);
 
 /* Embedded in kvm.arch */
 struct gicv5_vpe {
+	int			db;
+	bool			db_fired;
 	bool			resident;
 };
 
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v14 07/44] arm64: RMI: Configure the RMM with the host's page size
From: Suzuki K Poulose @ 2026-05-21 14:53 UTC (permalink / raw)
  To: Marc Zyngier, Steven Price
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Zenghui Yu, linux-arm-kernel, linux-kernel,
	Joey Gouly, Alexandru Elisei, Christoffer Dall, Fuad Tabba,
	linux-coco, Ganapatrao Kulkarni, Gavin Shan, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <86a4tsx536.wl-maz@kernel.org>

On 21/05/2026 14:30, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:15 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> RMM v2.0 brings the ability to set the RMM's granule size. Check the
>> feature registers and configure the RMM so that it matches the host's
>> page size. This means that operations can be done with a granularity
>> equal to PAGE_SIZE.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v13:
>>   * Moved out of KVM.
>> ---
>>   arch/arm64/kernel/rmi.c | 42 +++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 42 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 99c1ccc35c11..a14ead5dedda 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -49,6 +49,45 @@ static int rmi_check_version(void)
>>   	return 0;
>>   }
>>   
>> +static int rmi_configure(void)
>> +{
>> +	struct rmm_config *config __free(free_page) = NULL;
>> +	unsigned long ret;
>> +
>> +	config = (struct rmm_config *)get_zeroed_page(GFP_KERNEL);
>> +	if (!config)
>> +		return -ENOMEM;
> 
> This is the sort of buggy construct that is highlighted in
> include/linux/cleanup.h: initialising the object for cleanup with
> NULL, and only later assigning the expected value.
> 
> It may not matter here, but it will catch you (or more probably me) in
> the future.
> 
>> +
>> +	switch (PAGE_SIZE) {
>> +	case SZ_4K:
>> +		config->rmi_granule_size = RMI_GRANULE_SIZE_4KB;
>> +		break;
>> +	case SZ_16K:
>> +		config->rmi_granule_size = RMI_GRANULE_SIZE_16KB;
>> +		break;
>> +	case SZ_64K:
>> +		config->rmi_granule_size = RMI_GRANULE_SIZE_64KB;
>> +		break;
>> +	default:
>> +		pr_err("Unsupported PAGE_SIZE for RMM\n");
> 
> Do you really anticipate PAGE_SIZE being any other value? This is 100%
> dead code. If you want to be extra cautious, have a BUILD_BUG_ON().
> 
>> +		return -EINVAL;
>> +	}
>> +
>> +	ret = rmi_rmm_config_set(virt_to_phys(config));
>> +	if (ret) {
>> +		pr_err("RMM config set failed\n");
>> +		return -EINVAL;
>> +	}
> 
> What is the life cycle of the page when the call succeeds? Is it
> switched back to the NS PAS and allowed to be freed?

It always remains in the NS world. We relay some information in the
NS PAS page, which the RMM consumes. The checks are performed on
the values consumed by the RMM.

Kind regards
Suzuki

> 
>> +
>> +	ret = rmi_rmm_activate();
>> +	if (ret) {
>> +		pr_err("RMM activate failed\n");
>> +		return -ENXIO;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   static int __init arm64_init_rmi(void)
>>   {
>>   	/* Continue without realm support if we can't agree on a version */
>> @@ -60,6 +99,9 @@ static int __init arm64_init_rmi(void)
>>   	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>>   		return 0;
>>   
>> +	if (rmi_configure())
>> +		return 0;
>> +
>>   	return 0;
>>   }
>>   subsys_initcall(arm64_init_rmi);
> 
> Thanks,
> 
> 	M.
> 



^ permalink raw reply

* [PATCH v2 09/39] KVM: arm64: gic-v5: Create & manage VM and VPE tables
From: Sascha Bischoff @ 2026-05-21 14:52 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

GICv5 uses a set of in-memory tables to track and manage VM
state. These must be allocated by the hypervisor, and provided to the
IRS to use.

The VMT (Virtual Machine Table) is a linear or two level table
comprising VMT Entries (VMTE). Each VMTE describes the state for a
single VM. This state includes things such as the SPI and LPI IST
configuration (coming in a future commit), an implementation-defined
VM Descriptor, and a VPE Table (VPET).

The VPET contains one entry per VPE belonging to a VM, and is used to
mark a VPE as valid, as well as providing the address of an
implementation-defined VPE Descriptor, which is used by the hardware
to track and manage VPE state.

This commit adds support for allocating the VMT, and managing the
VMTEs. The VMTEs can be initialised or released for re-use. Allocation
and tracking of unused VMTEs is handled with an IDA.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 arch/arm64/kvm/Makefile              |   2 +-
 arch/arm64/kvm/vgic/vgic-init.c      |   2 +
 arch/arm64/kvm/vgic/vgic-v5-tables.c | 625 +++++++++++++++++++++++++++
 arch/arm64/kvm/vgic/vgic-v5-tables.h |  76 ++++
 arch/arm64/kvm/vgic/vgic-v5.c        |  15 +
 drivers/irqchip/irq-gic-v5-irs.c     |  12 +-
 include/kvm/arm_vgic.h               |   4 +
 include/linux/irqchip/arm-gic-v5.h   |  14 +-
 8 files changed, 740 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/kvm/vgic/vgic-v5-tables.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-v5-tables.h

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 59612d2f277c1..431de9b145ca1 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -24,7 +24,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
 	 vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
 	 vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
 	 vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \
-	 vgic/vgic-v5.o
+	 vgic/vgic-v5.o vgic/vgic-v5-tables.o
 
 kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
 kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 625d352756fcf..079a57c2b18f6 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -154,6 +154,8 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	case KVM_DEV_TYPE_ARM_VGIC_V3:
 		INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
 		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V5:
+		kvm->arch.vgic.gicv5_vm.vm_id = VGIC_V5_VM_ID_INVAL;
 	}
 
 	/*
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.c b/arch/arm64/kvm/vgic/vgic-v5-tables.c
new file mode 100644
index 0000000000000..e9b92893b4e1f
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.c
@@ -0,0 +1,625 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025, 2026 Arm Ltd.
+ */
+
+#include <kvm/arm_vgic.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-v5-tables.h"
+
+#define irs_caps	kvm_vgic_global_state.vgic_v5_irs_caps
+
+static struct vgic_v5_vmt *vmt_info;
+/* Serialises lazy installation of shared second-level VMTs. */
+static DEFINE_MUTEX(vmt_l2_lock);
+static DEFINE_XARRAY(vm_info);
+
+/* Level 1 Virtual Machine Table Entry */
+#define GICV5_VMTEL1E_VALID		BIT_ULL(0)
+/* Note that there is no shift for the address by design */
+#define GICV5_VMTEL1E_L2_ADDR		GENMASK(51, 12)
+
+#define GICV5_VMTEL2E_SIZE		32ULL
+/* An L2 table (two-level VMT) is ALWAYS 4kB! */
+#define GICV5_VMT_L2_TABLE_SIZE		4096ULL
+#define GICV5_VMT_L2_TABLE_ENTRIES	(GICV5_VMT_L2_TABLE_SIZE / GICV5_VMTEL2E_SIZE)
+
+/*
+ * As the L2 VMTE is a large data structure, we are splitting it into 4 parts.
+ * We only mask and shift WITHIN each part for simplicity.
+ */
+/* First 64-bit chunk */
+#define GICV5_VMTEL2E_VALID		BIT_ULL(0)
+#define GICV5_VMTEL2E_VMD_ADDR_SHIFT	3ULL
+#define GICV5_VMTEL2E_VMD_ADDR		GENMASK_ULL(55, 3)
+/* Second 64-bit chunk */
+#define GICV5_VMTEL2E_VPET_ADDR_SHIFT	3ULL
+#define GICV5_VMTEL2E_VPET_ADDR		GENMASK_ULL(55, 3)
+#define GICV5_VMTEL2E_VPE_ID_BITS	GENMASK_ULL(63, 59)
+/* Third & fourth 64-bit chunks (the encodings are the same for each) */
+#define GICV5_VMTEL2E_IST_VALID		BIT_ULL(0)
+#define GICV5_VMTEL2E_IST_L2SZ		GENMASK_ULL(2, 1)
+#define GICV5_VMTEL2E_IST_ADDR_SHIFT	6ULL
+#define GICV5_VMTEL2E_IST_ADDR		GENMASK_ULL(55, 6)
+#define GICV5_VMTEL2E_IST_ISTSZ		GENMASK_ULL(57, 56)
+#define GICV5_VMTEL2E_IST_STRUCTURE	BIT_ULL(58)
+#define GICV5_VMTEL2E_IST_ID_BITS	GENMASK_ULL(63, 59)
+
+/* Virtual PE Table Entry */
+#define GICV5_VPE_VALID			BIT_ULL(0)
+/* The VPED address is shifted right by GICV5_VPED_ADDR_SHIFT when encoded. */
+#define GICV5_VPED_ADDR_SHIFT		3ULL
+#define GICV5_VPED_ADDR			GENMASK_ULL(55, 3)
+
+/*
+ * Our IRS might be coherent or non-coherent. If coherent, we can just emit a
+ * DSB to ensure that we're in sync. However, when non-coherent, we need to
+ * manage our cached data explicitly.
+ *
+ * This helper is used to handle both coherent and non-coherent IRSes, and
+ * handles all combinations of cleaning and invalidating to the PoC.
+ */
+static void vgic_v5_clean_inval(void *va, size_t size)
+{
+	unsigned long base = (unsigned long)va;
+
+	dsb(ishst);
+
+	if (kvm_vgic_global_state.vgic_v5_irs_caps.non_coherent)
+		dcache_clean_inval_poc(base, base + size);
+}
+
+/*
+ * Create a linear VM Table. The number of entries supplied is used directly, as
+ * the size of an L2 VMTE (32 bytes) guarantees that our allocation is aligned
+ * per the GICv5 requirements for the IRS_VMT_BASER.
+ */
+static int vgic_v5_alloc_vmt_linear(unsigned int num_entries)
+{
+	vmt_info->linear.vmt_base = kzalloc_objs(*vmt_info->linear.vmt_base,
+						 num_entries);
+	if (!vmt_info->linear.vmt_base)
+		return -ENOMEM;
+
+	vgic_v5_clean_inval(vmt_info->linear.vmt_base,
+			    num_entries * sizeof(struct vmtl2_entry));
+
+	return 0;
+}
+
+/*
+ * Allocate the first level of a two-level VM table. The second-level VM tables
+ * are allocated on demand (by vgic_v5_alloc_l2_vmt()).
+ */
+static int vgic_v5_alloc_vmt_two_level(unsigned int num_entries)
+{
+	/*
+	 * Each L2 VMT array is always 4k-sized (covering 128 VMs). This is
+	 * mandated by the GICv5 specification (GICv5 EAC0 Specification rule
+	 * D_LSPBK). Hence, round up the number of entries to be at least 128
+	 * (or the next highest power of two as we give the HW the number of VM
+	 * ID bits).
+	 */
+	if (num_entries < GICV5_VMT_L2_TABLE_ENTRIES)
+		num_entries = GICV5_VMT_L2_TABLE_ENTRIES;
+	num_entries = roundup_pow_of_two(num_entries);
+
+	vmt_info->l2.num_l1_ents = (num_entries / GICV5_VMT_L2_TABLE_ENTRIES);
+	vmt_info->l2.vmt_base = kzalloc_objs(*vmt_info->l2.vmt_base,
+					     vmt_info->l2.num_l1_ents);
+	if (!vmt_info->l2.vmt_base)
+		return -ENOMEM;
+
+	vmt_info->l2.l2ptrs = kzalloc_objs(*vmt_info->l2.l2ptrs,
+					   vmt_info->l2.num_l1_ents,
+					   GFP_KERNEL);
+	if (!vmt_info->l2.l2ptrs) {
+		kfree(vmt_info->l2.vmt_base);
+		return -ENOMEM;
+	}
+
+	vgic_v5_clean_inval(vmt_info->l2.vmt_base,
+			    vmt_info->l2.num_l1_ents * sizeof(vmtl1_entry));
+
+	return 0;
+}
+
+/*
+ * Allocate a second level VMT, if required. This can be called eagerly, and
+ * will only perform the allocation if required.
+ */
+static int vgic_v5_alloc_l2_vmt(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	enum gicv5_vcpu_cmd cmd = VMT_L2_MAP;
+	struct vmtl2_entry *l2_table;
+	unsigned int l1_index;
+	int ret;
+
+	/* Nothing to do if we have linear tables! */
+	if (!vmt_info->two_level)
+		return 0;
+
+	/*
+	 * We have 4k-sized L2 tables - this is mandated by the spec for
+	 * two-level VMTs (GICv5 EAC0 Specification rule D_LSPBK). This means
+	 * that we have 128 entries per L1 VMTE.
+	 */
+	l1_index = vm_id / GICV5_VMT_L2_TABLE_ENTRIES;
+
+	guard(mutex)(&vmt_l2_lock);
+
+	/* Already valid? Great! */
+	if (vmt_info->l2.l2ptrs[l1_index])
+		return 0;
+
+	l2_table = kzalloc_objs(*l2_table, GICV5_VMT_L2_TABLE_ENTRIES);
+	if (!l2_table)
+		return -ENOMEM;
+
+	vgic_v5_clean_inval(l2_table, GICV5_VMT_L2_TABLE_SIZE);
+
+	vgic_v5_clean_inval(vmt_info->l2.vmt_base + l1_index,
+			    sizeof(vmtl1_entry));
+
+	WRITE_ONCE(vmt_info->l2.vmt_base[l1_index],
+		   cpu_to_le64(virt_to_phys(l2_table)));
+
+	vgic_v5_clean_inval(vmt_info->l2.vmt_base + l1_index,
+			    sizeof(vmtl1_entry));
+
+	/*
+	 * VMAP in the L2 VMT via the IRS. We use any of the VM's CPUs as a
+	 * conduit for interacting with the host's IRS. In the current case,
+	 * this lets us resolve the VM ID to pass to the hardware.
+	 */
+	ret = irq_set_vcpu_affinity(vgic_v5_vpe_db(vcpu0), &cmd);
+
+	/* We've failed to make the L2 VMT valid - things are very broken! */
+	if (ret) {
+		/* Remove the pointer from L1 table */
+		WRITE_ONCE(vmt_info->l2.vmt_base[l1_index], 0);
+
+		vgic_v5_clean_inval(vmt_info->l2.vmt_base + l1_index,
+				    sizeof(vmtl1_entry));
+
+		kfree(l2_table);
+
+		return ret;
+	}
+
+	vmt_info->l2.l2ptrs[l1_index] = l2_table;
+
+	return 0;
+}
+
+/*
+ * Allocate the top-level VMT. This can either be linear or two-level.
+ */
+int vgic_v5_vmt_allocate(unsigned int max_vpes)
+{
+	int ret;
+
+	/* Allocate the tracking structure */
+	vmt_info = kzalloc_obj(*vmt_info, GFP_KERNEL);
+	if (!vmt_info)
+		return -ENOMEM;
+
+	ida_init(&vmt_info->vm_id_ida);
+	vmt_info->max_vpes = max_vpes;
+	vmt_info->vmd_size = irs_caps.vmd_size;
+	vmt_info->vped_size = irs_caps.vped_size;
+	vmt_info->two_level = irs_caps.two_level_vmt_support;
+	vmt_info->num_entries = irs_caps.max_vms;
+
+	if (vmt_info->two_level)
+		ret = vgic_v5_alloc_vmt_two_level(vmt_info->num_entries);
+	else
+		ret = vgic_v5_alloc_vmt_linear(vmt_info->num_entries);
+
+	/* If anything failed, free our tracking structure before returning */
+	if (ret) {
+		kfree(vmt_info);
+		vmt_info = NULL;
+	}
+
+	return ret;
+}
+
+/*
+ * Free the VMT and associated tracking structures. This isn't strictly expected
+ * to be called in general operation, but instead exists for completeness.
+ */
+int vgic_v5_vmt_free(void)
+{
+	if (!vmt_info)
+		return 0;
+
+	if (!vmt_info->two_level) {
+		kfree(vmt_info->linear.vmt_base);
+	} else {
+		/* Free the L2 tables; kfree(NULL) is safe */
+		for (int i = 0; i < vmt_info->l2.num_l1_ents; ++i)
+			kfree(vmt_info->l2.l2ptrs[i]);
+		kfree(vmt_info->l2.l2ptrs);
+
+		/* And now free the L1 table */
+		kfree(vmt_info->l2.vmt_base);
+	}
+
+	ida_destroy(&vmt_info->vm_id_ida);
+	kfree(vmt_info);
+	vmt_info = NULL;
+
+	return 0;
+}
+
+/*
+ * Look up a VMT Entry by VM ID.
+ */
+static struct vmtl2_entry *vgic_v5_get_l2_vmte(u16 vm_id)
+{
+	unsigned int l1_index, l2_index;
+	struct vmtl2_entry *l2_table;
+
+	if (!vmt_info->two_level)
+		return &vmt_info->linear.vmt_base[vm_id];
+
+	l1_index = vm_id / GICV5_VMT_L2_TABLE_ENTRIES;
+	l2_index = vm_id % GICV5_VMT_L2_TABLE_ENTRIES;
+
+	if (l1_index >= vmt_info->l2.num_l1_ents)
+		return ERR_PTR(-E2BIG);
+
+	if (!vmt_info->l2.l2ptrs[l1_index])
+		return ERR_PTR(-EINVAL);
+
+	l2_table = vmt_info->l2.l2ptrs[l1_index];
+	return &l2_table[l2_index];
+}
+
+/*
+ * Zero a VMT Entry, and flush & invalidate to the PoC, if required.
+ */
+static int vgic_v5_reset_vmte(struct kvm *kvm)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vmtl2_entry *vmte;
+
+	vmte = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(vmte))
+		return PTR_ERR(vmte);
+
+	/*
+	 * The VMT is normal memory shared with the IRS. Invalidate before
+	 * rewriting the entry so that cacheline-granular maintenance cannot
+	 * later push stale data for neighbouring IRS-visible state back to
+	 * memory.
+	 */
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+
+	/*
+	 * Prevent the compiler from eliding the individual VMTE
+	 * stores. Ordering and visibility to the IRS are provided by the
+	 * surrounding cache maintenance and command protocol, not by
+	 * WRITE_ONCE().
+	 *
+	 * The same compiler-access constraint applies to READ_ONCE() users in
+	 * this file: when inspecting IRS-visible table entries, read the field
+	 * exactly once and prevent the compiler from reusing, merging or
+	 * tearing the access. Coherency and freshness for non-coherent IRSes
+	 * still come from the surrounding cache maintenance.
+	 */
+	WRITE_ONCE(vmte->val[0], cpu_to_le64(0ULL));
+	WRITE_ONCE(vmte->val[1], cpu_to_le64(0ULL));
+	WRITE_ONCE(vmte->val[2], cpu_to_le64(0ULL));
+	WRITE_ONCE(vmte->val[3], cpu_to_le64(0ULL));
+
+	/* And make our write visible to the IRS (if non-coherent) */
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+
+	return 0;
+}
+
+/*
+ * Use the IDA to allocate a new VM ID, and track it in the gicv5_vm data
+ * structure. If we're out of VM IDs, the IDA catches that, and we return the
+ * error (-ENOSPC). If we've previously allocated a VM ID, we catch that too and
+ * return -EBUSY.
+ */
+int vgic_v5_allocate_vm_id(struct kvm *kvm)
+{
+	int id;
+
+	if (kvm->arch.vgic.gicv5_vm.vm_id != VGIC_V5_VM_ID_INVAL)
+		return -EBUSY;
+
+	id = ida_alloc_max(&vmt_info->vm_id_ida, vmt_info->num_entries - 1u,
+			   GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	kvm->arch.vgic.gicv5_vm.vm_id = id;
+
+	return 0;
+}
+
+/*
+ * Release the VM ID to allow it to be reallocated in the future.
+ */
+void vgic_v5_release_vm_id(struct kvm *kvm)
+{
+	if (kvm->arch.vgic.gicv5_vm.vm_id == VGIC_V5_VM_ID_INVAL)
+		return;
+
+	ida_free(&vmt_info->vm_id_ida, kvm->arch.vgic.gicv5_vm.vm_id);
+	kvm->arch.vgic.gicv5_vm.vm_id = VGIC_V5_VM_ID_INVAL;
+}
+
+/*
+ * Initialise an entry in the VMT based on the index of the VM.
+ *
+ * Note: We don't mark the VMTE as valid as this needs to be done by
+ * the hardware.
+ */
+int vgic_v5_vmte_init(struct kvm *kvm)
+{
+	int nr_cpus = atomic_read(&kvm->online_vcpus);
+	struct vgic_v5_vm_info *vmi = NULL;
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vmtl2_entry *vmte;
+	void **vped_ptrs = NULL;
+	vpe_entry *vpet = NULL;
+	void *vmd = NULL;
+	int ret;
+	u64 tmp;
+
+	if (nr_cpus > vmt_info->max_vpes)
+		return -E2BIG;
+
+	/*
+	 * If we're using two-level VMTs, L2 is allocated on demand. For linear
+	 * VMTs, this is a NOP.
+	 */
+	ret = vgic_v5_alloc_l2_vmt(kvm);
+	if (ret)
+		return ret;
+
+	vmte = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(vmte))
+		return PTR_ERR(vmte);
+
+	/* If the entry is already valid, something went wrong */
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+	if (le64_to_cpu(READ_ONCE(vmte->val[0])) & GICV5_VMTEL2E_VALID)
+		return -EINVAL;
+
+	ret = vgic_v5_reset_vmte(kvm);
+	if (ret)
+		return ret;
+
+	vmi = kzalloc_obj(*vmi);
+	if (!vmi) {
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+
+	ret = xa_insert(&vm_info, vm_id, vmi, GFP_KERNEL);
+	if (ret)
+		goto out_fail;
+
+	/* Allocate and assign the VM Descriptor, if required. */
+	if (vmt_info->vmd_size != 0) {
+		vmd = kzalloc(vmt_info->vmd_size, GFP_KERNEL);
+		if (!vmd) {
+			ret = -ENOMEM;
+			goto out_fail;
+		}
+
+		/* Stash the VA so we can free it later */
+		vmi->vmd_base = vmd;
+
+		tmp = FIELD_PREP(GICV5_VMTEL2E_VMD_ADDR,
+				 virt_to_phys(vmd) >> GICV5_VMTEL2E_VMD_ADDR_SHIFT);
+		WRITE_ONCE(vmte->val[0], cpu_to_le64(tmp));
+	}
+
+	/*
+	 * Allocate and assign the VPE Table. Round up the number of CPUs to a
+	 * whole power of two as we cannot describe non-powers-of-two in the
+	 * VMTE field as it conveys the number of ID bits used and not the
+	 * number of vPEs.
+	 *
+	 * The IRS encodes the number of IAFFID bits as N - 1, so a VM with a
+	 * single vCPU must still allocate two VPET entries and expose 1 bit.
+	 */
+	nr_cpus = max(2UL, roundup_pow_of_two(nr_cpus));
+	vmi->vpe_id_bits = fls(nr_cpus) - 1;
+
+	vpet = kzalloc_objs(*vpet, nr_cpus);
+	if (!vpet) {
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+
+	/* Stash the VA so we can free it later */
+	vmi->vpet_base = vpet;
+
+	tmp = FIELD_PREP(GICV5_VMTEL2E_VPET_ADDR,
+			 virt_to_phys(vpet) >> GICV5_VMTEL2E_VPET_ADDR_SHIFT);
+	tmp |= FIELD_PREP(GICV5_VMTEL2E_VPE_ID_BITS, vmi->vpe_id_bits);
+	WRITE_ONCE(vmte->val[1], cpu_to_le64(tmp));
+
+	vped_ptrs = kzalloc_objs(*vped_ptrs, nr_cpus, GFP_KERNEL);
+	if (!vped_ptrs) {
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+	vmi->vped_ptrs = vped_ptrs;
+
+	if (vmd)
+		vgic_v5_clean_inval(vmd, vmt_info->vmd_size);
+	vgic_v5_clean_inval(vpet, sizeof(*vpet) * nr_cpus);
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+
+	kvm->arch.vgic.gicv5_vm.vmte_allocated = true;
+
+	return 0;
+
+out_fail:
+	/* kfree(NULL) is safe so we can just kfree() at leisure */
+	kfree(vmd);
+	kfree(vpet);
+	kfree(vped_ptrs);
+	if (vmi)
+		xa_erase(&vm_info, vm_id);
+	kfree(vmi);
+
+	vgic_v5_reset_vmte(kvm);
+
+	return ret;
+}
+
+/*
+ * Release the VMT Entry, freeing up any allocated data structures before
+ * zeroing the VMTE.
+ *
+ * The VMTE must be marked as invalid before it is released.
+ */
+int vgic_v5_vmte_release(struct kvm *kvm)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+	struct vmtl2_entry *vmte;
+	int ret;
+
+	vmte = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(vmte))
+		return PTR_ERR(vmte);
+
+	/* Reject if the VMTE has not been marked as invalid! */
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+	if (le64_to_cpu(READ_ONCE(vmte->val[0])) & GICV5_VMTEL2E_VALID)
+		return -EINVAL;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (WARN_ON_ONCE(!vmi))
+		goto no_vmi;
+
+	for (int i = 0; i < BIT(vmi->vpe_id_bits); i++)
+		kfree(vmi->vped_ptrs[i]);
+	kfree(vmi->vped_ptrs);
+	kfree(vmi->vpet_base);
+	kfree(vmi->vmd_base);
+
+	xa_erase(&vm_info, vm_id);
+	kfree(vmi);
+
+no_vmi:
+	/*
+	 * If we didn't get far enough into allocating a VMTE to create the VM
+	 * info structure, then we just zero the VMTE and move on. There's
+	 * nothing else we can realistically do here.
+	 */
+	ret = vgic_v5_reset_vmte(kvm);
+	if (ret)
+		return ret;
+
+	kvm->arch.vgic.gicv5_vm.vmte_allocated = false;
+
+	return 0;
+}
+
+/*
+ * Allocate a VPE descriptor and provide it to the hardware via the VPE Table.
+ */
+int vgic_v5_vmte_alloc_vpe(struct kvm_vcpu *vcpu)
+{
+	u16 vm_id = vgic_v5_vm_id(vcpu->kvm);
+	u16 vpe_id = vgic_v5_vpe_id(vcpu);
+	struct vgic_v5_vm_info *vmi;
+	vpe_entry tmp, *vpet_base;
+	void *vped;
+
+	/* Make sure we're not over what the hardware supports */
+	if (vpe_id >= vmt_info->max_vpes)
+		return -E2BIG;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (WARN_ON_ONCE(!vmi))
+		return -EINVAL;
+
+	if (vpe_id >= 1 << vmi->vpe_id_bits)
+		return -E2BIG;
+
+	vpet_base = vmi->vpet_base;
+
+	/* If the VPETE for this CPU is already valid we've gone wrong */
+	vgic_v5_clean_inval(&vpet_base[vpe_id], sizeof(*vpet_base));
+	if (le64_to_cpu(READ_ONCE(vpet_base[vpe_id])) & GICV5_VPE_VALID)
+		return -EBUSY;
+
+	/* Alloc VPE Descriptor. Only used by IRS. */
+	vped = kzalloc(vmt_info->vped_size, GFP_KERNEL);
+	if (!vped)
+		return -ENOMEM;
+
+	vmi->vped_ptrs[vpe_id] = vped;
+
+	tmp = FIELD_PREP(GICV5_VPED_ADDR, virt_to_phys(vped) >> GICV5_VPED_ADDR_SHIFT);
+	WRITE_ONCE(vpet_base[vpe_id], cpu_to_le64(tmp));
+
+	vgic_v5_clean_inval(vped, vmt_info->vped_size);
+	vgic_v5_clean_inval(vpet_base + vpe_id, sizeof(vpe_entry));
+
+	return 0;
+}
+
+/*
+ * Free the memory allocated for the VPE descriptor.
+ */
+int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu)
+{
+	u16 vm_id = vgic_v5_vm_id(vcpu->kvm);
+	u16 vpe_id = vgic_v5_vpe_id(vcpu);
+	struct vgic_v5_vm_info *vmi;
+	struct vmtl2_entry *vmte;
+	vpe_entry *vpet_base;
+	void *vped;
+
+	vmte = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(vmte))
+		return PTR_ERR(vmte);
+
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+	if (le64_to_cpu(READ_ONCE(vmte->val[0])) & GICV5_VMTEL2E_VALID)
+		return -EBUSY;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (!vmi)
+		return -EINVAL;
+
+	if (vpe_id >= 1 << vmi->vpe_id_bits)
+		return -E2BIG;
+
+	vpet_base = vmi->vpet_base;
+	WRITE_ONCE(vpet_base[vpe_id], 0ULL);
+
+	vgic_v5_clean_inval(vpet_base + vpe_id, sizeof(vpe_entry));
+
+	/* Free VPE Descriptor. Only used by IRS. */
+	vped = vmi->vped_ptrs[vpe_id];
+	vmi->vped_ptrs[vpe_id] = NULL;
+	kfree(vped);
+
+	return 0;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.h b/arch/arm64/kvm/vgic/vgic-v5-tables.h
new file mode 100644
index 0000000000000..3ca5bc7214fc9
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025, 2026 Arm Ltd.
+ */
+
+#ifndef __KVM_ARM_VGICV5_TABLES_H__
+#define __KVM_ARM_VGICV5_TABLES_H__
+
+#include <linux/idr.h>
+#include <linux/irqchip/arm-gic-v5.h>
+
+/* Level 1 Virtual Machine Table Entry */
+typedef __le64 vmtl1_entry;
+
+/* Level 2 Virtual Machine Table Entry */
+struct vmtl2_entry {
+	__le64 val[4];
+};
+
+/* Virtual PE Table Entry */
+typedef __le64 vpe_entry;
+
+struct vgic_v5_vm_info {
+	void __iomem		*vmd_base;
+	vpe_entry __iomem	*vpet_base;
+	void __iomem		**vped_ptrs;
+	u8			vpe_id_bits;
+};
+
+struct vgic_v5_vmt {
+	union {
+		struct {
+			struct vmtl2_entry *vmt_base;
+			unsigned int num_ents;
+		} linear;
+		struct {
+			vmtl1_entry *vmt_base;
+			struct vmtl2_entry **l2ptrs;
+			unsigned int num_l1_ents;
+		} l2;
+	};
+	bool		two_level;
+	unsigned int	num_entries;
+	unsigned int	max_vpes;
+	size_t		vmd_size;
+	size_t		vped_size;
+	struct ida	vm_id_ida;
+};
+
+static inline u16 vgic_v5_vm_id(struct kvm *kvm)
+{
+	return kvm->arch.vgic.gicv5_vm.vm_id;
+}
+
+static inline u16 vgic_v5_vpe_id(struct kvm_vcpu *vcpu)
+{
+	return vcpu->vcpu_id;
+}
+
+static inline int vgic_v5_vpe_db(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.vgic_cpu.vgic_v5.gicv5_vpe.db;
+}
+
+int vgic_v5_vmt_allocate(unsigned int max_vpes);
+int vgic_v5_vmt_free(void);
+
+int vgic_v5_allocate_vm_id(struct kvm *kvm);
+void vgic_v5_release_vm_id(struct kvm *kvm);
+
+int vgic_v5_vmte_init(struct kvm *kvm);
+int vgic_v5_vmte_release(struct kvm *kvm);
+int vgic_v5_vmte_alloc_vpe(struct kvm_vcpu *vcpu);
+int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 52924408ca990..adfe0b207ef40 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -153,6 +153,20 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	return 0;
 }
 
+static int vgic_v5_db_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+	enum gicv5_vcpu_cmd *cmd = vcpu_info;
+
+	switch (*cmd) {
+	case VMT_L2_MAP:
+	case VMTE_MAKE_VALID:
+	case VMTE_MAKE_INVALID:
+		/* Not yet implemented */
+	default:
+		return -EINVAL;
+	}
+}
+
 /*
  * This set of irq_chip functions is specific for doorbells.
  */
@@ -164,6 +178,7 @@ static const struct irq_chip vgic_v5_db_irq_chip = {
 	.irq_set_affinity = irq_chip_set_affinity_parent,
 	.irq_get_irqchip_state = irq_chip_get_parent_state,
 	.irq_set_irqchip_state = irq_chip_set_parent_state,
+	.irq_set_vcpu_affinity = vgic_v5_db_set_vcpu_affinity,
 	.flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE |
 		 IRQCHIP_MASK_ON_SUSPEND,
 };
diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c
index 607e066821b52..70502b07ec8d7 100644
--- a/drivers/irqchip/irq-gic-v5-irs.c
+++ b/drivers/irqchip/irq-gic-v5-irs.c
@@ -269,24 +269,24 @@ int gicv5_irs_iste_alloc(const u32 lpi)
  * itself is not supported) again serves to make it easier to find physically
  * contiguous blocks of memory.
  */
-static unsigned int gicv5_irs_l2_sz(u32 idr2)
+unsigned int gicv5_irs_l2_sz(u32 l2sz)
 {
 	switch (PAGE_SIZE) {
 	case SZ_64K:
-		if (GICV5_IRS_IST_L2SZ_SUPPORT_64KB(idr2))
+		if (GICV5_IRS_IST_L2SZ_SUPPORT_64KB(l2sz))
 			return GICV5_IRS_IST_CFGR_L2SZ_64K;
 		fallthrough;
 	case SZ_4K:
-		if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(idr2))
+		if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(l2sz))
 			return GICV5_IRS_IST_CFGR_L2SZ_4K;
 		fallthrough;
 	case SZ_16K:
-		if (GICV5_IRS_IST_L2SZ_SUPPORT_16KB(idr2))
+		if (GICV5_IRS_IST_L2SZ_SUPPORT_16KB(l2sz))
 			return GICV5_IRS_IST_CFGR_L2SZ_16K;
 		break;
 	}
 
-	if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(idr2))
+	if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(l2sz))
 		return GICV5_IRS_IST_CFGR_L2SZ_4K;
 
 	return GICV5_IRS_IST_CFGR_L2SZ_64K;
@@ -334,7 +334,7 @@ static int __init gicv5_irs_init_ist(struct gicv5_irs_chip_data *irs_data)
 	lpi_id_bits = min(lpi_id_bits, gicv5_global_data.cpuif_id_bits);
 
 	if (two_levels)
-		l2sz = gicv5_irs_l2_sz(idr2);
+		l2sz = gicv5_irs_l2_sz(FIELD_GET(GICV5_IRS_IDR2_IST_L2SZ, idr2));
 
 	istmd = !!FIELD_GET(GICV5_IRS_IDR2_ISTMD, idr2);
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index bff2b7c896d55..ba32cd71fe0a7 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -374,6 +374,8 @@ struct vgic_redist_region {
 	struct list_head list;
 };
 
+#define VGIC_V5_VM_ID_INVAL		(-1)
+
 struct vgic_v5_vm {
 	/*
 	 * We only expose a subset of PPIs to the guest. This subset is a
@@ -396,6 +398,8 @@ struct vgic_v5_vm {
 	struct fwnode_handle	*fwnode;
 	struct irq_domain	*domain;
 	int			vpe_db_base;
+	u32			vm_id;
+	bool			vmte_allocated;
 };
 
 struct vgic_dist {
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index 1702b57527dee..64e31068d9d17 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -159,9 +159,9 @@
 #define GICV5_IRS_IDR2_LPI		BIT(5)
 #define GICV5_IRS_IDR2_ID_BITS		GENMASK(4, 0)
 
-#define GICV5_IRS_IST_L2SZ_SUPPORT_4KB(r)	FIELD_GET(BIT(11), (r))
-#define GICV5_IRS_IST_L2SZ_SUPPORT_16KB(r)	FIELD_GET(BIT(12), (r))
-#define GICV5_IRS_IST_L2SZ_SUPPORT_64KB(r)	FIELD_GET(BIT(13), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_4KB(r)	FIELD_GET(BIT(0), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_16KB(r)	FIELD_GET(BIT(1), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_64KB(r)	FIELD_GET(BIT(2), (r))
 
 #define GICV5_IRS_IDR3_VMT_LEVELS	BIT(10)
 #define GICV5_IRS_IDR3_VM_ID_BITS	GENMASK(9, 5)
@@ -573,6 +573,7 @@ int gicv5_irs_cpu_to_iaffid(int cpu_id, u16 *iaffid);
 struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id);
 int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type);
 int gicv5_irs_iste_alloc(u32 lpi);
+unsigned int gicv5_irs_l2_sz(u32 l2sz);
 void gicv5_irs_syncr(void);
 
 /* Embedded in kvm.arch */
@@ -617,4 +618,11 @@ void gicv5_deinit_lpis(void);
 
 void __init gicv5_its_of_probe(struct device_node *parent);
 void __init gicv5_its_acpi_probe(void);
+
+enum gicv5_vcpu_cmd {
+	VMT_L2_MAP,		/* Map in a L2 VMT - *may* happen on VM init */
+	VMTE_MAKE_VALID,	/* Make the VMTE valid */
+	VMTE_MAKE_INVALID,	/* Make the VMTE (et al.) invalid */
+};
+
 #endif
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 10/39] KVM: arm64: gic-v5: Introduce guest IST alloc and management
From: Sascha Bischoff @ 2026-05-21 14:52 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

GICv5 guests use Interrupt State Tables (ISTs) to track and manage the
interrupt state for SPIs and LPIs. These ISTs are provided to the host's
IRS via the VMTE.

On a host GICv5 system, SPIs do not require any up-front memory
allocation prior to their use, unlike LPIs which require the OS to
allocate an IST. For a GICv5 guest, the same holds from the guest's
point of view: SPIs should require no explicit memory allocation by the
guest. This means that KVM must provision the memory passed to the IRS
for managing a guest's SPI state.

Allocate the SPI IST before running the guest for the first time. As only
a small number of SPIs are expected, this is always allocated as a linear
IST. The host is responsible for freeing this memory on guest teardown.

For LPIs, the guest provisions memory for its LPI IST. KVM does not pass
that memory directly to the host IRS. Instead, allocate a shadow LPI IST
and pass that to the IRS through the VMTE. The LPI IST may be allocated
as a two-level structure when supported and required by the configured
LPI ID space, as many more LPIs are expected than SPIs. The host frees
this memory on guest teardown.

This commit also extends the doorbell domain to allow the doorbells
themselves to act as a conduit for issuing commands, similar to what
exists for GICv4 support. Effectively, irq_set_vcpu_affinity() becomes
an ioctl-like interface for issuing commands specific to either a VM or
the particular VPE that the doorbell belongs to. Add support for:

        VMT_L2_MAP - Make a second level VM table valid
        VMTE_MAKE_VALID - Make a single VMTE, and hence VM, valid
        VMTE_MAKE_INVALID - Make a single VMTE, and hence VM, invalid
        SPI_VIST_MAKE_VALID - Make the SPI IST valid
        LPI_VIST_MAKE_VALID - Make the LPI IST valid
        LPI_VIST_MAKE_INVALID - Make the LPI IST invalid

None of these commands are plumbed through to the host IRS at this
stage.

There is intentionally no SPI_VIST_MAKE_INVALID command. The SPI IST is
allocated as part of VM creation and is not invalidated while the VM is
live. It is freed when the VM is destroyed, after the VMTE has been made
invalid. The LPI IST, on the other hand, is driven by the guest, which is
free to invalidate and free its LPI IST at any point.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 arch/arm64/kvm/vgic/vgic-v5-tables.c | 527 +++++++++++++++++++++++++++
 arch/arm64/kvm/vgic/vgic-v5-tables.h |  22 ++
 arch/arm64/kvm/vgic/vgic-v5.c        |   3 +
 include/linux/irqchip/arm-gic-v5.h   |   3 +
 4 files changed, 555 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.c b/arch/arm64/kvm/vgic/vgic-v5-tables.c
index e9b92893b4e1f..a1d0f620b7913 100644
--- a/arch/arm64/kvm/vgic/vgic-v5-tables.c
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.c
@@ -59,6 +59,14 @@ static DEFINE_XARRAY(vm_info);
 #define GICV5_VPED_ADDR_SHIFT		3ULL
 #define GICV5_VPED_ADDR			GENMASK_ULL(55, 3)
 
+/*
+ * The LPI and SPI configuration is stored in the 2nd and 3rd 64-bit chunks of
+ * the VMTE (0-based). We call this a section here in an attempt to simplify the
+ * code.
+ */
+#define GICV5_VMTEL2_LPI_SECTION	2
+#define GICV5_VMTEL2_SPI_SECTION	3
+
 /*
  * Our IRS might be coherent or non-coherent. If coherent, we can just emit a
  * DSB to ensure that we're in sync. However, when non-coherent, we need to
@@ -489,6 +497,25 @@ int vgic_v5_vmte_init(struct kvm *kvm)
 	return ret;
 }
 
+/*
+ * The following set of forward declarations makes the code layout a *little*
+ * clearer as it lets us keep the IST-related code together.
+ */
+static int vgic_v5_alloc_linear_ist(struct kvm *kvm, bool spi_ist,
+				    unsigned int id_bits,
+				    unsigned int istsz);
+static int vgic_v5_alloc_l1_ist(struct kvm *kvm, unsigned int id_bits,
+				unsigned int istsz, unsigned int l2_split);
+static int vgic_v5_alloc_l2_ists(struct kvm *kvm, unsigned int id_bits,
+				 unsigned int istsz, unsigned int l2_split);
+static int vgic_v5_alloc_two_level_lpi_ist(struct kvm *kvm,
+					   unsigned int id_bits,
+					   unsigned int istsz,
+					   unsigned int l2_split);
+static int vgic_v5_linear_ist_free(struct kvm *kvm, bool spi);
+static int vgic_v5_two_level_ist_free(struct kvm *kvm, bool spi);
+static int vgic_v5_spi_ist_free(struct kvm *kvm);
+
 /*
  * Release the VMT Entry, freeing up any allocated data structures before
  * zeroing the VMTE.
@@ -521,6 +548,20 @@ int vgic_v5_vmte_release(struct kvm *kvm)
 	kfree(vmi->vpet_base);
 	kfree(vmi->vmd_base);
 
+	/* If we have an LPI IST, free it */
+	if (vmi->h_lpi_ist) {
+		ret = vgic_v5_lpi_ist_free(kvm);
+		if (ret)
+			return ret;
+	}
+
+	/* If we have an SPI IST, free it */
+	if (vmi->h_spi_ist) {
+		ret = vgic_v5_spi_ist_free(kvm);
+		if (ret)
+			return ret;
+	}
+
 	xa_erase(&vm_info, vm_id);
 	kfree(vmi);
 
@@ -623,3 +664,489 @@ int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
+
+/*
+ * Assign an already allocated IST to the VM by populating the fields in the
+ * corresponding VMTE. We re-use this code for both an SPI IST and LPI IST, even
+ * if the paths to reach it might be vastly different.
+ */
+static int vgic_v5_vmte_assign_ist(struct kvm *kvm, phys_addr_t ist_base,
+				   bool two_level, unsigned int id_bits,
+				   unsigned int l2sz, unsigned int istsz,
+				   bool spi_ist)
+{
+	struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	enum gicv5_vcpu_cmd cmd;
+	struct vmtl2_entry *vmte;
+	unsigned int section;
+	u64 tmp;
+	int ret;
+
+	/*
+	 * The L2 VMTE comprises four 64-bit "sections", where sections 2 & 3
+	 * describe the LPI and SPI ISTs, respectively. Both the LPI and SPI
+	 * sections have the same layout, and as we are either operating on SPIs
+	 * or LPIs we pick a section of the VMTE to modify up-front.
+	 *
+	 * See the GICv5 EAC0 Specification 11.2.2 for more details about the
+	 * VMTE layout.
+	 */
+	section = spi_ist ? GICV5_VMTEL2_SPI_SECTION : GICV5_VMTEL2_LPI_SECTION;
+
+	if (ist_base & ~GICV5_VMTEL2E_IST_ADDR) {
+		kvm_err("IST alignment issue! Address: 0x%llx, Mask 0x%llx\n",
+			ist_base, GICV5_VMTEL2E_IST_ADDR);
+		return -EINVAL;
+	}
+
+	vmte = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(vmte))
+		return PTR_ERR(vmte);
+
+	/* Bail if already allocated */
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+	if (le64_to_cpu(READ_ONCE(vmte->val[section])) & GICV5_VMTEL2E_IST_VALID)
+		return -EINVAL;
+
+	tmp = FIELD_PREP(GICV5_VMTEL2E_IST_L2SZ, l2sz);
+	tmp |= FIELD_PREP(GICV5_VMTEL2E_IST_ADDR,
+			ist_base >> GICV5_VMTEL2E_IST_ADDR_SHIFT);
+	tmp |= FIELD_PREP(GICV5_VMTEL2E_IST_ISTSZ, istsz);
+	tmp |= FIELD_PREP(GICV5_VMTEL2E_IST_ID_BITS, id_bits);
+	if (two_level)
+		tmp |= GICV5_VMTEL2E_IST_STRUCTURE;
+
+	WRITE_ONCE(vmte->val[section], cpu_to_le64(tmp));
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+
+	/* Finally, mark the entry as valid */
+	cmd = spi_ist ? SPI_VIST_MAKE_VALID : LPI_VIST_MAKE_VALID;
+	ret = irq_set_vcpu_affinity(vgic_v5_vpe_db(vcpu0), &cmd);
+
+	return ret;
+}
+
+/*
+ * Allocate a Linear IST - always used for SPIs and potentially LPIs.
+ *
+ * The calculation for n has been taken from section 11.2.2 of the GICv5 EAC0
+ * spec.
+ *
+ * NOTE: istsz is the FIELD used by GICv5, not the actual size (or log2() of the
+ * size).
+ */
+static int vgic_v5_alloc_linear_ist(struct kvm *kvm, bool spi_ist,
+				    unsigned int id_bits, unsigned int istsz)
+{
+	const size_t n = id_bits + 1 + istsz;
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+	__le64 *ist;
+	u32 l1sz;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (WARN_ON_ONCE(!vmi))
+		return -EINVAL;
+
+	/*
+	 * Allocate the IST. We only have one level, so we just use the L2 ISTE.
+	 */
+	l1sz = BIT(n + 1);
+	ist = kzalloc(l1sz, GFP_KERNEL);
+	if (!ist)
+		return -ENOMEM;
+
+	if (spi_ist) {
+		vmi->h_spi_ist = ist;
+	} else {
+		vmi->h_lpi_ist_structure = false;
+		vmi->h_lpi_ist = ist;
+	}
+
+	vgic_v5_clean_inval(ist, l1sz);
+
+	return 0;
+}
+
+/*
+ * Allocate the first level of a two-level IST - LPI, only.
+ *
+ * The calculation for n has been taken from section 11.2.2 of the GICv5 EAC0
+ * spec.
+ *
+ * NOTE: istsz and l2sz are the FIELDS used by GICv5, not the actual sizes (or
+ * log2() of the sizes).
+ */
+static int vgic_v5_alloc_l1_ist(struct kvm *kvm, unsigned int id_bits,
+				unsigned int istsz, unsigned int l2sz)
+{
+	const size_t n =  max(5, id_bits - ((10 - istsz) + (2 * l2sz)) + 3 - 1);
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	const u32 l1_size = BIT(n + 1);
+	struct vgic_v5_vm_info *vmi;
+	__le64 *ist;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (!vmi)
+		return -EINVAL;
+
+	ist = kzalloc(l1_size, GFP_KERNEL);
+	if (!ist)
+		return -ENOMEM;
+
+	vmi->h_lpi_ist_structure = true;
+	vmi->h_lpi_ist = ist;
+
+	vgic_v5_clean_inval(ist, l1_size);
+
+	return 0;
+}
+
+/*
+ * Allocate ALL of the second level ISTs for a two-level IST - LPI, only.
+ *
+ * The calculation for n has been taken from section 11.2.2 of the GICv5 EAC0
+ * spec. The l2_size calculation is from section 11.2.3 of the same document.
+ *
+ * NOTE: istsz and l2sz are the FIELDS used by GICv5, not the actual sizes (or
+ * log2() of the sizes).
+ */
+static int vgic_v5_alloc_l2_ists(struct kvm *kvm, unsigned int id_bits,
+				 unsigned int istsz, unsigned int l2sz)
+{
+	const size_t n =  max(5, id_bits - ((10 - istsz) + (2 * l2sz)) + 3 - 1);
+	const int l1_entries = BIT(n + 1) / GICV5_IRS_ISTL1E_SIZE;
+	const size_t l2_size = BIT(11 + (2 * l2sz) + 1);
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+	__le64 *l2ist;
+	__le64 *l1ist;
+	int index;
+	u64 val;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (WARN_ON_ONCE(!vmi))
+		return -EINVAL;
+
+	l1ist = vmi->h_lpi_ist;
+
+	/*
+	 * Allocate the storage for the pointers to the L2 ISTs (used when
+	 * freeing later).
+	 */
+	vmi->h_lpi_l2_ists = kzalloc_objs(*vmi->h_lpi_l2_ists, l1_entries,
+					  GFP_KERNEL);
+	if (!vmi->h_lpi_l2_ists)
+		return -ENOMEM;
+
+	/* Allocate the L2 IST for each L1 IST entry */
+	for (index = 0; index < l1_entries; ++index) {
+		l2ist = kzalloc(l2_size, GFP_KERNEL);
+		if (!l2ist) {
+			while (--index >= 0)
+				kfree(vmi->h_lpi_l2_ists[index]);
+
+			kfree(vmi->h_lpi_l2_ists);
+			vmi->h_lpi_l2_ists = NULL;
+
+			return -ENOMEM;
+		}
+
+		/*
+		 * We are not doing on-demand allocation of the L2 ISTs, and are
+		 * instead provisioning the whole IST up front. This means that
+		 * we are able to mark the L2 ISTs as valid in the L1 ISTEs as
+		 * the overall IST is not yet valid.
+		 */
+		val = (virt_to_phys(l2ist) & GICV5_ISTL1E_L2_ADDR_MASK) |
+		      GICV5_ISTL1E_VALID;
+		l1ist[index] = cpu_to_le64(val);
+
+		vmi->h_lpi_l2_ists[index] = l2ist;
+
+		vgic_v5_clean_inval(l2ist, l2_size);
+	}
+
+	/* Handle CMOs for the whole L1 IST in one go */
+	vgic_v5_clean_inval(l1ist, l1_entries * sizeof(*l1ist));
+
+	return 0;
+}
+
+/* Allocate a two-level IST - LPIs, only */
+static int vgic_v5_alloc_two_level_lpi_ist(struct kvm *kvm, unsigned int id_bits,
+					   unsigned int istsz, unsigned int l2sz)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+	int ret;
+
+	/*
+	 * Allocate the L1 IST first, then all of the L2s. Everything
+	 * is preallocated and we do no on-demand IST allocation. This
+	 * is to avoid needing to track if and when the guest is doing
+	 * on-demand IST allocation.
+	 */
+	ret = vgic_v5_alloc_l1_ist(kvm, id_bits, istsz, l2sz);
+	if (ret)
+		return ret;
+
+	ret = vgic_v5_alloc_l2_ists(kvm, id_bits, istsz, l2sz);
+	if (ret) {
+		/* Free the L1 IST again */
+		vmi = xa_load(&vm_info, vm_id);
+		kfree(vmi->h_lpi_ist);
+		vmi->h_lpi_ist = 0;
+
+		return ret;
+	}
+
+	return 0;
+}
+
+static void vgic_v5_free_allocated_lpi_ist(struct vgic_v5_vm_info *vmi,
+					   unsigned int id_bits,
+					   unsigned int istsz,
+					   unsigned int l2sz)
+{
+	if (!vmi->h_lpi_ist_structure) {
+		kfree(vmi->h_lpi_ist);
+		vmi->h_lpi_ist = NULL;
+		return;
+	}
+
+	if (vmi->h_lpi_l2_ists) {
+		const size_t n = max(5, id_bits - ((10 - istsz) + (2 * l2sz)) + 3 - 1);
+		const int l1_entries = BIT(n + 1) / GICV5_IRS_ISTL1E_SIZE;
+		int index;
+
+		for (index = 0; index < l1_entries; ++index)
+			kfree(vmi->h_lpi_l2_ists[index]);
+
+		kfree(vmi->h_lpi_l2_ists);
+		vmi->h_lpi_l2_ists = NULL;
+	}
+
+	kfree(vmi->h_lpi_ist);
+	vmi->h_lpi_ist = NULL;
+}
+
+static void vgic_v5_free_allocated_spi_ist(struct kvm *kvm)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (WARN_ON_ONCE(!vmi))
+		return;
+
+	kfree(vmi->h_spi_ist);
+	vmi->h_spi_ist = NULL;
+}
+
+/*
+ * Free a Linear IST. Can only happen once the VM is dead.
+ */
+static int vgic_v5_linear_ist_free(struct kvm *kvm, bool spi)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vmtl2_entry *vmte;
+	struct vgic_v5_vm_info *vmi;
+	int section;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (!vmi)
+		return -EINVAL;
+
+	vmte = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(vmte))
+		return PTR_ERR(vmte);
+
+	if (spi) {
+		section = GICV5_VMTEL2_SPI_SECTION;
+		vgic_v5_free_allocated_spi_ist(kvm);
+	} else {
+		section = GICV5_VMTEL2_LPI_SECTION;
+		vgic_v5_free_allocated_lpi_ist(vmi, 0, 0, 0);
+	}
+
+	/* The VM should be dead here, so we can just zero the VMT section */
+	vmte->val[section] = cpu_to_le64(0);
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+
+	return 0;
+}
+
+/*
+ * Free a Two-Level IST. Can only happen once the VM is dead.
+ */
+static int vgic_v5_two_level_ist_free(struct kvm *kvm, bool spi)
+{
+	unsigned int id_bits, istsz, l2sz;
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+	struct vmtl2_entry *vmte;
+	__le64 tmp;
+	int section;
+
+	/* We don't create two-level SPI ISTs, so freeing is a bad idea! */
+	if (spi)
+		return -EINVAL;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (!vmi)
+		return -EINVAL;
+
+	section = GICV5_VMTEL2_LPI_SECTION;
+
+	if (!vmi->h_lpi_ist_structure)
+		return -EINVAL;
+
+	vmte = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(vmte))
+		return PTR_ERR(vmte);
+
+	tmp = le64_to_cpu(READ_ONCE(vmte->val[section]));
+
+	id_bits = FIELD_GET(GICV5_VMTEL2E_IST_ID_BITS, tmp);
+	istsz = FIELD_GET(GICV5_VMTEL2E_IST_ISTSZ, tmp);
+	l2sz = FIELD_GET(GICV5_VMTEL2E_IST_L2SZ, tmp);
+
+	vgic_v5_free_allocated_lpi_ist(vmi, id_bits, istsz, l2sz);
+
+	/* The VM must be dead, so we can just zero the VMT section */
+	vmte->val[section] = cpu_to_le64(0);
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+
+	return 0;
+}
+
+/* Helper to determine ISTE size based on metadata requirements */
+static unsigned int vgic_v5_ist_istsz(unsigned int id_bits)
+{
+	if (!irs_caps.istmd)
+		return GICV5_IRS_IST_CFGR_ISTSZ_4;
+
+	if (id_bits >= irs_caps.istmd_sz)
+		return GICV5_IRS_IST_CFGR_ISTSZ_16;
+
+	return GICV5_IRS_IST_CFGR_ISTSZ_8;
+}
+
+/*
+ * Allocate an IST for SPIs.
+ *
+ * We don't anticipate a large number of SPIs being allocated. Therefore, we
+ * always allocate a Linear IST for SPIs. This will need to be revisited should
+ * that assumption no longer hold.
+ */
+int vgic_v5_spi_ist_allocate(struct kvm *kvm, unsigned int id_bits)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+	phys_addr_t base_addr;
+	unsigned int istsz;
+	int ret;
+
+	istsz = vgic_v5_ist_istsz(id_bits);
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (WARN_ON_ONCE(!vmi))
+		return -EINVAL;
+
+	ret = vgic_v5_alloc_linear_ist(kvm, true, id_bits, istsz);
+	if (ret)
+		return ret;
+	base_addr = virt_to_phys(vmi->h_spi_ist);
+
+	ret = vgic_v5_vmte_assign_ist(kvm, base_addr, false, id_bits, 0, istsz,
+				      true);
+	if (ret) {
+		vgic_v5_free_allocated_spi_ist(kvm);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Free the IST for SPIs. Should only happen once the VM is dead.
+ */
+static int vgic_v5_spi_ist_free(struct kvm *kvm)
+{
+	return vgic_v5_linear_ist_free(kvm, true);
+}
+
+/*
+ * Allocate an IST for LPIs.
+ *
+ * Unlike with SPIs, we anticipate that the guest will allocate a relatively
+ * large number of LPIs. Therefore, while we support doing a linear LPI IST, it
+ * is expected that LPI ISTs will be two-level.
+ */
+int vgic_v5_lpi_ist_alloc(struct kvm *kvm, unsigned int id_bits)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+	unsigned int istsz, l2sz;
+	phys_addr_t phys_addr;
+	bool two_level;
+	int ret;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (WARN_ON_ONCE(!vmi))
+		return -EINVAL;
+
+	if (vmi->h_lpi_ist)
+		return -EBUSY;
+
+	istsz = vgic_v5_ist_istsz(id_bits);
+	l2sz = gicv5_irs_l2_sz(irs_caps.ist_l2sz);
+
+	/*
+	 * Determine if we want to create a Linear or a Two-Level IST.
+	 *
+	 * A two-level IST is only required when a single L2 IST cannot cover
+	 * the requested ID space. This depends on the L2 IST size selected for
+	 * the IRS, not PAGE_SIZE. Using PAGE_SIZE here would switch to
+	 * two-level too early when the selected L2 IST is larger than a page,
+	 * and the allocation sizing arithmetic would underflow.
+	 */
+	two_level = irs_caps.ist_levels &&
+		id_bits > ((10 - istsz) + (2 * l2sz));
+
+	if (!two_level)
+		ret = vgic_v5_alloc_linear_ist(kvm, false /* LPIs, not SPIs */,
+					       id_bits, istsz);
+	else
+		ret = vgic_v5_alloc_two_level_lpi_ist(kvm, id_bits, istsz,
+						      l2sz);
+
+	if (ret)
+		return ret;
+
+	phys_addr = virt_to_phys(vmi->h_lpi_ist);
+	ret = vgic_v5_vmte_assign_ist(kvm, phys_addr, two_level, id_bits, l2sz,
+				      istsz, false);
+	if (ret)
+		vgic_v5_free_allocated_lpi_ist(vmi, id_bits, istsz, l2sz);
+
+	return ret;
+}
+
+/* Free the LPI IST again */
+int vgic_v5_lpi_ist_free(struct kvm *kvm)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *vmi;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (!vmi)
+		return -ENXIO;
+
+	if (!vmi->h_lpi_ist_structure)
+		return vgic_v5_linear_ist_free(kvm, false);
+	else
+		return vgic_v5_two_level_ist_free(kvm, false);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.h b/arch/arm64/kvm/vgic/vgic-v5-tables.h
index 3ca5bc7214fc9..81fed6c5b1559 100644
--- a/arch/arm64/kvm/vgic/vgic-v5-tables.h
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.h
@@ -25,6 +25,24 @@ struct vgic_v5_vm_info {
 	vpe_entry __iomem	*vpet_base;
 	void __iomem		**vped_ptrs;
 	u8			vpe_id_bits;
+
+	/*
+	 * Both the LPI and SPI ISTs are allocated by the hypervisor. While it
+	 * would be possible to track and access them by iterating over the ISTs
+	 * themselves, it makes more sense to store pointers to the ISTs.
+	 *
+	 * The LPI IST can either be two-level or linear. Hence, we keep track
+	 * of the structure. If it is two-level, we retain pointers to the L1
+	 * IST and to each L2 IST array. If it is linear, we just store the base
+	 * address of the IST array.
+	 *
+	 * The SPI IST is linear, and therefore we just store the base address
+	 * of the SPI IST array.
+	 */
+	bool			h_lpi_ist_structure;
+	__le64			*h_lpi_ist;
+	__le64			**h_lpi_l2_ists;
+	__le64			*h_spi_ist;
 };
 
 struct vgic_v5_vmt {
@@ -73,4 +91,8 @@ int vgic_v5_vmte_release(struct kvm *kvm);
 int vgic_v5_vmte_alloc_vpe(struct kvm_vcpu *vcpu);
 int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu);
 
+int vgic_v5_spi_ist_allocate(struct kvm *kvm, unsigned int id_bits);
+int vgic_v5_lpi_ist_alloc(struct kvm *kvm, unsigned int id_bits);
+int vgic_v5_lpi_ist_free(struct kvm *kvm);
+
 #endif
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index adfe0b207ef40..120eadff9a128 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -161,6 +161,9 @@ static int vgic_v5_db_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
 	case VMT_L2_MAP:
 	case VMTE_MAKE_VALID:
 	case VMTE_MAKE_INVALID:
+	case SPI_VIST_MAKE_VALID:
+	case LPI_VIST_MAKE_VALID:
+	case LPI_VIST_MAKE_INVALID:
 		/* Not yet implemented */
 	default:
 		return -EINVAL;
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index 64e31068d9d17..ef649faeeb0ff 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -623,6 +623,9 @@ enum gicv5_vcpu_cmd {
 	VMT_L2_MAP,		/* Map in a L2 VMT - *may* happen on VM init */
 	VMTE_MAKE_VALID,	/* Make the VMTE valid */
 	VMTE_MAKE_INVALID,	/* Make the VMTE (et al.) invalid */
+	SPI_VIST_MAKE_VALID,	/* No corresponding invalid */
+	LPI_VIST_MAKE_VALID,	/* Triggered by a guest */
+	LPI_VIST_MAKE_INVALID,	/* Triggered by a guest */
 };
 
 #endif
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v14 00/28] Add new general DRM property "color format"
From: Daniel Stone @ 2026-05-21 14:53 UTC (permalink / raw)
  To: Nicolas Frattaroli
  Cc: Harry Wentland, Leo Li, Rodrigo Siqueira, Alex Deucher,
	Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	Andrzej Hajda, Neil Armstrong, Robert Foss, Laurent Pinchart,
	Jonas Karlman, Jernej Skrabec, Sandy Huang, Heiko Stübner,
	Andy Yan, Jani Nikula, Rodrigo Vivi, Joonas Lahtinen,
	Tvrtko Ursulin, Dmitry Baryshkov, Sascha Hauer, Rob Herring,
	Jonathan Corbet, Shuah Khan, kernel, amd-gfx, dri-devel,
	linux-kernel, linux-arm-kernel, linux-rockchip, intel-gfx,
	intel-xe, linux-doc, wayland-devel, Werner Sembach,
	Andri Yngvason, Cristian Ciocaltea, Marius Vlad, Dmitry Baryshkov,
	Andy Yan
In-Reply-To: <20260423-color-format-v14-0-449a419ccbd4@collabora.com>

Hi there,

On Thu, 23 Apr 2026 at 20:04, Nicolas Frattaroli
<nicolas.frattaroli@collabora.com> wrote:
> We have an implementation in Weston at
> https://gitlab.freedesktop.org/wayland/weston/-/merge_requests/1825 that
> adds support for this property. This patch series has been tested
> against that MR on i915 (HDMI, DP), amdgpu (HDMI, DP) and on rockchip
> (HDMI).

This MR is R-b me.

> General notes on the approach taken by me: instead of silently switching
> to a different format than was explicitly requested, or even worse,
> outputting something to the sink the sink doesn't support, bubble up an
> error to userspace instead. "color format" is a "I want this" type
> property, not a "force this" type property, i.e. the kernel will respect
> the limits imposed by the hardware.

Yes! If userspace wants a fallback chain, it should encode it itself
through a series of test commits, rather than adding the sequential
logic to the kernel. Doing that might work for one axis, but pretty
quickly disintegrates when there are multiple parameters to perhaps
fall back on.

I had minor comments on 03 and 20, but they're Rb me with the obvious
fixes. 11, 12, and 19 are Acked-by me, as I don't quite know the
hardware specifics well enough to say. The rest are Reviewed-by me.

I suggest you merge the common code and VOP2/DW-QP implementations via
drm-misc, leaving Intel and AMD to merge through their own trees
whenever they're ready. We'll merge the Weston implementation when it
lands in DRM.

Thanks to you and all prior cooks for all the work, and to Maxime and
Dmitry for the help and review as well.

Cheers,
Daniel


^ permalink raw reply

* [PATCH v2 11/39] KVM: arm64: gic-v5: Implement VMT/vIST IRS MMIO Ops
From: Sascha Bischoff @ 2026-05-21 14:52 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

GICv5 has rules about which fields of a VMTE (or L1 VMT) may be
directly written by the host once the table is valid. This ensures
that no stale state is cached by the hardware, and provides a clear
interface for making VMs, ISTs, etc, valid.

The hypervisor is responsible for populating the VMTE for a
VM. However, it is not permitted to write the Valid bit (as the VM
table is already valid). Instead, the VM is made valid via an IRS MMIO
Op. The same applies to the ISTs - they must be made valid via the
host IRS.

This commit adds support for:

* Making level 1 VMTs valid (only), allowing for dynamic level 2 array
  allocation.
* Making VMTEs (VMs) valid or invalid
* Making SPI/LPI ISTs valid or invalid for a specific VM

As part of this commit, the following vcpu_affinity-based commands are
plumbed in:

        VMT_L2_MAP - Make a second level VM table valid
        VMTE_MAKE_VALID - Make a single VMTE (and hence VM) valid
        VMTE_MAKE_INVALID - Make a single VMTE (and hence VM) invalid
        SPI_VIST_MAKE_VALID - Make the SPI IST valid
        LPI_VIST_MAKE_VALID - Make the LPI IST valid
        LPI_VIST_MAKE_INVALID - Make the LPI IST invalid

Note: the lack of SPI_VIST_MAKE_INVALID is intentional.

When successfully probing for a GICv5, the VMT is allocated, and is
made valid via the IRS's MMIO interface. Treat failures while
allocating or assigning the VMT as hard GICv5 probe failures. At that
point the IRS VM table state is a prerequisite for vGICv5 operation,
and falling back to the legacy path would leave the host without a
valid GICv5 VM table setup. Later failures can only fall back once the
IRS VMT state has been successfully cleared.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 arch/arm64/kvm/vgic/vgic-v5-tables.c |  58 ++++++---
 arch/arm64/kvm/vgic/vgic-v5-tables.h |   2 +
 arch/arm64/kvm/vgic/vgic-v5.c        | 188 ++++++++++++++++++++++++++-
 3 files changed, 225 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.c b/arch/arm64/kvm/vgic/vgic-v5-tables.c
index a1d0f620b7913..5c87c6c27087a 100644
--- a/arch/arm64/kvm/vgic/vgic-v5-tables.c
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.c
@@ -67,6 +67,21 @@ static DEFINE_XARRAY(vm_info);
 #define GICV5_VMTEL2_LPI_SECTION	2
 #define GICV5_VMTEL2_SPI_SECTION	3
 
+static int vgic_v5_alloc_linear_ist(struct kvm *kvm, bool spi_ist,
+				    unsigned int id_bits,
+				    unsigned int istsz);
+static int vgic_v5_alloc_l1_ist(struct kvm *kvm, unsigned int id_bits,
+				unsigned int istsz, unsigned int l2_split);
+static int vgic_v5_alloc_l2_ists(struct kvm *kvm, unsigned int id_bits,
+				 unsigned int istsz, unsigned int l2_split);
+static int vgic_v5_alloc_two_level_lpi_ist(struct kvm *kvm,
+					   unsigned int id_bits,
+					   unsigned int istsz,
+					   unsigned int l2_split);
+static int vgic_v5_linear_ist_free(struct kvm *kvm, bool spi);
+static int vgic_v5_two_level_ist_free(struct kvm *kvm, bool spi);
+static int vgic_v5_spi_ist_free(struct kvm *kvm);
+
 /*
  * Our IRS might be coherent or non-coherent. If coherent, we can just emit a
  * DSB to ensure that we're in sync. However, when non-coherent, we need to
@@ -497,25 +512,6 @@ int vgic_v5_vmte_init(struct kvm *kvm)
 	return ret;
 }
 
-/*
- * The following set of forward declarations makes the code layout a *little*
- * clearer as it lets us keep the IST-related code together.
- */
-static int vgic_v5_alloc_linear_ist(struct kvm *kvm, bool spi_ist,
-				    unsigned int id_bits,
-				    unsigned int istsz);
-static int vgic_v5_alloc_l1_ist(struct kvm *kvm, unsigned int id_bits,
-				unsigned int istsz, unsigned int l2_split);
-static int vgic_v5_alloc_l2_ists(struct kvm *kvm, unsigned int id_bits,
-				 unsigned int istsz, unsigned int l2_split);
-static int vgic_v5_alloc_two_level_lpi_ist(struct kvm *kvm,
-					   unsigned int id_bits,
-					   unsigned int istsz,
-					   unsigned int l2_split);
-static int vgic_v5_linear_ist_free(struct kvm *kvm, bool spi);
-static int vgic_v5_two_level_ist_free(struct kvm *kvm, bool spi);
-static int vgic_v5_spi_ist_free(struct kvm *kvm);
-
 /*
  * Release the VMT Entry, freeing up any allocated data structures before
  * zeroing the VMTE.
@@ -665,6 +661,23 @@ int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+phys_addr_t vgic_v5_get_vmt_base(void)
+{
+	phys_addr_t vmt_base;
+
+	if (!vmt_info->two_level)
+		vmt_base = virt_to_phys(vmt_info->linear.vmt_base);
+	else
+		vmt_base = virt_to_phys(vmt_info->l2.vmt_base);
+
+	return vmt_base;
+}
+
+u8 vgic_v5_vmt_vpe_id_bits(void)
+{
+	return fls(vmt_info->max_vpes) - 1;
+}
+
 /*
  * Assign an already allocated IST to the VM by populating the fields in the
  * corresponding VMTE. We re-use this code for both an SPI IST and LPI IST, even
@@ -723,8 +736,13 @@ static int vgic_v5_vmte_assign_ist(struct kvm *kvm, phys_addr_t ist_base,
 	/* Finally, mark the entry as valid */
 	cmd = spi_ist ? SPI_VIST_MAKE_VALID : LPI_VIST_MAKE_VALID;
 	ret = irq_set_vcpu_affinity(vgic_v5_vpe_db(vcpu0), &cmd);
+	if (ret) {
+		WRITE_ONCE(vmte->val[section], 0ULL);
+		vgic_v5_clean_inval(vmte, sizeof(*vmte));
+		return ret;
+	}
 
-	return ret;
+	return 0;
 }
 
 /*
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.h b/arch/arm64/kvm/vgic/vgic-v5-tables.h
index 81fed6c5b1559..acd862b8806d1 100644
--- a/arch/arm64/kvm/vgic/vgic-v5-tables.h
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.h
@@ -82,6 +82,8 @@ static inline int vgic_v5_vpe_db(struct kvm_vcpu *vcpu)
 
 int vgic_v5_vmt_allocate(unsigned int max_vpes);
 int vgic_v5_vmt_free(void);
+phys_addr_t vgic_v5_get_vmt_base(void);
+u8 vgic_v5_vmt_vpe_id_bits(void);
 
 int vgic_v5_allocate_vm_id(struct kvm *kvm);
 void vgic_v5_release_vm_id(struct kvm *kvm);
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 120eadff9a128..f9578c2a634a4 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -10,10 +10,14 @@
 #include <linux/irqdomain.h>
 
 #include "vgic.h"
+#include "vgic-v5-tables.h"
 
 #define ppi_caps	kvm_vgic_global_state.vgic_v5_ppi_caps
 #define irs_caps	kvm_vgic_global_state.vgic_v5_irs_caps
 
+static int vgic_v5_irs_assign_vmt(bool two_level, u8 vm_id_bits, phys_addr_t vmt_base);
+static int vgic_v5_irs_clear_vmt(void);
+
 /*
  * Not all PPIs are guaranteed to be implemented for GICv5. Deterermine which
  * ones are, and generate a mask.
@@ -36,11 +40,32 @@ static void vgic_v5_get_implemented_ppis(void)
 	__assign_bit(GICV5_ARCH_PPI_PMUIRQ, ppi_caps.impl_ppi_mask, system_supports_pmuv3());
 }
 
+/*
+ * The IRS MMIO interface is shared between all VMs, so make sure we don't do
+ * anything stupid!
+ */
+static DEFINE_RAW_SPINLOCK(global_irs_lock);
+
 static u32 irs_readl_relaxed(const u32 reg_offset)
 {
 	return readl_relaxed(irs_caps.irs_base + reg_offset);
 }
 
+static void irs_writel_relaxed(const u32 val, const u32 reg_offset)
+{
+	writel_relaxed(val, irs_caps.irs_base + reg_offset);
+}
+
+static u64 irs_readq_relaxed(const u32 reg_offset)
+{
+	return readq_relaxed(irs_caps.irs_base + reg_offset);
+}
+
+static void irs_writeq_relaxed(const u64 val, const u32 reg_offset)
+{
+	writeq_relaxed(val, irs_caps.irs_base + reg_offset);
+}
+
 static void vgic_v5_irs_extract_vm_caps(const struct gic_kvm_info *info)
 {
 	u64 idr;
@@ -85,6 +110,7 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	int ret;
 
 	kvm_vgic_global_state.type = VGIC_V5;
+	kvm_vgic_global_state.max_gic_vcpus = VGIC_V5_MAX_CPUS;
 
 	kvm_vgic_global_state.vcpu_base = 0;
 	kvm_vgic_global_state.vctrl_base = NULL;
@@ -105,12 +131,49 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	vgic_v5_irs_extract_vm_caps(info);
 	vgic_v5_get_implemented_ppis();
 
+	/*
+	 * Even if the HW supports more per-VM vCPUs, artificially cap as we
+	 * can't use them all.
+	 */
+	kvm_vgic_global_state.max_gic_vcpus = min(irs_caps.max_vpes,
+						  VGIC_V5_MAX_CPUS);
+
+	/*
+	 * GICv5 requires a set of tables to be allocated in order to manage
+	 * VMs. We allocate them in advance here, which alas means that we
+	 * already have to make a decision regarding the maximum number of VMs
+	 * we want to run. For now, we match the maximum number offered by the
+	 * hardware, but this might not be a wise choice in the long term.
+	 */
+	ret = vgic_v5_vmt_allocate(kvm_vgic_global_state.max_gic_vcpus);
+	if (ret) {
+		kvm_err("Failed to allocate the GICv5 VM tables; no GICv5 support\n");
+		return -ENODEV;
+	}
+
+	/*
+	 * We've now allocated the VM table, but the host's IRS doesn't know
+	 * about it yet. Provide the base address of the VMT to the IRS, as well
+	 * as the number of ID bits that it covers and the structure used
+	 * (linear/two-level).
+	 */
+	ret = vgic_v5_irs_assign_vmt(irs_caps.two_level_vmt_support,
+				     ilog2(irs_caps.max_vms),
+				     vgic_v5_get_vmt_base());
+	if (ret) {
+		kvm_err("Failed to assign the GICv5 VM tables to the IRS; no GICv5 support\n");
+		vgic_v5_vmt_free();
+		return -ENODEV;
+	}
+
 	kvm_vgic_global_state.max_gic_vcpus = min(irs_caps.max_vpes,
 						  VGIC_V5_MAX_CPUS);
 
 	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V5);
 	if (ret) {
 		kvm_err("Cannot register GICv5 KVM device.\n");
+		WARN_ON(vgic_v5_irs_clear_vmt());
+		vgic_v5_vmt_free();
 		goto skip_v5;
 	}
 
@@ -138,12 +201,13 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
 	if (ret) {
 		kvm_err("Cannot register GICv3-legacy KVM device.\n");
-		return ret;
+		/* vGICv5 should still work */
+		return v5_registered ? 0 : ret;
 	}
 
 	/* We potentially limit the max VCPUs further than we need to here */
 	kvm_vgic_global_state.max_gic_vcpus = min(VGIC_V3_MAX_CPUS,
-						  VGIC_V5_MAX_CPUS);
+						  kvm_vgic_global_state.max_gic_vcpus);
 
 	static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
 	kvm_info("GCIE legacy system register CPU interface\n");
@@ -153,18 +217,136 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	return 0;
 }
 
+/*
+ * Wait for completion of a change in any of IRS_VMT_BASER, IRS_VMAP_L2_VMTR,
+ * IRS_VMAP_VMR, IRS_VMAP_VPER, IRS_VMAP_VISTR, IRS_VMAP_L2_VISTR.
+ */
+static int vgic_v5_irs_wait_for_vm_op(void)
+{
+	return gicv5_wait_for_op_atomic(irs_caps.irs_base,
+					GICV5_IRS_VMT_STATUSR,
+					GICV5_IRS_VMT_STATUSR_IDLE,
+					NULL);
+}
+
+static int vgic_v5_irs_write_vm_mmio_reg(u64 val, u32 offset)
+{
+	int ret;
+
+	guard(raw_spinlock_irqsave)(&global_irs_lock);
+
+	/* Make sure that we are idle to begin with */
+	ret = vgic_v5_irs_wait_for_vm_op();
+	if (ret)
+		return ret;
+
+	irs_writeq_relaxed(val, offset);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int vgic_v5_irs_assign_vmt(bool two_level, u8 vm_id_bits,
+				  phys_addr_t vmt_base)
+{
+	u64 vmt_baser;
+	u32 vmt_cfgr;
+
+	guard(raw_spinlock_irqsave)(&global_irs_lock);
+
+	vmt_baser = irs_readq_relaxed(GICV5_IRS_VMT_BASER);
+	if (!!FIELD_GET(GICV5_IRS_VMT_BASER_VALID, vmt_baser))
+		return -EBUSY;
+
+	vmt_cfgr = FIELD_PREP(GICV5_IRS_VMT_CFGR_VM_ID_BITS, vm_id_bits);
+	if (two_level)
+		vmt_cfgr |= FIELD_PREP(GICV5_IRS_VMT_CFGR_STRUCTURE,
+				       GICV5_IRS_VMT_CFGR_STRUCTURE_TWO_LEVEL);
+
+	irs_writel_relaxed(vmt_cfgr, GICV5_IRS_VMT_CFGR);
+
+	/* The base address is intentionally only masked and not shifted */
+	vmt_baser = FIELD_PREP(GICV5_IRS_VMT_BASER_VALID, true) |
+		    (vmt_base & GICV5_IRS_VMT_BASER_ADDR);
+	irs_writeq_relaxed(vmt_baser, GICV5_IRS_VMT_BASER);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int vgic_v5_irs_clear_vmt(void)
+{
+	return vgic_v5_irs_write_vm_mmio_reg(0, GICV5_IRS_VMT_BASER);
+}
+
+static int vgic_v5_irs_vmap_l2_vmt(u16 vm_id)
+{
+	u64 val = FIELD_PREP(GICV5_IRS_VMAP_L2_VMTR_VM_ID, vm_id) |
+		GICV5_IRS_VMAP_L2_VMTR_M;
+
+	return vgic_v5_irs_write_vm_mmio_reg(val, GICV5_IRS_VMAP_L2_VMTR);
+}
+
+static int __vgic_v5_irs_vmap_vm(u16 vm_id, bool unmap)
+{
+	u64 val = FIELD_PREP(GICV5_IRS_VMAP_VMR_VM_ID, vm_id) |
+		FIELD_PREP(GICV5_IRS_VMAP_VMR_U, unmap) |
+		GICV5_IRS_VMAP_VMR_M;
+
+	return vgic_v5_irs_write_vm_mmio_reg(val, GICV5_IRS_VMAP_VMR);
+}
+
+static int vgic_v5_irs_set_vm_valid(u16 vm_id)
+{
+	return __vgic_v5_irs_vmap_vm(vm_id, false);
+}
+
+static int vgic_v5_irs_set_vm_invalid(u16 vm_id)
+{
+	return __vgic_v5_irs_vmap_vm(vm_id, true);
+}
+
+static int __vgic_v5_irs_update_vist_validity(u16 vm_id, bool spi_ist, bool unmap)
+{
+	u8 type = spi_ist ? 0b011 : 0b010;
+	u64 val = FIELD_PREP(GICV5_IRS_VMAP_VISTR_TYPE, type) |
+		FIELD_PREP(GICV5_IRS_VMAP_VISTR_VM_ID, vm_id) |
+		FIELD_PREP(GICV5_IRS_VMAP_VISTR_U, unmap) |
+		GICV5_IRS_VMAP_VISTR_M;
+
+	return vgic_v5_irs_write_vm_mmio_reg(val, GICV5_IRS_VMAP_VISTR);
+}
+
+static int vgic_v5_irs_set_vist_valid(u16 vm_id, bool spi_ist)
+{
+	return __vgic_v5_irs_update_vist_validity(vm_id, spi_ist, false);
+}
+
+/*
+ * LPI ISTs can be invalidated explicitly. SPI ISTs are invalidated by making
+ * the VMTE invalid during teardown.
+ */
+static int vgic_v5_irs_set_vist_invalid(u16 vm_id, bool spi_ist)
+{
+	return __vgic_v5_irs_update_vist_validity(vm_id, spi_ist, true);
+}
+
 static int vgic_v5_db_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
 {
+	struct vgic_v5_vm *vm = data->domain->host_data;
 	enum gicv5_vcpu_cmd *cmd = vcpu_info;
 
 	switch (*cmd) {
 	case VMT_L2_MAP:
+		return vgic_v5_irs_vmap_l2_vmt(vm->vm_id);
 	case VMTE_MAKE_VALID:
+		return vgic_v5_irs_set_vm_valid(vm->vm_id);
 	case VMTE_MAKE_INVALID:
+		return vgic_v5_irs_set_vm_invalid(vm->vm_id);
 	case SPI_VIST_MAKE_VALID:
+		return vgic_v5_irs_set_vist_valid(vm->vm_id, true);
 	case LPI_VIST_MAKE_VALID:
+		return vgic_v5_irs_set_vist_valid(vm->vm_id, false);
 	case LPI_VIST_MAKE_INVALID:
-		/* Not yet implemented */
+		return vgic_v5_irs_set_vist_invalid(vm->vm_id, false);
 	default:
 		return -EINVAL;
 	}
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 12/39] KVM: arm64: gic-v5: Keep GICv5 vCPU limit model-specific
From: Sascha Bischoff @ 2026-05-21 14:53 UTC (permalink / raw)
  To: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	kvm@vger.kernel.org
  Cc: nd, maz@kernel.org, oliver.upton@linux.dev, Joey Gouly,
	Suzuki Poulose, yuzenghui@huawei.com, peter.maydell@linaro.org,
	lpieralisi@kernel.org, Timothy Hayes
In-Reply-To: <20260521144846.1899475-1-sascha.bischoff@arm.com>

A GICv5 host with FEAT_GCIE_LEGACY can expose either a native vGICv5 or
a vGICv3 device. These models do not necessarily have the same vCPU
limit: the native GICv5 limit is probed from the IRS VPE capacity,
while the GICv3 limit remains the fixed KVM vGICv3 limit.

Keep the IRS-derived limit separately for vGICv5 creation. The
pre-VGIC KVM_CAP_MAX_VCPUS value continues to expose the largest limit
among the still-selectable models, and kvm_vgic_create() clamps the VM
to the limit of the VGIC model userspace actually selected.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
---
 arch/arm64/kvm/vgic/vgic-init.c | 14 +++++++++-----
 arch/arm64/kvm/vgic/vgic-v5.c   | 19 +++++++++----------
 include/kvm/arm_vgic.h          | 16 ++++++++++++----
 3 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 079a57c2b18f6..94632fd90b728 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -129,13 +129,17 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	}
 	ret = 0;
 
-	if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+	switch (type) {
+	case KVM_DEV_TYPE_ARM_VGIC_V2:
 		kvm->max_vcpus = VGIC_V2_MAX_CPUS;
-	else if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V3:
 		kvm->max_vcpus = VGIC_V3_MAX_CPUS;
-	else if (type == KVM_DEV_TYPE_ARM_VGIC_V5)
-		kvm->max_vcpus = min(VGIC_V5_MAX_CPUS,
-				     kvm_vgic_global_state.max_gic_vcpus);
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V5:
+		kvm->max_vcpus = kvm_vgic_global_state.max_gicv5_vcpus;
+		break;
+	}
 
 	if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) {
 		ret = -E2BIG;
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index f9578c2a634a4..909cef5f31afa 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -110,7 +110,8 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	int ret;
 
 	kvm_vgic_global_state.type = VGIC_V5;
-	kvm_vgic_global_state.max_gic_vcpus = VGIC_V5_MAX_CPUS;
+	kvm_vgic_global_state.max_gic_vcpus = 0;
+	kvm_vgic_global_state.max_gicv5_vcpus = 0;
 
 	kvm_vgic_global_state.vcpu_base = 0;
 	kvm_vgic_global_state.vctrl_base = NULL;
@@ -135,8 +136,8 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	 * Even if the HW supports more per-VM vCPUs, artificially cap as we
 	 * can't use them all.
 	 */
-	kvm_vgic_global_state.max_gic_vcpus = min(irs_caps.max_vpes,
-						  VGIC_V5_MAX_CPUS);
+	kvm_vgic_global_state.max_gicv5_vcpus = min(irs_caps.max_vpes,
+						    VGIC_V5_MAX_CPUS);
 
 	/*
 	 * GICv5 requires a set of tables to be allocated in order to manage
@@ -145,7 +146,7 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	 * we want to run. For now, we match the maximum number offered by the
 	 * hardware, but this might not be a wise choice in the long term.
 	 */
-	ret = vgic_v5_vmt_allocate(kvm_vgic_global_state.max_gic_vcpus);
+	ret = vgic_v5_vmt_allocate(kvm_vgic_global_state.max_gicv5_vcpus);
 	if (ret) {
 		kvm_err("Failed to allocate the GICv5 VM tables; no GICv5 support\n");
 		return -ENODEV;
@@ -166,9 +167,6 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 		return -ENODEV;
 	}
 
-	kvm_vgic_global_state.max_gic_vcpus = min(irs_caps.max_vpes,
-						  VGIC_V5_MAX_CPUS);
-
 	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V5);
 	if (ret) {
 		kvm_err("Cannot register GICv5 KVM device.\n");
@@ -178,6 +176,8 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	}
 
 	v5_registered = true;
+	kvm_vgic_global_state.max_gic_vcpus =
+		kvm_vgic_global_state.max_gicv5_vcpus;
 	kvm_info("GCIE system register CPU interface\n");
 
 skip_v5:
@@ -205,9 +205,8 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 		return v5_registered ? 0 : ret;
 	}
 
-	/* We potentially limit the max VCPUs further than we need to here */
-	kvm_vgic_global_state.max_gic_vcpus = min(VGIC_V3_MAX_CPUS,
-						  kvm_vgic_global_state.max_gic_vcpus);
+	kvm_vgic_global_state.max_gic_vcpus = max(kvm_vgic_global_state.max_gic_vcpus,
+						  VGIC_V3_MAX_CPUS);
 
 	static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
 	kvm_info("GCIE legacy system register CPU interface\n");
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ba32cd71fe0a7..6f736094a0e7e 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -157,9 +157,16 @@ struct vgic_global {
 	/* Maintenance IRQ number */
 	unsigned int		maint_irq;
 
-	/* maximum number of VCPUs allowed (GICv2 limits us to 8) */
+	/*
+	 * Maximum number of VCPUs exposed before userspace has selected a
+	 * VGIC model. Individual VGIC models can impose a lower limit
+	 * (GICv2 limits us to 8).
+	 */
 	int			max_gic_vcpus;
 
+	/* Maximum number of VCPUs allowed for a GICv5 VM. */
+	int			max_gicv5_vcpus;
+
 	/* Only needed for the legacy KVM_CREATE_IRQCHIP */
 	bool			can_emulate_gicv2;
 
@@ -635,10 +642,11 @@ void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1);
 
 /**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
+ * kvm_vgic_get_max_vcpus - Get the pre-VGIC-selection VCPU limit
  *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
+ * Userspace can query KVM_CAP_MAX_VCPUS before selecting a VGIC model, so
+ * expose the highest model-specific limit and let kvm_vgic_create() enforce
+ * the selected model's actual limit.
  */
 static inline int kvm_vgic_get_max_vcpus(void)
 {
-- 
2.34.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox