LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 3/7] ibmvfc: make ibmvfc login to fabric
From: Dave Marquardt via B4 Relay @ 2026-06-08 18:30 UTC (permalink / raw)
  To: James E.J. Bottomley, Martin K. Petersen, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Tyrel Datwyler
  Cc: linux-kernel, linux-scsi, linuxppc-dev, Brian King, Greg Joyce,
	Kyle Mahlkuch, Dave Marquardt
In-Reply-To: <20260608-ibmvfc-fpin-support-v2-0-d41f540fba5c@linux.ibm.com>

From: Dave Marquardt <davemarq@linux.ibm.com>

Add support for fabric login. The NPIV specification requires fabric
login in order to use the asynchronous sub-queue with its own
interrupt, which in turn is needed to support full and extended FPIN
messages.
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 94 ++++++++++++++++++++++++++++++++++++++++--
 drivers/scsi/ibmvscsi/ibmvfc.h | 16 +++++++
 2 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 88386d7c9106..a18861808325 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -5244,6 +5244,86 @@ static void ibmvfc_discover_targets(struct ibmvfc_host *vhost)
 		ibmvfc_link_down(vhost, IBMVFC_LINK_DEAD);
 }
 
+static void ibmvfc_fabric_login_done(struct ibmvfc_event *evt)
+{
+	struct ibmvfc_fabric_login *rsp = &evt->xfer_iu->fabric_login;
+	u32 mad_status = be16_to_cpu(rsp->common.status);
+	struct ibmvfc_host *vhost = evt->vhost;
+	int level = IBMVFC_DEFAULT_LOG_LEVEL;
+
+	ENTER;
+
+	switch (mad_status) {
+	case IBMVFC_MAD_SUCCESS:
+		fc_host_port_id(vhost->host) = be64_to_cpu(rsp->nport_id);
+		ibmvfc_free_event(evt);
+		break;
+
+	case IBMVFC_MAD_FAILED:
+		if (ibmvfc_retry_cmd(be16_to_cpu(rsp->status), be16_to_cpu(rsp->error)))
+			level += ibmvfc_retry_host_init(vhost);
+		else
+			ibmvfc_link_down(vhost, IBMVFC_LINK_DEAD);
+		ibmvfc_log(vhost, level, "Fabric Login failed: %s (%x:%x)\n",
+			   ibmvfc_get_cmd_error(be16_to_cpu(rsp->status), be16_to_cpu(rsp->error)),
+						be16_to_cpu(rsp->status), be16_to_cpu(rsp->error));
+		ibmvfc_free_event(evt);
+		LEAVE;
+		return;
+
+	case IBMVFC_MAD_CRQ_ERROR:
+		ibmvfc_retry_host_init(vhost);
+		fallthrough;
+
+	case IBMVFC_MAD_DRIVER_FAILED:
+		ibmvfc_free_event(evt);
+		LEAVE;
+		return;
+
+	default:
+		dev_err(vhost->dev, "Invalid fabric Login response: 0x%x\n", mad_status);
+		ibmvfc_link_down(vhost, IBMVFC_LINK_DEAD);
+		ibmvfc_free_event(evt);
+		LEAVE;
+		return;
+	}
+
+	ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_QUERY);
+	wake_up(&vhost->work_wait_q);
+
+	LEAVE;
+}
+
+static void ibmvfc_fabric_login(struct ibmvfc_host *vhost)
+{
+	struct ibmvfc_fabric_login *mad;
+	struct ibmvfc_event *evt = ibmvfc_get_reserved_event(&vhost->crq);
+	int level = IBMVFC_DEFAULT_LOG_LEVEL;
+
+	if (!evt) {
+		ibmvfc_log(vhost, level, "Fabric Login failed: no available events\n");
+		ibmvfc_hard_reset_host(vhost);
+		return;
+	}
+
+	ibmvfc_init_event(evt, ibmvfc_fabric_login_done, IBMVFC_MAD_FORMAT);
+	mad = &evt->iu.fabric_login;
+	memset(mad, 0, sizeof(*mad));
+	if (vhost->scsi_scrqs.protocol == IBMVFC_PROTO_SCSI)
+		mad->common.opcode = cpu_to_be32(IBMVFC_FABRIC_LOGIN);
+	else {
+		ibmvfc_log(vhost, level, "Fabric Login failed: unknown protocol\n");
+		return;
+	}
+	mad->common.version = cpu_to_be32(1);
+	mad->common.length = cpu_to_be16(sizeof(*mad));
+
+	ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_INIT_WAIT);
+
+	if (ibmvfc_send_event(evt, vhost, default_timeout))
+		ibmvfc_link_down(vhost, IBMVFC_LINK_DOWN);
+}
+
 static void ibmvfc_channel_setup_done(struct ibmvfc_event *evt)
 {
 	struct ibmvfc_host *vhost = evt->vhost;
@@ -5290,8 +5370,12 @@ static void ibmvfc_channel_setup_done(struct ibmvfc_event *evt)
 		return;
 	}
 
-	ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_QUERY);
-	wake_up(&vhost->work_wait_q);
+	if (ibmvfc_check_caps(vhost, IBMVFC_SUPPORT_SCSI)) {
+		ibmvfc_fabric_login(vhost);
+	} else {
+		ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_QUERY);
+		wake_up(&vhost->work_wait_q);
+	}
 }
 
 static void ibmvfc_channel_setup(struct ibmvfc_host *vhost)
@@ -5482,9 +5566,11 @@ static void ibmvfc_npiv_login_done(struct ibmvfc_event *evt)
 	vhost->host->can_queue = be32_to_cpu(rsp->max_cmds) - IBMVFC_NUM_INTERNAL_REQ;
 	vhost->host->max_sectors = npiv_max_sectors;
 
-	if (ibmvfc_check_caps(vhost, IBMVFC_CAN_SUPPORT_CHANNELS) && vhost->do_enquiry) {
+	if (ibmvfc_check_caps(vhost, IBMVFC_CAN_SUPPORT_CHANNELS) && vhost->do_enquiry)
 		ibmvfc_channel_enquiry(vhost);
-	} else {
+	else if (ibmvfc_check_caps(vhost, IBMVFC_SUPPORT_SCSI))
+		ibmvfc_fabric_login(vhost);
+	else {
 		vhost->do_enquiry = 0;
 		ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_QUERY);
 		wake_up(&vhost->work_wait_q);
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index dd26248cac3e..c996b36d335d 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -138,6 +138,7 @@ enum ibmvfc_mad_types {
 	IBMVFC_CHANNEL_ENQUIRY	= 0x1000,
 	IBMVFC_CHANNEL_SETUP	= 0x2000,
 	IBMVFC_CONNECTION_INFO	= 0x4000,
+	IBMVFC_FABRIC_LOGIN	= 0x8000,
 };
 
 struct ibmvfc_mad_common {
@@ -227,6 +228,7 @@ struct ibmvfc_npiv_login_resp {
 #define IBMVFC_MAD_VERSION_CAP		0x20
 #define IBMVFC_HANDLE_VF_WWPN		0x40
 #define IBMVFC_CAN_SUPPORT_CHANNELS	0x80
+#define IBMVFC_SUPPORT_SCSI		0x200
 #define IBMVFC_SUPPORT_NOOP_CMD		0x1000
 	__be32 max_cmds;
 	__be32 scsi_id_sz;
@@ -590,6 +592,19 @@ struct ibmvfc_connection_info {
 	__be64 reserved[16];
 } __packed __aligned(8);
 
+struct ibmvfc_fabric_login {
+	struct ibmvfc_mad_common common;
+	__be64 flags;
+#define IBMVFC_STRIP_MERGE	0x1
+#define IBMVFC_LINK_COMMANDS	0x2
+	__be64 capabilities;
+	__be64 nport_id;
+	__be16 status;
+	__be16 error;
+	__be32 pad;
+	__be64 reserved[16];
+} __packed __aligned(8);
+
 struct ibmvfc_trace_start_entry {
 	u32 xfer_len;
 } __packed;
@@ -715,6 +730,7 @@ union ibmvfc_iu {
 	struct ibmvfc_channel_enquiry channel_enquiry;
 	struct ibmvfc_channel_setup_mad channel_setup;
 	struct ibmvfc_connection_info connection_info;
+	struct ibmvfc_fabric_login fabric_login;
 } __packed __aligned(8);
 
 enum ibmvfc_target_action {

-- 
2.54.0




^ permalink raw reply related

* [PATCH v2 0/7] ibmvfc: make ibmvfc support FPIN messages
From: Dave Marquardt via B4 Relay @ 2026-06-08 18:30 UTC (permalink / raw)
  To: James E.J. Bottomley, Martin K. Petersen, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Tyrel Datwyler
  Cc: linux-kernel, linux-scsi, linuxppc-dev, Brian King, Greg Joyce,
	Kyle Mahlkuch, Dave Marquardt

This patch series adds FPIN (fabric performance impact notification)
support to the ibmvfc (IBM Virtual Fibre Channel) driver. This comes
in three flavors:

- basic, to recognize existing FPIN messages from the virtual I/O
  server (VIOS) (patch 1)
- full, supporting additional FPIN information and using its own
  asynchronous sub-queue and interrupt (patch 6)
- extended, supporting FC-LS-5 (patch 7)

Full and extended FPIN support requires a new asynchronous sub-queue
with its own interrupt. The asynchronous sub-queue support requires
ibmvfc to also support

- a new VFC_NOOP command, which the driver recognizes and
  ignores (patch 2)
- fabric login, to login separately to the fabric through messages
  exchanged with VIOS rather than doing fabric login through the
  existing NPIV login (patch 3)
- defining the asynchronous sub-queue CRQ (patch 4)
- allocating the asynchronous sub-queue (patch 5)
- registering and using the asynchronous sub-queue (patch 6)

All three modes convert an incoming FPIN message from VIOS to an FC
extended link service message, in some cases using default values for
information not provided by the VIOS FPIN message but expected in the
FC ELS message. This FC ELS message is passed to fc_host_fpin_rcv for
updating statistics and sending the information upstream by netlink
multicast, where it may be read by listeners including the DM
multipath daemon "multipathd."

Signed-off-by: Dave Marquardt <davemarq@linux.ibm.com>
---
Highlights of changes in v2:
- Refactored mostly common FPIN conversion routines and async event
  processing into single routines with wrappers for differences.
- Moved FPIN processing to a work queue to avoid conflicts with
  fc_host_fpin_rcv and memory allocation
- Set descriptor sizes correctly
- Use target WWPN for basic FPIN descriptor
- Split patch 4 into 3 patches, for definition, allocation, and use of
  the asynchronous sub-queue for events
- Link to v1: https://patch.msgid.link/20260408-ibmvfc-fpin-support-v1-0-52b06c464e03@linux.ibm.com

---
Dave Marquardt (7):
      ibmvfc: add basic FPIN support
      ibmvfc: Add NOOP command support
      ibmvfc: make ibmvfc login to fabric
      ibmvfc: define asynchronous sub-queue
      ibmvfc: allocate asynchronous sub-queue
      ibmvfc: register and use asynchronous sub-queue
      ibmvfc: handle extended FPIN events

 drivers/scsi/Kconfig                 |  10 +
 drivers/scsi/ibmvscsi/Makefile       |   1 +
 drivers/scsi/ibmvscsi/ibmvfc.c       | 702 ++++++++++++++++++++++++++++++++---
 drivers/scsi/ibmvscsi/ibmvfc.h       |  94 ++++-
 drivers/scsi/ibmvscsi/ibmvfc_kunit.c | 243 ++++++++++++
 5 files changed, 992 insertions(+), 58 deletions(-)
---
base-commit: 0600eec09ad6cc5ba3ca78aceb6fa8dcbad010bb
change-id: 20260407-ibmvfc-fpin-support-b9b575cd2da1

Best regards,
--  
Dave Marquardt <davemarq@linux.ibm.com>




^ permalink raw reply

* [PATCH v2 7/7] ibmvfc: handle extended FPIN events
From: Dave Marquardt via B4 Relay @ 2026-06-08 18:30 UTC (permalink / raw)
  To: James E.J. Bottomley, Martin K. Petersen, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Tyrel Datwyler
  Cc: linux-kernel, linux-scsi, linuxppc-dev, Brian King, Greg Joyce,
	Kyle Mahlkuch, Dave Marquardt
In-Reply-To: <20260608-ibmvfc-fpin-support-v2-0-d41f540fba5c@linux.ibm.com>

From: Dave Marquardt <davemarq@linux.ibm.com>

Add extended FPIN handling to ibmvfc driver. Tell VIOS ibmvfc can
handle extended FPIN messages, convert any received to struct fc_els
descriptors, and call fc_host_fpin_rcv to update statistics and send
netlink multicast messages to listeners such as multipathd.
---
 drivers/scsi/ibmvscsi/ibmvfc.c       |  41 +++++++++++-
 drivers/scsi/ibmvscsi/ibmvfc.h       |  31 +++++++++
 drivers/scsi/ibmvscsi/ibmvfc_kunit.c | 122 +++++++++++++++++++++++++++++++++--
 3 files changed, 186 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index a2252cd2f44b..b034a894e3ec 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1515,7 +1515,8 @@ static void ibmvfc_set_login_info(struct ibmvfc_host *vhost)
 	login_info->capabilities =
 		cpu_to_be64(IBMVFC_CAN_MIGRATE | IBMVFC_CAN_SEND_VF_WWPN |
 			    IBMVFC_CAN_USE_NOOP_CMD | IBMVFC_YES_SCSI |
-			    IBMVFC_USE_ASYNC_SUBQ | IBMVFC_CAN_HANDLE_FPIN);
+			    IBMVFC_USE_ASYNC_SUBQ | IBMVFC_CAN_HANDLE_FPIN |
+			    IBMVFC_CAN_HANDLE_FPIN_EXT);
 
 	if (vhost->mq_enabled || vhost->using_channels)
 		login_info->capabilities |= cpu_to_be64(IBMVFC_CAN_USE_CHANNELS);
@@ -3254,7 +3255,7 @@ ibmvfc_common_fpin_to_desc(u8 fpin_status, __be64 wwpn, __be16 type, __be16 modi
 	if (size == 0)
 		return NULL;
 
-	fpin = kzalloc(size, GFP_ATOMIC);
+	fpin = kzalloc(size, GFP_KERNEL);
 	if (fpin == NULL)
 		return NULL;
 
@@ -3371,6 +3372,28 @@ ibmvfc_full_fpin_to_desc(struct ibmvfc_async_subq *ibmvfc_fpin)
 					  cpu_to_be32(1));
 }
 
+/**
+ * ibmvfc_ext_fpin_to_desc(): allocate and populate a struct fc_els_fpin struct
+ * containing a descriptor.
+ * @ibmvfc_fpin: Pointer to async subq FPIN data
+ *
+ * Allocate a struct fc_els_fpin containing a descriptor and populate
+ * based on data from *ibmvfc_fpin.
+ *
+ * Return:
+ * NULL     - unable to allocate structure
+ * non-NULL - pointer to populated struct fc_els_fpin
+ */
+static struct fc_els_fpin *
+ibmvfc_ext_fpin_to_desc(struct ibmvfc_async_subq_fpin *ibmvfc_fpin)
+{
+	return ibmvfc_common_fpin_to_desc(ibmvfc_fpin->fpin_status, ibmvfc_fpin->wwpn,
+					  ibmvfc_fpin->fpin_data.event_type,
+					  ibmvfc_fpin->fpin_data.event_type_modifier,
+					  ibmvfc_fpin->fpin_data.event_threshold,
+					  ibmvfc_fpin->fpin_data.event_data.event_count);
+}
+
 /**
  * ibmvfc_process_async_work - Process IBMVFC_AE_FPIN async CRQ from work queue
  * @work: pointer to work_struct
@@ -3425,7 +3448,19 @@ static void ibmvfc_process_async_work(struct work_struct *work)
 			fpin = ibmvfc_basic_fpin_to_desc(crq, tgt->wwpn);
 		} else {
 			sqfpin = (struct ibmvfc_async_subq_fpin *)subq;
-			fpin = ibmvfc_full_fpin_to_desc(subq);
+			if ((subq->flags & IBMVFC_ASYNC_IS_FPIN_EXT) == 0) {
+				fpin = ibmvfc_full_fpin_to_desc(subq);
+			} else if (!(sqfpin->fpin_data.flags & IBMVFC_FPIN_EVENT_TYPE_VALID)) {
+				dev_err_ratelimited(vhost->dev,
+						    "Invalid extended FPIN event received");
+				fpin = NULL;
+			} else if (!ibmvfc_check_caps(vhost, IBMVFC_SUPPORT_FPIN_EXT)) {
+				dev_err_ratelimited(vhost->dev,
+						    "Unexpected extended FPIN event received");
+				fpin = NULL;
+			} else {
+				fpin = ibmvfc_ext_fpin_to_desc(sqfpin);
+			}
 		}
 		if (fpin) {
 			fc_host_fpin_rcv(tgt->vhost->host,
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 2e02acde0178..5c4cf4be4b67 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -184,6 +184,7 @@ struct ibmvfc_npiv_login {
 #define IBMVFC_YES_SCSI			0x40
 #define IBMVFC_USE_ASYNC_SUBQ		0x100
 #define IBMVFC_CAN_USE_NOOP_CMD		0x200
+#define IBMVFC_CAN_HANDLE_FPIN_EXT	0x800
 	__be64 node_name;
 	struct srp_direct_buf async;
 	u8 partition_name[IBMVFC_MAX_NAME];
@@ -233,6 +234,7 @@ struct ibmvfc_npiv_login_resp {
 #define IBMVFC_SUPPORT_SCSI		0x200
 #define IBMVFC_SUPPORT_ASYNC_SUBQ	0x800
 #define IBMVFC_SUPPORT_NOOP_CMD		0x1000
+#define IBMVFC_SUPPORT_FPIN_EXT		0x2000
 	__be32 max_cmds;
 	__be32 scsi_id_sz;
 	__be64 max_dma_len;
@@ -722,6 +724,7 @@ struct ibmvfc_async_work {
 struct ibmvfc_async_subq {
 	volatile u8 valid;
 #define IBMVFC_ASYNC_ID_IS_ASSOC_ID	0x01
+#define IBMVFC_ASYNC_IS_FPIN_EXT	0x02
 #define IBMVFC_FC_EEH			0x04
 #define IBMVFC_FC_FW_UPDATE		0x08
 #define IBMVFC_FC_FW_DUMP		0x10
@@ -738,6 +741,34 @@ struct ibmvfc_async_subq {
 	} id;
 } __packed __aligned(8);
 
+struct ibmvfc_fpin_data {
+#define IBMVFC_FPIN_EVENT_TYPE_VALID	0x01
+#define IBMVFC_FPIN_MODIFIER_VALID	0x02
+#define IBMVFC_FPIN_THRESHOLD_VALID	0x04
+#define IBMVFC_FPIN_SEVERITY_VALID	0x08
+#define IBMVFC_FPIN_EVENT_COUNT_VALID	0x10
+	u8 flags;
+	u8 reserved[3];
+	__be16 event_type;
+	__be16 event_type_modifier;
+	__be32 event_threshold;
+	union {
+		u8 severity;
+		__be32 event_count;
+	} event_data;
+} __packed __aligned(8);
+
+struct ibmvfc_async_subq_fpin {
+	volatile u8 valid;
+	u8 flags;
+	u8 link_state;
+	u8 fpin_status;
+	__be16 event;
+	__be16 pad;
+	volatile __be64 wwpn;
+	struct ibmvfc_fpin_data fpin_data;
+} __packed __aligned(8);
+
 union ibmvfc_iu {
 	struct ibmvfc_mad_common mad_common;
 	struct ibmvfc_npiv_login_mad npiv_login;
diff --git a/drivers/scsi/ibmvscsi/ibmvfc_kunit.c b/drivers/scsi/ibmvscsi/ibmvfc_kunit.c
index c8799eaf4927..2e6cbaaebdba 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc_kunit.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc_kunit.c
@@ -3,6 +3,7 @@
 #include <kunit/visibility.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_transport_fc.h>
+#include <scsi/fc/fc_els.h>
 #include <linux/list.h>
 #include <linux/delay.h>
 #include "ibmvfc.h"
@@ -58,10 +59,10 @@ static void ibmvfc_async_fpin_test(struct kunit *test)
 		crq[fs].wwpn = cpu_to_be64(tgt->wwpn);
 		crq[fs].node_name = cpu_to_be64(tgt->ids.node_name);
 		ibmvfc_handle_async(&crq[fs], vhost);
+		while (crq[fs].valid)
+			msleep(1U);
 	}
 
-	msleep(500U);
-
 	post[IBMVFC_AE_FPIN_LINK_CONGESTED] = READ_ONCE(fc_host->fpin_stats.cn_device_specific);
 	post[IBMVFC_AE_FPIN_PORT_CONGESTED] = READ_ONCE(tgt->rport->fpin_stats.cn);
 	post[IBMVFC_AE_FPIN_PORT_CLEARED] = READ_ONCE(tgt->rport->fpin_stats.cn_clear);
@@ -94,8 +95,8 @@ static void ibmvfc_async_fpin_test(struct kunit *test)
 	crq[0].wwpn = cpu_to_be64(tgt->wwpn);
 	crq[0].node_name = cpu_to_be64(tgt->ids.node_name);
 	ibmvfc_handle_async(&crq[0], vhost);
-
-	msleep(500U);
+	while (crq[0].valid)
+		msleep(1U);
 
 	post[IBMVFC_AE_FPIN_LINK_CONGESTED] = READ_ONCE(fc_host->fpin_stats.cn_device_specific);
 	post[IBMVFC_AE_FPIN_PORT_CONGESTED] = READ_ONCE(tgt->rport->fpin_stats.cn);
@@ -115,8 +116,119 @@ static void ibmvfc_async_fpin_test(struct kunit *test)
 			post[IBMVFC_AE_FPIN_CONGESTION_CLEARED]);
 }
 
+#define IBMVFC_TEST_FPIN_EXT(fs, ev, stat, crq) {		\
+	crq.valid = 0x80;					\
+	crq.flags = IBMVFC_ASYNC_IS_FPIN_EXT;			\
+	crq.link_state = IBMVFC_AE_LS_LINK_UP;			\
+	crq.fpin_status = (fs);					\
+	crq.event = cpu_to_be16(IBMVFC_AE_FPIN);		\
+	crq.wwpn = cpu_to_be64(tgt->wwpn);			\
+	crq.fpin_data.flags = IBMVFC_FPIN_EVENT_TYPE_VALID;	\
+	crq.fpin_data.event_type = cpu_to_be16((ev));		\
+	pre = READ_ONCE(tgt->rport->fpin_stats.stat);		\
+	ibmvfc_handle_asyncq((struct ibmvfc_crq *)&crq, vhost);	\
+	while (crq.valid)					\
+		msleep(1U);					\
+	post = READ_ONCE(tgt->rport->fpin_stats.stat);		\
+}
+
+/**
+ * ibmvfc_extended_fpin_test - unit test for extended FPIN events
+ * @test: pointer to kunit structure
+ *
+ * Tests
+ *
+ * Return: void
+ */
+static void ibmvfc_extended_fpin_test(struct kunit *test)
+{
+	enum ibmvfc_ae_fpin_status fs;
+	struct ibmvfc_async_subq_fpin crq[IBMVFC_AE_FPIN_CONGESTION_CLEARED+1];
+	struct ibmvfc_async_subq_fpin
+		crqcn[IBMVFC_AE_FPIN_PORT_CONGESTED][FPIN_CONGN_DEVICE_SPEC+1];
+	struct ibmvfc_async_subq_fpin crqportdg[FPIN_LI_DEVICE_SPEC+1];
+	struct ibmvfc_target *tgt;
+	struct ibmvfc_host *vhost;
+	struct list_head *headp;
+	LIST_HEAD(evt_doneq);
+	u64 pre, post;
+
+	headp = ibmvfc_get_headp();
+	KUNIT_ASSERT_FALSE_MSG(test, list_empty(headp), "No ibmvfc devices available\n");
+	vhost = list_first_entry(headp, struct ibmvfc_host, queue);
+	KUNIT_ASSERT_GE_MSG(test, vhost->num_targets, 1, "No targets");
+
+	tgt = list_first_entry(&vhost->targets, struct ibmvfc_target, queue);
+	KUNIT_ASSERT_NOT_NULL(test, tgt->rport);
+
+	for (fs = IBMVFC_AE_FPIN_LINK_CONGESTED; fs <= IBMVFC_AE_FPIN_CONGESTION_CLEARED; fs++) {
+		switch (fs) {
+		case IBMVFC_AE_FPIN_PORT_CLEARED:
+		case IBMVFC_AE_FPIN_CONGESTION_CLEARED:
+			crq[fs].valid = 0x80;
+			crq[fs].flags = IBMVFC_ASYNC_IS_FPIN_EXT;
+			crq[fs].link_state = IBMVFC_AE_LS_LINK_UP;
+			crq[fs].fpin_status = fs;
+			crq[fs].event = cpu_to_be16(IBMVFC_AE_FPIN);
+			crq[fs].wwpn = cpu_to_be64(tgt->wwpn);
+			crq[fs].fpin_data.flags = IBMVFC_FPIN_EVENT_TYPE_VALID;
+			crq[fs].fpin_data.event_type = cpu_to_be16(FPIN_CONGN_CLEAR);
+			pre = READ_ONCE(tgt->rport->fpin_stats.cn_clear);
+			ibmvfc_handle_asyncq((struct ibmvfc_crq *)&crq[fs], vhost);
+			while (crq[fs].valid)
+				msleep(1U);
+			post = READ_ONCE(tgt->rport->fpin_stats.cn_clear);
+			break;
+		case IBMVFC_AE_FPIN_LINK_CONGESTED:
+		case IBMVFC_AE_FPIN_PORT_CONGESTED:
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_CONGN_CLEAR, cn_clear,
+					     crqcn[fs-1][FPIN_CONGN_CLEAR]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_CONGN_LOST_CREDIT,
+					     cn_lost_credit,
+					     crqcn[fs-1][FPIN_CONGN_LOST_CREDIT]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_CONGN_CREDIT_STALL,
+					     cn_credit_stall,
+					     crqcn[fs-1][FPIN_CONGN_CREDIT_STALL]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_CONGN_OVERSUBSCRIPTION,
+					     cn_oversubscription,
+					     crqcn[fs-1][FPIN_CONGN_OVERSUBSCRIPTION]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_CONGN_DEVICE_SPEC,
+					     cn_device_specific,
+					     crqcn[fs-1][FPIN_CONGN_DEVICE_SPEC]);
+			break;
+		case IBMVFC_AE_FPIN_PORT_DEGRADED:
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_LI_UNKNOWN,
+					     li_failure_unknown,
+					     crqportdg[FPIN_LI_UNKNOWN]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_LI_LINK_FAILURE,
+					     li_link_failure_count,
+					     crqportdg[FPIN_LI_LINK_FAILURE]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_LI_LOSS_OF_SYNC,
+					     li_loss_of_sync_count,
+					     crqportdg[FPIN_LI_LOSS_OF_SYNC]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_LI_LOSS_OF_SIG,
+					     li_loss_of_signals_count,
+					     crqportdg[FPIN_LI_LOSS_OF_SIG]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_LI_PRIM_SEQ_ERR,
+					     li_prim_seq_err_count,
+					     crqportdg[FPIN_LI_PRIM_SEQ_ERR]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_LI_INVALID_TX_WD,
+					     li_invalid_tx_word_count,
+					     crqportdg[FPIN_LI_INVALID_TX_WD]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_LI_INVALID_CRC,
+					     li_invalid_crc_count,
+					     crqportdg[FPIN_LI_INVALID_CRC]);
+			IBMVFC_TEST_FPIN_EXT(fs, FPIN_LI_DEVICE_SPEC,
+					     li_device_specific,
+					     crqportdg[FPIN_LI_DEVICE_SPEC]);
+			break;
+		}
+	}
+}
+
 static struct kunit_case ibmvfc_fpin_test_cases[] = {
-	KUNIT_CASE_SLOW(ibmvfc_async_fpin_test),
+	KUNIT_CASE(ibmvfc_async_fpin_test),
+	KUNIT_CASE(ibmvfc_extended_fpin_test),
 	{},
 };
 

-- 
2.54.0




^ permalink raw reply related

* Re: [PATCH 35/60] kvm: Add VCPU plane-scheduling state and helpers
From: Paolo Bonzini @ 2026-06-08 17:58 UTC (permalink / raw)
  To: Jörg Rödel
  Cc: Sean Christopherson, Tom Lendacky, ashish.kalra, michael.roth,
	nsaenz, anelkz, James.Bottomley, Melody Wang, kvm, linux-kernel,
	kvmarm, loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86,
	coconut-svsm, joerg.roedel
In-Reply-To: <aib8n5lO6HKbLC4Y@8bytes.org>

On Mon, Jun 8, 2026 at 7:52 PM Jörg Rödel <joro@8bytes.org> wrote:
> On Mon, Jun 08, 2026 at 06:47:54PM +0200, Paolo Bonzini wrote:
> > On 6/8/26 16:42, Jörg Rödel wrote:
> > > The algorithm is to always run the lowest runnable plane. Plane
> > > switches are done by stopping the current plane and setting another
> > > runnable.
> >
> > This was left arbitrary in my version because for example Hyper-V VTLs use
> > highest-runnable instead.  It also made pure userspace scheduling possible,
> > though that may not be very important in the grand scheme of things.
>
> IIRC what Hyper-V does is always run the highest-privileged runnable level,
> no?  Maybe in their numbering level 0 has the least privileges?

Yes, exactly.

> Anyway, I am happy to make changes here, also based on input from the VSM side.

Related to this, let me know if you want me to pick up again the
common part, especially with Sashiko being hard at work on it.

> > Did you drop it because it didn't work, or just for simplicity?
>
> The user-space scheduling worked, my 6.17 planes implementation used it. But
> there are some problems with it going forward, because TDX Partitioning (and
> likely ARM CCA Planes as well) do not allow arbitrary switches forced by the
> hypervisor. All they allow is a forced switch to the highest privileged plane,
> the SVSM on SNP will force the same constraints by making lower-privilege VMSAs
> not-runnable when it executes.

The idea of the userspace scheduling was that you're not forced to use
it - the kernel can always choose to override it if it's using an
accelerated implementation of planes (and of plane switching). But it
also leaves some leeway to different accelerated implementations, each
of which can pick their own algorithm.

Conceptually I'd rather keep the possibility of userspace scheduling.
But maybe it doesn't add much.

Paolo

> So exposing an interface for user-space to choose which plane to run does seem
> to gain some weird, platform dependent semantics going forward. TDX and CCA
> also require in-kernel switching as they can switch planes without a VMEXIT, so
> I decided to have it from the start.
>
>
> -Joerg
>



^ permalink raw reply

* Re: [PATCH 35/60] kvm: Add VCPU plane-scheduling state and helpers
From: Jörg Rödel @ 2026-06-08 17:52 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Sean Christopherson, Tom Lendacky, ashish.kalra, michael.roth,
	nsaenz, anelkz, James.Bottomley, Melody Wang, kvm, linux-kernel,
	kvmarm, loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86,
	coconut-svsm, joerg.roedel
In-Reply-To: <e7eb0dba-790a-4644-9895-5c9a7420d7fe@redhat.com>

Hi Paolo,

On Mon, Jun 08, 2026 at 06:47:54PM +0200, Paolo Bonzini wrote:
> On 6/8/26 16:42, Jörg Rödel wrote:
> > From: Joerg Roedel <joerg.roedel@amd.com>
> > 
> > The algorithm is to always run the lowest runnable plane. Plane
> > switches are done by stopping the current plane and setting another
> > runnable.
> > 
> > Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
> 
> This was left arbitrary in my version because for example Hyper-V VTLs use
> highest-runnable instead.  It also made pure userspace scheduling possible,
> though that may not be very important in the grand scheme of things.

IIRC what Hyper-V does is always run the highest-privileged runnable level,
no?  Maybe in their numbering level 0 has the least privileges?  Anyway, I am
happy to make changes here, also based on input from the VSM side.

> Did you drop it because it didn't work, or just for simplicity?

The user-space scheduling worked, my 6.17 planes implementation used it. But
there are some problems with it going forward, because TDX Partitioning (and
likely ARM CCA Planes as well) do not allow arbitrary switches forced by the
hypervisor. All they allow is a forced switch to the highest privileged plane,
the SVSM on SNP will force the same constraints by making lower-privilege VMSAs
not-runnable when it executes.

So exposing an interface for user-space to choose which plane to run does seem
to gain some weird, platform dependent semantics going forward. TDX and CCA
also require in-kernel switching as they can switch planes without a VMEXIT, so
I decided to have it from the start.


-Joerg


^ permalink raw reply

* Re: [PATCH v2] powerpc/pseries/iommu: export DMA window data to user space
From: Gaurav Batra @ 2026-06-08 16:56 UTC (permalink / raw)
  To: Harsh Prateek Bora, maddy
  Cc: linuxppc-dev, sbhat, vaibhav, ritesh.list, Brian King
In-Reply-To: <bef91e40-8fdf-4f22-8dfa-6936eee3d671@linux.ibm.com>

Hello Harsh,

My response to your locking device_node suggestion is below inline. 
Please let me know if you don't agree with my reasoning.

Thanks

Gaurav

On 5/8/26 12:04 PM, Harsh Prateek Bora wrote:
> Hi Gaurav,
>
> On 07/05/26 11:36 pm, Gaurav Batra wrote:
>> Export PowerPC DMA window information (both default 2GB and Dynamic
>> larger window) to user space via sysfs. Each of these DMA windows has
>> attributes like size of the window, page size backing the window, mode,
>> etc. Each of these attributes is exported for user space consumption as a
>> file.
>>
>> PowerPC Host Bridge (PHB) can have multiple devices/functions sharing
>> the same DMA window. For each PHB, iommu registration creates an iommu
>> device under "/sys/devices/virtual/iommu".
>>
>> These devices will have 2 groups created to export Default and DDW
>> attributes.
>>
>> Reviewed-by: Brian King <brking@linux.ibm.com>
>> Reviewed-by: Vaibhav Jain <vaibhav@linux.ibm.com>
>> Reviewed-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
>
> I do not see R-b tags provided on the list after review comments.
> Not sure if I am missing the email or were these provided privately ?
> Sharing some review comments inline below ..
>
>> Signed-off-by: Gaurav Batra <gbatra@linux.ibm.com>
>> ---
>> V1 -> V2 change log:
>>
>> 1. Shiva: "weight" the it_map for the bitmap. This avoids using an extra
>>     counter in the table. Please look into how 
>> iommu_debugfs_weight_get()
>>     does this
>>
>>     Response: Incorporated changes
>>
>> 2. Vaibhav: If the DMA window is not available, show function should 
>> just
>>     return ENOENT so that userspace know the error instantly instead of
>>     having to parse the sysfs contents.
>>
>>     Response: Incorporated changes, returning ENODATA
>>
>> 3. Vaibhav: All the show functions have similar template. Please convert
>>     them to macros expansion to reduce code volume.
>>
>>     Response: Incorporated changes
>>
>> 4. Vaibhav: These new attributes are PSeries specific but they are being
>>     setup in ppc generic iommu code at arch/powerpc/kernel/iommu.c. Can
>>     you move these attributes to arch/powerpc/platforms/pseries/iommu.c
>>
>>     Response: I have split the attributes and moved them to pseries 
>> specific
>>     files. The original group "spapr-tce-iommu", is moved to PowerNV 
>> code
>>     base to retain the legacy functionality.
>>
>>     I tested the changes both on Pseries and PowerNV.
>>
>> 5. Vaibhav: It would be better to use function 
>> iommu_table_inuse_tces() as
>>     a callback in iommu_table_ops which can be implemented by pseries 
>> and
>>     powernv code differently.
>>
>>     Response: the function is no longer needed after changes in #1
>>
>> 6. Vaibhav: Since sysfs is ABI can you propose appropriate entries under
>>     Documentation/ABI/testing
>>
>>     Response: Added documentation
>>
>>   ...sfs-devices-virtual-iommu-dma_window_attrs |  21 ++
>>   .../arch/powerpc/dma_window_attributes.rst    |  65 +++++
>>   arch/powerpc/include/asm/pci-bridge.h         |   4 +
>>   arch/powerpc/kernel/iommu.c                   |  16 +-
>>   arch/powerpc/platforms/powernv/pci-ioda.c     |  16 ++
>>   arch/powerpc/platforms/pseries/iommu.c        | 261 ++++++++++++++++++
>>   arch/powerpc/platforms/pseries/pci_dlpar.c    |   2 +
>>   arch/powerpc/platforms/pseries/pseries.h      |   1 +
>>   arch/powerpc/platforms/pseries/setup.c        |   2 +
>>   9 files changed, 373 insertions(+), 15 deletions(-)
>>   create mode 100644 
>> Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
>>   create mode 100644 
>> Documentation/arch/powerpc/dma_window_attributes.rst
>>
>> diff --git 
>> a/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs 
>> b/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
>> new file mode 100644
>> index 000000000000..18ba63874276
>> --- /dev/null
>> +++ 
>> b/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
>> @@ -0,0 +1,21 @@
>> +What: /sys/devices/virtual/iommu/<iommu-isolation>/spapr-tce-ddw/*
>> +Date:       Oct 2025
>> +Contact:    linuxppc-dev@lists.ozlabs.org
>> +Description:    read only
>> +    For each IOMMU isolation unit spapr-tce-ddw sub-directory provides
>> +    attributes to query information related to the bigger Dynamic DMA
>> +    window (DDW) in the PowerPC virtualized platforms.
>> +
>> +    See Documentation/arch/powerpc/dma_window_attributes.rst for more
>> +    information.
>> +
>> +What: /sys/devices/virtual/iommu/<iommu-isolation>/spapr-tce-dma/*
>> +Date:       Oct 2025
>> +Contact:    linuxppc-dev@lists.ozlabs.org
>> +Description:    read only
>> +    For each IOMMU isolation unit spapr-tce-dma sub-directory provides
>> +    attributes to query information related to the default 2GB DMA
>> +    window in the PowerPC virtualized platforms.
>> +
>> +    See Documentation/arch/powerpc/dma_window_attributes.rst for more
>> +    information.
>> diff --git a/Documentation/arch/powerpc/dma_window_attributes.rst 
>> b/Documentation/arch/powerpc/dma_window_attributes.rst
>> new file mode 100644
>> index 000000000000..8bd9aec8539d
>> --- /dev/null
>> +++ b/Documentation/arch/powerpc/dma_window_attributes.rst
>> @@ -0,0 +1,65 @@
>> +.. SPDX-License-Identifier: GPL-2.0
>> +
>> +=====================
>> +DMA Window Attributes
>> +=====================
>> +
>> +In PowerPC architecture there are 2 types of DMA windows -
>> +
>> +1. Default 2GB DMA window which is backed by 4K page size
>> +2. A bigger Dynamic DMA Window (DDW) which is backed by larger page 
>> size
>> +   (64K or 2MB)
>> +
>> +A dedicated device will have both the DMA windows instantiated but 
>> an SR-IOV
>> +device will only have the bigger Dynamic DMA Window.
>> +
>> +The attributes of these 2 DMA windows are exported to user space via 
>> sysfs.
>> +Each IOMMU isolation unit will have its directory created under
>> +/sys/devices/virtual/iommu.
>> +
>> +As an exapmple, iommu-phb0001
>
> s/exapmple/example ?
>
>> +
>> +Under each IOMMU isolation unit, there will be a group of attributes 
>> for
>> +"Default 2GB DMA Window" and "Dynamic DMA Window" - spapr-tce-dma and
>> +spapr-tce-ddw respectively.
>> +
>> +Attributes under each group
>> +
>> +spapr-tce-ddw:
>> +direct_address  dynamic_address       dynamic_size  window_type
>> +direct_size     dynamic_pages_mapped  page_size
>> +
>> +spapr-tce-dma:
>> +dynamic_address  dynamic_pages_mapped  dynamic_size  page_size
>> +
>> +
>> +The bigger Dynamic DMA Window is configured into pre-mapped and/or 
>> dynamically
>> +allocated TCEs. If the DDW is in "Hybrid" mode, then both the Direct
>> +(pre-mapped) and Dynamic part of the DMA window will have valid 
>> values. Hybrid
>> +mode is valid only for SR-IOV devices.
>> +
>> +DMA Window properties:
>> +
>> +direct_address              Starting address of the pre-mapped DMA 
>> window
>> +direct_size                 Size of the pre-mapped DMA Window
>> +dynamic_address             Starting address of the dynamic allocations
>> +dynamic_size                Size of the dynamic allocation window
>> +dynamic_pages_mapped        Pages mapped for DMA by dynamic allocations
>> +page_size                   Page size backing the DMA window
>> +window_type                 Type of the DMA Window 
>> (Direct/Dynamic/Hybrid)
>> +
>> +
>> +An example of DDW attributes for an SR-IOV device::
>> +
>> +    $ cd /sys/devices/virtual/iommu/iommu-phb0001/spapr-tce-ddw
>> +
>> +    $ grep . *
>> +
>> +    direct_address:0x800000000000000   <-- Starting addr of 
>> pre-mapped Window
>> +    direct_size:137438953472           <-- Size of pre-mapped Window 
>> (128GB)
>> +    dynamic_address:0x800002000000000  <-- Starting addr of Dynamic 
>> allocations
>> +    dynamic_size:412316860416          <-- Size of dynamic 
>> allocation window (384GB)
>> +    dynamic_pages_mapped:270           <-- Pages mapped by dynamic 
>> allocations
>> +    page_size:2097152                  <-- DMA window page size (2MB)
>> +    window_type:Hybrid                 <-- window has both 
>> pre-mapped and
>> +                                           dynamic sections
>> diff --git a/arch/powerpc/include/asm/pci-bridge.h 
>> b/arch/powerpc/include/asm/pci-bridge.h
>> index 1dae53130782..9b09178aca5e 100644
>> --- a/arch/powerpc/include/asm/pci-bridge.h
>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>> @@ -124,6 +124,10 @@ struct pci_controller {
>>       resource_size_t dma_window_base_cur;
>>       resource_size_t dma_window_size;
>>   +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
>> +    const struct attribute_group **iommu_groups;
>> +#endif
>> +
>>   #ifdef CONFIG_PPC64
>>       unsigned long buid;
>>       struct pci_dn *pci_data;
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 0ce71310b7d9..d6242e3f77da 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -1269,24 +1269,10 @@ static const struct iommu_ops 
>> spapr_tce_iommu_ops = {
>>       .device_group = spapr_tce_iommu_device_group,
>>   };
>>   -static struct attribute *spapr_tce_iommu_attrs[] = {
>> -    NULL,
>> -};
>> -
>> -static struct attribute_group spapr_tce_iommu_group = {
>> -    .name = "spapr-tce-iommu",
>> -    .attrs = spapr_tce_iommu_attrs,
>> -};
>> -
>> -static const struct attribute_group *spapr_tce_iommu_groups[] = {
>> -    &spapr_tce_iommu_group,
>> -    NULL,
>> -};
>> -
>>   void ppc_iommu_register_device(struct pci_controller *phb)
>>   {
>>       iommu_device_sysfs_add(&phb->iommu, phb->parent,
>> -                spapr_tce_iommu_groups, "iommu-phb%04x",
>> +                phb->iommu_groups, "iommu-phb%04x",
>>                   phb->global_number);
>>       iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops,
>>                   phb->parent);
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
>> b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 1c78fdfb7b03..0887f154955e 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -2493,6 +2493,20 @@ static const struct pci_controller_ops 
>> pnv_npu_ocapi_ioda_controller_ops = {
>>       .shutdown        = pnv_pci_ioda_shutdown,
>>   };
>>   +static struct attribute *pnv_tce_iommu_attrs[] = {
>> +    NULL,
>> +};
>> +
>> +static struct attribute_group pnv_tce_iommu_group = {
>> +    .name = "spapr-tce-iommu",
>> +    .attrs = pnv_tce_iommu_attrs,
>> +};
>> +
>> +static const struct attribute_group *pnv_tce_iommu_groups[] = {
>> +    &pnv_tce_iommu_group,
>> +    NULL,
>> +};
>> +
>>   static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>>                        u64 hub_id, int ioda_type)
>>   {
>> @@ -2697,6 +2711,8 @@ static void __init pnv_pci_init_ioda_phb(struct 
>> device_node *np,
>>           hose->controller_ops = pnv_pci_ioda_controller_ops;
>>       }
>>   +    hose->iommu_groups = pnv_tce_iommu_groups;
>> +
>>       ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
>>     #ifdef CONFIG_PCI_IOV
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c 
>> b/arch/powerpc/platforms/pseries/iommu.c
>> index 5497b130e026..28be7a45761d 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -56,6 +56,20 @@ enum {
>>       DDW_EXT_LIMITED_ADDR_MODE = 3
>>   };
>>   +/* used by sysfs when querying Dynamic/Default DMA Window data */
>> +struct dma_win_data {
>> +    u32     page_size;
>> +    u64     direct_address;
>> +    u64     direct_size;
>> +    u64     dynamic_address;
>> +    u64     dynamic_size;
>> +    u32     dynamic_pages_mapped;
>> +    char    window_type[15];
>> +};
>> +
>> +#define SPAPR_SUCCESS        0
>> +#define SPAPR_ERROR            -1
>> +
>>   static struct iommu_table *iommu_pseries_alloc_table(int node)
>>   {
>>       struct iommu_table *tbl;
>> @@ -837,6 +851,253 @@ static struct device_node *pci_dma_find(struct 
>> device_node *dn,
>>       return rdn;
>>   }
>>   +/* Get DDW information for the device */
>> +static int gather_ddw_info(struct device *dev, struct dma_win_data 
>> *data)
>> +{
>> +    struct iommu_device *iommu;
>> +    struct pci_controller *phb;
>> +    struct device_node *dn;
>> +    struct pci_dn *pci;
>> +    const __be32 *prop = NULL;
>> +    bool ddw_direct = false;
>> +    bool found = false;
>> +    struct iommu_table *tbl;
>> +    u32 pgshift;
>> +    struct dynamic_dma_window_prop *p;
>> +
>> +    memset(data, 0, sizeof(*data));
>> +
>> +    iommu = dev_get_drvdata(dev);
>> +    phb = container_of(iommu, struct pci_controller, iommu);
>> +    dn = phb->dn;
>> +
>> +    if (!dn)
>> +        return SPAPR_ERROR;
>> +
>> +    pci = PCI_DN(dn);
>> +    if (!pci || !pci->table_group)
>> +        return SPAPR_ERROR;
>> +
>
Here is the sequence of events when a PHB is registered and an IOMMU
device is created:

1. first PHB device_node is created

2. The IOMMU device is created with the default DMA window. All the DMA
tables hang off the PHB device_node.

3. The IOMMU device is registered and its sysfs files/attributes are
created. This is where the patch creates its attributes as well.


Now, when we DLPAR remove a PHB, the sequence of events is:

1. delete the sysfs entries for the IOMMU device of the PHB.

2. delete the device_node of PHB.


So, while *_show() is executing, it is holding the kobject of the sysfs
attribute. If the PHB is DLPAR-removed from another thread in the
meantime, the DLPAR thread gets blocked while removing the sysfs
attribute, in device_del() --> device_remove_attrs().


As such, we are guaranteed that until the _show() call has completed,
the whole infrastructure stays intact - namely, the PHB device_node and
the DMA table_group.

I have tested this by putting the _show() handler into a long sleep and
executing a DLPAR remove of the PHB from another terminal.

> Should we also hold a dn ref with of_node_get(dn) before proceeding 
> with of_get_property calls ?
Not needed as explained above.
>
>> +    /* Find DDW */
>> +    prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
>> +    if (prop) {
>> +        ddw_direct = true;
>> +        found = true;
>> +    } else {
>> +        prop = of_get_property(dn, DMA64_PROPNAME, NULL);
>> +        if (prop)
>> +            found = true;
>> +    }
>> +
>> +    /* NO DDW */
>> +    if (!found)
>
> .. then release dn ref here if not found ..
not needed
>
>> +        return SPAPR_ERROR;
>> +
>> +    p = (struct dynamic_dma_window_prop *)prop;
>> +
>> +    pgshift = be32_to_cpu(p->tce_shift);
>> +    if (pgshift != 0xc && pgshift != 0x10 && pgshift != 0x15)
>
> Can we have macros for 0xc, 0x10 and 0x15 respectively ?
>
>> +        data->page_size = 0;
>> +    else
>> +        data->page_size = 1 << pgshift;
>> +
>> +    /* Check if DDW has table associated with it. Having a table 
>> associated with
>> +     * DDW is indicative that is has some dynamic TCE allocations. 
>> In this case the
>> +     * DDW can be fully Dynamic or in Hybrid mode. For SR-IOV DDW is 
>> on index 0,
>> +     * for dedicated adapter on index 1.
>> +     */
>> +    found = false;
>> +    for (int i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
>> +        tbl = pci->table_group->tables[i];
>
> Can another thread do a kfree(table_group) via 
> iommu_pseries_free_group() during hotplug remove before we reach here?
Not possible, as explained above: iommu_pseries_free_group() gets called
only when the PHB device_node is deleted.
>
>> +
>> +        if (tbl && tbl->it_index == be32_to_cpu(p->liobn)) {
>> +            found = true;
>> +            break;
>> +        }
>> +    }
>
> Is it possible that another thread changes bitmap before we reach
> bitmap_weight below ? If table is found, we may want to safely access
> its bitamp (consider using tbl->largepool.lock?).

Yes, another thread can change the bitmap before we reach here. But the
DMA attributes are exported via sysfs as a way to get a peek at the DMA
window properties at that moment. The count taken from the bitmap doesn't
have to be 100% accurate; it just indicates how many TCEs are mapped at
that moment.

>
>> +
>> +    /* set the parameters depnding on the DDW type */
>
> s/depnding/depending ?
>
>> +    if (ddw_direct && found) {          /* Hybrid */
>> +        data->direct_address = be64_to_cpu(p->dma_base);
>> +        data->dynamic_size = (u64)(tbl->it_size << tbl->it_page_shift);
>> +
>> +        data->dynamic_address = data->direct_address
>> +                                + (u64)(1UL << 
>> be32_to_cpu(p->window_shift))
>> +                                - data->dynamic_size;
>> +
>> +        data->direct_size = data->dynamic_address - 
>> data->direct_address;
>> +        data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, 
>> tbl->it_size);
>> +
>> +        sprintf(data->window_type, "%s", "Hybrid");
>
> Preferably use snprintf for safety. I see two more instances below.
>
>> +    } else if (ddw_direct && !found) {    /* Direct */
>> +        data->direct_address = be64_to_cpu(p->dma_base);
>> +        data->direct_size = (u64)(1UL << be32_to_cpu(p->window_shift));
>> +
>> +        sprintf(data->window_type, "%s", "Direct");
>> +    } else {                              /* Dynamic */
>> +        data->dynamic_address = be64_to_cpu(p->dma_base);
>> +        data->dynamic_size = (u64)(1UL << 
>> be32_to_cpu(p->window_shift));
>> +        data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, 
>> tbl->it_size);
>> +
>> +        sprintf(data->window_type, "%s", "Dynamic");
>> +    }
>> +
>
> .. release dn ref with of_node_put() before returning.
not needed as explained above.
>
> Similarly applicable for gather_dma_info() also.
>
>> +    return SPAPR_SUCCESS;
>> +}
>> +
>> +/* Get DDW information for the device */
>> +static int gather_dma_info(struct device *dev, struct dma_win_data 
>> *data)
>> +{
>> +    struct iommu_device *iommu;
>> +    struct pci_controller *phb;
>> +    struct device_node *dn;
>> +    struct pci_dn *pci;
>> +    const __be32 *prop = NULL;
>> +    struct iommu_table *tbl;
>> +    unsigned long offset, size, liobn;
>> +
>> +    memset(data, 0, sizeof(*data));
>> +
>> +    iommu = dev_get_drvdata(dev);
>> +    phb = container_of(iommu, struct pci_controller, iommu);
>> +    dn = phb->dn;
>> +
>> +    if (!dn)
>> +        return SPAPR_ERROR;
>> +
>> +    pci = PCI_DN(dn);
>> +    if (!pci || !pci->table_group)
>> +        return SPAPR_ERROR;
>> +
>> +    /* search for default DMA window */
>> +    prop = of_get_property(dn, "ibm,dma-window", NULL);
>> +
>> +    if (!prop)
>> +        return SPAPR_ERROR;
>> +
>> +    /* default DMA Window is always at index 0 */
>> +    tbl = pci->table_group->tables[0];
>> +    if (!tbl)
>> +        return SPAPR_ERROR;
>> +
>> +    of_parse_dma_window(dn, prop, &liobn, &offset, &size);
>> +
>> +    data->dynamic_address = offset;
>> +    data->dynamic_size = size;
>> +    data->page_size = 1ULL << IOMMU_PAGE_SHIFT_4K;
>> +    data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, 
>> tbl->it_size);
>> +
>> +    return SPAPR_SUCCESS;
>> +}
>> +
>> +#define DEVICE_SHOW_DDW(_name, _fmt) \
>> +ssize_t ddw_##_name##_show(struct device *dev,                    \
>> +                                  struct device_attribute *attr,\
>> +                                  char *buf) \
>> +{                                                                \
>> +    int rc = 0;                                                    \
>> +    struct dma_win_data data;                                    \
>> + \
>> +    rc = gather_ddw_info(dev, &data);                            \
>> + \
>> +    if (rc == SPAPR_SUCCESS) \
>> +        return sysfs_emit(buf, _fmt, data._name);                \
>> +    else \
>> +        return -ENODATA; \
>> +}                                                                \
>> +
>> +#define DEVICE_SHOW_DMA(_name, _fmt) \
>> +ssize_t dma_##_name##_show(struct device *dev,                    \
>> +                                  struct device_attribute *attr,\
>> +                                  char *buf) \
>> +{                                                                \
>> +    int rc = 0;                                                    \
>> +    struct dma_win_data data;                                    \
>> + \
>> +    rc = gather_dma_info(dev, &data);                            \
>> + \
>> +    if (rc == SPAPR_SUCCESS) \
>> +        return sysfs_emit(buf, _fmt, data._name);                \
>> +    else \
>> +        return -ENODATA; \
>> +}                                                                \
>> +
>> +static DEVICE_SHOW_DDW(direct_address, "%#llx\n");
>> +static DEVICE_SHOW_DDW(direct_size, "%lld\n");
>> +static DEVICE_SHOW_DDW(page_size, "%d\n");
>> +static DEVICE_SHOW_DDW(window_type, "%s\n");
>> +static DEVICE_SHOW_DDW(dynamic_address, "%#llx\n");
>> +static DEVICE_SHOW_DDW(dynamic_size, "%lld\n");
>> +static DEVICE_SHOW_DDW(dynamic_pages_mapped, "%d\n");
>> +static DEVICE_SHOW_DMA(dynamic_address, "%#llx\n");
>> +static DEVICE_SHOW_DMA(dynamic_size, "%lld\n");
>> +static DEVICE_SHOW_DMA(page_size, "%d\n");
>> +static DEVICE_SHOW_DMA(dynamic_pages_mapped, "%d\n");
>> +
>> +#define DEVICE_ATTR_DDW(_name)                              \
>> +        struct device_attribute dev_attr_ddw_##_name =      \
>> +            __ATTR(_name, 0444, ddw_##_name##_show, NULL)
>> +#define DEVICE_ATTR_DMA(_name)                              \
>> +        struct device_attribute dev_attr_dma_##_name =      \
>> +        __ATTR(_name, 0444, dma_##_name##_show, NULL)
>> +
>> +static DEVICE_ATTR_DDW(direct_address);
>> +static DEVICE_ATTR_DDW(direct_size);
>> +static DEVICE_ATTR_DDW(page_size);
>> +static DEVICE_ATTR_DDW(window_type);
>> +static DEVICE_ATTR_DDW(dynamic_address);
>> +static DEVICE_ATTR_DDW(dynamic_size);
>> +static DEVICE_ATTR_DDW(dynamic_pages_mapped);
>> +static DEVICE_ATTR_DMA(dynamic_address);
>> +static DEVICE_ATTR_DMA(dynamic_size);
>> +static DEVICE_ATTR_DMA(page_size);
>> +static DEVICE_ATTR_DMA(dynamic_pages_mapped);
>> +
>> +static struct attribute *spapr_tce_ddw_attrs[] = {
>> +    &dev_attr_ddw_direct_address.attr,
>> +    &dev_attr_ddw_direct_size.attr,
>> +    &dev_attr_ddw_page_size.attr,
>> +    &dev_attr_ddw_window_type.attr,
>> +    &dev_attr_ddw_dynamic_address.attr,
>> +    &dev_attr_ddw_dynamic_size.attr,
>> +    &dev_attr_ddw_dynamic_pages_mapped.attr,
>> +    NULL,
>> +};
>> +
>> +static struct attribute *spapr_tce_dma_attrs[] = {
>> +    &dev_attr_dma_dynamic_address.attr,
>> +    &dev_attr_dma_dynamic_size.attr,
>> +    &dev_attr_dma_page_size.attr,
>> +    &dev_attr_dma_dynamic_pages_mapped.attr,
>> +    NULL,
>> +};
>> +
>> +static struct attribute_group spapr_tce_ddw_group = {
>> +    .name = "spapr-tce-ddw",
>> +    .attrs = spapr_tce_ddw_attrs,
>> +};
>> +
>> +static struct attribute_group spapr_tce_dma_group = {
>> +    .name = "spapr-tce-dma",
>> +    .attrs = spapr_tce_dma_attrs,
>> +};
>> +
>> +static struct attribute *spapr_tce_iommu_attrs[] = {
>> +    NULL,
>> +};
>> +
>> +static struct attribute_group spapr_tce_iommu_group = {
>> +    .name = "spapr-tce-iommu",
>> +    .attrs = spapr_tce_iommu_attrs,
>> +};
>> +
>> +const struct attribute_group *spapr_tce_iommu_groups[] = {
>> +    &spapr_tce_iommu_group,
>> +    &spapr_tce_ddw_group,
>> +    &spapr_tce_dma_group,
>> +    NULL,
>> +};
>> +
>>   static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>>   {
>>       struct iommu_table *tbl;
>> diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c 
>> b/arch/powerpc/platforms/pseries/pci_dlpar.c
>> index 8c77ec7980de..b457451a2814 100644
>> --- a/arch/powerpc/platforms/pseries/pci_dlpar.c
>> +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
>> @@ -45,6 +45,8 @@ struct pci_controller *init_phb_dynamic(struct 
>> device_node *dn)
>>       pci_process_bridge_OF_ranges(phb, dn, 0);
>>       phb->controller_ops = pseries_pci_controller_ops;
>>   +    phb->iommu_groups = spapr_tce_iommu_groups;
>> +
>>       pci_devs_phb_init_dynamic(phb);
>>         pseries_msi_allocate_domains(phb);
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
>> b/arch/powerpc/platforms/pseries/pseries.h
>> index 3968a6970fa8..4cf0b7a4e96a 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -128,4 +128,5 @@ struct iommu_group 
>> *pSeries_pci_device_group(struct pci_controller *hose,
>>                            struct pci_dev *pdev);
>>   #endif
>>   +extern const struct attribute_group *spapr_tce_iommu_groups[];
>>   #endif /* _PSERIES_PSERIES_H */
>> diff --git a/arch/powerpc/platforms/pseries/setup.c 
>> b/arch/powerpc/platforms/pseries/setup.c
>> index 50b26ed8432d..4d877aae0560 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -512,6 +512,8 @@ static void __init pSeries_discover_phbs(void)
>>           isa_bridge_find_early(phb);
>>           phb->controller_ops = pseries_pci_controller_ops;
>>   +        phb->iommu_groups = spapr_tce_iommu_groups;
>> +
>>           /* create pci_dn's for DT nodes under this PHB */
>>           pci_devs_phb_init_dynamic(phb);
>>   base-commit: 192c0159402e6bfbe13de6f8379546943297783d
>


^ permalink raw reply

* Re: [PATCH 35/60] kvm: Add VCPU plane-scheduling state and helpers
From: Paolo Bonzini @ 2026-06-08 16:47 UTC (permalink / raw)
  To: Jörg Rödel, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-36-joro@8bytes.org>

On 6/8/26 16:42, Jörg Rödel wrote:
> From: Joerg Roedel <joerg.roedel@amd.com>
> 
> The algorithm is to always run the lowest runnable plane. Plane
> switches are done by stopping the current plane and setting another
> runnable.
> 
> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

This was left arbitrary in my version because for example Hyper-V VTLs 
use highest-runnable instead.  It also made pure userspace scheduling 
possible, though that may not be very important in the grand scheme of 
things.

Did you drop it because it didn't work, or just for simplicity?

Paolo

> ---
>   include/linux/kvm_host.h | 16 ++++++++++++++
>   virt/kvm/kvm_main.c      | 45 ++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 61 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5c3f9dfa15ea..e3611e6cc3e4 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -168,6 +168,7 @@ static inline bool kvm_is_error_gpa(gpa_t gpa)
>   #define KVM_REQ_VM_DEAD			(1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>   #define KVM_REQ_UNBLOCK			2
>   #define KVM_REQ_DIRTY_RING_SOFT_FULL	3
> +#define KVM_REQ_PLANE_RESCHED		4
>   #define KVM_REQUEST_ARCH_BASE		8
>   
>   /*
> @@ -324,6 +325,8 @@ struct kvm_mmio_fragment {
>   	unsigned int len;
>   };
>   
> +
> +
>   struct kvm_vcpu_common {
>   	struct kvm *kvm;
>   
> @@ -381,6 +384,8 @@ struct kvm_vcpu_common {
>   
>   	struct kvm_dirty_ring dirty_ring;
>   
> +	bool plane_switch;
> +
>   	struct kvm_vcpu_arch_common arch;
>   };
>   
> @@ -388,6 +393,12 @@ struct kvm_vcpu_common {
>   	for ((i) = 0; (i) < KVM_MAX_PLANES; ++(i))		\
>   		if (((v) = common->vcpus[(i)]) != NULL)
>   
> +/* Tracked per plane-VCPU - used for deciding which plane-vcpu to run */
> +enum kvm_vcpu_state {
> +	STOPPED,
> +	RUNNABLE,
> +};
> +
>   struct kvm_vcpu {
>   	struct kvm *kvm;
>   	struct kvm_plane *plane;
> @@ -401,6 +412,7 @@ struct kvm_vcpu {
>   	struct kvm_run *run;
>   
>   	u64 plane_requests;
> +	enum kvm_vcpu_state plane_state;
>   
>   	/* S390 only */
>   	bool valid_wakeup;
> @@ -440,6 +452,10 @@ struct kvm_vcpu {
>   	unsigned plane_level;
>   };
>   
> +void kvm_vcpu_set_plane_runnable(struct kvm_vcpu *vcpu);
> +void kvm_vcpu_set_plane_stopped(struct kvm_vcpu *vcpu);
> +struct kvm_vcpu *kvm_vcpu_select_plane(struct kvm_vcpu *vcpu);
> +
>   static inline bool kvm_vcpu_wants_to_run(struct kvm_vcpu *vcpu)
>   {
>   	return vcpu->common->wants_to_run;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 9d30fd85ce5f..a30123b77112 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4397,6 +4397,7 @@ static int kvm_plane_ioctl_create_vcpu(struct kvm_plane *plane, unsigned long id
>   	vcpu->vcpu_idx = vcpu->common->vcpu_idx;
>   	vcpu->plane = plane;
>   	vcpu->plane_level = plane->level;
> +	vcpu->plane_state = STOPPED;
>   	vcpu->run = vcpu->common->run;
>   
>   	kvm_vcpu_init(vcpu, kvm, id);
> @@ -4938,6 +4939,50 @@ static struct file_operations kvm_plane_fops = {
>   	KVM_COMPAT(kvm_plane_ioctl),
>   };
>   
> +void kvm_vcpu_set_plane_runnable(struct kvm_vcpu *vcpu)
> +{
> +	vcpu->plane_state = RUNNABLE;
> +	vcpu->common->plane_switch = true;
> +	kvm_make_request(KVM_REQ_PLANE_RESCHED, vcpu);
> +}
> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_set_plane_runnable);
> +
> +void kvm_vcpu_set_plane_stopped(struct kvm_vcpu *vcpu)
> +{
> +	vcpu->plane_state = STOPPED;
> +	kvm_make_request(KVM_REQ_PLANE_RESCHED, vcpu);
> +}
> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_set_plane_stopped);
> +
> +struct kvm_vcpu *kvm_vcpu_select_plane(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_vcpu_common *common = vcpu->common;
> +	struct kvm_vcpu *ret = NULL;
> +	unsigned i;
> +
> +	for (i = 0; i < KVM_MAX_PLANES; i++) {
> +		if (common->vcpus[i] == NULL)
> +			continue;
> +
> +		if (common->vcpus[i]->plane_state == RUNNABLE) {
> +			ret = common->vcpus[i];
> +			break;
> +		}
> +	}
> +
> +	if (ret == NULL) {
> +		ret = common->vcpus[0];
> +		ret->plane_state = RUNNABLE;
> +	}
> +
> +	common->current_vcpu = ret;
> +
> +	common->plane_switch = false;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_select_plane);
> +
>   static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
>   {
>   	struct kvm_device *dev = filp->private_data;



^ permalink raw reply

* [PATCH] powerpc: Move CONFIG_QE_GPIO to SoC
From: Christophe Leroy (CS GROUP) @ 2026-06-08 16:39 UTC (permalink / raw)
  To: Paul Louvel, Michael Ellerman, Nicholas Piggin,
	Madhavan Srinivasan
  Cc: Christophe Leroy (CS GROUP), linux-kernel, linuxppc-dev

Commit 7aa1aa6ecec2 ("QE: Move QE from arch/powerpc to drivers/soc")
moved QE into drivers/soc including gpio.c but left CONFIG_QE_GPIO
in powerpc's Kconfig.

Move it to SoC as well as it is the only place it is used:

  drivers/soc/fsl/qe/Makefile:obj-$(CONFIG_QE_GPIO)       += gpio.o qe_ports_ic.o
  include/soc/fsl/qe/qe.h:#ifdef CONFIG_QE_GPIO
  include/soc/fsl/qe/qe.h:#endif /* CONFIG_QE_GPIO */

Signed-off-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
---
Maddy, I'm wondering if you can take it for 7.2, so that I can then take a series in preparation by Paul Louvel for 7.3 without impact on the powerpc tree.

If it is too late, can you send me your Ack and I'll take it for 7.3 in the soc/fsl tree.
---
 arch/powerpc/platforms/Kconfig | 8 --------
 drivers/soc/fsl/qe/Kconfig     | 8 ++++++++
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index c4e61843d9d9..2f797ac6f1b3 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -228,14 +228,6 @@ config TAU_AVERAGE
 
 	  If in doubt, say N here.
 
-config QE_GPIO
-	bool "QE GPIO support"
-	depends on QUICC_ENGINE
-	select GPIOLIB
-	help
-	  Say Y here if you're going to use hardware that connects to the
-	  QE GPIOs.
-
 config CPM2
 	bool "Enable support for the CPM2 (Communications Processor Module)"
 	depends on (FSL_SOC_BOOKE && PPC32) || PPC_82xx
diff --git a/drivers/soc/fsl/qe/Kconfig b/drivers/soc/fsl/qe/Kconfig
index eb03f42ab978..b35a8fd30ebf 100644
--- a/drivers/soc/fsl/qe/Kconfig
+++ b/drivers/soc/fsl/qe/Kconfig
@@ -67,3 +67,11 @@ config QE_USB
 	default y if USB_FSL_QE
 	help
 	  QE USB Controller support
+
+config QE_GPIO
+	bool "QE GPIO support"
+	depends on QUICC_ENGINE
+	select GPIOLIB
+	help
+	  Say Y here if you're going to use hardware that connects to the
+	  QE GPIOs.
-- 
2.54.0



^ permalink raw reply related

* Re: [PATCH v16 00/10] arm64/riscv: Add support for crashkernel CMA reservation
From: Andrew Morton @ 2026-06-08 16:10 UTC (permalink / raw)
  To: Jinjie Ruan
  Cc: corbet, skhan, catalin.marinas, will, chenhuacai, kernel, maddy,
	mpe, npiggin, chleroy, pjw, palmer, aou, alex, tglx, mingo, bp,
	dave.hansen, hpa, robh, saravanak, bhe, rppt, pasha.tatashin,
	pratyush, ruirui.yang, rdunlap, peterz, feng.tang, dapeng1.mi,
	kees, elver, kuba, lirongqing, ebiggers, paulmck, leitao, coxu,
	Liam.Howlett, ryan.roberts, osandov, jbohac, cfsworks,
	tangyouling, sourabhjain, ritesh.list, adityag, liaoyuanhong,
	seanjc, fuqiang.wang, ardb, chenjiahao16, guoren, x86, linux-doc,
	linux-kernel, linux-arm-kernel, loongarch, linuxppc-dev,
	linux-riscv, devicetree, kexec
In-Reply-To: <20260608073459.3119290-1-ruanjinjie@huawei.com>

On Mon, 8 Jun 2026 15:34:49 +0800 Jinjie Ruan <ruanjinjie@huawei.com> wrote:

> The crash memory allocation, and the exclusion of crashk_res, crashk_low_res
> and crashk_cma memory are almost identical across different architectures.
> This patch set handles them in crash core in a general way, which eliminates
> a lot of duplicated code.
> 
> And add support for crashkernel CMA reservation for arm64 and riscv.

fyi, AI review might have found a bunch of issues in arch-specific
code, all of them pre-existing.

	https://sashiko.dev/#/patchset/20260608073459.3119290-1-ruanjinjie@huawei.com


^ permalink raw reply

* Re: [PATCH] powerpc: Export set_memory_encrypted and set_memory_decrypted
From: T.J. Mercier @ 2026-06-08 16:04 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Sumit Semwal, Jiri Pirko, Maxime Ripard, Christoph Hellwig, maddy,
	mpe, npiggin, chleroy, linuxppc-dev, lkp, linux-kernel, iommu,
	linux-mm, agordeev, gerald.schaefer, linux-s390, Dan Williams,
	Tom Lendacky, x86, Arnd Bergmann
In-Reply-To: <20260608152743.GD2764304@ziepe.ca>

On Mon, Jun 8, 2026 at 8:27 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Mon, Jun 08, 2026 at 08:47:15PM +0530, Sumit Semwal wrote:
> > Hi Jason,
> >
> > On Thu, 4 Jun 2026 at 19:27, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > On Thu, Jun 04, 2026 at 12:51:49PM +0530, Sumit Semwal wrote:
> > >
> > > > Given that Christoph's objection is not really about the modules part,
> > > > but that the set_memory_{encrypted,decrypted} should not be used here,
> > > > one option is to revert 78b30c50a7ac until that issue is sorted out?
> > >
> > > Please no, we have stuff already using this so it would be a
> > > functional regression. Revert making heaps into a module since that
> > > doesn't have a functional regression.
> >
> > Thanks for your comments.
> >
> > To me, it looks like while system and system_cc_shared heaps share a
> > lot of code, their user bases have different needs. It's apparent that
> > system_cc_heap users don't care about it being a module while system
> > heap users would very much like so.
> >
> > I also discussed this with Arnd, and he suggested we could rearrange
> > the code so that system_heap_cc_shared_priv depends on a new Kconfig
> > symbol like
> >
> > config DMABUF_HEAPS_CC_SYSTEM
> >         bool "DMA-BUF System Heap for memory encryption"
> >         depends on ARCH_HAS_MEM_ENCRYPT && DMABUF_HEAPS_SYSTEM=y
> >
> > This allows building both into the kernel or leave encryption choice
> > up to the consumers of the system heap.
> >
> > If this is agreeable to everyone, I can post Arnd's patch.
>
> Yeah, that's fine for me for now
>
> Jason

+1 SGTM

Thanks,
T.J.


^ permalink raw reply

* Re: [PATCH v2 1/3] ppc/pnv: Add null checks for OpenCapi PHBs
From: Bjorn Helgaas @ 2026-06-08 15:39 UTC (permalink / raw)
  To: Aditya Gupta
  Cc: linux-kernel, linuxppc-dev, Madhavan Srinivasan, Timothy Pearson,
	Bjorn Helgaas, Shawn Anastasio, sashiko-bot, linux-pci,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	stable
In-Reply-To: <20260527180816.2749186-2-adityag@linux.ibm.com>

On Wed, May 27, 2026 at 11:38:14PM +0530, Aditya Gupta wrote:
> For opencapi phb direct slots, the .pdev for php_slots will be NULL
> 
> Various sections of the code in pnv_php can do a null dereference and
> crash the kernel.
> 
> Originally, the issue was hit during boot:
> 
>     [    1.568588] PowerPC PowerNV PCI Hotplug Driver version: 0.1
>     [    1.569722] BUG: Kernel NULL pointer dereference at 0x00000074
>     [    1.569811] Faulting instruction address: 0xc000000000b75fd0
>     [    1.569890] Oops: Kernel access of bad area, sig: 11 [#1]
>     [    1.569963] LE PAGE_SIZE=64K MMU=Hash  SMP NR_CPUS=2048 NUMA PowerNV
>     ...
>     [    1.571492] NIP [c000000000b75fd0] pnv_php_get_adapter_state+0x60/0x154
>     [    1.571604] LR [c000000000b75fbc] pnv_php_get_adapter_state+0x4c/0x154
>     [    1.571690] Call Trace:
>     [    1.571725] [c000c0000688f990] [c000000000b75fbc] pnv_php_get_adapter_state+0x4c/0x154 (unreliable)
>     [    1.571783] [c000c0000688fa20] [c000000000b78bd0] pnv_php_enable+0x94/0x378
>     [    1.571951] [c000c0000688fac0] [c000000000b7912c] pnv_php_register_one.isra.0+0x11c/0x1e0

Drop timestamps since they don't add useful information.

Indent quoted material by two spaces to reduce wrapping.

Run "git log --oneline drivers/pci/hotplug/pnv_php.c" and "git log
--oneline drivers/pci/hotplug/" and match subject line style.

> This occurs for hotplug slots on root buses where bus->self == NULL,
> such as OpenCAPI PHB direct slots. An added debug print (not part of
> this patch) confirmed it was opencapi:

Style "OpenCAPI" and "PHB" consistently in commit log and subject.

>     [    1.617227] pnv_php: slot 'OPENCAPI-0009' has NULL pdev (bus 0009:00, parent=NO (root bus))
>     [    1.617308] pnv_php: slot 'OPENCAPI-0009' dn->full_name='pciex@603a000000000', compatible='ibm,power10-pau-opencapi-pciex'
> 
> This only required null check in 'pnv_php_get_adapter_state', which
> caused the kernel to boot.
> 
> Even with 'pnv_php_get_adapter_state' null check, there are more
> possible null dereferences pointed by sashiko, including cases where
> userspace crashes the kernel, such as:
> 
>     $ cat /sys/bus/pci/slots/*/attention
>     ...
>     [  557.036295] Kernel attempted to read user page (6e) - exploit attempt? (uid: 0)
>     [  557.036354] BUG: Kernel NULL pointer dereference on read at 0x0000006e
>     [  557.036383] Faulting instruction address: 0xc000000000a83334
>     [  557.036413] Oops: Kernel access of bad area, sig: 11 [#1]
>     [  557.036449] LE PAGE_SIZE=64K MMU=Hash  SMP NR_CPUS=2048 NUMA PowerNV
>     ...
>     [  557.037749] [c000000046707a20] [c000000046707b90] 0xc000000046707b90 (unreliable)
>     [  557.037795] [c000000046707a70] [0000000000000001] 0x1
>     [  557.037850] [c000000046707ab0] [c000000000acb00c] attention_read_file+0x54/0xa8
>     [  557.037910] [c000000046707b30] [c000000000abfbfc] pci_slot_attr_show+0x3c/0x58
>     [  557.037977] [c000000046707b50] [c0000000008181ec] sysfs_kf_seq_show+0xd4/0x204
>     [  557.038022] [c000000046707be0] [c000000000815004] kernfs_seq_show+0x44/0x58
> 
> Add null checks to prevent the null dereferences.
> 
> Cc: stable@vger.kernel.org
> Fixes: 80f9fc236279 ("PCI: pnv_php: Work around switches with broken presence detection")
> Signed-off-by: Aditya Gupta <adityag@linux.ibm.com>


^ permalink raw reply

* Re: [PATCH] powerpc: Export set_memory_encrypted and set_memory_decrypted
From: Jason Gunthorpe @ 2026-06-08 15:27 UTC (permalink / raw)
  To: Sumit Semwal, Jiri Pirko
  Cc: Maxime Ripard, Jiri Pirko, Christoph Hellwig, T.J. Mercier, maddy,
	mpe, npiggin, chleroy, linuxppc-dev, lkp, linux-kernel, iommu,
	linux-mm, agordeev, gerald.schaefer, linux-s390, Dan Williams,
	Tom Lendacky, x86, Arnd Bergmann
In-Reply-To: <CAO_48GH3NP09U6TdB5drbKY0TpwvtBXwrf=Jajsr5ttNbC_u9g@mail.gmail.com>

On Mon, Jun 08, 2026 at 08:47:15PM +0530, Sumit Semwal wrote:
> Hi Jason,
> 
> On Thu, 4 Jun 2026 at 19:27, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Thu, Jun 04, 2026 at 12:51:49PM +0530, Sumit Semwal wrote:
> >
> > > Given that Christoph's objection is not really about the modules part,
> > > but that the set_memory_{encrypted,decrypted} should not be used here,
> > > one option is to revert 78b30c50a7ac until that issue is sorted out?
> >
> > Please no, we have stuff already using this so it would be a
> > functional regression. Revert making heaps into a module since that
> > doesn't have a functional regression.
> 
> Thanks for your comments.
> 
> To me, it looks like while system and system_cc_shared heaps share a
> lot of code, their user bases have different needs. It's apparent that
> system_cc_heap users don't care about it being a module while system
> heap users would very much like so.
> 
> I also discussed this with Arnd, and he suggested we could rearrange
> the code so that system_heap_cc_shared_priv depends on a new Kconfig
> symbol like
> 
> config DMABUF_HEAPS_CC_SYSTEM
>         bool "DMA-BUF System Heap for memory encryption"
>         depends on ARCH_HAS_MEM_ENCRYPT && DMABUF_HEAPS_SYSTEM=y
> 
> This allows building both into the kernel or leave encryption choice
> up to the consumers of the system heap.
> 
> If this is agreeable to everyone, I can post Arnd's patch.

Yeah, that's fine for me for now

Jason


^ permalink raw reply

* Re: [RFC PATCH 0/4] perf: Add perf.data tracepoint events to trace.dat conversion
From: Ian Rogers @ 2026-06-08 15:18 UTC (permalink / raw)
  To: Tanushree Shah
  Cc: acme, jolsa, adrian.hunter, vmolnaro, mpetlan, tmricht, maddy,
	namhyung, linux-kernel, linux-perf-users, linuxppc-dev, atrajeev,
	hbathini, Tejas.Manhas1, Tanushree.Shah, Shivani.Nittor
In-Reply-To: <20260608125951.90425-2-tshah@linux.ibm.com>

On Mon, Jun 8, 2026 at 6:00 AM Tanushree Shah <tshah@linux.ibm.com> wrote:
>
> This RFC patch series introduces support for converting perf.data files
> containing tracepoint events into trace.dat format, enabling seamless
> visualization and analysis using KerneShark.

Thanks for doing this, this is a useful feature!

nit: typo KernelShark

>
> ======================
> Background and Motivation
> ======================
>
> Currently, perf and trace-cmd operate as separate tracing ecosystems with
> incompatible data formats. Users who collect tracepoint data with
> 'perf record' cannot easily visualize it in KernelShark's graphical
> timeline view or leverage trace-cmd's analysis capabilities.
>
> This creates workflow friction when users need to:
>
> - Visualize perf tracepoint data in KernelShark's interactive graphical
>   timeline
> - Share trace data between perf and trace-cmd workflows and toolchains
> - Perform architecture-independent conversion and analysis of traces
>
> This conversion bridge eliminates these barriers by enabling seamless
> data exchange between perf and trace-cmd ecosystems, allowing users to
> choose the best tool for each analysis phase.
>
> ======================
> Implementation Overview
> ======================
>
> The series implements the trace.dat file format specification (version 7)
> within perf's data conversion framework.
>
> **Patch 1/4: Core trace.dat Export Infrastructure**
> Introduces util/trace-dat.c and util/trace-dat.h implementing:
> - Per-CPU raw event buffer management (init, collect, free)
> - Ftrace ring buffer page construction
> - trace.dat section writers (strings, options, flyrecord sections)
>
> **Patch 2/4: Metadata Integration**
> Extends util/trace-event-read.c to write trace.dat metadata during
> perf.data parsing:
> - Initial format header (magic, version, endian, page size, compression)
> - Section 16: HEADER INFO (header_page + header_event)
> - Section 17: FTRACE EVENT FORMATS
> - Section 18: EVENT FORMATS (per system/event format files)
> - Section 19: KALLSYMS
> - Section 21: CMDLINES
> - Section 15: STRINGS (written last after all sections)
>
> **Patch 3/4: Conversion Backend**
> Implements util/data-convert-trace.c with trace_convert__perf2dat()
> function:
> - Processes PERF_TYPE_TRACEPOINT samples via process_sample_event()
> - Collects raw event data per-CPU using trace_dat__collect_cpu_event()
> - Writes OPTIONS sections (CPUCOUNT, TRACECLOCK, metadata offsets)
> - Writes FLYRECORD section with per-CPU ring buffer pages
>
> **Patch 4/4: User Interface**
> Extends tools/perf/builtin-data.c with --to-trace-dat option:
> - Adds command-line option for trace.dat output
> - Mutually exclusive with --to-ctf and --to-json
> - Calls trace_convert__perf2dat() to perform conversion
>
> ======================
> Current Implementation Details
> ======================
>
> **trace.dat Format Version:**
> The implementation currently targets trace.dat format version 7, which
> is the stable version supported by current trace-cmd releases (v3.x).
> This version is hardcoded to ensure compatibility with existing
> trace-cmd and KernelShark installations. Future enhancements could add
> version negotiation or support for newer format versions as they become
> standardized.
>
> **Compression Strategy:**
> Compression is explicitly disabled (set to NONE) in the generated
> trace.dat files.
> This design choice:
> - Simplifies the initial implementation and testing
> - Ensures maximum compatibility across trace-cmd versions
> - Avoids external compression library dependencies
>
> Future work could add support for various compression algorithms (zlib,
> zstd, lz4) with runtime selection via command-line options, significantly
> reducing file sizes for large traces.
>
> ======================
> Usage Example
> ======================
>
> ```bash
> *Record tracepoint events with perf*
> perf record -e sched:sched_switch -e sched:sched_wakeup -a sleep 10
>
> *Convert to trace.dat format*
> perf data convert --to-trace-dat=output.dat
>
> *Verify trace.dat structure*
> trace-cmd dump --summary output.dat
>
> *Analyze with trace-cmd*
> trace-cmd report output.dat
>
> *Visualize in KernelShark*
> kernelshark output.dat
> ```
>
> **Conversion Output:**
> ```
> [ perf data convert: Converted 'perf.data' into trace.dat format
> 'output.dat' ]
> [ perf data convert: Converted 2684 events ]
> ```
> **trace-cmd dump --summary Output:**
> ```
>  Tracing meta data in file output.dat:
>         [Initial format]
>                 7       [Version]
>                 0       [Little endian]
>                 8       [Bytes in a long]
>                 65536   [Page size, bytes]
>                 none    [Compression algorithm]
>                         [Compression version]
>         [buffer "", "local" clock, 65536 page size, 16 cpus, 1048576 bytes
>     flyrecord data]
>         [10 options]
>         [Saved command lines, 0 bytes]
>         [Kallsyms, 0 bytes]
>         [Ftrace format, 0 events]
>         [Header page, 206 bytes]
>         [Header event, 205 bytes]
>         [Events format, 1 systems]
>         [9 sections]
> ```
> ======================
> Testing and Verification
> ======================
>
> The series has been extensively tested with:
> - Various tracepoint events (sched, irq, syscalls, block I/O)
> - Mixed recordings containing both tracepoint and non-tracepoint events
>   (only tracepoints converted)
> - Verification with trace-cmd report and KernelShark visualization
> - Memory leak testing with Valgrind (0 bytes leaked)
> - Cross-architecture testing (x86_64, ppc64le)

It seems that some of this could be a test to give coverage of the
feature. We have similar tests for other convertors:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/shell/test_perf_data_converter_ctf.sh?h=perf-tools-next
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/shell/test_perf_data_converter_json.sh?h=perf-tools-next

I think Sashiko has caught some coding issues, so I'll hold off on a
full review until the churn from Sashiko subsides.

Thanks!
Ian

> All generated trace.dat files successfully open in:
> - trace-cmd report (v3.1+)
> - KernelShark (v2.0+)
>
> ======================
> Next Steps
> ======================
>
> We would highly appreciate reviews, comments, and feedback on:
> - The overall architectural approach and integration points
> - Compatibility considerations with trace-cmd ecosystem
> - Performance characteristics for large-scale traces
> - Additional use cases or workflow scenarios
> - Future enhancement priorities
>
> Tanushree Shah (4):
>   perf/trace-dat: Add trace.dat export infrastructure
>   perf/trace-event: Write trace.dat metadata sections during parsing
>   perf data-convert: Add perf.data to trace.dat conversion backend
>   perf data: Add --to-trace-dat option for converting perf.data
>     tracepoint events into trace.dat format
>
>  tools/perf/builtin-data.c            |  38 +-
>  tools/perf/util/Build                |   2 +
>  tools/perf/util/data-convert-trace.c | 152 ++++++
>  tools/perf/util/data-convert.h       |   4 +
>  tools/perf/util/trace-dat.c          | 705 +++++++++++++++++++++++++++
>  tools/perf/util/trace-dat.h          |  79 +++
>  tools/perf/util/trace-event-read.c   | 259 +++++++++-
>  7 files changed, 1230 insertions(+), 9 deletions(-)
>  create mode 100644 tools/perf/util/data-convert-trace.c
>  create mode 100644 tools/perf/util/trace-dat.c
>  create mode 100644 tools/perf/util/trace-dat.h
>
> --
> 2.53.0
>


^ permalink raw reply

* Re: [PATCH] powerpc: Export set_memory_encrypted and set_memory_decrypted
From: Sumit Semwal @ 2026-06-08 15:17 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Maxime Ripard, Jiri Pirko, Christoph Hellwig, T.J. Mercier, maddy,
	mpe, npiggin, chleroy, linuxppc-dev, lkp, linux-kernel, iommu,
	linux-mm, agordeev, gerald.schaefer, linux-s390, Dan Williams,
	Tom Lendacky, x86, Arnd Bergmann
In-Reply-To: <20260604135712.GV2487554@ziepe.ca>

Hi Jason,

On Thu, 4 Jun 2026 at 19:27, Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Jun 04, 2026 at 12:51:49PM +0530, Sumit Semwal wrote:
>
> > Given that Christoph's objection is not really about the modules part,
> > but that the set_memory_{encrypted,decrypted} should not be used here,
> > one option is to revert 78b30c50a7ac until that issue is sorted out?
>
> Please no, we have stuff already using this so it would be a
> functional regression. Revert making heaps into a module since that
> doesn't have a functional regression.

Thanks for your comments.

To me, it looks like while system and system_cc_shared heaps share a
lot of code, their user bases have different needs. It's apparent that
system_cc_heap users don't care about it being a module while system
heap users would very much like so.

I also discussed this with Arnd, and he suggested we could rearrange
the code so that system_heap_cc_shared_priv depends on a new Kconfig
symbol like

config DMABUF_HEAPS_CC_SYSTEM
        bool "DMA-BUF System Heap for memory encryption"
        depends on ARCH_HAS_MEM_ENCRYPT && DMABUF_HEAPS_SYSTEM=y

This allows building both into the kernel or leave encryption choice
up to the consumers of the system heap.

If this is agreeable to everyone, I can post Arnd's patch.

>
> Jason


Best,
Sumit.


^ permalink raw reply

* [PATCH 41/60] kvm: x86: Make apic_map per plane
From: Jörg Rödel @ 2026-06-08 14:42 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Paolo Bonzini <pbonzini@redhat.com>

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Co-developed-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/kvm_host.h | 24 +++++------
 arch/x86/kvm/i8254.c            |  2 +-
 arch/x86/kvm/lapic.c            | 71 +++++++++++++++++----------------
 arch/x86/kvm/x86.c              | 18 +++++++--
 4 files changed, 61 insertions(+), 54 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 134bc02962fd..11e52f8bb2c2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1418,16 +1418,17 @@ enum kvm_mmu_type {
 };
 
 /* Per-plane state of VM */
-struct kvm_arch_plane {};
+struct kvm_arch_plane {
+	atomic_t vapics_in_nmi_mode;
 
-static inline int kvm_arch_plane_init(struct kvm *kvm,
-				      struct kvm_plane *plane,
-				      unsigned plane_level)
-{
-	return 0;
-}
+	struct mutex apic_map_lock;
+	struct kvm_apic_map __rcu *apic_map;
+	atomic_t apic_map_dirty;
+};
 
-static inline void kvm_arch_plane_destroy(struct kvm_plane *plane) {}
+int kvm_arch_plane_init(struct kvm *kvm, struct kvm_plane *plane,
+			unsigned plane_level);
+void kvm_arch_plane_destroy(struct kvm_plane *plane);
 
 struct kvm_arch {
 	unsigned long n_used_mmu_pages;
@@ -1465,11 +1466,6 @@ struct kvm_arch {
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
 #endif
-	atomic_t vapics_in_nmi_mode;
-
-	struct mutex apic_map_lock;
-	struct kvm_apic_map __rcu *apic_map;
-	atomic_t apic_map_dirty;
 
 	bool apic_access_memslot_enabled;
 	bool apic_access_memslot_inhibited;
@@ -2458,7 +2454,7 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 
-int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+int kvm_pv_send_ipi(struct kvm_vcpu *kvm_vcpu, unsigned long ipi_bitmap_low,
 		    unsigned long ipi_bitmap_high, u32 min,
 		    unsigned long icr, int op_64_bit);
 
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 1982b0077ddd..bfe590378bd2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -260,7 +260,7 @@ static void pit_do_work(struct kthread_work *work)
 	 * VCPUs and only when LVT0 is in NMI mode.  The interrupt can
 	 * also be simultaneously delivered through PIC and IOAPIC.
 	 */
-	if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+	if (atomic_read(&kvm->planes[0]->arch.vapics_in_nmi_mode) > 0)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			kvm_apic_nmi_wd_deliver(vcpu);
 }
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1b763f164951..06a12b49fafa 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -403,36 +403,37 @@ enum {
 	DIRTY
 };
 
-static void kvm_recalculate_apic_map(struct kvm *kvm)
+static void kvm_recalculate_apic_map(struct kvm_plane *plane)
 {
 	struct kvm_apic_map *new, *old = NULL;
+	struct kvm *kvm = plane->kvm;
 	struct kvm_vcpu *vcpu;
 	unsigned long i;
 	u32 max_id = 255; /* enough space for any xAPIC ID */
 	bool xapic_id_mismatch;
 	int r;
 
-	/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map.  */
-	if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
+	/* Read plane->arch.apic_map_dirty before plane->arch.apic_map.  */
+	if (atomic_read_acquire(&plane->arch.apic_map_dirty) == CLEAN)
 		return;
 
-	WARN_ONCE(!irqchip_in_kernel(kvm),
+	WARN_ONCE(!irqchip_in_kernel(plane->kvm),
 		  "Dirty APIC map without an in-kernel local APIC");
 
-	mutex_lock(&kvm->arch.apic_map_lock);
+	mutex_lock(&plane->arch.apic_map_lock);
 
 retry:
 	/*
-	 * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
+	 * Read plane->arch.apic_map_dirty before plane->arch.apic_map (if clean)
 	 * or the APIC registers (if dirty).  Note, on retry the map may have
 	 * not yet been marked dirty by whatever task changed a vCPU's x2APIC
 	 * ID, i.e. the map may still show up as in-progress.  In that case
 	 * this task still needs to retry and complete its calculation.
 	 */
-	if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
+	if (atomic_cmpxchg_acquire(&plane->arch.apic_map_dirty,
 				   DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
 		/* Someone else has updated the map. */
-		mutex_unlock(&kvm->arch.apic_map_lock);
+		mutex_unlock(&plane->arch.apic_map_lock);
 		return;
 	}
 
@@ -445,7 +446,7 @@ static void kvm_recalculate_apic_map(struct kvm *kvm)
 	 */
 	xapic_id_mismatch = false;
 
-	kvm_for_each_vcpu(i, vcpu, kvm)
+	plane_for_each_vcpu(i, vcpu, plane)
 		if (kvm_apic_present(vcpu))
 			max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
 
@@ -459,7 +460,7 @@ static void kvm_recalculate_apic_map(struct kvm *kvm)
 	new->max_apic_id = max_id;
 	new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
 
-	kvm_for_each_vcpu(i, vcpu, kvm) {
+	plane_for_each_vcpu(i, vcpu, plane) {
 		if (!kvm_apic_present(vcpu))
 			continue;
 
@@ -498,16 +499,16 @@ static void kvm_recalculate_apic_map(struct kvm *kvm)
 	else
 		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
 
-	old = rcu_dereference_protected(kvm->arch.apic_map,
-			lockdep_is_held(&kvm->arch.apic_map_lock));
-	rcu_assign_pointer(kvm->arch.apic_map, new);
+	old = rcu_dereference_protected(plane->arch.apic_map,
+			lockdep_is_held(&plane->arch.apic_map_lock));
+	rcu_assign_pointer(plane->arch.apic_map, new);
 	/*
-	 * Write kvm->arch.apic_map before clearing apic->apic_map_dirty.
+	 * Write plane->arch.apic_map before clearing plane->arch.apic_map_dirty.
 	 * If another update has come in, leave it DIRTY.
 	 */
-	atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
+	atomic_cmpxchg_release(&plane->arch.apic_map_dirty,
 			       UPDATE_IN_PROGRESS, CLEAN);
-	mutex_unlock(&kvm->arch.apic_map_lock);
+	mutex_unlock(&plane->arch.apic_map_lock);
 
 	if (old)
 		kvfree_rcu(old, rcu);
@@ -528,7 +529,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 		else
 			static_branch_inc(&apic_sw_disabled.key);
 
-		atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+		atomic_set_release(&apic->vcpu->plane->arch.apic_map_dirty, DIRTY);
 	}
 
 	/* Check if there are APF page ready requests pending */
@@ -541,19 +542,19 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
 {
 	kvm_lapic_set_reg(apic, APIC_ID, id << 24);
-	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+	atomic_set_release(&apic->vcpu->plane->arch.apic_map_dirty, DIRTY);
 }
 
 static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
 {
 	kvm_lapic_set_reg(apic, APIC_LDR, id);
-	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+	atomic_set_release(&apic->vcpu->plane->arch.apic_map_dirty, DIRTY);
 }
 
 static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
 {
 	kvm_lapic_set_reg(apic, APIC_DFR, val);
-	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+	atomic_set_release(&apic->vcpu->plane->arch.apic_map_dirty, DIRTY);
 }
 
 static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
@@ -564,7 +565,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
 
 	kvm_lapic_set_reg(apic, APIC_ID, id);
 	kvm_lapic_set_reg(apic, APIC_LDR, ldr);
-	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+	atomic_set_release(&apic->vcpu->plane->arch.apic_map_dirty, DIRTY);
 }
 
 static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
@@ -860,7 +861,7 @@ static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
 	return count;
 }
 
-int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+int kvm_pv_send_ipi(struct kvm_vcpu *vcpu, unsigned long ipi_bitmap_low,
 		    unsigned long ipi_bitmap_high, u32 min,
 		    unsigned long icr, int op_64_bit)
 {
@@ -878,7 +879,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
 	irq.trig_mode = icr & APIC_INT_LEVELTRIG;
 
 	rcu_read_lock();
-	map = rcu_dereference(kvm->arch.apic_map);
+	map = rcu_dereference(vcpu->plane->arch.apic_map);
 
 	count = -EOPNOTSUPP;
 	if (likely(map)) {
@@ -1240,7 +1241,7 @@ static bool __kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *s
 	}
 
 	rcu_read_lock();
-	map = rcu_dereference(kvm->arch.apic_map);
+	map = rcu_dereference(kvm->planes[0]->arch.apic_map);
 
 	ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
 	if (ret) {
@@ -1290,7 +1291,7 @@ static bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm,
 		return false;
 
 	rcu_read_lock();
-	map = rcu_dereference(kvm->arch.apic_map);
+	map = rcu_dereference(kvm->planes[0]->arch.apic_map);
 
 	if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
 			hweight16(bitmap) == 1) {
@@ -1511,7 +1512,7 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
 	bool ret;
 
 	rcu_read_lock();
-	map = rcu_dereference(kvm->arch.apic_map);
+	map = rcu_dereference(kvm->planes[0]->arch.apic_map);
 
 	ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
 					  &bitmap);
@@ -2389,9 +2390,9 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
 	if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
 		apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
 		if (lvt0_in_nmi_mode) {
-			atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+			atomic_inc(&apic->vcpu->plane->arch.vapics_in_nmi_mode);
 		} else
-			atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+			atomic_dec(&apic->vcpu->plane->arch.vapics_in_nmi_mode);
 	}
 }
 
@@ -2551,7 +2552,7 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 	 * was toggled, the APIC ID changed, etc...   The maps are marked dirty
 	 * on relevant changes, i.e. this is a nop for most writes.
 	 */
-	kvm_recalculate_apic_map(apic->vcpu->kvm);
+	kvm_recalculate_apic_map(apic->vcpu->plane);
 
 	return ret;
 }
@@ -2767,7 +2768,7 @@ static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value)
 			kvm_make_request(KVM_REQ_APF_READY, vcpu);
 		} else {
 			static_branch_inc(&apic_hw_disabled.key);
-			atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+			atomic_set_release(&apic->vcpu->plane->arch.apic_map_dirty, DIRTY);
 		}
 	}
 
@@ -2814,7 +2815,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated)
 	}
 
 	__kvm_apic_set_base(vcpu, value);
-	kvm_recalculate_apic_map(vcpu->kvm);
+	kvm_recalculate_apic_map(vcpu->plane);
 	return 0;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_base);
@@ -2983,7 +2984,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vcpu->arch.apic_arb_prio = 0;
 	vcpu->arch.apic_attention = 0;
 
-	kvm_recalculate_apic_map(vcpu->kvm);
+	kvm_recalculate_apic_map(vcpu->plane);
 }
 
 /*
@@ -3271,13 +3272,13 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 
 	r = kvm_apic_state_fixup(vcpu, s, true);
 	if (r) {
-		kvm_recalculate_apic_map(vcpu->kvm);
+		kvm_recalculate_apic_map(vcpu->plane);
 		return r;
 	}
 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
 
-	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
-	kvm_recalculate_apic_map(vcpu->kvm);
+	atomic_set_release(&apic->vcpu->plane->arch.apic_map_dirty, DIRTY);
+	kvm_recalculate_apic_map(vcpu->plane);
 	kvm_apic_set_version(vcpu);
 
 	apic_update_ppr(apic);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a158740a6fc1..070f87ae23eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10441,7 +10441,7 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 		goto no_yield;
 
 	rcu_read_lock();
-	map = rcu_dereference(vcpu->kvm->arch.apic_map);
+	map = rcu_dereference(vcpu->plane->arch.apic_map);
 
 	if (likely(map) && dest_id <= map->max_apic_id) {
 		dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
@@ -10528,7 +10528,7 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
 			break;
 
-		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+		ret = kvm_pv_send_ipi(vcpu, a0, a1, a2, a3, op_64_bit);
 		break;
 	case KVM_HC_SCHED_YIELD:
 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
@@ -13397,6 +13397,18 @@ void kvm_arch_free_vm(struct kvm *kvm)
 	__kvm_arch_free_vm(kvm);
 }
 
+int kvm_arch_plane_init(struct kvm *kvm, struct kvm_plane *plane,
+			unsigned plane_level)
+{
+	mutex_init(&plane->arch.apic_map_lock);
+
+	return 0;
+}
+
+void kvm_arch_plane_destroy(struct kvm_plane *plane)
+{
+	kvfree(rcu_dereference_check(plane->arch.apic_map, 1));
+}
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
@@ -13429,7 +13441,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
-	mutex_init(&kvm->arch.apic_map_lock);
 	seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
 	ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10);
 	ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE);
@@ -13587,7 +13598,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_pic_destroy(kvm);
 	kvm_ioapic_destroy(kvm);
 #endif
-	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
-- 
2.53.0



^ permalink raw reply related

* [PATCH 13/60] kvm: Add read accessors for kvm_vcpu scheduling state
From: Jörg Rödel @ 2026-06-08 14:42 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Joerg Roedel <joerg.roedel@amd.com>

Introduce accessor functions for the scheduling state in struct
kvm_vcpu to make it easier to move these fields to struct
kvm_vcpu_common.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/arm64/kvm/arm.c           |  2 +-
 arch/arm64/kvm/nested.c        |  2 +-
 arch/loongarch/kvm/vcpu.c      |  5 +++--
 arch/mips/kvm/mips.c           |  2 +-
 arch/powerpc/kvm/powerpc.c     |  2 +-
 arch/riscv/kvm/vcpu.c          |  2 +-
 arch/s390/kvm/kvm-s390.c       |  2 +-
 arch/x86/kvm/svm/svm.c         |  2 +-
 arch/x86/kvm/vmx/posted_intr.c |  2 +-
 arch/x86/kvm/vmx/vmx.c         |  2 +-
 arch/x86/kvm/x86.c             | 12 ++++++------
 arch/x86/kvm/xen.h             |  2 +-
 include/linux/kvm_host.h       | 20 ++++++++++++++++++++
 virt/kvm/kvm_main.c            |  6 +++---
 14 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 9453321ef8c6..de00088c9a80 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1253,7 +1253,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 	vcpu_load(vcpu);
 
-	if (!vcpu->wants_to_run) {
+	if (!kvm_vcpu_wants_to_run(vcpu)) {
 		ret = -EINTR;
 		goto out;
 	}
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 6f7bc9a9992e..b84b1edb02d8 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -822,7 +822,7 @@ void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
 	 * scheduling out and not in WFI emulation, suggesting it is likely to
 	 * reuse the MMU sometime soon.
 	 */
-	if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI))
+	if (kvm_vcpu_scheduled_out(vcpu) && !vcpu_get_flag(vcpu, IN_WFI))
 		return;
 
 	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu))
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index e28084c49e68..bde8b68b8273 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -1847,7 +1847,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	int cpu, idx;
 	unsigned long flags;
 
-	if (vcpu->preempted && kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_PREEMPT)) {
+	if (kvm_vcpu_preempted(vcpu) &&
+	    kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_PREEMPT)) {
 		/*
 		 * Take the srcu lock as memslots will be accessed to check
 		 * the gfn cache generation against the memslots generation.
@@ -1887,7 +1888,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		break;
 	}
 
-	if (!vcpu->wants_to_run)
+	if (!kvm_vcpu_wants_to_run(vcpu))
 		return r;
 
 	/* Clear exit_reason */
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a53abbba43ea..f928ba105104 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -433,7 +433,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		vcpu->mmio_needed = 0;
 	}
 
-	if (!vcpu->wants_to_run)
+	if (!kvm_vcpu_wants_to_run(vcpu))
 		goto out;
 
 	lose_fpu(1);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 00302399fc37..800867c164c6 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1840,7 +1840,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 	kvm_sigset_activate(vcpu);
 
-	if (!vcpu->wants_to_run)
+	if (!kvm_vcpu_wants_to_run(vcpu))
 		r = -EINTR;
 	else
 		r = kvmppc_vcpu_run(vcpu);
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index a73690eda84b..8519a5bfbdc4 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -862,7 +862,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		return ret;
 	}
 
-	if (!vcpu->wants_to_run) {
+	if (!kvm_vcpu_wants_to_run(vcpu)) {
 		kvm_vcpu_srcu_read_unlock(vcpu);
 		return -EINTR;
 	}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ffb20a64d328..8401bcad1f37 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4954,7 +4954,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 	if (vcpu->kvm->arch.pv.dumping)
 		return -EINVAL;
 
-	if (!vcpu->wants_to_run)
+	if (!kvm_vcpu_wants_to_run(vcpu))
 		return -EINTR;
 
 	if (kvm_run->kvm_valid_regs & ~KVM_SYNC_S390_VALID_FIELDS ||
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 295e02c17b9b..1524c1bb4f37 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1475,7 +1475,7 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
 
 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+	if (kvm_vcpu_scheduled_out(vcpu) && !kvm_pause_in_guest(vcpu->kvm))
 		shrink_ple_window(vcpu);
 
 	if (kvm_vcpu_apicv_active(vcpu))
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 4a6d9a17da23..cba1e6346fc5 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -239,7 +239,7 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 	 * the cost of propagating PIR.IRR to PID.ON is negligible compared to
 	 * the cost of a spurious IRQ, and vCPU put/load is a slow path.
 	 */
-	if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
+	if (!kvm_vcpu_preempted(vcpu) && kvm_vcpu_is_blocking(vcpu) &&
 	    ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
 	     (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
 		pi_enable_wakeup_handler(vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a82a4197d18a..20262855bfe8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1552,7 +1552,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
  */
 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+	if (kvm_vcpu_scheduled_out(vcpu) && !kvm_pause_in_guest(vcpu->kvm))
 		shrink_ple_window(vcpu);
 
 	vmx_vcpu_load_vmcs(vcpu, cpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b6b628efa21..6355fe7f546f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5168,7 +5168,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	kvm_request_l1tf_flush_l1d();
 
-	if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
+	if (kvm_vcpu_scheduled_out(vcpu) && pmu->version && pmu->event_count) {
 		pmu->need_cleanup = true;
 		kvm_make_request(KVM_REQ_PMU, vcpu);
 	}
@@ -5293,7 +5293,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	int idx;
 
-	if (vcpu->preempted) {
+	if (kvm_vcpu_preempted(vcpu)) {
 		/*
 		 * Assume protected guests are in-kernel.  Inefficient yielding
 		 * due to false positives is preferable to never yielding due
@@ -10404,7 +10404,7 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 
 	rcu_read_unlock();
 
-	if (!target || !READ_ONCE(target->ready))
+	if (!target || !kvm_vcpu_ready(target))
 		goto no_yield;
 
 	/* Ignore requests to yield to self */
@@ -12041,7 +12041,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 	kvm_vcpu_srcu_read_lock(vcpu);
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
-		if (!vcpu->wants_to_run) {
+		if (!kvm_vcpu_wants_to_run(vcpu)) {
 			r = -EINTR;
 			goto out;
 		}
@@ -12120,7 +12120,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		WARN_ON_ONCE(vcpu->mmio_needed);
 	}
 
-	if (!vcpu->wants_to_run) {
+	if (!kvm_vcpu_wants_to_run(vcpu)) {
 		r = -EINTR;
 		goto out;
 	}
@@ -13021,7 +13021,7 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
 	 * only path that can trigger INIT emulation _and_ loads FPU state, and
 	 * KVM_RUN should _always_ load FPU state.
 	 */
-	WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
+	WARN_ON_ONCE(kvm_vcpu_wants_to_run(vcpu) != fpstate->in_use);
 	fpu_in_use = fpstate->in_use;
 	if (fpu_in_use)
 		kvm_put_guest_fpu(vcpu);
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 59e6128a7bd3..78793c1ac913 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -206,7 +206,7 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
 	 * behalf of the vCPU. Only if the VMM does actually block
 	 * does it need to enter RUNSTATE_blocked.
 	 */
-	if (WARN_ON_ONCE(!vcpu->preempted))
+	if (WARN_ON_ONCE(!kvm_vcpu_preempted(vcpu)))
 		return;
 
 	kvm_xen_update_runstate(vcpu, RUNSTATE_runnable);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 47144a83f9c5..b334c15d834e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -414,6 +414,26 @@ struct kvm_vcpu {
 	unsigned plane_level;
 };
 
+static inline bool kvm_vcpu_wants_to_run(struct kvm_vcpu *vcpu)
+{
+	return vcpu->wants_to_run;
+}
+
+static inline bool kvm_vcpu_preempted(struct kvm_vcpu *vcpu)
+{
+	return READ_ONCE(vcpu->preempted);
+}
+
+static inline bool kvm_vcpu_ready(struct kvm_vcpu *vcpu)
+{
+	return READ_ONCE(vcpu->ready);
+}
+
+static inline bool kvm_vcpu_scheduled_out(struct kvm_vcpu *vcpu)
+{
+	return vcpu->scheduled_out;
+}
+
 /*
  * Start accounting time towards a guest.
  * Must be called before entering guest context.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 14e74cdc4709..2c16e124a507 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4132,7 +4132,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 			continue;
 
 		vcpu = xa_load(&kvm->planes[0]->vcpu_array, idx);
-		if (!READ_ONCE(vcpu->ready))
+		if (!kvm_vcpu_ready(vcpu))
 			continue;
 		if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
 			continue;
@@ -4143,7 +4143,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 		 * waiting on IPI delivery, i.e. the target vCPU is in-kernel
 		 * for the purposes of directed yield.
 		 */
-		if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
+		if (kvm_vcpu_preempted(vcpu) && yield_to_kernel_mode &&
 		    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
 		    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
 			continue;
@@ -6513,7 +6513,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 
 	WRITE_ONCE(vcpu->scheduled_out, true);
 
-	if (task_is_runnable(current) && vcpu->wants_to_run) {
+	if (task_is_runnable(current) && kvm_vcpu_wants_to_run(vcpu)) {
 		WRITE_ONCE(vcpu->preempted, true);
 		WRITE_ONCE(vcpu->ready, true);
 	}
-- 
2.53.0



^ permalink raw reply related

* [PATCH 04/60] KVM: SVM: Inject NMIs when Restricted Injection is active
From: Jörg Rödel @ 2026-06-08 14:41 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Melody Wang <huibo.wang@amd.com>

When Restricted Injection is active, only #HV exceptions can be injected
into the SEV-SNP guest.

Detect that, and then follow the #HV doorbell communication from the GHCB
specification to inject NMIs.

Co-developed-by: Thomas Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Melody Wang <huibo.wang@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kvm/svm/sev.c | 19 ++++++++++++++++---
 arch/x86/kvm/svm/svm.c |  8 ++++++++
 arch/x86/kvm/svm/svm.h |  1 +
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f2f40f81ba86..b48745fad8c5 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -5428,7 +5428,10 @@ static void __sev_snp_inject(enum inject_type type, struct kvm_vcpu *vcpu)
 		return;
 	}
 
-	hvdb->events.vector = vcpu->arch.interrupt.nr;
+	if (type == INJECT_NMI)
+		hvdb->events.nmi = 1;
+	else
+		hvdb->events.vector = vcpu->arch.interrupt.nr;
 
 	prepare_hv_injection(svm, hvdb);
 
@@ -5508,10 +5511,17 @@ void sev_snp_cancel_injection(struct kvm_vcpu *vcpu)
 	/* Copy info back into event_inj field (replaces #HV) */
 	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID;
 
+	/*
+	 * KVM only injects a single event each time (prepare_hv_injection),
+	 * so when events.nmi is true, the vector will be zero
+	 */
 	if (hvdb->events.vector)
 		svm->vmcb->control.event_inj |= hvdb->events.vector |
 						SVM_EVTINJ_TYPE_INTR;
 
+	if (hvdb->events.nmi)
+		svm->vmcb->control.event_inj |= SVM_EVTINJ_TYPE_NMI;
+
 	hvdb->events.pending_events = 0;
 
 out:
@@ -5537,8 +5547,11 @@ bool sev_snp_blocked(enum inject_type type, struct kvm_vcpu *vcpu)
 	if (!hvdb)
 		return true;
 
-	/* Indicate interrupts blocked based on guest acknowledgment */
-	blocked = !!hvdb->events.vector;
+	/* Indicate NMIs and interrupts blocked based on guest acknowledgment */
+	if (type == INJECT_NMI)
+		blocked = hvdb->events.nmi;
+	else
+		blocked = !!hvdb->events.vector;
 
 	unmap_hvdb(vcpu, &hvdb_map);
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7253936c460c..5255393986cc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3738,6 +3738,9 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (sev_snp_inject(INJECT_NMI, vcpu))
+		goto status;
+
 	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
 
 	if (svm->nmi_l1_to_l2)
@@ -3752,6 +3755,8 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 		svm->nmi_masked = true;
 		svm_set_iret_intercept(svm);
 	}
+
+status:
 	++vcpu->stat.nmi_injections;
 }
 
@@ -3968,6 +3973,9 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
 	if (!gif_set(svm))
 		return true;
 
+	if (sev_snp_is_rinj_active(vcpu))
+		return sev_snp_blocked(INJECT_NMI, vcpu);
+
 	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
 		return false;
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a22ad5de03ea..bb0e5bfdb9a6 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -57,6 +57,7 @@ extern struct kvm_x86_ops svm_x86_ops __initdata;
 
 enum inject_type {
 	INJECT_IRQ,
+	INJECT_NMI,
 };
 
 /*
-- 
2.53.0



^ permalink raw reply related

* [PATCH 37/60] kvm: Pass plane_level to kvm_set_routing_entry()
From: Jörg Rödel @ 2026-06-08 14:42 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Joerg Roedel <joerg.roedel@amd.com>

The plane_level is used to route MSI IRQs to the correct plane.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/arm64/kvm/vgic/vgic-irqfd.c | 6 ++++--
 arch/loongarch/kvm/irqfd.c       | 6 ++++--
 arch/powerpc/kvm/mpic.c          | 5 +++--
 arch/riscv/kvm/vm.c              | 5 +++--
 arch/s390/kvm/interrupt.c        | 3 ++-
 arch/x86/kvm/irq.c               | 7 ++++---
 include/linux/kvm_host.h         | 3 ++-
 virt/kvm/irqchip.c               | 2 +-
 8 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c
index 479b896c8954..53e5fcc591d7 100644
--- a/arch/arm64/kvm/vgic/vgic-irqfd.c
+++ b/arch/arm64/kvm/vgic/vgic-irqfd.c
@@ -33,11 +33,13 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
  * @kvm: the VM this entry is applied to
  * @e: kvm kernel routing entry handle
  * @ue: user api routing entry handle
+ * @plane_level: target plane level
  * return 0 on success, -EINVAL on errors.
  */
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
-			  const struct kvm_irq_routing_entry *ue)
+			  const struct kvm_irq_routing_entry *ue,
+			  unsigned plane_level)
 {
 	int r = -EINVAL;
 
@@ -57,7 +59,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->msi.data = ue->u.msi.data;
 		e->msi.flags = ue->flags;
 		e->msi.devid = ue->u.msi.devid;
-		e->msi.plane_level = 0;
+		e->msi.plane_level = plane_level;
 		break;
 	default:
 		goto out;
diff --git a/arch/loongarch/kvm/irqfd.c b/arch/loongarch/kvm/irqfd.c
index 50f0c32df46c..a36a8a9d8a66 100644
--- a/arch/loongarch/kvm/irqfd.c
+++ b/arch/loongarch/kvm/irqfd.c
@@ -39,11 +39,13 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
  * @kvm: the VM this entry is applied to
  * @e: kvm kernel routing entry handle
  * @ue: user api routing entry handle
+ * @plane_level: target plane level
  * return 0 on success, -EINVAL on errors.
  */
 int kvm_set_routing_entry(struct kvm *kvm,
 			struct kvm_kernel_irq_routing_entry *e,
-			const struct kvm_irq_routing_entry *ue)
+			const struct kvm_irq_routing_entry *ue,
+			unsigned plane_level)
 {
 	switch (ue->type) {
 	case KVM_IRQ_ROUTING_IRQCHIP:
@@ -60,7 +62,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->msi.address_lo = ue->u.msi.address_lo;
 		e->msi.address_hi = ue->u.msi.address_hi;
 		e->msi.data = ue->u.msi.data;
-		e->msi.plane_level = 0;
+		e->msi.plane_level = plane_level;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 0f568f5fff8b..6b6eba7fbf75 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -1824,7 +1824,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
-			  const struct kvm_irq_routing_entry *ue)
+			  const struct kvm_irq_routing_entry *ue,
+			  unsigned plane_level)
 {
 	int r = -EINVAL;
 
@@ -1841,7 +1842,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->msi.address_lo = ue->u.msi.address_lo;
 		e->msi.address_hi = ue->u.msi.address_hi;
 		e->msi.data = ue->u.msi.data;
-		e->msi.plane_level = 0;
+		e->msi.plane_level = plane_level;
 		break;
 	default:
 		goto out;
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index f518247e699b..6b3c8a0e74e2 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -118,7 +118,8 @@ bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
 
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
-			  const struct kvm_irq_routing_entry *ue)
+			  const struct kvm_irq_routing_entry *ue,
+			  unsigned plane_level)
 {
 	int r = -EINVAL;
 
@@ -138,7 +139,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->msi.data = ue->u.msi.data;
 		e->msi.flags = ue->flags;
 		e->msi.devid = ue->u.msi.devid;
-		e->msi.plane_level = 0;
+		e->msi.plane_level = plane_level;
 		break;
 	default:
 		goto out;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 1d66ef9f7527..dbd6029773aa 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2862,7 +2862,8 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
 
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
-			  const struct kvm_irq_routing_entry *ue)
+			  const struct kvm_irq_routing_entry *ue,
+			  unsigned plane_level)
 {
 	const struct kvm_irq_routing_s390_adapter *adapter;
 	u64 uaddr_s, uaddr_i;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index b7e08eddb765..d2ecfd54d57a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -295,7 +295,8 @@ bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
 
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
-			  const struct kvm_irq_routing_entry *ue)
+			  const struct kvm_irq_routing_entry *ue,
+			  unsigned plane_level)
 {
 	/* We can't check irqchip_in_kernel() here as some callers are
 	 * currently initializing the irqchip. Other callers should therefore
@@ -304,7 +305,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 	switch (ue->type) {
 #ifdef CONFIG_KVM_IOAPIC
 	case KVM_IRQ_ROUTING_IRQCHIP:
-		if (irqchip_split(kvm))
+		if (irqchip_split(kvm) || plane_level != 0)
 			return -EINVAL;
 		e->irqchip.pin = ue->u.irqchip.pin;
 		switch (ue->u.irqchip.irqchip) {
@@ -332,7 +333,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->msi.address_lo = ue->u.msi.address_lo;
 		e->msi.address_hi = ue->u.msi.address_hi;
 		e->msi.data = ue->u.msi.data;
-		e->msi.plane_level = 0;
+		e->msi.plane_level = plane_level;
 
 		if (kvm_msi_route_invalid(kvm, e))
 			return -EINVAL;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 16dcca3132d3..cfb6911d6771 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2355,7 +2355,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
 int kvm_init_irq_routing(struct kvm *kvm);
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
-			  const struct kvm_irq_routing_entry *ue);
+			  const struct kvm_irq_routing_entry *ue,
+			  unsigned plane_level);
 void kvm_free_irq_routing(struct kvm *kvm);
 
 #else
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index ae47e56176f1..14480d1df4f9 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -146,7 +146,7 @@ static int setup_routing_entry(struct kvm *kvm,
 
 	e->gsi = gsi;
 	e->type = ue->type;
-	r = kvm_set_routing_entry(kvm, e, ue);
+	r = kvm_set_routing_entry(kvm, e, ue, 0);
 	if (r)
 		return r;
 	if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
-- 
2.53.0



^ permalink raw reply related

* [PATCH 00/60] KVM Planes + SEV-SNP Support
From: Jörg Rödel @ 2026-06-08 14:41 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel

From: Joerg Roedel <joerg.roedel@amd.com>

Hi,

Here is the updated patch-set implementing support for planes in KVM.
Planes is KVM's name for supporting various privilege separation
features of hardware (AMD SEV-SNP VMPLs, Intel TDX, ARM CCA Planes) or
software (Hyper-V VSM) in KVM.

The code posted here is based on prior work by Tom Lendacky, Roy
Hopkins[1] and Paolo Bonzini[2] as well as the numerous participants
of the KVM Planes BoF at KVM Forum 2024.

The user-space interface has slightly changed compared to the previous
patches posted by Paolo. The Documentation patch has the details.

The changes implement the base-support in KVM and X86 as well as the
parts required for AMD SEV-SNP VMPLs. The patches are based on
v7.1-rc7 and can be used to run an SEV-SNP VM with COCONUT-SVSM[3] in
VMPL0 with a Linux guest in VMPL2. An updated QEMU is needed as well,
the changes for that will be posted separately.

These changes depend on Melody's patches for supporting restricted
injection. As they are required to run COCONUT-SVSM, they are included
here for completeness.

KVM planes support as posted here has a number of known limitations:

	- Using planes requires IRQ-Chip in split mode
	- IRQFD not yet supported
	- Memory attributes are not per-plane yet - this is required
	  for VSM

The patches are also in this git branch:

	https://github.com/joergroedel/linux/tree/kvm-planes-v7.1

And can be used together with this QEMU tree:

	https://github.com/joergroedel/qemu/tree/qemu-planes-linux-v7.1

Please review.

-Joerg

[1] https://lore.kernel.org/all/cover.1726506534.git.roy.hopkins@suse.com/
[2] https://lore.kernel.org/all/20250401161106.790710-1-pbonzini@redhat.com/
[3] https://github.com/coconut-svsm/svsm/

Joerg Roedel (37):
  kvm: Introduce struct kvm_vcpu_common
  kvm: Move vcpu accounting to struct kvm_vcpu_common
  kvm: Add read accessors for kvm_vcpu scheduling state
  kvm: Make kvm_running_vcpus point to struct kvm_vcpu_common
  kvm: Move VCPU scheduling state to struct kvm_vcpu_common
  kvm: Add accessors for kvm_vcpu->mutex
  kvm: Move VCPU locking to struct kvm_vcpu_common
  kvm: Move kvm_vcpu->rcuwait to struct kvm_vcpu_common
  kvm: Introduce accessors for kvm_vcpu->mode
  kvm: Move kvm_vcpu mode and requests field to struct kvm_vcpu_common
  kvm: Introduce per-plane VCPU requests
  kvm: Move kvm_vcpu pid members to struct kvm_vcpu_common
  kvm: Move kvm_vcpu sigset members to struct kvm_vcpu_common
  kvm: Move kvm_vcpu spinloop members to struct kvm_vcpu_common
  kvm: Move kvm_vcpu->dirty_ring to struct kvm_vcpu_common
  kvm: Introduce arch-specific plane state
  kvm: Introduce arch-specific part of struct kvm_vcpu_common
  kvm: Allocate struct kvm_plane in architecture code
  KVM: Implement KVM_CREATE_VCPU ioctl for planes
  kvm: Keep track of plane VCPUs in struct kvm_vcpu_common
  kvm: Add VCPU plane-scheduling state and helpers
  kvm: Add plane_level to kvm_kernel_irq_routing_entry
  kvm: Pass plane_level to kvm_set_routing_entry()
  kvm: Make KVM_SET_GSI_ROUTING per plane
  kvm: x86: Handle IOAPIC EOIs per plane
  kvm: x86: Move CPUID state to struct kvm_vcpu_arch_common
  kvm: x86: Move cpu_caps to struct kvm_vcpu_arch_common
  kvm: x86: Update state for all plane VCPUs after CPUID update
  kvm: x86: Share MTRR state across planes
  kvm: x86: Select a plane to run
  kvm: x86: Make event injection VCPU requests per-plane
  kvm: x86: Allow hardware backend to overwrite struct kvm_plane
    allocation
  kvm: x86: Make KVM_REQ_UPDATE_PROTECTED_GUEST_STATE per plane
  kvm: x86: Share pio_data across planes
  kvm: x86: Switch to plane0 if it has events
  kvm: x86: Restrict KVM planes support to KVM_IRQCHIP_SPLIT
  kvm: svm: Track vmsa_features per plane

Melody Wang (7):
  x86/sev: Define the #HV doorbell page structure
  KVM: SVM: Add support for the SEV-SNP #HV doorbell page NAE event
  KVM: SVM: Inject #HV when Restricted Injection is active
  KVM: SVM: Inject NMIs when Restricted Injection is active
  KVM: SVM: Inject MCEs when Restricted Injection is active
  KVM: SVM: Enable Restricted Injection for an SEV-SNP guest
  KVM: SVM: Add support for the SEV-SNP #HV IPI NAE event

Paolo Bonzini (11):
  Documentation: kvm: introduce "VM plane" concept
  kvm: Introduce struct kvm_plane
  kvm: Move vcpu_array to struct kvm_plane
  kvm: Implement KVM_CAP_PLANES
  kvm: Implement KVM_CREATE_PLANE ioctl
  kvm: Add KVM_EXIT_PLANE_EVENT
  kvm: Allocate struct kvm_run only for struct kvm_vcpu_common
  kvm: Make KVM_SIGNAL_MSI per plane
  kvm: x86: Make apic_map per plane
  kvm: x86: Make local APIC code aware of planes
  kvm: x86: Introduce max_planes x86-op

Tom Lendacky (5):
  kvm: svm: Implement GET_AP_APIC_IDS NAE event
  kvm: sev: Allow for VMPL level specification in AP create
  kvm: svm: Invoke a specified VMPL level VMSA for the vCPU
  kvm: svm: Implement max_planes x86 operation
  kvm: svm: Advertise full multi-VMPL support to the SNP guest

 Documentation/virt/kvm/api.rst        | 102 +++-
 arch/arm64/include/asm/kvm_host.h     |  19 +-
 arch/arm64/kvm/arch_timer.c           |   3 +-
 arch/arm64/kvm/arm.c                  |  37 +-
 arch/arm64/kvm/inject_fault.c         |   4 +-
 arch/arm64/kvm/nested.c               |   2 +-
 arch/arm64/kvm/vgic/vgic-init.c       |   3 +-
 arch/arm64/kvm/vgic/vgic-irqfd.c      |   7 +-
 arch/loongarch/include/asm/kvm_host.h |  17 +
 arch/loongarch/kvm/intc/pch_pic.c     |   2 +-
 arch/loongarch/kvm/irqfd.c            |   5 +-
 arch/loongarch/kvm/timer.c            |   2 +-
 arch/loongarch/kvm/vcpu.c             |  16 +-
 arch/loongarch/kvm/vm.c               |  18 +
 arch/mips/include/asm/kvm_host.h      |  17 +
 arch/mips/kvm/mips.c                  |  35 +-
 arch/powerpc/include/asm/kvm_host.h   |  17 +
 arch/powerpc/kvm/book3s_pr.c          |   2 +-
 arch/powerpc/kvm/book3s_xics.c        |   4 +-
 arch/powerpc/kvm/book3s_xive.c        |   4 +-
 arch/powerpc/kvm/book3s_xive_native.c |   4 +-
 arch/powerpc/kvm/booke.c              |   2 +-
 arch/powerpc/kvm/mpic.c               |   6 +-
 arch/powerpc/kvm/powerpc.c            |  27 +-
 arch/powerpc/kvm/trace.h              |   2 +-
 arch/riscv/include/asm/kvm_host.h     |  17 +
 arch/riscv/kvm/aia_device.c           |   4 +-
 arch/riscv/kvm/main.c                 |  18 +
 arch/riscv/kvm/vcpu.c                 |  13 +-
 arch/riscv/kvm/vm.c                   |   6 +-
 arch/s390/include/asm/kvm_host.h      |  17 +
 arch/s390/kvm/interrupt.c             |  11 +-
 arch/s390/kvm/kvm-s390.c              |  33 +-
 arch/s390/kvm/pv.c                    |   2 +-
 arch/x86/include/asm/cpufeatures.h    |   1 +
 arch/x86/include/asm/kvm-x86-ops.h    |   4 +
 arch/x86/include/asm/kvm_host.h       |  96 ++--
 arch/x86/include/asm/sev-common.h     |   8 +
 arch/x86/include/asm/svm.h            |  42 ++
 arch/x86/include/uapi/asm/svm.h       |   9 +
 arch/x86/kvm/cpuid.c                  |  70 ++-
 arch/x86/kvm/cpuid.h                  |  31 +-
 arch/x86/kvm/hyperv.c                 |   2 +-
 arch/x86/kvm/i8254.c                  |   2 +-
 arch/x86/kvm/ioapic.c                 |   8 +-
 arch/x86/kvm/irq.c                    |  19 +-
 arch/x86/kvm/lapic.c                  | 144 +++--
 arch/x86/kvm/lapic.h                  |  14 +-
 arch/x86/kvm/mmu/mmu.c                |   4 +-
 arch/x86/kvm/mtrr.c                   |  12 +-
 arch/x86/kvm/smm.c                    |   2 +-
 arch/x86/kvm/svm/sev.c                | 644 ++++++++++++++++++++--
 arch/x86/kvm/svm/svm.c                |  85 ++-
 arch/x86/kvm/svm/svm.h                |  52 +-
 arch/x86/kvm/trace.h                  |   2 +-
 arch/x86/kvm/vmx/common.h             |   2 +-
 arch/x86/kvm/vmx/main.c               |  16 +-
 arch/x86/kvm/vmx/nested.h             |   4 +-
 arch/x86/kvm/vmx/posted_intr.c        |   2 +-
 arch/x86/kvm/vmx/vmx.c                |  11 +-
 arch/x86/kvm/vmx/x86_ops.h            |   1 +
 arch/x86/kvm/x86.c                    | 237 ++++++--
 arch/x86/kvm/x86.h                    |   5 +
 arch/x86/kvm/xen.c                    |   2 +-
 arch/x86/kvm/xen.h                    |   2 +-
 include/linux/kvm_host.h              | 278 ++++++++--
 include/linux/kvm_types.h             |   2 +
 include/uapi/linux/kvm.h              |  18 +
 virt/kvm/dirty_ring.c                 |   4 +-
 virt/kvm/irqchip.c                    |  13 +-
 virt/kvm/kvm_main.c                   | 764 +++++++++++++++++++-------
 71 files changed, 2460 insertions(+), 630 deletions(-)

-- 
2.53.0



^ permalink raw reply

* [PATCH 55/60] kvm: svm: Track vmsa_features per plane
From: Jörg Rödel @ 2026-06-08 14:42 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Joerg Roedel <joerg.roedel@amd.com>

Planes can have different sets of SEV features enabled. Track the
enabled features per plane instead of per VM.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kvm/svm/sev.c | 37 ++++++++++++++++++++-----------------
 arch/x86/kvm/svm/svm.c | 21 +++++++++++++++++++--
 arch/x86/kvm/svm/svm.h | 24 +++++++++++++++++++++---
 3 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index a23dcb081751..12b039823c1c 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -204,17 +204,16 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm)
 
 static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
 {
-	struct kvm_vcpu *vcpu = &svm->vcpu;
-	struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+	struct kvm_sev_info_plane *sev_plane = to_kvm_sev_info_plane(svm->vcpu.plane);
 
-	return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
+	return sev_plane->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
 }
 
 static bool snp_is_secure_tsc_enabled(struct kvm *kvm)
 {
-	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+	struct kvm_sev_info_plane *sev_plane = to_kvm_sev_info_plane(kvm->planes[0]);
 
-	return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) &&
+	return (sev_plane->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) &&
 	       !WARN_ON_ONCE(!sev_snp_guest(kvm));
 }
 
@@ -496,6 +495,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 			    struct kvm_sev_init *data,
 			    unsigned long vm_type)
 {
+	struct kvm_sev_info_plane *sev_plane = to_kvm_sev_info_plane(kvm->planes[0]);
 	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
 	struct sev_platform_init_args init_args = {0};
 	bool es_active = vm_type != KVM_X86_SEV_VM;
@@ -534,11 +534,11 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 
 	sev->active = true;
 	sev->es_active = es_active;
-	sev->vmsa_features = data->vmsa_features;
+	sev_plane->vmsa_features = data->vmsa_features;
 	sev->ghcb_version = data->ghcb_version;
 
 	if (snp_active)
-		sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
+		sev_plane->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
 
 	ret = sev_asid_new(sev, vm_type);
 	if (ret)
@@ -576,7 +576,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 	sev_asid_free(sev);
 	sev->asid = 0;
 e_no_asid:
-	sev->vmsa_features = 0;
+	sev_plane->vmsa_features = 0;
 	sev->es_active = false;
 	sev->active = false;
 	return ret;
@@ -931,7 +931,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 static int sev_es_sync_vmsa(struct vcpu_svm *svm)
 {
 	struct kvm_vcpu *vcpu = &svm->vcpu;
-	struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+	struct kvm_sev_info_plane *sev_plane = to_kvm_sev_info_plane(vcpu->plane);
 	struct sev_es_save_area *save = svm->sev_es.vmsa;
 	struct xregs_state *xsave;
 	const u8 *s;
@@ -982,7 +982,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
 	save->xss  = svm->vcpu.arch.ia32_xss;
 	save->dr6  = svm->vcpu.arch.dr6;
 
-	save->sev_features = sev->vmsa_features;
+	save->sev_features = sev_plane->vmsa_features;
 
 	/*
 	 * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
@@ -2026,6 +2026,8 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
 
 static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
 {
+	struct kvm_sev_info_plane *dst_plane = to_kvm_sev_info_plane(dst_kvm->planes[0]);
+	struct kvm_sev_info_plane *src_plane = to_kvm_sev_info_plane(src_kvm->planes[0]);
 	struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
 	struct kvm_sev_info *src = to_kvm_sev_info(src_kvm);
 	struct kvm_vcpu *dst_vcpu, *src_vcpu;
@@ -2039,7 +2041,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
 	dst->pages_locked = src->pages_locked;
 	dst->enc_context_owner = src->enc_context_owner;
 	dst->es_active = src->es_active;
-	dst->vmsa_features = src->vmsa_features;
+	dst_plane->vmsa_features = src_plane->vmsa_features;
 
 	src->asid = 0;
 	src->active = false;
@@ -4157,7 +4159,7 @@ static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
 
 static int sev_snp_ap_creation(struct vcpu_svm *svm)
 {
-	struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
+	struct kvm_sev_info_plane *sev_plane = to_kvm_sev_info_plane(svm->vcpu.plane);
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	struct kvm_vcpu *target_vcpu;
 	struct vcpu_svm *target_svm;
@@ -4182,9 +4184,9 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
 	switch (request) {
 	case SVM_VMGEXIT_AP_CREATE_ON_INIT:
 	case SVM_VMGEXIT_AP_CREATE:
-		if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
+		if (vcpu->arch.regs[VCPU_REGS_RAX] != sev_plane->vmsa_features) {
 			vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
-				    vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features);
+				    vcpu->arch.regs[VCPU_REGS_RAX], sev_plane->vmsa_features);
 			return -EINVAL;
 		}
 
@@ -4815,15 +4817,16 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
 
 static void sev_snp_init_vmcb(struct vcpu_svm *svm)
 {
-	struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+	struct kvm_sev_info_plane *sev_plane = &to_kvm_svm_plane(svm->vcpu.plane)->sev_info_plane;
 
 	/* V_NMI is not supported when Restricted Injection is enabled */
-	if (sev->vmsa_features & SVM_SEV_FEAT_RESTRICTED_INJECTION)
+	if (sev_plane->vmsa_features & SVM_SEV_FEAT_RESTRICTED_INJECTION)
 		svm->vmcb->control.int_ctl &= ~V_NMI_ENABLE_MASK;
 }
 
 static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
 {
+	struct kvm_sev_info_plane *sev_plane = to_kvm_sev_info_plane(svm->vcpu.plane);
 	struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
 	struct vmcb *vmcb = svm->vmcb01.ptr;
 	struct kvm_vcpu *vcpu = &svm->vcpu;
@@ -4845,7 +4848,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
 	}
 
 	if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
-		svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
+		svm->vmcb->control.allowed_sev_features = sev_plane->vmsa_features |
 							  VMCB_ALLOWED_SEV_FEATURES_VALID;
 
 	/* Can't intercept CR register access, HV can't modify CR registers */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 99357de14034..2ae82dc058c9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5304,6 +5304,23 @@ static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
 	return page_address(page);
 }
 
+static struct kvm_plane *svm_alloc_plane(void)
+{
+	struct kvm_svm_plane *svm_plane = kzalloc(sizeof(*svm_plane), GFP_KERNEL_ACCOUNT);
+
+	if (svm_plane)
+		return &svm_plane->plane;
+
+	return NULL;
+}
+
+static void svm_free_plane(struct kvm_plane *plane)
+{
+	struct kvm_svm_plane *svm_plane = to_kvm_svm_plane(plane);
+
+	kfree(svm_plane);
+}
+
 struct kvm_x86_ops svm_x86_ops __initdata = {
 	.name = KBUILD_MODNAME,
 
@@ -5446,8 +5463,8 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
 	.gmem_invalidate = sev_gmem_invalidate,
 	.gmem_max_mapping_level = sev_gmem_max_mapping_level,
 
-	.alloc_plane = x86_alloc_plane,
-	.free_plane = x86_free_plane,
+	.alloc_plane = svm_alloc_plane,
+	.free_plane = svm_free_plane,
 	.max_planes = kvm_x86_default_max_planes,
 };
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7d27ed7099a8..57033922ddcf 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -110,7 +110,6 @@ struct kvm_sev_info {
 	unsigned long pages_locked; /* Number of pages locked */
 	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
-	u64 vmsa_features;
 	u16 ghcb_version;	/* Highest guest GHCB protocol version allowed */
 	struct kvm *enc_context_owner; /* Owner of copied encryption context */
 	struct list_head mirror_vms; /* List of VMs mirroring */
@@ -140,6 +139,15 @@ struct kvm_svm {
 #endif
 };
 
+struct kvm_sev_info_plane {
+	u64 vmsa_features;
+};
+
+struct kvm_svm_plane {
+	struct kvm_plane plane;
+	struct kvm_sev_info_plane sev_info_plane;
+};
+
 struct kvm_vcpu;
 
 struct kvm_vmcb_info {
@@ -394,6 +402,16 @@ static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
 	return container_of(kvm, struct kvm_svm, kvm);
 }
 
+static __always_inline struct kvm_svm_plane *to_kvm_svm_plane(struct kvm_plane *plane)
+{
+	return container_of(plane, struct kvm_svm_plane, plane);
+}
+
+static __always_inline struct kvm_sev_info_plane *to_kvm_sev_info_plane(struct kvm_plane *plane)
+{
+	return &to_kvm_svm_plane(plane)->sev_info_plane;
+}
+
 #ifdef CONFIG_KVM_AMD_SEV
 static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm)
 {
@@ -413,7 +431,7 @@ static __always_inline bool ____sev_es_guest(struct kvm *kvm)
 
 static __always_inline bool ____sev_snp_guest(struct kvm *kvm)
 {
-	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+	struct kvm_sev_info_plane *sev = to_kvm_sev_info_plane(kvm->planes[0]);
 
 	return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) &&
 	       !WARN_ON_ONCE(!____sev_es_guest(kvm));
@@ -984,7 +1002,7 @@ void sev_snp_cancel_injection(struct kvm_vcpu *vcpu);
 bool sev_snp_blocked(enum inject_type type, struct kvm_vcpu *vcpu);
 static inline bool sev_snp_is_rinj_active(struct kvm_vcpu *vcpu)
 {
-	struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+	struct kvm_sev_info_plane *sev = &to_kvm_svm_plane(vcpu->plane)->sev_info_plane;
 
 	return is_sev_snp_guest(vcpu) &&
 		(sev->vmsa_features & SVM_SEV_FEAT_RESTRICTED_INJECTION);
-- 
2.53.0



^ permalink raw reply related

* [PATCH 27/60] kvm: Introduce arch-specific part of struct kvm_vcpu_common
From: Jörg Rödel @ 2026-06-08 14:42 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Joerg Roedel <joerg.roedel@amd.com>

Give architectures a place to store their VCPU state which is shared
across all planes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/arm64/include/asm/kvm_host.h     | 5 +++++
 arch/loongarch/include/asm/kvm_host.h | 5 +++++
 arch/mips/include/asm/kvm_host.h      | 5 +++++
 arch/powerpc/include/asm/kvm_host.h   | 5 +++++
 arch/riscv/include/asm/kvm_host.h     | 5 +++++
 arch/s390/include/asm/kvm_host.h      | 5 +++++
 arch/x86/include/asm/kvm_host.h       | 5 +++++
 include/linux/kvm_host.h              | 2 ++
 include/linux/kvm_types.h             | 1 +
 virt/kvm/kvm_main.c                   | 8 ++++++++
 10 files changed, 46 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e9cca2adb371..de9ca00ce4f4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -869,6 +869,11 @@ struct vcpu_reset_state {
 
 struct vncr_tlb;
 
+struct kvm_vcpu_arch_common {};
+
+static inline int kvm_arch_vcpu_common_init(struct kvm_vcpu_common *common) { return 0; }
+static inline void kvm_arch_vcpu_common_destroy(struct kvm_vcpu_common *common) {}
+
 struct kvm_vcpu_arch {
 	struct kvm_cpu_context ctxt;
 
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index 225aa87ebbdd..7317dceda6b4 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -150,6 +150,11 @@ struct kvm_arch {
 	struct loongarch_pch_pic *pch_pic;
 };
 
+struct kvm_vcpu_arch_common {};
+
+static inline int kvm_arch_vcpu_common_init(struct kvm_vcpu_common *common) { return 0; }
+static inline void kvm_arch_vcpu_common_destroy(struct kvm_vcpu_common *common) {}
+
 #define CSR_MAX_NUMS		0x800
 
 struct loongarch_csrs {
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b01911eb9064..c48bca79207b 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -194,6 +194,11 @@ struct kvm_arch {
 #endif
 };
 
+struct kvm_vcpu_arch_common {};
+
+static inline int kvm_arch_vcpu_common_init(struct kvm_vcpu_common *common) { return 0; }
+static inline void kvm_arch_vcpu_common_destroy(struct kvm_vcpu_common *common) {}
+
 #define N_MIPS_COPROC_REGS	32
 #define N_MIPS_COPROC_SEL	8
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c5b9fbaf34f3..47d9900c4f85 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -349,6 +349,11 @@ struct kvm_arch {
 #endif
 };
 
+struct kvm_vcpu_arch_common {};
+
+static inline int kvm_arch_vcpu_common_init(struct kvm_vcpu_common *common) { return 0; }
+static inline void kvm_arch_vcpu_common_destroy(struct kvm_vcpu_common *common) {}
+
 #define VCORE_ENTRY_MAP(vc)	((vc)->entry_exit_map & 0xff)
 #define VCORE_EXIT_MAP(vc)	((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)	(VCORE_EXIT_MAP(vc) != 0)
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index bcbf487d4cb7..397491587f5b 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -107,6 +107,11 @@ struct kvm_arch {
 	bool mp_state_reset;
 };
 
+struct kvm_vcpu_arch_common {};
+
+static inline int kvm_arch_vcpu_common_init(struct kvm_vcpu_common *common) { return 0; }
+static inline void kvm_arch_vcpu_common_destroy(struct kvm_vcpu_common *common) {}
+
 struct kvm_cpu_trap {
 	unsigned long sepc;
 	unsigned long scause;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index bb3bfbfd35d8..90fd8c0f1a2b 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -413,6 +413,11 @@ struct kvm_s390_pv_vcpu {
 	unsigned long stor_base;
 };
 
+struct kvm_vcpu_arch_common {};
+
+static inline int kvm_arch_vcpu_common_init(struct kvm_vcpu_common *common) { return 0; }
+static inline void kvm_arch_vcpu_common_destroy(struct kvm_vcpu_common *common) {}
+
 struct kvm_vcpu_arch {
 	struct kvm_s390_sie_block *sie_block;
 	/* if vsie is active, currently executed shadow sie control block */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dd95c70bfdba..1393566741a0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -794,6 +794,11 @@ enum kvm_only_cpuid_leafs {
 	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
 };
 
+struct kvm_vcpu_arch_common {};
+
+static inline int kvm_arch_vcpu_common_init(struct kvm_vcpu_common *common) { return 0; }
+static inline void kvm_arch_vcpu_common_destroy(struct kvm_vcpu_common *common) {}
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4a0eaa1de479..291bccce9b74 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -375,6 +375,8 @@ struct kvm_vcpu_common {
 	bool scheduled_out;
 
 	struct kvm_dirty_ring dirty_ring;
+
+	struct kvm_vcpu_arch_common arch;
 };
 
 struct kvm_vcpu {
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 07e82928c948..06799efe6a12 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -46,6 +46,7 @@ struct kvm_plane;
 struct kvm_run;
 struct kvm_userspace_memory_region;
 struct kvm_vcpu;
+struct kvm_vcpu_common;
 struct kvm_vcpu_init;
 struct kvm_memslots;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 91fb9abf9b31..7a0b632e3ac0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -490,6 +490,10 @@ static int kvm_vcpu_init_common(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned
 			goto out_drop_counter;
 	}
 
+	r = kvm_arch_vcpu_common_init(common);
+	if (r)
+		goto out_free_dirty_ring;
+
 	vcpu->common = no_free_ptr(common);
 
 	kvm_vcpu_set_in_spin_loop(vcpu, false);
@@ -497,6 +501,8 @@ static int kvm_vcpu_init_common(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned
 
 	return 0;
 
+out_free_dirty_ring:
+	kvm_dirty_ring_free(&common->dirty_ring);
 out_drop_counter:
 	mutex_lock(&kvm->lock);
 	kvm->created_vcpus--;
@@ -548,6 +554,8 @@ static void kvm_vcpu_common_destroy(struct kvm_vcpu *vcpu)
 	kvm->created_vcpus--;
 	mutex_unlock(&common->kvm->lock);
 
+	kvm_arch_vcpu_common_destroy(common);
+
 	/*
 	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 	 * the common->pid pointer, and at destruction time all file descriptors
-- 
2.53.0



^ permalink raw reply related

* [PATCH 54/60] kvm: x86: Restrict KVM planes support to KVM_IRQCHIP_SPLIT
From: Jörg Rödel @ 2026-06-08 14:42 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Joerg Roedel <joerg.roedel@amd.com>

The code currently supports plane-aware IOAPIC IRQ routing only for an
IRQ-chip in split mode. Enforce that restriction in the KVM x86 code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kvm/x86.c       | 8 ++++++--
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 5 +++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c6910356b061..0b9fa1059481 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -490,6 +490,10 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_default_max_planes);
 
 unsigned kvm_arch_max_planes(struct kvm *kvm)
 {
+	/* For now, planes are only supported with irqchip=split */
+	if (!irqchip_split(kvm))
+		return 1;
+
 	return kvm_x86_call(max_planes)(kvm);
 }
 
@@ -6833,7 +6837,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		if (cap->args[0] > KVM_MAX_IRQ_ROUTES)
 			goto split_irqchip_unlock;
 		r = -EEXIST;
-		if (irqchip_in_kernel(kvm))
+		if (irqchip_in_kernel(kvm) || kvm->has_planes)
 			goto split_irqchip_unlock;
 		if (kvm->created_vcpus)
 			goto split_irqchip_unlock;
@@ -7398,7 +7402,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 			goto create_irqchip_unlock;
 
 		r = -EINVAL;
-		if (kvm->created_vcpus)
+		if (kvm->created_vcpus || kvm->has_planes)
 			goto create_irqchip_unlock;
 
 		r = kvm_pic_init(kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3b62fb354267..dbf81e2520f2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -918,6 +918,7 @@ struct kvm {
 	struct list_head gpc_list;
 
 	struct kvm_plane *planes[KVM_MAX_PLANES];
+	bool has_planes;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8f1a16af519a..ff27cdbe8d92 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5477,6 +5477,10 @@ static int kvm_vm_ioctl_create_plane(struct kvm *kvm, unsigned id)
 	    WARN_ON_ONCE(id >= KVM_MAX_PLANES))
 		return -EINVAL;
 
+	/* Planes are only supported with in-kernel IRQ-chip */
+	if (!kvm_arch_irqchip_in_kernel(kvm))
+		return -EINVAL;
+
 	guard(mutex)(&kvm->lock);
 	if (kvm->planes[id])
 		return -EEXIST;
@@ -5498,6 +5502,7 @@ static int kvm_vm_ioctl_create_plane(struct kvm *kvm, unsigned id)
 		goto put_kvm;
 	}
 
+	kvm->has_planes = true;
 	fd_install(fd, file);
 	return fd;
 
-- 
2.53.0



^ permalink raw reply related

* [PATCH 51/60] kvm: x86: Share pio_data across planes
From: Jörg Rödel @ 2026-06-08 14:42 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Joerg Roedel <joerg.roedel@amd.com>

The vcpu->arch.pio_data pointer is memory mapped to user-space
alongside the kvm_run page. So it also needs to be common across all
planes for a given VCPU index.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/x86.c              | 20 +++++++++++---------
 virt/kvm/kvm_main.c             |  2 +-
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0327b77e56b7..1b7aa48c961e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -801,6 +801,8 @@ struct kvm_vcpu_arch_common {
 	bool cpuid_dynamic_bits_dirty;
 	bool is_amd_compatible;
 
+	void *pio_data;
+
 	/*
 	 * cpu_caps holds the effective guest capabilities, i.e. the features
 	 * the vCPU is allowed to use.  Typically, but not always, features can
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5f48392d4738..08fe65b8d57d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8532,7 +8532,7 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
 }
 
 static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
-      			   unsigned short port, void *val, unsigned int count)
+			   unsigned short port, void *val, unsigned int count)
 {
 	int r = emulator_pio_in_out(vcpu, size, port, val, count, true);
 	if (r)
@@ -12936,7 +12936,6 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 
 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 {
-	struct page *page;
 	int r;
 
 	vcpu->arch.last_vmentry_cpu = -1;
@@ -12960,10 +12959,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
 	r = -ENOMEM;
 
-	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (!page)
-		goto fail_free_lapic;
-	vcpu->arch.pio_data = page_address(page);
+	vcpu->arch.pio_data = vcpu->common->arch.pio_data;
 
 	vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
 				       GFP_KERNEL_ACCOUNT);
@@ -13023,8 +13019,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
 	kfree(vcpu->arch.mci_ctl2_banks);
-	free_page((unsigned long)vcpu->arch.pio_data);
-fail_free_lapic:
 	kvm_free_lapic(vcpu);
 fail_mmu_destroy:
 	kvm_mmu_destroy(vcpu);
@@ -13072,16 +13066,24 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	kvm_mmu_destroy(vcpu);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
 int kvm_arch_vcpu_common_init(struct kvm_vcpu_common *common)
 {
+	struct page *page;
+
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+
+	common->arch.pio_data = page_address(page);
+
 	return 0;
 }
 
 void kvm_arch_vcpu_common_destroy(struct kvm_vcpu_common *common)
 {
+	free_page((unsigned long)common->arch.pio_data);
 	kvfree(common->arch.cpuid_entries);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a6d7601c3412..8f1a16af519a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4250,7 +4250,7 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
 		page = virt_to_page(vcpu->run);
 #ifdef CONFIG_X86
 	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
-		page = virt_to_page(vcpu->arch.pio_data);
+		page = virt_to_page(vcpu->common->arch.pio_data);
 #endif
 #ifdef CONFIG_KVM_MMIO
 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
-- 
2.53.0



^ permalink raw reply related

* [PATCH 03/60] KVM: SVM: Inject #HV when Restricted Injection is active
From: Jörg Rödel @ 2026-06-08 14:41 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Melody Wang <huibo.wang@amd.com>

When Restricted Injection is active, only #HV exceptions can be injected into
the SEV-SNP guest. Detect that, and then follow the #HV doorbell communication
from the GHCB specification to inject the interrupt or exception.

Co-developed-by: Thomas Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Melody Wang <huibo.wang@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kvm/svm/sev.c | 164 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c |  14 +++-
 arch/x86/kvm/svm/svm.h |  21 ++++++
 3 files changed, 197 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index b9ad1169cb2c..f2f40f81ba86 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -5380,3 +5380,167 @@ void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
 
 	free_page((unsigned long)vmsa);
 }
+
+static void prepare_hv_injection(struct vcpu_svm *svm, struct hvdb *hvdb)
+{
+	if (hvdb->events.no_further_signal)
+		return;
+
+	svm->vmcb->control.event_inj = HV_VECTOR |
+				       SVM_EVTINJ_TYPE_EXEPT |
+				       SVM_EVTINJ_VALID;
+	svm->vmcb->control.event_inj_err = 0;
+
+	hvdb->events.no_further_signal = 1;
+}
+
+static void unmap_hvdb(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
+{
+	kvm_vcpu_unmap(vcpu, map);
+}
+
+static struct hvdb *map_hvdb(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!VALID_PAGE(svm->sev_es.hvdb_gpa))
+		return NULL;
+
+	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->sev_es.hvdb_gpa), map)) {
+		vcpu_unimpl(vcpu, "snp: error mapping #HV doorbell page [%#llx] from guest\n",
+			    svm->sev_es.hvdb_gpa);
+
+		return NULL;
+	}
+
+	return map->hva;
+}
+
+static void __sev_snp_inject(enum inject_type type, struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct kvm_host_map hvdb_map;
+	struct hvdb *hvdb;
+
+	hvdb = map_hvdb(vcpu, &hvdb_map);
+	if (!hvdb) {
+		WARN_ONCE(1, "Restricted Injection enabled, hvdb page mapping failed\n");
+		return;
+	}
+
+	hvdb->events.vector = vcpu->arch.interrupt.nr;
+
+	prepare_hv_injection(svm, hvdb);
+
+	unmap_hvdb(vcpu, &hvdb_map);
+}
+
+bool sev_snp_queue_exception(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!sev_snp_is_rinj_active(vcpu))
+		return false;
+
+	/*
+	 * Restricted Injection is enabled, only #HV is supported.
+	 * If the vector is not HV_VECTOR, do not inject the exception,
+	 * then return true to skip the original injection path.
+	 */
+	if (WARN_ONCE(vcpu->arch.exception.vector != HV_VECTOR,
+		      "Restricted Injection enabled, exception vector %u injection not supported\n",
+		      vcpu->arch.exception.vector))
+		return true;
+
+	/*
+	 * An intercept likely occurred during #HV delivery, so re-inject it
+	 * using the current HVDB pending event values.
+	 */
+	svm->vmcb->control.event_inj = HV_VECTOR |
+				       SVM_EVTINJ_TYPE_EXEPT |
+				       SVM_EVTINJ_VALID;
+	svm->vmcb->control.event_inj_err = 0;
+
+	return true;
+}
+
+bool sev_snp_inject(enum inject_type type, struct kvm_vcpu *vcpu)
+{
+	if (!sev_snp_is_rinj_active(vcpu))
+		return false;
+
+	__sev_snp_inject(type, vcpu);
+
+	return true;
+}
+
+void sev_snp_cancel_injection(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct kvm_host_map hvdb_map;
+	struct hvdb *hvdb;
+
+	if (!sev_snp_is_rinj_active(vcpu))
+		return;
+
+	if (!svm->vmcb->control.event_inj)
+		return;
+
+	if (WARN_ONCE((svm->vmcb->control.event_inj & SVM_EVTINJ_VEC_MASK) != HV_VECTOR,
+			"Restricted Injection enabled,  %u vector not supported\n",
+			svm->vmcb->control.event_inj & SVM_EVTINJ_VEC_MASK))
+		return;
+
+	/*
+	 * Copy the information in the doorbell page into the event injection
+	 * fields to complete the cancellation flow.
+	 */
+	hvdb = map_hvdb(vcpu, &hvdb_map);
+	if (!hvdb)
+		return;
+
+	if (!hvdb->events.pending_events) {
+		/* No pending events, then event_inj field should be 0 */
+		WARN_ON_ONCE(svm->vmcb->control.event_inj);
+		goto out;
+	}
+
+	/* Copy info back into event_inj field (replaces #HV) */
+	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID;
+
+	if (hvdb->events.vector)
+		svm->vmcb->control.event_inj |= hvdb->events.vector |
+						SVM_EVTINJ_TYPE_INTR;
+
+	hvdb->events.pending_events = 0;
+
+out:
+	unmap_hvdb(vcpu, &hvdb_map);
+}
+
+/*
+ * sev_snp_blocked() is for each vector - interrupt, NMI and MCE.  It is
+ * checking if there is an interrupt handled by the guest when
+ * another interrupt is pending. So hvdb->events.vector will be used for
+ * checking while no_further_signal is signaling to the guest that a #HV
+ * is presented by the hypervisor. So no_further_signal is checked when
+ * a #HV needs to be presented to the guest.
+ */
+bool sev_snp_blocked(enum inject_type type, struct kvm_vcpu *vcpu)
+{
+	struct kvm_host_map hvdb_map;
+	struct hvdb *hvdb;
+	bool blocked;
+
+	/* Indicate interrupts are blocked if doorbell page can't be mapped */
+	hvdb = map_hvdb(vcpu, &hvdb_map);
+	if (!hvdb)
+		return true;
+
+	/* Indicate interrupts blocked based on guest acknowledgment */
+	blocked = !!hvdb->events.vector;
+
+	unmap_hvdb(vcpu, &hvdb_map);
+
+	return blocked;
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7981e7583384..7253936c460c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -392,6 +392,9 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu)
 	    svm_update_soft_interrupt_rip(vcpu, ex->vector))
 		return;
 
+	if (sev_snp_queue_exception(vcpu))
+		return;
+
 	svm->vmcb->control.event_inj = ex->vector
 		| SVM_EVTINJ_VALID
 		| (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
@@ -3818,9 +3821,11 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
 	}
 
 	trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
-	++vcpu->stat.irq_injections;
 
-	svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
+	if (!sev_snp_inject(INJECT_IRQ, vcpu))
+		svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
+
+	++vcpu->stat.irq_injections;
 }
 
 static void svm_fixup_nested_rips(struct kvm_vcpu *vcpu)
@@ -3995,6 +4000,9 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 	if (!gif_set(svm))
 		return true;
 
+	if (sev_snp_is_rinj_active(vcpu))
+		return sev_snp_blocked(INJECT_IRQ, vcpu);
+
 	if (is_guest_mode(vcpu)) {
 		/* As long as interrupts are being delivered...  */
 		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
@@ -4345,6 +4353,8 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb_control_area *control = &svm->vmcb->control;
 
+	sev_snp_cancel_injection(vcpu);
+
 	control->exit_int_info = control->event_inj;
 	control->exit_int_info_err = control->event_inj_err;
 	control->event_inj = 0;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index fb956c37c941..a22ad5de03ea 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -55,6 +55,10 @@ extern int tsc_aux_uret_slot __ro_after_init;
 
 extern struct kvm_x86_ops svm_x86_ops __initdata;
 
+enum inject_type {
+	INJECT_IRQ,
+};
+
 /*
  * Clean bits in VMCB.
  * VMCB_ALL_CLEAN_MASK might also need to
@@ -971,6 +975,17 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
 int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
 struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
 void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
+bool sev_snp_queue_exception(struct kvm_vcpu *vcpu);
+bool sev_snp_inject(enum inject_type type, struct kvm_vcpu *vcpu);
+void sev_snp_cancel_injection(struct kvm_vcpu *vcpu);
+bool sev_snp_blocked(enum inject_type type, struct kvm_vcpu *vcpu);
+static inline bool sev_snp_is_rinj_active(struct kvm_vcpu *vcpu)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
+	return is_sev_snp_guest(vcpu) &&
+		(sev->vmsa_features & SVM_SEV_FEAT_RESTRICTED_INJECTION);
+};
 #else
 static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
 {
@@ -1008,6 +1023,12 @@ static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
 	return NULL;
 }
 static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {}
+
+static inline bool sev_snp_queue_exception(struct kvm_vcpu *vcpu) { return false; }
+static inline bool sev_snp_inject(enum inject_type type, struct kvm_vcpu *vcpu) { return false; }
+static inline void sev_snp_cancel_injection(struct kvm_vcpu *vcpu) {}
+static inline bool sev_snp_blocked(enum inject_type type, struct kvm_vcpu *vcpu) { return false; }
+static inline bool sev_snp_is_rinj_active(struct kvm_vcpu *vcpu) { return false; }
 #endif
 
 /* vmenter.S */
-- 
2.53.0



^ permalink raw reply related

* [PATCH 24/60] kvm: Move kvm_vcpu spinloop members to struct kvm_vcpu_common
From: Jörg Rödel @ 2026-06-08 14:42 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: Tom Lendacky, ashish.kalra, michael.roth, nsaenz, anelkz,
	James.Bottomley, Melody Wang, kvm, linux-kernel, kvmarm,
	loongarch, linux-mips, linuxppc-dev, kvm-riscv, x86, coconut-svsm,
	joerg.roedel
In-Reply-To: <20260608144252.351443-1-joro@8bytes.org>

From: Joerg Roedel <joerg.roedel@amd.com>

Only one struct kvm_vcpu across all planes can be in a spin-loop.
Move the state to struct kvm_vcpu_common to make detection independent
of the active struct kvm_vcpu.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 include/linux/kvm_host.h | 32 +++++++++++++++----------------
 virt/kvm/kvm_main.c      | 41 ++++++++++++++++++++++------------------
 2 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9220c452aa3a..f6e8a0b653b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -350,6 +350,20 @@ struct kvm_vcpu_common {
 	rwlock_t pid_lock;
 	int sigset_active;
 	sigset_t sigset;
+	unsigned int halt_poll_ns;
+
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+	/*
+	 * Cpu relax intercept or pause loop exit optimization
+	 * in_spin_loop: set when a vcpu does a pause loop exit
+	 *  or cpu relax intercepted.
+	 * dy_eligible: indicates whether vcpu is eligible for directed yield.
+	 */
+	struct {
+		bool in_spin_loop;
+		bool dy_eligible;
+	} spin_loop;
+#endif
 
 	/* Scheduling state */
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -373,8 +387,6 @@ struct kvm_vcpu {
 
 	struct kvm_run *run;
 
-	unsigned int halt_poll_ns;
-
 	u64 plane_requests;
 
 	/* S390 only */
@@ -398,18 +410,6 @@ struct kvm_vcpu {
 	} async_pf;
 #endif
 
-#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
-	/*
-	 * Cpu relax intercept or pause loop exit optimization
-	 * in_spin_loop: set when a vcpu does a pause loop exit
-	 *  or cpu relax intercepted.
-	 * dy_eligible: indicates whether vcpu is eligible for directed yield.
-	 */
-	struct {
-		bool in_spin_loop;
-		bool dy_eligible;
-	} spin_loop;
-#endif
 	struct kvm_vcpu_arch arch;
 	struct kvm_vcpu_stat stat;
 	char stats_id[KVM_STATS_NAME_SIZE];
@@ -2500,11 +2500,11 @@ extern struct kvm_device_ops kvm_arm_vgic_v5_ops;
 
 static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
 {
-	vcpu->spin_loop.in_spin_loop = val;
+	vcpu->common->spin_loop.in_spin_loop = val;
 }
 static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 {
-	vcpu->spin_loop.dy_eligible = val;
+	vcpu->common->spin_loop.dy_eligible = val;
 }
 
 #else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1858880ee3d3..24ff8748a317 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -485,6 +485,9 @@ static int kvm_vcpu_init_common(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned
 
 	vcpu->common = no_free_ptr(common);
 
+	kvm_vcpu_set_in_spin_loop(vcpu, false);
+	kvm_vcpu_set_dy_eligible(vcpu, false);
+
 	return 0;
 
 out_drop_counter:
@@ -515,8 +518,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->vcpu_id = id;
 	kvm_async_pf_vcpu_init(vcpu);
 
-	kvm_vcpu_set_in_spin_loop(vcpu, false);
-	kvm_vcpu_set_dy_eligible(vcpu, false);
 	vcpu->last_used_slot = NULL;
 
 	vcpu->plane_level = 0;
@@ -3721,9 +3722,10 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
 
 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
+	struct kvm_vcpu_common *common = vcpu->common;
 	unsigned int old, val, grow, grow_start;
 
-	old = val = vcpu->halt_poll_ns;
+	old = val = common->halt_poll_ns;
 	grow_start = READ_ONCE(halt_poll_ns_grow_start);
 	grow = READ_ONCE(halt_poll_ns_grow);
 	if (!grow)
@@ -3733,16 +3735,17 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 	if (val < grow_start)
 		val = grow_start;
 
-	vcpu->halt_poll_ns = val;
+	common->halt_poll_ns = val;
 out:
 	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
 }
 
 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
+	struct kvm_vcpu_common *common = vcpu->common;
 	unsigned int old, val, shrink, grow_start;
 
-	old = val = vcpu->halt_poll_ns;
+	old = val = common->halt_poll_ns;
 	shrink = READ_ONCE(halt_poll_ns_shrink);
 	grow_start = READ_ONCE(halt_poll_ns_grow_start);
 	if (shrink == 0)
@@ -3753,7 +3756,7 @@ static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 	if (val < grow_start)
 		val = 0;
 
-	vcpu->halt_poll_ns = val;
+	common->halt_poll_ns = val;
 	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
 }
 
@@ -3864,19 +3867,20 @@ void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 {
 	unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
 	bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
+	struct kvm_vcpu_common *common = vcpu->common;
 	ktime_t start, cur, poll_end;
 	bool waited = false;
 	bool do_halt_poll;
 	u64 halt_ns;
 
-	if (vcpu->halt_poll_ns > max_halt_poll_ns)
-		vcpu->halt_poll_ns = max_halt_poll_ns;
+	if (common->halt_poll_ns > max_halt_poll_ns)
+		common->halt_poll_ns = max_halt_poll_ns;
 
-	do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
+	do_halt_poll = halt_poll_allowed && common->halt_poll_ns;
 
 	start = cur = poll_end = ktime_get();
 	if (do_halt_poll) {
-		ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
+		ktime_t stop = ktime_add_ns(start, common->halt_poll_ns);
 
 		do {
 			if (kvm_vcpu_check_block(vcpu) < 0)
@@ -3914,18 +3918,18 @@ void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 		if (!vcpu_valid_wakeup(vcpu)) {
 			shrink_halt_poll_ns(vcpu);
 		} else if (max_halt_poll_ns) {
-			if (halt_ns <= vcpu->halt_poll_ns)
+			if (halt_ns <= common->halt_poll_ns)
 				;
 			/* we had a long block, shrink polling */
-			else if (vcpu->halt_poll_ns &&
+			else if (common->halt_poll_ns &&
 				 halt_ns > max_halt_poll_ns)
 				shrink_halt_poll_ns(vcpu);
 			/* we had a short halt and our poll time is too small */
-			else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
+			else if (common->halt_poll_ns < max_halt_poll_ns &&
 				 halt_ns < max_halt_poll_ns)
 				grow_halt_poll_ns(vcpu);
 		} else {
-			vcpu->halt_poll_ns = 0;
+			common->halt_poll_ns = 0;
 		}
 	}
 
@@ -4046,13 +4050,14 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_yield_to);
 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+	struct kvm_vcpu_common *common = vcpu->common;
 	bool eligible;
 
-	eligible = !vcpu->spin_loop.in_spin_loop ||
-		    vcpu->spin_loop.dy_eligible;
+	eligible = !common->spin_loop.in_spin_loop ||
+		    common->spin_loop.dy_eligible;
 
-	if (vcpu->spin_loop.in_spin_loop)
-		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+	if (common->spin_loop.in_spin_loop)
+		kvm_vcpu_set_dy_eligible(vcpu, !common->spin_loop.dy_eligible);
 
 	return eligible;
 #else
-- 
2.53.0



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox