* [PATCH net-next v2 12/14] ixd: add reset checks and initialize the mailbox
From: Tony Nguyen @ 2026-04-03 19:49 UTC (permalink / raw)
To: davem, kuba, pabeni, edumazet, andrew+netdev, netdev
Cc: Larysa Zaremba, anthony.l.nguyen, przemyslaw.kitszel,
aleksander.lobakin, sridhar.samudrala, anjali.singhai,
michal.swiatkowski, maciej.fijalkowski, emil.s.tantilov,
madhu.chittim, joshua.a.hay, jacob.e.keller,
jayaprakash.shanmugam, jiri, horms, corbet, richardcochran,
linux-doc, Aleksandr Loktionov, Bharath R
In-Reply-To: <20260403194938.3577011-1-anthony.l.nguyen@intel.com>
From: Larysa Zaremba <larysa.zaremba@intel.com>
At the end of probe, trigger a hard reset, then initialize and schedule
the after-reset task. If the reset completes within a predetermined time,
initialize the default mailbox, through which other resources will be
negotiated.
Co-developed-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Bharath R <Bharath.r@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
drivers/net/ethernet/intel/ixd/Kconfig | 1 +
drivers/net/ethernet/intel/ixd/Makefile | 2 +
drivers/net/ethernet/intel/ixd/ixd.h | 28 +++-
drivers/net/ethernet/intel/ixd/ixd_dev.c | 89 +++++++++++
drivers/net/ethernet/intel/ixd/ixd_lan_regs.h | 40 +++++
drivers/net/ethernet/intel/ixd/ixd_lib.c | 143 ++++++++++++++++++
drivers/net/ethernet/intel/ixd/ixd_main.c | 32 +++-
7 files changed, 326 insertions(+), 9 deletions(-)
create mode 100644 drivers/net/ethernet/intel/ixd/ixd_dev.c
create mode 100644 drivers/net/ethernet/intel/ixd/ixd_lib.c
diff --git a/drivers/net/ethernet/intel/ixd/Kconfig b/drivers/net/ethernet/intel/ixd/Kconfig
index f5594efe292c..24510c50070e 100644
--- a/drivers/net/ethernet/intel/ixd/Kconfig
+++ b/drivers/net/ethernet/intel/ixd/Kconfig
@@ -5,6 +5,7 @@ config IXD
tristate "Intel(R) Control Plane Function Support"
depends on PCI_MSI
select LIBETH
+ select LIBIE_CP
select LIBIE_PCI
help
This driver supports Intel(R) Control Plane PCI Function
diff --git a/drivers/net/ethernet/intel/ixd/Makefile b/drivers/net/ethernet/intel/ixd/Makefile
index 3849bc240600..164b2c86952f 100644
--- a/drivers/net/ethernet/intel/ixd/Makefile
+++ b/drivers/net/ethernet/intel/ixd/Makefile
@@ -6,3 +6,5 @@
obj-$(CONFIG_IXD) += ixd.o
ixd-y := ixd_main.o
+ixd-y += ixd_dev.o
+ixd-y += ixd_lib.o
diff --git a/drivers/net/ethernet/intel/ixd/ixd.h b/drivers/net/ethernet/intel/ixd/ixd.h
index d813c27941a5..99c44f2aa659 100644
--- a/drivers/net/ethernet/intel/ixd/ixd.h
+++ b/drivers/net/ethernet/intel/ixd/ixd.h
@@ -4,14 +4,25 @@
#ifndef _IXD_H_
#define _IXD_H_
-#include <linux/intel/libie/pci.h>
+#include <linux/intel/libie/controlq.h>
/**
* struct ixd_adapter - Data structure representing a CPF
- * @hw: Device access data
+ * @cp_ctx: Control plane communication context
+ * @init_task: Delayed initialization after reset
+ * @xnm: virtchnl transaction manager
+ * @asq: Send control queue info
+ * @arq: Receive control queue info
*/
struct ixd_adapter {
- struct libie_mmio_info hw;
+ struct libie_ctlq_ctx cp_ctx;
+ struct {
+ struct delayed_work init_work;
+ u8 reset_retries;
+ } init_task;
+ struct libie_ctlq_xn_manager *xnm;
+ struct libie_ctlq_info *asq;
+ struct libie_ctlq_info *arq;
};
/**
@@ -22,7 +33,16 @@ struct ixd_adapter {
*/
static inline struct device *ixd_to_dev(struct ixd_adapter *adapter)
{
- return &adapter->hw.pdev->dev;
+ return &adapter->cp_ctx.mmio_info.pdev->dev;
}
+void ixd_ctlq_reg_init(struct ixd_adapter *adapter,
+ struct libie_ctlq_reg *ctlq_reg_tx,
+ struct libie_ctlq_reg *ctlq_reg_rx);
+void ixd_trigger_reset(struct ixd_adapter *adapter);
+bool ixd_check_reset_complete(struct ixd_adapter *adapter);
+void ixd_init_task(struct work_struct *work);
+int ixd_init_dflt_mbx(struct ixd_adapter *adapter);
+void ixd_deinit_dflt_mbx(struct ixd_adapter *adapter);
+
#endif /* _IXD_H_ */
diff --git a/drivers/net/ethernet/intel/ixd/ixd_dev.c b/drivers/net/ethernet/intel/ixd/ixd_dev.c
new file mode 100644
index 000000000000..cdd5477cc1f4
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixd/ixd_dev.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#include "ixd.h"
+#include "ixd_lan_regs.h"
+
+/**
+ * ixd_ctlq_reg_init - Initialize default mailbox registers
+ * @adapter: PCI device driver-specific private data
+ * @ctlq_reg_tx: Transmit queue registers info to be filled
+ * @ctlq_reg_rx: Receive queue registers info to be filled
+ */
+void ixd_ctlq_reg_init(struct ixd_adapter *adapter,
+ struct libie_ctlq_reg *ctlq_reg_tx,
+ struct libie_ctlq_reg *ctlq_reg_rx)
+{
+ struct libie_mmio_info *mmio_info = &adapter->cp_ctx.mmio_info;
+ *ctlq_reg_tx = (struct libie_ctlq_reg) {
+ .head = libie_pci_get_mmio_addr(mmio_info, PF_FW_ATQH),
+ .tail = libie_pci_get_mmio_addr(mmio_info, PF_FW_ATQT),
+ .len = libie_pci_get_mmio_addr(mmio_info, PF_FW_ATQLEN),
+ .addr_high = libie_pci_get_mmio_addr(mmio_info, PF_FW_ATQBAH),
+ .addr_low = libie_pci_get_mmio_addr(mmio_info, PF_FW_ATQBAL),
+ .len_mask = PF_FW_ATQLEN_ATQLEN_M,
+ .len_ena_mask = PF_FW_ATQLEN_ATQENABLE_M,
+ .head_mask = PF_FW_ATQH_ATQH_M,
+ };
+
+ *ctlq_reg_rx = (struct libie_ctlq_reg) {
+ .head = libie_pci_get_mmio_addr(mmio_info, PF_FW_ARQH),
+ .tail = libie_pci_get_mmio_addr(mmio_info, PF_FW_ARQT),
+ .len = libie_pci_get_mmio_addr(mmio_info, PF_FW_ARQLEN),
+ .addr_high = libie_pci_get_mmio_addr(mmio_info, PF_FW_ARQBAH),
+ .addr_low = libie_pci_get_mmio_addr(mmio_info, PF_FW_ARQBAL),
+ .len_mask = PF_FW_ARQLEN_ARQLEN_M,
+ .len_ena_mask = PF_FW_ARQLEN_ARQENABLE_M,
+ .head_mask = PF_FW_ARQH_ARQH_M,
+ };
+}
+
+static const struct ixd_reset_reg ixd_reset_reg = {
+ .rstat = PFGEN_RSTAT,
+ .rstat_m = PFGEN_RSTAT_PFR_STATE_M,
+ .rstat_ok_v = 0b01,
+ .rtrigger = PFGEN_CTRL,
+ .rtrigger_m = PFGEN_CTRL_PFSWR,
+};
+
+/**
+ * ixd_trigger_reset - Trigger PFR reset
+ * @adapter: the device with mapped reset register
+ */
+void ixd_trigger_reset(struct ixd_adapter *adapter)
+{
+ void __iomem *addr;
+ u32 reg_val;
+
+ addr = libie_pci_get_mmio_addr(&adapter->cp_ctx.mmio_info,
+ ixd_reset_reg.rtrigger);
+ reg_val = readl(addr);
+ writel(reg_val | ixd_reset_reg.rtrigger_m, addr);
+}
+
+/**
+ * ixd_check_reset_complete - Check if the PFR reset is completed
+ * @adapter: CPF being reset
+ *
+ * Return: %true if the register read indicates reset has been finished,
+ * %false otherwise
+ */
+bool ixd_check_reset_complete(struct ixd_adapter *adapter)
+{
+ u32 reg_val, reset_status;
+ void __iomem *addr;
+
+ addr = libie_pci_get_mmio_addr(&adapter->cp_ctx.mmio_info,
+ ixd_reset_reg.rstat);
+ reg_val = readl(addr);
+ reset_status = reg_val & ixd_reset_reg.rstat_m;
+
+ /* 0xFFFFFFFF might be read if the other side hasn't cleared
+ * the register for us yet.
+ */
+ if (reg_val != GENMASK(31, 0) &&
+ reset_status == ixd_reset_reg.rstat_ok_v)
+ return true;
+
+ return false;
+}
diff --git a/drivers/net/ethernet/intel/ixd/ixd_lan_regs.h b/drivers/net/ethernet/intel/ixd/ixd_lan_regs.h
index fbb88929d0de..58e58c75981b 100644
--- a/drivers/net/ethernet/intel/ixd/ixd_lan_regs.h
+++ b/drivers/net/ethernet/intel/ixd/ixd_lan_regs.h
@@ -11,9 +11,33 @@
#define PF_FW_MBX_REG_LEN 4096
#define PF_FW_MBX 0x08400000
+#define PF_FW_ARQBAL (PF_FW_MBX)
+#define PF_FW_ARQBAH (PF_FW_MBX + 0x4)
+#define PF_FW_ARQLEN (PF_FW_MBX + 0x8)
+#define PF_FW_ARQLEN_ARQLEN_M GENMASK(12, 0)
+#define PF_FW_ARQLEN_ARQENABLE_S 31
+#define PF_FW_ARQLEN_ARQENABLE_M BIT(PF_FW_ARQLEN_ARQENABLE_S)
+#define PF_FW_ARQH_ARQH_M GENMASK(12, 0)
+#define PF_FW_ARQH (PF_FW_MBX + 0xC)
+#define PF_FW_ARQT (PF_FW_MBX + 0x10)
+
+#define PF_FW_ATQBAL (PF_FW_MBX + 0x14)
+#define PF_FW_ATQBAH (PF_FW_MBX + 0x18)
+#define PF_FW_ATQLEN (PF_FW_MBX + 0x1C)
+#define PF_FW_ATQLEN_ATQLEN_M GENMASK(9, 0)
+#define PF_FW_ATQLEN_ATQENABLE_S 31
+#define PF_FW_ATQLEN_ATQENABLE_M BIT(PF_FW_ATQLEN_ATQENABLE_S)
+#define PF_FW_ATQH_ATQH_M GENMASK(9, 0)
+#define PF_FW_ATQH (PF_FW_MBX + 0x20)
+#define PF_FW_ATQT (PF_FW_MBX + 0x24)
+
/* Reset registers */
#define PFGEN_RTRIG_REG_LEN 2048
#define PFGEN_RTRIG 0x08407000 /* Device resets */
+#define PFGEN_RSTAT 0x08407008 /* PFR status */
+#define PFGEN_RSTAT_PFR_STATE_M GENMASK(1, 0)
+#define PFGEN_CTRL 0x0840700C /* PFR trigger */
+#define PFGEN_CTRL_PFSWR BIT(0)
/**
* struct ixd_bar_region - BAR region description
@@ -25,4 +49,20 @@ struct ixd_bar_region {
resource_size_t size;
};
+/**
+ * struct ixd_reset_reg - structure for reset registers
+ * @rstat: offset of the reset status register
+ * @rstat_m: mask of the reset state field within the status register
+ * @rstat_ok_v: masked status value that indicates a completed PFR
+ * @rtrigger: offset of the reset trigger register
+ * @rtrigger_m: bit(s) to set in the trigger register to start a reset
+ */
+struct ixd_reset_reg {
+ u32 rstat;
+ u32 rstat_m;
+ u32 rstat_ok_v;
+ u32 rtrigger;
+ u32 rtrigger_m;
+};
+
#endif /* _IXD_LAN_REGS_H_ */
diff --git a/drivers/net/ethernet/intel/ixd/ixd_lib.c b/drivers/net/ethernet/intel/ixd/ixd_lib.c
new file mode 100644
index 000000000000..afc413d3650f
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixd/ixd_lib.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#include "ixd.h"
+
+#define IXD_DFLT_MBX_Q_LEN 64
+
+/**
+ * ixd_init_ctlq_create_info - Initialize control queue info for creation
+ * @info: destination, fully overwritten
+ * @type: type of the queue to create
+ * @ctlq_reg: registers assigned to the control queue, copied into @info
+ */
+static void ixd_init_ctlq_create_info(struct libie_ctlq_create_info *info,
+ enum virtchnl2_queue_type type,
+ const struct libie_ctlq_reg *ctlq_reg)
+{
+ *info = (struct libie_ctlq_create_info) {
+ .type = type,
+ .id = -1,
+ .reg = *ctlq_reg,
+ .len = IXD_DFLT_MBX_Q_LEN,
+ };
+}
+
+/**
+ * ixd_init_libie_xn_params - Initialize xn transaction manager creation info
+ * @params: destination
+ * @adapter: adapter info struct
+ * @ctlqs: list of the managed queues to create
+ * @num_queues: length of the queue list
+ */
+static void ixd_init_libie_xn_params(struct libie_ctlq_xn_init_params *params,
+ struct ixd_adapter *adapter,
+ struct libie_ctlq_create_info *ctlqs,
+ uint num_queues)
+{
+ *params = (struct libie_ctlq_xn_init_params){
+ .cctlq_info = ctlqs,
+ .ctx = &adapter->cp_ctx,
+ .num_qs = num_queues,
+ };
+}
+
+/**
+ * ixd_adapter_fill_dflt_ctlqs - Find default control queues and store them
+ * @adapter: adapter info struct
+ */
+static void ixd_adapter_fill_dflt_ctlqs(struct ixd_adapter *adapter)
+{
+ guard(spinlock)(&adapter->cp_ctx.ctlqs_lock);
+ struct libie_ctlq_info *cq;
+
+ list_for_each_entry(cq, &adapter->cp_ctx.ctlqs, list) {
+ if (cq->qid != -1)
+ continue;
+ if (cq->type == LIBIE_CTLQ_TYPE_RX)
+ adapter->arq = cq;
+ else if (cq->type == LIBIE_CTLQ_TYPE_TX)
+ adapter->asq = cq;
+ }
+}
+
+/**
+ * ixd_deinit_dflt_mbx - Deinitialize default mailbox
+ * @adapter: adapter info struct
+ */
+void ixd_deinit_dflt_mbx(struct ixd_adapter *adapter)
+{
+ if (adapter->xnm)
+ libie_ctlq_xn_deinit(adapter->xnm, &adapter->cp_ctx);
+
+ adapter->arq = NULL;
+ adapter->asq = NULL;
+ adapter->xnm = NULL;
+}
+
+/**
+ * ixd_init_dflt_mbx - Set up the default mailbox queues and the xn manager
+ * @adapter: adapter info struct
+ *
+ * Return: %0 on success, -ENOENT if a default queue is missing, or other -errno
+ */
+int ixd_init_dflt_mbx(struct ixd_adapter *adapter)
+{
+ struct libie_ctlq_create_info ctlqs_info[2];
+ struct libie_ctlq_xn_init_params xn_params;
+ struct libie_ctlq_reg ctlq_reg_tx;
+ struct libie_ctlq_reg ctlq_reg_rx;
+ int err;
+
+ ixd_ctlq_reg_init(adapter, &ctlq_reg_tx, &ctlq_reg_rx);
+ ixd_init_ctlq_create_info(&ctlqs_info[0], LIBIE_CTLQ_TYPE_TX,
+ &ctlq_reg_tx);
+ ixd_init_ctlq_create_info(&ctlqs_info[1], LIBIE_CTLQ_TYPE_RX,
+ &ctlq_reg_rx);
+ ixd_init_libie_xn_params(&xn_params, adapter, ctlqs_info,
+ ARRAY_SIZE(ctlqs_info));
+ err = libie_ctlq_xn_init(&xn_params);
+ if (err)
+ return err;
+ adapter->xnm = xn_params.xnm;
+
+ ixd_adapter_fill_dflt_ctlqs(adapter);
+
+ if (!adapter->asq || !adapter->arq) {
+ ixd_deinit_dflt_mbx(adapter);
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/**
+ * ixd_init_task - Initialize after reset
+ * @work: init work struct
+ */
+void ixd_init_task(struct work_struct *work)
+{
+ struct ixd_adapter *adapter;
+ int err;
+
+ adapter = container_of(work, struct ixd_adapter,
+ init_task.init_work.work);
+
+ if (!ixd_check_reset_complete(adapter)) {
+ if (++adapter->init_task.reset_retries < 10)
+ queue_delayed_work(system_unbound_wq,
+ &adapter->init_task.init_work,
+ msecs_to_jiffies(500));
+ else
+ dev_err(ixd_to_dev(adapter),
+ "Device reset failed. The driver was unable to contact the device's firmware. Check that the FW is running.\n");
+ return;
+ }
+
+ adapter->init_task.reset_retries = 0;
+ err = ixd_init_dflt_mbx(adapter);
+ if (err)
+ dev_err(ixd_to_dev(adapter),
+ "Failed to initialize the default mailbox: %pe\n",
+ ERR_PTR(err));
+}
diff --git a/drivers/net/ethernet/intel/ixd/ixd_main.c b/drivers/net/ethernet/intel/ixd/ixd_main.c
index 75ee53152e61..b4d4000b63ed 100644
--- a/drivers/net/ethernet/intel/ixd/ixd_main.c
+++ b/drivers/net/ethernet/intel/ixd/ixd_main.c
@@ -5,6 +5,7 @@
#include "ixd_lan_regs.h"
MODULE_DESCRIPTION("Intel(R) Control Plane Function Device Driver");
+MODULE_IMPORT_NS("LIBIE_CP");
MODULE_IMPORT_NS("LIBIE_PCI");
MODULE_LICENSE("GPL");
@@ -16,7 +17,13 @@ static void ixd_remove(struct pci_dev *pdev)
{
struct ixd_adapter *adapter = pci_get_drvdata(pdev);
- libie_pci_unmap_all_mmio_regions(&adapter->hw);
+ /* Do not mix removal with (re)initialization */
+ cancel_delayed_work_sync(&adapter->init_task.init_work);
+ /* Leave the device clean on exit */
+ ixd_trigger_reset(adapter);
+ ixd_deinit_dflt_mbx(adapter);
+
+ libie_pci_unmap_all_mmio_regions(&adapter->cp_ctx.mmio_info);
}
/**
@@ -51,7 +58,7 @@ static int ixd_iomap_regions(struct ixd_adapter *adapter)
};
for (int i = 0; i < ARRAY_SIZE(regions); i++) {
- struct libie_mmio_info *mmio_info = &adapter->hw;
+ struct libie_mmio_info *mmio_info = &adapter->cp_ctx.mmio_info;
bool map_ok;
map_ok = libie_pci_map_mmio_region(mmio_info,
@@ -81,11 +88,15 @@ static int ixd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
struct ixd_adapter *adapter;
int err;
+ if (WARN_ON(ent->device != IXD_DEV_ID_CPF))
+ return -EINVAL;
+
adapter = devm_kzalloc(&pdev->dev, sizeof(*adapter), GFP_KERNEL);
if (!adapter)
return -ENOMEM;
- adapter->hw.pdev = pdev;
- INIT_LIST_HEAD(&adapter->hw.mmio_list);
+
+ adapter->cp_ctx.mmio_info.pdev = pdev;
+ INIT_LIST_HEAD(&adapter->cp_ctx.mmio_info.mmio_list);
err = libie_pci_init_dev(pdev);
if (err)
@@ -93,7 +104,18 @@ static int ixd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
pci_set_drvdata(pdev, adapter);
- return ixd_iomap_regions(adapter);
+ err = ixd_iomap_regions(adapter);
+ if (err)
+ return err;
+
+ INIT_DELAYED_WORK(&adapter->init_task.init_work,
+ ixd_init_task);
+
+ ixd_trigger_reset(adapter);
+ queue_delayed_work(system_unbound_wq, &adapter->init_task.init_work,
+ msecs_to_jiffies(500));
+
+ return 0;
}
static const struct pci_device_id ixd_pci_tbl[] = {
--
2.47.1
^ permalink raw reply related
* [PATCH net-next v2 13/14] ixd: add the core initialization
From: Tony Nguyen @ 2026-04-03 19:49 UTC (permalink / raw)
To: davem, kuba, pabeni, edumazet, andrew+netdev, netdev
Cc: Larysa Zaremba, anthony.l.nguyen, przemyslaw.kitszel,
aleksander.lobakin, sridhar.samudrala, anjali.singhai,
michal.swiatkowski, maciej.fijalkowski, emil.s.tantilov,
madhu.chittim, joshua.a.hay, jacob.e.keller,
jayaprakash.shanmugam, jiri, horms, corbet, richardcochran,
linux-doc, Bharath R, Aleksandr Loktionov
In-Reply-To: <20260403194938.3577011-1-anthony.l.nguyen@intel.com>
From: Larysa Zaremba <larysa.zaremba@intel.com>
Once the mailbox is set up, initialize the core. This makes use of the send
and receive mailbox message framework for virtchnl communication between
the driver and the device Control Plane (CP).
To start with, the driver confirms the virtchnl version with the CP. Once
that is done, it requests and gets the required capabilities and resources,
such as max vectors, queues, vports, etc.
Use a unified way of handling the virtchnl messages, where a single
function handles all related memory management and the caller only provides
the callbacks to fill the send buffer and to handle the response.
Place generic control queue message handling separately to facilitate the
addition of protocols other than virtchannel in the future.
Co-developed-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Tested-by: Bharath R <Bharath.r@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
drivers/net/ethernet/intel/ixd/Makefile | 2 +
drivers/net/ethernet/intel/ixd/ixd.h | 10 +
drivers/net/ethernet/intel/ixd/ixd_ctlq.c | 149 +++++++++++++++
drivers/net/ethernet/intel/ixd/ixd_ctlq.h | 33 ++++
drivers/net/ethernet/intel/ixd/ixd_lib.c | 25 ++-
drivers/net/ethernet/intel/ixd/ixd_main.c | 3 +
drivers/net/ethernet/intel/ixd/ixd_virtchnl.c | 178 ++++++++++++++++++
drivers/net/ethernet/intel/ixd/ixd_virtchnl.h | 12 ++
8 files changed, 411 insertions(+), 1 deletion(-)
create mode 100644 drivers/net/ethernet/intel/ixd/ixd_ctlq.c
create mode 100644 drivers/net/ethernet/intel/ixd/ixd_ctlq.h
create mode 100644 drivers/net/ethernet/intel/ixd/ixd_virtchnl.c
create mode 100644 drivers/net/ethernet/intel/ixd/ixd_virtchnl.h
diff --git a/drivers/net/ethernet/intel/ixd/Makefile b/drivers/net/ethernet/intel/ixd/Makefile
index 164b2c86952f..90abf231fb16 100644
--- a/drivers/net/ethernet/intel/ixd/Makefile
+++ b/drivers/net/ethernet/intel/ixd/Makefile
@@ -6,5 +6,7 @@
obj-$(CONFIG_IXD) += ixd.o
ixd-y := ixd_main.o
+ixd-y += ixd_ctlq.o
ixd-y += ixd_dev.o
ixd-y += ixd_lib.o
+ixd-y += ixd_virtchnl.o
diff --git a/drivers/net/ethernet/intel/ixd/ixd.h b/drivers/net/ethernet/intel/ixd/ixd.h
index 99c44f2aa659..98d1f22534b5 100644
--- a/drivers/net/ethernet/intel/ixd/ixd.h
+++ b/drivers/net/ethernet/intel/ixd/ixd.h
@@ -10,19 +10,29 @@
* struct ixd_adapter - Data structure representing a CPF
* @cp_ctx: Control plane communication context
* @init_task: Delayed initialization after reset
+ * @mbx_task: Control queue Rx handling
* @xnm: virtchnl transaction manager
* @asq: Send control queue info
* @arq: Receive control queue info
+ * @vc_ver: Negotiated virtchnl version
+ * @caps: Negotiated virtchnl capabilities
*/
struct ixd_adapter {
struct libie_ctlq_ctx cp_ctx;
struct {
struct delayed_work init_work;
u8 reset_retries;
+ u8 vc_retries;
} init_task;
+ struct delayed_work mbx_task;
struct libie_ctlq_xn_manager *xnm;
struct libie_ctlq_info *asq;
struct libie_ctlq_info *arq;
+ struct {
+ u32 major;
+ u32 minor;
+ } vc_ver;
+ struct virtchnl2_get_capabilities caps;
};
/**
diff --git a/drivers/net/ethernet/intel/ixd/ixd_ctlq.c b/drivers/net/ethernet/intel/ixd/ixd_ctlq.c
new file mode 100644
index 000000000000..216aa5c02122
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixd/ixd_ctlq.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#include "ixd.h"
+#include "ixd_ctlq.h"
+#include "ixd_virtchnl.h"
+
+/**
+ * ixd_ctlq_clean_sq - Clean the send control queue after sending the message
+ * @adapter: The adapter that sent the messages
+ * @num_sent: Number of sent messages to be released
+ *
+ * Release the libie send resources once the message was sent and its
+ * response handled. Nothing is done when @num_sent is zero.
+ */
+static void ixd_ctlq_clean_sq(struct ixd_adapter *adapter, u16 num_sent)
+{
+ if (!num_sent)
+ return;
+
+ struct libie_ctlq_xn_clean_params params = {
+ .ctlq = adapter->asq,
+ .ctx = &adapter->cp_ctx,
+ .num_msgs = num_sent,
+ .rel_tx_buf = kfree,
+ };
+
+ libie_ctlq_xn_send_clean(&params);
+}
+
+/**
+ * ixd_ctlq_init_sparams - Initialize control queue send parameters
+ * @adapter: The adapter with initialized mailbox
+ * @sparams: Parameters to initialize
+ * @msg_buf: DMA-mappable pointer to the message being sent
+ * @msg_size: Message size
+ */
+static void ixd_ctlq_init_sparams(struct ixd_adapter *adapter,
+ struct libie_ctlq_xn_send_params *sparams,
+ void *msg_buf, size_t msg_size)
+{
+ *sparams = (struct libie_ctlq_xn_send_params) {
+ .rel_tx_buf = kfree,
+ .xnm = adapter->xnm,
+ .ctlq = adapter->asq,
+ .timeout_ms = IXD_CTLQ_TIMEOUT,
+ .send_buf = (struct kvec) {
+ .iov_base = msg_buf,
+ .iov_len = msg_size,
+ },
+ };
+}
+
+/**
+ * ixd_ctlq_do_req - Perform a standard virtchnl request
+ * @adapter: The adapter with initialized mailbox
+ * @req: virtchnl request description
+ *
+ * Return: %0 if a message was sent and received a response
+ * that was successfully handled by the custom callback,
+ * negative error otherwise.
+ */
+int ixd_ctlq_do_req(struct ixd_adapter *adapter, const struct ixd_ctlq_req *req)
+{
+ struct libie_ctlq_xn_send_params send_params = {};
+ u8 onstack_send_buff[LIBIE_CP_TX_COPYBREAK] = {};
+ struct kvec *recv_mem;
+ void *send_buff;
+ int err;
+
+ send_buff = libie_cp_can_send_onstack(req->send_size) ?
+ &onstack_send_buff : kzalloc(req->send_size, GFP_KERNEL);
+ if (!send_buff)
+ return -ENOMEM;
+
+ ixd_ctlq_init_sparams(adapter, &send_params, send_buff,
+ req->send_size);
+
+ send_params.chnl_opcode = req->opcode;
+
+ if (req->send_buff_init)
+ req->send_buff_init(adapter, send_buff, req->ctx);
+
+ err = libie_ctlq_xn_send(&send_params);
+ if (err)
+ return err;
+
+ recv_mem = &send_params.recv_mem;
+ if (req->recv_process)
+ err = req->recv_process(adapter, recv_mem->iov_base,
+ recv_mem->iov_len, req->ctx);
+
+ ixd_ctlq_clean_sq(adapter, 1);
+ libie_ctlq_release_rx_buf(recv_mem);
+
+ return err;
+}
+
+/**
+ * ixd_ctlq_handle_msg - Default control queue message handler
+ * @ctx: Control plane communication context
+ * @msg: Message received
+ */
+static void ixd_ctlq_handle_msg(struct libie_ctlq_ctx *ctx,
+ struct libie_ctlq_msg *msg)
+{
+ struct ixd_adapter *adapter = pci_get_drvdata(ctx->mmio_info.pdev);
+
+ if (ixd_vc_can_handle_msg(msg))
+ ixd_vc_recv_event_msg(adapter, msg);
+ else
+ dev_dbg_ratelimited(ixd_to_dev(adapter),
+ "Received an unsupported opcode 0x%x from the CP\n",
+ msg->chnl_opcode);
+
+ libie_ctlq_release_rx_buf(&msg->recv_mem);
+}
+
+/**
+ * ixd_ctlq_recv_mb_msg - Poll the default receive queue for mailbox messages
+ * @adapter: The adapter with initialized mailbox
+ */
+static void ixd_ctlq_recv_mb_msg(struct ixd_adapter *adapter)
+{
+ struct libie_ctlq_xn_recv_params xn_params = {
+ .xnm = adapter->xnm,
+ .ctlq = adapter->arq,
+ .ctlq_msg_handler = ixd_ctlq_handle_msg,
+ .budget = LIBIE_CTLQ_MAX_XN_ENTRIES,
+ };
+
+ libie_ctlq_xn_recv(&xn_params);
+}
+
+/**
+ * ixd_ctlq_rx_task - Periodically check for mailbox responses and events
+ * @work: work handle
+ */
+void ixd_ctlq_rx_task(struct work_struct *work)
+{
+ struct ixd_adapter *adapter;
+
+ adapter = container_of(work, struct ixd_adapter, mbx_task.work);
+
+ queue_delayed_work(system_unbound_wq, &adapter->mbx_task,
+ msecs_to_jiffies(300));
+
+ ixd_ctlq_recv_mb_msg(adapter);
+}
diff --git a/drivers/net/ethernet/intel/ixd/ixd_ctlq.h b/drivers/net/ethernet/intel/ixd/ixd_ctlq.h
new file mode 100644
index 000000000000..f450a3a0828f
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixd/ixd_ctlq.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef _IXD_CTLQ_H_
+#define _IXD_CTLQ_H_
+
+#include "linux/intel/virtchnl2.h"
+
+#define IXD_CTLQ_TIMEOUT 2000
+
+/**
+ * struct ixd_ctlq_req - Standard virtchnl request description
+ * @opcode: protocol opcode, only virtchnl2 is needed for now
+ * @send_size: required length of the send buffer, in bytes
+ * @send_buff_init: optional function to initialize the zeroed send buffer
+ * @recv_process: optional function to handle the CP response
+ * @ctx: additional context passed to both callbacks
+ */
+struct ixd_ctlq_req {
+ enum virtchnl2_op opcode;
+ size_t send_size;
+ void (*send_buff_init)(struct ixd_adapter *adapter, void *send_buff,
+ void *ctx);
+ int (*recv_process)(struct ixd_adapter *adapter, void *recv_buff,
+ size_t recv_size, void *ctx);
+ void *ctx;
+};
+
+int ixd_ctlq_do_req(struct ixd_adapter *adapter,
+ const struct ixd_ctlq_req *req);
+void ixd_ctlq_rx_task(struct work_struct *work);
+
+#endif /* _IXD_CTLQ_H_ */
diff --git a/drivers/net/ethernet/intel/ixd/ixd_lib.c b/drivers/net/ethernet/intel/ixd/ixd_lib.c
index afc413d3650f..24080cb30c43 100644
--- a/drivers/net/ethernet/intel/ixd/ixd_lib.c
+++ b/drivers/net/ethernet/intel/ixd/ixd_lib.c
@@ -2,6 +2,7 @@
/* Copyright (C) 2025 Intel Corporation */
#include "ixd.h"
+#include "ixd_virtchnl.h"
#define IXD_DFLT_MBX_Q_LEN 64
@@ -67,6 +68,8 @@ static void ixd_adapter_fill_dflt_ctlqs(struct ixd_adapter *adapter)
*/
void ixd_deinit_dflt_mbx(struct ixd_adapter *adapter)
{
+ cancel_delayed_work_sync(&adapter->mbx_task);
+
if (adapter->xnm)
libie_ctlq_xn_deinit(adapter->xnm, &adapter->cp_ctx);
@@ -108,6 +111,8 @@ int ixd_init_dflt_mbx(struct ixd_adapter *adapter)
return -ENOENT;
}
+ queue_delayed_work(system_unbound_wq, &adapter->mbx_task, 0);
+
return 0;
}
@@ -136,8 +141,26 @@ void ixd_init_task(struct work_struct *work)
adapter->init_task.reset_retries = 0;
err = ixd_init_dflt_mbx(adapter);
- if (err)
+ if (err) {
dev_err(ixd_to_dev(adapter),
"Failed to initialize the default mailbox: %pe\n",
ERR_PTR(err));
+ return;
+ }
+
+ if (!ixd_vc_dev_init(adapter)) {
+ adapter->init_task.vc_retries = 0;
+ return;
+ }
+
+ ixd_deinit_dflt_mbx(adapter);
+ if (++adapter->init_task.vc_retries > 5) {
+ dev_err(ixd_to_dev(adapter),
+ "Failed to establish mailbox communications with the hardware\n");
+ return;
+ }
+
+ ixd_trigger_reset(adapter);
+ queue_delayed_work(system_unbound_wq, &adapter->init_task.init_work,
+ msecs_to_jiffies(500));
}
diff --git a/drivers/net/ethernet/intel/ixd/ixd_main.c b/drivers/net/ethernet/intel/ixd/ixd_main.c
index b4d4000b63ed..6d5e6aca77df 100644
--- a/drivers/net/ethernet/intel/ixd/ixd_main.c
+++ b/drivers/net/ethernet/intel/ixd/ixd_main.c
@@ -2,6 +2,7 @@
/* Copyright (C) 2025 Intel Corporation */
#include "ixd.h"
+#include "ixd_ctlq.h"
#include "ixd_lan_regs.h"
MODULE_DESCRIPTION("Intel(R) Control Plane Function Device Driver");
@@ -19,6 +20,7 @@ static void ixd_remove(struct pci_dev *pdev)
/* Do not mix removal with (re)initialization */
cancel_delayed_work_sync(&adapter->init_task.init_work);
+
/* Leave the device clean on exit */
ixd_trigger_reset(adapter);
ixd_deinit_dflt_mbx(adapter);
@@ -110,6 +112,7 @@ static int ixd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
INIT_DELAYED_WORK(&adapter->init_task.init_work,
ixd_init_task);
+ INIT_DELAYED_WORK(&adapter->mbx_task, ixd_ctlq_rx_task);
ixd_trigger_reset(adapter);
queue_delayed_work(system_unbound_wq, &adapter->init_task.init_work,
diff --git a/drivers/net/ethernet/intel/ixd/ixd_virtchnl.c b/drivers/net/ethernet/intel/ixd/ixd_virtchnl.c
new file mode 100644
index 000000000000..66049d1b1d15
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixd/ixd_virtchnl.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#include "ixd.h"
+#include "ixd_ctlq.h"
+#include "ixd_virtchnl.h"
+
+/**
+ * ixd_vc_recv_event_msg - Handle virtchnl event message
+ * @adapter: The adapter handling the message
+ * @ctlq_msg: Message received; its receive buffer is not released here
+ */
+void ixd_vc_recv_event_msg(struct ixd_adapter *adapter,
+ struct libie_ctlq_msg *ctlq_msg)
+{
+ size_t payload_size = ctlq_msg->data_len;
+ struct virtchnl2_event *v2e;
+
+ if (payload_size < sizeof(*v2e)) {
+ dev_warn_ratelimited(ixd_to_dev(adapter),
+ "Failed to receive valid payload for event msg (op 0x%X len %zu)\n",
+ ctlq_msg->chnl_opcode,
+ payload_size);
+ return;
+ }
+
+ v2e = (struct virtchnl2_event *)ctlq_msg->recv_mem.iov_base;
+
+ dev_dbg(ixd_to_dev(adapter), "Got event 0x%X from the CP\n",
+ le32_to_cpu(v2e->event));
+}
+
+/**
+ * ixd_vc_can_handle_msg - Decide if an event has to be handled by virtchnl code
+ * @ctlq_msg: Message received
+ *
+ * Return: %true if virtchnl code can handle the event, %false otherwise
+ */
+bool ixd_vc_can_handle_msg(struct libie_ctlq_msg *ctlq_msg)
+{
+ return ctlq_msg->chnl_opcode == VIRTCHNL2_OP_EVENT;
+}
+
+/**
+ * ixd_handle_caps - Handle VIRTCHNL2_OP_GET_CAPS response
+ * @adapter: The adapter for which the capabilities are being updated
+ * @recv_buff: Buffer containing the response
+ * @recv_size: Response buffer size
+ * @ctx: unused
+ *
+ * Return: %0 if the response was stored in the adapter, -EBADMSG if it is
+ * shorter than the capabilities structure.
+ */
+static int ixd_handle_caps(struct ixd_adapter *adapter, void *recv_buff,
+ size_t recv_size, void *ctx)
+{
+ if (recv_size < sizeof(adapter->caps))
+ return -EBADMSG;
+
+ adapter->caps = *(typeof(adapter->caps) *)recv_buff;
+
+ return 0;
+}
+
+/**
+ * ixd_req_vc_caps - Request and save device capabilities
+ * @adapter: The adapter to get the capabilities for
+ *
+ * Return: %0 on success, negative error if sending or response handling fails
+ */
+static int ixd_req_vc_caps(struct ixd_adapter *adapter)
+{
+ const struct ixd_ctlq_req req = {
+ .opcode = VIRTCHNL2_OP_GET_CAPS,
+ .send_size = sizeof(struct virtchnl2_get_capabilities),
+ .ctx = NULL,
+ .send_buff_init = NULL,
+ .recv_process = ixd_handle_caps,
+ };
+
+ return ixd_ctlq_do_req(adapter, &req);
+}
+
+/**
+ * ixd_get_vc_ver - Get the virtchnl2 version the driver asks for
+ *
+ * Return: little-endian virtchnl2 version info, ready for sending
+ */
+static struct virtchnl2_version_info ixd_get_vc_ver(void)
+{
+ return (struct virtchnl2_version_info) {
+ .major = cpu_to_le32(VIRTCHNL2_VERSION_MAJOR_2),
+ .minor = cpu_to_le32(VIRTCHNL2_VERSION_MINOR_0),
+ };
+}
+
+static void ixd_fill_vc_ver(struct ixd_adapter *adapter, void *send_buff,
+ void *ctx)
+{
+ *(struct virtchnl2_version_info *)send_buff = ixd_get_vc_ver();
+}
+
+/**
+ * ixd_handle_vc_ver - Handle VIRTCHNL2_OP_VERSION response
+ * @adapter: The adapter for which the version is being updated
+ * @recv_buff: Buffer containing the response
+ * @recv_size: Response buffer size
+ * @ctx: Unused
+ *
+ * Return: %0 on success, -EBADMSG if the response is too short, -EOPNOTSUPP
+ * if the CP major version is lower than the one the driver needs.
+ */
+static int ixd_handle_vc_ver(struct ixd_adapter *adapter, void *recv_buff,
+ size_t recv_size, void *ctx)
+{
+ struct virtchnl2_version_info need_ver = ixd_get_vc_ver();
+ struct virtchnl2_version_info *recv_ver;
+
+ if (recv_size < sizeof(need_ver))
+ return -EBADMSG;
+
+ recv_ver = recv_buff;
+ if (le32_to_cpu(need_ver.major) > le32_to_cpu(recv_ver->major))
+ return -EOPNOTSUPP;
+
+ adapter->vc_ver.major = le32_to_cpu(recv_ver->major);
+ adapter->vc_ver.minor = le32_to_cpu(recv_ver->minor);
+
+ return 0;
+}
+
+/**
+ * ixd_req_vc_version - Request and save Virtchannel2 version
+ * @adapter: The adapter to get the version for
+ *
+ * Return: success or error if sending fails or the response was not as expected
+ */
+static int ixd_req_vc_version(struct ixd_adapter *adapter)
+{
+ const struct ixd_ctlq_req req = {
+ .opcode = VIRTCHNL2_OP_VERSION,
+ .send_size = sizeof(struct virtchnl2_version_info),
+ .ctx = NULL,
+ .send_buff_init = ixd_fill_vc_ver,
+ .recv_process = ixd_handle_vc_ver,
+ };
+
+ return ixd_ctlq_do_req(adapter, &req);
+}
+
+/**
+ * ixd_vc_dev_init - virtchnl device core initialization
+ * @adapter: device information
+ *
+ * Return: %0 on success or error if any step of the initialization fails
+ */
+int ixd_vc_dev_init(struct ixd_adapter *adapter)
+{
+ int err;
+
+ err = ixd_req_vc_version(adapter);
+ if (err) {
+ dev_warn(ixd_to_dev(adapter),
+ "Getting virtchnl version failed, error=%pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ err = ixd_req_vc_caps(adapter);
+ if (err) {
+ dev_warn(ixd_to_dev(adapter),
+ "Getting virtchnl capabilities failed, error=%pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ return err;
+}
diff --git a/drivers/net/ethernet/intel/ixd/ixd_virtchnl.h b/drivers/net/ethernet/intel/ixd/ixd_virtchnl.h
new file mode 100644
index 000000000000..1a53da8b545c
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixd/ixd_virtchnl.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef _IXD_VIRTCHNL_H_
+#define _IXD_VIRTCHNL_H_
+
+int ixd_vc_dev_init(struct ixd_adapter *adapter);
+bool ixd_vc_can_handle_msg(struct libie_ctlq_msg *ctlq_msg);
+void ixd_vc_recv_event_msg(struct ixd_adapter *adapter,
+ struct libie_ctlq_msg *ctlq_msg);
+
+#endif /* _IXD_VIRTCHNL_H_ */
--
2.47.1
^ permalink raw reply related
* [PATCH net-next v2 14/14] ixd: add devlink support
From: Tony Nguyen @ 2026-04-03 19:49 UTC (permalink / raw)
To: davem, kuba, pabeni, edumazet, andrew+netdev, netdev
Cc: Amritha Nambiar, larysa.zaremba, przemyslaw.kitszel,
aleksander.lobakin, sridhar.samudrala, anjali.singhai,
michal.swiatkowski, maciej.fijalkowski, emil.s.tantilov,
madhu.chittim, joshua.a.hay, jacob.e.keller,
jayaprakash.shanmugam, jiri, horms, corbet, richardcochran,
linux-doc, Bharath R, Aleksandr Loktionov
In-Reply-To: <20260403194938.3577011-1-anthony.l.nguyen@intel.com>
From: Amritha Nambiar <amritha.nambiar@intel.com>
Enable initial support for the devlink interface with the ixd driver. The
ixd hardware is a single function PCIe device. So, the PCIe adapter gets
its own devlink instance to manage device-wide resources or configuration.
$ devlink dev show
pci/0000:83:00.6
$ devlink dev info pci/0000:83:00.6
pci/0000:83:00.6:
driver ixd
serial_number 00-a0-c9-ff-ff-23-45-67
versions:
fixed:
device.type MEV
running:
virtchnl 2.0
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Tested-by: Bharath R <Bharath.r@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
Documentation/networking/devlink/index.rst | 1 +
Documentation/networking/devlink/ixd.rst | 30 ++++++
drivers/net/ethernet/intel/ixd/Kconfig | 1 +
drivers/net/ethernet/intel/ixd/Makefile | 1 +
drivers/net/ethernet/intel/ixd/ixd_devlink.c | 97 ++++++++++++++++++++
drivers/net/ethernet/intel/ixd/ixd_devlink.h | 44 +++++++++
drivers/net/ethernet/intel/ixd/ixd_main.c | 16 +++-
7 files changed, 187 insertions(+), 3 deletions(-)
create mode 100644 Documentation/networking/devlink/ixd.rst
create mode 100644 drivers/net/ethernet/intel/ixd/ixd_devlink.c
create mode 100644 drivers/net/ethernet/intel/ixd/ixd_devlink.h
diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst
index f7ba7dcf477d..f0c077843fa7 100644
--- a/Documentation/networking/devlink/index.rst
+++ b/Documentation/networking/devlink/index.rst
@@ -88,6 +88,7 @@ parameters, info versions, and other features it supports.
ionic
iosm
ixgbe
+ ixd
kvaser_pciefd
kvaser_usb
mlx4
diff --git a/Documentation/networking/devlink/ixd.rst b/Documentation/networking/devlink/ixd.rst
new file mode 100644
index 000000000000..17b63c8425aa
--- /dev/null
+++ b/Documentation/networking/devlink/ixd.rst
@@ -0,0 +1,30 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================
+ixd devlink support
+===================
+
+This document describes the devlink features implemented by the ``ixd``
+device driver.
+
+Info versions
+=============
+
+The ``ixd`` driver reports the following versions
+
+.. list-table:: devlink info versions implemented
+ :widths: 5 5 5 90
+
+ * - Name
+ - Type
+ - Example
+ - Description
+ * - ``device.type``
+ - fixed
+ - MEV
+ - The hardware type for this device
+ * - ``virtchnl``
+ - running
+ - 2.0
+ - 2-digit version number (major.minor) of the communication channel
+ (virtchnl) used by the device.
diff --git a/drivers/net/ethernet/intel/ixd/Kconfig b/drivers/net/ethernet/intel/ixd/Kconfig
index 24510c50070e..34181c59dcdc 100644
--- a/drivers/net/ethernet/intel/ixd/Kconfig
+++ b/drivers/net/ethernet/intel/ixd/Kconfig
@@ -7,6 +7,7 @@ config IXD
select LIBETH
select LIBIE_CP
select LIBIE_PCI
+ select NET_DEVLINK
help
This driver supports Intel(R) Control Plane PCI Function
of Intel E2100 and later IPUs and FNICs.
diff --git a/drivers/net/ethernet/intel/ixd/Makefile b/drivers/net/ethernet/intel/ixd/Makefile
index 90abf231fb16..03760a2580b9 100644
--- a/drivers/net/ethernet/intel/ixd/Makefile
+++ b/drivers/net/ethernet/intel/ixd/Makefile
@@ -8,5 +8,6 @@ obj-$(CONFIG_IXD) += ixd.o
ixd-y := ixd_main.o
ixd-y += ixd_ctlq.o
ixd-y += ixd_dev.o
+ixd-y += ixd_devlink.o
ixd-y += ixd_lib.o
ixd-y += ixd_virtchnl.o
diff --git a/drivers/net/ethernet/intel/ixd/ixd_devlink.c b/drivers/net/ethernet/intel/ixd/ixd_devlink.c
new file mode 100644
index 000000000000..23ab11226978
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixd/ixd_devlink.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025, Intel Corporation. */
+
+#include "ixd.h"
+#include "ixd_devlink.h"
+
+#define IXD_DEVLINK_INFO_LEN 128
+
+/**
+ * ixd_fill_dsn - Get the serial number for the ixd device
+ * @adapter: adapter to query
+ * @buf: storage buffer for the info request
+ */
+static void ixd_fill_dsn(struct ixd_adapter *adapter, char *buf)
+{
+ u8 dsn[8];
+
+ /* Copy the DSN into an array in Big Endian format */
+ put_unaligned_be64(pci_get_dsn(adapter->cp_ctx.mmio_info.pdev), dsn);
+
+ snprintf(buf, IXD_DEVLINK_INFO_LEN, "%8phD", dsn);
+}
+
+/**
+ * ixd_fill_device_name - Get the name of the underlying hardware
+ * @adapter: adapter to query
+ * @buf: storage buffer for the info request
+ * @buf_size: size of the storage buffer
+ */
+static void ixd_fill_device_name(struct ixd_adapter *adapter, char *buf,
+ size_t buf_size)
+{
+ if (adapter->caps.device_type == cpu_to_le32(VIRTCHNL2_MEV_DEVICE))
+ snprintf(buf, buf_size, "%s", "MEV");
+ else
+ snprintf(buf, buf_size, "%s", "UNKNOWN");
+}
+
+/**
+ * ixd_devlink_info_get - .info_get devlink handler
+ * @devlink: devlink instance structure
+ * @req: the devlink info request
+ * @extack: extended netlink ack structure
+ *
+ * Callback for the devlink .info_get operation. Reports information about the
+ * device.
+ *
+ * Return: zero on success or an error code on failure.
+ */
+static int ixd_devlink_info_get(struct devlink *devlink,
+ struct devlink_info_req *req,
+ struct netlink_ext_ack *extack)
+{
+ struct ixd_adapter *adapter = devlink_priv(devlink);
+ char buf[IXD_DEVLINK_INFO_LEN];
+ int err;
+
+ ixd_fill_dsn(adapter, buf);
+ err = devlink_info_serial_number_put(req, buf);
+ if (err)
+ return err;
+
+ ixd_fill_device_name(adapter, buf, IXD_DEVLINK_INFO_LEN);
+ err = devlink_info_version_fixed_put(req, "device.type", buf);
+ if (err)
+ return err;
+
+ snprintf(buf, sizeof(buf), "%u.%u",
+ adapter->vc_ver.major, adapter->vc_ver.minor);
+
+ return devlink_info_version_running_put(req, "virtchnl", buf);
+}
+
+static const struct devlink_ops ixd_devlink_ops = {
+ .info_get = ixd_devlink_info_get,
+};
+
+/**
+ * ixd_adapter_alloc - Allocate devlink and return adapter pointer
+ * @dev: the device to allocate for
+ *
+ * Allocate a devlink instance for this device and return the private area as
+ * the adapter structure.
+ *
+ * Return: adapter structure on success, NULL on failure
+ */
+struct ixd_adapter *ixd_adapter_alloc(struct device *dev)
+{
+ struct devlink *devlink;
+
+ devlink = devlink_alloc(&ixd_devlink_ops, sizeof(struct ixd_adapter),
+ dev);
+ if (!devlink)
+ return NULL;
+
+ return devlink_priv(devlink);
+}
diff --git a/drivers/net/ethernet/intel/ixd/ixd_devlink.h b/drivers/net/ethernet/intel/ixd/ixd_devlink.h
new file mode 100644
index 000000000000..c43ce0655de2
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixd/ixd_devlink.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025, Intel Corporation. */
+
+#ifndef _IXD_DEVLINK_H_
+#define _IXD_DEVLINK_H_
+#include <net/devlink.h>
+
+struct ixd_adapter *ixd_adapter_alloc(struct device *dev);
+
+/**
+ * ixd_devlink_free - teardown the devlink
+ * @adapter: the adapter structure to free
+ *
+ */
+static inline void ixd_devlink_free(struct ixd_adapter *adapter)
+{
+ struct devlink *devlink = priv_to_devlink(adapter);
+
+ devlink_free(devlink);
+}
+
+/**
+ * ixd_devlink_unregister - Unregister devlink resources for this adapter.
+ * @adapter: the adapter structure to cleanup
+ *
+ * Unregister the devlink instance. Memory is freed by ixd_devlink_free().
+ */
+static inline void ixd_devlink_unregister(struct ixd_adapter *adapter)
+{
+ devlink_unregister(priv_to_devlink(adapter));
+}
+
+/**
+ * ixd_devlink_register - Register devlink interface for this adapter
+ * @adapter: pointer to ixd adapter structure to be associated with devlink
+ *
+ * Register the devlink instance associated with this adapter
+ */
+static inline void ixd_devlink_register(struct ixd_adapter *adapter)
+{
+ devlink_register(priv_to_devlink(adapter));
+}
+
+#endif /* _IXD_DEVLINK_H_ */
diff --git a/drivers/net/ethernet/intel/ixd/ixd_main.c b/drivers/net/ethernet/intel/ixd/ixd_main.c
index 6d5e6aca77df..ea6aa793a6a7 100644
--- a/drivers/net/ethernet/intel/ixd/ixd_main.c
+++ b/drivers/net/ethernet/intel/ixd/ixd_main.c
@@ -4,6 +4,7 @@
#include "ixd.h"
#include "ixd_ctlq.h"
#include "ixd_lan_regs.h"
+#include "ixd_devlink.h"
MODULE_DESCRIPTION("Intel(R) Control Plane Function Device Driver");
MODULE_IMPORT_NS("LIBIE_CP");
@@ -21,11 +22,14 @@ static void ixd_remove(struct pci_dev *pdev)
/* Do not mix removal with (re)initialization */
cancel_delayed_work_sync(&adapter->init_task.init_work);
+ ixd_devlink_unregister(adapter);
+
/* Leave the device clean on exit */
ixd_trigger_reset(adapter);
ixd_deinit_dflt_mbx(adapter);
libie_pci_unmap_all_mmio_regions(&adapter->cp_ctx.mmio_info);
+ ixd_devlink_free(adapter);
}
/**
@@ -93,7 +97,7 @@ static int ixd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (WARN_ON(ent->device != IXD_DEV_ID_CPF))
return -EINVAL;
- adapter = devm_kzalloc(&pdev->dev, sizeof(*adapter), GFP_KERNEL);
+ adapter = ixd_adapter_alloc(&pdev->dev);
if (!adapter)
return -ENOMEM;
@@ -102,13 +106,13 @@ static int ixd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
err = libie_pci_init_dev(pdev);
if (err)
- return err;
+ goto free_adapter;
pci_set_drvdata(pdev, adapter);
err = ixd_iomap_regions(adapter);
if (err)
- return err;
+ goto free_adapter;
INIT_DELAYED_WORK(&adapter->init_task.init_work,
ixd_init_task);
@@ -118,7 +122,13 @@ static int ixd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
queue_delayed_work(system_unbound_wq, &adapter->init_task.init_work,
msecs_to_jiffies(500));
+ ixd_devlink_register(adapter);
+
return 0;
+
+free_adapter:
+ ixd_devlink_free(adapter);
+ return err;
}
static const struct pci_device_id ixd_pci_tbl[] = {
--
2.47.1
^ permalink raw reply related
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Jim Mattson @ 2026-04-03 20:19 UTC (permalink / raw)
To: Pawan Gupta
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc
In-Reply-To: <20260403185236.sjgetnkha3o3a4d3@desk>
On Fri, Apr 3, 2026 at 11:52 AM Pawan Gupta
<pawan.kumar.gupta@linux.intel.com> wrote:
>
> On Fri, Apr 03, 2026 at 11:10:08AM -0700, Jim Mattson wrote:
> > On Thu, Apr 2, 2026 at 5:32 PM Pawan Gupta
> > <pawan.kumar.gupta@linux.intel.com> wrote:
> > >
> > > As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
> > > the Branch History Buffer (BHB). On Alder Lake and newer parts this
> > > sequence is not sufficient because it doesn't clear enough entries. This
> > > was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
> > > in the kernel.
> > >
> > > Now with VMSCAPE (BHI variant) it is also required to isolate branch
> > > history between guests and userspace. Since BHI_DIS_S only protects the
> > > kernel, the newer CPUs also use IBPB.
> > >
> > > A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
> > > But it currently does not clear enough BHB entries to be effective on newer
> > > CPUs with larger BHB. At boot, dynamically set the loop count of
> > > clear_bhb_loop() such that it is effective on newer CPUs too. Use the
> > > X86_FEATURE_BHI_CTRL feature flag to select the appropriate loop count.
> > >
> > > Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > > ---
> > > arch/x86/entry/entry_64.S | 8 +++++---
> > > arch/x86/include/asm/nospec-branch.h | 2 ++
> > > arch/x86/kernel/cpu/bugs.c | 13 +++++++++++++
> > > 3 files changed, 20 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > index 3a180a36ca0e..bbd4b1c7ec04 100644
> > > --- a/arch/x86/entry/entry_64.S
> > > +++ b/arch/x86/entry/entry_64.S
> > > @@ -1536,7 +1536,9 @@ SYM_FUNC_START(clear_bhb_loop)
> > > ANNOTATE_NOENDBR
> > > push %rbp
> > > mov %rsp, %rbp
> > > - movl $5, %ecx
> > > +
> > > + movzbl bhb_seq_outer_loop(%rip), %ecx
> > > +
> > > ANNOTATE_INTRA_FUNCTION_CALL
> > > call 1f
> > > jmp 5f
> > > @@ -1556,8 +1558,8 @@ SYM_FUNC_START(clear_bhb_loop)
> > > * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
> > > * but some Clang versions (e.g. 18) don't like this.
> > > */
> > > - .skip 32 - 18, 0xcc
> > > -2: movl $5, %eax
> > > + .skip 32 - 20, 0xcc
> > > +2: movzbl bhb_seq_inner_loop(%rip), %eax
> > > 3: jmp 4f
> > > nop
> > > 4: sub $1, %eax
> > > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > > index 70b377fcbc1c..87b83ae7c97f 100644
> > > --- a/arch/x86/include/asm/nospec-branch.h
> > > +++ b/arch/x86/include/asm/nospec-branch.h
> > > @@ -548,6 +548,8 @@ DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
> > > extern void update_spec_ctrl_cond(u64 val);
> > > extern u64 spec_ctrl_current(void);
> > >
> > > +extern u8 bhb_seq_inner_loop, bhb_seq_outer_loop;
> > > +
> > > /*
> > > * With retpoline, we must use IBRS to restrict branch prediction
> > > * before calling into firmware.
> > > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > > index 83f51cab0b1e..2cb4a96247d8 100644
> > > --- a/arch/x86/kernel/cpu/bugs.c
> > > +++ b/arch/x86/kernel/cpu/bugs.c
> > > @@ -2047,6 +2047,10 @@ enum bhi_mitigations {
> > > static enum bhi_mitigations bhi_mitigation __ro_after_init =
> > > IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
> > >
> > > +/* Default to short BHB sequence values */
> > > +u8 bhb_seq_outer_loop __ro_after_init = 5;
> > > +u8 bhb_seq_inner_loop __ro_after_init = 5;
> > > +
> > > static int __init spectre_bhi_parse_cmdline(char *str)
> > > {
> > > if (!str)
> > > @@ -3242,6 +3246,15 @@ void __init cpu_select_mitigations(void)
> > > x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
> > > }
> > >
> > > + /*
> > > + * Switch to long BHB clear sequence on newer CPUs (with BHI_CTRL
> > > + * support), see Intel's BHI guidance.
> > > + */
> > > + if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL)) {
> > > + bhb_seq_outer_loop = 12;
> > > + bhb_seq_inner_loop = 7;
> > > + }
> > > +
> >
> > How does this work for VMs in a heterogeneous migration pool that
> > spans the Alder Lake boundary? They can't advertise BHI_CTRL, because
> > it isn't available on all hosts in the migration pool, but they need
> > the long sequence when running on Alder Lake or newer.
>
> As we discussed elsewhere, support for migration pool is much more
> involved. It should be dealt with in a separate QEMU/KVM focused series.
>
> A quickfix could be adding support for spectre_bhi=long that guests in a
> migration pool can use?
The simplest solution is to add "|
cpu_feature_enabled(X86_FEATURE_HYPERVISOR)" to the condition above.
If that is unacceptable for the performance of pre-Alder Lake
migration pools, you could define a CPUID or MSR bit that says
explicitly, "long BHB flush sequence needed," rather than trying to
intuit that property from the presence of BHI_CTRL. Like
IA32_ARCH_CAPABILITIES.SKIP_L1DFL_VMENTRY, the bit would only be set
by a hypervisor.
I am still skeptical of the need for MSR_VIRTUAL_ENUMERATION and
friends, unless there is a major guest OS out there that relies on
them.
^ permalink raw reply
* [PATCH] hwmon: (asus-ec-sensors) add ROG STRIX B650E-E GAMING WIFI
From: Eugene Shalygin @ 2026-04-03 21:03 UTC (permalink / raw)
To: eugene.shalygin
Cc: Veronika Kossmann, Veronika Kossmann, Guenter Roeck,
Jonathan Corbet, Shuah Khan, linux-hwmon, linux-doc, linux-kernel
From: Veronika Kossmann <nanodesuu@gmail.com>
Add support for ROG STRIX B650E-E GAMING WIFI
Signed-off-by: Veronika Kossmann <desu.git@rxtx.cx>
Signed-off-by: Eugene Shalygin <eugene.shalygin@gmail.com>
---
Documentation/hwmon/asus_ec_sensors.rst | 1 +
drivers/hwmon/asus-ec-sensors.c | 11 ++++++++++-
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst
index 9ad3f0a57f55..e14419811aac 100644
--- a/Documentation/hwmon/asus_ec_sensors.rst
+++ b/Documentation/hwmon/asus_ec_sensors.rst
@@ -31,6 +31,7 @@ Supported boards:
* ROG MAXIMUS Z690 FORMULA
* ROG STRIX B550-E GAMING
* ROG STRIX B550-I GAMING
+ * ROG STRIX B650E-E GAMING WIFI
* ROG STRIX B650E-I GAMING WIFI
* ROG STRIX B850-I GAMING WIFI
* ROG STRIX X470-F GAMING
diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c
index 070bb368f2b7..8c53cd9ed8f3 100644
--- a/drivers/hwmon/asus-ec-sensors.c
+++ b/drivers/hwmon/asus-ec-sensors.c
@@ -274,7 +274,7 @@ static const struct ec_sensor_info sensors_family_amd_600[] = {
[ec_sensor_temp_cpu_package] =
EC_SENSOR("CPU Package", hwmon_temp, 1, 0x00, 0x31),
[ec_sensor_temp_mb] =
- EC_SENSOR("Motherboard", hwmon_temp, 1, 0x00, 0x32),
+ EC_SENSOR("Motherboard", hwmon_temp, 1, 0x00, 0x32),
[ec_sensor_temp_vrm] =
EC_SENSOR("VRM", hwmon_temp, 1, 0x00, 0x33),
[ec_sensor_temp_t_sensor] =
@@ -616,6 +616,13 @@ static const struct ec_board_info board_info_strix_b550_i_gaming = {
.family = family_amd_500_series,
};
+static const struct ec_board_info board_info_strix_b650e_e_gaming = {
+ .sensors = SENSOR_TEMP_VRM | SENSOR_SET_TEMP_CHIPSET_CPU_MB |
+ SENSOR_IN_CPU_CORE,
+ .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0,
+ .family = family_amd_600_series,
+};
+
static const struct ec_board_info board_info_strix_b650e_i_gaming = {
.sensors = SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR |
SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_IN_CPU_CORE,
@@ -861,6 +868,8 @@ static const struct dmi_system_id dmi_table[] = {
&board_info_strix_b550_e_gaming),
DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-I GAMING",
&board_info_strix_b550_i_gaming),
+ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B650E-E GAMING WIFI",
+ &board_info_strix_b650e_e_gaming),
DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B650E-I GAMING WIFI",
&board_info_strix_b650e_i_gaming),
DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B850-I GAMING WIFI",
--
2.53.0
^ permalink raw reply related
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Pawan Gupta @ 2026-04-03 21:34 UTC (permalink / raw)
To: Jim Mattson
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc
In-Reply-To: <CALMp9eSPkMpKQELTnsaj6=gXD+EyE0n2_p93n4maDc93bPFe+w@mail.gmail.com>
On Fri, Apr 03, 2026 at 01:19:17PM -0700, Jim Mattson wrote:
> On Fri, Apr 3, 2026 at 11:52 AM Pawan Gupta
> <pawan.kumar.gupta@linux.intel.com> wrote:
> >
> > On Fri, Apr 03, 2026 at 11:10:08AM -0700, Jim Mattson wrote:
> > > On Thu, Apr 2, 2026 at 5:32 PM Pawan Gupta
> > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > >
> > > > As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
> > > > the Branch History Buffer (BHB). On Alder Lake and newer parts this
> > > > sequence is not sufficient because it doesn't clear enough entries. This
> > > > was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
> > > > in the kernel.
> > > >
> > > > Now with VMSCAPE (BHI variant) it is also required to isolate branch
> > > > history between guests and userspace. Since BHI_DIS_S only protects the
> > > > kernel, the newer CPUs also use IBPB.
> > > >
> > > > A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
> > > > But it currently does not clear enough BHB entries to be effective on newer
> > > > CPUs with larger BHB. At boot, dynamically set the loop count of
> > > > clear_bhb_loop() such that it is effective on newer CPUs too. Use the
> > > > X86_FEATURE_BHI_CTRL feature flag to select the appropriate loop count.
> > > >
> > > > Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > > > ---
> > > > arch/x86/entry/entry_64.S | 8 +++++---
> > > > arch/x86/include/asm/nospec-branch.h | 2 ++
> > > > arch/x86/kernel/cpu/bugs.c | 13 +++++++++++++
> > > > 3 files changed, 20 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > > index 3a180a36ca0e..bbd4b1c7ec04 100644
> > > > --- a/arch/x86/entry/entry_64.S
> > > > +++ b/arch/x86/entry/entry_64.S
> > > > @@ -1536,7 +1536,9 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > ANNOTATE_NOENDBR
> > > > push %rbp
> > > > mov %rsp, %rbp
> > > > - movl $5, %ecx
> > > > +
> > > > + movzbl bhb_seq_outer_loop(%rip), %ecx
> > > > +
> > > > ANNOTATE_INTRA_FUNCTION_CALL
> > > > call 1f
> > > > jmp 5f
> > > > @@ -1556,8 +1558,8 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
> > > > * but some Clang versions (e.g. 18) don't like this.
> > > > */
> > > > - .skip 32 - 18, 0xcc
> > > > -2: movl $5, %eax
> > > > + .skip 32 - 20, 0xcc
> > > > +2: movzbl bhb_seq_inner_loop(%rip), %eax
> > > > 3: jmp 4f
> > > > nop
> > > > 4: sub $1, %eax
> > > > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > > > index 70b377fcbc1c..87b83ae7c97f 100644
> > > > --- a/arch/x86/include/asm/nospec-branch.h
> > > > +++ b/arch/x86/include/asm/nospec-branch.h
> > > > @@ -548,6 +548,8 @@ DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
> > > > extern void update_spec_ctrl_cond(u64 val);
> > > > extern u64 spec_ctrl_current(void);
> > > >
> > > > +extern u8 bhb_seq_inner_loop, bhb_seq_outer_loop;
> > > > +
> > > > /*
> > > > * With retpoline, we must use IBRS to restrict branch prediction
> > > > * before calling into firmware.
> > > > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > > > index 83f51cab0b1e..2cb4a96247d8 100644
> > > > --- a/arch/x86/kernel/cpu/bugs.c
> > > > +++ b/arch/x86/kernel/cpu/bugs.c
> > > > @@ -2047,6 +2047,10 @@ enum bhi_mitigations {
> > > > static enum bhi_mitigations bhi_mitigation __ro_after_init =
> > > > IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
> > > >
> > > > +/* Default to short BHB sequence values */
> > > > +u8 bhb_seq_outer_loop __ro_after_init = 5;
> > > > +u8 bhb_seq_inner_loop __ro_after_init = 5;
> > > > +
> > > > static int __init spectre_bhi_parse_cmdline(char *str)
> > > > {
> > > > if (!str)
> > > > @@ -3242,6 +3246,15 @@ void __init cpu_select_mitigations(void)
> > > > x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
> > > > }
> > > >
> > > > + /*
> > > > + * Switch to long BHB clear sequence on newer CPUs (with BHI_CTRL
> > > > + * support), see Intel's BHI guidance.
> > > > + */
> > > > + if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL)) {
> > > > + bhb_seq_outer_loop = 12;
> > > > + bhb_seq_inner_loop = 7;
> > > > + }
> > > > +
> > >
> > > How does this work for VMs in a heterogeneous migration pool that
> > > spans the Alder Lake boundary? They can't advertise BHI_CTRL, because
> > > it isn't available on all hosts in the migration pool, but they need
> > > the long sequence when running on Alder Lake or newer.
> >
> > As we discussed elsewhere, support for migration pool is much more
> > involved. It should be dealt with in a separate QEMU/KVM focused series.
> >
> > A quickfix could be adding support for spectre_bhi=long that guests in a
> > migration pool can use?
>
> The simplest solution is to add "|
> cpu_feature_enabled(X86_FEATURE_HYPERVISOR)" to the condition above.
> If that is unacceptable for the performance of pre-Alder Lake
Yes, that would be unnecessary overhead.
> migration pools, you could define a CPUID or MSR bit that says
> explicitly, "long BHB flush sequence needed," rather than trying to
> intuit that property from the presence of BHI_CTRL. Like
> IA32_ARCH_CAPABILITIES.SKIP_L1DFL_VMENTRY, the bit would only be set
> by a hypervisor.
I will think about this more.
> I am still skeptical of the need for MSR_VIRTUAL_ENUMERATION and
> friends, unless there is a major guest OS out there that relies on
> them.
If we forget about MSR_VIRTUAL_ENUMERATION for a moment, userspace VMM is
in the best position to decide whether a guest needs
virtual.SPEC_CTRL[BHI_DIS_S]. Via a KVM interface userspace VMM can get
BHI_DIS_S for the guests that are in migration pool?
^ permalink raw reply
* Re: [PATCH v3 02/24] PCI: Add API to track PCI devices preserved across Live Update
From: David Matlack @ 2026-04-03 21:58 UTC (permalink / raw)
To: Yanjun.Zhu
Cc: Alex Williamson, Bjorn Helgaas, Adithya Jayachandran,
Alexander Graf, Alex Mastro, Andrew Morton, Ankit Agrawal,
Arnd Bergmann, Askar Safin, Borislav Petkov (AMD), Chris Li,
Dapeng Mi, David Rientjes, Feng Tang, Jacob Pan, Jason Gunthorpe,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Kees Cook,
Kevin Tian, kexec, kvm, Leon Romanovsky, Leon Romanovsky,
linux-doc, linux-kernel, linux-kselftest, linux-mm, linux-pci,
Li RongQing, Lukas Wunner, Marco Elver, Michał Winiarski,
Mike Rapoport, Parav Pandit, Pasha Tatashin, Paul E. McKenney,
Pawan Gupta, Peter Zijlstra (Intel), Pranjal Shrivastava,
Pratyush Yadav, Raghavendra Rao Ananta, Randy Dunlap,
Rodrigo Vivi, Saeed Mahameed, Samiullah Khawaja, Shuah Khan,
Vipin Sharma, Vivek Kasireddy, William Tu, Yi Liu
In-Reply-To: <e3fe7085-1297-47b8-bb17-a48196e8f37f@linux.dev>
On Thu, Apr 2, 2026 at 2:29 PM Yanjun.Zhu <yanjun.zhu@linux.dev> wrote:
> On 3/23/26 4:57 PM, David Matlack wrote:
> > +config PCI_LIVEUPDATE
> > + bool "PCI Live Update Support (EXPERIMENTAL)"
> > + depends on PCI && LIVEUPDATE
> > + help
> > + Support for preserving PCI devices across a Live Update. This option
> > + should only be enabled by developers working on implementing this
> > + support. Once enough support has landed in the kernel, this option
> > + will no longer be marked EXPERIMENTAL.
> > +
> > + If unsure, say N.
>
> Currently, it only supports 'n' or 'y'. Is it possible to add 'm'
> (modular support)?
>
> This would allow the feature to be built as a kernel module. For
> development
>
> purposes, modularization means we only need to recompile a single module
>
> for testing, rather than rebuilding the entire kernel. Compiling a
> module should
>
> be significantly faster than a full kernel build.
I don't think it is possible for CONFIG_PCI_LIVEUPDATE to support 'm'.
pci_setup_device() (which is under CONFIG_PCI) needs to call
pci_liveupdate_setup_device(), and CONFIG_PCI cannot be built as a
module. This call is necessary so the PCI core knows whether a device
being enumerated was preserved across a previous Live Update.
^ permalink raw reply
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Jim Mattson @ 2026-04-03 21:59 UTC (permalink / raw)
To: Pawan Gupta
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc
In-Reply-To: <20260403213445.xzb4rxbfbg5un7li@desk>
On Fri, Apr 3, 2026 at 2:34 PM Pawan Gupta
<pawan.kumar.gupta@linux.intel.com> wrote:
>
> On Fri, Apr 03, 2026 at 01:19:17PM -0700, Jim Mattson wrote:
> > On Fri, Apr 3, 2026 at 11:52 AM Pawan Gupta
> > <pawan.kumar.gupta@linux.intel.com> wrote:
> > >
> > > On Fri, Apr 03, 2026 at 11:10:08AM -0700, Jim Mattson wrote:
> > > > On Thu, Apr 2, 2026 at 5:32 PM Pawan Gupta
> > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > >
> > > > > As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
> > > > > the Branch History Buffer (BHB). On Alder Lake and newer parts this
> > > > > sequence is not sufficient because it doesn't clear enough entries. This
> > > > > was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
> > > > > in the kernel.
> > > > >
> > > > > Now with VMSCAPE (BHI variant) it is also required to isolate branch
> > > > > history between guests and userspace. Since BHI_DIS_S only protects the
> > > > > kernel, the newer CPUs also use IBPB.
> > > > >
> > > > > A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
> > > > > But it currently does not clear enough BHB entries to be effective on newer
> > > > > CPUs with larger BHB. At boot, dynamically set the loop count of
> > > > > clear_bhb_loop() such that it is effective on newer CPUs too. Use the
> > > > > X86_FEATURE_BHI_CTRL feature flag to select the appropriate loop count.
> > > > >
> > > > > Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > > > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > > > > ---
> > > > > arch/x86/entry/entry_64.S | 8 +++++---
> > > > > arch/x86/include/asm/nospec-branch.h | 2 ++
> > > > > arch/x86/kernel/cpu/bugs.c | 13 +++++++++++++
> > > > > 3 files changed, 20 insertions(+), 3 deletions(-)
> > > > >
> > > > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > > > index 3a180a36ca0e..bbd4b1c7ec04 100644
> > > > > --- a/arch/x86/entry/entry_64.S
> > > > > +++ b/arch/x86/entry/entry_64.S
> > > > > @@ -1536,7 +1536,9 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > ANNOTATE_NOENDBR
> > > > > push %rbp
> > > > > mov %rsp, %rbp
> > > > > - movl $5, %ecx
> > > > > +
> > > > > + movzbl bhb_seq_outer_loop(%rip), %ecx
> > > > > +
> > > > > ANNOTATE_INTRA_FUNCTION_CALL
> > > > > call 1f
> > > > > jmp 5f
> > > > > @@ -1556,8 +1558,8 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
> > > > > * but some Clang versions (e.g. 18) don't like this.
> > > > > */
> > > > > - .skip 32 - 18, 0xcc
> > > > > -2: movl $5, %eax
> > > > > + .skip 32 - 20, 0xcc
> > > > > +2: movzbl bhb_seq_inner_loop(%rip), %eax
> > > > > 3: jmp 4f
> > > > > nop
> > > > > 4: sub $1, %eax
> > > > > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > > > > index 70b377fcbc1c..87b83ae7c97f 100644
> > > > > --- a/arch/x86/include/asm/nospec-branch.h
> > > > > +++ b/arch/x86/include/asm/nospec-branch.h
> > > > > @@ -548,6 +548,8 @@ DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
> > > > > extern void update_spec_ctrl_cond(u64 val);
> > > > > extern u64 spec_ctrl_current(void);
> > > > >
> > > > > +extern u8 bhb_seq_inner_loop, bhb_seq_outer_loop;
> > > > > +
> > > > > /*
> > > > > * With retpoline, we must use IBRS to restrict branch prediction
> > > > > * before calling into firmware.
> > > > > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > > > > index 83f51cab0b1e..2cb4a96247d8 100644
> > > > > --- a/arch/x86/kernel/cpu/bugs.c
> > > > > +++ b/arch/x86/kernel/cpu/bugs.c
> > > > > @@ -2047,6 +2047,10 @@ enum bhi_mitigations {
> > > > > static enum bhi_mitigations bhi_mitigation __ro_after_init =
> > > > > IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
> > > > >
> > > > > +/* Default to short BHB sequence values */
> > > > > +u8 bhb_seq_outer_loop __ro_after_init = 5;
> > > > > +u8 bhb_seq_inner_loop __ro_after_init = 5;
> > > > > +
> > > > > static int __init spectre_bhi_parse_cmdline(char *str)
> > > > > {
> > > > > if (!str)
> > > > > @@ -3242,6 +3246,15 @@ void __init cpu_select_mitigations(void)
> > > > > x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
> > > > > }
> > > > >
> > > > > + /*
> > > > > + * Switch to long BHB clear sequence on newer CPUs (with BHI_CTRL
> > > > > + * support), see Intel's BHI guidance.
> > > > > + */
> > > > > + if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL)) {
> > > > > + bhb_seq_outer_loop = 12;
> > > > > + bhb_seq_inner_loop = 7;
> > > > > + }
> > > > > +
> > > >
> > > > How does this work for VMs in a heterogeneous migration pool that
> > > > spans the Alder Lake boundary? They can't advertise BHI_CTRL, because
> > > > it isn't available on all hosts in the migration pool, but they need
> > > > the long sequence when running on Alder Lake or newer.
> > >
> > > As we discussed elsewhere, support for migration pool is much more
> > > involved. It should be dealt in a separate QEMU/KVM focused series.
> > >
> > > A quickfix could be adding support for spectre_bhi=long that guests in a
> > > migration pool can use?
> >
> > The simplest solution is to add "|
> > cpu_feature_enabled(X86_FEATURE_HYPERVISOR)" to the condition above.
> > If that is unacceptable for the performance of pre-Alder Lake
>
> Yes, that would be unnecessary overhead.
>
> > migration pools, you could define a CPUID or MSR bit that says
> > explicitly, "long BHB flush sequence needed," rather than trying to
> > intuit that property from the presence of BHI_CTRL. Like
> > IA32_ARCH_CAPABILITIES.SKIP_L1DFL_VMENTRY, the bit would only be set
> > by a hypervisor.
>
> I will think about this more.
>
> > I am still skeptical of the need for MSR_VIRTUAL_ENUMERATION and
> > friends, unless there is a major guest OS out there that relies on
> > them.
>
> If we forget about MSR_VIRTUAL_ENUMERATION for a moment, userspace VMM is
> in the best position to decide whether a guest needs
> virtual.SPEC_CTRL[BHI_DIS_S]. Via a KVM interface userspace VMM can get
> BHI_DIS_S for the guests that are in migration pool?
That is not possible today, since KVM does not implement Intel's
IA32_SPEC_CTRL virtualization, and cedes the hardware IA32_SPEC_CTRL
to the guest after the first non-zero write to the guest's MSR.
^ permalink raw reply
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Pawan Gupta @ 2026-04-03 23:16 UTC (permalink / raw)
To: Jim Mattson
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <CALMp9eSXfJvR=PHtttbqm3q3nH436T1eH4YdpVqxQeP-cxEPsA@mail.gmail.com>
On Fri, Apr 03, 2026 at 02:59:33PM -0700, Jim Mattson wrote:
> On Fri, Apr 3, 2026 at 2:34 PM Pawan Gupta
> <pawan.kumar.gupta@linux.intel.com> wrote:
> >
> > On Fri, Apr 03, 2026 at 01:19:17PM -0700, Jim Mattson wrote:
> > > On Fri, Apr 3, 2026 at 11:52 AM Pawan Gupta
> > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > >
> > > > On Fri, Apr 03, 2026 at 11:10:08AM -0700, Jim Mattson wrote:
> > > > > On Thu, Apr 2, 2026 at 5:32 PM Pawan Gupta
> > > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > > >
> > > > > > As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
> > > > > > the Branch History Buffer (BHB). On Alder Lake and newer parts this
> > > > > > sequence is not sufficient because it doesn't clear enough entries. This
> > > > > > was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
> > > > > > in the kernel.
> > > > > >
> > > > > > Now with VMSCAPE (BHI variant) it is also required to isolate branch
> > > > > > history between guests and userspace. Since BHI_DIS_S only protects the
> > > > > > kernel, the newer CPUs also use IBPB.
> > > > > >
> > > > > > A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
> > > > > > But it currently does not clear enough BHB entries to be effective on newer
> > > > > > CPUs with larger BHB. At boot, dynamically set the loop count of
> > > > > > clear_bhb_loop() such that it is effective on newer CPUs too. Use the
> > > > > > X86_FEATURE_BHI_CTRL feature flag to select the appropriate loop count.
> > > > > >
> > > > > > Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > > > > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > > > > > ---
> > > > > > arch/x86/entry/entry_64.S | 8 +++++---
> > > > > > arch/x86/include/asm/nospec-branch.h | 2 ++
> > > > > > arch/x86/kernel/cpu/bugs.c | 13 +++++++++++++
> > > > > > 3 files changed, 20 insertions(+), 3 deletions(-)
> > > > > >
> > > > > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > > > > index 3a180a36ca0e..bbd4b1c7ec04 100644
> > > > > > --- a/arch/x86/entry/entry_64.S
> > > > > > +++ b/arch/x86/entry/entry_64.S
> > > > > > @@ -1536,7 +1536,9 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > > ANNOTATE_NOENDBR
> > > > > > push %rbp
> > > > > > mov %rsp, %rbp
> > > > > > - movl $5, %ecx
> > > > > > +
> > > > > > + movzbl bhb_seq_outer_loop(%rip), %ecx
> > > > > > +
> > > > > > ANNOTATE_INTRA_FUNCTION_CALL
> > > > > > call 1f
> > > > > > jmp 5f
> > > > > > @@ -1556,8 +1558,8 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > > * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
> > > > > > * but some Clang versions (e.g. 18) don't like this.
> > > > > > */
> > > > > > - .skip 32 - 18, 0xcc
> > > > > > -2: movl $5, %eax
> > > > > > + .skip 32 - 20, 0xcc
> > > > > > +2: movzbl bhb_seq_inner_loop(%rip), %eax
> > > > > > 3: jmp 4f
> > > > > > nop
> > > > > > 4: sub $1, %eax
> > > > > > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > > > > > index 70b377fcbc1c..87b83ae7c97f 100644
> > > > > > --- a/arch/x86/include/asm/nospec-branch.h
> > > > > > +++ b/arch/x86/include/asm/nospec-branch.h
> > > > > > @@ -548,6 +548,8 @@ DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
> > > > > > extern void update_spec_ctrl_cond(u64 val);
> > > > > > extern u64 spec_ctrl_current(void);
> > > > > >
> > > > > > +extern u8 bhb_seq_inner_loop, bhb_seq_outer_loop;
> > > > > > +
> > > > > > /*
> > > > > > * With retpoline, we must use IBRS to restrict branch prediction
> > > > > > * before calling into firmware.
> > > > > > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > > > > > index 83f51cab0b1e..2cb4a96247d8 100644
> > > > > > --- a/arch/x86/kernel/cpu/bugs.c
> > > > > > +++ b/arch/x86/kernel/cpu/bugs.c
> > > > > > @@ -2047,6 +2047,10 @@ enum bhi_mitigations {
> > > > > > static enum bhi_mitigations bhi_mitigation __ro_after_init =
> > > > > > IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
> > > > > >
> > > > > > +/* Default to short BHB sequence values */
> > > > > > +u8 bhb_seq_outer_loop __ro_after_init = 5;
> > > > > > +u8 bhb_seq_inner_loop __ro_after_init = 5;
> > > > > > +
> > > > > > static int __init spectre_bhi_parse_cmdline(char *str)
> > > > > > {
> > > > > > if (!str)
> > > > > > @@ -3242,6 +3246,15 @@ void __init cpu_select_mitigations(void)
> > > > > > x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
> > > > > > }
> > > > > >
> > > > > > + /*
> > > > > > + * Switch to long BHB clear sequence on newer CPUs (with BHI_CTRL
> > > > > > + * support), see Intel's BHI guidance.
> > > > > > + */
> > > > > > + if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL)) {
> > > > > > + bhb_seq_outer_loop = 12;
> > > > > > + bhb_seq_inner_loop = 7;
> > > > > > + }
> > > > > > +
> > > > >
> > > > > How does this work for VMs in a heterogeneous migration pool that
> > > > > spans the Alder Lake boundary? They can't advertise BHI_CTRL, because
> > > > > it isn't available on all hosts in the migration pool, but they need
> > > > > the long sequence when running on Alder Lake or newer.
> > > >
> > > > As we discussed elsewhere, support for migration pool is much more
> > > > involved. It should be dealt in a separate QEMU/KVM focused series.
> > > >
> > > > A quickfix could be adding support for spectre_bhi=long that guests in a
> > > > migration pool can use?
> > >
> > > The simplest solution is to add "|
> > > cpu_feature_enabled(X86_FEATURE_HYPERVISOR)" to the condition above.
> > > If that is unacceptable for the performance of pre-Alder Lake
> >
> > Yes, that would be unnecessary overhead.
> >
> > > migration pools, you could define a CPUID or MSR bit that says
> > > explicitly, "long BHB flush sequence needed," rather than trying to
> > > intuit that property from the presence of BHI_CTRL. Like
> > > IA32_ARCH_CAPABILITIES.SKIP_L1DFL_VMENTRY, the bit would only be set
> > > by a hypervisor.
> >
> > I will think about this more.
> >
> > > I am still skeptical of the need for MSR_VIRTUAL_ENUMERATION and
> > > friends, unless there is a major guest OS out there that relies on
> > > them.
> >
> > If we forget about MSR_VIRTUAL_ENUMERATION for a moment, userspace VMM is
> > in the best position to decide whether a guest needs
> > virtual.SPEC_CTRL[BHI_DIS_S]. Via a KVM interface userspace VMM can get
> > BHI_DIS_S for the guests that are in migration pool?
>
> That is not possible today, since KVM does not implement Intel's
> IA32_SPEC_CTRL virtualization, and cedes the hardware IA32_SPEC_CTRL
> to the guest after the first non-zero write to the guest's MSR.
Yes, KVM doesn't support it yet. But adding that support to give more
control to the userspace VMM helps this case, and probably many others in
the future.
I will check with Chao whether he can prepare the next version of the
virtual SPEC_CTRL series (leaving out the virtual mitigation MSRs).
^ permalink raw reply
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Jim Mattson @ 2026-04-03 23:22 UTC (permalink / raw)
To: Pawan Gupta
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <20260403231608.zopnhnypdclzqlx7@desk>
On Fri, Apr 3, 2026 at 4:16 PM Pawan Gupta
<pawan.kumar.gupta@linux.intel.com> wrote:
>
> On Fri, Apr 03, 2026 at 02:59:33PM -0700, Jim Mattson wrote:
> > On Fri, Apr 3, 2026 at 2:34 PM Pawan Gupta
> > <pawan.kumar.gupta@linux.intel.com> wrote:
> > >
> > > On Fri, Apr 03, 2026 at 01:19:17PM -0700, Jim Mattson wrote:
> > > > On Fri, Apr 3, 2026 at 11:52 AM Pawan Gupta
> > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > >
> > > > > On Fri, Apr 03, 2026 at 11:10:08AM -0700, Jim Mattson wrote:
> > > > > > On Thu, Apr 2, 2026 at 5:32 PM Pawan Gupta
> > > > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > > > >
> > > > > > > As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
> > > > > > > the Branch History Buffer (BHB). On Alder Lake and newer parts this
> > > > > > > sequence is not sufficient because it doesn't clear enough entries. This
> > > > > > > was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
> > > > > > > in the kernel.
> > > > > > >
> > > > > > > Now with VMSCAPE (BHI variant) it is also required to isolate branch
> > > > > > > history between guests and userspace. Since BHI_DIS_S only protects the
> > > > > > > kernel, the newer CPUs also use IBPB.
> > > > > > >
> > > > > > > A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
> > > > > > > But it currently does not clear enough BHB entries to be effective on newer
> > > > > > > CPUs with larger BHB. At boot, dynamically set the loop count of
> > > > > > > clear_bhb_loop() such that it is effective on newer CPUs too. Use the
> > > > > > > X86_FEATURE_BHI_CTRL feature flag to select the appropriate loop count.
> > > > > > >
> > > > > > > Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > > > > > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > > > > > > ---
> > > > > > > arch/x86/entry/entry_64.S | 8 +++++---
> > > > > > > arch/x86/include/asm/nospec-branch.h | 2 ++
> > > > > > > arch/x86/kernel/cpu/bugs.c | 13 +++++++++++++
> > > > > > > 3 files changed, 20 insertions(+), 3 deletions(-)
> > > > > > >
> > > > > > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > > > > > index 3a180a36ca0e..bbd4b1c7ec04 100644
> > > > > > > --- a/arch/x86/entry/entry_64.S
> > > > > > > +++ b/arch/x86/entry/entry_64.S
> > > > > > > @@ -1536,7 +1536,9 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > > > ANNOTATE_NOENDBR
> > > > > > > push %rbp
> > > > > > > mov %rsp, %rbp
> > > > > > > - movl $5, %ecx
> > > > > > > +
> > > > > > > + movzbl bhb_seq_outer_loop(%rip), %ecx
> > > > > > > +
> > > > > > > ANNOTATE_INTRA_FUNCTION_CALL
> > > > > > > call 1f
> > > > > > > jmp 5f
> > > > > > > @@ -1556,8 +1558,8 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > > > * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
> > > > > > > * but some Clang versions (e.g. 18) don't like this.
> > > > > > > */
> > > > > > > - .skip 32 - 18, 0xcc
> > > > > > > -2: movl $5, %eax
> > > > > > > + .skip 32 - 20, 0xcc
> > > > > > > +2: movzbl bhb_seq_inner_loop(%rip), %eax
> > > > > > > 3: jmp 4f
> > > > > > > nop
> > > > > > > 4: sub $1, %eax
> > > > > > > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > > > > > > index 70b377fcbc1c..87b83ae7c97f 100644
> > > > > > > --- a/arch/x86/include/asm/nospec-branch.h
> > > > > > > +++ b/arch/x86/include/asm/nospec-branch.h
> > > > > > > @@ -548,6 +548,8 @@ DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
> > > > > > > extern void update_spec_ctrl_cond(u64 val);
> > > > > > > extern u64 spec_ctrl_current(void);
> > > > > > >
> > > > > > > +extern u8 bhb_seq_inner_loop, bhb_seq_outer_loop;
> > > > > > > +
> > > > > > > /*
> > > > > > > * With retpoline, we must use IBRS to restrict branch prediction
> > > > > > > * before calling into firmware.
> > > > > > > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > > > > > > index 83f51cab0b1e..2cb4a96247d8 100644
> > > > > > > --- a/arch/x86/kernel/cpu/bugs.c
> > > > > > > +++ b/arch/x86/kernel/cpu/bugs.c
> > > > > > > @@ -2047,6 +2047,10 @@ enum bhi_mitigations {
> > > > > > > static enum bhi_mitigations bhi_mitigation __ro_after_init =
> > > > > > > IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
> > > > > > >
> > > > > > > +/* Default to short BHB sequence values */
> > > > > > > +u8 bhb_seq_outer_loop __ro_after_init = 5;
> > > > > > > +u8 bhb_seq_inner_loop __ro_after_init = 5;
> > > > > > > +
> > > > > > > static int __init spectre_bhi_parse_cmdline(char *str)
> > > > > > > {
> > > > > > > if (!str)
> > > > > > > @@ -3242,6 +3246,15 @@ void __init cpu_select_mitigations(void)
> > > > > > > x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
> > > > > > > }
> > > > > > >
> > > > > > > + /*
> > > > > > > + * Switch to long BHB clear sequence on newer CPUs (with BHI_CTRL
> > > > > > > + * support), see Intel's BHI guidance.
> > > > > > > + */
> > > > > > > + if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL)) {
> > > > > > > + bhb_seq_outer_loop = 12;
> > > > > > > + bhb_seq_inner_loop = 7;
> > > > > > > + }
> > > > > > > +
> > > > > >
> > > > > > How does this work for VMs in a heterogeneous migration pool that
> > > > > > spans the Alder Lake boundary? They can't advertise BHI_CTRL, because
> > > > > > it isn't available on all hosts in the migration pool, but they need
> > > > > > the long sequence when running on Alder Lake or newer.
> > > > >
> > > > > As we discussed elsewhere, support for migration pool is much more
> > > > > involved. It should be dealt in a separate QEMU/KVM focused series.
> > > > >
> > > > > A quickfix could be adding support for spectre_bhi=long that guests in a
> > > > > migration pool can use?
> > > >
> > > > The simplest solution is to add "|
> > > > cpu_feature_enabled(X86_FEATURE_HYPERVISOR)" to the condition above.
> > > > If that is unacceptable for the performance of pre-Alder Lake
> > >
> > > Yes, that would be unnecessary overhead.
> > >
> > > > migration pools, you could define a CPUID or MSR bit that says
> > > > explicitly, "long BHB flush sequence needed," rather than trying to
> > > > intuit that property from the presence of BHI_CTRL. Like
> > > > IA32_ARCH_CAPABILITIES.SKIP_L1DFL_VMENTRY, the bit would only be set
> > > > by a hypervisor.
> > >
> > > I will think about this more.
> > >
> > > > I am still skeptical of the need for MSR_VIRTUAL_ENUMERATION and
> > > > friends, unless there is a major guest OS out there that relies on
> > > > them.
> > >
> > > If we forget about MSR_VIRTUAL_ENUMERATION for a moment, userspace VMM is
> > > in the best position to decide whether a guest needs
> > > virtual.SPEC_CTRL[BHI_DIS_S]. Via a KVM interface userspace VMM can get
> > > BHI_DIS_S for the guests that are in migration pool?
> >
> > That is not possible today, since KVM does not implement Intel's
> > IA32_SPEC_CTRL virtualization, and cedes the hardware IA32_SPEC_CTRL
> > to the guest after the first non-zero write to the guest's MSR.
>
> Yes, KVM doesn't support it yet. But, adding that support to give more
> control to userspace VMM helps this case, and probably many other in
> the future.
But didn't you tell me that Windows doesn't want the hypervisor to set
BHI_DIS_S behind their back?
> I will check with Chao if he can prepare the next version of virtual
> SPEC_CTRL series (leaving out virtual mitigation MSRs).
Excellent.
^ permalink raw reply
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Pawan Gupta @ 2026-04-03 23:33 UTC (permalink / raw)
To: Jim Mattson
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <CALMp9eT2vJBdLPY2uBYrPgVrhS_aYmfGfdXe6MZXG_gyryLHVA@mail.gmail.com>
On Fri, Apr 03, 2026 at 04:22:28PM -0700, Jim Mattson wrote:
> On Fri, Apr 3, 2026 at 4:16 PM Pawan Gupta
> <pawan.kumar.gupta@linux.intel.com> wrote:
> >
> > On Fri, Apr 03, 2026 at 02:59:33PM -0700, Jim Mattson wrote:
> > > On Fri, Apr 3, 2026 at 2:34 PM Pawan Gupta
> > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > >
> > > > On Fri, Apr 03, 2026 at 01:19:17PM -0700, Jim Mattson wrote:
> > > > > On Fri, Apr 3, 2026 at 11:52 AM Pawan Gupta
> > > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > > >
> > > > > > On Fri, Apr 03, 2026 at 11:10:08AM -0700, Jim Mattson wrote:
> > > > > > > On Thu, Apr 2, 2026 at 5:32 PM Pawan Gupta
> > > > > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > > > > >
> > > > > > > > As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
> > > > > > > > the Branch History Buffer (BHB). On Alder Lake and newer parts this
> > > > > > > > sequence is not sufficient because it doesn't clear enough entries. This
> > > > > > > > was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
> > > > > > > > in the kernel.
> > > > > > > >
> > > > > > > > Now with VMSCAPE (BHI variant) it is also required to isolate branch
> > > > > > > > history between guests and userspace. Since BHI_DIS_S only protects the
> > > > > > > > kernel, the newer CPUs also use IBPB.
> > > > > > > >
> > > > > > > > A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
> > > > > > > > But it currently does not clear enough BHB entries to be effective on newer
> > > > > > > > CPUs with larger BHB. At boot, dynamically set the loop count of
> > > > > > > > clear_bhb_loop() such that it is effective on newer CPUs too. Use the
> > > > > > > > X86_FEATURE_BHI_CTRL feature flag to select the appropriate loop count.
> > > > > > > >
> > > > > > > > Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > > > > > > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > > > > > > > ---
> > > > > > > > arch/x86/entry/entry_64.S | 8 +++++---
> > > > > > > > arch/x86/include/asm/nospec-branch.h | 2 ++
> > > > > > > > arch/x86/kernel/cpu/bugs.c | 13 +++++++++++++
> > > > > > > > 3 files changed, 20 insertions(+), 3 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > > > > > > index 3a180a36ca0e..bbd4b1c7ec04 100644
> > > > > > > > --- a/arch/x86/entry/entry_64.S
> > > > > > > > +++ b/arch/x86/entry/entry_64.S
> > > > > > > > @@ -1536,7 +1536,9 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > > > > ANNOTATE_NOENDBR
> > > > > > > > push %rbp
> > > > > > > > mov %rsp, %rbp
> > > > > > > > - movl $5, %ecx
> > > > > > > > +
> > > > > > > > + movzbl bhb_seq_outer_loop(%rip), %ecx
> > > > > > > > +
> > > > > > > > ANNOTATE_INTRA_FUNCTION_CALL
> > > > > > > > call 1f
> > > > > > > > jmp 5f
> > > > > > > > @@ -1556,8 +1558,8 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > > > > * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
> > > > > > > > * but some Clang versions (e.g. 18) don't like this.
> > > > > > > > */
> > > > > > > > - .skip 32 - 18, 0xcc
> > > > > > > > -2: movl $5, %eax
> > > > > > > > + .skip 32 - 20, 0xcc
> > > > > > > > +2: movzbl bhb_seq_inner_loop(%rip), %eax
> > > > > > > > 3: jmp 4f
> > > > > > > > nop
> > > > > > > > 4: sub $1, %eax
> > > > > > > > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > > > > > > > index 70b377fcbc1c..87b83ae7c97f 100644
> > > > > > > > --- a/arch/x86/include/asm/nospec-branch.h
> > > > > > > > +++ b/arch/x86/include/asm/nospec-branch.h
> > > > > > > > @@ -548,6 +548,8 @@ DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
> > > > > > > > extern void update_spec_ctrl_cond(u64 val);
> > > > > > > > extern u64 spec_ctrl_current(void);
> > > > > > > >
> > > > > > > > +extern u8 bhb_seq_inner_loop, bhb_seq_outer_loop;
> > > > > > > > +
> > > > > > > > /*
> > > > > > > > * With retpoline, we must use IBRS to restrict branch prediction
> > > > > > > > * before calling into firmware.
> > > > > > > > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > > > > > > > index 83f51cab0b1e..2cb4a96247d8 100644
> > > > > > > > --- a/arch/x86/kernel/cpu/bugs.c
> > > > > > > > +++ b/arch/x86/kernel/cpu/bugs.c
> > > > > > > > @@ -2047,6 +2047,10 @@ enum bhi_mitigations {
> > > > > > > > static enum bhi_mitigations bhi_mitigation __ro_after_init =
> > > > > > > > IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
> > > > > > > >
> > > > > > > > +/* Default to short BHB sequence values */
> > > > > > > > +u8 bhb_seq_outer_loop __ro_after_init = 5;
> > > > > > > > +u8 bhb_seq_inner_loop __ro_after_init = 5;
> > > > > > > > +
> > > > > > > > static int __init spectre_bhi_parse_cmdline(char *str)
> > > > > > > > {
> > > > > > > > if (!str)
> > > > > > > > @@ -3242,6 +3246,15 @@ void __init cpu_select_mitigations(void)
> > > > > > > > x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
> > > > > > > > }
> > > > > > > >
> > > > > > > > + /*
> > > > > > > > + * Switch to long BHB clear sequence on newer CPUs (with BHI_CTRL
> > > > > > > > + * support), see Intel's BHI guidance.
> > > > > > > > + */
> > > > > > > > + if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL)) {
> > > > > > > > + bhb_seq_outer_loop = 12;
> > > > > > > > + bhb_seq_inner_loop = 7;
> > > > > > > > + }
> > > > > > > > +
> > > > > > >
> > > > > > > How does this work for VMs in a heterogeneous migration pool that
> > > > > > > spans the Alder Lake boundary? They can't advertise BHI_CTRL, because
> > > > > > > it isn't available on all hosts in the migration pool, but they need
> > > > > > > the long sequence when running on Alder Lake or newer.
> > > > > >
> > > > > > As we discussed elsewhere, support for migration pool is much more
> > > > > > involved. It should be dealt in a separate QEMU/KVM focused series.
> > > > > >
> > > > > > A quickfix could be adding support for spectre_bhi=long that guests in a
> > > > > > migration pool can use?
> > > > >
> > > > > The simplest solution is to add "|
> > > > > cpu_feature_enabled(X86_FEATURE_HYPERVISOR)" to the condition above.
> > > > > If that is unacceptable for the performance of pre-Alder Lake
> > > >
> > > > Yes, that would be unnecessary overhead.
> > > >
> > > > > migration pools, you could define a CPUID or MSR bit that says
> > > > > explicitly, "long BHB flush sequence needed," rather than trying to
> > > > > intuit that property from the presence of BHI_CTRL. Like
> > > > > IA32_ARCH_CAPABILITIES.SKIP_L1DFL_VMENTRY, the bit would only be set
> > > > > by a hypervisor.
> > > >
> > > > I will think about this more.
> > > >
> > > > > I am still skeptical of the need for MSR_VIRTUAL_ENUMERATION and
> > > > > friends, unless there is a major guest OS out there that relies on
> > > > > them.
> > > >
> > > > If we forget about MSR_VIRTUAL_ENUMERATION for a moment, userspace VMM is
> > > > in the best position to decide whether a guest needs
> > > > virtual.SPEC_CTRL[BHI_DIS_S]. Via a KVM interface userspace VMM can get
> > > > BHI_DIS_S for the guests that are in migration pool?
> > >
> > > That is not possible today, since KVM does not implement Intel's
> > > IA32_SPEC_CTRL virtualization, and cedes the hardware IA32_SPEC_CTRL
> > > to the guest after the first non-zero write to the guest's MSR.
> >
> > Yes, KVM doesn't support it yet. But, adding that support to give more
> > control to userspace VMM helps this case, and probably many other in
> > the future.
>
> But didn't you tell me that Windows doesn't want the hypervisor to set
> BHI_DIS_S behind their back?
Since cloud providers have greater control over userspace, the decision
whether to use BHI_DIS_S can be left to them. KVM would simply do what
userspace asks it to do.
> > I will check with Chao if he can prepare the next version of virtual
> > SPEC_CTRL series (leaving out virtual mitigation MSRs).
>
> Excellent.
^ permalink raw reply
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Jim Mattson @ 2026-04-03 23:39 UTC (permalink / raw)
To: Pawan Gupta
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <20260403233329.fb2ppifgwm3um6ny@desk>
On Fri, Apr 3, 2026 at 4:33 PM Pawan Gupta
<pawan.kumar.gupta@linux.intel.com> wrote:
>
> On Fri, Apr 03, 2026 at 04:22:28PM -0700, Jim Mattson wrote:
> > On Fri, Apr 3, 2026 at 4:16 PM Pawan Gupta
> > <pawan.kumar.gupta@linux.intel.com> wrote:
> > >
> > > On Fri, Apr 03, 2026 at 02:59:33PM -0700, Jim Mattson wrote:
> > > > On Fri, Apr 3, 2026 at 2:34 PM Pawan Gupta
> > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > >
> > > > > On Fri, Apr 03, 2026 at 01:19:17PM -0700, Jim Mattson wrote:
> > > > > > On Fri, Apr 3, 2026 at 11:52 AM Pawan Gupta
> > > > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > > > >
> > > > > > > On Fri, Apr 03, 2026 at 11:10:08AM -0700, Jim Mattson wrote:
> > > > > > > > On Thu, Apr 2, 2026 at 5:32 PM Pawan Gupta
> > > > > > > > <pawan.kumar.gupta@linux.intel.com> wrote:
> > > > > > > > >
> > > > > > > > > As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
> > > > > > > > > the Branch History Buffer (BHB). On Alder Lake and newer parts this
> > > > > > > > > sequence is not sufficient because it doesn't clear enough entries. This
> > > > > > > > > was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
> > > > > > > > > in the kernel.
> > > > > > > > >
> > > > > > > > > Now with VMSCAPE (BHI variant) it is also required to isolate branch
> > > > > > > > > history between guests and userspace. Since BHI_DIS_S only protects the
> > > > > > > > > kernel, the newer CPUs also use IBPB.
> > > > > > > > >
> > > > > > > > > A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
> > > > > > > > > But it currently does not clear enough BHB entries to be effective on newer
> > > > > > > > > CPUs with larger BHB. At boot, dynamically set the loop count of
> > > > > > > > > clear_bhb_loop() such that it is effective on newer CPUs too. Use the
> > > > > > > > > X86_FEATURE_BHI_CTRL feature flag to select the appropriate loop count.
> > > > > > > > >
> > > > > > > > > Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > > > > > > > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > > > > > > > > ---
> > > > > > > > > arch/x86/entry/entry_64.S | 8 +++++---
> > > > > > > > > arch/x86/include/asm/nospec-branch.h | 2 ++
> > > > > > > > > arch/x86/kernel/cpu/bugs.c | 13 +++++++++++++
> > > > > > > > > 3 files changed, 20 insertions(+), 3 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > > > > > > > index 3a180a36ca0e..bbd4b1c7ec04 100644
> > > > > > > > > --- a/arch/x86/entry/entry_64.S
> > > > > > > > > +++ b/arch/x86/entry/entry_64.S
> > > > > > > > > @@ -1536,7 +1536,9 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > > > > > ANNOTATE_NOENDBR
> > > > > > > > > push %rbp
> > > > > > > > > mov %rsp, %rbp
> > > > > > > > > - movl $5, %ecx
> > > > > > > > > +
> > > > > > > > > + movzbl bhb_seq_outer_loop(%rip), %ecx
> > > > > > > > > +
> > > > > > > > > ANNOTATE_INTRA_FUNCTION_CALL
> > > > > > > > > call 1f
> > > > > > > > > jmp 5f
> > > > > > > > > @@ -1556,8 +1558,8 @@ SYM_FUNC_START(clear_bhb_loop)
> > > > > > > > > * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
> > > > > > > > > * but some Clang versions (e.g. 18) don't like this.
> > > > > > > > > */
> > > > > > > > > - .skip 32 - 18, 0xcc
> > > > > > > > > -2: movl $5, %eax
> > > > > > > > > + .skip 32 - 20, 0xcc
> > > > > > > > > +2: movzbl bhb_seq_inner_loop(%rip), %eax
> > > > > > > > > 3: jmp 4f
> > > > > > > > > nop
> > > > > > > > > 4: sub $1, %eax
> > > > > > > > > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > > > > > > > > index 70b377fcbc1c..87b83ae7c97f 100644
> > > > > > > > > --- a/arch/x86/include/asm/nospec-branch.h
> > > > > > > > > +++ b/arch/x86/include/asm/nospec-branch.h
> > > > > > > > > @@ -548,6 +548,8 @@ DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
> > > > > > > > > extern void update_spec_ctrl_cond(u64 val);
> > > > > > > > > extern u64 spec_ctrl_current(void);
> > > > > > > > >
> > > > > > > > > +extern u8 bhb_seq_inner_loop, bhb_seq_outer_loop;
> > > > > > > > > +
> > > > > > > > > /*
> > > > > > > > > * With retpoline, we must use IBRS to restrict branch prediction
> > > > > > > > > * before calling into firmware.
> > > > > > > > > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > > > > > > > > index 83f51cab0b1e..2cb4a96247d8 100644
> > > > > > > > > --- a/arch/x86/kernel/cpu/bugs.c
> > > > > > > > > +++ b/arch/x86/kernel/cpu/bugs.c
> > > > > > > > > @@ -2047,6 +2047,10 @@ enum bhi_mitigations {
> > > > > > > > > static enum bhi_mitigations bhi_mitigation __ro_after_init =
> > > > > > > > > IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
> > > > > > > > >
> > > > > > > > > +/* Default to short BHB sequence values */
> > > > > > > > > +u8 bhb_seq_outer_loop __ro_after_init = 5;
> > > > > > > > > +u8 bhb_seq_inner_loop __ro_after_init = 5;
> > > > > > > > > +
> > > > > > > > > static int __init spectre_bhi_parse_cmdline(char *str)
> > > > > > > > > {
> > > > > > > > > if (!str)
> > > > > > > > > @@ -3242,6 +3246,15 @@ void __init cpu_select_mitigations(void)
> > > > > > > > > x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > + /*
> > > > > > > > > + * Switch to long BHB clear sequence on newer CPUs (with BHI_CTRL
> > > > > > > > > + * support), see Intel's BHI guidance.
> > > > > > > > > + */
> > > > > > > > > + if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL)) {
> > > > > > > > > + bhb_seq_outer_loop = 12;
> > > > > > > > > + bhb_seq_inner_loop = 7;
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > >
> > > > > > > > How does this work for VMs in a heterogeneous migration pool that
> > > > > > > > spans the Alder Lake boundary? They can't advertise BHI_CTRL, because
> > > > > > > > it isn't available on all hosts in the migration pool, but they need
> > > > > > > > the long sequence when running on Alder Lake or newer.
> > > > > > >
> > > > > > > As we discussed elsewhere, support for migration pool is much more
> > > > > > > involved. It should be dealt in a separate QEMU/KVM focused series.
> > > > > > >
> > > > > > > A quickfix could be adding support for spectre_bhi=long that guests in a
> > > > > > > migration pool can use?
> > > > > >
> > > > > > The simplest solution is to add "|
> > > > > > cpu_feature_enabled(X86_FEATURE_HYPERVISOR)" to the condition above.
> > > > > > If that is unacceptable for the performance of pre-Alder Lake
> > > > >
> > > > > Yes, that would be unnecessary overhead.
> > > > >
> > > > > > migration pools, you could define a CPUID or MSR bit that says
> > > > > > explicitly, "long BHB flush sequence needed," rather than trying to
> > > > > > intuit that property from the presence of BHI_CTRL. Like
> > > > > > IA32_ARCH_CAPABILITIES.SKIP_L1DFL_VMENTRY, the bit would only be set
> > > > > > by a hypervisor.
> > > > >
> > > > > I will think about this more.
> > > > >
> > > > > > I am still skeptical of the need for MSR_VIRTUAL_ENUMERATION and
> > > > > > friends, unless there is a major guest OS out there that relies on
> > > > > > them.
> > > > >
> > > > > If we forget about MSR_VIRTUAL_ENUMERATION for a moment, userspace VMM is
> > > > > in the best position to decide whether a guest needs
> > > > > virtual.SPEC_CTRL[BHI_DIS_S]. Via a KVM interface userspace VMM can get
> > > > > BHI_DIS_S for the guests that are in migration pool?
> > > >
> > > > That is not possible today, since KVM does not implement Intel's
> > > > IA32_SPEC_CTRL virtualization, and cedes the hardware IA32_SPEC_CTRL
> > > > to the guest after the first non-zero write to the guest's MSR.
> > >
> > > Yes, KVM doesn't support it yet. But, adding that support to give more
> > > control to userspace VMM helps this case, and probably many other in
> > > the future.
> >
> > But didn't you tell me that Windows doesn't want the hypervisor to set
> > BHI_DIS_S behind their back?
>
> Since cloud providers have greater control over userspace, the decision to
> use BHI_DIS_S or not can be left to them. KVM would simply follow what it
> is asked to do by the userspace.
I feel like we've gone over this before, but if userspace tells KVM
not to enable BHI_DIS_S, how do we inform Windows that it needs to do
the longer clearing sequence, despite the fact that the virtual CPU is
masquerading as Ice Lake?
I don't think the virtual mitigation MSRs address that issue.
> > > I will check with Chao if he can prepare the next version of virtual
> > > SPEC_CTRL series (leaving out virtual mitigation MSRs).
> >
> > Excellent.
^ permalink raw reply
* [riscv:for-next 24/45] htmldocs: Documentation/arch/riscv/zicfilp.rst:79: WARNING: Inline literal start-string without end-string. [docutils]
From: kernel test robot @ 2026-04-04 0:16 UTC (permalink / raw)
To: Paul Walmsley; +Cc: oe-kbuild-all, linux-doc
tree: https://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git for-next
head: c0368008b37da62d634fdd1aa2603f13dc31528a
commit: 8a8f622c329e2e3690b54826370e4a0af45f66ef [24/45] prctl: cfi: change the branch landing pad prctl()s to be more descriptive
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
docutils: docutils (Docutils 0.21.2, Python 3.13.5, on linux)
reproduce: (https://download.01.org/0day-ci/archive/20260404/202604040216.ea9oBMct-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202604040216.ea9oBMct-lkp@intel.com/
All warnings (new ones prefixed by >>):
Warning: tools/docs/documentation-file-ref-check references a file that doesn't exist: m,\b(\S*)(Documentation/[A-Za-z0-9
Warning: tools/docs/documentation-file-ref-check references a file that doesn't exist: Documentation/devicetree/dt-object-internal.txt
Warning: tools/docs/documentation-file-ref-check references a file that doesn't exist: m,^Documentation/scheduler/sched-pelt
Warning: tools/docs/documentation-file-ref-check references a file that doesn't exist: m,(Documentation/translations/[
Using alabaster theme
>> Documentation/arch/riscv/zicfilp.rst:79: WARNING: Inline literal start-string without end-string. [docutils]
Documentation/core-api/kref:328: ./include/linux/kref.h:72: WARNING: Invalid C declaration: Expected end of definition. [error at 96]
int kref_put_mutex (struct kref *kref, void (*release)(struct kref *kref), struct mutex *mutex) __cond_acquires(true# mutex)
------------------------------------------------------------------------------------------------^
Documentation/core-api/kref:328: ./include/linux/kref.h:94: WARNING: Invalid C declaration: Expected end of definition. [error at 92]
int kref_put_lock (struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) __cond_acquires(true# lock)
vim +79 Documentation/arch/riscv/zicfilp.rst
78
> 79 Per-task indirect branch tracking state can be monitored and
80 controlled via the :c:macro:`PR_GET_CFI` and :c:macro:`PR_SET_CFI`
81 ``prctl()` arguments (respectively), by supplying
82 :c:macro:`PR_CFI_BRANCH_LANDING_PADS` as the second argument. These
83 are architecture-agnostic, and will return -EINVAL if the underlying
84 functionality is not supported.
85
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Pawan Gupta @ 2026-04-04 0:21 UTC (permalink / raw)
To: Jim Mattson
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <CALMp9eTpsenqsWjzmpXLEubn9uNjgZgzgrMwtZ72HDuV_2xgfg@mail.gmail.com>
On Fri, Apr 03, 2026 at 04:39:54PM -0700, Jim Mattson wrote:
> > Since cloud providers have greater control over userspace, the decision to
> > use BHI_DIS_S or not can be left to them. KVM would simply follow what it
> > is asked to do by the userspace.
>
> I feel like we've gone over this before, but if userspace tells KVM
> not to enable BHI_DIS_S, how do we inform Windows that it needs to do
> the longer clearing sequence, despite the fact that the virtual CPU is
> masquerading as Ice Lake?
IMO, if an OS is allergic to a hardware mitigation, and is also aware that
it is virtualized, it should default to a sw mitigation that works everywhere.
> I don't think the virtual mitigation MSRs address that issue.
Virtual mitigation MSRs are meant to inform the VMM about the guest
mitigation. Even if there was a way to tell the guest that it needs to use
a different mitigation, it seems unrealistic for a guest to change its
mitigation post-migration.
^ permalink raw reply
* Re: [PATCH v6 4/4] selftests/ftrace: Add accept cases for fprobe list syntax
From: Masami Hiramatsu @ 2026-04-04 0:25 UTC (permalink / raw)
To: Ryan Chung
Cc: rostedt, corbet, shuah, mathieu.desnoyers, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <CAB1jyqw_6wepbDaKi7087GDUcJ9t1jQO6qP9pa0DWCjti7ABZg@mail.gmail.com>
On Thu, 2 Apr 2026 11:45:42 -0400
Ryan Chung <seokwoo.chung130@gmail.com> wrote:
> Hi Masami,
>
> Thank you for your feedback. Unfortunately, I am not in the position
> to continue working on this patch series for the foreseeable future.
> If you or anyone else on the list would like to pick it up and carry
> it forward, you are welcome to do so. I appreciate your time and
> effort on this.
I see, that's unfortunate, but I understand. I'll continue to fix
and post updates for this patch series.
I appreciate you starting this series.
Thank you.
>
> Best regards,
> Seokwoo Chung
>
> On Tue, 24 Mar 2026 at 00:12, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > On Thu, 5 Feb 2026 08:58:42 -0500
> > "Seokwoo Chung (Ryan)" <seokwoo.chung130@gmail.com> wrote:
> >
> > > Add fprobe_list.tc to test the comma-separated symbol list syntax
> > > with :entry/:exit suffixes. Three scenarios are covered:
> > >
> > > 1. List with default (entry) behavior and ! exclusion
> > > 2. List with explicit :entry suffix
> > > 3. List with :exit suffix for return probes
> >
> >
> > Could you also add wildcard pattern test?
> >
> > >
> > > Each test verifies that the correct functions appear in
> > > enabled_functions and that excluded (!) symbols are absent.
> > >
> > > Note: The existing tests add_remove_fprobe.tc, fprobe_syntax_errors.tc,
> > > and add_remove_fprobe_repeat.tc check their "requires" line against the
> > > tracefs README for the old "%return" syntax pattern. Since the README
> > > now documents ":entry|:exit" instead, these tests report UNSUPPORTED.
> > > Their "requires" lines need updating in a follow-up patch.
> >
> > This means you'll break the selftest. please fix those test first.
> > (This fix must be done before "tracing/fprobe: Support comma-separated
> > symbols and :entry/:exit" so that we can safely bisect it.)
> >
> > Thank you,
> >
> >
> > >
> > > Signed-off-by: Seokwoo Chung (Ryan) <seokwoo.chung130@gmail.com>
> > > ---
> > > .../ftrace/test.d/dynevent/fprobe_list.tc | 92 +++++++++++++++++++
> > > 1 file changed, 92 insertions(+)
> > > create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/fprobe_list.tc
> > >
> > > diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_list.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_list.tc
> > > new file mode 100644
> > > index 000000000000..45e57c6f487d
> > > --- /dev/null
> > > +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_list.tc
> > > @@ -0,0 +1,92 @@
> > > +#!/bin/sh
> > > +# SPDX-License-Identifier: GPL-2.0
> > > +# description: Fprobe event list syntax and :entry/:exit suffixes
> > > +# requires: dynamic_events "f[:[<group>/][<event>]] <func-name>[:entry|:exit] [<args>]":README
> > > +
> > > +# Setup symbols to test. These are common kernel functions.
> > > +PLACE=vfs_read
> > > +PLACE2=vfs_write
> > > +PLACE3=vfs_open
> > > +
> > > +echo 0 > events/enable
> > > +echo > dynamic_events
> > > +
> > > +# Get baseline count of enabled functions (should be 0 if clean, but be safe)
> > > +if [ -f enabled_functions ]; then
> > > + ocnt=`cat enabled_functions | wc -l`
> > > +else
> > > + ocnt=0
> > > +fi
> > > +
> > > +# Test 1: List default (entry) with exclusion
> > > +# Target: Trace vfs_read and vfs_open, but EXCLUDE vfs_write
> > > +echo "f:test/list_entry $PLACE,!$PLACE2,$PLACE3" >> dynamic_events
> > > +grep -q "test/list_entry" dynamic_events
> > > +test -d events/test/list_entry
> > > +
> > > +echo 1 > events/test/list_entry/enable
> > > +
> > > +grep -q "$PLACE" enabled_functions
> > > +grep -q "$PLACE3" enabled_functions
> > > +! grep -q "$PLACE2" enabled_functions
> > > +
> > > +# Check count (Baseline + 2 new functions)
> > > +cnt=`cat enabled_functions | wc -l`
> > > +if [ $cnt -ne $((ocnt + 2)) ]; then
> > > + exit_fail
> > > +fi
> > > +
> > > +# Cleanup Test 1
> > > +echo 0 > events/test/list_entry/enable
> > > +echo "-:test/list_entry" >> dynamic_events
> > > +! grep -q "test/list_entry" dynamic_events
> > > +
> > > +# Count should return to baseline
> > > +cnt=`cat enabled_functions | wc -l`
> > > +if [ $cnt -ne $ocnt ]; then
> > > + exit_fail
> > > +fi
> > > +
> > > +# Test 2: List with explicit :entry suffix
> > > +# (Should behave exactly like Test 1)
> > > +echo "f:test/list_entry_exp $PLACE,!$PLACE2,$PLACE3:entry" >> dynamic_events
> > > +grep -q "test/list_entry_exp" dynamic_events
> > > +test -d events/test/list_entry_exp
> > > +
> > > +echo 1 > events/test/list_entry_exp/enable
> > > +
> > > +grep -q "$PLACE" enabled_functions
> > > +grep -q "$PLACE3" enabled_functions
> > > +! grep -q "$PLACE2" enabled_functions
> > > +
> > > +cnt=`cat enabled_functions | wc -l`
> > > +if [ $cnt -ne $((ocnt + 2)) ]; then
> > > + exit_fail
> > > +fi
> > > +
> > > +# Cleanup Test 2
> > > +echo 0 > events/test/list_entry_exp/enable
> > > +echo "-:test/list_entry_exp" >> dynamic_events
> > > +
> > > +# Test 3: List with :exit suffix
> > > +echo "f:test/list_exit $PLACE,!$PLACE2,$PLACE3:exit" >> dynamic_events
> > > +grep -q "test/list_exit" dynamic_events
> > > +test -d events/test/list_exit
> > > +
> > > +echo 1 > events/test/list_exit/enable
> > > +
> > > +# Even for return probes, enabled_functions lists the attached symbols
> > > +grep -q "$PLACE" enabled_functions
> > > +grep -q "$PLACE3" enabled_functions
> > > +! grep -q "$PLACE2" enabled_functions
> > > +
> > > +cnt=`cat enabled_functions | wc -l`
> > > +if [ $cnt -ne $((ocnt + 2)) ]; then
> > > + exit_fail
> > > +fi
> > > +
> > > +# Cleanup Test 3
> > > +echo 0 > events/test/list_exit/enable
> > > +echo "-:test/list_exit" >> dynamic_events
> > > +
> > > +clear_trace
> > > --
> > > 2.43.0
> > >
> >
> >
> > --
> > Masami Hiramatsu (Google) <mhiramat@kernel.org>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* Re: [PATCH v4 0/3] dpll: add frequency monitoring feature
From: patchwork-bot+netdevbpf @ 2026-04-04 0:40 UTC (permalink / raw)
To: Ivan Vecera
Cc: netdev, arkadiusz.kubalewski, davem, donald.hunter, edumazet,
kuba, jiri, corbet, mschmidt, pabeni, poros, Prathosh.Satish,
skhan, horms, vadim.fedorenko, linux-doc, linux-kernel
In-Reply-To: <20260402184057.1890514-1-ivecera@redhat.com>
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Thu, 2 Apr 2026 20:40:54 +0200 you wrote:
> This series adds support for monitoring the measured input frequency
> of DPLL input pins via the DPLL netlink interface.
>
> Some DPLL devices can measure the actual frequency being received on
> input pins. The approach mirrors the existing phase-offset-monitor
> feature: a device-level attribute (DPLL_A_FREQUENCY_MONITOR) enables
> or disables monitoring, and a per-pin attribute
> (DPLL_A_PIN_MEASURED_FREQUENCY) exposes the measured frequency in
> millihertz (mHz) when monitoring is enabled.
>
> [...]
Here is the summary with links:
- [v4,1/3] dpll: add frequency monitoring to netlink spec
https://git.kernel.org/netdev/net-next/c/3fdea79c09d1
- [v4,2/3] dpll: add frequency monitoring callback ops
https://git.kernel.org/netdev/net-next/c/15ed91aa84ea
- [v4,3/3] dpll: zl3073x: implement frequency monitoring
https://git.kernel.org/netdev/net-next/c/bfc923b64287
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* [PATCH v5 0/1] mm/damon: add node_eligible_mem_bp and node_ineligible_mem_bp goal metrics
From: Ravi Jonnalagadda @ 2026-04-04 1:22 UTC (permalink / raw)
To: sj, damon, linux-mm, linux-kernel, linux-doc
Cc: akpm, corbet, bijan311, ajayjoshi, honggyu.kim, yunjeong.mun,
ravis.opensrc
This patch introduces two new DAMOS quota goal metrics for controlling
memory distribution in heterogeneous memory systems (e.g., DRAM and CXL
memory tiering) using physical address (PA) mode monitoring.
Changes since v4:
=================
https://lore.kernel.org/linux-mm/20260320190453.1430-1-ravis.opensrc@gmail.com/
- Fixed commit message description for DAMOS_QUOTA_NODE_INELIGIBLE_MEM_BP
per review feedback
- Added clarifying comment for ops-common.h include (for damon_get_folio())
- Fixed build error when CONFIG_DAMON_PADDR is disabled by adding
#ifdef CONFIG_DAMON_PADDR guards around functions using damon_get_folio()
- Dropped RFC tag per maintainer feedback
This patch is based on top of damon/next.
Background and Motivation
=========================
In heterogeneous memory systems, controlling memory distribution across
NUMA nodes is essential for performance optimization. This patch enables
system-wide page distribution with target-state goals such as "maintain
30% of scheme-eligible memory on CXL" using PA-mode DAMON schemes.
What These Metrics Measure
==========================
node_eligible_mem_bp:
scheme_eligible_bytes_on_node / total_scheme_eligible_bytes * 10000
node_ineligible_mem_bp:
(total_scheme_eligible_bytes - scheme_eligible_bytes_on_node) / total_scheme_eligible_bytes * 10000
These metrics are complementary: eligible_bp + ineligible_bp = 10000 bp.
Two-Scheme Setup for Hot Page Distribution
==========================================
For maintaining hot memory on DRAM (node 0) and CXL (node 1) in a 7:3
ratio:
PUSH scheme: migrate_hot from node 0 -> node 1
goal: node_ineligible_mem_bp, nid=0, target=3000
"Move hot pages from DRAM to CXL if more than 70% of hot data is
in DRAM"
PULL scheme: migrate_hot from node 1 -> node 0
goal: node_eligible_mem_bp, nid=0, target=7000
"Move hot pages from CXL to DRAM if less than 70% of hot data is
in DRAM"
The complementary goals create a feedback loop that converges to the
target distribution.
Testing Results
===============
Functionally tested on a two-node heterogeneous memory system with DRAM
(node 0) and CXL memory (node 1). A PUSH+PULL scheme configuration using
migrate_hot actions was used to reach a target hot memory ratio between
the two tiers. Testing used the TEMPORAL goal tuner available in
damon/next and mm-unstable.
With the TEMPORAL tuner, the system converges quickly to the target
distribution. The tuner drives esz to maximum when under goal and to
zero once the goal is met, forming a simple on/off feedback loop that
stabilizes at the desired ratio.
With the CONSIST tuner, the scheme still converges but more slowly, as
it migrates and then throttles itself based on quota feedback. The time
to reach the goal varies depending on workload intensity.
Note: These metrics work with both TEMPORAL and CONSIST goal tuners.
Ravi Jonnalagadda (1):
mm/damon: add node_eligible_mem_bp and node_ineligible_mem_bp goal
metrics
include/linux/damon.h | 6 ++
mm/damon/core.c | 186 ++++++++++++++++++++++++++++++++++++---
mm/damon/sysfs-schemes.c | 12 +++
3 files changed, 190 insertions(+), 14 deletions(-)
base-commit: 97eefd14af390e1921f1fc5507140025095634e0
--
2.43.0
^ permalink raw reply
* [PATCH v5 1/1] mm/damon: add node_eligible_mem_bp and node_ineligible_mem_bp goal metrics
From: Ravi Jonnalagadda @ 2026-04-04 1:22 UTC (permalink / raw)
To: sj, damon, linux-mm, linux-kernel, linux-doc
Cc: akpm, corbet, bijan311, ajayjoshi, honggyu.kim, yunjeong.mun,
ravis.opensrc, kernel test robot
In-Reply-To: <20260404012215.1539-1-ravis.opensrc@gmail.com>
Add new quota goal metrics for memory tiering that track scheme-eligible
memory distribution across NUMA nodes:
- DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: ratio of eligible memory on a node
- DAMOS_QUOTA_NODE_INELIGIBLE_MEM_BP: ratio of eligible memory that is
not on a node (10000 bp minus the eligible ratio of that node)
These complementary metrics enable push-pull migration schemes that
maintain a target memory distribution across different NUMA nodes
representing different memory tiers, based on access patterns defined
by each scheme.
The metrics iterate scheme-eligible regions and use damon_get_folio()
to determine NUMA node placement of each folio, calculating the ratio
of eligible memory on the specified node versus total eligible memory.
The implementation is guarded by CONFIG_DAMON_PADDR since damon_get_folio()
is only available when physical address space monitoring is enabled.
Suggested-by: SeongJae Park <sj@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202603251034.978zcsQ2-lkp@intel.com/
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
include/linux/damon.h | 6 ++
mm/damon/core.c | 186 ++++++++++++++++++++++++++++++++++++---
mm/damon/sysfs-schemes.c | 12 +++
3 files changed, 190 insertions(+), 14 deletions(-)
diff --git a/include/linux/damon.h b/include/linux/damon.h
index c15a7d2a05c6..98dbf6911dad 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -193,6 +193,10 @@ enum damos_action {
* @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup.
* @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio.
* @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio.
+ * @DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: Scheme-eligible memory ratio of a
+ * node.
+ * @DAMOS_QUOTA_NODE_INELIGIBLE_MEM_BP: Scheme-ineligible memory ratio of a
+ * node.
* @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics.
*
* Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@@ -206,6 +210,8 @@ enum damos_quota_goal_metric {
DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
DAMOS_QUOTA_ACTIVE_MEM_BP,
DAMOS_QUOTA_INACTIVE_MEM_BP,
+ DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP,
+ DAMOS_QUOTA_NODE_INELIGIBLE_MEM_BP,
NR_DAMOS_QUOTA_GOAL_METRICS,
};
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 5908537f45f1..f71ee19f526d 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -17,6 +17,9 @@
#include <linux/string.h>
#include <linux/string_choices.h>
+/* for damon_get_folio() used by node eligible memory metrics */
+#include "ops-common.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -2549,7 +2552,136 @@ static unsigned long damos_get_node_memcg_used_bp(
numerator = i.totalram - used_pages;
return mult_frac(numerator, 10000, i.totalram);
}
-#else
+
+#ifdef CONFIG_DAMON_PADDR
+/*
+ * damos_calc_eligible_bytes() - Calculate raw eligible bytes per node.
+ * @c: The DAMON context.
+ * @s: The scheme.
+ * @nid: The target NUMA node id.
+ * @total: Output for total eligible bytes across all nodes.
+ *
+ * Iterates through each folio in eligible regions to accurately determine
+ * which node the memory resides on. Returns eligible bytes on the specified
+ * node and sets *total to the sum across all nodes.
+ *
+ * Note: This function requires damon_get_folio() from ops-common.c, which is
+ * only available when CONFIG_DAMON_PADDR or CONFIG_DAMON_VADDR is enabled.
+ */
+static unsigned long damos_calc_eligible_bytes(struct damon_ctx *c,
+ struct damos *s, int nid, unsigned long *total)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long total_eligible = 0;
+ unsigned long node_eligible = 0;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ phys_addr_t addr, end_addr;
+
+ if (!__damos_valid_target(r, s))
+ continue;
+
+ /* Convert from core address units to physical bytes */
+ addr = r->ar.start * c->addr_unit;
+ end_addr = r->ar.end * c->addr_unit;
+ while (addr < end_addr) {
+ struct folio *folio;
+ unsigned long folio_sz, counted;
+
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (!folio) {
+ addr += PAGE_SIZE;
+ continue;
+ }
+
+ folio_sz = folio_size(folio);
+ /*
+ * Clip to region boundaries to avoid counting
+ * bytes outside the region when folio spans
+ * region boundaries.
+ */
+ counted = min(folio_sz, (unsigned long)(end_addr - addr));
+ total_eligible += counted;
+ if (folio_nid(folio) == nid)
+ node_eligible += counted;
+
+ addr += folio_sz;
+ folio_put(folio);
+ }
+ }
+ }
+
+ *total = total_eligible;
+ return node_eligible;
+}
+
+/*
+ * damos_get_node_eligible_mem_bp() - Get eligible memory ratio for a node.
+ * @c: The DAMON context.
+ * @s: The scheme.
+ * @nid: The target NUMA node id.
+ *
+ * Calculates scheme-eligible bytes on the specified node and returns the
+ * ratio in basis points (0-10000) relative to total eligible bytes across
+ * all nodes.
+ */
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ unsigned long total_eligible = 0;
+ unsigned long node_eligible = 0;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_online(nid))
+ return 0;
+
+ node_eligible = damos_calc_eligible_bytes(c, s, nid, &total_eligible);
+
+ if (!total_eligible)
+ return 0;
+
+ return mult_frac(node_eligible, 10000, total_eligible);
+}
+
+static unsigned long damos_get_node_ineligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ unsigned long total_eligible = 0;
+ unsigned long node_eligible;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_online(nid))
+ return 0;
+
+ node_eligible = damos_calc_eligible_bytes(c, s, nid, &total_eligible);
+
+ /* No eligible memory anywhere - ratio is undefined, return 0 */
+ if (!total_eligible)
+ return 0;
+
+ /* Compute ineligible ratio directly: 10000 - eligible_bp */
+ return 10000 - mult_frac(node_eligible, 10000, total_eligible);
+}
+#else /* CONFIG_DAMON_PADDR */
+/*
+ * Stub functions when CONFIG_DAMON_PADDR is disabled.
+ * The node_eligible/ineligible metrics require physical address operations
+ * to iterate folios, which are only available with PA-mode DAMON.
+ */
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ return 0;
+}
+
+static unsigned long damos_get_node_ineligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ return 0;
+}
+#endif /* CONFIG_DAMON_PADDR */
+
+#else /* CONFIG_NUMA */
static __kernel_ulong_t damos_get_node_mem_bp(
struct damos_quota_goal *goal)
{
@@ -2561,7 +2693,19 @@ static unsigned long damos_get_node_memcg_used_bp(
{
return 0;
}
-#endif
+
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ return 0;
+}
+
+static unsigned long damos_get_node_ineligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA */
/*
* Returns LRU-active or inactive memory to total LRU memory size ratio.
@@ -2581,7 +2725,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio)
return mult_frac(inactive, 10000, total);
}
-static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
+static void damos_set_quota_goal_current_value(struct damon_ctx *c,
+ struct damos *s, struct damos_quota_goal *goal)
{
u64 now_psi_total;
@@ -2608,19 +2753,28 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = damos_get_in_active_mem_bp(
goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP);
break;
+ case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
+ goal->current_value = damos_get_node_eligible_mem_bp(c, s,
+ goal->nid);
+ break;
+ case DAMOS_QUOTA_NODE_INELIGIBLE_MEM_BP:
+ goal->current_value = damos_get_node_ineligible_mem_bp(c, s,
+ goal->nid);
+ break;
default:
break;
}
}
/* Return the highest score since it makes schemes least aggressive */
-static unsigned long damos_quota_score(struct damos_quota *quota)
+static unsigned long damos_quota_score(struct damon_ctx *c, struct damos *s)
{
+ struct damos_quota *quota = &s->quota;
struct damos_quota_goal *goal;
unsigned long highest_score = 0;
damos_for_each_quota_goal(goal, quota) {
- damos_set_quota_goal_current_value(goal);
+ damos_set_quota_goal_current_value(c, s, goal);
highest_score = max(highest_score,
mult_frac(goal->current_value, 10000,
goal->target_value));
@@ -2629,17 +2783,20 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
return highest_score;
}
-static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota)
+static void damos_goal_tune_esz_bp_consist(struct damon_ctx *c, struct damos *s)
{
- unsigned long score = damos_quota_score(quota);
+ struct damos_quota *quota = &s->quota;
+ unsigned long score = damos_quota_score(c, s);
quota->esz_bp = damon_feed_loop_next_input(
max(quota->esz_bp, 10000UL), score);
}
-static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
+static void damos_goal_tune_esz_bp_temporal(struct damon_ctx *c,
+ struct damos *s)
{
- unsigned long score = damos_quota_score(quota);
+ struct damos_quota *quota = &s->quota;
+ unsigned long score = damos_quota_score(c, s);
if (score >= 10000)
quota->esz_bp = 0;
@@ -2652,8 +2809,9 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
/*
* Called only if quota->ms, or quota->sz are set, or quota->goals is not empty
*/
-static void damos_set_effective_quota(struct damos_quota *quota)
+static void damos_set_effective_quota(struct damon_ctx *c, struct damos *s)
{
+ struct damos_quota *quota = &s->quota;
unsigned long throughput;
unsigned long esz = ULONG_MAX;
@@ -2664,9 +2822,9 @@ static void damos_set_effective_quota(struct damos_quota *quota)
if (!list_empty(&quota->goals)) {
if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST)
- damos_goal_tune_esz_bp_consist(quota);
+ damos_goal_tune_esz_bp_consist(c, s);
else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL)
- damos_goal_tune_esz_bp_temporal(quota);
+ damos_goal_tune_esz_bp_temporal(c, s);
esz = quota->esz_bp / 10000;
}
@@ -2715,7 +2873,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
/* First charge window */
if (!quota->total_charged_sz && !quota->charged_from) {
quota->charged_from = jiffies;
- damos_set_effective_quota(quota);
+ damos_set_effective_quota(c, s);
if (trace_damos_esz_enabled())
damos_trace_esz(c, s, quota);
}
@@ -2737,7 +2895,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
quota->charged_sz = 0;
if (trace_damos_esz_enabled())
cached_esz = quota->esz;
- damos_set_effective_quota(quota);
+ damos_set_effective_quota(c, s);
if (trace_damos_esz_enabled() && quota->esz != cached_esz)
damos_trace_esz(c, s, quota);
}
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index bf923709ab91..7e9cd19d5bff 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1084,6 +1084,14 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
.metric = DAMOS_QUOTA_INACTIVE_MEM_BP,
.name = "inactive_mem_bp",
},
+ {
+ .metric = DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP,
+ .name = "node_eligible_mem_bp",
+ },
+ {
+ .metric = DAMOS_QUOTA_NODE_INELIGIBLE_MEM_BP,
+ .name = "node_ineligible_mem_bp",
+ },
};
static ssize_t target_metric_show(struct kobject *kobj,
@@ -2717,6 +2725,10 @@ static int damos_sysfs_add_quota_score(
case DAMOS_QUOTA_NODE_MEM_FREE_BP:
goal->nid = sysfs_goal->nid;
break;
+ case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
+ case DAMOS_QUOTA_NODE_INELIGIBLE_MEM_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
case DAMOS_QUOTA_NODE_MEMCG_USED_BP:
case DAMOS_QUOTA_NODE_MEMCG_FREE_BP:
err = damon_sysfs_memcg_path_to_id(
--
2.43.0
^ permalink raw reply related
* [PATCH] docs: usb: document USBDEVFS_BULK return value
From: Adeel Zahid @ 2026-04-04 1:41 UTC (permalink / raw)
To: Jonathan Corbet; +Cc: Shuah Khan, linux-doc, linux-kernel, Adeel Zahid
Replace the FIXME in the usbfs bulk I/O documentation with the
current behavior.
Document that USBDEVFS_BULK returns the completed URB actual_length
on success, which may be smaller than the requested len. Also clarify
that for IN endpoints only the returned number of bytes is copied into
the userspace buffer, so a smaller return value indicates a short
read.
Signed-off-by: Adeel Zahid <adeel.m.zahid@gmail.com>
---
Documentation/driver-api/usb/usb.rst | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/Documentation/driver-api/usb/usb.rst b/Documentation/driver-api/usb/usb.rst
index 7f2f41e80c1c..6b6a759c1f62 100644
--- a/Documentation/driver-api/usb/usb.rst
+++ b/Documentation/driver-api/usb/usb.rst
@@ -535,9 +535,19 @@ USBDEVFS_BULK
The ``ep`` value identifies a bulk endpoint number (1 to 15, as
identified in an endpoint descriptor), masked with USB_DIR_IN when
referring to an endpoint which sends data to the host from the
- device. The length of the data buffer is identified by ``len``; Recent
- kernels support requests up to about 128KBytes. *FIXME say how read
- length is returned, and how short reads are handled.*.
+ device. The length of the data buffer is identified by ``len``. Recent
+ kernels support requests up to about 128 KBytes.
+
+ On success, the ioctl returns the completed URB's ``actual_length``
+ value, that is, the number of bytes actually transferred for the
+ request. This may be less than the value requested in ``len``.
+
+ For an IN endpoint, the return value tells userspace how many bytes were
+ read and copied into ``data``. If the return value is smaller than
+ ``len``, the read completed as a short read, and only the returned
+ number of bytes is valid in the buffer.
+
+ Failures return a negative errno value.
USBDEVFS_CLEAR_HALT
Clears endpoint halt (stall) and resets the endpoint toggle. This is
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Jim Mattson @ 2026-04-04 2:21 UTC (permalink / raw)
To: Pawan Gupta
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <20260404002149.wtayv6a64vzuppgp@desk>
On Fri, Apr 3, 2026 at 5:22 PM Pawan Gupta
<pawan.kumar.gupta@linux.intel.com> wrote:
>
> On Fri, Apr 03, 2026 at 04:39:54PM -0700, Jim Mattson wrote:
> > > Since cloud providers have greater control over userspace, the decision to
> > > use BHI_DIS_S or not can be left to them. KVM would simply follow what it
> > > is asked to do by the userspace.
> >
> > I feel like we've gone over this before, but if userspace tells KVM
> > not to enable BHI_DIS_S, how do we inform Windows that it needs to do
> > the longer clearing sequence, despite the fact that the virtual CPU is
> > masquerading as Ice Lake?
>
> IMO, if an OS is allergic to a hardware mitigation, and is also aware that
> it is virtualized, it should default to a sw mitigation that works everywhere.
Agreed. So, without any information to the contrary, VMs should assume
the long BHB clearing sequence is required.
Returning to my earlier comment, the test should be:
+ if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL) ||
cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) {
+ bhb_seq_outer_loop = 12;
+ bhb_seq_inner_loop = 7;
+ }
^ permalink raw reply
* Re: [PATCH v8 2/3] RISC-V: KVM: Cache gstage pgd_levels in struct kvm_gstage
From: Nutty.Liu @ 2026-04-04 3:25 UTC (permalink / raw)
To: fangyu.yu, pbonzini, corbet, anup, atish.patra, pjw, palmer, aou,
alex, skhan
Cc: guoren, radim.krcmar, andrew.jones, linux-doc, kvm, kvm-riscv,
linux-riscv, linux-kernel
In-Reply-To: <20260403153019.9916-3-fangyu.yu@linux.alibaba.com>
On 4/3/2026 11:30 PM, fangyu.yu@linux.alibaba.com wrote:
> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>
> Gstage page-table helpers frequently chase gstage->kvm->arch to
> fetch pgd_levels. This adds noise and repeats the same dereference
> chain in hot paths.
>
> Add pgd_levels to struct kvm_gstage and initialize it from kvm->arch
> when setting up a gstage instance. Introduce kvm_riscv_gstage_init()
> to centralize initialization and switch gstage code to use
> gstage->pgd_levels.
>
> Suggested-by: Anup Patel <anup@brainfault.org>
> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Thanks,
Nutty
> ---
> arch/riscv/include/asm/kvm_gstage.h | 10 ++++++
> arch/riscv/kvm/gstage.c | 10 +++---
> arch/riscv/kvm/mmu.c | 50 ++++++-----------------------
> 3 files changed, 25 insertions(+), 45 deletions(-)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 5aa58d1f692a..70d9d483365e 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -15,6 +15,7 @@ struct kvm_gstage {
> #define KVM_GSTAGE_FLAGS_LOCAL BIT(0)
> unsigned long vmid;
> pgd_t *pgd;
> + unsigned long pgd_levels;
> };
>
> struct kvm_gstage_mapping {
> @@ -92,4 +93,13 @@ static inline unsigned long kvm_riscv_gstage_mode(unsigned long pgd_levels)
> }
> }
>
> +static inline void kvm_riscv_gstage_init(struct kvm_gstage *gstage, struct kvm *kvm)
> +{
> + gstage->kvm = kvm;
> + gstage->flags = 0;
> + gstage->vmid = READ_ONCE(kvm->arch.vmid.vmid);
> + gstage->pgd = kvm->arch.pgd;
> + gstage->pgd_levels = kvm->arch.pgd_levels;
> +}
> +
> #endif
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index 4beb9322fe76..7c4c34bc191b 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -26,7 +26,7 @@ static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
> unsigned long mask;
> unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
>
> - if (level == gstage->kvm->arch.pgd_levels - 1)
> + if (level == gstage->pgd_levels - 1)
> mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
> else
> mask = PTRS_PER_PTE - 1;
> @@ -45,7 +45,7 @@ static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long pa
> u32 i;
> unsigned long psz = 1UL << 12;
>
> - for (i = 0; i < gstage->kvm->arch.pgd_levels; i++) {
> + for (i = 0; i < gstage->pgd_levels; i++) {
> if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
> *out_level = i;
> return 0;
> @@ -58,7 +58,7 @@ static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long pa
> static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
> unsigned long *out_pgorder)
> {
> - if (gstage->kvm->arch.pgd_levels < level)
> + if (gstage->pgd_levels < level)
> return -EINVAL;
>
> *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
> @@ -83,7 +83,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
> pte_t **ptepp, u32 *ptep_level)
> {
> pte_t *ptep;
> - u32 current_level = gstage->kvm->arch.pgd_levels - 1;
> + u32 current_level = gstage->pgd_levels - 1;
>
> *ptep_level = current_level;
> ptep = (pte_t *)gstage->pgd;
> @@ -127,7 +127,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
> struct kvm_mmu_memory_cache *pcache,
> const struct kvm_gstage_mapping *map)
> {
> - u32 current_level = gstage->kvm->arch.pgd_levels - 1;
> + u32 current_level = gstage->pgd_levels - 1;
> pte_t *next_ptep = (pte_t *)gstage->pgd;
> pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index fbcdd75cb9af..2d3def024270 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -24,10 +24,7 @@ static void mmu_wp_memory_region(struct kvm *kvm, int slot)
> phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
> struct kvm_gstage gstage;
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
>
> spin_lock(&kvm->mmu_lock);
> kvm_riscv_gstage_wp_range(&gstage, start, end);
> @@ -49,10 +46,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
> struct kvm_gstage_mapping map;
> struct kvm_gstage gstage;
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
>
> end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
> pfn = __phys_to_pfn(hpa);
> @@ -89,10 +83,7 @@ void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
> {
> struct kvm_gstage gstage;
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
>
> spin_lock(&kvm->mmu_lock);
> kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
> @@ -109,10 +100,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
> phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
> struct kvm_gstage gstage;
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
>
> kvm_riscv_gstage_wp_range(&gstage, start, end);
> }
> @@ -141,10 +129,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> phys_addr_t size = slot->npages << PAGE_SHIFT;
> struct kvm_gstage gstage;
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
>
> spin_lock(&kvm->mmu_lock);
> kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
> @@ -250,10 +235,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> if (!kvm->arch.pgd)
> return false;
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
> mmu_locked = spin_trylock(&kvm->mmu_lock);
> kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
> (range->end - range->start) << PAGE_SHIFT,
> @@ -275,10 +257,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>
> WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
> if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT,
> &ptep, &ptep_level))
> return false;
> @@ -298,10 +277,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>
> WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
> if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT,
> &ptep, &ptep_level))
> return false;
> @@ -463,10 +439,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> struct kvm_gstage gstage;
> struct page *page;
>
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
>
> /* Setup initial state of output mapping */
> memset(out_map, 0, sizeof(*out_map));
> @@ -587,10 +560,7 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>
> spin_lock(&kvm->mmu_lock);
> if (kvm->arch.pgd) {
> - gstage.kvm = kvm;
> - gstage.flags = 0;
> - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> - gstage.pgd = kvm->arch.pgd;
> + kvm_riscv_gstage_init(&gstage, kvm);
> kvm_riscv_gstage_unmap_range(&gstage, 0UL,
> kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels), false);
> pgd = READ_ONCE(kvm->arch.pgd);
^ permalink raw reply
* Re: [PATCH v8 3/3] RISC-V: KVM: Reuse KVM_CAP_VM_GPA_BITS to select HGATP.MODE
From: Nutty.Liu @ 2026-04-04 3:25 UTC (permalink / raw)
To: fangyu.yu, pbonzini, corbet, anup, atish.patra, pjw, palmer, aou,
alex, skhan
Cc: guoren, radim.krcmar, andrew.jones, linux-doc, kvm, kvm-riscv,
linux-riscv, linux-kernel
In-Reply-To: <20260403153019.9916-4-fangyu.yu@linux.alibaba.com>
On 4/3/2026 11:30 PM, fangyu.yu@linux.alibaba.com wrote:
> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>
> Reuse KVM_CAP_VM_GPA_BITS to advertise and select the effective
> G-stage GPA width for a VM.
>
> KVM_CHECK_EXTENSION(KVM_CAP_VM_GPA_BITS) returns the effective GPA
> bits for a VM, KVM_ENABLE_CAP(KVM_CAP_VM_GPA_BITS) allows userspace
> to downsize the effective GPA width by selecting a smaller G-stage
> page table format:
> - gpa_bits <= 41 selects Sv39x4 (pgd_levels=3)
> - gpa_bits <= 50 selects Sv48x4 (pgd_levels=4)
> - gpa_bits <= 59 selects Sv57x4 (pgd_levels=5)
>
> Reject the request with -EINVAL for unsupported values and with -EBUSY
> if vCPUs have been created or any memslot is populated.
>
> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> Reviewed-by: Andrew Jones <andrew.jones@oss.qualcomm.com>
> Reviewed-by: Guo Ren <guoren@kernel.org>
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Thanks,
Nutty
> ---
> arch/riscv/kvm/vm.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 42 insertions(+), 2 deletions(-)
>
> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
> index fb7c4e07961f..a9f083feeb76 100644
> --- a/arch/riscv/kvm/vm.c
> +++ b/arch/riscv/kvm/vm.c
> @@ -214,12 +214,52 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>
> int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
> {
> + if (cap->flags)
> + return -EINVAL;
> +
> switch (cap->cap) {
> case KVM_CAP_RISCV_MP_STATE_RESET:
> - if (cap->flags)
> - return -EINVAL;
> kvm->arch.mp_state_reset = true;
> return 0;
> + case KVM_CAP_VM_GPA_BITS: {
> + unsigned long gpa_bits = cap->args[0];
> + unsigned long new_levels;
> + int r = 0;
> +
> + /* Decide target pgd levels from requested gpa_bits */
> +#ifdef CONFIG_64BIT
> + if (gpa_bits <= 41)
> + new_levels = 3; /* Sv39x4 */
> + else if (gpa_bits <= 50)
> + new_levels = 4; /* Sv48x4 */
> + else if (gpa_bits <= 59)
> + new_levels = 5; /* Sv57x4 */
> + else
> + return -EINVAL;
> +#else
> + /* 32-bit: only Sv32x4*/
> + if (gpa_bits <= 34)
> + new_levels = 2;
> + else
> + return -EINVAL;
> +#endif
> + if (new_levels > kvm_riscv_gstage_max_pgd_levels)
> + return -EINVAL;
> +
> + /* Follow KVM's lock ordering: kvm->lock -> kvm->slots_lock. */
> + mutex_lock(&kvm->lock);
> + mutex_lock(&kvm->slots_lock);
> +
> + if (kvm->created_vcpus || !kvm_are_all_memslots_empty(kvm))
> + r = -EBUSY;
> + else
> + kvm->arch.pgd_levels = new_levels;
> +
> + mutex_unlock(&kvm->slots_lock);
> + mutex_unlock(&kvm->lock);
> +
> + return r;
> + }
> default:
> return -EINVAL;
> }
^ permalink raw reply
* Re: [PATCH v8 1/3] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
From: Nutty.Liu @ 2026-04-04 3:27 UTC (permalink / raw)
To: fangyu.yu, pbonzini, corbet, anup, atish.patra, pjw, palmer, aou,
alex, skhan
Cc: guoren, radim.krcmar, andrew.jones, linux-doc, kvm, kvm-riscv,
linux-riscv, linux-kernel
In-Reply-To: <20260403153019.9916-2-fangyu.yu@linux.alibaba.com>
On 4/3/2026 11:30 PM, fangyu.yu@linux.alibaba.com wrote:
> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>
> Introduces one per-VM architecture-specific field to support runtime
> configuration of the G-stage page table format:
>
> - kvm->arch.pgd_levels: the corresponding number of page table levels
> for the selected mode.
>
> This field replaces the previous global variables
> kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different
> virtual machines to independently select their G-stage page table format
> instead of being forced to share the maximum mode detected by the kernel
> at boot time.
>
> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> Reviewed-by: Andrew Jones <andrew.jones@oss.qualcomm.com>
> Reviewed-by: Anup Patel <anup@brainfault.org>
> Reviewed-by: Guo Ren <guoren@kernel.org>
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Thanks,
Nutty
> ---
> arch/riscv/include/asm/kvm_gstage.h | 37 ++++++++++++----
> arch/riscv/include/asm/kvm_host.h | 1 +
> arch/riscv/kvm/gstage.c | 65 ++++++++++++++---------------
> arch/riscv/kvm/main.c | 12 +++---
> arch/riscv/kvm/mmu.c | 20 +++++----
> arch/riscv/kvm/vm.c | 5 ++-
> arch/riscv/kvm/vmid.c | 3 +-
> 7 files changed, 86 insertions(+), 57 deletions(-)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e2183173e..5aa58d1f692a 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -29,16 +29,22 @@ struct kvm_gstage_mapping {
> #define kvm_riscv_gstage_index_bits 10
> #endif
>
> -extern unsigned long kvm_riscv_gstage_mode;
> -extern unsigned long kvm_riscv_gstage_pgd_levels;
> +extern unsigned long kvm_riscv_gstage_max_pgd_levels;
>
> #define kvm_riscv_gstage_pgd_xbits 2
> #define kvm_riscv_gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
> -#define kvm_riscv_gstage_gpa_bits (HGATP_PAGE_SHIFT + \
> - (kvm_riscv_gstage_pgd_levels * \
> - kvm_riscv_gstage_index_bits) + \
> - kvm_riscv_gstage_pgd_xbits)
> -#define kvm_riscv_gstage_gpa_size ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
> +
> +static inline unsigned long kvm_riscv_gstage_gpa_bits(unsigned long pgd_levels)
> +{
> + return (HGATP_PAGE_SHIFT +
> + pgd_levels * kvm_riscv_gstage_index_bits +
> + kvm_riscv_gstage_pgd_xbits);
> +}
> +
> +static inline gpa_t kvm_riscv_gstage_gpa_size(unsigned long pgd_levels)
> +{
> + return BIT_ULL(kvm_riscv_gstage_gpa_bits(pgd_levels));
> +}
>
> bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
> pte_t **ptepp, u32 *ptep_level);
> @@ -69,4 +75,21 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>
> void kvm_riscv_gstage_mode_detect(void);
>
> +static inline unsigned long kvm_riscv_gstage_mode(unsigned long pgd_levels)
> +{
> + switch (pgd_levels) {
> + case 2:
> + return HGATP_MODE_SV32X4;
> + case 3:
> + return HGATP_MODE_SV39X4;
> + case 4:
> + return HGATP_MODE_SV48X4;
> + case 5:
> + return HGATP_MODE_SV57X4;
> + default:
> + WARN_ON_ONCE(1);
> + return HGATP_MODE_OFF;
> + }
> +}
> +
> #endif
> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> index 24585304c02b..478f699e9dec 100644
> --- a/arch/riscv/include/asm/kvm_host.h
> +++ b/arch/riscv/include/asm/kvm_host.h
> @@ -94,6 +94,7 @@ struct kvm_arch {
> /* G-stage page table */
> pgd_t *pgd;
> phys_addr_t pgd_phys;
> + unsigned long pgd_levels;
>
> /* Guest Timer */
> struct kvm_guest_timer timer;
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index b67d60d722c2..4beb9322fe76 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -12,22 +12,21 @@
> #include <asm/kvm_gstage.h>
>
> #ifdef CONFIG_64BIT
> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3;
> #else
> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2;
> #endif
>
> #define gstage_pte_leaf(__ptep) \
> (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
>
> -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
> +static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
> + gpa_t addr, u32 level)
> {
> unsigned long mask;
> unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
>
> - if (level == (kvm_riscv_gstage_pgd_levels - 1))
> + if (level == gstage->kvm->arch.pgd_levels - 1)
> mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
> else
> mask = PTRS_PER_PTE - 1;
> @@ -40,12 +39,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
> return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
> }
>
> -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
> +static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size,
> + u32 *out_level)
> {
> u32 i;
> unsigned long psz = 1UL << 12;
>
> - for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
> + for (i = 0; i < gstage->kvm->arch.pgd_levels; i++) {
> if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
> *out_level = i;
> return 0;
> @@ -55,21 +55,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
> return -EINVAL;
> }
>
> -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
> +static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
> + unsigned long *out_pgorder)
> {
> - if (kvm_riscv_gstage_pgd_levels < level)
> + if (gstage->kvm->arch.pgd_levels < level)
> return -EINVAL;
>
> *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
> return 0;
> }
>
> -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
> +static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level,
> + unsigned long *out_pgsize)
> {
> int rc;
> unsigned long page_order = PAGE_SHIFT;
>
> - rc = gstage_level_to_page_order(level, &page_order);
> + rc = gstage_level_to_page_order(gstage, level, &page_order);
> if (rc)
> return rc;
>
> @@ -81,11 +83,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
> pte_t **ptepp, u32 *ptep_level)
> {
> pte_t *ptep;
> - u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> + u32 current_level = gstage->kvm->arch.pgd_levels - 1;
>
> *ptep_level = current_level;
> ptep = (pte_t *)gstage->pgd;
> - ptep = &ptep[gstage_pte_index(addr, current_level)];
> + ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
> while (ptep && pte_val(ptep_get(ptep))) {
> if (gstage_pte_leaf(ptep)) {
> *ptep_level = current_level;
> @@ -97,7 +99,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
> current_level--;
> *ptep_level = current_level;
> ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
> - ptep = &ptep[gstage_pte_index(addr, current_level)];
> + ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
> } else {
> ptep = NULL;
> }
> @@ -110,7 +112,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
> {
> unsigned long order = PAGE_SHIFT;
>
> - if (gstage_level_to_page_order(level, &order))
> + if (gstage_level_to_page_order(gstage, level, &order))
> return;
> addr &= ~(BIT(order) - 1);
>
> @@ -125,9 +127,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
> struct kvm_mmu_memory_cache *pcache,
> const struct kvm_gstage_mapping *map)
> {
> - u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> + u32 current_level = gstage->kvm->arch.pgd_levels - 1;
> pte_t *next_ptep = (pte_t *)gstage->pgd;
> - pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
> + pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>
> if (current_level < map->level)
> return -EINVAL;
> @@ -151,7 +153,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
> }
>
> current_level--;
> - ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
> + ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
> }
>
> if (pte_val(*ptep) != pte_val(map->pte)) {
> @@ -175,7 +177,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
> out_map->addr = gpa;
> out_map->level = 0;
>
> - ret = gstage_page_size_to_level(page_size, &out_map->level);
> + ret = gstage_page_size_to_level(gstage, page_size, &out_map->level);
> if (ret)
> return ret;
>
> @@ -217,7 +219,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
> u32 next_ptep_level;
> unsigned long next_page_size, page_size;
>
> - ret = gstage_level_to_page_size(ptep_level, &page_size);
> + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
> if (ret)
> return;
>
> @@ -229,7 +231,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
> if (ptep_level && !gstage_pte_leaf(ptep)) {
> next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
> next_ptep_level = ptep_level - 1;
> - ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
> + ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
> if (ret)
> return;
>
> @@ -263,7 +265,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>
> while (addr < end) {
> found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> - ret = gstage_level_to_page_size(ptep_level, &page_size);
> + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
> if (ret)
> break;
>
> @@ -297,7 +299,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>
> while (addr < end) {
> found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> - ret = gstage_level_to_page_size(ptep_level, &page_size);
> + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
> if (ret)
> break;
>
> @@ -319,39 +321,34 @@ void __init kvm_riscv_gstage_mode_detect(void)
> /* Try Sv57x4 G-stage mode */
> csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
> if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
> - kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
> - kvm_riscv_gstage_pgd_levels = 5;
> + kvm_riscv_gstage_max_pgd_levels = 5;
> goto done;
> }
>
> /* Try Sv48x4 G-stage mode */
> csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
> if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
> - kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
> - kvm_riscv_gstage_pgd_levels = 4;
> + kvm_riscv_gstage_max_pgd_levels = 4;
> goto done;
> }
>
> /* Try Sv39x4 G-stage mode */
> csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
> if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) {
> - kvm_riscv_gstage_mode = HGATP_MODE_SV39X4;
> - kvm_riscv_gstage_pgd_levels = 3;
> + kvm_riscv_gstage_max_pgd_levels = 3;
> goto done;
> }
> #else /* CONFIG_32BIT */
> /* Try Sv32x4 G-stage mode */
> csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
> if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) {
> - kvm_riscv_gstage_mode = HGATP_MODE_SV32X4;
> - kvm_riscv_gstage_pgd_levels = 2;
> + kvm_riscv_gstage_max_pgd_levels = 2;
> goto done;
> }
> #endif
>
> /* KVM depends on !HGATP_MODE_OFF */
> - kvm_riscv_gstage_mode = HGATP_MODE_OFF;
> - kvm_riscv_gstage_pgd_levels = 0;
> + kvm_riscv_gstage_max_pgd_levels = 0;
>
> done:
> csr_write(CSR_HGATP, 0);
> diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
> index 0f3fe3986fc0..90ee0a032b9a 100644
> --- a/arch/riscv/kvm/main.c
> +++ b/arch/riscv/kvm/main.c
> @@ -105,17 +105,17 @@ static int __init riscv_kvm_init(void)
> return rc;
>
> kvm_riscv_gstage_mode_detect();
> - switch (kvm_riscv_gstage_mode) {
> - case HGATP_MODE_SV32X4:
> + switch (kvm_riscv_gstage_max_pgd_levels) {
> + case 2:
> str = "Sv32x4";
> break;
> - case HGATP_MODE_SV39X4:
> + case 3:
> str = "Sv39x4";
> break;
> - case HGATP_MODE_SV48X4:
> + case 4:
> str = "Sv48x4";
> break;
> - case HGATP_MODE_SV57X4:
> + case 5:
> str = "Sv57x4";
> break;
> default:
> @@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void)
> (rc) ? slist : "no features");
> }
>
> - kvm_info("using %s G-stage page table format\n", str);
> + kvm_info("highest G-stage page table mode is %s\n", str);
>
> kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
>
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 088d33ba90ed..fbcdd75cb9af 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
> if (!writable)
> map.pte = pte_wrprotect(map.pte);
>
> - ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
> + ret = kvm_mmu_topup_memory_cache(&pcache, kvm->arch.pgd_levels);
> if (ret)
> goto out;
>
> @@ -186,7 +186,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> * space addressable by the KVM guest GPA space.
> */
> if ((new->base_gfn + new->npages) >=
> - (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
> + kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels) >> PAGE_SHIFT)
> return -EFAULT;
>
> hva = new->userspace_addr;
> @@ -472,7 +472,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> memset(out_map, 0, sizeof(*out_map));
>
> /* We need minimum second+third level pages */
> - ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
> + ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.pgd_levels);
> if (ret) {
> kvm_err("Failed to topup G-stage cache\n");
> return ret;
> @@ -575,6 +575,7 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
> return -ENOMEM;
> kvm->arch.pgd = page_to_virt(pgd_page);
> kvm->arch.pgd_phys = page_to_phys(pgd_page);
> + kvm->arch.pgd_levels = kvm_riscv_gstage_max_pgd_levels;
>
> return 0;
> }
> @@ -590,10 +591,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
> gstage.flags = 0;
> gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
> gstage.pgd = kvm->arch.pgd;
> - kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
> + kvm_riscv_gstage_unmap_range(&gstage, 0UL,
> + kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels), false);
> pgd = READ_ONCE(kvm->arch.pgd);
> kvm->arch.pgd = NULL;
> kvm->arch.pgd_phys = 0;
> + kvm->arch.pgd_levels = 0;
> }
> spin_unlock(&kvm->mmu_lock);
>
> @@ -603,11 +606,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>
> void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
> {
> - unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
> - struct kvm_arch *k = &vcpu->kvm->arch;
> + struct kvm_arch *ka = &vcpu->kvm->arch;
> + unsigned long hgatp = kvm_riscv_gstage_mode(ka->pgd_levels)
> + << HGATP_MODE_SHIFT;
>
> - hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
> - hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
> + hgatp |= (READ_ONCE(ka->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
> + hgatp |= (ka->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
>
> ncsr_write(CSR_HGATP, hgatp);
>
> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
> index 13c63ae1a78b..fb7c4e07961f 100644
> --- a/arch/riscv/kvm/vm.c
> +++ b/arch/riscv/kvm/vm.c
> @@ -199,7 +199,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> r = KVM_USER_MEM_SLOTS;
> break;
> case KVM_CAP_VM_GPA_BITS:
> - r = kvm_riscv_gstage_gpa_bits;
> + if (!kvm)
> + r = kvm_riscv_gstage_gpa_bits(kvm_riscv_gstage_max_pgd_levels);
> + else
> + r = kvm_riscv_gstage_gpa_bits(kvm->arch.pgd_levels);
> break;
> default:
> r = 0;
> diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
> index cf34d448289d..c15bdb1dd8be 100644
> --- a/arch/riscv/kvm/vmid.c
> +++ b/arch/riscv/kvm/vmid.c
> @@ -26,7 +26,8 @@ static DEFINE_SPINLOCK(vmid_lock);
> void __init kvm_riscv_gstage_vmid_detect(void)
> {
> /* Figure-out number of VMID bits in HW */
> - csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
> + csr_write(CSR_HGATP, (kvm_riscv_gstage_mode(kvm_riscv_gstage_max_pgd_levels) <<
> + HGATP_MODE_SHIFT) | HGATP_VMID);
> vmid_bits = csr_read(CSR_HGATP);
> vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
> vmid_bits = fls_long(vmid_bits);
^ permalink raw reply
* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Pawan Gupta @ 2026-04-04 3:49 UTC (permalink / raw)
To: Jim Mattson
Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <CALMp9eSqgL5q-MY1xpjqR5oRn5_cb=mfEhNFWusNneS=Mx8UMg@mail.gmail.com>
On Fri, Apr 03, 2026 at 07:21:02PM -0700, Jim Mattson wrote:
> On Fri, Apr 3, 2026 at 5:22 PM Pawan Gupta
> <pawan.kumar.gupta@linux.intel.com> wrote:
> >
> > On Fri, Apr 03, 2026 at 04:39:54PM -0700, Jim Mattson wrote:
> > > > Since cloud providers have greater control over userspace, the decision to
> > > > use BHI_DIS_S or not can be left to them. KVM would simply follow what it
> > > > is asked to do by the userspace.
> > >
> > > I feel like we've gone over this before, but if userspace tells KVM
> > > not to enable BHI_DIS_S, how do we inform Windows that it needs to do
> > > the longer clearing sequence, despite the fact that the virtual CPU is
> > > masquerading as Ice Lake?
> >
> > IMO, if an OS is allergic to a hardware mitigation, and is also aware that
> > it is virtualized, it should default to a sw mitigation that works everywhere.
>
> Agreed. So, without any information to the contrary, VMs should assume
> the long BHB clearing sequence is required.
>
> Returning to my earlier comment, the test should be:
>
> + if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL) ||
> cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) {
> + bhb_seq_outer_loop = 12;
> + bhb_seq_inner_loop = 7;
> + }
To be clear, my comment was for an OS that doesn't want BHI_DIS_S
under the hood with virtual-SPEC_CTRL. Linux doesn't have that problem;
hardware mitigation on Linux is perfectly okay.
Without virtual-SPEC_CTRL, the problem set is limited to guests that
migrate across Alder Lake generation CPUs. As you mentioned, the change in
MAXPHYADDR makes it unlikely.
With virtual-SPEC_CTRL support, guests that fall into the subset that
migrate in spite of the MAXPHYADDR change would also be mitigated. Then, on top
of hardware mitigation, deploying the long sequence in the guest would
incur a significant performance penalty for no good reason.
^ permalink raw reply
* Re: [PATCH] hwmon: (asus-ec-sensors) add ROG STRIX B650E-E GAMING WIFI
From: Guenter Roeck @ 2026-04-04 4:38 UTC (permalink / raw)
To: Eugene Shalygin
Cc: Veronika Kossmann, Veronika Kossmann, Jonathan Corbet, Shuah Khan,
linux-hwmon, linux-doc, linux-kernel
In-Reply-To: <20260403210343.1380437-1-eugene.shalygin@gmail.com>
On 4/3/26 14:03, Eugene Shalygin wrote:
> From: Veronika Kossmann <nanodesuu@gmail.com>
>
> Add support for ROG STRIX B650E-E GAMING WIFI
>
> Signed-off-by: Veronika Kossmann <desu.git@rxtx.cx>
> Signed-off-by: Eugene Shalygin <eugene.shalygin@gmail.com>
Sashiko has a problem with this patch:
https://sashiko.dev/#/patchset/20260403210343.1380437-1-eugene.shalygin%40gmail.com
I never paid attention, but it seems to me that it has a point.
Assuming the concern is valid, that makes me wonder: Do other boards
have similar problems?
Thanks,
Guenter
> ---
> Documentation/hwmon/asus_ec_sensors.rst | 1 +
> drivers/hwmon/asus-ec-sensors.c | 11 ++++++++++-
> 2 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst
> index 9ad3f0a57f55..e14419811aac 100644
> --- a/Documentation/hwmon/asus_ec_sensors.rst
> +++ b/Documentation/hwmon/asus_ec_sensors.rst
> @@ -31,6 +31,7 @@ Supported boards:
> * ROG MAXIMUS Z690 FORMULA
> * ROG STRIX B550-E GAMING
> * ROG STRIX B550-I GAMING
> + * ROG STRIX B650E-E GAMING WIFI
> * ROG STRIX B650E-I GAMING WIFI
> * ROG STRIX B850-I GAMING WIFI
> * ROG STRIX X470-F GAMING
> diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c
> index 070bb368f2b7..8c53cd9ed8f3 100644
> --- a/drivers/hwmon/asus-ec-sensors.c
> +++ b/drivers/hwmon/asus-ec-sensors.c
> @@ -274,7 +274,7 @@ static const struct ec_sensor_info sensors_family_amd_600[] = {
> [ec_sensor_temp_cpu_package] =
> EC_SENSOR("CPU Package", hwmon_temp, 1, 0x00, 0x31),
> [ec_sensor_temp_mb] =
> - EC_SENSOR("Motherboard", hwmon_temp, 1, 0x00, 0x32),
> + EC_SENSOR("Motherboard", hwmon_temp, 1, 0x00, 0x32),
> [ec_sensor_temp_vrm] =
> EC_SENSOR("VRM", hwmon_temp, 1, 0x00, 0x33),
> [ec_sensor_temp_t_sensor] =
> @@ -616,6 +616,13 @@ static const struct ec_board_info board_info_strix_b550_i_gaming = {
> .family = family_amd_500_series,
> };
>
> +static const struct ec_board_info board_info_strix_b650e_e_gaming = {
> + .sensors = SENSOR_TEMP_VRM | SENSOR_SET_TEMP_CHIPSET_CPU_MB |
> + SENSOR_IN_CPU_CORE,
> + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0,
> + .family = family_amd_600_series,
> +};
> +
> static const struct ec_board_info board_info_strix_b650e_i_gaming = {
> .sensors = SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR |
> SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_IN_CPU_CORE,
> @@ -861,6 +868,8 @@ static const struct dmi_system_id dmi_table[] = {
> &board_info_strix_b550_e_gaming),
> DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-I GAMING",
> &board_info_strix_b550_i_gaming),
> + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B650E-E GAMING WIFI",
> + &board_info_strix_b650e_e_gaming),
> DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B650E-I GAMING WIFI",
> &board_info_strix_b650e_i_gaming),
> DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B850-I GAMING WIFI",
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox