* [PATCH 1/4] vfio/pci: Add PCIe TPH interface with capability query
2026-04-15 9:09 [PATCH 0/4] vfio/pci: Add PCIe TPH support Chengwen Feng
@ 2026-04-15 9:09 ` Chengwen Feng
2026-04-15 9:09 ` [PATCH 2/4] vfio/pci: Add PCIe TPH enable/disable support Chengwen Feng
` (2 subsequent siblings)
3 siblings, 0 replies; 12+ messages in thread
From: Chengwen Feng @ 2026-04-15 9:09 UTC (permalink / raw)
To: alex, jgg; +Cc: wathsala.vithanage, kvm, linux-pci, Chengwen Feng
Add the VFIO_DEVICE_PCI_TPH IOCTL and implement the basic
capability query operation to let userspace discover device
TPH support, supported modes and ST table information.
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
drivers/vfio/pci/vfio_pci_core.c | 56 ++++++++++++++++++++++++++++++++
include/uapi/linux/vfio.h | 54 ++++++++++++++++++++++++++++++
2 files changed, 110 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index d43745fe4c84..35df624439a3 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -29,6 +29,7 @@
#include <linux/sched/mm.h>
#include <linux/iommufd.h>
#include <linux/pci-p2pdma.h>
+#include <linux/pci-tph.h>
#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif
@@ -1461,6 +1462,59 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
ioeventfd.fd);
}
+/*
+ * VFIO_PCI_TPH_GET_CAP: report the device's TPH capability to userspace.
+ *
+ * @vdev: VFIO PCI device being queried
+ * @op:   ioctl header already copied from userspace (argsz, op)
+ * @uarg: user pointer to the payload following the header; a
+ *        struct vfio_pci_tph_cap is written there
+ *
+ * Return: 0 on success, -EINVAL if argsz does not cover the cap payload,
+ * -EOPNOTSUPP if the device has no TPH capability or the kernel is built
+ * without CONFIG_PCIE_TPH, -EFAULT if the copy to userspace fails, or the
+ * errno translated from a failed config-space read.
+ */
+static int vfio_pci_tph_get_cap(struct vfio_pci_core_device *vdev,
+				struct vfio_device_pci_tph_op *op,
+				void __user *uarg)
+{
+#ifdef CONFIG_PCIE_TPH
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_tph_cap cap = {0};
+	u32 reg;
+	int ret;
+
+	/* Never write past the buffer size userspace declared in argsz. */
+	if (op->argsz < offsetofend(struct vfio_device_pci_tph_op, cap))
+		return -EINVAL;
+
+	if (!pdev->tph_cap)
+		return -EOPNOTSUPP;
+
+	/* A failed read must not be reported as "all modes supported". */
+	ret = pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CAP, &reg);
+	if (ret)
+		return pcibios_err_to_errno(ret);
+
+	if (reg & PCI_TPH_CAP_ST_NS)
+		cap.supported_modes |= VFIO_PCI_TPH_MODE_NS;
+	if (reg & PCI_TPH_CAP_ST_IV)
+		cap.supported_modes |= VFIO_PCI_TPH_MODE_IV;
+	if (reg & PCI_TPH_CAP_ST_DS)
+		cap.supported_modes |= VFIO_PCI_TPH_MODE_DS;
+	cap.st_table_present = pcie_tph_get_st_table_loc(pdev) != PCI_TPH_LOC_NONE;
+	cap.st_table_sz = pcie_tph_get_st_table_size(pdev);
+
+	if (copy_to_user(uarg, &cap, sizeof(cap)))
+		return -EFAULT;
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+/*
+ * Entry point for VFIO_DEVICE_PCI_TPH: fetch the common header (argsz, op)
+ * from userspace and pass the payload that follows it to the handler for
+ * the requested sub-operation.
+ *
+ * Return: the handler's result, -EFAULT if the header cannot be read,
+ * -EINVAL if argsz is smaller than the header or the op is unknown.
+ */
+static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
+			      void __user *uarg)
+{
+	const size_t hdr_sz = offsetof(struct vfio_device_pci_tph_op, cap);
+	struct vfio_device_pci_tph_op op;
+
+	/* Only argsz and op are copied in; the payload stays in userspace. */
+	if (copy_from_user(&op, uarg, sizeof(op.argsz) + sizeof(op.op)))
+		return -EFAULT;
+
+	if (op.argsz < hdr_sz)
+		return -EINVAL;
+
+	if (op.op == VFIO_PCI_TPH_GET_CAP)
+		return vfio_pci_tph_get_cap(vdev, &op, uarg + hdr_sz);
+
+	/* Other ops are not implemented yet */
+	return -EINVAL;
+}
+
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg)
{
@@ -1483,6 +1537,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
return vfio_pci_ioctl_reset(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
return vfio_pci_ioctl_set_irqs(vdev, uarg);
+ case VFIO_DEVICE_PCI_TPH:
+ return vfio_pci_ioctl_tph(vdev, uarg);
default:
return -ENOTTY;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index bb7b89330d35..f3f79f43cee9 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1307,6 +1307,60 @@ struct vfio_precopy_info {
#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21)
+/* PCIe TPH capability query: payload written by VFIO_PCI_TPH_GET_CAP */
+struct vfio_pci_tph_cap {
+ __u8 supported_modes; /* Bitmask of VFIO_PCI_TPH_MODE_* the device supports */
+#define VFIO_PCI_TPH_MODE_NS (1u << 0) /* No ST mode */
+#define VFIO_PCI_TPH_MODE_IV (1u << 1) /* Interrupt vector mode */
+#define VFIO_PCI_TPH_MODE_DS (1u << 2) /* Device specific mode */
+ __u8 st_table_present; /* 1 if the device has an ST table, 0 otherwise */
+ __u16 st_table_sz; /* ST table size as reported by the PCI core */
+ __u32 reserved; /* Returned as zero */
+};
+
+/* PCIe TPH enable control: payload read by VFIO_PCI_TPH_ENABLE */
+struct vfio_pci_tph_ctrl {
+ __u8 mode; /* VFIO_PCI_TPH_MODE_IV or VFIO_PCI_TPH_MODE_DS; other values are rejected */
+ __u8 reserved[7]; /* Must be zero */
+};
+
+/* PCIe TPH steering-tag single entry, one element of vfio_pci_tph_st.ents[] */
+struct vfio_pci_tph_entry {
+ __u32 cpu; /* [IN] CPU identifier, used with get/set ops */
+ __u8 mem_type; /* [IN] Memory type, used with get/set ops */
+#define VFIO_PCI_TPH_MEM_TYPE_VM 0 /* handled as TPH_MEM_TYPE_VM by the kernel */
+#define VFIO_PCI_TPH_MEM_TYPE_PM 1 /* handled as TPH_MEM_TYPE_PM by the kernel */
+ __u8 reserved0;
+ __u16 index; /* [IN] ST table index, used with set ops */
+ __u16 st; /* [OUT] steering tag, used with get ops */
+ __u16 reserved1;
+};
+
+/* PCIe TPH batch steering-tag request: fixed header followed by the entries */
+struct vfio_pci_tph_st {
+ __u32 count; /* [IN] Number of elements in ents[]; GET_ST accepts 1..2048 */
+ __u32 reserved;
+ struct vfio_pci_tph_entry ents[]; /* Entries, laid out directly after the header */
+};
+
+/* IOCTL argument for VFIO_DEVICE_PCI_TPH: common header plus a per-op payload */
+struct vfio_device_pci_tph_op {
+ __u32 argsz; /* [IN] Size in bytes of the argument; must cover at least argsz and op */
+ __u32 op; /* [IN] One of VFIO_PCI_TPH_* below; selects the union member */
+#define VFIO_PCI_TPH_GET_CAP 0 /* Query TPH capability, fills cap */
+#define VFIO_PCI_TPH_ENABLE 1 /* Enable TPH in the mode given by ctrl */
+#define VFIO_PCI_TPH_DISABLE 2 /* Disable TPH, no payload */
+#define VFIO_PCI_TPH_GET_ST 3 /* Fetch steering tags for the entries in st */
+#define VFIO_PCI_TPH_SET_ST 4 /* Program steering tags from the entries in st */
+ union {
+ struct vfio_pci_tph_cap cap; /* GET_CAP: out */
+ struct vfio_pci_tph_ctrl ctrl; /* ENABLE: in */
+ struct vfio_pci_tph_st st; /* GET_ST/SET_ST */
+ };
+};
+
+#define VFIO_DEVICE_PCI_TPH _IO(VFIO_TYPE, VFIO_BASE + 22)
+
/*
* Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power
* state with the platform-based power management. Device use of lower power
--
2.17.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/4] vfio/pci: Add PCIe TPH enable/disable support
2026-04-15 9:09 [PATCH 0/4] vfio/pci: Add PCIe TPH support Chengwen Feng
2026-04-15 9:09 ` [PATCH 1/4] vfio/pci: Add PCIe TPH interface with capability query Chengwen Feng
@ 2026-04-15 9:09 ` Chengwen Feng
2026-04-15 9:09 ` [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface Chengwen Feng
2026-04-15 9:09 ` [PATCH 4/4] vfio/pci: Add PCIe TPH SET_ST interface Chengwen Feng
3 siblings, 0 replies; 12+ messages in thread
From: Chengwen Feng @ 2026-04-15 9:09 UTC (permalink / raw)
To: alex, jgg; +Cc: wathsala.vithanage, kvm, linux-pci, Chengwen Feng
Add support to enable and disable TPH function with
mode selection.
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
drivers/vfio/pci/vfio_pci_core.c | 35 ++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 35df624439a3..0f96b41779cd 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1493,6 +1493,37 @@ static int vfio_pci_tph_get_cap(struct vfio_pci_core_device *vdev,
#endif
}
+/*
+ * VFIO_PCI_TPH_ENABLE: enable TPH on the device in the ST mode requested
+ * by userspace.
+ *
+ * @vdev: VFIO PCI device to operate on
+ * @op:   ioctl header already copied from userspace (argsz, op)
+ * @uarg: user pointer to the struct vfio_pci_tph_ctrl payload
+ *
+ * Return: result of pcie_enable_tph(), -EINVAL for a short argsz, an
+ * unsupported mode or non-zero reserved bytes, -EFAULT if the payload
+ * cannot be read.
+ */
+static int vfio_pci_tph_enable(struct vfio_pci_core_device *vdev,
+			       struct vfio_device_pci_tph_op *op,
+			       void __user *uarg)
+{
+	struct vfio_pci_tph_ctrl ctrl;
+	int mode;
+
+	if (op->argsz < offsetofend(struct vfio_device_pci_tph_op, ctrl))
+		return -EINVAL;
+
+	if (copy_from_user(&ctrl, uarg, sizeof(ctrl)))
+		return -EFAULT;
+
+	/* Translate the uAPI mode bit into the PCI core's ST mode value. */
+	switch (ctrl.mode) {
+	case VFIO_PCI_TPH_MODE_IV:
+		mode = PCI_TPH_ST_IV_MODE;
+		break;
+	case VFIO_PCI_TPH_MODE_DS:
+		mode = PCI_TPH_ST_DS_MODE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Reserved must be zero */
+	if (memchr_inv(ctrl.reserved, 0, sizeof(ctrl.reserved)))
+		return -EINVAL;
+
+	return pcie_enable_tph(vdev->pdev, mode);
+}
+
+/* VFIO_PCI_TPH_DISABLE: turn TPH off for the device; always returns 0. */
+static int vfio_pci_tph_disable(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+
+	pcie_disable_tph(pdev);
+
+	return 0;
+}
+
static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
void __user *uarg)
{
@@ -1509,6 +1540,10 @@ static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
switch (op.op) {
case VFIO_PCI_TPH_GET_CAP:
return vfio_pci_tph_get_cap(vdev, &op, uarg + minsz);
+ case VFIO_PCI_TPH_ENABLE:
+ return vfio_pci_tph_enable(vdev, &op, uarg + minsz);
+ case VFIO_PCI_TPH_DISABLE:
+ return vfio_pci_tph_disable(vdev);
default:
/* Other ops are not implemented yet */
return -EINVAL;
--
2.17.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface
2026-04-15 9:09 [PATCH 0/4] vfio/pci: Add PCIe TPH support Chengwen Feng
2026-04-15 9:09 ` [PATCH 1/4] vfio/pci: Add PCIe TPH interface with capability query Chengwen Feng
2026-04-15 9:09 ` [PATCH 2/4] vfio/pci: Add PCIe TPH enable/disable support Chengwen Feng
@ 2026-04-15 9:09 ` Chengwen Feng
2026-04-15 13:55 ` Wathsala Vithanage
2026-04-15 9:09 ` [PATCH 4/4] vfio/pci: Add PCIe TPH SET_ST interface Chengwen Feng
3 siblings, 1 reply; 12+ messages in thread
From: Chengwen Feng @ 2026-04-15 9:09 UTC (permalink / raw)
To: alex, jgg; +Cc: wathsala.vithanage, kvm, linux-pci, Chengwen Feng
Add support to batch get CPU's Steering Tags for Device Specific Mode.
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
drivers/vfio/pci/vfio_pci_core.c | 51 ++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 0f96b41779cd..3fe8a48b1cc0 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1524,6 +1524,55 @@ static int vfio_pci_tph_disable(struct vfio_pci_core_device *vdev)
return 0;
}
+/*
+ * VFIO_PCI_TPH_GET_ST: look up steering tags for a batch of CPUs.
+ *
+ * @vdev: VFIO PCI device to operate on
+ * @op:   ioctl header already copied from userspace (argsz, op)
+ * @uarg: user pointer to a struct vfio_pci_tph_st followed by st.count
+ *        struct vfio_pci_tph_entry elements
+ *
+ * For every entry the cpu and mem_type fields are read and the st field is
+ * filled in; the whole array is copied back only if all lookups succeed.
+ *
+ * Return: 0 on success; -EINVAL if argsz does not cover the header and all
+ * entries, count is 0 or above 2048, a reserved field is non-zero or a
+ * mem_type is unknown; -EFAULT on a failed user copy; -ENOMEM on allocation
+ * failure; otherwise the error from pcie_tph_get_cpu_st().
+ */
+static int vfio_pci_tph_get_st(struct vfio_pci_core_device *vdev,
+			       struct vfio_device_pci_tph_op *op,
+			       void __user *uarg)
+{
+	const size_t hdrsz = offsetofend(struct vfio_device_pci_tph_op, st);
+	void __user *uents = uarg + sizeof(struct vfio_pci_tph_st);
+	struct vfio_pci_tph_entry *ents;
+	struct vfio_pci_tph_st st;
+	enum tph_mem_type mtype;
+	size_t size;
+	int err = 0;
+	u32 i;
+
+	/* The fixed part of the request must fit in what userspace declared. */
+	if (op->argsz < hdrsz)
+		return -EINVAL;
+
+	if (copy_from_user(&st, uarg, sizeof(st)))
+		return -EFAULT;
+
+	/* Reserved must be zero so the field can be given a meaning later. */
+	if (st.reserved)
+		return -EINVAL;
+
+	if (!st.count || st.count > 2048)
+		return -EINVAL;
+
+	/* count is bounded above, so this multiplication cannot overflow. */
+	size = st.count * sizeof(*ents);
+
+	/* Do not read or write user memory beyond the declared argsz. */
+	if (op->argsz - hdrsz < size)
+		return -EINVAL;
+
+	ents = kvmalloc_array(st.count, sizeof(*ents), GFP_KERNEL);
+	if (!ents)
+		return -ENOMEM;
+
+	if (copy_from_user(ents, uents, size)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	for (i = 0; i < st.count; i++) {
+		if (ents[i].reserved0 || ents[i].reserved1) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		switch (ents[i].mem_type) {
+		case VFIO_PCI_TPH_MEM_TYPE_VM:
+			mtype = TPH_MEM_TYPE_VM;
+			break;
+		case VFIO_PCI_TPH_MEM_TYPE_PM:
+			mtype = TPH_MEM_TYPE_PM;
+			break;
+		default:
+			err = -EINVAL;
+			goto out;
+		}
+
+		err = pcie_tph_get_cpu_st(vdev->pdev, mtype, ents[i].cpu,
+					  &ents[i].st);
+		if (err)
+			goto out;
+	}
+
+	if (copy_to_user(uents, ents, size))
+		err = -EFAULT;
+
+out:
+	kvfree(ents);
+	return err;
+}
+
static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
void __user *uarg)
{
@@ -1544,6 +1593,8 @@ static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
return vfio_pci_tph_enable(vdev, &op, uarg + minsz);
case VFIO_PCI_TPH_DISABLE:
return vfio_pci_tph_disable(vdev);
+ case VFIO_PCI_TPH_GET_ST:
+ return vfio_pci_tph_get_st(vdev, &op, uarg + minsz);
default:
/* Other ops are not implemented yet */
return -EINVAL;
--
2.17.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface
2026-04-15 9:09 ` [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface Chengwen Feng
@ 2026-04-15 13:55 ` Wathsala Vithanage
2026-04-16 1:09 ` fengchengwen
0 siblings, 1 reply; 12+ messages in thread
From: Wathsala Vithanage @ 2026-04-15 13:55 UTC (permalink / raw)
To: Chengwen Feng, alex, jgg; +Cc: kvm, linux-pci
Hi Feng,
The get_st feature is unsafe. It allows a rogue userspace driver in
device-specific mode to obtain steering tags for arbitrary CPUs, including
ones unrelated to the device or its workload, enabling it to direct traffic
into those CPUs’ caches and potentially interfere with other workloads,
opening the door to further exploits depending on other vulnerabilities.
That's why we dropped this capability in
https://lore.kernel.org/kvm/20251013163515.16565-1-wathsala.vithanage@arm.com/
--wathsala
> }
>
> +static int vfio_pci_tph_get_st(struct vfio_pci_core_device *vdev,
> + struct vfio_device_pci_tph_op *op,
> + void __user *uarg)
> +{
> + struct vfio_pci_tph_entry *ents;
> + struct vfio_pci_tph_st st;
> + enum tph_mem_type mtype;
> + size_t size;
> + int i, err;
> +
> + if (copy_from_user(&st, uarg, sizeof(st)))
> + return -EFAULT;
> +
> + if (!st.count || st.count > 2048)
> + return -EINVAL;
> +
> + size = st.count * sizeof(*ents);
> + ents = kvmalloc(size, GFP_KERNEL);
> + if (!ents)
> + return -ENOMEM;
> +
> + if (copy_from_user(ents, uarg + sizeof(st), size)) {
> + err = -EFAULT;
> + goto out;
> + }
> +
> + for (i = 0; i < st.count; i++) {
> + if (ents[i].mem_type == VFIO_PCI_TPH_MEM_TYPE_VM) {
> + mtype = TPH_MEM_TYPE_VM;
> + } else if (ents[i].mem_type == VFIO_PCI_TPH_MEM_TYPE_PM) {
> + mtype = TPH_MEM_TYPE_PM;
> + } else {
> + err = -EINVAL;
> + goto out;
> + }
> +
> + err = pcie_tph_get_cpu_st(vdev->pdev, mtype, ents[i].cpu, &ents[i].st);
> + if (err)
> + goto out;
> + }
> +
> + if (copy_to_user(uarg + sizeof(st), ents, size))
> + err = -EFAULT;
> +
> +out:
> + kvfree(ents);
> + return err;
> +}
> +
> static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
> void __user *uarg)
> {
> @@ -1544,6 +1593,8 @@ static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
> return vfio_pci_tph_enable(vdev, &op, uarg + minsz);
> case VFIO_PCI_TPH_DISABLE:
> return vfio_pci_tph_disable(vdev);
> + case VFIO_PCI_TPH_GET_ST:
> + return vfio_pci_tph_get_st(vdev, &op, uarg + minsz);
> default:
> /* Other ops are not implemented yet */
> return -EINVAL;
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface
2026-04-15 13:55 ` Wathsala Vithanage
@ 2026-04-16 1:09 ` fengchengwen
2026-04-16 13:40 ` Alex Williamson
0 siblings, 1 reply; 12+ messages in thread
From: fengchengwen @ 2026-04-16 1:09 UTC (permalink / raw)
To: Wathsala Vithanage, alex, jgg; +Cc: kvm, linux-pci
On 4/15/2026 9:55 PM, Wathsala Vithanage wrote:
> Hi Feng,
>
> get_st feature is unsafe. It allows a rogue userspace driver in device-specific
> mode to obtain steering tags for arbitrary CPUs, including ones unrelated
> to the device or its workload, enabling it to direct traffic into those CPUs’
> caches and potentially interfere with other workloads, opening doors to
> further exploits depending on other vulnerabilities.
Thank you for the follow-up and for referencing the prior RFC
discussion on this topic. I appreciate you clarifying the
historical context of the safety concerns.
I acknowledge the risks you’ve highlighted, but I believe the
risk profile in this VFIO interface is different and already
well bounded by existing design and practice:
1. VFIO device access requires elevated privileges
A userspace process can only open a VFIO device node if it
has sufficient privileges (typically root). This is not an
interface for unprivileged users.
2. In the thread "[RFC v2 0/2] Retrieve tph from dmabuf for PCIe
P2P memory access", applications can pass the steering tag of
an exported dmabuf from userspace to the kernel. Kernel PCIe
drivers (e.g., the mlx5 NIC driver) then take these steering tags
and program them into their ST tables. Even there, userspace could
supply invalid steering tags that hurt GPU performance, yet I think
this model is basically accepted (see the mailing-list discussion).
3. Malicious resource consumption is not unique to TPH
A malicious thread can be created to forcibly consume CPU
resources and bound to a specific CPU, affecting other CPUs.
This is a general system security concern, not one specific
to TPH GET_ST, and is addressed by existing system hardening
and access control mechanisms—not by removing useful features.
4. GET_ST is strictly necessary for Device-Specific (DS) mode
when no ST table is present on the device.
For devices that do not have a dedicated ST table (a common
scenario in many PCIe endpoints), DS mode requires userspace
to retrieve per-CPU steering tags first, then program them
into the device’s steering logic via other registers. Without
GET_ST, userspace cannot obtain the required steertags to
enable TPH DS mode at all—rendering TPH support useless for
these devices. This is not an optional feature but a
fundamental requirement to unlock TPH functionality for a
large class of hardware.
Given these points—privilege restriction, existing industry
practice, general system security mitigations, and strict
functional necessity for DS mode—I believe GET_ST is reasonable
and consistent with existing VFIO security boundaries.
Thanks
>
> That's why we dropped this capability in https://lore.kernel.org/kvm/20251013163515.16565-1-wathsala.vithanage@arm.com/
>
> --wathsala
>
>> }
>> +static int vfio_pci_tph_get_st(struct vfio_pci_core_device *vdev,
>> + struct vfio_device_pci_tph_op *op,
>> + void __user *uarg)
>> +{
>> + struct vfio_pci_tph_entry *ents;
>> + struct vfio_pci_tph_st st;
>> + enum tph_mem_type mtype;
>> + size_t size;
>> + int i, err;
>> +
>> + if (copy_from_user(&st, uarg, sizeof(st)))
>> + return -EFAULT;
>> +
>> + if (!st.count || st.count > 2048)
>> + return -EINVAL;
>> +
>> + size = st.count * sizeof(*ents);
>> + ents = kvmalloc(size, GFP_KERNEL);
>> + if (!ents)
>> + return -ENOMEM;
>> +
>> + if (copy_from_user(ents, uarg + sizeof(st), size)) {
>> + err = -EFAULT;
>> + goto out;
>> + }
>> +
>> + for (i = 0; i < st.count; i++) {
>> + if (ents[i].mem_type == VFIO_PCI_TPH_MEM_TYPE_VM) {
>> + mtype = TPH_MEM_TYPE_VM;
>> + } else if (ents[i].mem_type == VFIO_PCI_TPH_MEM_TYPE_PM) {
>> + mtype = TPH_MEM_TYPE_PM;
>> + } else {
>> + err = -EINVAL;
>> + goto out;
>> + }
>> +
>> + err = pcie_tph_get_cpu_st(vdev->pdev, mtype, ents[i].cpu, &ents[i].st);
>> + if (err)
>> + goto out;
>> + }
>> +
>> + if (copy_to_user(uarg + sizeof(st), ents, size))
>> + err = -EFAULT;
>> +
>> +out:
>> + kvfree(ents);
>> + return err;
>> +}
>> +
>> static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
>> void __user *uarg)
>> {
>> @@ -1544,6 +1593,8 @@ static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
>> return vfio_pci_tph_enable(vdev, &op, uarg + minsz);
>> case VFIO_PCI_TPH_DISABLE:
>> return vfio_pci_tph_disable(vdev);
>> + case VFIO_PCI_TPH_GET_ST:
>> + return vfio_pci_tph_get_st(vdev, &op, uarg + minsz);
>> default:
>> /* Other ops are not implemented yet */
>> return -EINVAL;
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface
2026-04-16 1:09 ` fengchengwen
@ 2026-04-16 13:40 ` Alex Williamson
2026-04-16 16:12 ` Wathsala Vithanage
2026-04-17 0:48 ` fengchengwen
0 siblings, 2 replies; 12+ messages in thread
From: Alex Williamson @ 2026-04-16 13:40 UTC (permalink / raw)
To: fengchengwen; +Cc: Wathsala Vithanage, jgg, kvm, linux-pci, alex
On Thu, 16 Apr 2026 09:09:50 +0800
fengchengwen <fengchengwen@huawei.com> wrote:
> On 4/15/2026 9:55 PM, Wathsala Vithanage wrote:
> > Hi Feng,
> >
> > get_st feature is unsafe. It allows a rogue userspace driver in device-specific
> > mode to obtain steering tags for arbitrary CPUs, including ones unrelated
> > to the device or its workload, enabling it to direct traffic into those CPUs’
> > caches and potentially interfere with other workloads, opening doors to
> > further exploits depending on other vulnerabilities.
>
> Thank you for the follow-up and for referencing the prior RFC
> discussion on this topic. I appreciate you clarifying the
> historical context of the safety concerns.
>
> I acknowledge the risks you’ve highlighted, but I believe the
> risk profile in this VFIO interface is different and already
> well bounded by existing design and practice:
>
> 1. VFIO device access requires elevated privileges
> A userspace process can only open a VFIO device node if it
> has sufficient privileges (typically root). This is not an
> interface for unprivileged users.
This argument is NOT helping your cause. This is not the usage model
we design for. VFIO usage requires that privileges be granted to a
user, in the form of device ACL access and locked memory, but does not
generally require elevated privileges beyond that, or otherwise grant
the user authority beyond the scope of the device. The root use case
may be typical for you, but is not required for many other typical use
cases, such as device assignment to VMs.
> 2. In the thread "[RFC v2 0/2] Retrieve tph from dmabuf for PCIe
> P2P memory access", applications can configure the steertag
> of exported dmabufs from userspace to the kernel. Kernel PCIe
> drivers (e.g., mlx5 NIC) then use these steertags and set them
> to their ST tables. Even here, userspace could set invalid
> steertags that impact GPU performance—but this model is
> basically accepted I think (refer from maillist discuss).
It's an RFC. It's bold to claim that it's nearly accepted.
> 3. Malicious resource consumption is not unique to TPH
> A malicious thread can be created to forcibly consume CPU
> resources and bound to a specific CPU, affecting other CPUs.
> This is a general system security concern, not one specific
> to TPH GET_ST, and is addressed by existing system hardening
> and access control mechanisms—not by removing useful features.
You're conflating process abuse of a CPU with a potential side-channel
DMA attack from a device. What *existing* hardening protects against
the latter?
> 4. GET_ST is strictly necessary for Device-Specific (DS) mode
> when no ST table is present on the device.
> For devices that do not have a dedicated ST table (a common
> scenario in many PCIe endpoints), DS mode requires userspace
> to retrieve per-CPU steering tags first, then program them
> into the device’s steering logic via other registers. Without
> GET_ST, userspace cannot obtain the required steertags to
> enable TPH DS mode at all—rendering TPH support useless for
> these devices. This is not an optional feature but a
> fundamental requirement to unlock TPH functionality for a
> large class of hardware.
Unlocking a hardware feature does not give you authority to ignore the
security implications of that feature. Thanks,
Alex
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface
2026-04-16 13:40 ` Alex Williamson
@ 2026-04-16 16:12 ` Wathsala Vithanage
2026-04-17 0:48 ` fengchengwen
1 sibling, 0 replies; 12+ messages in thread
From: Wathsala Vithanage @ 2026-04-16 16:12 UTC (permalink / raw)
To: Alex Williamson, fengchengwen; +Cc: jgg, kvm, linux-pci
On 4/16/26 08:40, Alex Williamson wrote:
> On Thu, 16 Apr 2026 09:09:50 +0800
> fengchengwen <fengchengwen@huawei.com> wrote:
>
>> On 4/15/2026 9:55 PM, Wathsala Vithanage wrote:
>>> Hi Feng,
>>>
>>> get_st feature is unsafe. It allows a rogue userspace driver in device-specific
>>> mode to obtain steering tags for arbitrary CPUs, including ones unrelated
>>> to the device or its workload, enabling it to direct traffic into those CPUs’
>>> caches and potentially interfere with other workloads, opening doors to
>>> further exploits depending on other vulnerabilities.
>> Thank you for the follow-up and for referencing the prior RFC
>> discussion on this topic. I appreciate you clarifying the
>> historical context of the safety concerns.
>>
>> I acknowledge the risks you’ve highlighted, but I believe the
>> risk profile in this VFIO interface is different and already
>> well bounded by existing design and practice:
>>
>> 1. VFIO device access requires elevated privileges
>> A userspace process can only open a VFIO device node if it
>> has sufficient privileges (typically root). This is not an
>> interface for unprivileged users.
> This argument is NOT helping your cause. This is not the usage model
> we design for. VFIO usage requires that privileges be granted to a
> user, in the form of device ACL access and locked memory, but does not
> generally require elevated privileges beyond that, or otherwise grant
> the user authority beyond the scope of the device. The root use case
> may be typical for you, but is not required for many other typical use
> cases, such as device assignment to VMs.
>
>> 2. In the thread "[RFC v2 0/2] Retrieve tph from dmabuf for PCIe
>> P2P memory access", applications can configure the steertag
>> of exported dmabufs from userspace to the kernel. Kernel PCIe
>> drivers (e.g., mlx5 NIC) then use these steertags and set them
>> to their ST tables. Even here, userspace could set invalid
>> steertags that impact GPU performance—but this model is
>> basically accepted I think (refer from maillist discuss).
> It's an RFC. It's bold to claim that it's nearly accepted.
>
>> 3. Malicious resource consumption is not unique to TPH
>> A malicious thread can be created to forcibly consume CPU
>> resources and bound to a specific CPU, affecting other CPUs.
>> This is a general system security concern, not one specific
>> to TPH GET_ST, and is addressed by existing system hardening
>> and access control mechanisms—not by removing useful features.
> You're conflating process abuse of a CPU to a potential side-channel
> DMA attach from a device. What *existing* hardening protects against
> the latter?
>
>> 4. GET_ST is strictly necessary for Device-Specific (DS) mode
>> when no ST table is present on the device.
>> For devices that do not have a dedicated ST table (a common
>> scenario in many PCIe endpoints), DS mode requires userspace
>> to retrieve per-CPU steering tags first, then program them
>> into the device’s steering logic via other registers. Without
>> GET_ST, userspace cannot obtain the required steertags to
>> enable TPH DS mode at all—rendering TPH support useless for
>> these devices. This is not an optional feature but a
>> fundamental requirement to unlock TPH functionality for a
>> large class of hardware.
> Unlocking a hardware feature does not give you authority to ignore the
> security implications of that feature. Thanks,
>
> Alex
The first vfio-TPH RFC captures some of the risks:
https://lore.kernel.org/kvm/20250221224638.1836909-1-wathsala.vithanage@arm.com/
--wathsala
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface
2026-04-16 13:40 ` Alex Williamson
2026-04-16 16:12 ` Wathsala Vithanage
@ 2026-04-17 0:48 ` fengchengwen
2026-04-17 2:06 ` fengchengwen
1 sibling, 1 reply; 12+ messages in thread
From: fengchengwen @ 2026-04-17 0:48 UTC (permalink / raw)
To: Alex Williamson; +Cc: Wathsala Vithanage, jgg, kvm, linux-pci
Hi Alex,
Thank you very much for your clear and detailed security explanation.
I fully understand and agree with your security concerns about allowing
userspace to query steering tags for arbitrary CPUs.
To completely resolve this security issue while retaining the mandatory
functionality for DS-mode devices without ST table, I will revise the
GET_ST interface with a strict security constraint in v3:
The CPU number provided by userspace will be VALIDATED TO EQUAL
THE CURRENT CALLING CPU of the ioctl().
In other words:
- Userspace can ONLY query the steering tag for the CPU it is currently
running on.
- Userspace CANNOT query any other CPU.
- No cross-CPU probing, no side-channel, no attack surface.
- No ability to influence or target other CPUs.
This completely eliminates the security exposure you mentioned, while
still fully supporting the Device-Specific mode requirement for devices
without ST tables.
Thanks
On 4/16/2026 9:40 PM, Alex Williamson wrote:
> On Thu, 16 Apr 2026 09:09:50 +0800
> fengchengwen <fengchengwen@huawei.com> wrote:
>
>> On 4/15/2026 9:55 PM, Wathsala Vithanage wrote:
>>> Hi Feng,
>>>
>>> get_st feature is unsafe. It allows a rogue userspace driver in device-specific
>>> mode to obtain steering tags for arbitrary CPUs, including ones unrelated
>>> to the device or its workload, enabling it to direct traffic into those CPUs’
>>> caches and potentially interfere with other workloads, opening doors to
>>> further exploits depending on other vulnerabilities.
>>
>> Thank you for the follow-up and for referencing the prior RFC
>> discussion on this topic. I appreciate you clarifying the
>> historical context of the safety concerns.
>>
>> I acknowledge the risks you’ve highlighted, but I believe the
>> risk profile in this VFIO interface is different and already
>> well bounded by existing design and practice:
>>
>> 1. VFIO device access requires elevated privileges
>> A userspace process can only open a VFIO device node if it
>> has sufficient privileges (typically root). This is not an
>> interface for unprivileged users.
>
> This argument is NOT helping your cause. This is not the usage model
> we design for. VFIO usage requires that privileges be granted to a
> user, in the form of device ACL access and locked memory, but does not
> generally require elevated privileges beyond that, or otherwise grant
> the user authority beyond the scope of the device. The root use case
> may be typical for you, but is not required for many other typical use
> cases, such as device assignment to VMs.
>
>> 2. In the thread "[RFC v2 0/2] Retrieve tph from dmabuf for PCIe
>> P2P memory access", applications can configure the steertag
>> of exported dmabufs from userspace to the kernel. Kernel PCIe
>> drivers (e.g., mlx5 NIC) then use these steertags and set them
>> to their ST tables. Even here, userspace could set invalid
>> steertags that impact GPU performance—but this model is
>> basically accepted I think (refer from maillist discuss).
>
> It's an RFC. It's bold to claim that it's nearly accepted.
>
>> 3. Malicious resource consumption is not unique to TPH
>> A malicious thread can be created to forcibly consume CPU
>> resources and bound to a specific CPU, affecting other CPUs.
>> This is a general system security concern, not one specific
>> to TPH GET_ST, and is addressed by existing system hardening
>> and access control mechanisms—not by removing useful features.
>
> You're conflating process abuse of a CPU to a potential side-channel
> DMA attach from a device. What *existing* hardening protects against
> the latter?
>
>> 4. GET_ST is strictly necessary for Device-Specific (DS) mode
>> when no ST table is present on the device.
>> For devices that do not have a dedicated ST table (a common
>> scenario in many PCIe endpoints), DS mode requires userspace
>> to retrieve per-CPU steering tags first, then program them
>> into the device’s steering logic via other registers. Without
>> GET_ST, userspace cannot obtain the required steertags to
>> enable TPH DS mode at all—rendering TPH support useless for
>> these devices. This is not an optional feature but a
>> fundamental requirement to unlock TPH functionality for a
>> large class of hardware.
>
> Unlocking a hardware feature does not give you authority to ignore the
> security implications of that feature. Thanks,
>
> Alex
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface
2026-04-17 0:48 ` fengchengwen
@ 2026-04-17 2:06 ` fengchengwen
0 siblings, 0 replies; 12+ messages in thread
From: fengchengwen @ 2026-04-17 2:06 UTC (permalink / raw)
To: Alex Williamson, Wathsala Vithanage; +Cc: jgg, kvm, linux-pci
Sorry for the self-reply.
Hi Alex & Wathsala,
This follows up on the VM assignment scenario and the cross-VM attack concern
raised by Wathsala in her review of "[PATCH v2 RESEND 4/5] vfio/pci: Add PCIe TPH GET_ST interface", quoted here:
This is unsafe. A user space driver can obtain STs for arbitrary CPUs and program them
into device-specific registers (e.g., E810), with no isolation guarantees.
For example, consider two VMs on the same host. A driver in VM1 could program STs that
target CPUs primarily used by VM2. This can steer traffic processing onto VM2's CPUs,
creating contention and degrading VM2's performance.
This breaks CPU isolation at the host level and can be used to disrupt workloads and violate
SLAs across tenants.
After fully re-evaluating the security architecture with your feedback,
I agree with your concerns and conclusions. I hereby revoke my earlier
proposal to restrict GET_ST to only the current CPU.
For devices that implement standard ST tables (via config space or MSI-X caps),
hypervisors such as QEMU/kvmtool can trap and filter guest writes, preventing
malicious steering tag abuse. This is the safe and supported model for TPH in
virtualized environments.
However, for devices that only support Device-Specific mode with no standard
ST table, there is no existing hypervisor interception mechanism to prevent
a guest from programming arbitrary steering tags to attack other CPUs.
This is a fundamental security risk that cannot be safely mitigated in software.
Therefore, the correct security posture for virtualization is:
- TPH should be enabled *only* for devices with standard ST tables
- Devices without standard ST tables should NOT enable TPH in virtualization
On the other hand, in non-virtualization (bare-metal) scenarios, there is
strong legitimate demand for devices that lack a standard ST table —
many real-world devices are designed this way. For this reason, I would
like to retain the GET_ST interface.
For virtualization scenarios, the hypervisor is responsible for avoiding
this risk by **disabling TPH Requester Enable in the PCIe config space**,
which is fully interceptable and under the hypervisor’s control.
Thanks
On 4/17/2026 8:48 AM, fengchengwen wrote:
> Hi Alex,
>
> Thank you very much for your clear and detailed security explanation.
> I fully understand and agree with your security concerns about allowing
> userspace to query steering tags for arbitrary CPUs.
>
> To completely resolve this security issue while retaining the mandatory
> functionality for DS-mode devices without ST table, I will revise the
> GET_ST interface with a strict security constraint in v3:
>
> The CPU number provided by userspace will be VALIDATED TO EQUAL
> THE CURRENT CALLING CPU of the ioctl().
>
> In other words:
> - Userspace can ONLY query the steering tag for the CPU it is currently
> running on.
> - Userspace CANNOT query any other CPU.
> - No cross-CPU probing, no side-channel, no attack surface.
> - No ability to influence or target other CPUs.
>
> This completely eliminates the security exposure you mentioned, while
> still fully supporting the Device-Specific mode requirement for devices
> without ST tables.
>
> Thanks
>
> On 4/16/2026 9:40 PM, Alex Williamson wrote:
>> On Thu, 16 Apr 2026 09:09:50 +0800
>> fengchengwen <fengchengwen@huawei.com> wrote:
>>
>>> On 4/15/2026 9:55 PM, Wathsala Vithanage wrote:
>>>> Hi Feng,
>>>>
>>>> get_st feature is unsafe. It allows a rogue userspace driver in device-specific
>>>> mode to obtain steering tags for arbitrary CPUs, including ones unrelated
>>>> to the device or its workload, enabling it to direct traffic into those CPUs’
>>>> caches and potentially interfere with other workloads, opening doors to
>>>> further exploits depending on other vulnerabilities.
>>>
>>> Thank you for the follow-up and for referencing the prior RFC
>>> discussion on this topic. I appreciate you clarifying the
>>> historical context of the safety concerns.
>>>
>>> I acknowledge the risks you’ve highlighted, but I believe the
>>> risk profile in this VFIO interface is different and already
>>> well bounded by existing design and practice:
>>>
>>> 1. VFIO device access requires elevated privileges
>>> A userspace process can only open a VFIO device node if it
>>> has sufficient privileges (typically root). This is not an
>>> interface for unprivileged users.
>>
>> This argument is NOT helping your cause. This is not the usage model
>> we design for. VFIO usage requires that privileges be granted to a
>> user, in the form of device ACL access and locked memory, but does not
>> generally require elevated privileges beyond that, or otherwise grant
>> the user authority beyond the scope of the device. The root use case
>> may be typical for you, but is not required for many other typical use
>> cases, such as device assignment to VMs.
>>
>>> 2. In the thread "[RFC v2 0/2] Retrieve tph from dmabuf for PCIe
>>> P2P memory access", applications can configure the steertag
>>> of exported dmabufs from userspace to the kernel. Kernel PCIe
>>> drivers (e.g., mlx5 NIC) then use these steertags and set them
>>> to their ST tables. Even here, userspace could set invalid
>>> steertags that impact GPU performance—but this model is
>>> basically accepted, I think (see the mailing-list discussion).
>>
>> It's an RFC. It's bold to claim that it's nearly accepted.
>>
>>> 3. Malicious resource consumption is not unique to TPH
>>> A malicious thread can be created to forcibly consume CPU
>>> resources and bound to a specific CPU, affecting other CPUs.
>>> This is a general system security concern, not one specific
>>> to TPH GET_ST, and is addressed by existing system hardening
>>> and access control mechanisms—not by removing useful features.
>>
>> You're conflating process abuse of a CPU with a potential side-channel
>> DMA attack from a device. What *existing* hardening protects against
>> the latter?
>>
>>> 4. GET_ST is strictly necessary for Device-Specific (DS) mode
>>> when no ST table is present on the device.
>>> For devices that do not have a dedicated ST table (a common
>>> scenario in many PCIe endpoints), DS mode requires userspace
>>> to retrieve per-CPU steering tags first, then program them
>>> into the device’s steering logic via other registers. Without
>>> GET_ST, userspace cannot obtain the required steertags to
>>> enable TPH DS mode at all—rendering TPH support useless for
>>> these devices. This is not an optional feature but a
>>> fundamental requirement to unlock TPH functionality for a
>>> large class of hardware.
>>
>> Unlocking a hardware feature does not give you authority to ignore the
>> security implications of that feature. Thanks,
>>
>> Alex
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 4/4] vfio/pci: Add PCIe TPH SET_ST interface
2026-04-15 9:09 [PATCH 0/4] vfio/pci: Add PCIe TPH support Chengwen Feng
` (2 preceding siblings ...)
2026-04-15 9:09 ` [PATCH 3/4] vfio/pci: Add PCIe TPH GET_ST interface Chengwen Feng
@ 2026-04-15 9:09 ` Chengwen Feng
[not found] ` <e6dbfdd5-5117-4c3e-bb84-ee1e489aa38f@arm.com>
3 siblings, 1 reply; 12+ messages in thread
From: Chengwen Feng @ 2026-04-15 9:09 UTC (permalink / raw)
To: alex, jgg; +Cc: wathsala.vithanage, kvm, linux-pci, Chengwen Feng
Add support to batch program Steering Tag table entries
for standard TPH modes.
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
drivers/vfio/pci/vfio_pci_core.c | 51 ++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 3fe8a48b1cc0..a3146056397a 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1573,6 +1573,55 @@ static int vfio_pci_tph_get_st(struct vfio_pci_core_device *vdev,
return err;
}
+static int vfio_pci_tph_set_st(struct vfio_pci_core_device *vdev, /* Batch-program ST entries from a user array; returns 0 if every entry is programmed, otherwise the first error hit. */
+ struct vfio_device_pci_tph_op *op, /* NOTE(review): not referenced anywhere in this function */
+ void __user *uarg) /* user buffer: a struct vfio_pci_tph_st header followed by st.count entries */
+{
+ struct vfio_pci_tph_entry *ents;
+ struct vfio_pci_tph_st st;
+ enum tph_mem_type mtype;
+ int i, err, size; /* err has no initializer; see the loop comment below */
+ u16 st_val;
+
+ if (copy_from_user(&st, uarg, sizeof(st))) /* read the header first to learn the entry count */
+ return -EFAULT;
+
+ if (!st.count || st.count > 2048) /* reject an empty batch or one larger than 2048 entries */
+ return -EINVAL;
+
+ size = st.count * sizeof(*ents); /* count was limited to 2048 by the check above */
+ ents = kvmalloc(size, GFP_KERNEL);
+ if (!ents)
+ return -ENOMEM;
+
+ if (copy_from_user(ents, uarg + sizeof(st), size)) { /* the entry array starts right after the header */
+ err = -EFAULT;
+ goto out;
+ }
+
+ for (i = 0; i < st.count; i++) { /* count != 0, so this runs at least once and err is always assigned before "out" */
+ if (ents[i].mem_type == VFIO_PCI_TPH_MEM_TYPE_VM) { /* map the VFIO memory-type value to enum tph_mem_type */
+ mtype = TPH_MEM_TYPE_VM;
+ } else if (ents[i].mem_type == VFIO_PCI_TPH_MEM_TYPE_PM) {
+ mtype = TPH_MEM_TYPE_PM;
+ } else { /* any other memory-type value is rejected */
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = pcie_tph_get_cpu_st(vdev->pdev, mtype, ents[i].cpu, &st_val); /* obtain st_val for the user-supplied cpu and memory type */
+ if (err)
+ goto out;
+ err = pcie_tph_set_st_entry(vdev->pdev, ents[i].index, st_val); /* write st_val at the user-supplied index */
+ if (err)
+ goto out; /* NOTE(review): entries programmed by earlier iterations are not rolled back */
+ }
+
+out: /* single exit point: the entry buffer is freed on both success and failure */
+ kvfree(ents);
+ return err;
+}
+
static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
void __user *uarg)
{
@@ -1595,6 +1644,8 @@ static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
return vfio_pci_tph_disable(vdev);
case VFIO_PCI_TPH_GET_ST:
return vfio_pci_tph_get_st(vdev, &op, uarg + minsz);
+ case VFIO_PCI_TPH_SET_ST:
+ return vfio_pci_tph_set_st(vdev, &op, uarg + minsz);
default:
/* Other ops are not implemented yet */
return -EINVAL;
--
2.17.1
^ permalink raw reply related [flat|nested] 12+ messages in thread