kvm.vger.kernel.org archive mirror
* [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
@ 2025-02-21 22:46 Wathsala Vithanage
  2025-02-28 18:48 ` Alex Williamson
                   ` (3 more replies)
  0 siblings, 4 replies; 24+ messages in thread
From: Wathsala Vithanage @ 2025-02-21 22:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: nd, Wathsala Vithanage, Alex Williamson, Jason Gunthorpe,
	Kevin Tian, Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER

Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
direct cache injection. As described in the relevant patch set [1],
direct cache injection in supported hardware allows optimal platform
resource utilization for specific requests on the PCIe bus. This feature
is currently available only for kernel device drivers. However,
user space applications, especially those whose performance is sensitive
to the latency of inbound writes as seen by a CPU core, may benefit from
using this feature (e.g., the DPDK cache stashing RFC [2] or an HPC
application running in a VM).

This patch enables configuring TPH from user space via the
VFIO_DEVICE_FEATURE ioctl. It provides an interface for user-space
drivers and VMMs to enable/disable the TPH feature on PCIe devices, to
set steering tags in MSI-X or steering-tag table entries using the
VFIO_DEVICE_FEATURE_SET flag, or to read steering tags from the kernel
using VFIO_DEVICE_FEATURE_GET in order to operate in device-specific
mode.

[1] lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
[2] inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com
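
The following is a rough user-space sketch of how a VMM or user-space driver
could drive the proposed interface (illustrative only, not part of the patch;
buffer setup and error handling are minimal):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int tph_enable_and_tag(int device_fd, __u32 cpu_id)
{
	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_pci_tph) +
		 sizeof(struct vfio_pci_tph_info)] __attribute__((aligned(8)));
	struct vfio_device_feature *feat = (void *)buf;
	struct vfio_pci_tph *tph = (void *)feat->data;
	struct vfio_pci_tph_info *info = tph->info;

	/* Enable TPH in interrupt-vector mode; no info[] entries are needed. */
	memset(buf, 0, sizeof(buf));
	feat->argsz = sizeof(struct vfio_device_feature) + sizeof(*tph);
	feat->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_PCI_TPH;
	tph->argsz  = sizeof(*tph);
	tph->flags  = VFIO_DEVICE_FEATURE_TPH_ENABLE | VFIO_TPH_ST_IV_MODE;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
		return -1;

	/* Look up the steering tag for cpu_id and write it to MSI-X entry 0. */
	memset(buf, 0, sizeof(buf));
	feat->argsz = sizeof(buf);
	feat->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_PCI_TPH;
	tph->argsz  = sizeof(*tph) + sizeof(*info);
	tph->flags  = VFIO_DEVICE_FEATURE_TPH_SET_ST;
	tph->count  = 1;
	info[0].cpu_id      = cpu_id;
	info[0].cache_level = 0;	/* L1D; currently ignored by the kernel */
	info[0].flags       = VFIO_TPH_MEM_TYPE_VMEM | VFIO_TPH_HINT_BIDIR;
	info[0].index       = 0;	/* MSI-X table entry 0 */
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
		return -1;

	return info[0].err;	/* per-entry status written back by the kernel */
}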

Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 163 +++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        |  68 +++++++++++++
 2 files changed, 231 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 586e49efb81b..d6dd0495b08b 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -29,6 +29,7 @@
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
 #include <linux/iommufd.h>
+#include <linux/pci-tph.h>
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
@@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
 	return 0;
 }
 
+static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
+				      void __user *arg, size_t argsz,
+				      struct vfio_pci_tph_info **info)
+{
+	size_t minsz;
+
+	if (tph->count > VFIO_TPH_INFO_MAX)
+		return -EINVAL;
+	if (!tph->count)
+		return 0;
+
+	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
+	if (minsz < argsz)
+		return -EINVAL;
+
+	*info = memdup_user(arg, minsz);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+
+	return minsz;
+}
+
+static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device *vdev,
+				      struct vfio_pci_tph *tph,
+				      void __user *arg, size_t argsz)
+{
+	int i, mtype, err = 0;
+	u32 cpu_uid;
+	struct vfio_pci_tph_info *info = NULL;
+	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz, &info);
+
+	if (data_size <= 0)
+		return data_size;
+
+	for (i = 0; i < tph->count; i++) {
+		if (!(info[i].cpu_id < nr_cpu_ids && cpu_present(info[i].cpu_id))) {
+			info[i].err = -EINVAL;
+			continue;
+		}
+		cpu_uid = topology_core_id(info[i].cpu_id);
+		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
+			VFIO_TPH_MEM_TYPE_SHIFT;
+
+		/* processing hints are always ignored */
+		info[i].ph_ignore = 1;
+
+		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
+						  &info[i].st);
+		if (info[i].err)
+			continue;
+
+		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
+			info[i].err = pcie_tph_set_st_entry(vdev->pdev,
+							    info[i].index,
+							    info[i].st);
+		}
+	}
+
+	if (copy_to_user(arg, info, data_size))
+		err = -EFAULT;
+
+	kfree(info);
+	return err;
+}
+
+
+static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device *vdev,
+				       struct vfio_pci_tph *arg)
+{
+	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
+
+	switch (mode) {
+	case VFIO_TPH_ST_NS_MODE:
+		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_NS_MODE);
+
+	case VFIO_TPH_ST_IV_MODE:
+		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_IV_MODE);
+
+	case VFIO_TPH_ST_DS_MODE:
+		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_DS_MODE);
+
+	default:
+		return -EINVAL;
+	}
+
+}
+
+static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device *vdev)
+{
+	pcie_disable_tph(vdev->pdev);
+	return 0;
+}
+
+static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user *arg,
+					size_t argsz, u32 flags,
+					struct vfio_pci_tph *tph)
+{
+	u32 op;
+	int err = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_SET |
+				 VFIO_DEVICE_FEATURE_GET,
+				 sizeof(struct vfio_pci_tph));
+	if (err != 1)
+		return err;
+
+	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
+		return -EFAULT;
+
+	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
+
+	switch (op) {
+	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
+	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
+	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
+		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -EINVAL;
+
+	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
+		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -EINVAL;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 flags,
+				     struct vfio_pci_tph __user *arg,
+				     size_t argsz)
+{
+	u32 op;
+	struct vfio_pci_tph tph;
+	void __user *uinfo;
+	size_t infosz;
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, &tph);
+
+	if (err)
+		return err;
+
+	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
+
+	switch (op) {
+	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
+		return vfio_pci_feature_tph_enable(vdev, &tph);
+
+	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
+		return vfio_pci_feature_tph_disable(vdev);
+
+	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
+	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
+		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, info);
+		infosz = argsz - sizeof(struct vfio_pci_tph);
+		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, infosz);
+
+	default:
+		return -EINVAL;
+	}
+}
+
 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 				void __user *arg, size_t argsz)
 {
@@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_PCI_TPH:
+		return vfio_pci_core_feature_tph(device, flags,
+						 arg, argsz);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index c8dbf8219c4f..608d57dfe279 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/*
+ * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set steering tags
+ * on the device. Data provided when setting this feature is a __u32 with the
+ * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH in
+ * no-steering-tag, interrupt-vector, or device-specific mode when feature flags
+ * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE are set
+ * respectively.
+ * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
+ * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at an index in
+ * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used and device
+ * capabilities. The caller can set multiple steering tags by passing an array
+ * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
+ * MSI-X/ST-table index. The caller can also set the intended memory type and
+ * the processing hint by setting VFIO_TPH_MEM_TYPE_x and VFIO_TPH_HINT_x flags,
+ * respectively. The return value for each vfio_pci_tph_info object is stored in
+ * err, with the steering-tag set on the device and the ph_ignore status bit
+ * resulting from the steering-tag lookup operation. If err < 0, the values
+ * stored in the st and ph_ignore fields should be considered invalid.
+ *
+ * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the caller.
+ * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
+ * The return values per vfio_pci_tph_info object are stored in the st,
+ * ph_ignore, and err fields.
+ */
+struct vfio_pci_tph_info {
+	/* in */
+	__u32 cpu_id;
+	__u32 cache_level;
+	__u8  flags;
+#define VFIO_TPH_MEM_TYPE_MASK		0x1
+#define VFIO_TPH_MEM_TYPE_SHIFT		0
+#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request volatile memory ST */
+#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request persistent memory ST */
+
+#define VFIO_TPH_HINT_MASK		0x3
+#define VFIO_TPH_HINT_SHIFT		1
+#define VFIO_TPH_HINT_BIDIR		0
+#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
+#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
+#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)
+	__u16 index;			/* MSI-X/ST-table index to set ST */
+	/* out */
+	__u16 st;			/* Steering-Tag */
+	__u8  ph_ignore;		/* Processing hint was ignored by */
+	__s32 err;			/* Error on getting/setting Steering-Tag*/
+};
+
+struct vfio_pci_tph {
+	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
+	__u32 flags;
+#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
+#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
+#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/* Enable TPH on device */
+#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH on device */
+#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get steering-tags */
+#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set steering-rags */
+
+#define	VFIO_TPH_ST_MODE_MASK	(0x3 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
+#define	VFIO_TPH_ST_NS_MODE	(0 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
+#define	VFIO_TPH_ST_IV_MODE	(1 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
+#define	VFIO_TPH_ST_DS_MODE	(2 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
+	__u32 count;				/* Number of entries in info[] */
+	struct vfio_pci_tph_info info[];
+#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed in info[] */
+};
+
+#define VFIO_DEVICE_FEATURE_PCI_TPH 11
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.43.0



* Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-02-21 22:46 [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl Wathsala Vithanage
@ 2025-02-28 18:48 ` Alex Williamson
  2025-03-03  4:20   ` Wathsala Wathawana Vithanage
  2025-03-03  7:49 ` Philipp Stanner
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2025-02-28 18:48 UTC (permalink / raw)
  To: Wathsala Vithanage
  Cc: linux-kernel, nd, Jason Gunthorpe, Kevin Tian, Philipp Stanner,
	Yunxiang Li, Dr. David Alan Gilbert, Ankit Agrawal,
	open list:VFIO DRIVER

On Fri, 21 Feb 2025 22:46:33 +0000
Wathsala Vithanage <wathsala.vithanage@arm.com> wrote:

> Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> direct cache injection. As described in the relevant patch set [1],
> direct cache injection in supported hardware allows optimal platform
> resource utilization for specific requests on the PCIe bus. This feature
> is currently available only for kernel device drivers. However,
> user space applications, especially those whose performance is sensitive
> to the latency of inbound writes as seen by a CPU core, may benefit from
> using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> application running in a VM).
> 
> This patch enables configuring of TPH from the user space via
> VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> set steering tags in MSI-X or steering-tag table entries using
> VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
> VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.

Unless I'm missing it, the RFC in [2] doesn't make use of this
proposal.  Is there published code anywhere that does use this
interface?

> [1] lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
> [2] inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com
> 
> Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 163 +++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h        |  68 +++++++++++++
>  2 files changed, 231 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 586e49efb81b..d6dd0495b08b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -29,6 +29,7 @@
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
>  #include <linux/iommufd.h>
> +#include <linux/pci-tph.h>
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
>  	return 0;
>  }
>  
> +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t argsz,
> +				      struct vfio_pci_tph_info **info)
> +{
> +	size_t minsz;
> +
> +	if (tph->count > VFIO_TPH_INFO_MAX)
> +		return -EINVAL;
> +	if (!tph->count)
> +		return 0;
> +
> +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> +	if (minsz < argsz)
> +		return -EINVAL;
> +
> +	*info = memdup_user(arg, minsz);
> +	if (IS_ERR(info))
> +		return PTR_ERR(info);
> +
> +	return minsz;
> +}
> +
> +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device *vdev,
> +				      struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t argsz)
> +{
> +	int i, mtype, err = 0;
> +	u32 cpu_uid;
> +	struct vfio_pci_tph_info *info = NULL;
> +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz, &info);
> +
> +	if (data_size <= 0)
> +		return data_size;
> +
> +	for (i = 0; i < tph->count; i++) {
> +		if (!(info[i].cpu_id < nr_cpu_ids && cpu_present(info[i].cpu_id))) {

Not intuitive logic, imo.  This could easily be:

		if (info[i].cpu_id >= nr_cpu_ids || !cpu_present(info[i].cpu_id))

> +			info[i].err = -EINVAL;
> +			continue;
> +		}
> +		cpu_uid = topology_core_id(info[i].cpu_id);
> +		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> +			VFIO_TPH_MEM_TYPE_SHIFT;
> +
> +		/* processing hints are always ignored */
> +		info[i].ph_ignore = 1;
> +
> +		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
> +						  &info[i].st);
> +		if (info[i].err)
> +			continue;
> +
> +		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> +			info[i].err = pcie_tph_set_st_entry(vdev->pdev,
> +							    info[i].index,
> +							    info[i].st);
> +		}
> +	}
> +
> +	if (copy_to_user(arg, info, data_size))
> +		err = -EFAULT;
> +
> +	kfree(info);
> +	return err;
> +}
> +
> +
> +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device *vdev,
> +				       struct vfio_pci_tph *arg)
> +{
> +	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> +
> +	switch (mode) {
> +	case VFIO_TPH_ST_NS_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_NS_MODE);
> +
> +	case VFIO_TPH_ST_IV_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_IV_MODE);
> +
> +	case VFIO_TPH_ST_DS_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_DS_MODE);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +}
> +
> +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device *vdev)
> +{
> +	pcie_disable_tph(vdev->pdev);
> +	return 0;
> +}
> +
> +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user *arg,
> +					size_t argsz, u32 flags,
> +					struct vfio_pci_tph *tph)
> +{
> +	u32 op;
> +	int err = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET |
> +				 VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(struct vfio_pci_tph));
> +	if (err != 1)
> +		return err;

We don't seem to account for a host booted with pci=notph.

> +
> +	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> +		return -EFAULT;
> +
> +	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -EINVAL;
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -EINVAL;

This is a convoluted mangling of an ioctl into a vfio feature.

> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 flags,
> +				     struct vfio_pci_tph __user *arg,
> +				     size_t argsz)
> +{
> +	u32 op;
> +	struct vfio_pci_tph tph;
> +	void __user *uinfo;
> +	size_t infosz;
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, &tph);
> +
> +	if (err)
> +		return err;
> +
> +	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +		return vfio_pci_feature_tph_enable(vdev, &tph);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +		return vfio_pci_feature_tph_disable(vdev);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, info);
> +		infosz = argsz - sizeof(struct vfio_pci_tph);
> +		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, infosz);

This is effectively encoding a regular ioctl as a feature.  See below.

> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  				void __user *arg, size_t argsz)
>  {
> @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_PCI_TPH:
> +		return vfio_pci_core_feature_tph(device, flags,
> +						 arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c8dbf8219c4f..608d57dfe279 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/*
> + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set steering tags
> + * on the device. Data provided when setting this feature is a __u32 with the
> + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH in
> + * no-steering-tag, interrupt-vector, or device-specific mode when feature flags
> + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE are set
> + * respectively.
> + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at an index in
> + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used and device
> + * capabilities. The caller can set multiple steering tags by passing an array
> + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
> + * MSI-X/ST-table index. The caller can also set the intended memory type and
> + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and VFIO_TPH_HINT_x flags,
> + * respectively. The return value for each vfio_pci_tph_info object is stored in
> + * err, with the steering-tag set on the device and the ph_ignore status bit
> + * resulting from the steering-tag lookup operation. If err < 0, the values
> + * stored in the st and ph_ignore fields should be considered invalid.
> + *

Sorry, I don't understand ph_ignore as described here.  It's only ever
set to 1.  I guess we assume the incoming state is zero.  But what does
it mean?

> + * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the caller.
> + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
> + * The return values per vfio_pci_tph_info object are stored in the st,
> + * ph_ignore, and err fields.

Why do we need to provide an interface to return the steering tags set
by the user?  Seems like this could be a write-only, SET, interface.

> + */
> +struct vfio_pci_tph_info {

This seems more of an _entry than an _info.  Note that vfio has various
INFO ioctls which make this nomenclature confusing.

> +	/* in */
> +	__u32 cpu_id;

The logical ID?

> +	__u32 cache_level;

Zero based?  1-based?  How many cache levels are there?  What's valid
here?

> +	__u8  flags;
> +#define VFIO_TPH_MEM_TYPE_MASK		0x1
> +#define VFIO_TPH_MEM_TYPE_SHIFT		0
> +#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request volatile memory ST */
> +#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request persistent memory ST */

Is there a relation to the cache_level here?  Spec references here and
below, assuming those are relevant to these values.

> +
> +#define VFIO_TPH_HINT_MASK		0x3
> +#define VFIO_TPH_HINT_SHIFT		1
> +#define VFIO_TPH_HINT_BIDIR		0
> +#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)

There needs to be a __u8 padding in here somewhere or flags extended to
__u16.

> +	__u16 index;			/* MSI-X/ST-table index to set ST */
> +	/* out */
> +	__u16 st;			/* Steering-Tag */

Sorry if I'm just not familiar with TPH, but why do we need to return
the ST?  Doesn't hardware make use of the ST index and the physical
value gets applied automatically in HW?

> +	__u8  ph_ignore;		/* Processing hint was ignored by */

Padding/alignment, same as above.  "ignored by..."  By what?  Is that
an error?

> +	__s32 err;			/* Error on getting/setting Steering-Tag*/
> +};

Generally we'd expect a system call either works or fails.  Why do we
need per entry error report?  Can't we validate and prepare the entire
operation before setting any of it into the device?  Ultimately we're
just writing hints to config space or MSI-X table space, so the write
operation itself is not likely to be the point of failure.

> +
> +struct vfio_pci_tph {
> +	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
> +	__u32 flags;
> +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
> +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
> +#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/* Enable TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get steering-tags */
> +#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set steering-rags */

s/rags/tags/

vfio device features already have GET and SET as part of their base
structure, why are they duplicated here?  It really seems like there
are two separate features here, one that allows enabling with a given
mode or disable, and another that allows writing specific steering
tags.  Both seem like they could be SET-only features.  It's also not
clear to me that there's a significant advantage to providing an array
of steering tags.  Isn't updating STs an infrequent operation and
likely bound to at most 2K tags in the case of MSI-X?

> +
> +#define	VFIO_TPH_ST_MODE_MASK	(0x3 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_NS_MODE	(0 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_IV_MODE	(1 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_DS_MODE	(2 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +	__u32 count;				/* Number of entries in info[] */
> +	struct vfio_pci_tph_info info[];
> +#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed in info[] */

This seems to match the limit if the table is located in extended
config space, but it's not particularly relevant if the table is
located in MSI-X space.  Why is this a good limit?

Also, try to keep these all in 80 column.  Thanks,

Alex

> +};
> +
> +#define VFIO_DEVICE_FEATURE_PCI_TPH 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**



* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-02-28 18:48 ` Alex Williamson
@ 2025-03-03  4:20   ` Wathsala Wathawana Vithanage
  2025-03-04  3:12     ` Alex Williamson
  0 siblings, 1 reply; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-03-03  4:20 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-kernel@vger.kernel.org, nd, Jason Gunthorpe, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Jeremy Linton,
	Honnappa Nagarahalli, Dhruv Tripathi

Hi Alex,

> > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> > direct cache injection. As described in the relevant patch set [1],
> > direct cache injection in supported hardware allows optimal platform
> > resource utilization for specific requests on the PCIe bus. This
> > feature is currently available only for kernel device drivers.
> > However, user space applications, especially those whose performance
> > is sensitive to the latency of inbound writes as seen by a CPU core,
> > may benefit from using this information (E.g., DPDK cache stashing RFC
> > [2] or an HPC application running in a VM).
> >
> > This patch enables configuring of TPH from the user space via
> > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> > drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> > set steering tags in MSI-X or steering-tag table entries using
> > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel
> > using VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
> 
> Unless I'm missing it, the RFC in [2] doesn't make use of this proposal.  Is there
> published code anywhere that does use this interface?
> 
> > [1]
> > lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
> > [2]
> > inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com
> >

The DPDK RFC in question is on hold for now because there is no way to get
steering-tags from user space.
Consequently, I had to stop working on [2] and start working on the kernel to get
VFIO ready for it. This was discussed in a DPDK community call some time back.
https://mails.dpdk.org/archives/dev/2024-October/305408.html

> > Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
> > ---
> >  drivers/vfio/pci/vfio_pci_core.c | 163 +++++++++++++++++++++++++++++++
> >  include/uapi/linux/vfio.h        |  68 +++++++++++++
> >  2 files changed, 231 insertions(+)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c
> > b/drivers/vfio/pci/vfio_pci_core.c
> > index 586e49efb81b..d6dd0495b08b 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -29,6 +29,7 @@
> >  #include <linux/nospec.h>
> >  #include <linux/sched/mm.h>
> >  #include <linux/iommufd.h>
> > +#include <linux/pci-tph.h>
> >  #if IS_ENABLED(CONFIG_EEH)
> >  #include <asm/eeh.h>
> >  #endif
> > @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct
> vfio_device *device, u32 flags,
> >  	return 0;
> >  }
> >
> > +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> > +				      void __user *arg, size_t argsz,
> > +				      struct vfio_pci_tph_info **info) {
> > +	size_t minsz;
> > +
> > +	if (tph->count > VFIO_TPH_INFO_MAX)
> > +		return -EINVAL;
> > +	if (!tph->count)
> > +		return 0;
> > +
> > +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> > +	if (minsz < argsz)
> > +		return -EINVAL;
> > +
> > +	*info = memdup_user(arg, minsz);
> > +	if (IS_ERR(info))
> > +		return PTR_ERR(info);
> > +
> > +	return minsz;
> > +}
> > +
> > +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device *vdev,
> > +				      struct vfio_pci_tph *tph,
> > +				      void __user *arg, size_t argsz) {
> > +	int i, mtype, err = 0;
> > +	u32 cpu_uid;
> > +	struct vfio_pci_tph_info *info = NULL;
> > +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz, &info);
> > +
> > +	if (data_size <= 0)
> > +		return data_size;
> > +
> > +	for (i = 0; i < tph->count; i++) {
> > +		if (!(info[i].cpu_id < nr_cpu_ids && cpu_present(info[i].cpu_id)))
> > +{
> 
> Not intuitive logic, imo.  This could easily be:
> 
> 		if (info[i].cpu_id >= nr_cpu_ids || !cpu_present(info[i].cpu_id))
> 

Agree, looks more intuitive.

> > +			info[i].err = -EINVAL;
> > +			continue;
> > +		}
> > +		cpu_uid = topology_core_id(info[i].cpu_id);
> > +		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> > +			VFIO_TPH_MEM_TYPE_SHIFT;
> > +
> > +		/* processing hints are always ignored */
> > +		info[i].ph_ignore = 1;
> > +
> > +		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
> > +						  &info[i].st);
> > +		if (info[i].err)
> > +			continue;
> > +
> > +		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> > +			info[i].err = pcie_tph_set_st_entry(vdev->pdev,
> > +							    info[i].index,
> > +							    info[i].st);
> > +		}
> > +	}
> > +
> > +	if (copy_to_user(arg, info, data_size))
> > +		err = -EFAULT;
> > +
> > +	kfree(info);
> > +	return err;
> > +}
> > +
> > +
> > +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device *vdev,
> > +				       struct vfio_pci_tph *arg)
> > +{
> > +	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> > +
> > +	switch (mode) {
> > +	case VFIO_TPH_ST_NS_MODE:
> > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_NS_MODE);
> > +
> > +	case VFIO_TPH_ST_IV_MODE:
> > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_IV_MODE);
> > +
> > +	case VFIO_TPH_ST_DS_MODE:
> > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_DS_MODE);
> > +
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +
> > +}
> > +
> > +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device
> > +*vdev) {
> > +	pcie_disable_tph(vdev->pdev);
> > +	return 0;
> > +}
> > +
> > +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user *arg,
> > +					size_t argsz, u32 flags,
> > +					struct vfio_pci_tph *tph)
> > +{
> > +	u32 op;
> > +	int err = vfio_check_feature(flags, argsz,
> > +				 VFIO_DEVICE_FEATURE_SET |
> > +				 VFIO_DEVICE_FEATURE_GET,
> > +				 sizeof(struct vfio_pci_tph));
> > +	if (err != 1)
> > +		return err;
> 
> We don't seem to account for a host booted with pci=notph.
> 

pcie_enable_tph() looks for pci=notph and fails if it's set.
If pcie_enable_tph() fails, pcie_tph_get_cpu_st() and 
pcie_tph_set_st_entry() fail too.

Is it required to check pci=notph here as well?

> > +
> > +	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> > +		return -EFAULT;
> > +
> > +	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> > +
> > +	switch (op) {
> > +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> > +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> > +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> > +		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -EINVAL;
> > +
> > +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> > +		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -EINVAL;
> 
> This is a convoluted mangling of an ioctl into a vfio feature.
> 

Do you prefer to fold all four operations under a single ioctl? Or split them
such that enabling and disabling TPH remain a VFIO SET feature while
SET_ST and GET_ST move to a regular ioctl?

> > +
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +}
> > +
> > +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 flags,
> > +				     struct vfio_pci_tph __user *arg,
> > +				     size_t argsz)
> > +{
> > +	u32 op;
> > +	struct vfio_pci_tph tph;
> > +	void __user *uinfo;
> > +	size_t infosz;
> > +	struct vfio_pci_core_device *vdev =
> > +		container_of(device, struct vfio_pci_core_device, vdev);
> > +	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, &tph);
> > +
> > +	if (err)
> > +		return err;
> > +
> > +	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> > +
> > +	switch (op) {
> > +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> > +		return vfio_pci_feature_tph_enable(vdev, &tph);
> > +
> > +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> > +		return vfio_pci_feature_tph_disable(vdev);
> > +
> > +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> > +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> > +		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, info);
> > +		infosz = argsz - sizeof(struct vfio_pci_tph);
> > +		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, infosz);
> 
> This is effectively encoding a regular ioctl as a feature.  See below.
> 

Addressed this in the above comment.

> > +
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +}
> > +
> >  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> >  				void __user *arg, size_t argsz)
> >  {
> > @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device
> *device, u32 flags,
> >  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
> >  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> >  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> > +	case VFIO_DEVICE_FEATURE_PCI_TPH:
> > +		return vfio_pci_core_feature_tph(device, flags,
> > +						 arg, argsz);
> >  	default:
> >  		return -ENOTTY;
> >  	}
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index c8dbf8219c4f..608d57dfe279 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {  };
> > #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
> >
> > +/*
> > + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set
> > +steering tags
> > + * on the device. Data provided when setting this feature is a __u32
> > +with the
> > + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH
> > +in
> > + * no-steering-tag, interrupt-vector, or device-specific mode when
> > +feature flags
> > + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and
> VFIO_TPH_ST_DS_MODE
> > +are set
> > + * respectively.
> > + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> > + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at an
> > +index in
> > + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used
> > +and device
> > + * capabilities. The caller can set multiple steering tags by passing
> > +an array
> > + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
> > + * MSI-X/ST-table index. The caller can also set the intended memory
> > +type and
> > + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and
> > +VFIO_TPH_HINT_x flags,
> > + * respectively. The return value for each vfio_pci_tph_info object
> > +is stored in
> > + * err, with the steering-tag set on the device and the ph_ignore
> > +status bit
> > + * resulting from the steering-tag lookup operation. If err < 0, the
> > +values
> > + * stored in the st and ph_ignore fields should be considered invalid.
> > + *
> 
> Sorry, I don't understand ph_ignore as described here.  It's only ever set to 1.  I
> guess we assume the incoming state is zero.  But what does it mean?
> 

Agree, it's confusing, and it has to do with the current implementation
of TPH in the kernel.
The PCIe firmware specification states that the root-port ACPI _DSM is the single
source of truth for steering-tags. Its job is to maintain a table of steering-tags
indexed by a tuple of <CPU/Container UID, cache-id, processing-hint>. Each tuple is
mapped to two steering-tags, one for persistent and one for volatile memory (either
or both, depending on the platform).
A caller looking to acquire a steering tag for a cache closer to a CPU passes the
above tuple in the format defined in the PCIe firmware specification.
But the PCIe spec also allows implementing TPH without processing-hints; in such
cases the _DSM is supposed to set the ph_ignore bit in the return structure.
The current TPH implementation in the Linux kernel ignores some of these details:
it sets the cache-id and the processing-hint in the above tuple to zeros, and it
also ignores the ph_ignore bit returned by the _DSM.
However, I wrote this RFC to match the PCI firmware specification, so it sets the
ph_ignore bit to 1 to inform the caller that the hint was ignored, allowing the
caller to make an informed decision to stick with the default (bidirectional hint)
or exit with an error.
This is also why cache_level is ignored; ideally cache levels (0 for L1D, 1 for L2D,
etc.) should be converted to a PPTT cache_id and passed into the _DSM.
I'm working on a separate PCI patch to fix the above issues.

> > + * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the caller.
> > + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
> > + * The return values per vfio_pci_tph_info object are stored in the
> > + st,
> > + * ph_ignore, and err fields.
> 
> Why do we need to provide an interface to return the steering tags set by the
> user?  Seems like this could be a write-only, SET, interface.
> 

VFIO_DEVICE_FEATURE_TPH_GET_ST calls the _DSM to read the steering-tag
for a vector of <cpu-id, cache-id, processing-hint> tuples. It is for devices that
operate in device-specific mode, where they don't set steering-tags in the MSI-X or
ST tables but in another construct, such as a queue context accessible from user or
kernel space.
For instance, this is what will be used by DPDK poll-mode drivers, as [2] fleshes
out.

So, VFIO_DEVICE_FEATURE_TPH_GET_ST doesn't return steering-tags set by the
user. It's there to read the platform database of steering-tags, which is an ACPI
_DSM bound to each PCIe root port.

> > + */
> > +struct vfio_pci_tph_info {
> 
> This seems more of an _entry than an _info.  Note that vfio has various INFO
> ioctls which make this nomenclature confusing.
> 

It is, it's more of a struct that describes the request.
Perhaps vfio_pci_tph_request_descriptor, but that's too long. 
Suggestions are welcome.


> > +	/* in */
> > +	__u32 cpu_id;
> 
> The logical ID?
> 
Yes, this is the logical ID.

> > +	__u32 cache_level;
> 
> Zero based?  1-based?  How many cache levels are there?  What's valid here?
> 
It's zero-based. This is currently ignored due to limitations in the kernel TPH
implementation.
One way to validate it would be to iterate through the topology, decrementing
a copy of cache_level as we move further away from cpu_id until the value
reaches zero. If the loop terminates before the copy reaches zero, return -EINVAL.
Is that a good approach?
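
Something along these lines is what I had in mind (untested sketch; the helper
name is made up, and it leans on <linux/cacheinfo.h>, assuming one data/unified
leaf per level):

#include <linux/cacheinfo.h>
#include <linux/errno.h>

/* Hypothetical helper: is 'level' (zero-based, counting only data/unified
 * caches) a valid cache level for 'cpu'?
 */
static int vfio_pci_tph_check_cache_level(unsigned int cpu, u32 level)
{
	struct cpu_cacheinfo *cci = get_cpu_cacheinfo(cpu);
	unsigned int i, nr_data_levels = 0;

	if (!cci || !cci->info_list)
		return -ENOENT;

	for (i = 0; i < cci->num_leaves; i++) {
		struct cacheinfo *leaf = &cci->info_list[i];

		if (leaf->type == CACHE_TYPE_DATA ||
		    leaf->type == CACHE_TYPE_UNIFIED)
			nr_data_levels++;
	}

	return level < nr_data_levels ? 0 : -EINVAL;
}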

> > +	__u8  flags;
> > +#define VFIO_TPH_MEM_TYPE_MASK		0x1
> > +#define VFIO_TPH_MEM_TYPE_SHIFT		0
> > +#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request
> volatile memory ST */
> > +#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request
> persistent memory ST */
> 
> Is there a relation to the cache_level here?  Spec references here and below,
> assuming those are relevant to these values.
> 
There is no relation to the cache level. The return from the _DSM has PMEM and VMEM
sections; if either of those is not supported, its valid bit is set to 0 in the
return.

The PCIe spec has four processing-hints, and the VFIO_TPH_HINT_* flags are there to
set them in the flags field. Hints are forced to VFIO_TPH_HINT_BIDIR by the current
TPH implementation, as described above.

> > +
> > +#define VFIO_TPH_HINT_MASK		0x3
> > +#define VFIO_TPH_HINT_SHIFT		1
> > +#define VFIO_TPH_HINT_BIDIR		0
> > +#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
> > +#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
> > +#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)
> 
> There needs to be a __u8 padding in here somewhere or flags extended to
> __u16.
> 
> > +	__u16 index;			/* MSI-X/ST-table index to set ST */
> > +	/* out */
> > +	__u16 st;			/* Steering-Tag */
> 
> Sorry if I'm just not familiar with TPH, but why do we need to return the ST?
> Doesn't hardware make use of the ST index and the physical value gets applied
> automatically in HW?
> 

Device-specific mode requires this. For instance, the Intel E810 NIC can set
steering-tags in its queue context, which lives in user space when used with DPDK.
For such use cases we must return the steering-tags read from the _DSM.
Going back to the DPDK RFC mentioned here, once TPH is enabled through VFIO, the
E810 poll-mode driver in DPDK will ask the kernel for a steering-tag for a
combination of a cpu-id, a cache-level, and a processing-hint. In response, the
kernel will invoke the ACPI _DSM of the device's root port via pcie_tph_get_cpu_st()
and return the steering-tag to the user. The E810 driver will then set the returned
tag in the appropriate queue context.

For cases where the steering-tag is not required in user space (VMMs), the caller
asks the kernel to set the steering-tag that corresponds to the cpu-id, cache-level,
and PH combination at a given MSI-X vector entry or ST-table entry in config space.
In that case too, the kernel reads the steering-tag from the _DSM and sets it in the
corresponding MSI-X or ST-table entry.
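
Roughly, the DPDK side would look something like this (sketch only;
program_rxq_steering_tag() is a hypothetical PMD helper):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical PMD helper that writes the tag into the device's queue context. */
int program_rxq_steering_tag(__u16 qid, __u16 st);

static int fetch_and_program_st(int device_fd, __u32 cpu_id, __u16 qid)
{
	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_pci_tph) +
		 sizeof(struct vfio_pci_tph_info)] __attribute__((aligned(8)));
	struct vfio_device_feature *feat = (void *)buf;
	struct vfio_pci_tph *tph = (void *)feat->data;
	struct vfio_pci_tph_info *info = tph->info;

	memset(buf, 0, sizeof(buf));
	feat->argsz = sizeof(buf);
	feat->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PCI_TPH;
	tph->argsz  = sizeof(*tph) + sizeof(*info);
	tph->flags  = VFIO_DEVICE_FEATURE_TPH_GET_ST;
	tph->count  = 1;
	info[0].cpu_id      = cpu_id;	/* the CPU that polls this RX queue */
	info[0].cache_level = 0;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat) || info[0].err)
		return -1;

	/* In device-specific mode the driver, not the kernel, applies the tag. */
	return program_rxq_steering_tag(qid, info[0].st);
}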

> > +	__u8  ph_ignore;		/* Processing hint was ignored by */
> 
> Padding/alignment, same as above.  "ignored by..."  By what?  Is that an error?
> 

An error. "Processing hint was ignored by the platform"

> > +	__s32 err;			/* Error on getting/setting Steering-
> Tag*/
> > +};
> 
> Generally we'd expect a system call either works or fails.  Why do we need per
> entry error report?  Can't we validate and prepare the entire operation before
> setting any of it into the device?  Ultimately we're just writing hints to config
> space or MSI-X table space, so the write operation itself is not likely to be the
> point of failure.
> 

A write to MSI-X won't fail, but reading the steering-tag from the _DSM for an
invalid <cpu_id, cache_id, ph_hint> combo can fail in both the GET_ST and SET_ST
cases.
We can change this to an all-or-nothing interface, such that it succeeds only if all
the entries are valid and fails if at least one is invalid. But in that case the
caller may not know which entries were invalid. If you think that's OK, I can
change it.
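
For reference, the all-or-nothing variant would look roughly like this (untested
sketch, helper name invented; cpu_id validation and reporting of the failing index
are omitted):

static int vfio_pci_tph_set_all_or_nothing(struct vfio_pci_core_device *vdev,
					   struct vfio_pci_tph_info *info,
					   u32 count)
{
	u32 i;
	int err;

	/* Pass 1: resolve every tag via the root port _DSM; fail fast. */
	for (i = 0; i < count; i++) {
		u8 mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
			   VFIO_TPH_MEM_TYPE_SHIFT;

		err = pcie_tph_get_cpu_st(vdev->pdev, mtype,
					  topology_core_id(info[i].cpu_id),
					  &info[i].st);
		if (err)
			return err;	/* nothing written to the device yet */
	}

	/* Pass 2: program the MSI-X/ST-table entries (unlikely to fail). */
	for (i = 0; i < count; i++) {
		err = pcie_tph_set_st_entry(vdev->pdev, info[i].index,
					    info[i].st);
		if (err)
			return err;
	}

	return 0;
}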

> > +
> > +struct vfio_pci_tph {
> > +	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
> > +	__u32 flags;
> > +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
> > +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
> > +#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/*
> Enable TPH on device */
> > +#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH
> on device */
> > +#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get
> steering-tags */
> > +#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set
> steering-rags */
> 
> s/rags/tags/
> 
> vfio device features already have GET and SET as part of their base structure, why
> are they duplicated here?  It really seems like there are two separate features
> here, one that allows enabling with a given mode or disable, and another that
> allows writing specific steering tags.  Both seem like they could be SET-only
> features.  It's also not clear to me that there's a significant advantage to providing
> an array of steering tags.  Isn't updating STs an infrequent operation and likely
> bound to at most 2K tags in the case of MSI-X?
> 

VFIO_DEVICE_FEATURE_TPH_ENABLE and VFIO_DEVICE_FEATURE_TPH_DISABLE
are SET features.
Since VFIO_DEVICE_FEATURE_TPH_SET_ST requires writing to the ST table, it too
falls under SET features.
Therefore, these flags are only valid when the VFIO_DEVICE_FEATURE_SET flag is set.
The only GET use case here is VFIO_DEVICE_FEATURE_TPH_GET_ST, which reads the
steering-tags from the _DSM and returns them to the caller. It is only valid
with the VFIO_DEVICE_FEATURE_GET flag.
These are checked in vfio_pci_feature_tph_prepare().

As I mentioned earlier, does it make sense to leave enable/disable under a VFIO
feature and move GET_ST and SET_ST to a separate ioctl?

There isn't much rationale for providing an array of tuples other than following
the format of VFIO_DEVICE_SET_IRQS, which sets eventfds.

> > +
> > +#define	VFIO_TPH_ST_MODE_MASK	(0x3 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> > +#define	VFIO_TPH_ST_NS_MODE	(0 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> > +#define	VFIO_TPH_ST_IV_MODE	(1 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> > +#define	VFIO_TPH_ST_DS_MODE	(2 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> > +	__u32 count;				/* Number of entries in info[]
> */
> > +	struct vfio_pci_tph_info info[];
> > +#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed
> in info[] */
> 
> This seems to match the limit if the table is located in extended config space, but
> it's not particularly relevant if the table is located in MSI-X space.  Why is this a
> good limit?

Yes, for MSI-X the caller may have to invoke the ioctl multiple times if more
than 64 entries need to be updated. 

> 
> Also, try to keep these all in 80 column.  Thanks,
> 

I will try to keep these under 80 columns.

Thanks, Alex, for your input. Let me know what you think about moving SET_ST
and GET_ST to an ioctl and keeping ENABLE/DISABLE TPH as a vfio feature.

--Wathsala



* Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-02-21 22:46 [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl Wathsala Vithanage
  2025-02-28 18:48 ` Alex Williamson
@ 2025-03-03  7:49 ` Philipp Stanner
  2025-03-03 17:21   ` Wathsala Wathawana Vithanage
  2025-03-04 14:14 ` Jason Gunthorpe
  2025-06-02 23:06 ` [RFC PATCH v2 0/1] VFIO ioctl for TLP Processing Hints Wathsala Vithanage
  3 siblings, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-03-03  7:49 UTC (permalink / raw)
  To: Wathsala Vithanage, linux-kernel
  Cc: nd, Alex Williamson, Jason Gunthorpe, Kevin Tian, Yunxiang Li,
	Dr. David Alan Gilbert, Ankit Agrawal, open list:VFIO DRIVER

On Fri, 2025-02-21 at 22:46 +0000, Wathsala Vithanage wrote:
> Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature
> for
> direct cache injection. As described in the relevant patch set [1],
> direct cache injection in supported hardware allows optimal platform
> resource utilization for specific requests on the PCIe bus. This
> feature
> is currently available only for kernel device drivers. However,
> user space applications, especially those whose performance is
> sensitive
> to the latency of inbound writes as seen by a CPU core, may benefit
> from
> using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> application running in a VM).
> 
> This patch enables configuring of TPH from the user space via
> VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> drivers and VMMs to enable/disable the TPH feature on PCIe devices
> and
> set steering tags in MSI-X or steering-tag table entries using
> VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel
> using
> VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
> 
> [1] 
> lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
> [2] 
> inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com
> 
> Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 163
> +++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h        |  68 +++++++++++++
>  2 files changed, 231 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c
> b/drivers/vfio/pci/vfio_pci_core.c
> index 586e49efb81b..d6dd0495b08b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -29,6 +29,7 @@
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
>  #include <linux/iommufd.h>
> +#include <linux/pci-tph.h>
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct
> vfio_device *device, u32 flags,
>  	return 0;
>  }
>  
> +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t
> argsz,
> +				      struct vfio_pci_tph_info
> **info)
> +{
> +	size_t minsz;
> +
> +	if (tph->count > VFIO_TPH_INFO_MAX)
> +		return -EINVAL;
> +	if (!tph->count)
> +		return 0;
> +
> +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> +	if (minsz < argsz)
> +		return -EINVAL;
> +
> +	*info = memdup_user(arg, minsz);

You can use memdup_array_user() instead of the lines above. It does the
multiplication plus overflow check for you and will make your code more
compact.

> +	if (IS_ERR(info))
> +		return PTR_ERR(info);
> +
> +	return minsz;

see below…

> +}
> +
> +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device
> *vdev,
> +				      struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t
> argsz)
> +{
> +	int i, mtype, err = 0;
> +	u32 cpu_uid;
> +	struct vfio_pci_tph_info *info = NULL;
> +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz,
> &info);
> +
> +	if (data_size <= 0)
> +		return data_size;

So it seems you return here in case of an error. However, that would
result in a length of 0 being an error?

I would try to avoid returning 0 for an error whenever possible. That
breaks convention.

How about you return the result value of memdup_array_user() in
…uinfo_dup()?

The only thing I can't tell is whether tph->count == 0 should be
treated as an error. Maybe map it to -EINVAL?
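
Something like this, perhaps (untested, just to illustrate both points):

static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
				      void __user *arg, size_t argsz,
				      struct vfio_pci_tph_info **info)
{
	size_t len;

	if (!tph->count || tph->count > VFIO_TPH_INFO_MAX)
		return -EINVAL;

	/* require the user buffer to hold at least count entries */
	len = tph->count * sizeof(struct vfio_pci_tph_info);
	if (argsz < len)
		return -EINVAL;

	/* memdup_array_user() does the multiplication and overflow check */
	*info = memdup_array_user(arg, tph->count,
				  sizeof(struct vfio_pci_tph_info));
	if (IS_ERR(*info))
		return PTR_ERR(*info);

	return len;
}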


Regards,
P.

> +
> +	for (i = 0; i < tph->count; i++) {
> +		if (!(info[i].cpu_id < nr_cpu_ids &&
> cpu_present(info[i].cpu_id))) {
> +			info[i].err = -EINVAL;
> +			continue;
> +		}
> +		cpu_uid = topology_core_id(info[i].cpu_id);
> +		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> +			VFIO_TPH_MEM_TYPE_SHIFT;
> +
> +		/* processing hints are always ignored */
> +		info[i].ph_ignore = 1;
> +
> +		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype,
> cpu_uid,
> +						  &info[i].st);
> +		if (info[i].err)
> +			continue;
> +
> +		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> +			info[i].err = pcie_tph_set_st_entry(vdev-
> >pdev,
> +							   
> info[i].index,
> +							   
> info[i].st);
> +		}
> +	}
> +
> +	if (copy_to_user(arg, info, data_size))
> +		err = -EFAULT;
> +
> +	kfree(info);
> +	return err;
> +}
> +
> +
> +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device
> *vdev,
> +				       struct vfio_pci_tph *arg)
> +{
> +	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> +
> +	switch (mode) {
> +	case VFIO_TPH_ST_NS_MODE:
> +		return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_NS_MODE);
> +
> +	case VFIO_TPH_ST_IV_MODE:
> +		return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_IV_MODE);
> +
> +	case VFIO_TPH_ST_DS_MODE:
> +		return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_DS_MODE);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +}
> +
> +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device
> *vdev)
> +{
> +	pcie_disable_tph(vdev->pdev);
> +	return 0;
> +}
> +
> +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user
> *arg,
> +					size_t argsz, u32 flags,
> +					struct vfio_pci_tph *tph)
> +{
> +	u32 op;
> +	int err = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET |
> +				 VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(struct vfio_pci_tph));
> +	if (err != 1)
> +		return err;
> +
> +	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> +		return -EFAULT;
> +
> +	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -
> EINVAL;
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -
> EINVAL;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32
> flags,
> +				     struct vfio_pci_tph __user
> *arg,
> +				     size_t argsz)
> +{
> +	u32 op;
> +	struct vfio_pci_tph tph;
> +	void __user *uinfo;
> +	size_t infosz;
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device,
> vdev);
> +	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags,
> &tph);
> +
> +	if (err)
> +		return err;
> +
> +	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +		return vfio_pci_feature_tph_enable(vdev, &tph);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +		return vfio_pci_feature_tph_disable(vdev);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph,
> info);
> +		infosz = argsz - sizeof(struct vfio_pci_tph);
> +		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo,
> infosz);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32
> flags,
>  				void __user *arg, size_t argsz)
>  {
> @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct
> vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(device, flags, arg,
> argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(device, flags,
> arg, argsz);
> +	case VFIO_DEVICE_FEATURE_PCI_TPH:
> +		return vfio_pci_core_feature_tph(device, flags,
> +						 arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c8dbf8219c4f..608d57dfe279 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/*
> + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set steering tags
> + * on the device. Data provided when setting this feature is a __u32 with the
> + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH in
> + * no-steering-tag, interrupt-vector, or device-specific mode when feature flags
> + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE are set
> + * respectively.
> + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at an index in
> + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used and device
> + * capabilities. The caller can set multiple steering tags by passing an array
> + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
> + * MSI-X/ST-table index. The caller can also set the intended memory type and
> + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and VFIO_TPH_HINT_x flags,
> + * respectively. The return value for each vfio_pci_tph_info object is stored in
> + * err, with the steering-tag set on the device and the ph_ignore status bit
> + * resulting from the steering-tag lookup operation. If err < 0, the values
> + * stored in the st and ph_ignore fields should be considered invalid.
> + *
> + * Upon VFIO_DEVICE_FEATURE_GET, return steering tags to the caller.
> + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
> + * The return values per vfio_pci_tph_info object are stored in the st,
> + * ph_ignore, and err fields.
> + */
> +struct vfio_pci_tph_info {
> +	/* in */
> +	__u32 cpu_id;
> +	__u32 cache_level;
> +	__u8  flags;
> +#define VFIO_TPH_MEM_TYPE_MASK		0x1
> +#define VFIO_TPH_MEM_TYPE_SHIFT		0
> +#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request volatile memory ST */
> +#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request persistent memory ST */
> +
> +#define VFIO_TPH_HINT_MASK		0x3
> +#define VFIO_TPH_HINT_SHIFT		1
> +#define VFIO_TPH_HINT_BIDIR		0
> +#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)
> +	__u16 index;			/* MSI-X/ST-table index to set ST */
> +	/* out */
> +	__u16 st;			/* Steering-Tag */
> +	__u8  ph_ignore;		/* Processing hint was ignored by */
> +	__s32 err;			/* Error on getting/setting Steering-Tag*/
> +};
> +
> +struct vfio_pci_tph {
> +	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
> +	__u32 flags;
> +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
> +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
> +#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/* Enable TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get steering-tags */
> +#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set steering-rags */
> +
> +#define	VFIO_TPH_ST_MODE_MASK	(0x3 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_NS_MODE	(0 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_IV_MODE	(1 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_DS_MODE	(2 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +	__u32 count;				/* Number of entries in info[] */
> +	struct vfio_pci_tph_info info[];
> +#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed in info[] */
> +};
> +
> +#define VFIO_DEVICE_FEATURE_PCI_TPH 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**


^ permalink raw reply	[flat|nested] 24+ messages in thread
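
As a rough illustration of how the interface above is intended to be driven from
userspace: the sketch below assumes the uAPI is applied as posted, follows the usual
vfio_device_feature convention (feature header followed by the feature payload), and
omits most error handling. device_fd, the CPU number, and the MSI-X index are
caller-supplied placeholders.

/*
 * Userspace sketch: enable TPH in interrupt-vector mode, then have the
 * kernel look up and program a steering tag for one MSI-X table entry.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int tph_enable_iv_and_tag(int device_fd, __u32 cpu, __u16 msix_index)
{
	__u64 buf[(sizeof(struct vfio_device_feature) +
		   sizeof(struct vfio_pci_tph) +
		   sizeof(struct vfio_pci_tph_info) + 7) / 8];
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_pci_tph *tph = (void *)feature->data;

	/* 1. Enable TPH in interrupt-vector (MSI-X) mode. */
	memset(buf, 0, sizeof(buf));
	feature->argsz = sizeof(*feature) + sizeof(*tph);
	feature->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_PCI_TPH;
	tph->argsz = sizeof(*tph);
	tph->flags = VFIO_DEVICE_FEATURE_TPH_ENABLE | VFIO_TPH_ST_IV_MODE;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	/*
	 * 2. Ask the kernel for the steering tag of (cpu, L2, volatile
	 *    memory) and program it into one MSI-X table entry.
	 */
	memset(buf, 0, sizeof(buf));
	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_PCI_TPH;
	tph->argsz = sizeof(*tph) + sizeof(struct vfio_pci_tph_info);
	tph->flags = VFIO_DEVICE_FEATURE_TPH_SET_ST | VFIO_TPH_ST_IV_MODE;
	tph->count = 1;
	tph->info[0].cpu_id = cpu;
	tph->info[0].cache_level = 1;	/* L2, relative to cpu_id */
	tph->info[0].flags = VFIO_TPH_MEM_TYPE_VMEM | VFIO_TPH_HINT_BIDIR;
	tph->info[0].index = msix_index;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	return tph->info[0].err;	/* per-entry status, as defined by this RFC */
}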

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-03  7:49 ` Philipp Stanner
@ 2025-03-03 17:21   ` Wathsala Wathawana Vithanage
  0 siblings, 0 replies; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-03-03 17:21 UTC (permalink / raw)
  To: Philipp Stanner, linux-kernel@vger.kernel.org
  Cc: nd, Alex Williamson, Jason Gunthorpe, Kevin Tian, Yunxiang Li,
	Dr. David Alan Gilbert, Ankit Agrawal, open list:VFIO DRIVER,
	Honnappa Nagarahalli, Dhruv Tripathi

> > +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> > +				      void __user *arg, size_t
> > argsz,
> > +				      struct vfio_pci_tph_info
> > **info)
> > +{
> > +	size_t minsz;
> > +
> > +	if (tph->count > VFIO_TPH_INFO_MAX)
> > +		return -EINVAL;
> > +	if (!tph->count)
> > +		return 0;
> > +
> > +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> > +	if (minsz < argsz)
> > +		return -EINVAL;
> > +
> > +	*info = memdup_user(arg, minsz);
> 
> You can use memdup_array_user() instead of the lines above. It does the
> multiplication plus overflow check for you and will make your code more
> compact.
> 

Thank you, wasn't aware of it.

> > +	if (IS_ERR(info))
> > +		return PTR_ERR(info);
> > +
> > +	return minsz;
> 
> see below…
> 
> > +}
> > +
> > +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device
> > *vdev,
> > +				      struct vfio_pci_tph *tph,
> > +				      void __user *arg, size_t
> > argsz)
> > +{
> > +	int i, mtype, err = 0;
> > +	u32 cpu_uid;
> > +	struct vfio_pci_tph_info *info = NULL;
> > +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz,
> > &info);
> > +
> > +	if (data_size <= 0)
> > +		return data_size;
> 
> So it seems you return here in case of an error. However, that would result in a
> length of 0 being an error?
> 
> I would try to avoid to return 0 for an error whenever possible. That breaks
> convention.
> 
> How about you return the result value of memdup_array_user() in …uinfo_dup()?
> 
> The only thing I can't tell is whether tph->count == 0 should be treated as an
> error. Maybe map it to -EINVAL?
> 
> 

I wasn't sure before, but I lean towards -EINVAL now, so I will amend this to return -EINVAL.
I had thought of it as reading 0 bytes from a file descriptor, which returns 0.
But there is a difference: read() returns the number of bytes read, whereas this
function doesn't return the number of records.
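
For v2, the helper could end up looking roughly like the sketch below (the direction
discussed above: memdup_array_user() for the copy plus overflow check, and -EINVAL on
count == 0). It assumes argsz must cover count entries; this is not the final code.

static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
				      void __user *arg, size_t argsz,
				      struct vfio_pci_tph_info **info)
{
	size_t size = tph->count * sizeof(struct vfio_pci_tph_info);

	/* Reject empty, oversized, or under-sized requests up front. */
	if (!tph->count || tph->count > VFIO_TPH_INFO_MAX || argsz < size)
		return -EINVAL;

	*info = memdup_array_user(arg, tph->count,
				  sizeof(struct vfio_pci_tph_info));
	if (IS_ERR(*info))
		return PTR_ERR(*info);

	return size;	/* copied size or negative errno, never 0 */
}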


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-03  4:20   ` Wathsala Wathawana Vithanage
@ 2025-03-04  3:12     ` Alex Williamson
  2025-03-04  7:01       ` Wathsala Wathawana Vithanage
  2025-04-26 12:49       ` Wathsala Wathawana Vithanage
  0 siblings, 2 replies; 24+ messages in thread
From: Alex Williamson @ 2025-03-04  3:12 UTC (permalink / raw)
  To: Wathsala Wathawana Vithanage
  Cc: linux-kernel@vger.kernel.org, nd, Jason Gunthorpe, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Jeremy Linton,
	Honnappa Nagarahalli, Dhruv Tripathi

On Mon, 3 Mar 2025 04:20:24 +0000
Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com> wrote:

> Hi Alex,
> 
> > > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> > > direct cache injection. As described in the relevant patch set [1],
> > > direct cache injection in supported hardware allows optimal platform
> > > resource utilization for specific requests on the PCIe bus. This
> > > feature is currently available only for kernel device drivers.
> > > However, user space applications, especially those whose performance
> > > is sensitive to the latency of inbound writes as seen by a CPU core,
> > > may benefit from using this information (E.g., DPDK cache stashing RFC
> > > [2] or an HPC application running in a VM).
> > >
> > > This patch enables configuring of TPH from the user space via
> > > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> > > drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> > > set steering tags in MSI-X or steering-tag table entries using
> > > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel
> > > using VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.  
> > 
> > Unless I'm missing it, the RFC in [2] doesn't make use of this proposal.  Is there
> > published code anywhere that does use this interface?
> >   
> > > [1]
> > > lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
> > > [2]
> > > inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com
> > >  
> 
> The DPDK RFC in question is on hold for now because there is no way to get steering-tags
> from the user space. 
> Consequently, I had to stop working on [2] and start working on the kernel to get VFIO ready
> for it. This was discussed in a DPDK community call sometime back.
> https://mails.dpdk.org/archives/dev/2024-October/305408.html

I think the userspace and kernel support need to be co-developed, we
don't want to be adding kernel code and uAPIs that don't have users.
 
> > > Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
> > > ---
> > >  drivers/vfio/pci/vfio_pci_core.c | 163 +++++++++++++++++++++++++++++++
> > >  include/uapi/linux/vfio.h        |  68 +++++++++++++
> > >  2 files changed, 231 insertions(+)
> > >
> > > diff --git a/drivers/vfio/pci/vfio_pci_core.c
> > > b/drivers/vfio/pci/vfio_pci_core.c
> > > index 586e49efb81b..d6dd0495b08b 100644
> > > --- a/drivers/vfio/pci/vfio_pci_core.c
> > > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > > @@ -29,6 +29,7 @@
> > >  #include <linux/nospec.h>
> > >  #include <linux/sched/mm.h>
> > >  #include <linux/iommufd.h>
> > > +#include <linux/pci-tph.h>
> > >  #if IS_ENABLED(CONFIG_EEH)
> > >  #include <asm/eeh.h>
> > >  #endif
> > > @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct  
> > vfio_device *device, u32 flags,  
> > >  	return 0;
> > >  }
> > >
> > > +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> > > +				      void __user *arg, size_t argsz,
> > > +				      struct vfio_pci_tph_info **info) {
> > > +	size_t minsz;
> > > +
> > > +	if (tph->count > VFIO_TPH_INFO_MAX)
> > > +		return -EINVAL;
> > > +	if (!tph->count)
> > > +		return 0;
> > > +
> > > +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> > > +	if (minsz < argsz)
> > > +		return -EINVAL;
> > > +
> > > +	*info = memdup_user(arg, minsz);
> > > +	if (IS_ERR(info))
> > > +		return PTR_ERR(info);
> > > +
> > > +	return minsz;
> > > +}
> > > +
> > > +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device *vdev,
> > > +				      struct vfio_pci_tph *tph,
> > > +				      void __user *arg, size_t argsz) {
> > > +	int i, mtype, err = 0;
> > > +	u32 cpu_uid;
> > > +	struct vfio_pci_tph_info *info = NULL;
> > > +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz, &info);
> > > +
> > > +	if (data_size <= 0)
> > > +		return data_size;
> > > +
> > > +	for (i = 0; i < tph->count; i++) {
> > > +		if (!(info[i].cpu_id < nr_cpu_ids && cpu_present(info[i].cpu_id)))
> > > +{  
> > 
> > Not intuitive logic, imo.  This could easily be:
> > 
> > 		if (info[i].cpu_id >= nr_cpu_ids || !cpu_present(info[i].cpu_id))
> >   
> 
> Agree, looks more intuitive.
> 
> > > +			info[i].err = -EINVAL;
> > > +			continue;
> > > +		}
> > > +		cpu_uid = topology_core_id(info[i].cpu_id);
> > > +		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> > > +			VFIO_TPH_MEM_TYPE_SHIFT;
> > > +
> > > +		/* processing hints are always ignored */
> > > +		info[i].ph_ignore = 1;
> > > +
> > > +		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
> > > +						  &info[i].st);
> > > +		if (info[i].err)
> > > +			continue;
> > > +
> > > +		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> > > +			info[i].err = pcie_tph_set_st_entry(vdev->pdev,
> > > +							    info[i].index,
> > > +							    info[i].st);
> > > +		}
> > > +	}
> > > +
> > > +	if (copy_to_user(arg, info, data_size))
> > > +		err = -EFAULT;
> > > +
> > > +	kfree(info);
> > > +	return err;
> > > +}
> > > +
> > > +
> > > +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device *vdev,
> > > +				       struct vfio_pci_tph *arg)
> > > +{
> > > +	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> > > +
> > > +	switch (mode) {
> > > +	case VFIO_TPH_ST_NS_MODE:
> > > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_NS_MODE);
> > > +
> > > +	case VFIO_TPH_ST_IV_MODE:
> > > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_IV_MODE);
> > > +
> > > +	case VFIO_TPH_ST_DS_MODE:
> > > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_DS_MODE);
> > > +
> > > +	default:
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +}
> > > +
> > > +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device
> > > +*vdev) {
> > > +	pcie_disable_tph(vdev->pdev);
> > > +	return 0;
> > > +}
> > > +
> > > +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user *arg,
> > > +					size_t argsz, u32 flags,
> > > +					struct vfio_pci_tph *tph)
> > > +{
> > > +	u32 op;
> > > +	int err = vfio_check_feature(flags, argsz,
> > > +				 VFIO_DEVICE_FEATURE_SET |
> > > +				 VFIO_DEVICE_FEATURE_GET,
> > > +				 sizeof(struct vfio_pci_tph));
> > > +	if (err != 1)
> > > +		return err;  
> > 
> > We don't seem to account for a host booted with pci=notph.
> >   
> 
> pcie_enable_tph() looks for pci=notph and fails if it's set.
> If pcie_enable_tph() fails, pcie_tph_get_cpu_st() and 
> pcie_tph_set_st_entry() fail too.
> 
> Is it required to check pci=notph here as well?

Does it make sense to probe affirmatively for a feature that can't be
enabled?  I think I previously also overlooked that we never actually
check that the device supports TPH at probe time.
 
> > > +
> > > +	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> > > +		return -EFAULT;
> > > +
> > > +	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> > > +
> > > +	switch (op) {
> > > +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> > > +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> > > +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> > > +		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -EINVAL;
> > > +
> > > +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> > > +		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -EINVAL;  
> > 
> > This is a convoluted mangling of an ioctl into a vfio feature.
> >   
> 
> Do you prefer all four operations to fold under a single ioctl? Or split them
> such that enabling and disabling TPH remains a VFIO SET feature while
> SET_ST and GET_ST moved to a regular ioctl?

Splitting the functionality between a feature and a new ioctl doesn't
make any sense.  As I noted, I can imagine this interface being two
related features where one allows the TPH state to be set enabled or
disabled and the other allows setting steering tags.
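
A rough sketch of what such a split could look like in the uAPI; all names and feature
numbers below are placeholders for illustration, not anything posted in this thread:

struct vfio_device_feature_pci_tph_ctrl {	/* SET-only: enable/disable + mode */
	__u32 mode;		/* NS/IV/DS mode, or 0 to disable TPH */
};
#define VFIO_DEVICE_FEATURE_PCI_TPH_CTRL	11	/* placeholder */

struct vfio_device_feature_pci_tph_st {		/* SET-only: program steering tags */
	__u32 count;
	struct vfio_pci_tph_info entries[];
};
#define VFIO_DEVICE_FEATURE_PCI_TPH_ST		12	/* placeholder */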

> > > +
> > > +	default:
> > > +		return -EINVAL;
> > > +	}
> > > +}
> > > +
> > > +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 flags,
> > > +				     struct vfio_pci_tph __user *arg,
> > > +				     size_t argsz)
> > > +{
> > > +	u32 op;
> > > +	struct vfio_pci_tph tph;
> > > +	void __user *uinfo;
> > > +	size_t infosz;
> > > +	struct vfio_pci_core_device *vdev =
> > > +		container_of(device, struct vfio_pci_core_device, vdev);
> > > +	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, &tph);
> > > +
> > > +	if (err)
> > > +		return err;
> > > +
> > > +	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> > > +
> > > +	switch (op) {
> > > +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> > > +		return vfio_pci_feature_tph_enable(vdev, &tph);
> > > +
> > > +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> > > +		return vfio_pci_feature_tph_disable(vdev);
> > > +
> > > +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> > > +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> > > +		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, info);
> > > +		infosz = argsz - sizeof(struct vfio_pci_tph);
> > > +		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, infosz);  
> > 
> > This is effectively encoding a regular ioctl as a feature.  See below.
> >   
> 
> Addressed this in the above comment.
> 
> > > +
> > > +	default:
> > > +		return -EINVAL;
> > > +	}
> > > +}
> > > +
> > >  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> > >  				void __user *arg, size_t argsz)
> > >  {
> > > @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device  
> > *device, u32 flags,  
> > >  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
> > >  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> > >  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> > > +	case VFIO_DEVICE_FEATURE_PCI_TPH:
> > > +		return vfio_pci_core_feature_tph(device, flags,
> > > +						 arg, argsz);
> > >  	default:
> > >  		return -ENOTTY;
> > >  	}
> > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > index c8dbf8219c4f..608d57dfe279 100644
> > > --- a/include/uapi/linux/vfio.h
> > > +++ b/include/uapi/linux/vfio.h
> > > @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {  };
> > > #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
> > >
> > > +/*
> > > + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set
> > > +steering tags
> > > + * on the device. Data provided when setting this feature is a __u32
> > > +with the
> > > + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH
> > > +in
> > > + * no-steering-tag, interrupt-vector, or device-specific mode when
> > > +feature flags
> > > + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and  
> > VFIO_TPH_ST_DS_MODE  
> > > +are set
> > > + * respectively.
> > > + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> > > + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at an
> > > +index in
> > > + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used
> > > +and device
> > > + * capabilities. The caller can set multiple steering tags by passing
> > > +an array
> > > + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
> > > + * MSI-X/ST-table index. The caller can also set the intended memory
> > > +type and
> > > + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and
> > > +VFIO_TPH_HINT_x flags,
> > > + * respectively. The return value for each vfio_pci_tph_info object
> > > +is stored in
> > > + * err, with the steering-tag set on the device and the ph_ignore
> > > +status bit
> > > + * resulting from the steering-tag lookup operation. If err < 0, the
> > > +values
> > > + * stored in the st and ph_ignore fields should be considered invalid.
> > > + *  
> > 
> > Sorry, I don't understand ph_ignore as described here.  It's only ever set to 1.  I
> > guess we assume the incoming state is zero.  But what does it mean?
> >   
> 
> Agree, it's confusing and it has something to do with the current implementation
> of TPH in the Kernel. 
> PCIe firmware specification states root-port ACPI _DSM as the single source of
> truth for steering-tags. Its job is to maintain a table of steering-tags indexed by
> a tuple of CPU/Container UID, Cache-id and a processing-hint>. Each tuple is mapped
> to two steering-tags for persistent or volatile memory (either or both depending on
> the platform).
> A caller looking to acquire a steering tag for a cache closer to a CPU will pass above
> tuple in the format defined in the PCIe firmware specification. 
> But PCIe spec also allows implementing TPH without processing-hints, in such cases
> the _DSM is supposed to set ph_ignore bit in the return structure.
> Current TPH implementation in the Linux kernel ignores some of these details,
> it sets cache-id and the processing-hint in the above tuple to zeros. It also ignores the
> ph_ignore bit return by the _DSM as well. 
> However, I wrote this RFC to match the PCI firmware specification, therefore it sets
> ph_ignore bit to 1 to inform the caller that it is ignored so that caller can make an
> informed decision to stick with the default (bidirectional hint) or exit with error.
> This is why the cache_level is ignored as well, ideally cache levels (0 for L1D, 1 for L2D,
> Etc.) should be converted to a PPTT cache_id and passed into the _DSM.
> I'm working on a separate PCI patch to fix above issues.
> 
> > > + * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the caller.
> > > + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
> > > + * The return values per vfio_pci_tph_info object are stored in the
> > > + st,
> > > + * ph_ignore, and err fields.  
> > 
> > Why do we need to provide an interface to return the steering tags set by the
> > user?  Seems like this could be a write-only, SET, interface.
> >   
> 
> VFIO_DEVICE_FEATURE_TPH_GET_ST calls the _DSM to read the steering-tag
> for a vector of tuples of a cpu-id, cache-id, processing-hint. It is for devices that operate
> in device specific mode where they don't set steering-tags in the MSI-X or ST tables but
> in another construct like a queue context accessible from the user/kernel space. 
> For instance, this is what will be used by DPDK poll mode drivers as [2] fleshes
> out. 
> 
> So, VFIO_DEVICE_FEATURE_TPH_GET_ST doesn't return steering-tags set by the
> user. It's there to read the platform database of steering-tags which is an ACPI _DSM
> bound each PCIe root port.
> 
> > > + */
> > > +struct vfio_pci_tph_info {  
> > 
> > This seems more of an _entry than an _info.  Note that vfio has various INFO
> > ioctls which make this nomenclature confusing.
> >   
> 
> It is, it's more of a struct that describes the request.
> Perhaps vfio_pci_tph_request_descriptor, but that's too long. 
> Suggestions are welcome.
> 
> 
> > > +	/* in */
> > > +	__u32 cpu_id;  
> > 
> > The logical ID?
> >   
> Yes, this is logical ID.
> 
> > > +	__u32 cache_level;  
> > 
> > Zero based?  1-based?  How many cache levels are there?  What's valid here?
> >   
> It's Zero based. This is currently ignored due to limitations in the kernel TPH. 
> One way to validate would be iterating through the topology to find it, decrementing
> a copy_of_cache_level as we move further away from the cpu_id until the value
> reaches zero. If loop terminates before copy_of_cache_level reaching zero, -EINVAL.
> Is that a good approach?

Sounds like a try and hope for the best interface.  How would userspace
realistically infer the cache levels?  I don't think we want vfio
describing that.

> > > +	__u8  flags;
> > > +#define VFIO_TPH_MEM_TYPE_MASK		0x1
> > > +#define VFIO_TPH_MEM_TYPE_SHIFT		0
> > > +#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request  
> > volatile memory ST */  
> > > +#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request  
> > persistent memory ST */
> > 
> > Is there a relation to the cache_level here?  Spec references here and below,
> > assuming those are relevant to these values.
> >   
> There is no relation to cache level. Return from the _DSM has PMEM and VMEM
> sections. If either of those are not supported a valid bit is set to 0 in the return.
> 
> PCIe spec has four processing-hints, VFIO_TPH_HINT_* are there to set them
> in the flags field. Hints are forced to VFIO_TPH_HINT_BIDIR by current TPH
> implementation as I described above. 
> 
> > > +
> > > +#define VFIO_TPH_HINT_MASK		0x3
> > > +#define VFIO_TPH_HINT_SHIFT		1
> > > +#define VFIO_TPH_HINT_BIDIR		0
> > > +#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
> > > +#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
> > > +#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)  
> > 
> > There needs to be a __u8 padding in here somewhere or flags extended to
> > __u16.
> >   
> > > +	__u16 index;			/* MSI-X/ST-table index to set ST */
> > > +	/* out */
> > > +	__u16 st;			/* Steering-Tag */  
> > 
> > Sorry if I'm just not familiar with TPH, but why do we need to return the ST?
> > Doesn't hardware make use of the ST index and the physical value gets applied
> > automatically in HW?
> >   
> 
> Device specific mode requires this. For instance, intel E810 NIC can set
> Steering-tags in queue context which is in user-space when used with DPDK.
> For such use cases we must return steering-tags read from the _DSM. 
> Going back to DPDK RFC mentioned here, if TPH gets enabled in VFIO, the E810 
> poll mode driver in the DPDK will ask the kernel a steering-tag for a combination
> of a cpu-id, a cache-level and a processing-hint. In response the kernel will
> invoke the ACPI _DSM for the root port of the device via pcie_tph_get_cpu_st()
> and return the steering-tag back to the user. E810 driver will set the returned tag
> in appropriate queue context.
> 
> For cases where steering-tag is not required in user-space (VMMs), caller asks the
> kernel to set steering-tag that corresponds the cpu-id, cache-level, and PH
> combination at a given MSI-X vector entry or ST table in config space. For such
> cases too, the kernel will read the steering-tag from the _DSM and set the tag
> in the corresponding MSI-X or ST table entry.

We need to consider that there are vfio-pci variant drivers that
support migration and make use of the vfio-pci-core feature interface.
TPH programming appears to be very non-migration friendly, the
attributes are very specific to the current host system.  Should
migration and TPH be mutually exclusive?  This may be a factor in the
decision to use a feature or ioctl.

> > > +	__u8  ph_ignore;		/* Processing hint was ignored by */  
> > 
> > Padding/alignment, same as above.  "ignored by..."  By what?  Is that an error?
> >   
> 
> An error. "Processing hint was ignored by the platform"
> 
> > > +	__s32 err;			/* Error on getting/setting Steering-  
> > Tag*/  
> > > +};  
> > 
> > Generally we'd expect a system call either works or fails.  Why do we need per
> > entry error report?  Can't we validate and prepare the entire operation before
> > setting any of it into the device?  Ultimately we're just writing hints to config
> > space or MSI-X table space, so the write operation itself is not likely to be the
> > point of failure.
> >   
> 
> Write to MSI-X won't fail but reading the steering-tag from the _DSM for an
> invalid <cpu_id, cach_id, ph_hint> combo can fail in both GET_ST and SET_ST
> cases.
> We can change this to an all or nothing interface, such that success if all the
> entries are valid and otherwise if at least one is invalid. But in that case the
> caller may not know which entries were invalid. If you think that's ok, I can
> change it.

It's not exactly uncommon for an ioctl to fail for a single bad arg
among many.  I think it's harder for userspace to deal with recovering
from a partially implemented call.

> > > +
> > > +struct vfio_pci_tph {
> > > +	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
> > > +	__u32 flags;
> > > +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
> > > +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
> > > +#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/*  
> > Enable TPH on device */  
> > > +#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH  
> > on device */  
> > > +#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get  
> > steering-tags */  
> > > +#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set  
> > steering-rags */
> > 
> > s/rags/tags/
> > 
> > vfio device features already have GET and SET as part of their base structure, why
> > are they duplicated here?  It really seems like there are two separate features
> > here, one that allows enabling with a given mode or disable, and another that
> > allows writing specific steering tags.  Both seem like they could be SET-only
> > features.  It's also not clear to me that there's a significant advantage to providing
> > an array of steering tags.  Isn't updating STs an infrequent operation and likely
> > bound to at most 2K tags in the case of MSI-X?
> >   
> 
> VFIO_DEVICE_FEATURE_TPH_ENABLE and VFIO_DEVICE_FEATURE_TPH_DISABLE
> are SET features.
> Since, VFIO_DEVICE_FEATURE_TPH_SET_ST requires writing to ST table, so that too
> falls under SET features.
> Therefore, these flags are only valid when VFIO_DEVICE_FEATURE_SET flag is set.
> The only GET features use case here is VFIO_DEVICE_FEATURE_TPH_GET_ST that
> reads the steering-tags from the _DSM and returns it back to caller. Only valid
> with VFIO_DEVICE_FEATURE_GET flag. 
> These are checked in vfio_pci_feature_tph_prepare().
> 
> As I mentioned earlier, does it make sense to leave enable/disable under VFIO
> Feature and move GET_ST and SET_ST to a separate ioctl?

Splitting TPH between a feature and new ioctls doesn't seem like a
logical interface.

> There isn't much rational to providing an array of tuples other than following
> the format VFIO_DEVICE_SET_IRQS that sets eventfds.
> 
> > > +
> > > +#define	VFIO_TPH_ST_MODE_MASK	(0x3 <<  
> > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)  
> > > +#define	VFIO_TPH_ST_NS_MODE	(0 <<  
> > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)  
> > > +#define	VFIO_TPH_ST_IV_MODE	(1 <<  
> > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)  
> > > +#define	VFIO_TPH_ST_DS_MODE	(2 <<  
> > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)  
> > > +	__u32 count;				/* Number of entries in info[]  
> > */  
> > > +	struct vfio_pci_tph_info info[];
> > > +#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed  
> > in info[] */
> > 
> > This seems to match the limit if the table is located in extended config space, but
> > it's not particularly relevant if the table is located in MSI-X space.  Why is this a
> > good limit?  
> 
> Yes, for MSI-X the caller may have to invoke the ioctl multiple times if more
> than 64 entries need to be updated. 
> 
> > 
> > Also, try to keep these all in 80 column.  Thanks,
> >   
> 
> I will, try to keep these under 80 columns.
> 
> Thanks Alex, for your input. Let me know what you think about moving SET_ST
> and GET_ST to an ioctl and keep ENABLE/DISABLE TPH as a vfio feature.

It seems like kind of a hodgepodge to split the interface like that,
imo.  Sorry for a less than complete reply, I'm on PTO this week.
Thanks,

Alex


^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-04  3:12     ` Alex Williamson
@ 2025-03-04  7:01       ` Wathsala Wathawana Vithanage
  2025-04-26 12:49       ` Wathsala Wathawana Vithanage
  1 sibling, 0 replies; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-03-04  7:01 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-kernel@vger.kernel.org, nd, Jason Gunthorpe, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Jeremy Linton,
	Honnappa Nagarahalli, Dhruv Tripathi

> Subject: Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
> 
> On Mon, 3 Mar 2025 04:20:24 +0000
> Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com> wrote:
> 
> > Hi Alex,
> >
> > > > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature
> > > > for direct cache injection. As described in the relevant patch set
> > > > [1], direct cache injection in supported hardware allows optimal
> > > > platform resource utilization for specific requests on the PCIe
> > > > bus. This feature is currently available only for kernel device drivers.
> > > > However, user space applications, especially those whose
> > > > performance is sensitive to the latency of inbound writes as seen
> > > > by a CPU core, may benefit from using this information (E.g., DPDK
> > > > cache stashing RFC [2] or an HPC application running in a VM).
> > > >
> > > > This patch enables configuring of TPH from the user space via
> > > > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> > > > drivers and VMMs to enable/disable the TPH feature on PCIe devices
> > > > and set steering tags in MSI-X or steering-tag table entries using
> > > > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel
> > > > using VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
> > >
> > > Unless I'm missing it, the RFC in [2] doesn't make use of this
> > > proposal.  Is there published code anywhere that does use this interface?
> > >
> > > > [1]
> > > > lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.c
> > > > om
> > > > [2]
> > > > inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.
> > > > com
> > > >
> >
> > The DPDK RFC in question is on hold for now because there is no way to
> > get steering-tags from the user space.
> > Consequently, I had to stop working on [2] and start working on the
> > kernel to get VFIO ready for it. This was discussed in a DPDK community call
> sometime back.
> > https://mails.dpdk.org/archives/dev/2024-October/305408.html
> 
> I think the userspace and kernel support need to be co-developed, we don't want
> to be adding kernel code and uAPIs that don't have users.
> 


I will update the RFC in [2] along with a v2 of this RFC to use the uAPI discussed here.


> > > > Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
> > > > ---
> > > >  drivers/vfio/pci/vfio_pci_core.c | 163
> +++++++++++++++++++++++++++++++
> > > >  include/uapi/linux/vfio.h        |  68 +++++++++++++
> > > >  2 files changed, 231 insertions(+)
> > > >
> > > > diff --git a/drivers/vfio/pci/vfio_pci_core.c
> > > > b/drivers/vfio/pci/vfio_pci_core.c
> > > > index 586e49efb81b..d6dd0495b08b 100644
> > > > --- a/drivers/vfio/pci/vfio_pci_core.c
> > > > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > > > @@ -29,6 +29,7 @@
> > > >  #include <linux/nospec.h>
> > > >  #include <linux/sched/mm.h>
> > > >  #include <linux/iommufd.h>
> > > > +#include <linux/pci-tph.h>
> > > >  #if IS_ENABLED(CONFIG_EEH)
> > > >  #include <asm/eeh.h>
> > > >  #endif
> > > > @@ -1510,6 +1511,165 @@ static int
> > > > vfio_pci_core_feature_token(struct
> > > vfio_device *device, u32 flags,
> > > >  	return 0;
> > > >  }
> > > >
> > > > +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> > > > +				      void __user *arg, size_t argsz,
> > > > +				      struct vfio_pci_tph_info **info) {
> > > > +	size_t minsz;
> > > > +
> > > > +	if (tph->count > VFIO_TPH_INFO_MAX)
> > > > +		return -EINVAL;
> > > > +	if (!tph->count)
> > > > +		return 0;
> > > > +
> > > > +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> > > > +	if (minsz < argsz)
> > > > +		return -EINVAL;
> > > > +
> > > > +	*info = memdup_user(arg, minsz);
> > > > +	if (IS_ERR(info))
> > > > +		return PTR_ERR(info);
> > > > +
> > > > +	return minsz;
> > > > +}
> > > > +
> > > > +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device *vdev,
> > > > +				      struct vfio_pci_tph *tph,
> > > > +				      void __user *arg, size_t argsz) {
> > > > +	int i, mtype, err = 0;
> > > > +	u32 cpu_uid;
> > > > +	struct vfio_pci_tph_info *info = NULL;
> > > > +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz,
> > > > +&info);
> > > > +
> > > > +	if (data_size <= 0)
> > > > +		return data_size;
> > > > +
> > > > +	for (i = 0; i < tph->count; i++) {
> > > > +		if (!(info[i].cpu_id < nr_cpu_ids &&
> > > > +cpu_present(info[i].cpu_id))) {
> > >
> > > Not intuitive logic, imo.  This could easily be:
> > >
> > > 		if (info[i].cpu_id >= nr_cpu_ids || !cpu_present(info[i].cpu_id))
> > >
> >
> > Agree, looks more intuitive.
> >
> > > > +			info[i].err = -EINVAL;
> > > > +			continue;
> > > > +		}
> > > > +		cpu_uid = topology_core_id(info[i].cpu_id);
> > > > +		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> > > > +			VFIO_TPH_MEM_TYPE_SHIFT;
> > > > +
> > > > +		/* processing hints are always ignored */
> > > > +		info[i].ph_ignore = 1;
> > > > +
> > > > +		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
> > > > +						  &info[i].st);
> > > > +		if (info[i].err)
> > > > +			continue;
> > > > +
> > > > +		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> > > > +			info[i].err = pcie_tph_set_st_entry(vdev->pdev,
> > > > +							    info[i].index,
> > > > +							    info[i].st);
> > > > +		}
> > > > +	}
> > > > +
> > > > +	if (copy_to_user(arg, info, data_size))
> > > > +		err = -EFAULT;
> > > > +
> > > > +	kfree(info);
> > > > +	return err;
> > > > +}
> > > > +
> > > > +
> > > > +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device *vdev,
> > > > +				       struct vfio_pci_tph *arg) {
> > > > +	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> > > > +
> > > > +	switch (mode) {
> > > > +	case VFIO_TPH_ST_NS_MODE:
> > > > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_NS_MODE);
> > > > +
> > > > +	case VFIO_TPH_ST_IV_MODE:
> > > > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_IV_MODE);
> > > > +
> > > > +	case VFIO_TPH_ST_DS_MODE:
> > > > +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_DS_MODE);
> > > > +
> > > > +	default:
> > > > +		return -EINVAL;
> > > > +	}
> > > > +
> > > > +}
> > > > +
> > > > +static int vfio_pci_feature_tph_disable(struct
> > > > +vfio_pci_core_device
> > > > +*vdev) {
> > > > +	pcie_disable_tph(vdev->pdev);
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user *arg,
> > > > +					size_t argsz, u32 flags,
> > > > +					struct vfio_pci_tph *tph)
> > > > +{
> > > > +	u32 op;
> > > > +	int err = vfio_check_feature(flags, argsz,
> > > > +				 VFIO_DEVICE_FEATURE_SET |
> > > > +				 VFIO_DEVICE_FEATURE_GET,
> > > > +				 sizeof(struct vfio_pci_tph));
> > > > +	if (err != 1)
> > > > +		return err;
> > >
> > > We don't seem to account for a host booted with pci=notph.
> > >
> >
> > pcie_enable_tph() looks for pci=notph and fails if it's set.
> > If pcie_enable_tph() fails, pcie_tph_get_cpu_st() and
> > pcie_tph_set_st_entry() fail too.
> >
> > Is it required to check pci=notph here as well?
> 
> Does it make sense to probe affirmatively for a feature that can't be enabled?  I
> think I previously also overlooked that we never actually check that the device
> supports TPH at probe time.
>

pci_init_capabilities() calls pci_tph_init(), so my understanding is that the device is
already probed and pdev->tph_cap is set by the time vfio is initialized.
pcie_enable_tph() checks both pdev->tph_cap and pci=notph before enabling.
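
For illustration, an affirmative probe-time check could look roughly like the sketch
below. It assumes the pdev->tph_cap field as it exists under CONFIG_PCIE_TPH in v6.13
and is not part of this patch:

#ifdef CONFIG_PCIE_TPH
static bool vfio_pci_tph_capable(struct vfio_pci_core_device *vdev)
{
	/*
	 * Non-zero tph_cap means pci_tph_init() found the TPH extended
	 * capability at enumeration time; pcie_enable_tph() additionally
	 * honors pci=notph when the feature is actually enabled.
	 */
	return vdev->pdev->tph_cap != 0;
}
#else
static bool vfio_pci_tph_capable(struct vfio_pci_core_device *vdev)
{
	return false;
}
#endif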


> > > > +
> > > > +	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> > > > +		return -EFAULT;
> > > > +
> > > > +	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> > > > +
> > > > +	switch (op) {
> > > > +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> > > > +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> > > > +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> > > > +		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -EINVAL;
> > > > +
> > > > +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> > > > +		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -EINVAL;
> > >
> > > This is a convoluted mangling of an ioctl into a vfio feature.
> > >
> >
> > Do you prefer all four operations to fold under a single ioctl? Or
> > split them such that enabling and disabling TPH remains a VFIO SET
> > feature while SET_ST and GET_ST moved to a regular ioctl?
> 
> Splitting the functionality between a feature and a new ioctl doesn't make any
> sense.  As I noted, I can imagine this interface being two related features where
> one allows the TPH state to be set enabled or disabled and the other allows
> setting steering tags.
> 

Understood. I will change this in V2.

> > > > +
> > > > +	default:
> > > > +		return -EINVAL;
> > > > +	}
> > > > +}
> > > > +
> > > > +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 flags,
> > > > +				     struct vfio_pci_tph __user *arg,
> > > > +				     size_t argsz)
> > > > +{
> > > > +	u32 op;
> > > > +	struct vfio_pci_tph tph;
> > > > +	void __user *uinfo;
> > > > +	size_t infosz;
> > > > +	struct vfio_pci_core_device *vdev =
> > > > +		container_of(device, struct vfio_pci_core_device, vdev);
> > > > +	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, &tph);
> > > > +
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > +	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> > > > +
> > > > +	switch (op) {
> > > > +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> > > > +		return vfio_pci_feature_tph_enable(vdev, &tph);
> > > > +
> > > > +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> > > > +		return vfio_pci_feature_tph_disable(vdev);
> > > > +
> > > > +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> > > > +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> > > > +		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, info);
> > > > +		infosz = argsz - sizeof(struct vfio_pci_tph);
> > > > +		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, infosz);
> > >
> > > This is effectively encoding a regular ioctl as a feature.  See below.
> > >
> >
> > Addressed this in the above comment.
> >
> > > > +
> > > > +	default:
> > > > +		return -EINVAL;
> > > > +	}
> > > > +}
> > > > +
> > > >  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> > > >  				void __user *arg, size_t argsz)  { @@ -1523,6
> +1683,9 @@ int
> > > > vfio_pci_core_ioctl_feature(struct vfio_device
> > > *device, u32 flags,
> > > >  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
> > > >  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> > > >  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> > > > +	case VFIO_DEVICE_FEATURE_PCI_TPH:
> > > > +		return vfio_pci_core_feature_tph(device, flags,
> > > > +						 arg, argsz);
> > > >  	default:
> > > >  		return -ENOTTY;
> > > >  	}
> > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > > index c8dbf8219c4f..608d57dfe279 100644
> > > > --- a/include/uapi/linux/vfio.h
> > > > +++ b/include/uapi/linux/vfio.h
> > > > @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {  };
> > > > #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
> > > >
> > > > +/*
> > > > + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or
> > > > +set steering tags
> > > > + * on the device. Data provided when setting this feature is a
> > > > +__u32 with the
> > > > + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe
> > > > +TPH in
> > > > + * no-steering-tag, interrupt-vector, or device-specific mode
> > > > +when feature flags
> > > > + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and
> > > VFIO_TPH_ST_DS_MODE
> > > > +are set
> > > > + * respectively.
> > > > + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> > > > + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device
> > > > +at an index in
> > > > + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag
> > > > +used and device
> > > > + * capabilities. The caller can set multiple steering tags by
> > > > +passing an array
> > > > + * of vfio_pci_tph_info objects containing cpu_id, cache_level,
> > > > +and
> > > > + * MSI-X/ST-table index. The caller can also set the intended
> > > > +memory type and
> > > > + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and
> > > > +VFIO_TPH_HINT_x flags,
> > > > + * respectively. The return value for each vfio_pci_tph_info
> > > > +object is stored in
> > > > + * err, with the steering-tag set on the device and the ph_ignore
> > > > +status bit
> > > > + * resulting from the steering-tag lookup operation. If err < 0,
> > > > +the values
> > > > + * stored in the st and ph_ignore fields should be considered invalid.
> > > > + *
> > >
> > > Sorry, I don't understand ph_ignore as described here.  It's only
> > > ever set to 1.  I guess we assume the incoming state is zero.  But what does it
> mean?
> > >
> >
> > Agree, it's confusing and it has something to do with the current
> > implementation of TPH in the Kernel.
> > PCIe firmware specification states root-port ACPI _DSM as the single
> > source of truth for steering-tags. Its job is to maintain a table of
> > steering-tags indexed by a tuple of CPU/Container UID, Cache-id and a
> > processing-hint>. Each tuple is mapped to two steering-tags for
> > persistent or volatile memory (either or both depending on the platform).
> > A caller looking to acquire a steering tag for a cache closer to a CPU
> > will pass above tuple in the format defined in the PCIe firmware specification.
> > But PCIe spec also allows implementing TPH without processing-hints,
> > in such cases the _DSM is supposed to set ph_ignore bit in the return structure.
> > Current TPH implementation in the Linux kernel ignores some of these
> > details, it sets cache-id and the processing-hint in the above tuple
> > to zeros. It also ignores the ph_ignore bit return by the _DSM as well.
> > However, I wrote this RFC to match the PCI firmware specification,
> > therefore it sets ph_ignore bit to 1 to inform the caller that it is
> > ignored so that caller can make an informed decision to stick with the default
> (bidirectional hint) or exit with error.
> > This is why the cache_level is ignored as well, ideally cache levels
> > (0 for L1D, 1 for L2D,
> > Etc.) should be converted to a PPTT cache_id and passed into the _DSM.
> > I'm working on a separate PCI patch to fix above issues.
> >
> > > > + * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the caller.
> > > > + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
> > > > + * The return values per vfio_pci_tph_info object are stored in
> > > > + the st,
> > > > + * ph_ignore, and err fields.
> > >
> > > Why do we need to provide an interface to return the steering tags
> > > set by the user?  Seems like this could be a write-only, SET, interface.
> > >
> >
> > VFIO_DEVICE_FEATURE_TPH_GET_ST calls the _DSM to read the steering-tag
> > for a vector of tuples of a cpu-id, cache-id, processing-hint. It is
> > for devices that operate in device specific mode where they don't set
> > steering-tags in the MSI-X or ST tables but in another construct like a queue
> context accessible from the user/kernel space.
> > For instance, this is what will be used by DPDK poll mode drivers as
> > [2] fleshes out.
> >
> > So, VFIO_DEVICE_FEATURE_TPH_GET_ST doesn't return steering-tags set by
> > the user. It's there to read the platform database of steering-tags
> > which is an ACPI _DSM bound each PCIe root port.
> >
> > > > + */
> > > > +struct vfio_pci_tph_info {
> > >
> > > This seems more of an _entry than an _info.  Note that vfio has
> > > various INFO ioctls which make this nomenclature confusing.
> > >
> >
> > It is, it's more of a struct that describes the request.
> > Perhaps vfio_pci_tph_request_descriptor, but that's too long.
> > Suggestions are welcome.
> >
> >
> > > > +	/* in */
> > > > +	__u32 cpu_id;
> > >
> > > The logical ID?
> > >
> > Yes, this is logical ID.
> >
> > > > +	__u32 cache_level;
> > >
> > > Zero based?  1-based?  How many cache levels are there?  What's valid here?
> > >
> > It's Zero based. This is currently ignored due to limitations in the kernel TPH.
> > One way to validate would be iterating through the topology to find
> > it, decrementing a copy_of_cache_level as we move further away from
> > the cpu_id until the value reaches zero. If loop terminates before
> copy_of_cache_level reaching zero, -EINVAL.
> > Is that a good approach?
> 
> Sounds like a try and hope for the best interface.  How would userspace
> realistically infer the cache levels?  I don't think we want vfio describing that.
> 

The user sees each cache level relative to a logical CPU.
For instance, cpu_id = 0 with cache_level = 0 means the L1D of logical CPU 0, and
cpu_id = 0 with cache_level = 1 means the L2D of logical CPU 0.
Likewise, the further a cache is from the L1D, the higher its cache_level.
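
For example (illustrative only, using the fields proposed in this RFC), a request for
the steering tag of the L2 cache of logical CPU 0, volatile memory, bidirectional hint,
would be encoded as:

	struct vfio_pci_tph_info info = {
		.cpu_id      = 0,	/* logical CPU 0 */
		.cache_level = 1,	/* 0 = L1D, 1 = L2D, ... relative to cpu_id */
		.flags       = VFIO_TPH_MEM_TYPE_VMEM | VFIO_TPH_HINT_BIDIR,
		.index       = 0,	/* MSI-X/ST-table slot to program (SET_ST only) */
	};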

> > > > +	__u8  flags;
> > > > +#define VFIO_TPH_MEM_TYPE_MASK		0x1
> > > > +#define VFIO_TPH_MEM_TYPE_SHIFT		0
> > > > +#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request
> > > volatile memory ST */
> > > > +#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request
> > > persistent memory ST */
> > >
> > > Is there a relation to the cache_level here?  Spec references here
> > > and below, assuming those are relevant to these values.
> > >
> > There is no relation to cache level. Return from the _DSM has PMEM and
> > VMEM sections. If either of those are not supported a valid bit is set to 0 in the
> return.
> >
> > PCIe spec has four processing-hints, VFIO_TPH_HINT_* are there to set
> > them in the flags field. Hints are forced to VFIO_TPH_HINT_BIDIR by
> > current TPH implementation as I described above.
> >
> > > > +
> > > > +#define VFIO_TPH_HINT_MASK		0x3
> > > > +#define VFIO_TPH_HINT_SHIFT		1
> > > > +#define VFIO_TPH_HINT_BIDIR		0
> > > > +#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
> > > > +#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
> > > > +#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)
> > >
> > > There needs to be a __u8 padding in here somewhere or flags extended
> > > to __u16.
> > >
> > > > +	__u16 index;			/* MSI-X/ST-table index to set ST */
> > > > +	/* out */
> > > > +	__u16 st;			/* Steering-Tag */
> > >
> > > Sorry if I'm just not familiar with TPH, but why do we need to return the ST?
> > > Doesn't hardware make use of the ST index and the physical value
> > > gets applied automatically in HW?
> > >
> >
> > Device specific mode requires this. For instance, intel E810 NIC can
> > set Steering-tags in queue context which is in user-space when used with DPDK.
> > For such use cases we must return steering-tags read from the _DSM.
> > Going back to DPDK RFC mentioned here, if TPH gets enabled in VFIO,
> > the E810 poll mode driver in the DPDK will ask the kernel a
> > steering-tag for a combination of a cpu-id, a cache-level and a
> > processing-hint. In response the kernel will invoke the ACPI _DSM for
> > the root port of the device via pcie_tph_get_cpu_st() and return the
> > steering-tag back to the user. E810 driver will set the returned tag in
> appropriate queue context.
> >
> > For cases where steering-tag is not required in user-space (VMMs),
> > caller asks the kernel to set steering-tag that corresponds the
> > cpu-id, cache-level, and PH combination at a given MSI-X vector entry
> > or ST table in config space. For such cases too, the kernel will read
> > the steering-tag from the _DSM and set the tag in the corresponding MSI-X or
> ST table entry.
> 
> We need to consider that there are vfio-pci variant drivers that support migration
> and make use of the vfio-pci-core feature interface.
> TPH programming appears to be very non-migration friendly, the attributes are
> very specific to the current host system.  Should migration and TPH be mutually
> exclusive?  This may be a factor in the decision to use a feature or ioctl.
> 

I'm not very familiar with migration, so I don't have a good answer here.
By virtualizing STs, VMMs can provide consistent ST assignments for a vCPU and a
cache level relative to that vCPU, but there is no guarantee that the same processing
hint will be available on the next platform. One solution would be to allow only the
bidirectional hint for VMs (which is cache injection with an ST only, and works on
everything that supports TPH).

For virtualization in general, I think platform STs should not be exposed to VMs, nor
should VMs be allowed to set arbitrary STs on devices, to prevent VMs from figuring out
the platform topology and injecting into caches of physical CPUs they are not supposed
to run on. So, I believe VMMs need some additional features to support TPH securely anyway.

> > > > +	__u8  ph_ignore;		/* Processing hint was ignored by */
> > >
> > > Padding/alignment, same as above.  "ignored by..."  By what?  Is that an
> error?
> > >
> >
> > An error. "Processing hint was ignored by the platform"
> >
> > > > +	__s32 err;			/* Error on getting/setting Steering-
> > > Tag*/
> > > > +};
> > >
> > > Generally we'd expect a system call either works or fails.  Why do
> > > we need per entry error report?  Can't we validate and prepare the
> > > entire operation before setting any of it into the device?
> > > Ultimately we're just writing hints to config space or MSI-X table
> > > space, so the write operation itself is not likely to be the point of failure.
> > >
> >
> > Write to MSI-X won't fail but reading the steering-tag from the _DSM
> > for an invalid <cpu_id, cach_id, ph_hint> combo can fail in both
> > GET_ST and SET_ST cases.
> > We can change this to an all or nothing interface, such that success
> > if all the entries are valid and otherwise if at least one is invalid.
> > But in that case the caller may not know which entries were invalid.
> > If you think that's ok, I can change it.
> 
> It's not exactly uncommon for an ioctl to fail for a single bad arg among many.  I
> think it's harder for userspace to deal with recovering from a partially
> implemented call.
> 

I will change this in v2.

> > > > +
> > > > +struct vfio_pci_tph {
> > > > +	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
> > > > +	__u32 flags;
> > > > +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
> > > > +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
> > > > +#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/*
> > > Enable TPH on device */
> > > > +#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH
> > > on device */
> > > > +#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get
> > > steering-tags */
> > > > +#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set
> > > steering-rags */
> > >
> > > s/rags/tags/
> > >
> > > vfio device features already have GET and SET as part of their base
> > > structure, why are they duplicated here?  It really seems like there
> > > are two separate features here, one that allows enabling with a
> > > given mode or disable, and another that allows writing specific
> > > steering tags.  Both seem like they could be SET-only features.
> > > It's also not clear to me that there's a significant advantage to
> > > providing an array of steering tags.  Isn't updating STs an infrequent operation
> and likely bound to at most 2K tags in the case of MSI-X?
> > >
> >
> > VFIO_DEVICE_FEATURE_TPH_ENABLE and
> VFIO_DEVICE_FEATURE_TPH_DISABLE are
> > SET features.
> > Since, VFIO_DEVICE_FEATURE_TPH_SET_ST requires writing to ST table, so
> > that too falls under SET features.
> > Therefore, these flags are only valid when VFIO_DEVICE_FEATURE_SET flag is
> set.
> > The only GET features use case here is VFIO_DEVICE_FEATURE_TPH_GET_ST
> > that reads the steering-tags from the _DSM and returns it back to
> > caller. Only valid with VFIO_DEVICE_FEATURE_GET flag.
> > These are checked in vfio_pci_feature_tph_prepare().
> >
> > As I mentioned earlier, does it make sense to leave enable/disable
> > under VFIO Feature and move GET_ST and SET_ST to a separate ioctl?
> 
> Splitting TPH between a feature and new ioctls doesn't seem like a logical
> interface.
> 
> > There isn't much rational to providing an array of tuples other than
> > following the format VFIO_DEVICE_SET_IRQS that sets eventfds.
> >
> > > > +
> > > > +#define	VFIO_TPH_ST_MODE_MASK	(0x3 <<
> > > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> > > > +#define	VFIO_TPH_ST_NS_MODE	(0 <<
> > > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> > > > +#define	VFIO_TPH_ST_IV_MODE	(1 <<
> > > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> > > > +#define	VFIO_TPH_ST_DS_MODE	(2 <<
> > > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> > > > +	__u32 count;				/* Number of entries in info[]
> > > */
> > > > +	struct vfio_pci_tph_info info[];
> > > > +#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed
> > > in info[] */
> > >
> > > This seems to match the limit if the table is located in extended
> > > config space, but it's not particularly relevant if the table is
> > > located in MSI-X space.  Why is this a good limit?
> >
> > Yes, for MSI-X the caller may have to invoke the ioctl multiple times
> > if more than 64 entries need to be updated.
> >
> > >
> > > Also, try to keep these all in 80 column.  Thanks,
> > >
> >
> > I will, try to keep these under 80 columns.
> >
> > Thanks Alex, for your input. Let me know what you think about moving
> > SET_ST and GET_ST to an ioctl and keep ENABLE/DISABLE TPH as a vfio
> feature.
> 
> It seems like kind of a hodgepodge to split the interface like that, imo.  Sorry for a
> less that complete reply, I'm on PTO this week.
> Thanks,

Understood. Now, the question is whether this should be an ioctl or two features, 
and it all depends on how TPH works with migration.
Thanks and have a great week.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-02-21 22:46 [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl Wathsala Vithanage
  2025-02-28 18:48 ` Alex Williamson
  2025-03-03  7:49 ` Philipp Stanner
@ 2025-03-04 14:14 ` Jason Gunthorpe
  2025-03-04 22:38   ` Wathsala Wathawana Vithanage
  2025-06-02 23:06 ` [RFC PATCH v2 0/1] VFIO ioctl for TLP Processing Hints Wathsala Vithanage
  3 siblings, 1 reply; 24+ messages in thread
From: Jason Gunthorpe @ 2025-03-04 14:14 UTC (permalink / raw)
  To: Wathsala Vithanage
  Cc: linux-kernel, nd, Alex Williamson, Kevin Tian, Philipp Stanner,
	Yunxiang Li, Dr. David Alan Gilbert, Ankit Agrawal,
	open list:VFIO DRIVER

On Fri, Feb 21, 2025 at 10:46:33PM +0000, Wathsala Vithanage wrote:
> Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> direct cache injection. As described in the relevant patch set [1],
> direct cache injection in supported hardware allows optimal platform
> resource utilization for specific requests on the PCIe bus. This feature
> is currently available only for kernel device drivers. However,
> user space applications, especially those whose performance is sensitive
> to the latency of inbound writes as seen by a CPU core, may benefit from
> using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> application running in a VM).
> 
> This patch enables configuring of TPH from the user space via
> VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> set steering tags in MSI-X or steering-tag table entries using
> VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
> VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.

What level of protection do we expect to have here? Is it OK for
userspace to make up any old tag value or is there some security
concern with that?

Jason

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-04 14:14 ` Jason Gunthorpe
@ 2025-03-04 22:38   ` Wathsala Wathawana Vithanage
  2025-03-05  1:24     ` Alex Williamson
  0 siblings, 1 reply; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-03-04 22:38 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: linux-kernel@vger.kernel.org, nd, Alex Williamson, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER

> > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> > direct cache injection. As described in the relevant patch set [1],
> > direct cache injection in supported hardware allows optimal platform
> > resource utilization for specific requests on the PCIe bus. This feature
> > is currently available only for kernel device drivers. However,
> > user space applications, especially those whose performance is sensitive
> > to the latency of inbound writes as seen by a CPU core, may benefit from
> > using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> > application running in a VM).
> >
> > This patch enables configuring of TPH from the user space via
> > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> > drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> > set steering tags in MSI-X or steering-tag table entries using
> > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
> > VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
> 
> What level of protection do we expect to have here? Is it OK for
> userspace to make up any old tag value or is there some security
> concern with that?
> 
It shouldn't be allowed from within a container.
A hypervisor should have its own STs, map them to platform STs for
the cores the VM is pinned to, and verify that arbitrary STs are not written to the
device's MSI-X table, ST table, or device-specific locations.

Thanks

--wathsala

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-04 22:38   ` Wathsala Wathawana Vithanage
@ 2025-03-05  1:24     ` Alex Williamson
  2025-03-05  6:11       ` Wathsala Wathawana Vithanage
  0 siblings, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2025-03-05  1:24 UTC (permalink / raw)
  To: Wathsala Wathawana Vithanage
  Cc: Jason Gunthorpe, linux-kernel@vger.kernel.org, nd, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER

On Tue, 4 Mar 2025 22:38:16 +0000
Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com> wrote:

> > > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> > > direct cache injection. As described in the relevant patch set [1],
> > > direct cache injection in supported hardware allows optimal platform
> > > resource utilization for specific requests on the PCIe bus. This feature
> > > is currently available only for kernel device drivers. However,
> > > user space applications, especially those whose performance is sensitive
> > > to the latency of inbound writes as seen by a CPU core, may benefit from
> > > using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> > > application running in a VM).
> > >
> > > This patch enables configuring of TPH from the user space via
> > > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> > > drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> > > set steering tags in MSI-X or steering-tag table entries using
> > > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
> > > VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.  
> > 
> > What level of protection do we expect to have here? Is it OK for
> > userspace to make up any old tag value or is there some security
> > concern with that?
> >   
> Shouldn't be allowed from within a container.
> A hypervisor should have its own STs and map them to platform STs for
> the cores the VM is pinned to and verify any old ST is not written to the
> device MSI-X, ST table or device specific locations.

And how exactly are we mediating device specific steering tags when we
don't know where/how they're written to the device.  An API that
returns a valid ST to userspace doesn't provide any guarantees relative
to what userspace later writes.  MSI-X tables are also writable by
userspace.  I could have missed it, but I also didn't note any pinning
requirement in this proposal.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-05  1:24     ` Alex Williamson
@ 2025-03-05  6:11       ` Wathsala Wathawana Vithanage
  2025-03-05 18:58         ` Jason Gunthorpe
                           ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-03-05  6:11 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jason Gunthorpe, linux-kernel@vger.kernel.org, nd, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Honnappa Nagarahalli, Jeremy Linton



> -----Original Message-----
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Tuesday, March 4, 2025 7:24 PM
> To: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>
> Cc: Jason Gunthorpe <jgg@ziepe.ca>; linux-kernel@vger.kernel.org; nd
> <nd@arm.com>; Kevin Tian <kevin.tian@intel.com>; Philipp Stanner
> <pstanner@redhat.com>; Yunxiang Li <Yunxiang.Li@amd.com>; Dr. David Alan
> Gilbert <linux@treblig.org>; Ankit Agrawal <ankita@nvidia.com>; open list:VFIO
> DRIVER <kvm@vger.kernel.org>
> Subject: Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
> 
> On Tue, 4 Mar 2025 22:38:16 +0000
> Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com> wrote:
> 
> > > > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> > > > direct cache injection. As described in the relevant patch set [1],
> > > > direct cache injection in supported hardware allows optimal platform
> > > > resource utilization for specific requests on the PCIe bus. This feature
> > > > is currently available only for kernel device drivers. However,
> > > > user space applications, especially those whose performance is sensitive
> > > > to the latency of inbound writes as seen by a CPU core, may benefit from
> > > > using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> > > > application running in a VM).
> > > >
> > > > This patch enables configuring of TPH from the user space via
> > > > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> > > > drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> > > > set steering tags in MSI-X or steering-tag table entries using
> > > > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
> > > > VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
> > >
> > > What level of protection do we expect to have here? Is it OK for
> > > userspace to make up any old tag value or is there some security
> > > concern with that?
> > >
> > Shouldn't be allowed from within a container.
> > A hypervisor should have its own STs and map them to platform STs for
> > the cores the VM is pinned to and verify any old ST is not written to the
> > device MSI-X, ST table or device specific locations.
> 
> And how exactly are we mediating device specific steering tags when we
> don't know where/how they're written to the device.  An API that
> returns a valid ST to userspace doesn't provide any guarantees relative
> to what userspace later writes.  MSI-X tables are also writable by

By not enabling TPH in device-specific mode, hypervisors can ensure that
setting an ST in a device-specific location (like queue contexts) will have no
effect. VMs should also not be allowed to enable TPH. I believe this could
be enforced by trapping (causing VM exits) on MSI-X/ST table writes. 

Having said that, regardless of this proposal or the availability of kernel
TPH support, a VFIO driver could enable TPH and set an arbitrary ST on the
MSI-X/ST table or a device-specific location on supported platforms. If the
driver doesn't have a list of valid STs, it can enumerate 8- or 16-bit STs and
measure access latencies to determine valid ones.

> userspace.  I could have missed it, but I also didn't note any pinning
> requirement in this proposal.  Thanks,
> 

Sorry, I failed to mention pinning earlier. Let's say we don't pin VMs to
CPUs. Now, say VM_A sets an ST on a NIC to get packet data to the L2D 
of the CPU_N to which its vCPU_0 is currently bound. Then, after a while,
say, VM_B gets scheduled to CPU_N. CPU_N, regardless of what 
process/thread is scheduled, will continuously receive data from VM A's
NIC for its L2D. Consequently, the performance of VMs scheduled on
CPU_N other than VM_A would degrade due to capacity misses and
invalidations. This is where the pinning requirement comes from.

--wathsala



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-05  6:11       ` Wathsala Wathawana Vithanage
@ 2025-03-05 18:58         ` Jason Gunthorpe
  2025-03-05 20:21           ` Wathsala Wathawana Vithanage
  2025-03-12  7:53         ` Tian, Kevin
  2025-04-26 12:42         ` Wathsala Wathawana Vithanage
  2 siblings, 1 reply; 24+ messages in thread
From: Jason Gunthorpe @ 2025-03-05 18:58 UTC (permalink / raw)
  To: Wathsala Wathawana Vithanage
  Cc: Alex Williamson, linux-kernel@vger.kernel.org, nd, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Honnappa Nagarahalli, Jeremy Linton

On Wed, Mar 05, 2025 at 06:11:22AM +0000, Wathsala Wathawana Vithanage wrote:

> By not enabling TPH in device-specific mode, hypervisors can ensure that
> setting an ST in a device-specific location (like queue contexts) will have no
> effect. VMs should also not be allowed to enable TPH. 

So many workloads run inside VMs now for security reasons that is not
a reasonable approach.

> I believe this could
> be enforced by trapping (causing VM exits) on MSI-X/ST table writes. 

Yes, I think this was always part of the plan for virtualization when
using a MSI-X table.

> Having said that, regardless of this proposal or the availability of kernel
> TPH support, a VFIO driver could enable TPH and set an arbitrary ST on the
> MSI-X/ST table or a device-specific location on supported platforms. If the
> driver doesn't have a list of valid STs, it can enumerate 8- or 16-bit STs and
> measure access latencies to determine valid ones.

And you think it is absolutely true that no TPH value can cause a
platform malfunction or security failure?

Jason

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-05 18:58         ` Jason Gunthorpe
@ 2025-03-05 20:21           ` Wathsala Wathawana Vithanage
  0 siblings, 0 replies; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-03-05 20:21 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Alex Williamson, linux-kernel@vger.kernel.org, nd, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Honnappa Nagarahalli, Jeremy Linton



> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Wednesday, March 5, 2025 12:58 PM
> To: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>; linux-
> kernel@vger.kernel.org; nd <nd@arm.com>; Kevin Tian <kevin.tian@intel.com>;
> Philipp Stanner <pstanner@redhat.com>; Yunxiang Li <Yunxiang.Li@amd.com>;
> Dr. David Alan Gilbert <linux@treblig.org>; Ankit Agrawal <ankita@nvidia.com>;
> open list:VFIO DRIVER <kvm@vger.kernel.org>; Dhruv Tripathi
> <Dhruv.Tripathi@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Jeremy Linton <Jeremy.Linton@arm.com>
> Subject: Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
> 
> On Wed, Mar 05, 2025 at 06:11:22AM +0000, Wathsala Wathawana Vithanage
> wrote:
> 
> > By not enabling TPH in device-specific mode, hypervisors can ensure
> > that setting an ST in a device-specific location (like queue contexts)
> > will have no effect. VMs should also not be allowed to enable TPH.
> 
> So many workloads run inside VMs now for security reasons that is not a
> reasonable approach.
> 
> > I believe this could
> > be enforced by trapping (causing VM exits) on MSI-X/ST table writes.
> 
> Yes, I think this was always part of the plan for virtualization when using a MSI-X
> table.
> 
> > Having said that, regardless of this proposal or the availability of
> > kernel TPH support, a VFIO driver could enable TPH and set an
> > arbitrary ST on the MSI-X/ST table or a device-specific location on
> > supported platforms. If the driver doesn't have a list of valid STs,
> > it can enumerate 8- or 16-bit STs and measure access latencies to determine
> valid ones.
> 
> And you think it is absolutely true that no TPH value can cause a platform
> malfunction or security failure?
> 

I think such hardware bugs are inevitable :).
Would disabling TPH in the kernel and preventing user-space config writes
that enable TPH (by trapping them in the kernel) work as a solution?
That requires trapping config writes.

--wathsala





^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-05  6:11       ` Wathsala Wathawana Vithanage
  2025-03-05 18:58         ` Jason Gunthorpe
@ 2025-03-12  7:53         ` Tian, Kevin
  2025-03-14  1:49           ` Wathsala Wathawana Vithanage
  2025-04-01 14:11           ` Jason Gunthorpe
  2025-04-26 12:42         ` Wathsala Wathawana Vithanage
  2 siblings, 2 replies; 24+ messages in thread
From: Tian, Kevin @ 2025-03-12  7:53 UTC (permalink / raw)
  To: Wathsala Wathawana Vithanage, Alex Williamson
  Cc: Jason Gunthorpe, linux-kernel@vger.kernel.org, nd,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Nagarahalli, Honnappa, Jeremy Linton

> From: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>
> Sent: Wednesday, March 5, 2025 2:11 PM
>
> > -----Original Message-----
> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Tuesday, March 4, 2025 7:24 PM
> > To: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>
> > Cc: Jason Gunthorpe <jgg@ziepe.ca>; linux-kernel@vger.kernel.org; nd
> > <nd@arm.com>; Kevin Tian <kevin.tian@intel.com>; Philipp Stanner
> > <pstanner@redhat.com>; Yunxiang Li <Yunxiang.Li@amd.com>; Dr. David
> Alan
> > Gilbert <linux@treblig.org>; Ankit Agrawal <ankita@nvidia.com>; open
> list:VFIO
> > DRIVER <kvm@vger.kernel.org>
> > Subject: Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
> >
> > On Tue, 4 Mar 2025 22:38:16 +0000
> > Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com> wrote:
> >
> > > > > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> > > > > direct cache injection. As described in the relevant patch set [1],
> > > > > direct cache injection in supported hardware allows optimal platform
> > > > > resource utilization for specific requests on the PCIe bus. This feature
> > > > > is currently available only for kernel device drivers. However,
> > > > > user space applications, especially those whose performance is
> sensitive
> > > > > to the latency of inbound writes as seen by a CPU core, may benefit
> from
> > > > > using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> > > > > application running in a VM).
> > > > >
> > > > > This patch enables configuring of TPH from the user space via
> > > > > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> > > > > drivers and VMMs to enable/disable the TPH feature on PCIe devices
> and
> > > > > set steering tags in MSI-X or steering-tag table entries using
> > > > > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel
> using
> > > > > VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
> > > >
> > > > What level of protection do we expect to have here? Is it OK for
> > > > userspace to make up any old tag value or is there some security
> > > > concern with that?
> > > >
> > > Shouldn't be allowed from within a container.
> > > A hypervisor should have its own STs and map them to platform STs for
> > > the cores the VM is pinned to and verify any old ST is not written to the
> > > device MSI-X, ST table or device specific locations.
> >
> > And how exactly are we mediating device specific steering tags when we
> > don't know where/how they're written to the device.  An API that
> > returns a valid ST to userspace doesn't provide any guarantees relative
> > to what userspace later writes.  MSI-X tables are also writable by
> 
> By not enabling TPH in device-specific mode, hypervisors can ensure that
> setting an ST in a device-specific location (like queue contexts) will have no
> effect. VMs should also not be allowed to enable TPH. I believe this could
> be enforced by trapping (causing VM exits) on MSI-X/ST table writes.

Probably we should not allow device-specific mode unless the user is
capable of CAP_SYS_RAWIO? It allows a user to pollute caches on
CPUs to which its processes are not affined, hence could easily break
the SLAs which CSPs try to achieve...

Interrupt vector mode sounds safer as it only needs to provide an
enable/disable cmd to the user and it's the kernel VFIO driver
managing the steering table, e.g. also in irq affinity handler.
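
For illustration, a minimal sketch of such a policy check might look like
the following (a hypothetical helper next to the ioctl handler in
vfio_pci_core.c, assuming the PCI_TPH_ST_*_MODE values that the patch
documentation refers to; this is not part of the posted patch):

static int vfio_pci_tph_check_mode(u8 st_mode)
{
	/* Device-specific steering is only for privileged users */
	if (st_mode == PCI_TPH_ST_DS_MODE && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	/* Reject any mode the uAPI does not define */
	if (st_mode != PCI_TPH_ST_NS_MODE &&
	    st_mode != PCI_TPH_ST_IV_MODE &&
	    st_mode != PCI_TPH_ST_DS_MODE)
		return -EINVAL;

	return 0;
}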

> 
> Having said that, regardless of this proposal or the availability of kernel
> TPH support, a VFIO driver could enable TPH and set an arbitrary ST on the
> MSI-X/ST table or a device-specific location on supported platforms. If the
> driver doesn't have a list of valid STs, it can enumerate 8- or 16-bit STs and
> measure access latencies to determine valid ones.
> 

PCI capabilities are managed by the kernel VFIO driver. So w/o this
patch no userspace driver can enable TPH to try that trick?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-12  7:53         ` Tian, Kevin
@ 2025-03-14  1:49           ` Wathsala Wathawana Vithanage
  2025-04-07  5:16             ` Tian, Kevin
  2025-04-01 14:11           ` Jason Gunthorpe
  1 sibling, 1 reply; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-03-14  1:49 UTC (permalink / raw)
  To: Tian, Kevin, Alex Williamson
  Cc: Jason Gunthorpe, linux-kernel@vger.kernel.org, nd,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Honnappa Nagarahalli, Jeremy Linton



> -----Original Message-----
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Wednesday, March 12, 2025 2:53 AM
> To: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>; Alex
> Williamson <alex.williamson@redhat.com>
> Cc: Jason Gunthorpe <jgg@ziepe.ca>; linux-kernel@vger.kernel.org; nd
> <nd@arm.com>; Philipp Stanner <pstanner@redhat.com>; Yunxiang Li
> <Yunxiang.Li@amd.com>; Dr. David Alan Gilbert <linux@treblig.org>; Ankit
> Agrawal <ankita@nvidia.com>; open list:VFIO DRIVER <kvm@vger.kernel.org>;
> Dhruv Tripathi <Dhruv.Tripathi@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Jeremy Linton <Jeremy.Linton@arm.com>
> Subject: RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
> 
> > From: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>
> > Sent: Wednesday, March 5, 2025 2:11 PM
> >
> > > -----Original Message-----
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Tuesday, March 4, 2025 7:24 PM
> > > To: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>
> > > Cc: Jason Gunthorpe <jgg@ziepe.ca>; linux-kernel@vger.kernel.org; nd
> > > <nd@arm.com>; Kevin Tian <kevin.tian@intel.com>; Philipp Stanner
> > > <pstanner@redhat.com>; Yunxiang Li <Yunxiang.Li@amd.com>; Dr. David
> > Alan
> > > Gilbert <linux@treblig.org>; Ankit Agrawal <ankita@nvidia.com>; open
> > list:VFIO
> > > DRIVER <kvm@vger.kernel.org>
> > > Subject: Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
> > >
> > > On Tue, 4 Mar 2025 22:38:16 +0000
> > > Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com> wrote:
> > >
> > > > > > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> > > > > > direct cache injection. As described in the relevant patch set [1],
> > > > > > direct cache injection in supported hardware allows optimal platform
> > > > > > resource utilization for specific requests on the PCIe bus. This feature
> > > > > > is currently available only for kernel device drivers. However,
> > > > > > user space applications, especially those whose performance is
> > sensitive
> > > > > > to the latency of inbound writes as seen by a CPU core, may benefit
> > from
> > > > > > using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> > > > > > application running in a VM).
> > > > > >
> > > > > > This patch enables configuring of TPH from the user space via
> > > > > > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> > > > > > drivers and VMMs to enable/disable the TPH feature on PCIe devices
> > and
> > > > > > set steering tags in MSI-X or steering-tag table entries using
> > > > > > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel
> > using
> > > > > > VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
> > > > >
> > > > > What level of protection do we expect to have here? Is it OK for
> > > > > userspace to make up any old tag value or is there some security
> > > > > concern with that?
> > > > >
> > > > Shouldn't be allowed from within a container.
> > > > A hypervisor should have its own STs and map them to platform STs for
> > > > the cores the VM is pinned to and verify any old ST is not written to the
> > > > device MSI-X, ST table or device specific locations.
> > >
> > > And how exactly are we mediating device specific steering tags when we
> > > don't know where/how they're written to the device.  An API that
> > > returns a valid ST to userspace doesn't provide any guarantees relative
> > > to what userspace later writes.  MSI-X tables are also writable by
> >
> > By not enabling TPH in device-specific mode, hypervisors can ensure that
> > setting an ST in a device-specific location (like queue contexts) will have no
> > effect. VMs should also not be allowed to enable TPH. I believe this could
> > be enforced by trapping (causing VM exits) on MSI-X/ST table writes.
> 
> Probably we should not allow device-specific mode unless the user is
> capable of CAP_SYS_RAWIO? It allows an user to pollute caches on

Sounds plausible. 

> CPUs which its processes are not affined to, hence could easily break
> SLAs which CSPs try to achieve...
>
> Interrupt vector mode sounds safer as it only needs to provide an
> enable/disable cmd to the user and it's the kernel VFIO driver
> managing the steering table, e.g. also in irq affinity handler.
> 
> >
> > Having said that, regardless of this proposal or the availability of kernel
> > TPH support, a VFIO driver could enable TPH and set an arbitrary ST on the
> > MSI-X/ST table or a device-specific location on supported platforms. If the
> > driver doesn't have a list of valid STs, it can enumerate 8- or 16-bit STs and
> > measure access latencies to determine valid ones.
> >
> 
> PCI capabilities are managed by the kernel VFIO driver. So w/o this
> patch no userspace driver can enable TPH to try that trick?

Yes, it's possible. It's just a matter of setting the right bits in the PCI config
space to enable TPH on the device.

Thanks 
--wathsala


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-12  7:53         ` Tian, Kevin
  2025-03-14  1:49           ` Wathsala Wathawana Vithanage
@ 2025-04-01 14:11           ` Jason Gunthorpe
  2025-04-01 23:38             ` Wathsala Wathawana Vithanage
  2025-04-07  5:22             ` Tian, Kevin
  1 sibling, 2 replies; 24+ messages in thread
From: Jason Gunthorpe @ 2025-04-01 14:11 UTC (permalink / raw)
  To: Tian, Kevin
  Cc: Wathsala Wathawana Vithanage, Alex Williamson,
	linux-kernel@vger.kernel.org, nd, Philipp Stanner, Yunxiang Li,
	Dr. David Alan Gilbert, Ankit Agrawal, open list:VFIO DRIVER,
	Dhruv Tripathi, Nagarahalli, Honnappa, Jeremy Linton

On Wed, Mar 12, 2025 at 07:53:17AM +0000, Tian, Kevin wrote:

> Probably we should not allow device-specific mode unless the user is
> capable of CAP_SYS_RAWIO? It allows an user to pollute caches on
> CPUs which its processes are not affined to, hence could easily break
> SLAs which CSPs try to achieve...

I'm not sure this is within the threat model for VFIO though..

qemu or the operator needs to deal with this by not permitting such
HW to go into a VM.

Really we can't block device specific mode anyhow because we can't
even discover it on the kernel side..

Jason

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-04-01 14:11           ` Jason Gunthorpe
@ 2025-04-01 23:38             ` Wathsala Wathawana Vithanage
  2025-04-01 23:52               ` Jason Gunthorpe
  2025-04-07  5:22             ` Tian, Kevin
  1 sibling, 1 reply; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-04-01 23:38 UTC (permalink / raw)
  To: Jason Gunthorpe, Tian, Kevin
  Cc: Alex Williamson, linux-kernel@vger.kernel.org, nd,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Honnappa Nagarahalli, Jeremy Linton



> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Tuesday, April 1, 2025 9:12 AM
> To: Tian, Kevin <kevin.tian@intel.com>
> Cc: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>; Alex
> Williamson <alex.williamson@redhat.com>; linux-kernel@vger.kernel.org; nd
> <nd@arm.com>; Philipp Stanner <pstanner@redhat.com>; Yunxiang Li
> <Yunxiang.Li@amd.com>; Dr. David Alan Gilbert <linux@treblig.org>; Ankit
> Agrawal <ankita@nvidia.com>; open list:VFIO DRIVER <kvm@vger.kernel.org>;
> Dhruv Tripathi <Dhruv.Tripathi@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Jeremy Linton <Jeremy.Linton@arm.com>
> Subject: Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
> 
> On Wed, Mar 12, 2025 at 07:53:17AM +0000, Tian, Kevin wrote:
> 
> > Probably we should not allow device-specific mode unless the user is
> > capable of CAP_SYS_RAWIO? It allows an user to pollute caches on CPUs
> > which its processes are not affined to, hence could easily break SLAs
> > which CSPs try to achieve...
> 
> I'm not sure this is within the threat model for VFIO though..
> 
> qemu or the operator needs to deal with this by not permiting such HW to go into
> a VM.
> 
> Really we can't block device specific mode anyhow because we can't even
> discover it on the kernel side..
> 

We cannot block users from writing a steering-tag to a device specific location, but
can we use a capability to prevent users from enabling device specific mode on the device?


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-04-01 23:38             ` Wathsala Wathawana Vithanage
@ 2025-04-01 23:52               ` Jason Gunthorpe
  0 siblings, 0 replies; 24+ messages in thread
From: Jason Gunthorpe @ 2025-04-01 23:52 UTC (permalink / raw)
  To: Wathsala Wathawana Vithanage
  Cc: Tian, Kevin, Alex Williamson, linux-kernel@vger.kernel.org, nd,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Honnappa Nagarahalli, Jeremy Linton

On Tue, Apr 01, 2025 at 11:38:01PM +0000, Wathsala Wathawana Vithanage wrote:

> > Really we can't block device specific mode anyhow because we can't even
> > discover it on the kernel side..
> 
> We cannot block users from writing a steering-tag to a device specific location, but
> can we use a capability to prevent users from enabling device specific mode on the device?

qemu could do that, but the vfio kernel side really shouldn't.. There
is no reason vfio userspace on bare metal couldn't use device specific
mode.

Jason 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-14  1:49           ` Wathsala Wathawana Vithanage
@ 2025-04-07  5:16             ` Tian, Kevin
  0 siblings, 0 replies; 24+ messages in thread
From: Tian, Kevin @ 2025-04-07  5:16 UTC (permalink / raw)
  To: Wathsala Wathawana Vithanage, Alex Williamson
  Cc: Jason Gunthorpe, linux-kernel@vger.kernel.org, nd,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Nagarahalli, Honnappa, Jeremy Linton

> From: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>
> Sent: Friday, March 14, 2025 9:49 AM
> > >
> > > Having said that, regardless of this proposal or the availability of kernel
> > > TPH support, a VFIO driver could enable TPH and set an arbitrary ST on
> the
> > > MSI-X/ST table or a device-specific location on supported platforms. If the
> > > driver doesn't have a list of valid STs, it can enumerate 8- or 16-bit STs
> and
> > > measure access latencies to determine valid ones.
> > >
> >
> > PCI capabilities are managed by the kernel VFIO driver. So w/o this
> > patch no userspace driver can enable TPH to try that trick?
> 
> Yes, it's possible. It's just a matter of setting the right bits in the PCI config
> space to enable TPH on the device.
> 

No. Before this patch the TPH capability is read-only to userspace,
enforced by vfio-pci. So there is no way for a user to toggle
the TPH cap on their own!

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-04-01 14:11           ` Jason Gunthorpe
  2025-04-01 23:38             ` Wathsala Wathawana Vithanage
@ 2025-04-07  5:22             ` Tian, Kevin
  1 sibling, 0 replies; 24+ messages in thread
From: Tian, Kevin @ 2025-04-07  5:22 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Wathsala Wathawana Vithanage, Alex Williamson,
	linux-kernel@vger.kernel.org, nd, Philipp Stanner, Yunxiang Li,
	Dr. David Alan Gilbert, Ankit Agrawal, open list:VFIO DRIVER,
	Dhruv Tripathi, Nagarahalli, Honnappa, Jeremy Linton

> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Tuesday, April 1, 2025 10:12 PM
> 
> On Wed, Mar 12, 2025 at 07:53:17AM +0000, Tian, Kevin wrote:
> 
> > Probably we should not allow device-specific mode unless the user is
> > capable of CAP_SYS_RAWIO? It allows an user to pollute caches on
> > CPUs which its processes are not affined to, hence could easily break
> > SLAs which CSPs try to achieve...
> 
> I'm not sure this is within the threat model for VFIO though..
> 
> qemu or the operator needs to deal with this by not permiting such
> HW to go into a VM.

It could be used by a native app, e.g. DPDK.

> 
> Really we can't block device specific mode anyhow because we can't
> even discover it on the kernel side..
> 

Hmm, the TPH capability reports which steering modes (no ST,
interrupt vector, or device specific) are supported by a device, and the
mode must be selected explicitly when software enables the capability.

So policy-wise vfio could advocate/enforce that only the interrupt
vector mode is supported for non-privileged users.
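
A rough sketch of how vfio-pci could discover the advertised modes before
applying such a policy (the helper itself is illustrative, not from the
patch; the bit positions follow the PCIe TPH Requester Capability register):

static u32 vfio_pci_tph_supported_modes(struct pci_dev *pdev)
{
	u32 cap = 0;
	int pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_TPH);

	if (pos)
		pci_read_config_dword(pdev, pos + PCI_TPH_CAP, &cap);

	/* bit 0: no ST, bit 1: interrupt vector, bit 2: device specific */
	return cap & 0x7;
}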

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-05  6:11       ` Wathsala Wathawana Vithanage
  2025-03-05 18:58         ` Jason Gunthorpe
  2025-03-12  7:53         ` Tian, Kevin
@ 2025-04-26 12:42         ` Wathsala Wathawana Vithanage
  2 siblings, 0 replies; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-04-26 12:42 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jason Gunthorpe, linux-kernel@vger.kernel.org, nd, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Dhruv Tripathi,
	Honnappa Nagarahalli, Jeremy Linton

> Having said that, regardless of this proposal or the availability of kernel
> TPH support, a VFIO driver could enable TPH and set an arbitrary ST on the
> MSI-X/ST table or a device-specific location on supported platforms. If the
> driver doesn't have a list of valid STs, it can enumerate 8- or 16-bit STs and
> measure access latencies to determine valid ones.
> 
I tested enabling TPH inside a VM with setpci; it turned out that the write
never reaches the TPH control register, thankfully, because
vfio_pci_init_perm_bits() in the host kernel does not grant config-write
permissions for PCI_EXT_CAP_ID_TPH.

QEMU traps config writes and routes them via vfio_pci_write_config().
So, like it already does for MSI/MSI-X enablement, TPH enablement could be
handled by a special handler that does the following.
1. Error out on device-specific mode enablement.
2. Check ST-table writes for STs that do not belong to CPUs the VM is bound to.

Then, for MSI-X mode, a quirk could trap and check STs for case #2 above.
To make #1 consistent with the TPH capability register, some emulated bits
can be added to mask out the availability of device-specific mode.
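
A minimal illustration of those two checks, written as standalone helpers
rather than against the actual QEMU code paths (the policy struct and the
allowed-ST list are assumptions made for the example):

#include <stdbool.h>
#include <stdint.h>

struct vm_tph_policy {
	const uint16_t *allowed_sts;	/* STs for the CPUs the VM is pinned to */
	unsigned int nr_allowed;
};

/* #1: refuse to enable device-specific mode on behalf of the guest.
 * ST Mode Select 010b is device-specific per the PCIe spec. */
static bool tph_mode_allowed(uint8_t st_mode_sel)
{
	return st_mode_sel != 0x2;
}

/* #2: only allow STs that map to CPUs the VM is pinned to */
static bool tph_st_allowed(const struct vm_tph_policy *p, uint16_t st)
{
	for (unsigned int i = 0; i < p->nr_allowed; i++) {
		if (p->allowed_sts[i] == st)
			return true;
	}
	return false;
}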

--wathsala





^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
  2025-03-04  3:12     ` Alex Williamson
  2025-03-04  7:01       ` Wathsala Wathawana Vithanage
@ 2025-04-26 12:49       ` Wathsala Wathawana Vithanage
  1 sibling, 0 replies; 24+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-04-26 12:49 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-kernel@vger.kernel.org, nd, Jason Gunthorpe, Kevin Tian,
	Philipp Stanner, Yunxiang Li, Dr. David Alan Gilbert,
	Ankit Agrawal, open list:VFIO DRIVER, Jeremy Linton,
	Honnappa Nagarahalli, Dhruv Tripathi

> > For cases where steering-tag is not required in user-space (VMMs),
> > caller asks the kernel to set steering-tag that corresponds the
> > cpu-id, cache-level, and PH combination at a given MSI-X vector entry
> > or ST table in config space. For such cases too, the kernel will read
> > the steering-tag from the _DSM and set the tag in the corresponding MSI-X or
> ST table entry.
> 
> We need to consider that there are vfio-pci variant drivers that support migration
> and make use of the vfio-pci-core feature interface.
> TPH programming appears to be very non-migration friendly, the attributes are
> very specific to the current host system.  Should migration and TPH be mutually
> exclusive?  This may be a factor in the decision to use a feature or ioctl.
> 

TPH isn't migration friendly at least for the time being. Therefore, this should be
implemented as an ioctl.

--wathsala


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH v2 0/1] VFIO ioctl for TLP Processing Hints
  2025-02-21 22:46 [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl Wathsala Vithanage
                   ` (2 preceding siblings ...)
  2025-03-04 14:14 ` Jason Gunthorpe
@ 2025-06-02 23:06 ` Wathsala Vithanage
  2025-06-02 23:06   ` [RFC PATCH v2 1/1] vfio/pci: add PCIe TPH device ioctl Wathsala Vithanage
  3 siblings, 1 reply; 24+ messages in thread
From: Wathsala Vithanage @ 2025-06-02 23:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: alex.williamson, gg, kevin.tian, bhelgaas, pstanner, linux,
	schnelle, Yunxiang.Li, kvm, Wathsala Vithanage

Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
direct cache injection. As described in the relevant patch set [1],
direct cache injection in supported hardware allows optimal platform
resource utilization for specific requests on the PCIe bus. This feature
is currently available only for kernel device drivers. However, user
space applications, especially those whose performance is sensitive to
the latency of inbound writes as seen by a CPU core, may benefit from
using this information (E.g., DPDK cache stashing RFC [2] or an HPC
application running in a VM). For such applications to enable and
configure TPH on PCIe devices, this RFC proposes a new VFIO ioctl.

[1] lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
[2] inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com

V1->V2:
 * Rewrite as a VFIO IOCTL
 * Add missing padding for structs
 * Increase maximum steering tags per IOCTL to 2048

Wathsala Vithanage (1):
  vfio/pci: add PCIe TPH device ioctl

 drivers/vfio/pci/vfio_pci_core.c | 153 +++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        |  83 +++++++++++++++++
 2 files changed, 236 insertions(+)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH v2 1/1] vfio/pci: add PCIe TPH device ioctl
  2025-06-02 23:06 ` [RFC PATCH v2 0/1] VFIO ioctl for TLP Processing Hints Wathsala Vithanage
@ 2025-06-02 23:06   ` Wathsala Vithanage
  0 siblings, 0 replies; 24+ messages in thread
From: Wathsala Vithanage @ 2025-06-02 23:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: alex.williamson, gg, kevin.tian, bhelgaas, pstanner, linux,
	schnelle, Yunxiang.Li, kvm, Wathsala Vithanage

This patch introduces the VFIO_DEVICE_PCI_TPH ioctl to enable configuring
TPH from user space. It provides an interface for user-space drivers and
VMMs to enable/disable the TPH capability on PCIe devices, set steering
tags in MSI-X or steering-tag table entries, or read steering tags from
the kernel to use them in device-specific mode.

Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 153 +++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        |  83 +++++++++++++++++
 2 files changed, 236 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 6328c3a05bcd..efdc2f09aa75 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -29,6 +29,7 @@
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
 #include <linux/iommufd.h>
+#include <linux/pci-tph.h>
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
@@ -1444,6 +1445,156 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
 				  ioeventfd.fd);
 }
 
+static struct vfio_pci_tph_entry *vfio_pci_tph_get_ents(struct vfio_pci_tph *tph,
+							void __user *tph_ents,
+							size_t *ents_size)
+{
+	unsigned long minsz;
+	size_t size;
+
+	if (!ents_size)
+		return ERR_PTR(-EINVAL);
+
+	minsz = offsetofend(struct vfio_pci_tph, count);
+
+	size = tph->count * sizeof(struct vfio_pci_tph_entry);
+
+	if (tph->argsz - minsz < size)
+		return ERR_PTR(-EINVAL);
+
+	*ents_size = size;
+
+	return memdup_user(tph_ents, size);
+}
+
+static int vfio_pci_tph_set_st(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_tph_entry *ents, int count)
+{
+	int i, err = 0;
+
+	for (i = 0; i < count && !err; i++)
+		err = pcie_tph_set_st_entry(vdev->pdev, ents[i].index,
+					    ents[i].st);
+
+	return err;
+}
+
+static int vfio_pci_tph_get_st(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_tph_entry *ents, int count)
+{
+	int i, mtype, err = 0;
+	u32 cpu_uid;
+
+	for (i = 0; i < count && !err; i++) {
+		if (ents[i].cpu_id >= nr_cpu_ids || !cpu_present(ents[i].cpu_id)) {
+			err = -EINVAL;
+			break;
+		}
+
+		cpu_uid = topology_core_id(ents[i].cpu_id);
+		mtype = (ents[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
+			VFIO_TPH_MEM_TYPE_SHIFT;
+
+		/*
+		 * ph_ignore is always set.
+		 * TPH implementation of the PCI subsystem forces processing
+		 * hint to bi-directional by setting PH bits to 0 when
+		 * acquiring Steering Tags from the platform firmware. It also
+		 * discards the ph_ignore bit returned by firmware.
+		 */
+		ents[i].ph_ignore = 1;
+
+		err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
+					  &ents[i].st);
+	}
+
+	return err;
+}
+
+static int vfio_pci_tph_st_op(struct vfio_pci_core_device *vdev,
+			      struct vfio_pci_tph *tph, void __user *tph_ents)
+{
+	int err = 0;
+	struct vfio_pci_tph_entry *ents;
+	size_t ents_size;
+
+	if (!tph->count || tph->count > VFIO_TPH_INFO_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	ents = vfio_pci_tph_get_ents(tph, tph_ents, &ents_size);
+	if (IS_ERR(ents)) {
+		err = PTR_ERR(ents);
+		goto out;
+	}
+
+	err = vfio_pci_tph_get_st(vdev, ents, tph->count);
+	if (err)
+		goto out_free_ents;
+
+	/*
+	 * Set Steering tags. TPH will be disabled on the device by the PCI
+	 * subsystem if there is an error.
+	 */
+	if (tph->flags & VFIO_DEVICE_TPH_SET_ST) {
+		err = vfio_pci_tph_set_st(vdev, ents, tph->count);
+		if (err)
+			goto out_free_ents;
+	}
+
+	if (copy_to_user(tph_ents, ents, ents_size))
+		err = -EFAULT;
+
+out_free_ents:
+	kfree(ents);
+out:
+	return err;
+}
+
+static int vfio_pci_tph_enable(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_tph *arg)
+{
+	return pcie_enable_tph(vdev->pdev, arg->flags & VFIO_TPH_ST_MODE_MASK);
+}
+
+static int vfio_pci_tph_disable(struct vfio_pci_core_device *vdev)
+{
+	pcie_disable_tph(vdev->pdev);
+	return 0;
+}
+
+static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
+			      void __user *uarg)
+{
+	u32 op;
+	struct vfio_pci_tph tph;
+	size_t minsz = offsetofend(struct vfio_pci_tph, count);
+
+	if (copy_from_user(&tph, uarg, minsz))
+		return -EFAULT;
+
+	if (tph.argsz < minsz)
+		return -EINVAL;
+
+	op = tph.flags & VFIO_DEVICE_TPH_OP_MASK;
+
+	switch (op) {
+	case VFIO_DEVICE_TPH_ENABLE:
+		return vfio_pci_tph_enable(vdev, &tph);
+
+	case VFIO_DEVICE_TPH_DISABLE:
+		return vfio_pci_tph_disable(vdev);
+
+	case VFIO_DEVICE_TPH_GET_ST:
+	case VFIO_DEVICE_TPH_SET_ST:
+		return vfio_pci_tph_st_op(vdev, &tph, (u8 *)(uarg) + minsz);
+
+	default:
+		return -EINVAL;
+	}
+}
+
 long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 			 unsigned long arg)
 {
@@ -1468,6 +1619,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 		return vfio_pci_ioctl_reset(vdev, uarg);
 	case VFIO_DEVICE_SET_IRQS:
 		return vfio_pci_ioctl_set_irqs(vdev, uarg);
+	case VFIO_DEVICE_PCI_TPH:
+		return vfio_pci_ioctl_tph(vdev, uarg);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5764f315137f..1a56abdf9deb 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -873,6 +873,88 @@ struct vfio_device_ioeventfd {
 
 #define VFIO_DEVICE_IOEVENTFD		_IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_DEVICE_PCI_TPH	- _IO(VFIO_TYPE, VFIO_BASE + 22)
+ *
+ * This command is used to control PCIe TLP Processing Hints (TPH)
+ * capability in a PCIe device.
+ * It supports following operations on a PCIe device with respect to TPH
+ * capability.
+ *
+ * - Enabling/disabling TPH capability in a PCIe device.
+ *
+ *   Setting VFIO_DEVICE_TPH_ENABLE flag enables TPH in no-steering-tag,
+ *   interrupt-vector, or device-specific mode defined in the PCIe specification
+ *   when feature flags TPH_ST_NS_MODE, TPH_ST_IV_MODE, and TPH_ST_DS_MODE are
+ *   set respectively. TPH_ST_xx_MODE macros are defined in
+ *   uapi/linux/pci_regs.h.
+ *
+ *   VFIO_DEVICE_TPH_DISABLE disables PCIe TPH on the device.
+ *
+ * - Writing STs to MSI-X or ST table in a PCIe device.
+ *
+ *   VFIO_DEVICE_TPH_SET_ST flag sets steering tags on a device at an index in
+ *   MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used and
+ *   returns the programmed steering tag values. The caller can set one or more
+ *   steering tags by passing an array of vfio_pci_tph_entry objects containing
+ *   cpu_id, cache_level, and MSI-X/ST-table index. The caller can also set the
+ *   intended memory type and the processing hint by setting VFIO_TPH_MEM_TYPE_x
+ *   and VFIO_TPH_HINT_x flags, respectively.
+ *
+ * - Reading Steering Tags (ST) from the host platform.
+ *
+ *   VFIO_DEVICE_TPH_GET_ST flag returns steering tags to the caller. The caller
+ *   can request one or more steering tags by passing an array of
+ *   vfio_pci_tph_entry objects. Steering Tag for each request is returned via
+ *   the st field in vfio_pci_tph_entry.
+ */
+struct vfio_pci_tph_entry {
+	/* in */
+	__u32 cpu_id;			/* CPU logical ID */
+	__u32 cache_level;		/* Cache level. L1D = 0, L2D = 2, ... */
+	__u8  flags;
+#define VFIO_TPH_MEM_TYPE_MASK		0x1
+#define VFIO_TPH_MEM_TYPE_SHIFT		0
+#define VFIO_TPH_MEM_TYPE_VMEM		0   /* Request volatile memory ST */
+#define VFIO_TPH_MEM_TYPE_PMEM		1   /* Request persistent memory ST */
+
+#define VFIO_TPH_HINT_MASK		0x3
+#define VFIO_TPH_HINT_SHIFT		1
+#define VFIO_TPH_HINT_BIDIR		0
+#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
+#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
+#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)
+	__u8  pad0;
+	__u16 index;			/* MSI-X/ST-table index to set ST */
+	/* out */
+	__u16 st;			/* Steering-Tag */
+	__u8  ph_ignore;		/* Platform ignored the Processing Hint */
+	__u8  pad1;
+};
+
+struct vfio_pci_tph {
+	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
+	__u32 flags;
+#define VFIO_TPH_ST_MODE_MASK		0x7
+
+#define VFIO_DEVICE_TPH_OP_SHIFT	3
+#define VFIO_DEVICE_TPH_OP_MASK		(0x7 << VFIO_DEVICE_TPH_OP_SHIFT)
+/* Enable TPH on device */
+#define VFIO_DEVICE_TPH_ENABLE		0
+/* Disable TPH on device */
+#define VFIO_DEVICE_TPH_DISABLE		(1 << VFIO_DEVICE_TPH_OP_SHIFT)
+/* Get steering-tags */
+#define VFIO_DEVICE_TPH_GET_ST		(2 << VFIO_DEVICE_TPH_OP_SHIFT)
+/* Set steering-tags */
+#define VFIO_DEVICE_TPH_SET_ST		(4 << VFIO_DEVICE_TPH_OP_SHIFT)
+	__u32 count;			/* Number of entries in ents[] */
+	struct vfio_pci_tph_entry ents[];
+#define VFIO_TPH_INFO_MAX	2048	/* Max entries in ents[] */
+};
+
+#define VFIO_DEVICE_PCI_TPH	_IO(VFIO_TYPE, VFIO_BASE + 22)
+
+
 /**
  * VFIO_DEVICE_FEATURE - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
  *			       struct vfio_device_feature)
@@ -1468,6 +1550,7 @@ struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.43.0
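
For reference, a hedged user-space sketch of how the new ioctl might be
driven. It assumes a kernel carrying this patch, a <linux/vfio.h> that
contains the structures above, an already-open VFIO device file descriptor,
and the PCI_TPH_ST_IV_MODE value from the pci_regs.h definitions the patch
documentation refers to; the helper name and parameters are purely
illustrative:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/pci_regs.h>
#include <linux/vfio.h>

/* Enable TPH in interrupt vector mode, then ask the kernel to resolve
 * and program a steering tag for one MSI-X entry targeting 'cpu'. */
static int tph_enable_and_tag(int device_fd, __u32 cpu, __u16 msix_index)
{
	struct vfio_pci_tph en = {
		.argsz = sizeof(en),
		.flags = PCI_TPH_ST_IV_MODE | VFIO_DEVICE_TPH_ENABLE,
	};
	size_t sz = sizeof(struct vfio_pci_tph) +
		    sizeof(struct vfio_pci_tph_entry);
	struct vfio_pci_tph *st;
	int ret;

	ret = ioctl(device_fd, VFIO_DEVICE_PCI_TPH, &en);
	if (ret)
		return ret;

	st = calloc(1, sz);
	if (!st)
		return -1;

	st->argsz = sz;
	st->flags = PCI_TPH_ST_IV_MODE | VFIO_DEVICE_TPH_SET_ST;
	st->count = 1;
	st->ents[0].cpu_id = cpu;	/* target CPU for cache injection */
	st->ents[0].cache_level = 2;	/* e.g. L2D, per the uapi comment */
	st->ents[0].index = msix_index;	/* MSI-X table entry to tag */

	ret = ioctl(device_fd, VFIO_DEVICE_PCI_TPH, st);
	free(st);
	return ret;
}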


^ permalink raw reply related	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2025-06-02 23:07 UTC | newest]

Thread overview: 24+ messages
2025-02-21 22:46 [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl Wathsala Vithanage
2025-02-28 18:48 ` Alex Williamson
2025-03-03  4:20   ` Wathsala Wathawana Vithanage
2025-03-04  3:12     ` Alex Williamson
2025-03-04  7:01       ` Wathsala Wathawana Vithanage
2025-04-26 12:49       ` Wathsala Wathawana Vithanage
2025-03-03  7:49 ` Philipp Stanner
2025-03-03 17:21   ` Wathsala Wathawana Vithanage
2025-03-04 14:14 ` Jason Gunthorpe
2025-03-04 22:38   ` Wathsala Wathawana Vithanage
2025-03-05  1:24     ` Alex Williamson
2025-03-05  6:11       ` Wathsala Wathawana Vithanage
2025-03-05 18:58         ` Jason Gunthorpe
2025-03-05 20:21           ` Wathsala Wathawana Vithanage
2025-03-12  7:53         ` Tian, Kevin
2025-03-14  1:49           ` Wathsala Wathawana Vithanage
2025-04-07  5:16             ` Tian, Kevin
2025-04-01 14:11           ` Jason Gunthorpe
2025-04-01 23:38             ` Wathsala Wathawana Vithanage
2025-04-01 23:52               ` Jason Gunthorpe
2025-04-07  5:22             ` Tian, Kevin
2025-04-26 12:42         ` Wathsala Wathawana Vithanage
2025-06-02 23:06 ` [RFC PATCH v2 0/1] VFIO ioctl for TLP Processing Hints Wathsala Vithanage
2025-06-02 23:06   ` [RFC PATCH v2 1/1] vfio/pci: add PCIe TPH device ioctl Wathsala Vithanage
