- * [PATCH 1/8] linux-headers: update against 5.11-rc4
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
@ 2021-01-19 20:44 ` Matthew Rosato
  2021-01-19 20:44 ` [PATCH 2/8] s390x/pci: Keep track of the PCI Function type Matthew Rosato
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-19 20:44 UTC (permalink / raw)
  To: cohuck, thuth
  Cc: pmorel, david, schnelle, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, mst, pbonzini
Placeholder commit to pull in changes from "vfio-pci/zdev: Pass the relaxed
alignment flag" and "vfio-pci/zdev: Introduce the PCISTB vfio region"
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.h        |   2 +-
 include/standard-headers/drm/drm_fourcc.h          | 175 ++++++++++++++++++++-
 include/standard-headers/linux/ethtool.h           |   2 +-
 include/standard-headers/linux/fuse.h              |  30 +++-
 include/standard-headers/linux/kernel.h            |   9 +-
 include/standard-headers/linux/pci_regs.h          |  16 ++
 include/standard-headers/linux/vhost_types.h       |   9 ++
 include/standard-headers/linux/virtio_gpu.h        |  82 ++++++++++
 include/standard-headers/linux/virtio_ids.h        |  44 +++---
                       |   3 -
                  |   6 +-
                 |   1 +
                 |   1 +
                 |   1 +
               |   1 +
               |   1 +
                  |   1 +
                  |   1 +
                         |   1 +
                   |   1 +
                   |   1 +
                  |   1 +
                           |  58 ++++++-
                   |   9 ++
                          |   5 +
                     |  34 ++++
                         |   4 +
 27 files changed, 458 insertions(+), 41 deletions(-)
diff --git a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index 0a8c7c9..1677208 100644
--- a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -176,7 +176,7 @@ struct pvrdma_port_attr {
 	uint8_t			subnet_timeout;
 	uint8_t			init_type_reply;
 	uint8_t			active_width;
-	uint16_t			active_speed;
+	uint8_t			active_speed;
 	uint8_t			phys_state;
 	uint8_t			reserved[2];
 };
diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h
index 0de1a55..c47e198 100644
--- a/include/standard-headers/drm/drm_fourcc.h
+++ b/include/standard-headers/drm/drm_fourcc.h
@@ -57,6 +57,30 @@ extern "C" {
  * may preserve meaning - such as number of planes - from the fourcc code,
  * whereas others may not.
  *
+ * Modifiers must uniquely encode buffer layout. In other words, a buffer must
+ * match only a single modifier. A modifier must not be a subset of layouts of
+ * another modifier. For instance, it's incorrect to encode pitch alignment in
+ * a modifier: a buffer may match a 64-pixel aligned modifier and a 32-pixel
+ * aligned modifier. That said, modifiers can have implicit minimal
+ * requirements.
+ *
+ * For modifiers where the combination of fourcc code and modifier can alias,
+ * a canonical pair needs to be defined and used by all drivers. Preferred
+ * combinations are also encouraged where all combinations might lead to
+ * confusion and unnecessarily reduced interoperability. An example for the
+ * latter is AFBC, where the ABGR layouts are preferred over ARGB layouts.
+ *
+ * There are two kinds of modifier users:
+ *
+ * - Kernel and user-space drivers: for drivers it's important that modifiers
+ *   don't alias, otherwise two drivers might support the same format but use
+ *   different aliases, preventing them from sharing buffers in an efficient
+ *   format.
+ * - Higher-level programs interfacing with KMS/GBM/EGL/Vulkan/etc: these users
+ *   see modifiers as opaque tokens they can check for equality and intersect.
+ *   These users musn't need to know to reason about the modifier value
+ *   (i.e. they are not expected to extract information out of the modifier).
+ *
  * Vendors should document their modifier usage in as much detail as
  * possible, to ensure maximum compatibility across devices, drivers and
  * applications.
@@ -154,6 +178,12 @@ extern "C" {
 #define DRM_FORMAT_ARGB16161616F fourcc_code('A', 'R', '4', 'H') /* [63:0] A:R:G:B 16:16:16:16 little endian */
 #define DRM_FORMAT_ABGR16161616F fourcc_code('A', 'B', '4', 'H') /* [63:0] A:B:G:R 16:16:16:16 little endian */
 
+/*
+ * RGBA format with 10-bit components packed in 64-bit per pixel, with 6 bits
+ * of unused padding per component:
+ */
+#define DRM_FORMAT_AXBXGXRX106106106106 fourcc_code('A', 'B', '1', '0') /* [63:0] A:x:B:x:G:x:R:x 10:6:10:6:10:6:10:6 little endian */
+
 /* packed YCbCr */
 #define DRM_FORMAT_YUYV		fourcc_code('Y', 'U', 'Y', 'V') /* [31:0] Cr0:Y1:Cb0:Y0 8:8:8:8 little endian */
 #define DRM_FORMAT_YVYU		fourcc_code('Y', 'V', 'Y', 'U') /* [31:0] Cb0:Y1:Cr0:Y0 8:8:8:8 little endian */
@@ -319,7 +349,6 @@ extern "C" {
  */
 
 /* Vendor Ids: */
-#define DRM_FORMAT_MOD_NONE           0
 #define DRM_FORMAT_MOD_VENDOR_NONE    0
 #define DRM_FORMAT_MOD_VENDOR_INTEL   0x01
 #define DRM_FORMAT_MOD_VENDOR_AMD     0x02
@@ -391,6 +420,16 @@ extern "C" {
  */
 #define DRM_FORMAT_MOD_LINEAR	fourcc_mod_code(NONE, 0)
 
+/*
+ * Deprecated: use DRM_FORMAT_MOD_LINEAR instead
+ *
+ * The "none" format modifier doesn't actually mean that the modifier is
+ * implicit, instead it means that the layout is linear. Whether modifiers are
+ * used is out-of-band information carried in an API-specific way (e.g. in a
+ * flag for drm_mode_fb_cmd2).
+ */
+#define DRM_FORMAT_MOD_NONE	0
+
 /* Intel framebuffer modifiers */
 
 /*
@@ -1055,6 +1094,140 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier)
  */
 #define AMLOGIC_FBC_OPTION_MEM_SAVING		(1ULL << 0)
 
+/*
+ * AMD modifiers
+ *
+ * Memory layout:
+ *
+ * without DCC:
+ *   - main surface
+ *
+ * with DCC & without DCC_RETILE:
+ *   - main surface in plane 0
+ *   - DCC surface in plane 1 (RB-aligned, pipe-aligned if DCC_PIPE_ALIGN is set)
+ *
+ * with DCC & DCC_RETILE:
+ *   - main surface in plane 0
+ *   - displayable DCC surface in plane 1 (not RB-aligned & not pipe-aligned)
+ *   - pipe-aligned DCC surface in plane 2 (RB-aligned & pipe-aligned)
+ *
+ * For multi-plane formats the above surfaces get merged into one plane for
+ * each format plane, based on the required alignment only.
+ *
+ * Bits  Parameter                Notes
+ * ----- ------------------------ ---------------------------------------------
+ *
+ *   7:0 TILE_VERSION             Values are AMD_FMT_MOD_TILE_VER_*
+ *  12:8 TILE                     Values are AMD_FMT_MOD_TILE_<version>_*
+ *    13 DCC
+ *    14 DCC_RETILE
+ *    15 DCC_PIPE_ALIGN
+ *    16 DCC_INDEPENDENT_64B
+ *    17 DCC_INDEPENDENT_128B
+ * 19:18 DCC_MAX_COMPRESSED_BLOCK Values are AMD_FMT_MOD_DCC_BLOCK_*
+ *    20 DCC_CONSTANT_ENCODE
+ * 23:21 PIPE_XOR_BITS            Only for some chips
+ * 26:24 BANK_XOR_BITS            Only for some chips
+ * 29:27 PACKERS                  Only for some chips
+ * 32:30 RB                       Only for some chips
+ * 35:33 PIPE                     Only for some chips
+ * 55:36 -                        Reserved for future use, must be zero
+ */
+#define AMD_FMT_MOD fourcc_mod_code(AMD, 0)
+
+#define IS_AMD_FMT_MOD(val) (((val) >> 56) == DRM_FORMAT_MOD_VENDOR_AMD)
+
+/* Reserve 0 for GFX8 and older */
+#define AMD_FMT_MOD_TILE_VER_GFX9 1
+#define AMD_FMT_MOD_TILE_VER_GFX10 2
+#define AMD_FMT_MOD_TILE_VER_GFX10_RBPLUS 3
+
+/*
+ * 64K_S is the same for GFX9/GFX10/GFX10_RBPLUS and hence has GFX9 as canonical
+ * version.
+ */
+#define AMD_FMT_MOD_TILE_GFX9_64K_S 9
+
+/*
+ * 64K_D for non-32 bpp is the same for GFX9/GFX10/GFX10_RBPLUS and hence has
+ * GFX9 as canonical version.
+ */
+#define AMD_FMT_MOD_TILE_GFX9_64K_D 10
+#define AMD_FMT_MOD_TILE_GFX9_64K_S_X 25
+#define AMD_FMT_MOD_TILE_GFX9_64K_D_X 26
+#define AMD_FMT_MOD_TILE_GFX9_64K_R_X 27
+
+#define AMD_FMT_MOD_DCC_BLOCK_64B 0
+#define AMD_FMT_MOD_DCC_BLOCK_128B 1
+#define AMD_FMT_MOD_DCC_BLOCK_256B 2
+
+#define AMD_FMT_MOD_TILE_VERSION_SHIFT 0
+#define AMD_FMT_MOD_TILE_VERSION_MASK 0xFF
+#define AMD_FMT_MOD_TILE_SHIFT 8
+#define AMD_FMT_MOD_TILE_MASK 0x1F
+
+/* Whether DCC compression is enabled. */
+#define AMD_FMT_MOD_DCC_SHIFT 13
+#define AMD_FMT_MOD_DCC_MASK 0x1
+
+/*
+ * Whether to include two DCC surfaces, one which is rb & pipe aligned, and
+ * one which is not-aligned.
+ */
+#define AMD_FMT_MOD_DCC_RETILE_SHIFT 14
+#define AMD_FMT_MOD_DCC_RETILE_MASK 0x1
+
+/* Only set if DCC_RETILE = false */
+#define AMD_FMT_MOD_DCC_PIPE_ALIGN_SHIFT 15
+#define AMD_FMT_MOD_DCC_PIPE_ALIGN_MASK 0x1
+
+#define AMD_FMT_MOD_DCC_INDEPENDENT_64B_SHIFT 16
+#define AMD_FMT_MOD_DCC_INDEPENDENT_64B_MASK 0x1
+#define AMD_FMT_MOD_DCC_INDEPENDENT_128B_SHIFT 17
+#define AMD_FMT_MOD_DCC_INDEPENDENT_128B_MASK 0x1
+#define AMD_FMT_MOD_DCC_MAX_COMPRESSED_BLOCK_SHIFT 18
+#define AMD_FMT_MOD_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3
+
+/*
+ * DCC supports embedding some clear colors directly in the DCC surface.
+ * However, on older GPUs the rendering HW ignores the embedded clear color
+ * and prefers the driver provided color. This necessitates doing a fastclear
+ * eliminate operation before a process transfers control.
+ *
+ * If this bit is set that means the fastclear eliminate is not needed for these
+ * embeddable colors.
+ */
+#define AMD_FMT_MOD_DCC_CONSTANT_ENCODE_SHIFT 20
+#define AMD_FMT_MOD_DCC_CONSTANT_ENCODE_MASK 0x1
+
+/*
+ * The below fields are for accounting for per GPU differences. These are only
+ * relevant for GFX9 and later and if the tile field is *_X/_T.
+ *
+ * PIPE_XOR_BITS = always needed
+ * BANK_XOR_BITS = only for TILE_VER_GFX9
+ * PACKERS = only for TILE_VER_GFX10_RBPLUS
+ * RB = only for TILE_VER_GFX9 & DCC
+ * PIPE = only for TILE_VER_GFX9 & DCC & (DCC_RETILE | DCC_PIPE_ALIGN)
+ */
+#define AMD_FMT_MOD_PIPE_XOR_BITS_SHIFT 21
+#define AMD_FMT_MOD_PIPE_XOR_BITS_MASK 0x7
+#define AMD_FMT_MOD_BANK_XOR_BITS_SHIFT 24
+#define AMD_FMT_MOD_BANK_XOR_BITS_MASK 0x7
+#define AMD_FMT_MOD_PACKERS_SHIFT 27
+#define AMD_FMT_MOD_PACKERS_MASK 0x7
+#define AMD_FMT_MOD_RB_SHIFT 30
+#define AMD_FMT_MOD_RB_MASK 0x7
+#define AMD_FMT_MOD_PIPE_SHIFT 33
+#define AMD_FMT_MOD_PIPE_MASK 0x7
+
+#define AMD_FMT_MOD_SET(field, value) \
+	((uint64_t)(value) << AMD_FMT_MOD_##field##_SHIFT)
+#define AMD_FMT_MOD_GET(field, value) \
+	(((value) >> AMD_FMT_MOD_##field##_SHIFT) & AMD_FMT_MOD_##field##_MASK)
+#define AMD_FMT_MOD_CLEAR(field) \
+	(~((uint64_t)AMD_FMT_MOD_##field##_MASK << AMD_FMT_MOD_##field##_SHIFT))
+
 #if defined(__cplusplus)
 }
 #endif
diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h
index 0df22f7..8bfd01d 100644
--- a/include/standard-headers/linux/ethtool.h
+++ b/include/standard-headers/linux/ethtool.h
@@ -16,7 +16,7 @@
 
 #include "net/eth.h"
 
-#include "standard-headers/linux/kernel.h"
+#include "standard-headers/linux/const.h"
 #include "standard-headers/linux/types.h"
 #include "standard-headers/linux/if_ether.h"
 
diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h
index 82c0a38..950d7ed 100644
--- a/include/standard-headers/linux/fuse.h
+++ b/include/standard-headers/linux/fuse.h
@@ -175,6 +175,10 @@
  *
  *  7.32
  *  - add flags to fuse_attr, add FUSE_ATTR_SUBMOUNT, add FUSE_SUBMOUNTS
+ *
+ *  7.33
+ *  - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID
+ *  - add FUSE_OPEN_KILL_SUIDGID
  */
 
 #ifndef _LINUX_FUSE_H
@@ -206,7 +210,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 32
+#define FUSE_KERNEL_MINOR_VERSION 33
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -267,6 +271,7 @@ struct fuse_file_lock {
 #define FATTR_MTIME_NOW	(1 << 8)
 #define FATTR_LOCKOWNER	(1 << 9)
 #define FATTR_CTIME	(1 << 10)
+#define FATTR_KILL_SUIDGID	(1 << 11)
 
 /**
  * Flags returned by the OPEN request
@@ -316,6 +321,11 @@ struct fuse_file_lock {
  *		       foffset and moffset fields in struct
  *		       fuse_setupmapping_out and fuse_removemapping_one.
  * FUSE_SUBMOUNTS: kernel supports auto-mounting directory submounts
+ * FUSE_HANDLE_KILLPRIV_V2: fs kills suid/sgid/cap on write/chown/trunc.
+ *			Upon write/truncate suid/sgid is only killed if caller
+ *			does not have CAP_FSETID. Additionally upon
+ *			write/truncate sgid is killed only if file has group
+ *			execute permission. (Same as Linux VFS behavior).
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -345,6 +355,7 @@ struct fuse_file_lock {
 #define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
 #define FUSE_MAP_ALIGNMENT	(1 << 26)
 #define FUSE_SUBMOUNTS		(1 << 27)
+#define FUSE_HANDLE_KILLPRIV_V2	(1 << 28)
 
 /**
  * CUSE INIT request/reply flags
@@ -374,11 +385,14 @@ struct fuse_file_lock {
  *
  * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed
  * FUSE_WRITE_LOCKOWNER: lock_owner field is valid
- * FUSE_WRITE_KILL_PRIV: kill suid and sgid bits
+ * FUSE_WRITE_KILL_SUIDGID: kill suid and sgid bits
  */
 #define FUSE_WRITE_CACHE	(1 << 0)
 #define FUSE_WRITE_LOCKOWNER	(1 << 1)
-#define FUSE_WRITE_KILL_PRIV	(1 << 2)
+#define FUSE_WRITE_KILL_SUIDGID (1 << 2)
+
+/* Obsolete alias; this flag implies killing suid/sgid only. */
+#define FUSE_WRITE_KILL_PRIV	FUSE_WRITE_KILL_SUIDGID
 
 /**
  * Read flags
@@ -427,6 +441,12 @@ struct fuse_file_lock {
  */
 #define FUSE_ATTR_SUBMOUNT      (1 << 0)
 
+/**
+ * Open flags
+ * FUSE_OPEN_KILL_SUIDGID: Kill suid and sgid if executable
+ */
+#define FUSE_OPEN_KILL_SUIDGID	(1 << 0)
+
 enum fuse_opcode {
 	FUSE_LOOKUP		= 1,
 	FUSE_FORGET		= 2,  /* no reply */
@@ -588,14 +608,14 @@ struct fuse_setattr_in {
 
 struct fuse_open_in {
 	uint32_t	flags;
-	uint32_t	unused;
+	uint32_t	open_flags;	/* FUSE_OPEN_... */
 };
 
 struct fuse_create_in {
 	uint32_t	flags;
 	uint32_t	mode;
 	uint32_t	umask;
-	uint32_t	padding;
+	uint32_t	open_flags;	/* FUSE_OPEN_... */
 };
 
 struct fuse_open_out {
diff --git a/include/standard-headers/linux/kernel.h b/include/standard-headers/linux/kernel.h
index 1eeba2e..7848c5a 100644
--- a/include/standard-headers/linux/kernel.h
+++ b/include/standard-headers/linux/kernel.h
@@ -3,13 +3,6 @@
 #define _LINUX_KERNEL_H
 
 #include "standard-headers/linux/sysinfo.h"
-
-/*
- * 'kernel.h' contains some often-used function prototypes etc
- */
-#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
-#define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
-
-#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#include "standard-headers/linux/const.h"
 
 #endif /* _LINUX_KERNEL_H */
diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
index a95d55f..e709ae8 100644
--- a/include/standard-headers/linux/pci_regs.h
+++ b/include/standard-headers/linux/pci_regs.h
@@ -531,6 +531,7 @@
 #define  PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */
 #define  PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */
 #define  PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */
+#define  PCI_EXP_LNKCAP_SLS_64_0GB 0x00000006 /* LNKCAP2 SLS Vector bit 5 */
 #define  PCI_EXP_LNKCAP_MLW	0x000003f0 /* Maximum Link Width */
 #define  PCI_EXP_LNKCAP_ASPMS	0x00000c00 /* ASPM Support */
 #define  PCI_EXP_LNKCAP_ASPM_L0S 0x00000400 /* ASPM L0s Support */
@@ -562,6 +563,7 @@
 #define  PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 /* Current Link Speed 8.0GT/s */
 #define  PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */
 #define  PCI_EXP_LNKSTA_CLS_32_0GB 0x0005 /* Current Link Speed 32.0GT/s */
+#define  PCI_EXP_LNKSTA_CLS_64_0GB 0x0006 /* Current Link Speed 64.0GT/s */
 #define  PCI_EXP_LNKSTA_NLW	0x03f0	/* Negotiated Link Width */
 #define  PCI_EXP_LNKSTA_NLW_X1	0x0010	/* Current Link Width x1 */
 #define  PCI_EXP_LNKSTA_NLW_X2	0x0020	/* Current Link Width x2 */
@@ -670,6 +672,7 @@
 #define  PCI_EXP_LNKCAP2_SLS_8_0GB	0x00000008 /* Supported Speed 8GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_16_0GB	0x00000010 /* Supported Speed 16GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_32_0GB	0x00000020 /* Supported Speed 32GT/s */
+#define  PCI_EXP_LNKCAP2_SLS_64_0GB	0x00000040 /* Supported Speed 64GT/s */
 #define  PCI_EXP_LNKCAP2_CROSSLINK	0x00000100 /* Crosslink supported */
 #define PCI_EXP_LNKCTL2		48	/* Link Control 2 */
 #define  PCI_EXP_LNKCTL2_TLS		0x000f
@@ -678,6 +681,7 @@
 #define  PCI_EXP_LNKCTL2_TLS_8_0GT	0x0003 /* Supported Speed 8GT/s */
 #define  PCI_EXP_LNKCTL2_TLS_16_0GT	0x0004 /* Supported Speed 16GT/s */
 #define  PCI_EXP_LNKCTL2_TLS_32_0GT	0x0005 /* Supported Speed 32GT/s */
+#define  PCI_EXP_LNKCTL2_TLS_64_0GT	0x0006 /* Supported Speed 64GT/s */
 #define  PCI_EXP_LNKCTL2_ENTER_COMP	0x0010 /* Enter Compliance */
 #define  PCI_EXP_LNKCTL2_TX_MARGIN	0x0380 /* Transmit Margin */
 #define  PCI_EXP_LNKCTL2_HASD		0x0020 /* HW Autonomous Speed Disable */
@@ -723,6 +727,7 @@
 #define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
 #define PCI_EXT_CAP_ID_L1SS	0x1E	/* L1 PM Substates */
 #define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
+#define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
 #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
 #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
 #define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
@@ -831,6 +836,13 @@
 #define  PCI_PWR_CAP_BUDGET(x)	((x) & 1)	/* Included in system budget */
 #define PCI_EXT_CAP_PWR_SIZEOF	16
 
+/* Root Complex Event Collector Endpoint Association  */
+#define PCI_RCEC_RCIEP_BITMAP	4	/* Associated Bitmap for RCiEPs */
+#define PCI_RCEC_BUSN		8	/* RCEC Associated Bus Numbers */
+#define  PCI_RCEC_BUSN_REG_VER	0x02	/* Least version with BUSN present */
+#define  PCI_RCEC_BUSN_NEXT(x)	(((x) >> 8) & 0xff)
+#define  PCI_RCEC_BUSN_LAST(x)	(((x) >> 16) & 0xff)
+
 /* Vendor-Specific (VSEC, PCI_EXT_CAP_ID_VNDR) */
 #define PCI_VNDR_HEADER		4	/* Vendor-Specific Header */
 #define  PCI_VNDR_HEADER_ID(x)	((x) & 0xffff)
@@ -1066,6 +1078,10 @@
 #define  PCI_L1SS_CTL1_LTR_L12_TH_SCALE	0xe0000000  /* LTR_L1.2_THRESHOLD_Scale */
 #define PCI_L1SS_CTL2		0x0c	/* Control 2 Register */
 
+/* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */
+#define PCI_DVSEC_HEADER1		0x4 /* Designated Vendor-Specific Header1 */
+#define PCI_DVSEC_HEADER2		0x8 /* Designated Vendor-Specific Header2 */
+
 /* Data Link Feature */
 #define PCI_DLF_CAP		0x04	/* Capabilities Register */
 #define  PCI_DLF_EXCHANGE_ENABLE	0x80000000  /* Data Link Feature Exchange Enable */
diff --git a/include/standard-headers/linux/vhost_types.h b/include/standard-headers/linux/vhost_types.h
index 486630b..0bd2684 100644
--- a/include/standard-headers/linux/vhost_types.h
+++ b/include/standard-headers/linux/vhost_types.h
@@ -138,6 +138,15 @@ struct vhost_vdpa_config {
 	uint8_t buf[0];
 };
 
+/* vhost vdpa IOVA range
+ * @first: First address that can be mapped by vhost-vDPA
+ * @last: Last address that can be mapped by vhost-vDPA
+ */
+struct vhost_vdpa_iova_range {
+	uint64_t first;
+	uint64_t last;
+};
+
 /* Feature bits */
 /* Log all write descriptors. Can be changed while device is active. */
 #define VHOST_F_LOG_ALL 26
diff --git a/include/standard-headers/linux/virtio_gpu.h b/include/standard-headers/linux/virtio_gpu.h
index 4183cdc..1357e47 100644
--- a/include/standard-headers/linux/virtio_gpu.h
+++ b/include/standard-headers/linux/virtio_gpu.h
@@ -55,6 +55,11 @@
  */
 #define VIRTIO_GPU_F_RESOURCE_UUID       2
 
+/*
+ * VIRTIO_GPU_CMD_RESOURCE_CREATE_BLOB
+ */
+#define VIRTIO_GPU_F_RESOURCE_BLOB       3
+
 enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_UNDEFINED = 0,
 
@@ -71,6 +76,8 @@ enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_CMD_GET_CAPSET,
 	VIRTIO_GPU_CMD_GET_EDID,
 	VIRTIO_GPU_CMD_RESOURCE_ASSIGN_UUID,
+	VIRTIO_GPU_CMD_RESOURCE_CREATE_BLOB,
+	VIRTIO_GPU_CMD_SET_SCANOUT_BLOB,
 
 	/* 3d commands */
 	VIRTIO_GPU_CMD_CTX_CREATE = 0x0200,
@@ -81,6 +88,8 @@ enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_CMD_TRANSFER_TO_HOST_3D,
 	VIRTIO_GPU_CMD_TRANSFER_FROM_HOST_3D,
 	VIRTIO_GPU_CMD_SUBMIT_3D,
+	VIRTIO_GPU_CMD_RESOURCE_MAP_BLOB,
+	VIRTIO_GPU_CMD_RESOURCE_UNMAP_BLOB,
 
 	/* cursor commands */
 	VIRTIO_GPU_CMD_UPDATE_CURSOR = 0x0300,
@@ -93,6 +102,7 @@ enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_RESP_OK_CAPSET,
 	VIRTIO_GPU_RESP_OK_EDID,
 	VIRTIO_GPU_RESP_OK_RESOURCE_UUID,
+	VIRTIO_GPU_RESP_OK_MAP_INFO,
 
 	/* error responses */
 	VIRTIO_GPU_RESP_ERR_UNSPEC = 0x1200,
@@ -103,6 +113,15 @@ enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER,
 };
 
+enum virtio_gpu_shm_id {
+	VIRTIO_GPU_SHM_ID_UNDEFINED = 0,
+	/*
+	 * VIRTIO_GPU_CMD_RESOURCE_MAP_BLOB
+	 * VIRTIO_GPU_CMD_RESOURCE_UNMAP_BLOB
+	 */
+	VIRTIO_GPU_SHM_ID_HOST_VISIBLE = 1
+};
+
 #define VIRTIO_GPU_FLAG_FENCE (1 << 0)
 
 struct virtio_gpu_ctrl_hdr {
@@ -359,4 +378,67 @@ struct virtio_gpu_resp_resource_uuid {
 	uint8_t uuid[16];
 };
 
+/* VIRTIO_GPU_CMD_RESOURCE_CREATE_BLOB */
+struct virtio_gpu_resource_create_blob {
+	struct virtio_gpu_ctrl_hdr hdr;
+	uint32_t resource_id;
+#define VIRTIO_GPU_BLOB_MEM_GUEST             0x0001
+#define VIRTIO_GPU_BLOB_MEM_HOST3D            0x0002
+#define VIRTIO_GPU_BLOB_MEM_HOST3D_GUEST      0x0003
+
+#define VIRTIO_GPU_BLOB_FLAG_USE_MAPPABLE     0x0001
+#define VIRTIO_GPU_BLOB_FLAG_USE_SHAREABLE    0x0002
+#define VIRTIO_GPU_BLOB_FLAG_USE_CROSS_DEVICE 0x0004
+	/* zero is invalid blob mem */
+	uint32_t blob_mem;
+	uint32_t blob_flags;
+	uint32_t nr_entries;
+	uint64_t blob_id;
+	uint64_t size;
+	/*
+	 * sizeof(nr_entries * virtio_gpu_mem_entry) bytes follow
+	 */
+};
+
+/* VIRTIO_GPU_CMD_SET_SCANOUT_BLOB */
+struct virtio_gpu_set_scanout_blob {
+	struct virtio_gpu_ctrl_hdr hdr;
+	struct virtio_gpu_rect r;
+	uint32_t scanout_id;
+	uint32_t resource_id;
+	uint32_t width;
+	uint32_t height;
+	uint32_t format;
+	uint32_t padding;
+	uint32_t strides[4];
+	uint32_t offsets[4];
+};
+
+/* VIRTIO_GPU_CMD_RESOURCE_MAP_BLOB */
+struct virtio_gpu_resource_map_blob {
+	struct virtio_gpu_ctrl_hdr hdr;
+	uint32_t resource_id;
+	uint32_t padding;
+	uint64_t offset;
+};
+
+/* VIRTIO_GPU_RESP_OK_MAP_INFO */
+#define VIRTIO_GPU_MAP_CACHE_MASK     0x0f
+#define VIRTIO_GPU_MAP_CACHE_NONE     0x00
+#define VIRTIO_GPU_MAP_CACHE_CACHED   0x01
+#define VIRTIO_GPU_MAP_CACHE_UNCACHED 0x02
+#define VIRTIO_GPU_MAP_CACHE_WC       0x03
+struct virtio_gpu_resp_map_info {
+	struct virtio_gpu_ctrl_hdr hdr;
+	uint32_t map_info;
+	uint32_t padding;
+};
+
+/* VIRTIO_GPU_CMD_RESOURCE_UNMAP_BLOB */
+struct virtio_gpu_resource_unmap_blob {
+	struct virtio_gpu_ctrl_hdr hdr;
+	uint32_t resource_id;
+	uint32_t padding;
+};
+
 #endif
diff --git a/include/standard-headers/linux/virtio_ids.h b/include/standard-headers/linux/virtio_ids.h
index b052355..bc1c062 100644
--- a/include/standard-headers/linux/virtio_ids.h
+++ b/include/standard-headers/linux/virtio_ids.h
@@ -29,24 +29,30 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE. */
 
-#define VIRTIO_ID_NET		1 /* virtio net */
-#define VIRTIO_ID_BLOCK		2 /* virtio block */
-#define VIRTIO_ID_CONSOLE	3 /* virtio console */
-#define VIRTIO_ID_RNG		4 /* virtio rng */
-#define VIRTIO_ID_BALLOON	5 /* virtio balloon */
-#define VIRTIO_ID_RPMSG		7 /* virtio remote processor messaging */
-#define VIRTIO_ID_SCSI		8 /* virtio scsi */
-#define VIRTIO_ID_9P		9 /* 9p virtio console */
-#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
-#define VIRTIO_ID_CAIF	       12 /* Virtio caif */
-#define VIRTIO_ID_GPU          16 /* virtio GPU */
-#define VIRTIO_ID_INPUT        18 /* virtio input */
-#define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
-#define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
-#define VIRTIO_ID_IOMMU        23 /* virtio IOMMU */
-#define VIRTIO_ID_MEM          24 /* virtio mem */
-#define VIRTIO_ID_FS           26 /* virtio filesystem */
-#define VIRTIO_ID_PMEM         27 /* virtio pmem */
-#define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */
+#define VIRTIO_ID_NET			1 /* virtio net */
+#define VIRTIO_ID_BLOCK			2 /* virtio block */
+#define VIRTIO_ID_CONSOLE		3 /* virtio console */
+#define VIRTIO_ID_RNG			4 /* virtio rng */
+#define VIRTIO_ID_BALLOON		5 /* virtio balloon */
+#define VIRTIO_ID_IOMEM			6 /* virtio ioMemory */
+#define VIRTIO_ID_RPMSG			7 /* virtio remote processor messaging */
+#define VIRTIO_ID_SCSI			8 /* virtio scsi */
+#define VIRTIO_ID_9P			9 /* 9p virtio console */
+#define VIRTIO_ID_MAC80211_WLAN		10 /* virtio WLAN MAC */
+#define VIRTIO_ID_RPROC_SERIAL		11 /* virtio remoteproc serial link */
+#define VIRTIO_ID_CAIF			12 /* Virtio caif */
+#define VIRTIO_ID_MEMORY_BALLOON	13 /* virtio memory balloon */
+#define VIRTIO_ID_GPU			16 /* virtio GPU */
+#define VIRTIO_ID_CLOCK			17 /* virtio clock/timer */
+#define VIRTIO_ID_INPUT			18 /* virtio input */
+#define VIRTIO_ID_VSOCK			19 /* virtio vsock transport */
+#define VIRTIO_ID_CRYPTO		20 /* virtio crypto */
+#define VIRTIO_ID_SIGNAL_DIST		21 /* virtio signal distribution device */
+#define VIRTIO_ID_PSTORE		22 /* virtio pstore device */
+#define VIRTIO_ID_IOMMU			23 /* virtio IOMMU */
+#define VIRTIO_ID_MEM			24 /* virtio mem */
+#define VIRTIO_ID_FS			26 /* virtio filesystem */
+#define VIRTIO_ID_PMEM			27 /* virtio pmem */
+#define VIRTIO_ID_MAC80211_HWSIM	29 /* virtio mac80211-hwsim */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
 --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h
index a72de1a..b6a0eaa 100644
--- a/linux-headers/asm-arm64/kvm.h
+++ b/linux-headers/asm-arm64/kvm.h
@@ -156,9 +156,6 @@ struct kvm_sync_regs {
 	__u64 device_irq_level;
 };
 
-struct kvm_arch_memory_slot {
-};
-
 /*
  * PMU filter structure. Describe a range of events with a particular
  * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
 --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h
index 2056318..7287529 100644
--- a/linux-headers/asm-generic/unistd.h
+++ b/linux-headers/asm-generic/unistd.h
@@ -517,7 +517,7 @@ __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday)
 __SC_3264(__NR_adjtimex, sys_adjtimex_time32, sys_adjtimex)
 #endif
 
-/* kernel/timer.c */
+/* kernel/sys.c */
 #define __NR_getpid 172
 __SYSCALL(__NR_getpid, sys_getpid)
 #define __NR_getppid 173
@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
 #define __NR_process_madvise 440
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_epoll_pwait2 441
+__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
 
 #undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
 
 /*
  * 32 bit systems traditionally used different
 --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h
index aba284d..59e53b6 100644
--- a/linux-headers/asm-mips/unistd_n32.h
+++ b/linux-headers/asm-mips/unistd_n32.h
@@ -370,6 +370,7 @@
 #define __NR_pidfd_getfd	(__NR_Linux + 438)
 #define __NR_faccessat2	(__NR_Linux + 439)
 #define __NR_process_madvise	(__NR_Linux + 440)
+#define __NR_epoll_pwait2	(__NR_Linux + 441)
 
 
 #endif /* _ASM_MIPS_UNISTD_N32_H */
 --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h
index 0465ab9..683558a 100644
--- a/linux-headers/asm-mips/unistd_n64.h
+++ b/linux-headers/asm-mips/unistd_n64.h
@@ -346,6 +346,7 @@
 #define __NR_pidfd_getfd	(__NR_Linux + 438)
 #define __NR_faccessat2	(__NR_Linux + 439)
 #define __NR_process_madvise	(__NR_Linux + 440)
+#define __NR_epoll_pwait2	(__NR_Linux + 441)
 
 
 #endif /* _ASM_MIPS_UNISTD_N64_H */
 --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h
index 5222a0d..ca6a7e5 100644
--- a/linux-headers/asm-mips/unistd_o32.h
+++ b/linux-headers/asm-mips/unistd_o32.h
@@ -416,6 +416,7 @@
 #define __NR_pidfd_getfd	(__NR_Linux + 438)
 #define __NR_faccessat2	(__NR_Linux + 439)
 #define __NR_process_madvise	(__NR_Linux + 440)
+#define __NR_epoll_pwait2	(__NR_Linux + 441)
 
 
 #endif /* _ASM_MIPS_UNISTD_O32_H */
 --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h
index 21066a3..4624c90 100644
--- a/linux-headers/asm-powerpc/unistd_32.h
+++ b/linux-headers/asm-powerpc/unistd_32.h
@@ -423,6 +423,7 @@
 #define __NR_pidfd_getfd	438
 #define __NR_faccessat2	439
 #define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
 
 
 #endif /* _ASM_POWERPC_UNISTD_32_H */
 --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h
index c153da2..7e851b3 100644
--- a/linux-headers/asm-powerpc/unistd_64.h
+++ b/linux-headers/asm-powerpc/unistd_64.h
@@ -395,6 +395,7 @@
 #define __NR_pidfd_getfd	438
 #define __NR_faccessat2	439
 #define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
 
 
 #endif /* _ASM_POWERPC_UNISTD_64_H */
 --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h
index 3b4f2dd..c94d2c3 100644
--- a/linux-headers/asm-s390/unistd_32.h
+++ b/linux-headers/asm-s390/unistd_32.h
@@ -413,5 +413,6 @@
 #define __NR_pidfd_getfd 438
 #define __NR_faccessat2 439
 #define __NR_process_madvise 440
+#define __NR_epoll_pwait2 441
 
 #endif /* _ASM_S390_UNISTD_32_H */
 --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h
index 030a51f..984a06b 100644
--- a/linux-headers/asm-s390/unistd_64.h
+++ b/linux-headers/asm-s390/unistd_64.h
@@ -361,5 +361,6 @@
 #define __NR_pidfd_getfd 438
 #define __NR_faccessat2 439
 #define __NR_process_madvise 440
+#define __NR_epoll_pwait2 441
 
 #endif /* _ASM_S390_UNISTD_64_H */
 --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h
index 89e5f3d..8e76d37 100644
--- a/linux-headers/asm-x86/kvm.h
+++ b/linux-headers/asm-x86/kvm.h
@@ -12,6 +12,7 @@
 
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
 
 #define DE_VECTOR 0
 #define DB_VECTOR 1
 --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h
index cfba368..18fb99d 100644
--- a/linux-headers/asm-x86/unistd_32.h
+++ b/linux-headers/asm-x86/unistd_32.h
@@ -431,6 +431,7 @@
 #define __NR_pidfd_getfd 438
 #define __NR_faccessat2 439
 #define __NR_process_madvise 440
+#define __NR_epoll_pwait2 441
 
 
 #endif /* _ASM_X86_UNISTD_32_H */
 --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h
index 61af725..bde9593 100644
--- a/linux-headers/asm-x86/unistd_64.h
+++ b/linux-headers/asm-x86/unistd_64.h
@@ -353,6 +353,7 @@
 #define __NR_pidfd_getfd 438
 #define __NR_faccessat2 439
 #define __NR_process_madvise 440
+#define __NR_epoll_pwait2 441
 
 
 #endif /* _ASM_X86_UNISTD_64_H */
 --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h
index a6890cb..4ff6b17 100644
--- a/linux-headers/asm-x86/unistd_x32.h
+++ b/linux-headers/asm-x86/unistd_x32.h
@@ -306,6 +306,7 @@
 #define __NR_pidfd_getfd (__X32_SYSCALL_BIT + 438)
 #define __NR_faccessat2 (__X32_SYSCALL_BIT + 439)
 #define __NR_process_madvise (__X32_SYSCALL_BIT + 440)
+#define __NR_epoll_pwait2 (__X32_SYSCALL_BIT + 441)
 #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512)
 #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513)
 #define __NR_ioctl (__X32_SYSCALL_BIT + 514)
 --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 56ce14a..220db26 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -250,6 +250,8 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_ARM_NISV         28
 #define KVM_EXIT_X86_RDMSR        29
 #define KVM_EXIT_X86_WRMSR        30
+#define KVM_EXIT_DIRTY_RING_FULL  31
+#define KVM_EXIT_AP_RESET_HOLD    32
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -572,6 +574,7 @@ struct kvm_vapic_addr {
 #define KVM_MP_STATE_CHECK_STOP        6
 #define KVM_MP_STATE_OPERATING         7
 #define KVM_MP_STATE_LOAD              8
+#define KVM_MP_STATE_AP_RESET_HOLD     9
 
 struct kvm_mp_state {
 	__u32 mp_state;
@@ -1053,6 +1056,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_X86_USER_SPACE_MSR 188
 #define KVM_CAP_X86_MSR_FILTER 189
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
+#define KVM_CAP_SYS_HYPERV_CPUID 191
+#define KVM_CAP_DIRTY_LOG_RING 192
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1511,7 +1516,7 @@ struct kvm_enc_region {
 /* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT_2 */
 #define KVM_CLEAR_DIRTY_LOG          _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log)
 
-/* Available with KVM_CAP_HYPERV_CPUID */
+/* Available with KVM_CAP_HYPERV_CPUID (vcpu) / KVM_CAP_SYS_HYPERV_CPUID (system) */
 #define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2)
 
 /* Available with KVM_CAP_ARM_SVE */
@@ -1557,6 +1562,9 @@ struct kvm_pv_cmd {
 /* Available with KVM_CAP_X86_MSR_FILTER */
 #define KVM_X86_SET_MSR_FILTER	_IOW(KVMIO,  0xc6, struct kvm_msr_filter)
 
+/* Available with KVM_CAP_DIRTY_LOG_RING */
+#define KVM_RESET_DIRTY_RINGS		_IO(KVMIO, 0xc7)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */
@@ -1710,4 +1718,52 @@ struct kvm_hyperv_eventfd {
 #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE    (1 << 0)
 #define KVM_DIRTY_LOG_INITIALLY_SET            (1 << 1)
 
+/*
+ * Arch needs to define the macro after implementing the dirty ring
+ * feature.  KVM_DIRTY_LOG_PAGE_OFFSET should be defined as the
+ * starting page offset of the dirty ring structures.
+ */
+#ifndef KVM_DIRTY_LOG_PAGE_OFFSET
+#define KVM_DIRTY_LOG_PAGE_OFFSET 0
+#endif
+
+/*
+ * KVM dirty GFN flags, defined as:
+ *
+ * |---------------+---------------+--------------|
+ * | bit 1 (reset) | bit 0 (dirty) | Status       |
+ * |---------------+---------------+--------------|
+ * |             0 |             0 | Invalid GFN  |
+ * |             0 |             1 | Dirty GFN    |
+ * |             1 |             X | GFN to reset |
+ * |---------------+---------------+--------------|
+ *
+ * Lifecycle of a dirty GFN goes like:
+ *
+ *      dirtied         harvested        reset
+ * 00 -----------> 01 -------------> 1X -------+
+ *  ^                                          |
+ *  |                                          |
+ *  +------------------------------------------+
+ *
+ * The userspace program is only responsible for the 01->1X state
+ * conversion after harvesting an entry.  Also, it must not skip any
+ * dirty bits, so that dirty bits are always harvested in sequence.
+ */
+#define KVM_DIRTY_GFN_F_DIRTY           BIT(0)
+#define KVM_DIRTY_GFN_F_RESET           BIT(1)
+#define KVM_DIRTY_GFN_F_MASK            0x3
+
+/*
+ * KVM dirty rings should be mapped at KVM_DIRTY_LOG_PAGE_OFFSET of
+ * per-vcpu mmaped regions as an array of struct kvm_dirty_gfn.  The
+ * size of the gfn buffer is decided by the first argument when
+ * enabling KVM_CAP_DIRTY_LOG_RING.
+ */
+struct kvm_dirty_gfn {
+	__u32 flags;
+	__u32 slot;
+	__u64 offset;
+};
+
 #endif /* __LINUX_KVM_H */
 --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h
index 8d3996e..1ba9a9f 100644
--- a/linux-headers/linux/userfaultfd.h
+++ b/linux-headers/linux/userfaultfd.h
@@ -257,4 +257,13 @@ struct uffdio_writeprotect {
 	__u64 mode;
 };
 
+/*
+ * Flags for the userfaultfd(2) system call itself.
+ */
+
+/*
+ * Create a userfaultfd that can handle page faults only in user mode.
+ */
+#define UFFD_USER_MODE_ONLY 1
+
 #endif /* _LINUX_USERFAULTFD_H */
 --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index b92dcc4..f14e46c 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -338,6 +338,10 @@ struct vfio_region_info_cap_type {
  * to do TLB invalidation on a GPU.
  */
 #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
+/*
+ * IBM zPCI I/O region
+ */
+#define VFIO_REGION_SUBTYPE_IBM_ZPCI_IO		(2)
 
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
@@ -820,6 +824,7 @@ enum {
 enum {
 	VFIO_CCW_IO_IRQ_INDEX,
 	VFIO_CCW_CRW_IRQ_INDEX,
+	VFIO_CCW_REQ_IRQ_INDEX,
 	VFIO_CCW_NUM_IRQS
 };
 
 --git a/linux-headers/linux/vfio_zdev.h b/linux-headers/linux/vfio_zdev.h
index b430939..830acca 100644
--- a/linux-headers/linux/vfio_zdev.h
+++ b/linux-headers/linux/vfio_zdev.h
@@ -43,6 +43,7 @@ struct vfio_device_info_cap_zpci_group {
 	__u64 msi_addr;		/* MSI address */
 	__u64 flags;
 #define VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH 1 /* Program-specified TLB refresh */
+#define VFIO_DEVICE_INFO_ZPCI_FLAG_RELAXED 2 /* Relaxed Length and Alignment */
 	__u16 mui;		/* Measurement Block Update Interval */
 	__u16 noi;		/* Maximum number of MSIs */
 	__u16 maxstbl;		/* Maximum Store Block Length */
@@ -75,4 +76,37 @@ struct vfio_device_info_cap_zpci_pfip {
 	__u8 pfip[];
 };
 
+/**
+ * VFIO_REGION_SUBTYPE_IBM_ZPCI_IO - VFIO zPCI PCI Direct I/O Region
+ *
+ * This region is used to transfer I/O operations from the guest directly
+ * to the host zPCI I/O layer.  The same instruction requested by the guest
+ * (e.g. PCISTB) will always be used, even if the MIO variant is available.
+ *
+ * The _hdr area is user-readable and is used to provide setup information.
+ * The _req area is user-writable and is used to provide the I/O operation.
+ */
+struct vfio_zpci_io_hdr {
+	__u64 flags;
+	__u16 max;		/* Max block operation size allowed */
+	__u16 reserved;
+	__u32 reserved2;
+};
+
+struct vfio_zpci_io_req {
+	__u64 flags;
+#define VFIO_ZPCI_IO_FLAG_READ (1 << 0) /* Read Operation Specified */
+#define VFIO_ZPCI_IO_FLAG_BLOCKW (1 << 1) /* Block Write Operation Specified */
+	__u64 gaddr;		/* Address of guest data */
+	__u64 offset;		/* Offset into target PCI Address Space */
+	__u32 reserved;
+	__u16 len;		/* Length of guest operation */
+	__u8 pcias;		/* Target PCI Address Space */
+	__u8 reserved2;
+};
+
+struct vfio_region_zpci_io {
+	struct vfio_zpci_io_hdr hdr;
+	struct vfio_zpci_io_req req;
+};
 #endif
 --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h
index 7523218..c998860 100644
--- a/linux-headers/linux/vhost.h
+++ b/linux-headers/linux/vhost.h
@@ -146,4 +146,8 @@
 
 /* Set event fd for config interrupt*/
 #define VHOST_VDPA_SET_CONFIG_CALL	_IOW(VHOST_VIRTIO, 0x77, int)
+
+/* Get the valid iova range */
+#define VHOST_VDPA_GET_IOVA_RANGE	_IOR(VHOST_VIRTIO, 0x78, \
+					     struct vhost_vdpa_iova_range)
 #endif
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * [PATCH 2/8] s390x/pci: Keep track of the PCI Function type
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
  2021-01-19 20:44 ` [PATCH 1/8] linux-headers: update against 5.11-rc4 Matthew Rosato
@ 2021-01-19 20:44 ` Matthew Rosato
  2021-01-19 20:44 ` [PATCH 3/8] s390x/pci: MSI-X isn't strictly required for passthrough Matthew Rosato
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-19 20:44 UTC (permalink / raw)
  To: cohuck, thuth
  Cc: pmorel, david, schnelle, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, mst, pbonzini
We now receive the hardware identification of the PCI Function type
through vfio.  While we can't yet forward that to a guest via CLP,
we can still use this to make device-type-dependent decisions in
QEMU.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 hw/s390x/s390-pci-vfio.c        | 1 +
 include/hw/s390x/s390-pci-bus.h | 1 +
 2 files changed, 2 insertions(+)
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index ead4f22..cb59e98 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -125,6 +125,7 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev,
     /* The following values remain 0 until we support other FMB formats */
     pbdev->zpci_fn.fmbl = 0;
     pbdev->zpci_fn.pft = 0;
+    pbdev->pft = cap->pft;
 }
 
 static void s390_pci_read_group(S390PCIBusDevice *pbdev,
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index 49ae9f0..0662e7b 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -342,6 +342,7 @@ struct S390PCIBusDevice {
     uint16_t noi;
     uint16_t maxstbl;
     uint8_t sum;
+    uint8_t pft;
     S390PCIGroup *pci_group;
     ClpRspQueryPci zpci_fn;
     S390MsixInfo msix;
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * [PATCH 3/8] s390x/pci: MSI-X isn't strictly required for passthrough
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
  2021-01-19 20:44 ` [PATCH 1/8] linux-headers: update against 5.11-rc4 Matthew Rosato
  2021-01-19 20:44 ` [PATCH 2/8] s390x/pci: Keep track of the PCI Function type Matthew Rosato
@ 2021-01-19 20:44 ` Matthew Rosato
  2021-01-19 20:44 ` [PATCH 4/8] s390x/pci: Introduce the ZpciOps structure Matthew Rosato
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-19 20:44 UTC (permalink / raw)
  To: cohuck, thuth
  Cc: pmorel, david, schnelle, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, mst, pbonzini
s390 PCI currently disallows PCI devices without the MSI-X capability.
However, this fence doesn't make sense for passthrough devices.  Move
the check to only fence emulated devices (e.g., virtio).
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
---
 hw/s390x/s390-pci-bus.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index dd138da..dc732e2 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -1028,12 +1028,12 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
             s390_pci_get_clp_info(pbdev);
         } else {
             pbdev->fh |= FH_SHM_EMUL;
-        }
 
-        if (s390_pci_msix_init(pbdev)) {
-            error_setg(errp, "MSI-X support is mandatory "
-                       "in the S390 architecture");
-            return;
+            if (s390_pci_msix_init(pbdev)) {
+                error_setg(errp, "MSI-X support is mandatory "
+                           "in the S390 architecture");
+                return;
+            }
         }
 
         if (dev->hotplugged) {
@@ -1073,7 +1073,9 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
         devfn = pci_dev->devfn;
         qdev_unrealize(dev);
 
-        s390_pci_msix_free(pbdev);
+        if (pbdev->fh & FH_SHM_EMUL) {
+            s390_pci_msix_free(pbdev);
+        }
         s390_pci_iommu_free(s, bus, devfn);
         pbdev->pdev = NULL;
         pbdev->state = ZPCI_FS_RESERVED;
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * [PATCH 4/8] s390x/pci: Introduce the ZpciOps structure
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
                   ` (2 preceding siblings ...)
  2021-01-19 20:44 ` [PATCH 3/8] s390x/pci: MSI-X isn't strictly required for passthrough Matthew Rosato
@ 2021-01-19 20:44 ` Matthew Rosato
  2021-01-19 20:44 ` [PATCH 5/8] s390x/pci: Handle devices that support relaxed alignment Matthew Rosato
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-19 20:44 UTC (permalink / raw)
  To: cohuck, thuth
  Cc: pmorel, david, schnelle, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, mst, pbonzini
As inftrastructure to introduce different PCI instruction handlers,
introduce the ZpciOps structure to contain function pointers for the
handlers.  Add default handlers for the PCISTG, PCILG and PCISTB
instructions.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 hw/s390x/s390-pci-bus.c          |   3 +
 hw/s390x/s390-pci-inst.c         | 246 ++++++++++++++++++++++++++-------------
 include/hw/s390x/s390-pci-bus.h  |  22 ++++
 include/hw/s390x/s390-pci-inst.h |   1 +
 4 files changed, 189 insertions(+), 83 deletions(-)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index dc732e2..784ca65 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -1020,6 +1020,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
         pbdev->iommu->pbdev = pbdev;
         pbdev->state = ZPCI_FS_DISABLED;
         set_pbdev_info(pbdev);
+        zpci_assign_default_ops(pbdev);
 
         if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
             pbdev->fh |= FH_SHM_VFIO;
@@ -1079,6 +1080,8 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
         s390_pci_iommu_free(s, bus, devfn);
         pbdev->pdev = NULL;
         pbdev->state = ZPCI_FS_RESERVED;
+        if (pbdev->pcistb_buf)
+            qemu_vfree(pbdev->pcistb_buf);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_S390_PCI_DEVICE)) {
         pbdev = S390_PCI_DEVICE(dev);
         pbdev->fid = 0;
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 654fac6..2d440a3 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -404,16 +404,49 @@ static MemTxResult zpci_read_bar(S390PCIBusDevice *pbdev, uint8_t pcias,
                                        MEMTXATTRS_UNSPECIFIED);
 }
 
+static int pcilg_default(S390PCIBusDevice *pbdev, uint64_t *data, uint8_t pcias,
+                         uint16_t len, uint64_t offset)
+{
+    MemTxResult result;
+
+    switch (pcias) {
+    case ZPCI_IO_BAR_MIN...ZPCI_IO_BAR_MAX:
+        if (!len || (len > (8 - (offset & 0x7)))) {
+            return -EINVAL;
+        }
+        result = zpci_read_bar(pbdev, pcias, offset, data, len);
+        if (result != MEMTX_OK) {
+            return -EINVAL;
+        }
+        break;
+    case ZPCI_CONFIG_BAR:
+        if (!len || (len > (4 - (offset & 0x3))) || len == 3) {
+            return -EINVAL;
+        }
+        *data =  pci_host_config_read_common(
+                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
+
+        if (zpci_endian_swap(data, len)) {
+            return -EINVAL;
+        }
+        break;
+    default:
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
 int pcilg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
 {
     CPUS390XState *env = &cpu->env;
     S390PCIBusDevice *pbdev;
     uint64_t offset;
     uint64_t data;
-    MemTxResult result;
     uint8_t len;
     uint32_t fh;
     uint8_t pcias;
+    int ret;
 
     if (env->psw.mask & PSW_MASK_PSTATE) {
         s390_program_interrupt(env, PGM_PRIVILEGED, ra);
@@ -452,35 +485,21 @@ int pcilg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
         break;
     }
 
-    switch (pcias) {
-    case ZPCI_IO_BAR_MIN...ZPCI_IO_BAR_MAX:
-        if (!len || (len > (8 - (offset & 0x7)))) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
-        result = zpci_read_bar(pbdev, pcias, offset, &data, len);
-        if (result != MEMTX_OK) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
-        break;
-    case ZPCI_CONFIG_BAR:
-        if (!len || (len > (4 - (offset & 0x3))) || len == 3) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
-        data =  pci_host_config_read_common(
-                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
+    ret = pbdev->ops.pcilg(pbdev, &data, pcias, len, offset);
 
-        if (zpci_endian_swap(&data, len)) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
-        break;
-    default:
+    switch (ret) {
+    case -EINVAL:
+        s390_program_interrupt(env, PGM_OPERAND, ra);
+        return 0;
+    case -EFAULT:
         DPRINTF("pcilg invalid space\n");
         setcc(cpu, ZPCI_PCI_LS_ERR);
         s390_set_status_code(env, r2, ZPCI_PCI_ST_INVAL_AS);
+    case 0:
+        break;
+    default:
+        DPRINTF("pcilg unexpected return %d from op\n", ret);
+        s390_program_interrupt(env, PGM_OPERAND, ra);
         return 0;
     }
 
@@ -504,15 +523,55 @@ static MemTxResult zpci_write_bar(S390PCIBusDevice *pbdev, uint8_t pcias,
                                         MEMTXATTRS_UNSPECIFIED);
 }
 
+static int pcistg_default(S390PCIBusDevice *pbdev, uint64_t data, uint8_t pcias,
+                          uint16_t len, uint64_t offset)
+{
+    MemTxResult result;
+
+    switch (pcias) {
+        /* A ZPCI PCI card may use any BAR from BAR 0 to BAR 5 */
+    case ZPCI_IO_BAR_MIN...ZPCI_IO_BAR_MAX:
+        /*
+         * Check length:
+         * A length of 0 is invalid and length should not cross a double word
+         */
+        if (!len || (len > (8 - (offset & 0x7)))) {
+            return -EINVAL;
+        }
+
+        result = zpci_write_bar(pbdev, pcias, offset, data, len);
+        if (result != MEMTX_OK) {
+            return -EINVAL;
+        }
+        break;
+    case ZPCI_CONFIG_BAR:
+        /* ZPCI uses the pseudo BAR number 15 as configuration space */
+        /* possible access lengths are 1,2,4 and must not cross a word */
+        if (!len || (len > (4 - (offset & 0x3))) || len == 3) {
+            return -EINVAL;
+        }
+        /* len = 1,2,4 so we do not need to test */
+        zpci_endian_swap(&data, len);
+        pci_host_config_write_common(pbdev->pdev, offset,
+                                     pci_config_size(pbdev->pdev),
+                                     data, len);
+        break;
+    default:
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
 int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
 {
     CPUS390XState *env = &cpu->env;
     uint64_t offset, data;
     S390PCIBusDevice *pbdev;
-    MemTxResult result;
     uint8_t len;
     uint32_t fh;
     uint8_t pcias;
+    int ret;
 
     if (env->psw.mask & PSW_MASK_PSTATE) {
         s390_program_interrupt(env, PGM_PRIVILEGED, ra);
@@ -555,40 +614,21 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
         break;
     }
 
-    switch (pcias) {
-        /* A ZPCI PCI card may use any BAR from BAR 0 to BAR 5 */
-    case ZPCI_IO_BAR_MIN...ZPCI_IO_BAR_MAX:
-        /* Check length:
-         * A length of 0 is invalid and length should not cross a double word
-         */
-        if (!len || (len > (8 - (offset & 0x7)))) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
+    ret = pbdev->ops.pcistg(pbdev, data, pcias, len, offset);
 
-        result = zpci_write_bar(pbdev, pcias, offset, data, len);
-        if (result != MEMTX_OK) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
-        break;
-    case ZPCI_CONFIG_BAR:
-        /* ZPCI uses the pseudo BAR number 15 as configuration space */
-        /* possible access lengths are 1,2,4 and must not cross a word */
-        if (!len || (len > (4 - (offset & 0x3))) || len == 3) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
-        /* len = 1,2,4 so we do not need to test */
-        zpci_endian_swap(&data, len);
-        pci_host_config_write_common(pbdev->pdev, offset,
-                                     pci_config_size(pbdev->pdev),
-                                     data, len);
-        break;
-    default:
+    switch (ret) {
+    case -EINVAL:
+        s390_program_interrupt(env, PGM_OPERAND, ra);
+        return 0;
+    case -EFAULT:
         DPRINTF("pcistg invalid space\n");
         setcc(cpu, ZPCI_PCI_LS_ERR);
         s390_set_status_code(env, r2, ZPCI_PCI_ST_INVAL_AS);
+    case 0:
+        break;
+    default:
+        DPRINTF("pcistg unexpected return %d from op\n", ret);
+        s390_program_interrupt(env, PGM_OPERAND, ra);
         return 0;
     }
 
@@ -744,19 +784,55 @@ err:
     return 0;
 }
 
+/*
+ * The default PCISTB handler will break PCISTB instructions into a series of
+ * 8B memory operations.
+ */
+static int pcistb_default(S390PCIBusDevice *pbdev, S390CPU *cpu,
+                           uint64_t gaddr, uint8_t ar, uint8_t pcias,
+                           uint16_t len, uint64_t offset)
+{
+    MemTxResult result;
+    MemoryRegion *mr;
+    int i;
+
+    mr = pbdev->pdev->io_regions[pcias].memory;
+    mr = s390_get_subregion(mr, offset, len);
+    offset -= mr->addr;
+
+    for (i = 0; i < len; i += 8) {
+        if (!memory_region_access_valid(mr, offset + i, 8, true,
+                                        MEMTXATTRS_UNSPECIFIED)) {
+            return -EINVAL;
+        }
+    }
+
+    if (s390_cpu_virt_mem_read(cpu, gaddr, ar, pbdev->pcistb_buf, len)) {
+        return -EACCES;
+    }
+
+    for (i = 0; i < len; i += 8) {
+        result = memory_region_dispatch_write(mr, offset + i,
+                                              ldq_p(pbdev->pcistb_buf + i),
+                                              MO_64, MEMTXATTRS_UNSPECIFIED);
+        if (result != MEMTX_OK) {
+            return -EINVAL;
+        }
+    }
+
+    return 0;
+}
+
 int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr,
                         uint8_t ar, uintptr_t ra)
 {
     CPUS390XState *env = &cpu->env;
     S390PCIBusDevice *pbdev;
-    MemoryRegion *mr;
-    MemTxResult result;
     uint64_t offset;
-    int i;
     uint32_t fh;
     uint8_t pcias;
     uint16_t len;
-    uint8_t buffer[128];
+    int ret;
 
     if (env->psw.mask & PSW_MASK_PSTATE) {
         s390_program_interrupt(env, PGM_PRIVILEGED, ra);
@@ -817,31 +893,21 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr,
         goto specification_error;
     }
 
-    mr = pbdev->pdev->io_regions[pcias].memory;
-    mr = s390_get_subregion(mr, offset, len);
-    offset -= mr->addr;
-
-    for (i = 0; i < len; i += 8) {
-        if (!memory_region_access_valid(mr, offset + i, 8, true,
-                                        MEMTXATTRS_UNSPECIFIED)) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
-    }
+    ret = pbdev->ops.pcistb(pbdev, cpu, gaddr, ar, pcias, len, offset);
 
-    if (s390_cpu_virt_mem_read(cpu, gaddr, ar, buffer, len)) {
+    switch (ret) {
+    case -EINVAL:
+        s390_program_interrupt(env, PGM_OPERAND, ra);
+        return 0;
+    case -EACCES:
         s390_cpu_virt_mem_handle_exc(cpu, ra);
         return 0;
-    }
-
-    for (i = 0; i < len / 8; i++) {
-        result = memory_region_dispatch_write(mr, offset + i * 8,
-                                              ldq_p(buffer + i * 8),
-                                              MO_64, MEMTXATTRS_UNSPECIFIED);
-        if (result != MEMTX_OK) {
-            s390_program_interrupt(env, PGM_OPERAND, ra);
-            return 0;
-        }
+    case 0:
+        break;
+    default:
+        DPRINTF("pcistb unexpected return %d from op\n", ret);
+        s390_program_interrupt(env, PGM_OPERAND, ra);
+        return 0;
     }
 
     pbdev->fmb.counter[ZPCI_FMB_CNT_STB]++;
@@ -1304,3 +1370,17 @@ out:
     setcc(cpu, cc);
     return 0;
 }
+
+void zpci_assign_default_ops(S390PCIBusDevice *pbdev)
+{
+    /*
+     * As PCISTB operations are not allowed to cross a page boundary, use
+     * qemu_memalign to get a single page for all subseqent PCISTB
+     * operations.
+     */
+    pbdev->pcistb_buf = qemu_memalign(PAGE_SIZE, PAGE_SIZE);
+
+    pbdev->ops.pcistg = pcistg_default;
+    pbdev->ops.pcilg = pcilg_default;
+    pbdev->ops.pcistb = pcistb_default;
+}
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index 0662e7b..7c34c5b 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -325,6 +325,26 @@ typedef struct S390PCIGroup {
 S390PCIGroup *s390_group_create(int id);
 S390PCIGroup *s390_group_find(int id);
 
+typedef struct ZpciOps {
+    int (*pcistg)(S390PCIBusDevice *pbdev,
+                  uint64_t data,
+                  uint8_t pcias,
+                  uint16_t len,
+                  uint64_t offset);
+    int (*pcilg)(S390PCIBusDevice *pbdev,
+                 uint64_t *data,
+                 uint8_t pcias,
+                 uint16_t len,
+                 uint64_t offset);
+    int (*pcistb)(S390PCIBusDevice *pbdev,
+                  S390CPU *cpu,
+                  uint64_t gaddr,
+                  uint8_t ar,
+                  uint8_t pcias,
+                  uint16_t len,
+                  uint64_t offset);
+} ZpciOps;
+
 struct S390PCIBusDevice {
     DeviceState qdev;
     PCIDevice *pdev;
@@ -351,6 +371,8 @@ struct S390PCIBusDevice {
     MemoryRegion msix_notify_mr;
     IndAddr *summary_ind;
     IndAddr *indicator;
+    ZpciOps ops;
+    uint8_t *pcistb_buf;
     bool pci_unplug_request_processed;
     bool unplug_requested;
     QTAILQ_ENTRY(S390PCIBusDevice) link;
diff --git a/include/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h
index a55c448..c9fe3f1 100644
--- a/include/hw/s390x/s390-pci-inst.h
+++ b/include/hw/s390x/s390-pci-inst.h
@@ -111,6 +111,7 @@ int mpcifc_service_call(S390CPU *cpu, uint8_t r1, uint64_t fiba, uint8_t ar,
 int stpcifc_service_call(S390CPU *cpu, uint8_t r1, uint64_t fiba, uint8_t ar,
                          uintptr_t ra);
 void fmb_timer_free(S390PCIBusDevice *pbdev);
+void zpci_assign_default_ops(S390PCIBusDevice *pbdev);
 
 #define ZPCI_IO_BAR_MIN 0
 #define ZPCI_IO_BAR_MAX 5
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * [PATCH 5/8] s390x/pci: Handle devices that support relaxed alignment
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
                   ` (3 preceding siblings ...)
  2021-01-19 20:44 ` [PATCH 4/8] s390x/pci: Introduce the ZpciOps structure Matthew Rosato
@ 2021-01-19 20:44 ` Matthew Rosato
  2021-01-19 20:44 ` [PATCH 6/8] s390x/pci: PCISTB via the vfio zPCI I/O region Matthew Rosato
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-19 20:44 UTC (permalink / raw)
  To: cohuck, thuth
  Cc: pmorel, david, schnelle, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, mst, pbonzini
Certain zPCI device types (e.g. ISM) allow for a different set of address
alignment rules for PCISTB instructions.  Recognize this distinction and
perform only a subset of alignment checks for intercepted PCISTB
instructions.  Furthermore for the default path, handle the potential for
writes that are not aligned and sized to 8B chunks.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 hw/s390x/s390-pci-inst.c        | 152 +++++++++++++++++++++++++++++++++-------
 hw/s390x/s390-pci-vfio.c        |   3 +
 include/hw/s390x/s390-pci-clp.h |   1 +
 3 files changed, 132 insertions(+), 24 deletions(-)
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 2d440a3..67eb4a4 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -784,6 +784,92 @@ err:
     return 0;
 }
 
+static int pcistb_write_validate(MemoryRegion *mr, uint64_t offset,
+                                 uint16_t len)
+{
+    uint8_t size;
+
+    switch (offset & 0x07UL) {
+    case 0:
+        size = 8;
+        if (size <= len) {
+            break;
+        }
+        /* fall through */
+    case 4:
+        size = 4;
+        if (size <= len) {
+            break;
+        }
+        /* fall through */
+    case 6:
+    case 2:
+        size = 2;
+        if (size <= len) {
+            break;
+        }
+        /* fall through */
+    default:
+        size = 1;
+    }
+
+    if (!memory_region_access_valid(mr, offset, size, true,
+                                    MEMTXATTRS_UNSPECIFIED)) {
+        return -EINVAL;
+    }
+
+    return size;
+}
+
+static int pcistb_write(MemoryRegion *mr, uint8_t *buffer, uint64_t offset,
+                        uint16_t len)
+{
+    MemTxResult result;
+    uint64_t size;
+    uint64_t data;
+    MemOp opsize;
+
+    switch (offset & 0x07UL) {
+    case 0:
+        size = 8;
+        if (size <= len) {
+            opsize = MO_64;
+            data = ldq_p(buffer);
+            break;
+        }
+        /* fall through */
+    case 4:
+        size = 4;
+        if (size <= len) {
+            opsize = MO_32;
+            data = ldl_p(buffer);
+            break;
+        }
+        /* fall through */
+    case 6:
+    case 2:
+        size = 2;
+        if (size <= len) {
+            opsize = MO_16;
+            data = lduw_p(buffer);
+            break;
+        }
+        /* fall through */
+    default:
+        size = 1;
+        opsize = MO_8;
+        data = ldub_p(buffer);
+    }
+
+    result = memory_region_dispatch_write(mr, offset, data, opsize,
+                                          MEMTXATTRS_UNSPECIFIED);
+    if (result != MEMTX_OK) {
+        return -EINVAL;
+    }
+
+    return size;
+}
+
 /*
  * The default PCISTB handler will break PCISTB instructions into a series of
  * 8B memory operations.
@@ -792,32 +878,44 @@ static int pcistb_default(S390PCIBusDevice *pbdev, S390CPU *cpu,
                            uint64_t gaddr, uint8_t ar, uint8_t pcias,
                            uint16_t len, uint64_t offset)
 {
-    MemTxResult result;
     MemoryRegion *mr;
-    int i;
+    uint64_t curroff;
+    uint16_t currlen;
+    uint8_t *currbuff;
+    int size;
 
     mr = pbdev->pdev->io_regions[pcias].memory;
     mr = s390_get_subregion(mr, offset, len);
     offset -= mr->addr;
 
-    for (i = 0; i < len; i += 8) {
-        if (!memory_region_access_valid(mr, offset + i, 8, true,
-                                        MEMTXATTRS_UNSPECIFIED)) {
+    /* Loop over the proposed area and validate that writes will work. */
+    curroff = offset;
+    currlen = len;
+    while (currlen > 0) {
+        size = pcistb_write_validate(mr, curroff, currlen);
+        if (size <= 0) {
             return -EINVAL;
         }
+        curroff += size;
+        currlen -= size;
     }
 
     if (s390_cpu_virt_mem_read(cpu, gaddr, ar, pbdev->pcistb_buf, len)) {
         return -EACCES;
     }
 
-    for (i = 0; i < len; i += 8) {
-        result = memory_region_dispatch_write(mr, offset + i,
-                                              ldq_p(pbdev->pcistb_buf + i),
-                                              MO_64, MEMTXATTRS_UNSPECIFIED);
-        if (result != MEMTX_OK) {
+    /* Perform the chain of previously-validated writes */
+    currbuff = pbdev->pcistb_buf;
+    curroff = offset;
+    currlen = len;
+    while (currlen > 0) {
+        size = pcistb_write(mr, currbuff, curroff, currlen);
+        if (size < 0) {
             return -EINVAL;
         }
+        currbuff += size;
+        curroff += size;
+        currlen -= size;
     }
 
     return 0;
@@ -873,25 +971,31 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr,
         return 0;
     }
 
-    /* Verify the address, offset and length */
-    /* offset must be a multiple of 8 */
-    if (offset % 8) {
-        goto specification_error;
-    }
-    /* Length must be greater than 8, a multiple of 8 */
-    /* and not greater than maxstbl */
-    if ((len <= 8) || (len % 8) ||
-        (len > pbdev->pci_group->zpci_group.maxstbl)) {
-        goto specification_error;
+    /*
+     * If the specified device supports relaxed alignment, some checks can
+     * be skipped.
+     */
+    if (!(pbdev->pci_group->zpci_group.fr & CLP_RSP_QPCIG_MASK_RELAXED)) {
+        /* Verify the address, offset and length */
+        /* offset must be a multiple of 8 */
+        if (offset % 8) {
+            goto specification_error;
+        }
+        /* Length must be greater than 8, a multiple of 8 */
+        /* and not greater than maxstbl */
+        if ((len <= 8) || (len % 8) ||
+            (len > pbdev->pci_group->zpci_group.maxstbl)) {
+            goto specification_error;
+        }
+        /* Guest address must be double word aligned */
+        if (gaddr & 0x07UL) {
+            goto specification_error;
+        }
     }
     /* Do not cross a 4K-byte boundary */
     if (((offset & 0xfff) + len) > 0x1000) {
         goto specification_error;
     }
-    /* Guest address must be double word aligned */
-    if (gaddr & 0x07UL) {
-        goto specification_error;
-    }
 
     ret = pbdev->ops.pcistb(pbdev, cpu, gaddr, ar, pcias, len, offset);
 
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index cb59e98..4cee640 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -158,6 +158,9 @@ static void s390_pci_read_group(S390PCIBusDevice *pbdev,
         if (cap->flags & VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH) {
             resgrp->fr = 1;
         }
+        if (cap->flags & VFIO_DEVICE_INFO_ZPCI_FLAG_RELAXED) {
+            resgrp->fr |= CLP_RSP_QPCIG_MASK_RELAXED;
+        }
         resgrp->dasm = cap->dasm;
         resgrp->msia = cap->msi_addr;
         resgrp->mui = cap->mui;
diff --git a/include/hw/s390x/s390-pci-clp.h b/include/hw/s390x/s390-pci-clp.h
index 96b8e3f..73a28a0 100644
--- a/include/hw/s390x/s390-pci-clp.h
+++ b/include/hw/s390x/s390-pci-clp.h
@@ -158,6 +158,7 @@ typedef struct ClpRspQueryPciGrp {
 #define CLP_RSP_QPCIG_MASK_NOI 0xfff
     uint16_t i;
     uint8_t version;
+#define CLP_RSP_QPCIG_MASK_RELAXED 0x8
 #define CLP_RSP_QPCIG_MASK_FRAME   0x2
 #define CLP_RSP_QPCIG_MASK_REFRESH 0x1
     uint8_t fr;
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * [PATCH 6/8] s390x/pci: PCISTB via the vfio zPCI I/O region
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
                   ` (4 preceding siblings ...)
  2021-01-19 20:44 ` [PATCH 5/8] s390x/pci: Handle devices that support relaxed alignment Matthew Rosato
@ 2021-01-19 20:44 ` Matthew Rosato
  2021-01-19 20:44 ` [PATCH 7/8] s390x/pci: PCILG " Matthew Rosato
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-19 20:44 UTC (permalink / raw)
  To: cohuck, thuth
  Cc: pmorel, david, schnelle, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, mst, pbonzini
For ISM devices, use the vfio region to handle intercepted PCISTB
instructions.  This region will allow large block I/O instructions
intercepted from the guest to be performed as a single I/O instruction on
the host.  This ensure that proper write patterns that are expected by the
underlying device are respected and ensures that a non-MIO instruction is
used to perform the operation (as ISM devices do not support the MIO
instruction set).
Furthermore, add a requirement that the I/O region must be available in
order to pass the device through to the guest.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 hw/s390x/s390-pci-bus.c          | 15 +++++++
 hw/s390x/s390-pci-inst.c         |  8 ++++
 hw/s390x/s390-pci-vfio.c         | 95 ++++++++++++++++++++++++++++++++++++++++
 include/hw/s390x/s390-pci-bus.h  |  5 +++
 include/hw/s390x/s390-pci-inst.h |  1 +
 include/hw/s390x/s390-pci-vfio.h | 15 +++++++
 6 files changed, 139 insertions(+)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 784ca65..9d5c2c5 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -977,6 +977,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
     S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev);
     PCIDevice *pdev = NULL;
     S390PCIBusDevice *pbdev = NULL;
+    int ret;
 
     if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
         PCIBridge *pb = PCI_BRIDGE(dev);
@@ -1027,6 +1028,20 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
             pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev);
             /* Fill in CLP information passed via the vfio region */
             s390_pci_get_clp_info(pbdev);
+
+            /*
+             * For an ISM device, attempt to setup the special I/O region.  If
+             * this fails, the device cannot be passed through.
+             */
+            ret = 0;
+            if (pbdev->pft == ZPCI_PFT_ISM) {
+                ret = s390_pci_get_zpci_io_region(pbdev);
+            }
+            if (ret) {
+                error_setg(errp, "vfio zPCI I/O region support is mandatory "
+                           "for %02x:%02x.%01x", pci_dev_bus_num(pdev),
+                           PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+            }
         } else {
             pbdev->fh |= FH_SHM_EMUL;
 
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 67eb4a4..18a701f 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -19,6 +19,7 @@
 #include "sysemu/hw_accel.h"
 #include "hw/s390x/s390-pci-inst.h"
 #include "hw/s390x/s390-pci-bus.h"
+#include "hw/s390x/s390-pci-vfio.h"
 #include "hw/s390x/tod.h"
 
 #ifndef DEBUG_S390PCI_INST
@@ -1000,6 +1001,8 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr,
     ret = pbdev->ops.pcistb(pbdev, cpu, gaddr, ar, pcias, len, offset);
 
     switch (ret) {
+    case -EIO:
+        /* fall through */
     case -EINVAL:
         s390_program_interrupt(env, PGM_OPERAND, ra);
         return 0;
@@ -1488,3 +1491,8 @@ void zpci_assign_default_ops(S390PCIBusDevice *pbdev)
     pbdev->ops.pcilg = pcilg_default;
     pbdev->ops.pcistb = pcistb_default;
 }
+
+void zpci_assign_ops_vfio_io_region(S390PCIBusDevice *pbdev)
+{
+    pbdev->ops.pcistb = s390_pci_vfio_pcistb;
+}
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index 4cee640..33b24d4 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -18,6 +18,7 @@
 #include "trace.h"
 #include "hw/s390x/s390-pci-bus.h"
 #include "hw/s390x/s390-pci-clp.h"
+#include "hw/s390x/s390-pci-inst.h"
 #include "hw/s390x/s390-pci-vfio.h"
 #include "hw/vfio/pci.h"
 #include "hw/vfio/vfio-common.h"
@@ -279,3 +280,97 @@ retry:
 
     return;
 }
+
+/*
+ * This function will look for the VFIO_REGION_SUBTYPE_IBM_ZPCI_IO vfio
+ * device region, which is used for performing block I/O operations.
+ */
+int s390_pci_get_zpci_io_region(S390PCIBusDevice *pbdev)
+{
+    VFIOPCIDevice *vfio_pci;
+    VFIODevice *vdev;
+    struct vfio_region_info *info;
+    int ret;
+
+    vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
+    vdev = &vfio_pci->vbasedev;
+
+    if (vdev->num_regions < VFIO_PCI_NUM_REGIONS + 1) {
+        return -ENOENT;
+    }
+
+    /* Get the I/O region if it's available */
+    if (vfio_get_dev_region_info(vdev,
+                                 PCI_VENDOR_ID_IBM |
+                                 VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+                                 VFIO_REGION_SUBTYPE_IBM_ZPCI_IO, &info)) {
+        return -ENOENT;
+    }
+
+    /* If the size is unexpectedly small, don't use the region */
+    if (sizeof(*pbdev->io_region) > info->size) {
+        return -EINVAL;
+    }
+
+    pbdev->io_region = g_malloc0(info->size);
+
+    /* Check the header for setup information */
+    ret = pread(vfio_pci->vbasedev.fd, &pbdev->io_region->hdr,
+                sizeof(struct vfio_zpci_io_hdr), info->offset);
+    if (ret != sizeof(struct vfio_zpci_io_hdr)) {
+        g_free(pbdev->io_region);
+        pbdev->io_region = 0;
+        ret = -EINVAL;
+    } else {
+        pbdev->io_region_op_offset = info->offset +
+                                     offsetof(struct vfio_region_zpci_io, req);
+        /* All devices in this group will use the I/O region for PCISTB */
+        pbdev->pci_group->zpci_group.maxstbl = MIN(PAGE_SIZE,
+                                               pbdev->io_region->hdr.max);
+        ret = 0;
+    }
+    g_free(info);
+
+    /* Register the new handlers for the device if region available */
+    if (pbdev->io_region) {
+        zpci_assign_ops_vfio_io_region(pbdev);
+    }
+
+    return ret;
+}
+
+int s390_pci_vfio_pcistb(S390PCIBusDevice *pbdev, S390CPU *cpu, uint64_t gaddr,
+                         uint8_t ar, uint8_t pcias, uint16_t len,
+                         uint64_t offset)
+{
+    struct vfio_region_zpci_io *region = pbdev->io_region;
+    VFIOPCIDevice *vfio_pci;
+    int ret;
+
+    if (region == NULL) {
+        return -EIO;
+    }
+
+    vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
+
+    if (s390_cpu_virt_mem_read(cpu, gaddr, ar, pbdev->pcistb_buf, len)) {
+        return -EACCES;
+    }
+
+    region->req.gaddr = (uint64_t)pbdev->pcistb_buf;
+    region->req.offset = offset;
+    region->req.len = len;
+    region->req.pcias = pcias;
+    region->req.flags = VFIO_ZPCI_IO_FLAG_BLOCKW;
+
+    ret = pwrite(vfio_pci->vbasedev.fd, ®ion->req,
+                 sizeof(struct vfio_zpci_io_req),
+                 pbdev->io_region_op_offset);
+    if (ret != sizeof(struct vfio_zpci_io_req)) {
+        ret = -EIO;
+    } else {
+        ret = 0;
+    }
+
+    return ret;
+}
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index 7c34c5b..aaef890 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -43,6 +43,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBus, S390_PCI_BUS)
 OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBusDevice, S390_PCI_DEVICE)
 OBJECT_DECLARE_SIMPLE_TYPE(S390PCIIOMMU, S390_PCI_IOMMU)
 
+/* PCI Function Types */
+#define ZPCI_PFT_ISM 5
+
 #define HP_EVENT_TO_CONFIGURED        0x0301
 #define HP_EVENT_RESERVED_TO_STANDBY  0x0302
 #define HP_EVENT_DECONFIGURE_REQUEST  0x0303
@@ -355,6 +358,8 @@ struct S390PCIBusDevice {
     uint32_t fh;
     uint32_t fid;
     bool fid_defined;
+    uint64_t io_region_op_offset;
+    struct vfio_region_zpci_io *io_region;
     uint64_t fmb_addr;
     ZpciFmb fmb;
     QEMUTimer *fmb_timer;
diff --git a/include/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h
index c9fe3f1..7ed6175 100644
--- a/include/hw/s390x/s390-pci-inst.h
+++ b/include/hw/s390x/s390-pci-inst.h
@@ -112,6 +112,7 @@ int stpcifc_service_call(S390CPU *cpu, uint8_t r1, uint64_t fiba, uint8_t ar,
                          uintptr_t ra);
 void fmb_timer_free(S390PCIBusDevice *pbdev);
 void zpci_assign_default_ops(S390PCIBusDevice *pbdev);
+void zpci_assign_ops_vfio_io_region(S390PCIBusDevice *pbdev);
 
 #define ZPCI_IO_BAR_MIN 0
 #define ZPCI_IO_BAR_MAX 5
diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h
index ff708ae..f0a994f 100644
--- a/include/hw/s390x/s390-pci-vfio.h
+++ b/include/hw/s390x/s390-pci-vfio.h
@@ -21,6 +21,10 @@ S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
                                           S390PCIBusDevice *pbdev);
 void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt);
 void s390_pci_get_clp_info(S390PCIBusDevice *pbdev);
+int s390_pci_get_zpci_io_region(S390PCIBusDevice *pbdev);
+int s390_pci_vfio_pcistb(S390PCIBusDevice *pbdev, S390CPU *cpu, uint64_t gaddr,
+                         uint8_t ar, uint8_t pcias, uint16_t len,
+                         uint64_t offset);
 #else
 static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail)
 {
@@ -34,6 +38,17 @@ static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
 static inline void s390_pci_end_dma_count(S390pciState *s,
                                           S390PCIDMACount *cnt) { }
 static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { }
+static inline int s390_pci_get_zpci_io_region(S390PCIBusDevice *pbdev)
+{
+    return -EINVAL;
+}
+static inline int s390_pci_vfio_pcistb(S390PCIBusDevice *pbdev, S390CPU *cpu,
+                                       uint64_t gaddr, uint8_t ar,
+                                       uint8_t pcias, uint16_t len,
+                                       uint64_t offset)
+{
+    return -EIO;
+}
 #endif
 
 #endif
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * [PATCH 7/8] s390x/pci: PCILG via the vfio zPCI I/O region
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
                   ` (5 preceding siblings ...)
  2021-01-19 20:44 ` [PATCH 6/8] s390x/pci: PCISTB via the vfio zPCI I/O region Matthew Rosato
@ 2021-01-19 20:44 ` Matthew Rosato
  2021-01-19 20:44 ` [PATCH 8/8] s390x/pci: Prevent ISM device passthrough on older host kernels Matthew Rosato
  2021-01-20  9:12 ` [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Pierre Morel
  8 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-19 20:44 UTC (permalink / raw)
  To: cohuck, thuth
  Cc: pmorel, david, schnelle, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, mst, pbonzini
For ISM devices, use the vfio region to handle intercepted PCILG
instructions.  This will allow read I/Os intercepted from the guest to be
performed as single operations that ensure the same non-MIO PCI instruction
is used on the host as specified in the guest.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 hw/s390x/s390-pci-inst.c         |  3 ++-
 hw/s390x/s390-pci-vfio.c         | 53 ++++++++++++++++++++++++++++++++++++++++
 include/hw/s390x/s390-pci-inst.h |  1 +
 include/hw/s390x/s390-pci-vfio.h |  8 ++++++
 4 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 18a701f..97e9a7a 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -352,7 +352,7 @@ out:
  * @ptr: a pointer to a uint64_t data field
  * @len: the length of the valid data, must be 1,2,4 or 8
  */
-static int zpci_endian_swap(uint64_t *ptr, uint8_t len)
+int zpci_endian_swap(uint64_t *ptr, uint8_t len)
 {
     uint64_t data = *ptr;
 
@@ -1494,5 +1494,6 @@ void zpci_assign_default_ops(S390PCIBusDevice *pbdev)
 
 void zpci_assign_ops_vfio_io_region(S390PCIBusDevice *pbdev)
 {
+    pbdev->ops.pcilg = s390_pci_vfio_pcilg;
     pbdev->ops.pcistb = s390_pci_vfio_pcistb;
 }
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index 33b24d4..6778ba4 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -339,6 +339,59 @@ int s390_pci_get_zpci_io_region(S390PCIBusDevice *pbdev)
     return ret;
 }
 
+int s390_pci_vfio_pcilg(S390PCIBusDevice *pbdev, uint64_t *data, uint8_t pcias,
+                        uint16_t len, uint64_t offset)
+{
+    struct vfio_region_zpci_io *region = pbdev->io_region;
+    VFIOPCIDevice *vfio_pci;
+    int ret;
+
+    if (region == NULL) {
+        return -EIO;
+    }
+
+    vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
+
+    /* Perform Length/Alignment checks */
+    switch (pcias) {
+    case ZPCI_IO_BAR_MIN...ZPCI_IO_BAR_MAX:
+        if (!len || (len > (8 - (offset & 0x7)))) {
+            return -EINVAL;
+        }
+        region->req.gaddr = (uint64_t)data;
+        region->req.offset = offset;
+        region->req.len = len;
+        region->req.pcias = pcias;
+        region->req.flags = VFIO_ZPCI_IO_FLAG_READ;
+
+        ret = pwrite(vfio_pci->vbasedev.fd, ®ion->req,
+                     sizeof(struct vfio_zpci_io_req),
+                     pbdev->io_region_op_offset);
+        if (ret != sizeof(struct vfio_zpci_io_req)) {
+            ret = -EIO;
+        } else {
+            ret = 0;
+        }
+        break;
+    case ZPCI_CONFIG_BAR:
+        if (!len || (len > (4 - (offset & 0x3))) || len == 3) {
+            return -EINVAL;
+        }
+        *data = pci_host_config_read_common(
+                       pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
+
+        if (zpci_endian_swap(data, len)) {
+            ret = -EINVAL;
+        }
+        ret = 0;
+        break;
+    default:
+        return -EFAULT;
+    }
+
+    return ret;
+}
+
 int s390_pci_vfio_pcistb(S390PCIBusDevice *pbdev, S390CPU *cpu, uint64_t gaddr,
                          uint8_t ar, uint8_t pcias, uint16_t len,
                          uint64_t offset)
diff --git a/include/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h
index 7ed6175..fe368fb 100644
--- a/include/hw/s390x/s390-pci-inst.h
+++ b/include/hw/s390x/s390-pci-inst.h
@@ -101,6 +101,7 @@ typedef struct ZpciFib {
 int pci_dereg_irqs(S390PCIBusDevice *pbdev);
 void pci_dereg_ioat(S390PCIIOMMU *iommu);
 int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra);
+int zpci_endian_swap(uint64_t *ptr, uint8_t len);
 int pcilg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra);
 int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra);
 int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra);
diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h
index f0a994f..d9fb3a4 100644
--- a/include/hw/s390x/s390-pci-vfio.h
+++ b/include/hw/s390x/s390-pci-vfio.h
@@ -22,6 +22,8 @@ S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
 void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt);
 void s390_pci_get_clp_info(S390PCIBusDevice *pbdev);
 int s390_pci_get_zpci_io_region(S390PCIBusDevice *pbdev);
+int s390_pci_vfio_pcilg(S390PCIBusDevice *pbdev, uint64_t *data, uint8_t pcias,
+                        uint16_t len, uint64_t offset);
 int s390_pci_vfio_pcistb(S390PCIBusDevice *pbdev, S390CPU *cpu, uint64_t gaddr,
                          uint8_t ar, uint8_t pcias, uint16_t len,
                          uint64_t offset);
@@ -42,6 +44,12 @@ static inline int s390_pci_get_zpci_io_region(S390PCIBusDevice *pbdev)
 {
     return -EINVAL;
 }
+static inline int s390_pci_vfio_pcilg(S390PCIBusDevice *pbdev, uint64_t *data,
+                                      uint8_t pcias, uint16_t len,
+                                      uint64_t offset)
+{
+    return -EIO;
+}
 static inline int s390_pci_vfio_pcistb(S390PCIBusDevice *pbdev, S390CPU *cpu,
                                        uint64_t gaddr, uint8_t ar,
                                        uint8_t pcias, uint16_t len,
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * [PATCH 8/8] s390x/pci: Prevent ISM device passthrough on older host kernels
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
                   ` (6 preceding siblings ...)
  2021-01-19 20:44 ` [PATCH 7/8] s390x/pci: PCILG " Matthew Rosato
@ 2021-01-19 20:44 ` Matthew Rosato
  2021-01-20  9:12 ` [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Pierre Morel
  8 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-19 20:44 UTC (permalink / raw)
  To: cohuck, thuth
  Cc: pmorel, david, schnelle, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, mst, pbonzini
If the underlying host does not provide CLP capabilities, we cannot tell
what type of function is being passed through, which means for ISM devices
we can't properly tell if the vfio I/O region is necessary.  If the ISM
device is allowed to pass through erroneously, it will not function
properly without the I/O region.  Therefore, check for unclassified devices
that do not have MSI-X support and still disallow these from being passed
through, using the same error as known ISM devices when the vfio I/O region
is unavailable.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 hw/s390x/s390-pci-bus.c         | 13 +++++++++++++
 include/hw/s390x/s390-pci-bus.h |  1 +
 2 files changed, 14 insertions(+)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 9d5c2c5..0dfdc88 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -1037,6 +1037,19 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
             if (pbdev->pft == ZPCI_PFT_ISM) {
                 ret = s390_pci_get_zpci_io_region(pbdev);
             }
+            /*
+             * If the device type is unclassified, it may be due to the fact
+             * that CLP info was not provided by vfio -- Which means we cannot
+             * tell if this is actually an ISM device, which will not be able
+             * to function properly without proper identification and the I/O
+             * region.  Therefore, attempt to identify the ISM device via the
+             * lack of MSI-X and only in this case prevent the device from
+             * being passed through.
+             */
+            else if (pbdev->pft == ZPCI_PFT_UNCLASSIFIED &&
+                     !pci_find_capability(pbdev->pdev, PCI_CAP_ID_MSIX)) {
+                ret = -EINVAL;
+            }
             if (ret) {
                 error_setg(errp, "vfio zPCI I/O region support is mandatory "
                            "for %02x:%02x.%01x", pci_dev_bus_num(pdev),
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index aaef890..d75aad8 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -44,6 +44,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBusDevice, S390_PCI_DEVICE)
 OBJECT_DECLARE_SIMPLE_TYPE(S390PCIIOMMU, S390_PCI_IOMMU)
 
 /* PCI Function Types */
+#define ZPCI_PFT_UNCLASSIFIED 0
 #define ZPCI_PFT_ISM 5
 
 #define HP_EVENT_TO_CONFIGURED        0x0301
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-19 20:44 [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Matthew Rosato
                   ` (7 preceding siblings ...)
  2021-01-19 20:44 ` [PATCH 8/8] s390x/pci: Prevent ISM device passthrough on older host kernels Matthew Rosato
@ 2021-01-20  9:12 ` Pierre Morel
  2021-01-20 14:03   ` Matthew Rosato
  8 siblings, 1 reply; 27+ messages in thread
From: Pierre Morel @ 2021-01-20  9:12 UTC (permalink / raw)
  To: Matthew Rosato, cohuck, thuth
  Cc: schnelle, david, mst, richard.henderson, qemu-s390x, qemu-devel,
	pasic, borntraeger, alex.williamson, pbonzini
On 1/19/21 9:44 PM, Matthew Rosato wrote:
> Today, ISM devices are completely disallowed for vfio-pci passthrough as
> QEMU rejects the device due to an (inappropriate) MSI-X check.  Removing
> this fence, however, reveals additional deficiencies in the s390x PCI
> interception layer that prevent ISM devices from working correctly.
> Namely, ISM block write operations have particular requirements in regards
> to the alignment, size and order of writes performed that cannot be
> guaranteed when breaking up write operations through the typical
> vfio_pci_bar_rw paths. Furthermore, ISM requires that legacy/non-MIO
> s390 PCI instructions are used, which is also not guaranteed when the I/O
> is passed through the typical userspace channels.
> 
> This patchset provides a set of fixes related to enabling ISM device
> passthrough and includes patches to enable use of a new vfio region that
> will allow s390x PCI pass-through devices to perform s390 PCI instructions
> in such a way that the same instruction issued on the guest is re-issued
> on the host.
> 
> Associated kernel patchset:
> https://lkml.org/lkml/2021/1/19/874
> 
> Changes from RFC -> v1:
> - Refresh the header sync (built using Eric's 'update-linux-headers:
> Include const.h' + manually removed pvrdma_ring.h again)
> - Remove s390x/pci: fix pcistb length (already merged)
> - Remove s390x/pci: Fix memory_region_access_valid call (already merged)
> - Fix bug: s390_pci_vfio_pcistb should use the pre-allocated PCISTB
> buffer pcistb_buf rather than allocating/freeing its own.
> - New patch: track the PFT (PCI Function Type) separately from guest CLP
> response data -- we tell the guest '0' for now due to limitations in
> measurement block support, but we can still use the real value provided via
> the vfio CLP capabilities to make decisions.
> - Use the PFT (pci function type) to determine when to use the region
> for PCISTB/PCILG (only for ISM), rather than using the relaxed alignment
> bit.
> - As a result, the pcistb_default is now updated to also handle the
> possibility of relaxed alignment via 2 new functions, pcistb_validate_write
> and pcistb_write, which serve as wrappers to the memory_region calls.
> - New patch, which partially restores the MSI-X fence for passthrough
> devices...  Could potentially be squashed with 's390x/pci: MSI-X isn't
> strictly required for passthrough' but left separately for now as I felt it
> needed a clear commit description of why we should still fence this case.
> 
Hi,
The choice of using the new VFIO region is made on the ISM PCI function 
type (PFT), which makes the patch ISM specific, why don't we use here 
the MIO bit common to any zPCI function and present in kernel to make 
the choice?
Regards,
Pierre
-- 
Pierre Morel
IBM Lab Boeblingen
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-20  9:12 ` [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support Pierre Morel
@ 2021-01-20 14:03   ` Matthew Rosato
  2021-01-20 14:45     ` Pierre Morel
  0 siblings, 1 reply; 27+ messages in thread
From: Matthew Rosato @ 2021-01-20 14:03 UTC (permalink / raw)
  To: Pierre Morel, cohuck, thuth
  Cc: schnelle, david, mst, richard.henderson, qemu-s390x, qemu-devel,
	pasic, borntraeger, alex.williamson, pbonzini
On 1/20/21 4:12 AM, Pierre Morel wrote:
> 
> 
> On 1/19/21 9:44 PM, Matthew Rosato wrote:
>> Today, ISM devices are completely disallowed for vfio-pci passthrough as
>> QEMU rejects the device due to an (inappropriate) MSI-X check.  Removing
>> this fence, however, reveals additional deficiencies in the s390x PCI
>> interception layer that prevent ISM devices from working correctly.
>> Namely, ISM block write operations have particular requirements in 
>> regards
>> to the alignment, size and order of writes performed that cannot be
>> guaranteed when breaking up write operations through the typical
>> vfio_pci_bar_rw paths. Furthermore, ISM requires that legacy/non-MIO
>> s390 PCI instructions are used, which is also not guaranteed when the I/O
>> is passed through the typical userspace channels.
>>
>> This patchset provides a set of fixes related to enabling ISM device
>> passthrough and includes patches to enable use of a new vfio region that
>> will allow s390x PCI pass-through devices to perform s390 PCI 
>> instructions
>> in such a way that the same instruction issued on the guest is re-issued
>> on the host.
>>
>> Associated kernel patchset:
>> https://lkml.org/lkml/2021/1/19/874
>>
>> Changes from RFC -> v1:
>> - Refresh the header sync (built using Eric's 'update-linux-headers:
>> Include const.h' + manually removed pvrdma_ring.h again)
>> - Remove s390x/pci: fix pcistb length (already merged)
>> - Remove s390x/pci: Fix memory_region_access_valid call (already merged)
>> - Fix bug: s390_pci_vfio_pcistb should use the pre-allocated PCISTB
>> buffer pcistb_buf rather than allocating/freeing its own.
>> - New patch: track the PFT (PCI Function Type) separately from guest CLP
>> response data -- we tell the guest '0' for now due to limitations in
>> measurement block support, but we can still use the real value 
>> provided via
>> the vfio CLP capabilities to make decisions.
>> - Use the PFT (pci function type) to determine when to use the region
>> for PCISTB/PCILG (only for ISM), rather than using the relaxed alignment
>> bit.
>> - As a result, the pcistb_default is now updated to also handle the
>> possibility of relaxed alignment via 2 new functions, 
>> pcistb_validate_write
>> and pcistb_write, which serve as wrappers to the memory_region calls.
>> - New patch, which partially restores the MSI-X fence for passthrough
>> devices...  Could potentially be squashed with 's390x/pci: MSI-X isn't
>> strictly required for passthrough' but left separately for now as I 
>> felt it
>> needed a clear commit description of why we should still fence this case.
>>
> Hi,
> 
> The choice of using the new VFIO region is made on the ISM PCI function 
> type (PFT), which makes the patch ISM specific, why don't we use here 
> the MIO bit common to any zPCI function and present in kernel to make 
> the choice?
> 
As discussed during the RFC (and see my reply also to the kernel set), 
the use of this region only works for devices that do not rely on MSI-X 
interrupts.  If we did as you suggest, other device types like mlx would 
not receive MSI-X interrupts in the guest (And I did indeed try 
variations where I used the special VFIO region for all 
PCISTG/PCILG/PCISTB for various device types)
So the idea for now was to solve the specific problem at hand (getting 
ISM devices working).
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-20 14:03   ` Matthew Rosato
@ 2021-01-20 14:45     ` Pierre Morel
  2021-01-20 15:59       ` Matthew Rosato
  0 siblings, 1 reply; 27+ messages in thread
From: Pierre Morel @ 2021-01-20 14:45 UTC (permalink / raw)
  To: Matthew Rosato, cohuck, thuth
  Cc: schnelle, david, mst, richard.henderson, qemu-s390x, qemu-devel,
	pasic, borntraeger, alex.williamson, pbonzini
On 1/20/21 3:03 PM, Matthew Rosato wrote:
> On 1/20/21 4:12 AM, Pierre Morel wrote:
>>
>>
>> On 1/19/21 9:44 PM, Matthew Rosato wrote:
>>> Today, ISM devices are completely disallowed for vfio-pci passthrough as
>>> QEMU rejects the device due to an (inappropriate) MSI-X check.  Removing
>>> this fence, however, reveals additional deficiencies in the s390x PCI
>>> interception layer that prevent ISM devices from working correctly.
>>> Namely, ISM block write operations have particular requirements in 
>>> regards
>>> to the alignment, size and order of writes performed that cannot be
>>> guaranteed when breaking up write operations through the typical
>>> vfio_pci_bar_rw paths. Furthermore, ISM requires that legacy/non-MIO
>>> s390 PCI instructions are used, which is also not guaranteed when the 
>>> I/O
>>> is passed through the typical userspace channels.
>>>
>>> This patchset provides a set of fixes related to enabling ISM device
>>> passthrough and includes patches to enable use of a new vfio region that
>>> will allow s390x PCI pass-through devices to perform s390 PCI 
>>> instructions
>>> in such a way that the same instruction issued on the guest is re-issued
>>> on the host.
>>>
>>> Associated kernel patchset:
>>> https://lkml.org/lkml/2021/1/19/874
>>>
>>> Changes from RFC -> v1:
>>> - Refresh the header sync (built using Eric's 'update-linux-headers:
>>> Include const.h' + manually removed pvrdma_ring.h again)
>>> - Remove s390x/pci: fix pcistb length (already merged)
>>> - Remove s390x/pci: Fix memory_region_access_valid call (already merged)
>>> - Fix bug: s390_pci_vfio_pcistb should use the pre-allocated PCISTB
>>> buffer pcistb_buf rather than allocating/freeing its own.
>>> - New patch: track the PFT (PCI Function Type) separately from guest CLP
>>> response data -- we tell the guest '0' for now due to limitations in
>>> measurement block support, but we can still use the real value 
>>> provided via
>>> the vfio CLP capabilities to make decisions.
>>> - Use the PFT (pci function type) to determine when to use the region
>>> for PCISTB/PCILG (only for ISM), rather than using the relaxed alignment
>>> bit.
>>> - As a result, the pcistb_default is now updated to also handle the
>>> possibility of relaxed alignment via 2 new functions, 
>>> pcistb_validate_write
>>> and pcistb_write, which serve as wrappers to the memory_region calls.
>>> - New patch, which partially restores the MSI-X fence for passthrough
>>> devices...  Could potentially be squashed with 's390x/pci: MSI-X isn't
>>> strictly required for passthrough' but left separately for now as I 
>>> felt it
>>> needed a clear commit description of why we should still fence this 
>>> case.
>>>
>> Hi,
>>
>> The choice of using the new VFIO region is made on the ISM PCI 
>> function type (PFT), which makes the patch ISM specific, why don't we 
>> use here the MIO bit common to any zPCI function and present in kernel 
>> to make the choice?
>>
> 
> As discussed during the RFC (and see my reply also to the kernel set), 
> the use of this region only works for devices that do not rely on MSI-X 
> interrupts.  If we did as you suggest, other device types like mlx would 
> not receive MSI-X interrupts in the guest (And I did indeed try 
> variations where I used the special VFIO region for all 
> PCISTG/PCILG/PCISTB for various device types)
> 
> So the idea for now was to solve the specific problem at hand (getting 
> ISM devices working).
> 
> 
Sorry, if I missed or forgot some discussions, but I understood that we 
are using this region to handle PCISTB instructions when the device do 
not support MIO.
Don't we?
I do not understand the relation between MSI-X and MIO.
Can you please explain?
Thanks,
Pierre
-- 
Pierre Morel
IBM Lab Boeblingen
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-20 14:45     ` Pierre Morel
@ 2021-01-20 15:59       ` Matthew Rosato
  2021-01-20 19:18         ` Pierre Morel
  0 siblings, 1 reply; 27+ messages in thread
From: Matthew Rosato @ 2021-01-20 15:59 UTC (permalink / raw)
  To: Pierre Morel, cohuck, thuth
  Cc: schnelle, david, mst, richard.henderson, qemu-s390x, qemu-devel,
	pasic, borntraeger, alex.williamson, pbonzini
On 1/20/21 9:45 AM, Pierre Morel wrote:
> 
> 
> On 1/20/21 3:03 PM, Matthew Rosato wrote:
>> On 1/20/21 4:12 AM, Pierre Morel wrote:
>>>
>>>
>>> On 1/19/21 9:44 PM, Matthew Rosato wrote:
>>>> Today, ISM devices are completely disallowed for vfio-pci 
>>>> passthrough as
>>>> QEMU rejects the device due to an (inappropriate) MSI-X check.  
>>>> Removing
>>>> this fence, however, reveals additional deficiencies in the s390x PCI
>>>> interception layer that prevent ISM devices from working correctly.
>>>> Namely, ISM block write operations have particular requirements in 
>>>> regards
>>>> to the alignment, size and order of writes performed that cannot be
>>>> guaranteed when breaking up write operations through the typical
>>>> vfio_pci_bar_rw paths. Furthermore, ISM requires that legacy/non-MIO
>>>> s390 PCI instructions are used, which is also not guaranteed when 
>>>> the I/O
>>>> is passed through the typical userspace channels.
>>>>
>>>> This patchset provides a set of fixes related to enabling ISM device
>>>> passthrough and includes patches to enable use of a new vfio region 
>>>> that
>>>> will allow s390x PCI pass-through devices to perform s390 PCI 
>>>> instructions
>>>> in such a way that the same instruction issued on the guest is 
>>>> re-issued
>>>> on the host.
>>>>
>>>> Associated kernel patchset:
>>>> https://lkml.org/lkml/2021/1/19/874
>>>>
>>>> Changes from RFC -> v1:
>>>> - Refresh the header sync (built using Eric's 'update-linux-headers:
>>>> Include const.h' + manually removed pvrdma_ring.h again)
>>>> - Remove s390x/pci: fix pcistb length (already merged)
>>>> - Remove s390x/pci: Fix memory_region_access_valid call (already 
>>>> merged)
>>>> - Fix bug: s390_pci_vfio_pcistb should use the pre-allocated PCISTB
>>>> buffer pcistb_buf rather than allocating/freeing its own.
>>>> - New patch: track the PFT (PCI Function Type) separately from guest 
>>>> CLP
>>>> response data -- we tell the guest '0' for now due to limitations in
>>>> measurement block support, but we can still use the real value 
>>>> provided via
>>>> the vfio CLP capabilities to make decisions.
>>>> - Use the PFT (pci function type) to determine when to use the region
>>>> for PCISTB/PCILG (only for ISM), rather than using the relaxed 
>>>> alignment
>>>> bit.
>>>> - As a result, the pcistb_default is now updated to also handle the
>>>> possibility of relaxed alignment via 2 new functions, 
>>>> pcistb_validate_write
>>>> and pcistb_write, which serve as wrappers to the memory_region calls.
>>>> - New patch, which partially restores the MSI-X fence for passthrough
>>>> devices...  Could potentially be squashed with 's390x/pci: MSI-X isn't
>>>> strictly required for passthrough' but left separately for now as I 
>>>> felt it
>>>> needed a clear commit description of why we should still fence this 
>>>> case.
>>>>
>>> Hi,
>>>
>>> The choice of using the new VFIO region is made on the ISM PCI 
>>> function type (PFT), which makes the patch ISM specific, why don't we 
>>> use here the MIO bit common to any zPCI function and present in 
>>> kernel to make the choice?
>>>
>>
>> As discussed during the RFC (and see my reply also to the kernel set), 
>> the use of this region only works for devices that do not rely on 
>> MSI-X interrupts.  If we did as you suggest, other device types like 
>> mlx would not receive MSI-X interrupts in the guest (And I did indeed 
>> try variations where I used the special VFIO region for all 
>> PCISTG/PCILG/PCISTB for various device types)
>>
>> So the idea for now was to solve the specific problem at hand (getting 
>> ISM devices working).
>>
>>
> 
> Sorry, if I missed or forgot some discussions, but I understood that we 
> are using this region to handle PCISTB instructions when the device do 
> not support MIO.
> Don't we?
Sure thing - It's probably good to refresh the issue/rationale anyway as 
we've had the holidays in between.
You are correct, a primary reason we need to resort to a separate VFIO 
region for PCISTB (and PCILG) instructions for ISM devices is that they 
do not support the MIO instruction set, yet the host kernel will 
translate everything coming through the PCI I/O layer to MIO 
instructions whenever that facility is available to the host (and not 
purposely disabled).  This issue is unique to vfio-pci/passthrough - in 
the host, the ISM driver directly invokes functions in s390 pci code to 
ensure that MIO instructions are not used.
But this is not the only reason.  There are additional reasons for using 
this VFIO region:
1) ISM devices also don't support PCISTG instructions to certain address 
spaces and PCISTB must be used regardless of operation length.  However 
the standard s390 PCI I/O path always uses PCISTG for anything <=8B. 
Trying to determine whether a given I/O is intended for an ISM device at 
that point in kernel code so as to use PCISTB instead of PCISTG is the 
same problem as attempting to decide whether to use MIO vs non-MIO 
instructions at that point.
2) It allows for much larger PCISTB operations (4K) than allowed via the 
memory regions (loop of 8B operations).
3) The above also has the added benefit of eliminating certain write 
pattern requirements that are unique to ISM that would be introduced if 
we split up the I/O into 8B chunks (if we can't write the whole PCISTB 
in one go, ISM requires data written in a certain order for some address 
spaces, or with certain bits on/off on the PCISTB instruction to signify 
the state of the larger operation)
> 
> I do not understand the relation between MSI-X and MIO.
> Can you please explain?
> 
There is not a relation between MSI-X and MIO really.  Rather, this is a 
case of the solution that is being offered here ONLY works for devices 
that use MSI -- and ISM is a device that only supports MSI.  If you try 
to use this new VFIO region to pass I/O for an MSI-X enabled device, the 
notifiers set up via vfio_msix_setup won't be triggered because we are 
writing to the new VFIO region, not the virtual bar regions that may 
have had notifiers setup as part of vfio_msix_setup.  This results in 
missing interrupts on MSI-X-enabled vfio-pci devices.
These notifiers aren't a factor when the device is using MSI.
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-20 15:59       ` Matthew Rosato
@ 2021-01-20 19:18         ` Pierre Morel
  2021-01-20 20:29           ` Matthew Rosato
  0 siblings, 1 reply; 27+ messages in thread
From: Pierre Morel @ 2021-01-20 19:18 UTC (permalink / raw)
  To: Matthew Rosato, cohuck, thuth
  Cc: schnelle, david, mst, richard.henderson, qemu-s390x, qemu-devel,
	pasic, borntraeger, alex.williamson, pbonzini
On 1/20/21 4:59 PM, Matthew Rosato wrote:
> On 1/20/21 9:45 AM, Pierre Morel wrote:
>>
>>
>> On 1/20/21 3:03 PM, Matthew Rosato wrote:
>>> On 1/20/21 4:12 AM, Pierre Morel wrote:
>>>>
>>>>
>>>> On 1/19/21 9:44 PM, Matthew Rosato wrote:
>>>>> Today, ISM devices are completely disallowed for vfio-pci 
>>>>> passthrough as
>>>>> QEMU rejects the device due to an (inappropriate) MSI-X check. 
>>>>> Removing
>>>>> this fence, however, reveals additional deficiencies in the s390x PCI
>>>>> interception layer that prevent ISM devices from working correctly.
>>>>> Namely, ISM block write operations have particular requirements in 
>>>>> regards
>>>>> to the alignment, size and order of writes performed that cannot be
>>>>> guaranteed when breaking up write operations through the typical
>>>>> vfio_pci_bar_rw paths. Furthermore, ISM requires that legacy/non-MIO
>>>>> s390 PCI instructions are used, which is also not guaranteed when 
>>>>> the I/O
>>>>> is passed through the typical userspace channels.
>>>>>
>>>>> This patchset provides a set of fixes related to enabling ISM device
>>>>> passthrough and includes patches to enable use of a new vfio region 
>>>>> that
>>>>> will allow s390x PCI pass-through devices to perform s390 PCI 
>>>>> instructions
>>>>> in such a way that the same instruction issued on the guest is 
>>>>> re-issued
>>>>> on the host.
>>>>>
>>>>> Associated kernel patchset:
>>>>> https://lkml.org/lkml/2021/1/19/874
>>>>>
>>>>> Changes from RFC -> v1:
>>>>> - Refresh the header sync (built using Eric's 'update-linux-headers:
>>>>> Include const.h' + manually removed pvrdma_ring.h again)
>>>>> - Remove s390x/pci: fix pcistb length (already merged)
>>>>> - Remove s390x/pci: Fix memory_region_access_valid call (already 
>>>>> merged)
>>>>> - Fix bug: s390_pci_vfio_pcistb should use the pre-allocated PCISTB
>>>>> buffer pcistb_buf rather than allocating/freeing its own.
>>>>> - New patch: track the PFT (PCI Function Type) separately from 
>>>>> guest CLP
>>>>> response data -- we tell the guest '0' for now due to limitations in
>>>>> measurement block support, but we can still use the real value 
>>>>> provided via
>>>>> the vfio CLP capabilities to make decisions.
>>>>> - Use the PFT (pci function type) to determine when to use the region
>>>>> for PCISTB/PCILG (only for ISM), rather than using the relaxed 
>>>>> alignment
>>>>> bit.
>>>>> - As a result, the pcistb_default is now updated to also handle the
>>>>> possibility of relaxed alignment via 2 new functions, 
>>>>> pcistb_validate_write
>>>>> and pcistb_write, which serve as wrappers to the memory_region calls.
>>>>> - New patch, which partially restores the MSI-X fence for passthrough
>>>>> devices...  Could potentially be squashed with 's390x/pci: MSI-X isn't
>>>>> strictly required for passthrough' but left separately for now as I 
>>>>> felt it
>>>>> needed a clear commit description of why we should still fence this 
>>>>> case.
>>>>>
>>>> Hi,
>>>>
>>>> The choice of using the new VFIO region is made on the ISM PCI 
>>>> function type (PFT), which makes the patch ISM specific, why don't 
>>>> we use here the MIO bit common to any zPCI function and present in 
>>>> kernel to make the choice?
>>>>
>>>
>>> As discussed during the RFC (and see my reply also to the kernel 
>>> set), the use of this region only works for devices that do not rely 
>>> on MSI-X interrupts.  If we did as you suggest, other device types 
>>> like mlx would not receive MSI-X interrupts in the guest (And I did 
>>> indeed try variations where I used the special VFIO region for all 
>>> PCISTG/PCILG/PCISTB for various device types)
>>>
>>> So the idea for now was to solve the specific problem at hand 
>>> (getting ISM devices working).
>>>
>>>
>>
>> Sorry, if I missed or forgot some discussions, but I understood that 
>> we are using this region to handle PCISTB instructions when the device 
>> do not support MIO.
>> Don't we?
> 
> Sure thing - It's probably good to refresh the issue/rationale anyway as 
> we've had the holidays in between.
> 
> You are correct, a primary reason we need to resort to a separate VFIO 
> region for PCISTB (and PCILG) instructions for ISM devices is that they 
> do not support the MIO instruction set, yet the host kernel will 
> translate everything coming through the PCI I/O layer to MIO 
> instructions whenever that facility is available to the host (and not 
> purposely disabled).  This issue is unique to vfio-pci/passthrough - in 
> the host, the ISM driver directly invokes functions in s390 pci code to 
> ensure that MIO instructions are not used.
QEMU intercepts and differentiates PCISTG and PCISTB.
The new hardware support both MIO and legacy PCISTB/PCISTG.
QEMU does not support MIO
My first interrogation is why should we translate legacy to MIO?
But OK, say we do need this for some obscure reason.
> 
> But this is not the only reason.  There are additional reasons for using 
> this VFIO region:
> 1) ISM devices also don't support PCISTG instructions to certain address 
> spaces and PCISTB must be used regardless of operation length.  However 
> the standard s390 PCI I/O path always uses PCISTG for anything <=8B. 
> Trying to determine whether a given I/O is intended for an ISM device at 
> that point in kernel code so as to use PCISTB instead of PCISTG is the 
OK, this is clear.
> same problem as attempting to decide whether to use MIO vs non-MIO 
> instructions at that point.
humm, this is not exactly the same problem for me, but OK to choose to 
handle it the same way.
> 2) It allows for much larger PCISTB operations (4K) than allowed via the 
> memory regions (loop of 8B operations).
OK
> 3) The above also has the added benefit of eliminating certain write 
> pattern requirements that are unique to ISM that would be introduced if 
> we split up the I/O into 8B chunks (if we can't write the whole PCISTB 
> in one go, ISM requires data written in a certain order for some address 
> spaces, or with certain bits on/off on the PCISTB instruction to signify 
> the state of the larger operation)
Yes, I suppose that the driver in the guest does it right and we need to 
do the same.
> 
>>
>> I do not understand the relation between MSI-X and MIO.
>> Can you please explain?
>>
> 
> There is not a relation between MSI-X and MIO really.  Rather, this is a 
> case of the solution that is being offered here ONLY works for devices 
> that use MSI -- and ISM is a device that only supports MSI.  If you try 
> to use this new VFIO region to pass I/O for an MSI-X enabled device, the 
> notifiers set up via vfio_msix_setup won't be triggered because we are 
> writing to the new VFIO region, not the virtual bar regions that may 
> have had notifiers setup as part of vfio_msix_setup.  This results in 
> missing interrupts on MSI-X-enabled vfio-pci devices.
> 
> These notifiers aren't a factor when the device is using MSI.
I find this strange but we do not need to discuss it.
> 
So we have:
devices supporting MIO and MSIX
devices not supporting MIO nor MSIX
devices not supporting the use of PCISTG to emulate PCISTB
The first two are two different things indicated by two different 
entries in the clp query PCI function response.
The last one, we do not have an indicator as if the relaxed alignment 
and length is set, PCISTB can not be emulated with PCISTG
What I mean with this is that considering the proposed implementation 
and considering:
MIO MSIX RELAX
0 0 1  -> must use the new region (ISM)
1 1 0  -> must use the standard VFIO region (MLX)
we can discuss other 6 possibilities
0 0 0 -> must use the new region
0 1 0 -> NOOP
0 1 1 -> NOOP
1 0 0 -> can use any region
1 0 1 -> can use any region
1 1 1 -> NOOP
In my opinion the test for using one region or another should be done on 
these indicator instead of using the PFT.
This may offer us more compatibility with other hardware we may not be 
aware of as today.
Regards,
Pierre
-- 
Pierre Morel
IBM Lab Boeblingen
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-20 19:18         ` Pierre Morel
@ 2021-01-20 20:29           ` Matthew Rosato
  2021-01-21  8:27             ` Pierre Morel
  0 siblings, 1 reply; 27+ messages in thread
From: Matthew Rosato @ 2021-01-20 20:29 UTC (permalink / raw)
  To: Pierre Morel, cohuck, thuth
  Cc: schnelle, david, mst, richard.henderson, qemu-s390x, qemu-devel,
	pasic, borntraeger, alex.williamson, pbonzini
On 1/20/21 2:18 PM, Pierre Morel wrote:
> 
> 
> On 1/20/21 4:59 PM, Matthew Rosato wrote:
>> On 1/20/21 9:45 AM, Pierre Morel wrote:
>>>
>>>
>>> On 1/20/21 3:03 PM, Matthew Rosato wrote:
>>>> On 1/20/21 4:12 AM, Pierre Morel wrote:
>>>>>
>>>>>
>>>>> On 1/19/21 9:44 PM, Matthew Rosato wrote:
>>>>>> Today, ISM devices are completely disallowed for vfio-pci 
>>>>>> passthrough as
>>>>>> QEMU rejects the device due to an (inappropriate) MSI-X check. 
>>>>>> Removing
>>>>>> this fence, however, reveals additional deficiencies in the s390x PCI
>>>>>> interception layer that prevent ISM devices from working correctly.
>>>>>> Namely, ISM block write operations have particular requirements in 
>>>>>> regards
>>>>>> to the alignment, size and order of writes performed that cannot be
>>>>>> guaranteed when breaking up write operations through the typical
>>>>>> vfio_pci_bar_rw paths. Furthermore, ISM requires that legacy/non-MIO
>>>>>> s390 PCI instructions are used, which is also not guaranteed when 
>>>>>> the I/O
>>>>>> is passed through the typical userspace channels.
>>>>>>
>>>>>> This patchset provides a set of fixes related to enabling ISM device
>>>>>> passthrough and includes patches to enable use of a new vfio 
>>>>>> region that
>>>>>> will allow s390x PCI pass-through devices to perform s390 PCI 
>>>>>> instructions
>>>>>> in such a way that the same instruction issued on the guest is 
>>>>>> re-issued
>>>>>> on the host.
>>>>>>
>>>>>> Associated kernel patchset:
>>>>>> https://lkml.org/lkml/2021/1/19/874
>>>>>>
>>>>>> Changes from RFC -> v1:
>>>>>> - Refresh the header sync (built using Eric's 'update-linux-headers:
>>>>>> Include const.h' + manually removed pvrdma_ring.h again)
>>>>>> - Remove s390x/pci: fix pcistb length (already merged)
>>>>>> - Remove s390x/pci: Fix memory_region_access_valid call (already 
>>>>>> merged)
>>>>>> - Fix bug: s390_pci_vfio_pcistb should use the pre-allocated PCISTB
>>>>>> buffer pcistb_buf rather than allocating/freeing its own.
>>>>>> - New patch: track the PFT (PCI Function Type) separately from 
>>>>>> guest CLP
>>>>>> response data -- we tell the guest '0' for now due to limitations in
>>>>>> measurement block support, but we can still use the real value 
>>>>>> provided via
>>>>>> the vfio CLP capabilities to make decisions.
>>>>>> - Use the PFT (pci function type) to determine when to use the region
>>>>>> for PCISTB/PCILG (only for ISM), rather than using the relaxed 
>>>>>> alignment
>>>>>> bit.
>>>>>> - As a result, the pcistb_default is now updated to also handle the
>>>>>> possibility of relaxed alignment via 2 new functions, 
>>>>>> pcistb_validate_write
>>>>>> and pcistb_write, which serve as wrappers to the memory_region calls.
>>>>>> - New patch, which partially restores the MSI-X fence for passthrough
>>>>>> devices...  Could potentially be squashed with 's390x/pci: MSI-X 
>>>>>> isn't
>>>>>> strictly required for passthrough' but left separately for now as 
>>>>>> I felt it
>>>>>> needed a clear commit description of why we should still fence 
>>>>>> this case.
>>>>>>
>>>>> Hi,
>>>>>
>>>>> The choice of using the new VFIO region is made on the ISM PCI 
>>>>> function type (PFT), which makes the patch ISM specific, why don't 
>>>>> we use here the MIO bit common to any zPCI function and present in 
>>>>> kernel to make the choice?
>>>>>
>>>>
>>>> As discussed during the RFC (and see my reply also to the kernel 
>>>> set), the use of this region only works for devices that do not rely 
>>>> on MSI-X interrupts.  If we did as you suggest, other device types 
>>>> like mlx would not receive MSI-X interrupts in the guest (And I did 
>>>> indeed try variations where I used the special VFIO region for all 
>>>> PCISTG/PCILG/PCISTB for various device types)
>>>>
>>>> So the idea for now was to solve the specific problem at hand 
>>>> (getting ISM devices working).
>>>>
>>>>
>>>
>>> Sorry, if I missed or forgot some discussions, but I understood that 
>>> we are using this region to handle PCISTB instructions when the 
>>> device do not support MIO.
>>> Don't we?
>>
>> Sure thing - It's probably good to refresh the issue/rationale anyway 
>> as we've had the holidays in between.
>>
>> You are correct, a primary reason we need to resort to a separate VFIO 
>> region for PCISTB (and PCILG) instructions for ISM devices is that 
>> they do not support the MIO instruction set, yet the host kernel will 
>> translate everything coming through the PCI I/O layer to MIO 
>> instructions whenever that facility is available to the host (and not 
>> purposely disabled).  This issue is unique to vfio-pci/passthrough - 
>> in the host, the ISM driver directly invokes functions in s390 pci 
>> code to ensure that MIO instructions are not used.
> 
> 
> QEMU intercepts and differentiates PCISTG and PCISTB.
> The new hardware support both MIO and legacy PCISTB/PCISTG.
> 
> QEMU does not support MIO
> 
> My first interrogation is why should we translate legacy to MIO?
This is existing behavior of the s390 kernel PCI layer, not something 
I'm introducing here.
So, as you say, QEMU does not support MIO, nor does it attempt to 
translate anything to MIO.  The thing is, to the host kernel, PCI I/O 
coming from QEMU through the standard vfio-pci codepath just looks like 
any other userspace PCI I/O coming in to the kernel.  So the 
memory_region operations against the vfio virtual address space bars in 
QEMU then turn into iowrite/ioread operations in vfio-pci in the kernel, 
which then subsequently end up in the s390 PCI kernel layer, and it's 
there that everything is turned into an MIO operation if MIO is 
available.  That's not limited to vfio-pci, that's all PCI I/O in the 
host (except for the ISM driver, which bypasses this behavior by 
invoking s390 PCI kernel interfaces directly).
In an early (internal) version of the kernel component to this I floated 
the idea of trying to determine whether or not MIO instructions could be 
used for a given I/O operation, but the ideas I floated had various 
flaws -- I'd invite Niklas to chime in with why this got squashed.
But anyway...  If we were to solve that somehow, this would still leave 
us with write operations capped at 8B and odd write pattern requirements 
for ISM passthrough.  Using a VFIO region to pass the operation directly 
to the host kernel via a pinned page to overcome those limitations was 
your idea actually :)
> 
> But OK, say we do need this for some obscure reason.
> 
>>
>> But this is not the only reason.  There are additional reasons for 
>> using this VFIO region:
>> 1) ISM devices also don't support PCISTG instructions to certain 
>> address spaces and PCISTB must be used regardless of operation 
>> length.  However the standard s390 PCI I/O path always uses PCISTG for 
>> anything <=8B. Trying to determine whether a given I/O is intended for 
>> an ISM device at that point in kernel code so as to use PCISTB instead 
>> of PCISTG is the 
> 
> OK, this is clear.
> 
>> same problem as attempting to decide whether to use MIO vs non-MIO 
>> instructions at that point.
> 
> humm, this is not exactly the same problem for me, but OK to choose to 
> handle it the same way.
The problem isn't the same BUT the information needed to solve the 
problem is the same (for a given memory operation, knowing what 
device/type it is intended for, which can therefore be used to determine 
also whether it supports MIO or not).
> 
> 
> 
>> 2) It allows for much larger PCISTB operations (4K) than allowed via 
>> the memory regions (loop of 8B operations).
> 
> OK
> 
>> 3) The above also has the added benefit of eliminating certain write 
>> pattern requirements that are unique to ISM that would be introduced 
>> if we split up the I/O into 8B chunks (if we can't write the whole 
>> PCISTB in one go, ISM requires data written in a certain order for 
>> some address spaces, or with certain bits on/off on the PCISTB 
>> instruction to signify the state of the larger operation)
> 
> Yes, I suppose that the driver in the guest does it right and we need to 
> do the same.
> 
> 
>>
>>>
>>> I do not understand the relation between MSI-X and MIO.
>>> Can you please explain?
>>>
>>
>> There is not a relation between MSI-X and MIO really.  Rather, this is 
>> a case of the solution that is being offered here ONLY works for 
>> devices that use MSI -- and ISM is a device that only supports MSI.  
>> If you try to use this new VFIO region to pass I/O for an MSI-X 
>> enabled device, the notifiers set up via vfio_msix_setup won't be 
>> triggered because we are writing to the new VFIO region, not the 
>> virtual bar regions that may have had notifiers setup as part of 
>> vfio_msix_setup.  This results in missing interrupts on MSI-X-enabled 
>> vfio-pci devices.
>>
>> These notifiers aren't a factor when the device is using MSI.
> 
> I find this strange but we do not need to discuss it.
> 
>>
> 
> So we have:
> devices supporting MIO and MSIX
> devices not supporting MIO nor MSIX
> devices not supporting the use of PCISTG to emulate PCISTB
> 
> The first two are two different things indicated by two different 
> entries in the clp query PCI function response.
> 
> The last one, we do not have an indicator as if the relaxed alignment 
> and length is set, PCISTB can not be emulated with PCISTG
> 
> What I mean with this is that considering the proposed implementation 
> and considering:
> MIO MSIX RELAX
> 
> 0 0 1  -> must use the new region (ISM)
> 1 1 0  -> must use the standard VFIO region (MLX)
> 
> we can discuss other 6 possibilities
> 
> 0 0 0 -> must use the new region
> 0 1 0 -> NOOP
> 0 1 1 -> NOOP
> 1 0 0 -> can use any region
> 1 0 1 -> can use any region
> 1 1 1 -> NOOP
> 
> In my opinion the test for using one region or another should be done on 
> these indicator instead of using the PFT. > This may offer us more compatibility with other hardware we may not be
> aware of as today.
This gets a little shaky, and goes both ways -- Using your list, a 
device that supports MIO, does not have MSI-X capability and doesn't 
support relaxed alignment (1 0 0 from above) can use any region -- but 
that may not always be true.  What if "other hardware we may not be 
aware of as today" includes future hardware that ONLY supports the MIO 
instruction set?  Then that device really can't use this region either.
But forgetting that possibility...  I think we can really simplify the 
above matrix down to a statement of "if device doesn't support MSI-X but 
DOES support non-MIO instructions, it can use the region."  I believe 
the latter half of that statement is implicit in the architecture today, 
so it's really then "if device doesn't support MSI-X, it can use the 
region".  There's just the caveat of, if the device is ISM, it changes 
from 'can use the region' to 'must use the region'.
So, I mean I can change the code to be more permissive in that way 
(allow any device that doesn't have MSI-X capability to at least attempt 
to use the region).  But the reality is that ISM specifically needs the 
region for successful pass through, so I don't see a reason to create a 
different bit for that vs just checking for the PFT in QEMU and using 
that value to decide whether or not region availability is a requirement 
for allowing the device to pass through.
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-20 20:29           ` Matthew Rosato
@ 2021-01-21  8:27             ` Pierre Morel
  2021-01-21  9:58               ` Niklas Schnelle
  2021-01-21 14:42               ` Matthew Rosato
  0 siblings, 2 replies; 27+ messages in thread
From: Pierre Morel @ 2021-01-21  8:27 UTC (permalink / raw)
  To: Matthew Rosato, cohuck, thuth
  Cc: schnelle, david, mst, richard.henderson, qemu-s390x, qemu-devel,
	pasic, borntraeger, alex.williamson, pbonzini
On 1/20/21 9:29 PM, Matthew Rosato wrote:
> On 1/20/21 2:18 PM, Pierre Morel wrote:
>>
>>
...snip...
>> So we have:
>> devices supporting MIO and MSIX
>> devices not supporting MIO nor MSIX
>> devices not supporting the use of PCISTG to emulate PCISTB
>>
>> The first two are two different things indicated by two different 
>> entries in the clp query PCI function response.
>>
>> The last one, we do not have an indicator as if the relaxed alignment 
>> and length is set, PCISTB can not be emulated with PCISTG
hum sorry, it seems I rewrote my sentence until it was wrong wrong!
I wanted to say we DO HAVE an indicator with the relaxed bit...
>>
>> What I mean with this is that considering the proposed implementation 
>> and considering:
>> MIO MSIX RELAX
>>
>> 0 0 1  -> must use the new region (ISM)
>> 1 1 0  -> must use the standard VFIO region (MLX)
>>
>> we can discuss other 6 possibilities
>>
>> 0 0 0 -> must use the new region
>> 0 1 0 -> NOOP
>> 0 1 1 -> NOOP
>> 1 0 0 -> can use any region
>> 1 0 1 -> can use any region
>> 1 1 1 -> NOOP
>>
>> In my opinion the test for using one region or another should be done 
>> on these indicator instead of using the PFT. > This may offer us more 
>> compatibility with other hardware we may not be
>> aware of as today.
> 
> This gets a little shaky, and goes both ways -- Using your list, a 
> device that supports MIO, does not have MSI-X capability and doesn't 
> support relaxed alignment (1 0 0 from above) can use any region -- but 
> that may not always be true.  What if "other hardware we may not be 
> aware of as today" includes future hardware that ONLY supports the MIO 
> instruction set?  Then that device really can't use this region either.
Right, but there is no bit in the CLP response for this case.
Until there is one, the system is supposed to handle legacy instructions
> 
> But forgetting that possibility...  I think we can really simplify the 
> above matrix down to a statement of "if device doesn't support MSI-X but 
> DOES support non-MIO instructions, it can use the region."  I believe 
> the latter half of that statement is implicit in the architecture today, 
> so it's really then "if device doesn't support MSI-X, it can use the 
> region".  There's just the caveat of, if the device is ISM, it changes 
> from 'can use the region' to 'must use the region'.
There can surely be simplifications.
> 
> So, I mean I can change the code to be more permissive in that way 
> (allow any device that doesn't have MSI-X capability to at least attempt 
> to use the region).  But the reality is that ISM specifically needs the 
> region for successful pass through, so I don't see a reason to create a 
> different bit for that vs just checking for the PFT in QEMU and using 
> that value to decide whether or not region availability is a requirement 
> for allowing the device to pass through.
There is no need for a new bit to know if a device support MIO or not, 
as I said before, there is already one in the CLP query PCI function 
response and it is already used in the kernel zPCI architecture.
It is not a big think to do and does not change the general architecture 
of the patch, only the detection of which device is impacted to make it 
generic instead of device dedicated.
Regards,
Pierre
-- 
Pierre Morel
IBM Lab Boeblingen
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21  8:27             ` Pierre Morel
@ 2021-01-21  9:58               ` Niklas Schnelle
  2021-01-21 12:30                 ` Pierre Morel
  2021-01-21 14:42               ` Matthew Rosato
  1 sibling, 1 reply; 27+ messages in thread
From: Niklas Schnelle @ 2021-01-21  9:58 UTC (permalink / raw)
  To: Pierre Morel, Matthew Rosato, cohuck, thuth
  Cc: mst, david, richard.henderson, qemu-s390x, qemu-devel, pasic,
	borntraeger, alex.williamson, pbonzini
On 1/21/21 9:27 AM, Pierre Morel wrote:
> 
> 
> On 1/20/21 9:29 PM, Matthew Rosato wrote:
>> On 1/20/21 2:18 PM, Pierre Morel wrote:
>>>
>>>
> ...snip...
> 
>>
>> So, I mean I can change the code to be more permissive in that way (allow any device that doesn't have MSI-X capability to at least attempt to use the region).  But the reality is that ISM specifically needs the region for successful pass through, so I don't see a reason to create a different bit for that vs just checking for the PFT in QEMU and using that value to decide whether or not region availability is a requirement for allowing the device to pass through.
> 
> 
> There is no need for a new bit to know if a device support MIO or not, as I said before, there is already one in the CLP query PCI function response and it is already used in the kernel zPCI architecture.
> 
> 
> It is not a big think to do and does not change the general architecture of the patch, only the detection of which device is impacted to make it generic instead of device dedicated.
> 
> Regards,
> Pierre
Just wanted to say that we've had a very similar discussion with
Cornelia end of last year and came to the conclusion that explicitly
matching the PFT is likely the safest bet:
https://lkml.org/lkml/2020/12/22/479
One other point for me is that all these special requirements seem
to stem from the fact that ISM is not a physical PCIe device but
a special s390 only thing. That also implies that unlike with real
PCI devices we are not going to find an existing Linux driver
and infrastructure that just works. I'd bet if it's anywhere
as special as ISM we will also need a QEMU change either way
and if that change is only to change the PFT check then
great, won't have trouble getting that backported...
So the PFT check ensures we're definitely trying the standard path
first and only change it to a special case if we understand that
it is required as we do for ISM.
That said it's also important to note that we are not searching
for a be all end all solution here. The standard path's to-MIO
translation, even though it is working, is weird enough and also
not ideal for performance so we might look at that again
in separate work and yes that could change things for
this case too but that too will require QEMU and Kernel
changes either way.
Furthermore we will be looking into handling more PCI
without leaving SIE at which point we will also re-evaluate
if that works for ISM. Again that needs QEMU and likely Kernel
changes. I'm a great fan of general solutions that
don't unnecessarily exclude something, so I do understand your
concern but let's focus on what we know and want to fix now
instead of trying to solve future theoretical issues that
we have no strong reason to believe will ever appear.
If we stick with the PFT check I think we do need to
change some wording especially for the Kernel changes.
> 
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21  9:58               ` Niklas Schnelle
@ 2021-01-21 12:30                 ` Pierre Morel
  2021-01-21 13:37                   ` Niklas Schnelle
  0 siblings, 1 reply; 27+ messages in thread
From: Pierre Morel @ 2021-01-21 12:30 UTC (permalink / raw)
  To: Niklas Schnelle, Matthew Rosato, cohuck, thuth
  Cc: mst, david, richard.henderson, qemu-s390x, qemu-devel, pasic,
	borntraeger, alex.williamson, pbonzini
On 1/21/21 10:58 AM, Niklas Schnelle wrote:
> 
> 
> On 1/21/21 9:27 AM, Pierre Morel wrote:
>>
>>
>> On 1/20/21 9:29 PM, Matthew Rosato wrote:
>>> On 1/20/21 2:18 PM, Pierre Morel wrote:
>>>>
>>>>
>> ...snip...
>>
>>>
>>> So, I mean I can change the code to be more permissive in that way (allow any device that doesn't have MSI-X capability to at least attempt to use the region).  But the reality is that ISM specifically needs the region for successful pass through, so I don't see a reason to create a different bit for that vs just checking for the PFT in QEMU and using that value to decide whether or not region availability is a requirement for allowing the device to pass through.
>>
>>
>> There is no need for a new bit to know if a device support MIO or not, as I said before, there is already one in the CLP query PCI function response and it is already used in the kernel zPCI architecture.
>>
>>
>> It is not a big think to do and does not change the general architecture of the patch, only the detection of which device is impacted to make it generic instead of device dedicated.
>>
>> Regards,
>> Pierre
> 
> Just wanted to say that we've had a very similar discussion with
> Cornelia end of last year and came to the conclusion that explicitly
> matching the PFT is likely the safest bet:
> https://lkml.org/lkml/2020/12/22/479
What I see there is a discussion on the relation between relaxed access 
and MIO without explaining to Connie that we have in the kernel the 
possibility to know if a device support MIO or not independently of it 
supports the relaxed access.
The all point here is about taking decisions for the right reasons.
We have the possibility to take the decision based on functionalities 
and not on a specific PCI function.
If we keep the PFT check, and we can do this of course, but is it a good 
solution if it appears we have other PFT with the same functionalities?
Please note that this is a minor code change, keeping track of the MIO 
support just as we keep track of the PFT and check on this instead of on 
the PFT.
It does not modify the general architecture of the patch series neither 
in kernel nor in QEMU at all.
Pierre
-- 
Pierre Morel
IBM Lab Boeblingen
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21 12:30                 ` Pierre Morel
@ 2021-01-21 13:37                   ` Niklas Schnelle
  2021-01-21 14:46                     ` Pierre Morel
  0 siblings, 1 reply; 27+ messages in thread
From: Niklas Schnelle @ 2021-01-21 13:37 UTC (permalink / raw)
  To: Pierre Morel, Matthew Rosato, cohuck, thuth
  Cc: mst, david, richard.henderson, qemu-s390x, qemu-devel, pasic,
	borntraeger, alex.williamson, pbonzini
On 1/21/21 1:30 PM, Pierre Morel wrote:
> 
> 
> On 1/21/21 10:58 AM, Niklas Schnelle wrote:
>>
>>
>> On 1/21/21 9:27 AM, Pierre Morel wrote:
>>>
>>>
>>> On 1/20/21 9:29 PM, Matthew Rosato wrote:
>>>> On 1/20/21 2:18 PM, Pierre Morel wrote:
>>>>>
>>>>>
>>> ...snip...
>>>
>>>>
>>>> So, I mean I can change the code to be more permissive in that way (allow any device that doesn't have MSI-X capability to at least attempt to use the region).  But the reality is that ISM specifically needs the region for successful pass through, so I don't see a reason to create a different bit for that vs just checking for the PFT in QEMU and using that value to decide whether or not region availability is a requirement for allowing the device to pass through.
>>>
>>>
>>> There is no need for a new bit to know if a device support MIO or not, as I said before, there is already one in the CLP query PCI function response and it is already used in the kernel zPCI architecture.
>>>
>>>
>>> It is not a big think to do and does not change the general architecture of the patch, only the detection of which device is impacted to make it generic instead of device dedicated.
>>>
>>> Regards,
>>> Pierre
>>
>> Just wanted to say that we've had a very similar discussion with
>> Cornelia end of last year and came to the conclusion that explicitly
>> matching the PFT is likely the safest bet:
>> https://lkml.org/lkml/2020/12/22/479
> 
> What I see there is a discussion on the relation between relaxed access and MIO without explaining to Connie that we have in the kernel the possibility to know if a device support MIO or not independently of it supports the relaxed access.
> 
> The all point here is about taking decisions for the right reasons.
> 
> We have the possibility to take the decision based on functionalities and not on a specific PCI function.
Yes but that goes both ways the functionality of the region has to match
that of the device and at least in it's current state the regions functionality
matches only ISM in a way that is so specific that it is very unlikely to match anything
else. For example it can't support a PCI device that requires non-MIO but
also MSI-X. In its current form it doesn't even support PCI Store only PCI Store
Block, we had that in an earlier version and it's trivial but then we get the MSI-X
problem.
> 
> 
> If we keep the PFT check, and we can do this of course, but is it a good solution if it appears we have other PFT with the same functionalities?
> 
> Please note that this is a minor code change, keeping track of the MIO support just as we keep track of the PFT and check on this instead of on the PFT.
That is certainly true and I'm not strongly against matching on functionality
it just seems to me that it's too specific for that to make sense and
in that case I feel it's better to be clear about that and make it ISM
specific in name and functionality.
If we manage to find a fix for the MSI-X problem which I'd be really happy about
we can simply extend the regions functionality and reuse the same code
for a backwards compatibile ISM region and a more generic zPCI non-MIO
region that could even be used if the client (QEMU) uses non-MIO and
the device can do both as is the case for all current physical devices.
> 
> It does not modify the general architecture of the patch series neither in kernel nor in QEMU at all.
> 
> 
> Pierre
> 
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21 13:37                   ` Niklas Schnelle
@ 2021-01-21 14:46                     ` Pierre Morel
  2021-01-21 14:54                       ` Niklas Schnelle
  0 siblings, 1 reply; 27+ messages in thread
From: Pierre Morel @ 2021-01-21 14:46 UTC (permalink / raw)
  To: Niklas Schnelle, Matthew Rosato, cohuck, thuth
  Cc: mst, david, richard.henderson, qemu-s390x, qemu-devel, pasic,
	borntraeger, alex.williamson, pbonzini
On 1/21/21 2:37 PM, Niklas Schnelle wrote:
> 
> 
> On 1/21/21 1:30 PM, Pierre Morel wrote:
>>>
>>> Just wanted to say that we've had a very similar discussion with
>>> Cornelia end of last year and came to the conclusion that explicitly
>>> matching the PFT is likely the safest bet:
>>> https://lkml.org/lkml/2020/12/22/479
>>
>> What I see there is a discussion on the relation between relaxed access and MIO without explaining to Connie that we have in the kernel the possibility to know if a device support MIO or not independently of it supports the relaxed access.
>>
>> The all point here is about taking decisions for the right reasons.
>>
>> We have the possibility to take the decision based on functionalities and not on a specific PCI function.
> 
> Yes but that goes both ways the functionality of the region has to match
> that of the device and at least in it's current state the regions functionality
> matches only ISM in a way that is so specific that it is very unlikely to match anything
> else. For example it can't support a PCI device that requires non-MIO but
> also MSI-X. In its current form it doesn't even support PCI Store only PCI Store
> Block, we had that in an earlier version and it's trivial but then we get the MSI-X
> problem.
What does that change if we take one or the other solution considering 
the checking of MIO/MSIX/relax versus PFT?
> 
>>
>>
>> If we keep the PFT check, and we can do this of course, but is it a good solution if it appears we have other PFT with the same functionalities?
>>
>> Please note that this is a minor code change, keeping track of the MIO support just as we keep track of the PFT and check on this instead of on the PFT.
> 
> That is certainly true and I'm not strongly against matching on functionality
> it just seems to me that it's too specific for that to make sense and
> in that case I feel it's better to be clear about that and make it ISM
> specific in name and functionality.
> 
> If we manage to find a fix for the MSI-X problem which I'd be really happy about
> we can simply extend the regions functionality and reuse the same code
> for a backwards compatibile ISM region and a more generic zPCI non-MIO
> region that could even be used if the client (QEMU) uses non-MIO and
> the device can do both as is the case for all current physical devices.
I think that the fix for this is in the kernel, in the MIO implementation.
But we better discuss this offline.
Since neither of us have the decision, why don't we let the maintainer 
discuss now that they have all the information.
Regards,
Pierre
-- 
Pierre Morel
IBM Lab Boeblingen
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21 14:46                     ` Pierre Morel
@ 2021-01-21 14:54                       ` Niklas Schnelle
  2021-01-21 17:50                         ` Cornelia Huck
  0 siblings, 1 reply; 27+ messages in thread
From: Niklas Schnelle @ 2021-01-21 14:54 UTC (permalink / raw)
  To: Pierre Morel, Matthew Rosato, cohuck, thuth
  Cc: mst, david, richard.henderson, qemu-s390x, qemu-devel, pasic,
	borntraeger, alex.williamson, pbonzini
On 1/21/21 3:46 PM, Pierre Morel wrote:
> 
> 
> On 1/21/21 2:37 PM, Niklas Schnelle wrote:
>>
>>
>> On 1/21/21 1:30 PM, Pierre Morel wrote:
> 
>>>>
>>>> Just wanted to say that we've had a very similar discussion with
>>>> Cornelia end of last year and came to the conclusion that explicitly
>>>> matching the PFT is likely the safest bet:
>>>> https://lkml.org/lkml/2020/12/22/479
>>>
>>> What I see there is a discussion on the relation between relaxed access and MIO without explaining to Connie that we have in the kernel the possibility to know if a device support MIO or not independently of it supports the relaxed access.
>>>
>>> The all point here is about taking decisions for the right reasons.
>>>
>>> We have the possibility to take the decision based on functionalities and not on a specific PCI function.
>>
>> Yes but that goes both ways the functionality of the region has to match
>> that of the device and at least in it's current state the regions functionality
>> matches only ISM in a way that is so specific that it is very unlikely to match anything
>> else. For example it can't support a PCI device that requires non-MIO but
>> also MSI-X. In its current form it doesn't even support PCI Store only PCI Store
>> Block, we had that in an earlier version and it's trivial but then we get the MSI-X
>> problem.
> 
> 
> What does that change if we take one or the other solution considering the checking of MIO/MSIX/relax versus PFT?
If it's !MIO && !MSIX && relax_align I'm fine with that check but
then we should also add PCISTG to the region.
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21 14:54                       ` Niklas Schnelle
@ 2021-01-21 17:50                         ` Cornelia Huck
  2021-01-21 18:06                           ` Matthew Rosato
  0 siblings, 1 reply; 27+ messages in thread
From: Cornelia Huck @ 2021-01-21 17:50 UTC (permalink / raw)
  To: Niklas Schnelle
  Cc: thuth, Pierre Morel, Matthew Rosato, david, mst,
	richard.henderson, qemu-s390x, qemu-devel, pasic, borntraeger,
	alex.williamson, pbonzini
On Thu, 21 Jan 2021 15:54:22 +0100
Niklas Schnelle <schnelle@linux.ibm.com> wrote:
> On 1/21/21 3:46 PM, Pierre Morel wrote:
> > 
> > 
> > On 1/21/21 2:37 PM, Niklas Schnelle wrote:  
> >>
> >>
> >> On 1/21/21 1:30 PM, Pierre Morel wrote:  
> >   
> >>>>
> >>>> Just wanted to say that we've had a very similar discussion with
> >>>> Cornelia end of last year and came to the conclusion that explicitly
> >>>> matching the PFT is likely the safest bet:
> >>>> https://lkml.org/lkml/2020/12/22/479  
> >>>
> >>> What I see there is a discussion on the relation between relaxed access and MIO without explaining to Connie that we have in the kernel the possibility to know if a device support MIO or not independently of it supports the relaxed access.
> >>>
> >>> The all point here is about taking decisions for the right reasons.
> >>>
> >>> We have the possibility to take the decision based on functionalities and not on a specific PCI function.  
> >>
> >> Yes but that goes both ways the functionality of the region has to match
> >> that of the device and at least in it's current state the regions functionality
> >> matches only ISM in a way that is so specific that it is very unlikely to match anything
> >> else. For example it can't support a PCI device that requires non-MIO but
> >> also MSI-X. In its current form it doesn't even support PCI Store only PCI Store
> >> Block, we had that in an earlier version and it's trivial but then we get the MSI-X
> >> problem.  
> > 
> > 
> > What does that change if we take one or the other solution considering the checking of MIO/MSIX/relax versus PFT?  
> 
> 
> If it's !MIO && !MSIX && relax_align I'm fine with that check but
> then we should also add PCISTG to the region.
> 
Just to double check: that would today cover only ISM (which doesn't
use PCISTG), correct?
/me getting a bit lost in this discussion :)
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21 17:50                         ` Cornelia Huck
@ 2021-01-21 18:06                           ` Matthew Rosato
  2021-01-22 16:46                             ` Cornelia Huck
  0 siblings, 1 reply; 27+ messages in thread
From: Matthew Rosato @ 2021-01-21 18:06 UTC (permalink / raw)
  To: Cornelia Huck, Niklas Schnelle
  Cc: thuth, Pierre Morel, david, mst, richard.henderson, qemu-s390x,
	qemu-devel, pasic, borntraeger, alex.williamson, pbonzini
On 1/21/21 12:50 PM, Cornelia Huck wrote:
> On Thu, 21 Jan 2021 15:54:22 +0100
> Niklas Schnelle <schnelle@linux.ibm.com> wrote:
> 
>> On 1/21/21 3:46 PM, Pierre Morel wrote:
>>>
>>>
>>> On 1/21/21 2:37 PM, Niklas Schnelle wrote:
>>>>
>>>>
>>>> On 1/21/21 1:30 PM, Pierre Morel wrote:
>>>    
>>>>>>
>>>>>> Just wanted to say that we've had a very similar discussion with
>>>>>> Cornelia end of last year and came to the conclusion that explicitly
>>>>>> matching the PFT is likely the safest bet:
>>>>>> https://lkml.org/lkml/2020/12/22/479
>>>>>
>>>>> What I see there is a discussion on the relation between relaxed access and MIO without explaining to Connie that we have in the kernel the possibility to know if a device support MIO or not independently of it supports the relaxed access.
>>>>>
>>>>> The all point here is about taking decisions for the right reasons.
>>>>>
>>>>> We have the possibility to take the decision based on functionalities and not on a specific PCI function.
>>>>
>>>> Yes but that goes both ways the functionality of the region has to match
>>>> that of the device and at least in it's current state the regions functionality
>>>> matches only ISM in a way that is so specific that it is very unlikely to match anything
>>>> else. For example it can't support a PCI device that requires non-MIO but
>>>> also MSI-X. In its current form it doesn't even support PCI Store only PCI Store
>>>> Block, we had that in an earlier version and it's trivial but then we get the MSI-X
>>>> problem.
>>>
>>>
>>> What does that change if we take one or the other solution considering the checking of MIO/MSIX/relax versus PFT?
>>
>>
>> If it's !MIO && !MSIX && relax_align I'm fine with that check but
>> then we should also add PCISTG to the region.
>>
> 
> Just to double check: that would today cover only ISM (which doesn't
> use PCISTG), correct?
> 
Yes, correct -- So to summarize the proposal outlined is to use those 
features to determine whether a device should be using the region or not 
rather rather than strictly saying only PFT==ISM may use it.
Practically speaking, ISM is the only device that fits the bill today 
when you combine these things, and ISM does not use PCISTG -- so PCISTG 
support was simply omitted from the region (prior versions before coming 
upstream included it, was dropped since ISM doesn't use it).
What Niklas suggests here is that, if we are going to be broad in 
determining whether the region can be used for a given device vs saying 
'only PFT==ISM', then we can no longer assume that the device doesn't 
use PCISTG and as such is suggesting the region should also include 
PCISTG support as a means of future compatibility (to ensure non-MIO 
PCISTG is issued for the device).
> /me getting a bit lost in this discussion :)
> 
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21 18:06                           ` Matthew Rosato
@ 2021-01-22 16:46                             ` Cornelia Huck
  2021-01-25 14:55                               ` Matthew Rosato
  0 siblings, 1 reply; 27+ messages in thread
From: Cornelia Huck @ 2021-01-22 16:46 UTC (permalink / raw)
  To: Matthew Rosato
  Cc: thuth, Pierre Morel, david, Niklas Schnelle, richard.henderson,
	qemu-s390x, qemu-devel, pasic, borntraeger, alex.williamson, mst,
	pbonzini
On Thu, 21 Jan 2021 13:06:24 -0500
Matthew Rosato <mjrosato@linux.ibm.com> wrote:
> On 1/21/21 12:50 PM, Cornelia Huck wrote:
> > On Thu, 21 Jan 2021 15:54:22 +0100
> > Niklas Schnelle <schnelle@linux.ibm.com> wrote:
> >   
> >> On 1/21/21 3:46 PM, Pierre Morel wrote:  
> >>>
> >>>
> >>> On 1/21/21 2:37 PM, Niklas Schnelle wrote:  
> >>>>
> >>>>
> >>>> On 1/21/21 1:30 PM, Pierre Morel wrote:  
> >>>      
> >>>>>>
> >>>>>> Just wanted to say that we've had a very similar discussion with
> >>>>>> Cornelia end of last year and came to the conclusion that explicitly
> >>>>>> matching the PFT is likely the safest bet:
> >>>>>> https://lkml.org/lkml/2020/12/22/479  
> >>>>>
> >>>>> What I see there is a discussion on the relation between relaxed access and MIO without explaining to Connie that we have in the kernel the possibility to know if a device support MIO or not independently of it supports the relaxed access.
> >>>>>
> >>>>> The all point here is about taking decisions for the right reasons.
> >>>>>
> >>>>> We have the possibility to take the decision based on functionalities and not on a specific PCI function.  
> >>>>
> >>>> Yes but that goes both ways the functionality of the region has to match
> >>>> that of the device and at least in it's current state the regions functionality
> >>>> matches only ISM in a way that is so specific that it is very unlikely to match anything
> >>>> else. For example it can't support a PCI device that requires non-MIO but
> >>>> also MSI-X. In its current form it doesn't even support PCI Store only PCI Store
> >>>> Block, we had that in an earlier version and it's trivial but then we get the MSI-X
> >>>> problem.  
> >>>
> >>>
> >>> What does that change if we take one or the other solution considering the checking of MIO/MSIX/relax versus PFT?  
> >>
> >>
> >> If it's !MIO && !MSIX && relax_align I'm fine with that check but
> >> then we should also add PCISTG to the region.
> >>  
> > 
> > Just to double check: that would today cover only ISM (which doesn't
> > use PCISTG), correct?
> >   
> 
> Yes, correct -- So to summarize the proposal outlined is to use those 
> features to determine whether a device should be using the region or not 
> rather rather than strictly saying only PFT==ISM may use it.
> 
> Practically speaking, ISM is the only device that fits the bill today 
> when you combine these things, and ISM does not use PCISTG -- so PCISTG 
> support was simply omitted from the region (prior versions before coming 
> upstream included it, was dropped since ISM doesn't use it).
> 
> What Niklas suggests here is that, if we are going to be broad in 
> determining whether the region can be used for a given device vs saying 
> 'only PFT==ISM', then we can no longer assume that the device doesn't 
> use PCISTG and as such is suggesting the region should also include 
> PCISTG support as a means of future compatibility (to ensure non-MIO 
> PCISTG is issued for the device).
Yes, if we go the "check for features" route, including PCISTG makes
sense.
However, I'm still not quite sure whether checking for ISM vs checking
for features makes more sense in the long run. Is ISM that one weird
device whose cousins you're not going to invite to the party, or is
there a possibility that there will be devices that could make use of
the region for performance etc.?
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-22 16:46                             ` Cornelia Huck
@ 2021-01-25 14:55                               ` Matthew Rosato
  0 siblings, 0 replies; 27+ messages in thread
From: Matthew Rosato @ 2021-01-25 14:55 UTC (permalink / raw)
  To: Cornelia Huck
  Cc: thuth, Pierre Morel, david, Niklas Schnelle, richard.henderson,
	qemu-s390x, qemu-devel, pasic, borntraeger, alex.williamson, mst,
	pbonzini
On 1/22/21 11:46 AM, Cornelia Huck wrote:
> On Thu, 21 Jan 2021 13:06:24 -0500
> Matthew Rosato <mjrosato@linux.ibm.com> wrote:
> 
>> On 1/21/21 12:50 PM, Cornelia Huck wrote:
>>> On Thu, 21 Jan 2021 15:54:22 +0100
>>> Niklas Schnelle <schnelle@linux.ibm.com> wrote:
>>>    
>>>> On 1/21/21 3:46 PM, Pierre Morel wrote:
>>>>>
>>>>>
>>>>> On 1/21/21 2:37 PM, Niklas Schnelle wrote:
>>>>>>
>>>>>>
>>>>>> On 1/21/21 1:30 PM, Pierre Morel wrote:
>>>>>       
>>>>>>>>
>>>>>>>> Just wanted to say that we've had a very similar discussion with
>>>>>>>> Cornelia end of last year and came to the conclusion that explicitly
>>>>>>>> matching the PFT is likely the safest bet:
>>>>>>>> https://lkml.org/lkml/2020/12/22/479
>>>>>>>
>>>>>>> What I see there is a discussion on the relation between relaxed access and MIO without explaining to Connie that we have in the kernel the possibility to know if a device support MIO or not independently of it supports the relaxed access.
>>>>>>>
>>>>>>> The all point here is about taking decisions for the right reasons.
>>>>>>>
>>>>>>> We have the possibility to take the decision based on functionalities and not on a specific PCI function.
>>>>>>
>>>>>> Yes but that goes both ways the functionality of the region has to match
>>>>>> that of the device and at least in it's current state the regions functionality
>>>>>> matches only ISM in a way that is so specific that it is very unlikely to match anything
>>>>>> else. For example it can't support a PCI device that requires non-MIO but
>>>>>> also MSI-X. In its current form it doesn't even support PCI Store only PCI Store
>>>>>> Block, we had that in an earlier version and it's trivial but then we get the MSI-X
>>>>>> problem.
>>>>>
>>>>>
>>>>> What does that change if we take one or the other solution considering the checking of MIO/MSIX/relax versus PFT?
>>>>
>>>>
>>>> If it's !MIO && !MSIX && relax_align I'm fine with that check but
>>>> then we should also add PCISTG to the region.
>>>>   
>>>
>>> Just to double check: that would today cover only ISM (which doesn't
>>> use PCISTG), correct?
>>>    
>>
>> Yes, correct -- So to summarize the proposal outlined is to use those
>> features to determine whether a device should be using the region or not
>> rather rather than strictly saying only PFT==ISM may use it.
>>
>> Practically speaking, ISM is the only device that fits the bill today
>> when you combine these things, and ISM does not use PCISTG -- so PCISTG
>> support was simply omitted from the region (prior versions before coming
>> upstream included it, was dropped since ISM doesn't use it).
>>
>> What Niklas suggests here is that, if we are going to be broad in
>> determining whether the region can be used for a given device vs saying
>> 'only PFT==ISM', then we can no longer assume that the device doesn't
>> use PCISTG and as such is suggesting the region should also include
>> PCISTG support as a means of future compatibility (to ensure non-MIO
>> PCISTG is issued for the device).
> 
> Yes, if we go the "check for features" route, including PCISTG makes
> sense.
> 
> However, I'm still not quite sure whether checking for ISM vs checking
> for features makes more sense in the long run. Is ISM that one weird
> device whose cousins you're not going to invite to the party, or is
...  Yes. :)
> there a possibility that there will be devices that could make use of
> the region for performance etc.?
> 
Also yes.  Well, potentially.  That's really what I wanted at the 
outset, but the fly in the ointment is that the region as I exploit it 
in QEMU today only works properly for devices without MSI-X, which 
frankly limits its general-purpose utility.  If we could lift that 
restriction (whether it be now or at a later point) then the region 
becomes quite beneficial for performance of any devices making frequent 
use of large PCISTB operations.
^ permalink raw reply	[flat|nested] 27+ messages in thread 
 
 
 
 
 
 
 
 
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21  8:27             ` Pierre Morel
  2021-01-21  9:58               ` Niklas Schnelle
@ 2021-01-21 14:42               ` Matthew Rosato
  2021-01-21 15:45                 ` Pierre Morel
  1 sibling, 1 reply; 27+ messages in thread
From: Matthew Rosato @ 2021-01-21 14:42 UTC (permalink / raw)
  To: Pierre Morel, cohuck, thuth
  Cc: schnelle, alex.williamson, david, mst, richard.henderson,
	qemu-devel, pasic, borntraeger, qemu-s390x, pbonzini
On 1/21/21 3:27 AM, Pierre Morel wrote:
> 
> 
> On 1/20/21 9:29 PM, Matthew Rosato wrote:
>> On 1/20/21 2:18 PM, Pierre Morel wrote:
>>>
>>>
> ...snip...
> 
>>> So we have:
>>> devices supporting MIO and MSIX
>>> devices not supporting MIO nor MSIX
>>> devices not supporting the use of PCISTG to emulate PCISTB
>>>
>>> The first two are two different things indicated by two different 
>>> entries in the clp query PCI function response.
>>>
>>> The last one, we do not have an indicator as if the relaxed alignment 
>>> and length is set, PCISTB can not be emulated with PCISTG
> 
> 
> hum sorry, it seems I rewrote my sentence until it was wrong wrong!
> I wanted to say we DO HAVE an indicator with the relaxed bit...
That's actually not quite true though...  The relaxed bit does not 
directly imply that PCISTB cannot be emulated with PCISTG.  Rather, it 
more generally implies that PCISTB could be used instead of PCISTG as 
the length and alignment requirements for PCISTB are waived.  As far as 
I can tell, disallowing a PCISTG altogether is a trait that is unique to 
ISM and I don't see anywhere that it's otherwise indicated in 
architecture...  And in fact, for a given ISM device, certain address 
spaces (command) WILL accept a series of PCISTG issued in a particular 
manner in place of a PCISTB; meanwhile other ISM address spaces (data) 
will not accept any PCISTG ever.  :(  The ISM driver just always uses 
PCISTB.
So that is why I suggested type==ISM must require use of the region 
whereas other types that implement MSI could request (but not require) 
use of the region.
But yes, any other theoretical piece of hardware that also does not 
support MIO would have a similar region requirement.  I'll have a look 
at the MIO bit you referenced below and will have to verify that it does 
exactly what we expect for an ISM device.  Assuming yes, I will consider 
the following sort of checking for the next version of QEMU...
if (!MIO) {
	if (MSIX) {
		// passthrough disallowed
	}
	else {
		// region required, disallow if unavailable
	}
}
else if(RELAX && !MSIX) {
	// use region if available
}
Sound good?
All said, this would result in another bit passed from the kernel as CLP 
info and some small changes to patch 6 of this set to determine when to 
call s390_pci_get_zpci_io_region(pbdev) -- But just about everything 
else is unaffected, so @all please feel free to provide other review 
comments for the series in the meanwhile.
> 
>>>
>>> What I mean with this is that considering the proposed implementation 
>>> and considering:
>>> MIO MSIX RELAX
>>>
>>> 0 0 1  -> must use the new region (ISM)
>>> 1 1 0  -> must use the standard VFIO region (MLX)
>>>
>>> we can discuss other 6 possibilities
>>>
>>> 0 0 0 -> must use the new region
>>> 0 1 0 -> NOOP
>>> 0 1 1 -> NOOP
>>> 1 0 0 -> can use any region
>>> 1 0 1 -> can use any region
>>> 1 1 1 -> NOOP
>>>
>>> In my opinion the test for using one region or another should be done 
>>> on these indicator instead of using the PFT. > This may offer us more 
>>> compatibility with other hardware we may not be
>>> aware of as today.
>>
>> This gets a little shaky, and goes both ways -- Using your list, a 
>> device that supports MIO, does not have MSI-X capability and doesn't 
>> support relaxed alignment (1 0 0 from above) can use any region -- but 
>> that may not always be true.  What if "other hardware we may not be 
>> aware of as today" includes future hardware that ONLY supports the MIO 
>> instruction set?  Then that device really can't use this region either.
> 
> 
> Right, but there is no bit in the CLP response for this case.
> Until there is one, the system is supposed to handle legacy instructions
> 
>>
>> But forgetting that possibility...  I think we can really simplify the 
>> above matrix down to a statement of "if device doesn't support MSI-X 
>> but DOES support non-MIO instructions, it can use the region."  I 
>> believe the latter half of that statement is implicit in the 
>> architecture today, so it's really then "if device doesn't support 
>> MSI-X, it can use the region".  There's just the caveat of, if the 
>> device is ISM, it changes from 'can use the region' to 'must use the 
>> region'.
> 
> 
> There can surely be simplifications.
> 
>>
>> So, I mean I can change the code to be more permissive in that way 
>> (allow any device that doesn't have MSI-X capability to at least 
>> attempt to use the region).  But the reality is that ISM specifically 
>> needs the region for successful pass through, so I don't see a reason 
>> to create a different bit for that vs just checking for the PFT in 
>> QEMU and using that value to decide whether or not region availability 
>> is a requirement for allowing the device to pass through.
> 
> 
> There is no need for a new bit to know if a device support MIO or not, 
> as I said before, there is already one in the CLP query PCI function 
> response and it is already used in the kernel zPCI architecture.
> 
> 
> It is not a big think to do and does not change the general architecture 
> of the patch, only the detection of which device is impacted to make it 
> generic instead of device dedicated.
> 
> Regards,
> Pierre
> 
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH 0/8] s390x/pci: Fixing s390 vfio-pci ISM support
  2021-01-21 14:42               ` Matthew Rosato
@ 2021-01-21 15:45                 ` Pierre Morel
  0 siblings, 0 replies; 27+ messages in thread
From: Pierre Morel @ 2021-01-21 15:45 UTC (permalink / raw)
  To: Matthew Rosato, cohuck, thuth
  Cc: schnelle, alex.williamson, david, mst, richard.henderson,
	qemu-devel, pasic, borntraeger, qemu-s390x, pbonzini
On 1/21/21 3:42 PM, Matthew Rosato wrote:
> On 1/21/21 3:27 AM, Pierre Morel wrote:
>>
>>
>> On 1/20/21 9:29 PM, Matthew Rosato wrote:
>>> On 1/20/21 2:18 PM, Pierre Morel wrote:
>>>>
>>>>
>> ...snip...
>>
>>>> So we have:
>>>> devices supporting MIO and MSIX
>>>> devices not supporting MIO nor MSIX
>>>> devices not supporting the use of PCISTG to emulate PCISTB
>>>>
>>>> The first two are two different things indicated by two different 
>>>> entries in the clp query PCI function response.
>>>>
>>>> The last one, we do not have an indicator as if the relaxed 
>>>> alignment and length is set, PCISTB can not be emulated with PCISTG
>>
>>
>> hum sorry, it seems I rewrote my sentence until it was wrong wrong!
>> I wanted to say we DO HAVE an indicator with the relaxed bit...
> 
> That's actually not quite true though...  The relaxed bit does not 
> directly imply that PCISTB cannot be emulated with PCISTG.
It does, PCISTG have stronger alignment enforcement as PCISTB with the 
relaxed bit set, as you say here under, so we agree.
>  Rather, it 
> more generally implies that PCISTB could be used instead of PCISTG as 
> the length and alignment requirements for PCISTB are waived.  As far as 
> I can tell, disallowing a PCISTG altogether is a trait that is unique to 
> ISM and I don't see anywhere that it's otherwise indicated in 
> architecture...  And in fact, for a given ISM device, certain address 
> spaces (command) WILL accept a series of PCISTG issued in a particular 
> manner in place of a PCISTB; meanwhile other ISM address spaces (data) 
> will not accept any PCISTG ever.  :(  The ISM driver just always uses 
> PCISTB.
Very interresting!
> 
> So that is why I suggested type==ISM must require use of the region 
> whereas other types that implement MSI could request (but not require) 
> use of the region.
> 
> But yes, any other theoretical piece of hardware that also does not 
> support MIO would have a similar region requirement.  I'll have a look 
> at the MIO bit you referenced below and will have to verify that it does 
> exactly what we expect for an ISM device.  Assuming yes, I will consider 
> the following sort of checking for the next version of QEMU...
> 
> if (!MIO) {
>      if (MSIX) {
>          // passthrough disallowed
>      }
>      else {
>          // region required, disallow if unavailable
>      }
> }
> else if(RELAX && !MSIX) {
>      // use region if available
> }
> 
> Sound good?
Yes sounds good to me.
I mean the idea, I will not try to simplify, I trust you on this.
Thanks,
Pierre
-- 
Pierre Morel
IBM Lab Boeblingen
^ permalink raw reply	[flat|nested] 27+ messages in thread