Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v4 06/15] fpga: dfl: fme: add DFL_FPGA_FME_PORT_RELEASE/ASSIGN ioctl support.
From: Wu Hao @ 2019-06-27  4:44 UTC (permalink / raw)
  To: mdf, linux-fpga, linux-kernel
  Cc: linux-api, yilun.xu, hao.wu, gregkh, atull, Zhang Yi Z
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>

In order to support virtualization usage via PCIe SRIOV, this patch
adds two ioctls under FPGA Management Engine (FME) to release and
assign back the port device. In order to safely turn Port from PF
into VF and enable PCIe SRIOV, it requires user to invoke this
PORT_RELEASE ioctl to release port firstly to remove userspace
interfaces, and then configure the PF/VF access register in FME.
After disable SRIOV, it requires user to invoke this PORT_ASSIGN
ioctl to attach the port back to PF.

 Ioctl interfaces:
 * DFL_FPGA_FME_PORT_RELEASE
   Release platform device of given port, it deletes port platform
   device to remove related userspace interfaces on PF, then
   configures PF/VF access mode to VF.

 * DFL_FPGA_FME_PORT_ASSIGN
   Assign platform device of given port back to PF, it configures
   PF/VF access mode to PF, then adds port platform device back to
   re-enable related userspace interfaces on PF.

Signed-off-by: Zhang Yi Z <yi.z.zhang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Acked-by: Moritz Fischer <mdf@kernel.org>
---
v3: code rebase.
v4: no change.
---
 drivers/fpga/dfl-fme-main.c   |  54 +++++++++++++++++++++
 drivers/fpga/dfl.c            | 107 +++++++++++++++++++++++++++++++++++++-----
 drivers/fpga/dfl.h            |  10 ++++
 include/uapi/linux/fpga-dfl.h |  32 +++++++++++++
 4 files changed, 191 insertions(+), 12 deletions(-)

diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 076d74f..8b2a337 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -16,6 +16,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <linux/fpga-dfl.h>
 
 #include "dfl.h"
@@ -105,9 +106,62 @@ static void fme_hdr_uinit(struct platform_device *pdev,
 	sysfs_remove_files(&pdev->dev.kobj, fme_hdr_attrs);
 }
 
+static long fme_hdr_ioctl_release_port(struct dfl_feature_platform_data *pdata,
+				       void __user *arg)
+{
+	struct dfl_fpga_cdev *cdev = pdata->dfl_cdev;
+	struct dfl_fpga_fme_port_release release;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct dfl_fpga_fme_port_release, port_id);
+
+	if (copy_from_user(&release, arg, minsz))
+		return -EFAULT;
+
+	if (release.argsz < minsz || release.flags)
+		return -EINVAL;
+
+	return dfl_fpga_cdev_config_port(cdev, release.port_id, true);
+}
+
+static long fme_hdr_ioctl_assign_port(struct dfl_feature_platform_data *pdata,
+				      void __user *arg)
+{
+	struct dfl_fpga_cdev *cdev = pdata->dfl_cdev;
+	struct dfl_fpga_fme_port_assign assign;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct dfl_fpga_fme_port_assign, port_id);
+
+	if (copy_from_user(&assign, arg, minsz))
+		return -EFAULT;
+
+	if (assign.argsz < minsz || assign.flags)
+		return -EINVAL;
+
+	return dfl_fpga_cdev_config_port(cdev, assign.port_id, false);
+}
+
+static long fme_hdr_ioctl(struct platform_device *pdev,
+			  struct dfl_feature *feature,
+			  unsigned int cmd, unsigned long arg)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+
+	switch (cmd) {
+	case DFL_FPGA_FME_PORT_RELEASE:
+		return fme_hdr_ioctl_release_port(pdata, (void __user *)arg);
+	case DFL_FPGA_FME_PORT_ASSIGN:
+		return fme_hdr_ioctl_assign_port(pdata, (void __user *)arg);
+	}
+
+	return -ENODEV;
+}
+
 static const struct dfl_feature_ops fme_hdr_ops = {
 	.init = fme_hdr_init,
 	.uinit = fme_hdr_uinit,
+	.ioctl = fme_hdr_ioctl,
 };
 
 static struct dfl_feature_driver fme_feature_drvs[] = {
diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
index 4b66aaa..308c808 100644
--- a/drivers/fpga/dfl.c
+++ b/drivers/fpga/dfl.c
@@ -231,16 +231,20 @@ void dfl_fpga_port_ops_del(struct dfl_fpga_port_ops *ops)
  */
 int dfl_fpga_check_port_id(struct platform_device *pdev, void *pport_id)
 {
-	struct dfl_fpga_port_ops *port_ops = dfl_fpga_port_ops_get(pdev);
-	int port_id;
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	struct dfl_fpga_port_ops *port_ops;
+
+	if (pdata->id != FEATURE_DEV_ID_UNUSED)
+		return pdata->id == *(int *)pport_id;
 
+	port_ops = dfl_fpga_port_ops_get(pdev);
 	if (!port_ops || !port_ops->get_id)
 		return 0;
 
-	port_id = port_ops->get_id(pdev);
+	pdata->id = port_ops->get_id(pdev);
 	dfl_fpga_port_ops_put(port_ops);
 
-	return port_id == *(int *)pport_id;
+	return pdata->id == *(int *)pport_id;
 }
 EXPORT_SYMBOL_GPL(dfl_fpga_check_port_id);
 
@@ -474,6 +478,7 @@ static int build_info_commit_dev(struct build_feature_devs_info *binfo)
 	pdata->dev = fdev;
 	pdata->num = binfo->feature_num;
 	pdata->dfl_cdev = binfo->cdev;
+	pdata->id = FEATURE_DEV_ID_UNUSED;
 	mutex_init(&pdata->lock);
 	lockdep_set_class_and_name(&pdata->lock, &dfl_pdata_keys[type],
 				   dfl_pdata_key_strings[type]);
@@ -973,25 +978,27 @@ void dfl_fpga_feature_devs_remove(struct dfl_fpga_cdev *cdev)
 {
 	struct dfl_feature_platform_data *pdata, *ptmp;
 
-	remove_feature_devs(cdev);
-
 	mutex_lock(&cdev->lock);
-	if (cdev->fme_dev) {
-		/* the fme should be unregistered. */
-		WARN_ON(device_is_registered(cdev->fme_dev));
+	if (cdev->fme_dev)
 		put_device(cdev->fme_dev);
-	}
 
 	list_for_each_entry_safe(pdata, ptmp, &cdev->port_dev_list, node) {
 		struct platform_device *port_dev = pdata->dev;
 
-		/* the port should be unregistered. */
-		WARN_ON(device_is_registered(&port_dev->dev));
+		/* remove released ports */
+		if (!device_is_registered(&port_dev->dev)) {
+			dfl_id_free(feature_dev_id_type(port_dev),
+				    port_dev->id);
+			platform_device_put(port_dev);
+		}
+
 		list_del(&pdata->node);
 		put_device(&port_dev->dev);
 	}
 	mutex_unlock(&cdev->lock);
 
+	remove_feature_devs(cdev);
+
 	fpga_region_unregister(cdev->region);
 	devm_kfree(cdev->parent, cdev);
 }
@@ -1029,6 +1036,82 @@ struct platform_device *
 }
 EXPORT_SYMBOL_GPL(__dfl_fpga_cdev_find_port);
 
+static int attach_port_dev(struct dfl_fpga_cdev *cdev, u32 port_id)
+{
+	struct platform_device *port_pdev;
+	int ret = -ENODEV;
+
+	mutex_lock(&cdev->lock);
+	port_pdev = __dfl_fpga_cdev_find_port(cdev, &port_id,
+					      dfl_fpga_check_port_id);
+	if (!port_pdev)
+		goto unlock_exit;
+
+	if (device_is_registered(&port_pdev->dev)) {
+		ret = -EBUSY;
+		goto put_dev_exit;
+	}
+
+	ret = platform_device_add(port_pdev);
+	if (ret)
+		goto put_dev_exit;
+
+	dfl_feature_dev_use_end(dev_get_platdata(&port_pdev->dev));
+	cdev->released_port_num--;
+put_dev_exit:
+	put_device(&port_pdev->dev);
+unlock_exit:
+	mutex_unlock(&cdev->lock);
+	return ret;
+}
+
+static int detach_port_dev(struct dfl_fpga_cdev *cdev, u32 port_id)
+{
+	struct platform_device *port_pdev;
+	int ret = -ENODEV;
+
+	mutex_lock(&cdev->lock);
+	port_pdev = __dfl_fpga_cdev_find_port(cdev, &port_id,
+					      dfl_fpga_check_port_id);
+	if (!port_pdev)
+		goto unlock_exit;
+
+	if (!device_is_registered(&port_pdev->dev)) {
+		ret = -EBUSY;
+		goto put_dev_exit;
+	}
+
+	ret = dfl_feature_dev_use_begin(dev_get_platdata(&port_pdev->dev));
+	if (ret)
+		goto put_dev_exit;
+
+	platform_device_del(port_pdev);
+	cdev->released_port_num++;
+put_dev_exit:
+	put_device(&port_pdev->dev);
+unlock_exit:
+	mutex_unlock(&cdev->lock);
+	return ret;
+}
+
+/**
+ * dfl_fpga_cdev_config_port - configure a port feature dev
+ * @cdev: parent container device.
+ * @port_id: id of the port feature device.
+ * @release: release port or assign port back.
+ *
+ * This function allows user to release port platform device or assign it back.
+ * e.g. to safely turn one port from PF into VF for PCI device SRIOV support,
+ * release port platform device is one necessary step.
+ */
+int dfl_fpga_cdev_config_port(struct dfl_fpga_cdev *cdev,
+			      u32 port_id, bool release)
+{
+	return release ? detach_port_dev(cdev, port_id) :
+			 attach_port_dev(cdev, port_id);
+}
+EXPORT_SYMBOL_GPL(dfl_fpga_cdev_config_port);
+
 static int __init dfl_fpga_init(void)
 {
 	int ret;
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 8851c6c..63f39ab 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -183,6 +183,8 @@ struct dfl_feature {
 
 #define DEV_STATUS_IN_USE	0
 
+#define FEATURE_DEV_ID_UNUSED	(-1)
+
 /**
  * struct dfl_feature_platform_data - platform data for feature devices
  *
@@ -191,6 +193,7 @@ struct dfl_feature {
  * @cdev: cdev of feature dev.
  * @dev: ptr to platform device linked with this platform data.
  * @dfl_cdev: ptr to container device.
+ * @id: id used for this feature device.
  * @disable_count: count for port disable.
  * @num: number for sub features.
  * @dev_status: dev status (e.g. DEV_STATUS_IN_USE).
@@ -203,6 +206,7 @@ struct dfl_feature_platform_data {
 	struct cdev cdev;
 	struct platform_device *dev;
 	struct dfl_fpga_cdev *dfl_cdev;
+	int id;
 	unsigned int disable_count;
 	unsigned long dev_status;
 	void *private;
@@ -378,6 +382,7 @@ int dfl_fpga_enum_info_add_dfl(struct dfl_fpga_enum_info *info,
  * @fme_dev: FME feature device under this container device.
  * @lock: mutex lock to protect the port device list.
  * @port_dev_list: list of all port feature devices under this container device.
+ * @released_port_num: released port number under this container device.
  */
 struct dfl_fpga_cdev {
 	struct device *parent;
@@ -385,6 +390,7 @@ struct dfl_fpga_cdev {
 	struct device *fme_dev;
 	struct mutex lock;
 	struct list_head port_dev_list;
+	int released_port_num;
 };
 
 struct dfl_fpga_cdev *
@@ -412,4 +418,8 @@ struct platform_device *
 
 	return pdev;
 }
+
+int dfl_fpga_cdev_config_port(struct dfl_fpga_cdev *cdev,
+			      u32 port_id, bool release);
+
 #endif /* __FPGA_DFL_H */
diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h
index 2e324e5..e9a00e0 100644
--- a/include/uapi/linux/fpga-dfl.h
+++ b/include/uapi/linux/fpga-dfl.h
@@ -176,4 +176,36 @@ struct dfl_fpga_fme_port_pr {
 
 #define DFL_FPGA_FME_PORT_PR	_IO(DFL_FPGA_MAGIC, DFL_FME_BASE + 0)
 
+/**
+ * DFL_FPGA_FME_PORT_RELEASE - _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 1,
+ *					struct dfl_fpga_fme_port_release)
+ *
+ * Driver releases the port per Port ID provided by caller.
+ * Return: 0 on success, -errno on failure.
+ */
+struct dfl_fpga_fme_port_release {
+	/* Input */
+	__u32 argsz;		/* Structure length */
+	__u32 flags;		/* Zero for now */
+	__u32 port_id;
+};
+
+#define DFL_FPGA_FME_PORT_RELEASE	_IO(DFL_FPGA_MAGIC, DFL_FME_BASE + 1)
+
+/**
+ * DFL_FPGA_FME_PORT_ASSIGN - _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 2,
+ *					struct dfl_fpga_fme_port_assign)
+ *
+ * Driver assigns the port back per Port ID provided by caller.
+ * Return: 0 on success, -errno on failure.
+ */
+struct dfl_fpga_fme_port_assign {
+	/* Input */
+	__u32 argsz;		/* Structure length */
+	__u32 flags;		/* Zero for now */
+	__u32 port_id;
+};
+
+#define DFL_FPGA_FME_PORT_ASSIGN	_IO(DFL_FPGA_MAGIC, DFL_FME_BASE + 2)
+
 #endif /* _UAPI_LINUX_FPGA_DFL_H */
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 05/15] Documentation: fpga: dfl: add descriptions for virtualization and new interfaces.
From: Wu Hao @ 2019-06-27  4:44 UTC (permalink / raw)
  To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>

This patch adds virtualization support description for DFL based
FPGA devices (based on PCIe SRIOV), and introductions to new
interfaces added by new dfl private feature drivers.

Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
---
 Documentation/fpga/dfl.txt | 101 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/Documentation/fpga/dfl.txt b/Documentation/fpga/dfl.txt
index 6df4621..a22631f 100644
--- a/Documentation/fpga/dfl.txt
+++ b/Documentation/fpga/dfl.txt
@@ -84,6 +84,8 @@ The following functions are exposed through ioctls:
  Get driver API version (DFL_FPGA_GET_API_VERSION)
  Check for extensions (DFL_FPGA_CHECK_EXTENSION)
  Program bitstream (DFL_FPGA_FME_PORT_PR)
+ Assign port to PF (DFL_FPGA_FME_PORT_ASSIGN)
+ Release port from PF (DFL_FPGA_FME_PORT_RELEASE)
 
 More functions are exposed through sysfs
 (/sys/class/fpga_region/regionX/dfl-fme.n/):
@@ -99,6 +101,10 @@ More functions are exposed through sysfs
      one FPGA device may have more than one port, this sysfs interface indicates
      how many ports the FPGA device has.
 
+ Global error reporting management (errors/)
+     error reporting sysfs interfaces allow user to read errors detected by the
+     hardware, and clear the logged errors.
+
 
 FIU - PORT
 ==========
@@ -139,6 +145,10 @@ More functions are exposed through sysfs:
  Read Accelerator GUID (afu_id)
      afu_id indicates which PR bitstream is programmed to this AFU.
 
+ Error reporting (errors/)
+     error reporting sysfs interfaces allow user to read port/afu errors
+     detected by the hardware, and clear the logged errors.
+
 
 DFL Framework Overview
 ======================
@@ -212,6 +222,97 @@ the compat_id exposed by the target FPGA region. This check is usually done by
 userspace before calling the reconfiguration IOCTL.
 
 
+FPGA virtualization - PCIe SRIOV
+================================
+This section describes the virtualization support on DFL based FPGA device to
+enable accessing an accelerator from applications running in a virtual machine
+(VM). This section only describes the PCIe based FPGA device with SRIOV support.
+
+Features supported by the particular FPGA device are exposed through Device
+Feature Lists, as illustrated below:
+
+  +-------------------------------+  +-------------+
+  |              PF               |  |     VF      |
+  +-------------------------------+  +-------------+
+      ^            ^         ^              ^
+      |            |         |              |
++-----|------------|---------|--------------|-------+
+|     |            |         |              |       |
+|  +-----+     +-------+ +-------+      +-------+   |
+|  | FME |     | Port0 | | Port1 |      | Port2 |   |
+|  +-----+     +-------+ +-------+      +-------+   |
+|                  ^         ^              ^       |
+|                  |         |              |       |
+|              +-------+ +------+       +-------+   |
+|              |  AFU  | |  AFU |       |  AFU  |   |
+|              +-------+ +------+       +-------+   |
+|                                                   |
+|            DFL based FPGA PCIe Device             |
++---------------------------------------------------+
+
+FME is always accessed through the physical function (PF).
+
+Ports (and related AFUs) are accessed via PF by default, but could be exposed
+through virtual function (VF) devices via PCIe SRIOV. Each VF only contains
+1 Port and 1 AFU for isolation. Users could assign individual VFs (accelerators)
+created via PCIe SRIOV interface, to virtual machines.
+
+The driver organization in virtualization case is illustrated below:
+
+  +-------++------++------+             |
+  | FME   || FME  || FME  |             |
+  | FPGA  || FPGA || FPGA |             |
+  |Manager||Bridge||Region|             |
+  +-------++------++------+             |
+  +-----------------------+  +--------+ |             +--------+
+  |          FME          |  |  AFU   | |             |  AFU   |
+  |         Module        |  | Module | |             | Module |
+  +-----------------------+  +--------+ |             +--------+
+        +-----------------------+       |       +-----------------------+
+        | FPGA Container Device |       |       | FPGA Container Device |
+        |  (FPGA Base Region)   |       |       |  (FPGA Base Region)   |
+        +-----------------------+       |       +-----------------------+
+          +------------------+          |         +------------------+
+          | FPGA PCIE Module |          | Virtual | FPGA PCIE Module |
+          +------------------+   Host   | Machine +------------------+
+ -------------------------------------- | ------------------------------
+           +---------------+            |          +---------------+
+           | PCI PF Device |            |          | PCI VF Device |
+           +---------------+            |          +---------------+
+
+FPGA PCIe device driver is always loaded first once a FPGA PCIe PF or VF device
+is detected. It:
+
+	a) finish enumeration on both FPGA PCIe PF and VF device using common
+	   interfaces from DFL framework.
+	b) supports SRIOV.
+
+The FME device driver plays a management role in this driver architecture, it
+provides ioctls to release Port from PF and assign Port to PF. After release
+a port from PF, then it's safe to expose this port through a VF via PCIe SRIOV
+sysfs interface.
+
+To enable accessing an accelerator from applications running in a VM, the
+respective AFU's port needs to be assigned to a VF using the following steps:
+
+	a) The PF owns all AFU ports by default. Any port that needs to be
+	   reassigned to a VF must first be released through the
+	   DFL_FPGA_FME_PORT_RELEASE ioctl on the FME device.
+
+	b) Once N ports are released from PF, then user can use command below
+	   to enable SRIOV and VFs. Each VF owns only one Port with AFU.
+
+	   echo N > $PCI_DEVICE_PATH/sriov_numvfs
+
+	c) Pass through the VFs to VMs
+
+	d) The AFU under VF is accessible from applications in VM (using the
+	   same driver inside the VF).
+
+Note that an FME can't be assigned to a VF, thus PR and other management
+functions are only available via the PF.
+
+
 Device enumeration
 ==================
 This section introduces how applications enumerate the fpga device from
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 04/15] fpga: dfl: fme: support 512bit data width PR
From: Wu Hao @ 2019-06-27  4:44 UTC (permalink / raw)
  To: mdf, linux-fpga, linux-kernel
  Cc: linux-api, yilun.xu, hao.wu, gregkh, atull, Ananda Ravuri
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>

In early partial reconfiguration private feature, it only
supports 32bit data width when writing data to hardware for
PR. 512bit data width PR support is an important optimization
for some specific solutions (e.g. XEON with FPGA integrated),
it allows driver to use AVX512 instruction to improve the
performance of partial reconfiguration. e.g. programming one
100MB bitstream image via this 512bit data width PR hardware
only takes ~300ms, but 32bit revision requires ~3s per test
result.

Please note now this optimization is only done on revision 2
of this PR private feature which is only used in integrated
solution that AVX512 is always supported. This revision 2
hardware doesn't support 32bit PR.

Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
---
v2: check AVX512 support using cpu_feature_enabled()
    fix other comments from Scott Wood <swood@redhat.com>
v3: no change.
v4: no change.
---
 drivers/fpga/dfl-fme-main.c |   3 ++
 drivers/fpga/dfl-fme-mgr.c  | 113 +++++++++++++++++++++++++++++++++++++-------
 drivers/fpga/dfl-fme-pr.c   |  43 +++++++++++------
 drivers/fpga/dfl-fme.h      |   2 +
 drivers/fpga/dfl.h          |   5 ++
 5 files changed, 135 insertions(+), 31 deletions(-)

diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 086ad24..076d74f 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -21,6 +21,8 @@
 #include "dfl.h"
 #include "dfl-fme.h"
 
+#define DRV_VERSION	"0.8"
+
 static ssize_t ports_num_show(struct device *dev,
 			      struct device_attribute *attr, char *buf)
 {
@@ -277,3 +279,4 @@ static int fme_remove(struct platform_device *pdev)
 MODULE_AUTHOR("Intel Corporation");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS("platform:dfl-fme");
+MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/fpga/dfl-fme-mgr.c b/drivers/fpga/dfl-fme-mgr.c
index b3f7eee..d1a4ba5 100644
--- a/drivers/fpga/dfl-fme-mgr.c
+++ b/drivers/fpga/dfl-fme-mgr.c
@@ -22,14 +22,18 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/fpga/fpga-mgr.h>
 
+#include "dfl.h"
 #include "dfl-fme-pr.h"
 
+#define DRV_VERSION	"0.8"
+
 /* FME Partial Reconfiguration Sub Feature Register Set */
 #define FME_PR_DFH		0x0
 #define FME_PR_CTRL		0x8
 #define FME_PR_STS		0x10
 #define FME_PR_DATA		0x18
 #define FME_PR_ERR		0x20
+#define FME_PR_512_DATA		0x40 /* Data Register for 512bit datawidth PR */
 #define FME_PR_INTFC_ID_L	0xA8
 #define FME_PR_INTFC_ID_H	0xB0
 
@@ -67,8 +71,43 @@
 #define PR_WAIT_TIMEOUT   8000000
 #define PR_HOST_STATUS_IDLE	0
 
+#if defined(CONFIG_X86) && defined(CONFIG_AS_AVX512)
+
+#include <linux/cpufeature.h>
+#include <asm/fpu/api.h>
+
+static inline int is_cpu_avx512_enabled(void)
+{
+	return cpu_feature_enabled(X86_FEATURE_AVX512F);
+}
+
+static inline void copy512(const void *src, void __iomem *dst)
+{
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqu64 (%0), %%zmm0;"
+		     "vmovntdq %%zmm0, (%1);"
+		     :
+		     : "r"(src), "r"(dst)
+		     : "memory");
+
+	kernel_fpu_end();
+}
+#else
+static inline int is_cpu_avx512_enabled(void)
+{
+	return 0;
+}
+
+static inline void copy512(const void *src, void __iomem *dst)
+{
+	WARN_ON_ONCE(1);
+}
+#endif
+
 struct fme_mgr_priv {
 	void __iomem *ioaddr;
+	unsigned int pr_datawidth;
 	u64 pr_error;
 };
 
@@ -169,7 +208,7 @@ static int fme_mgr_write(struct fpga_manager *mgr,
 	struct fme_mgr_priv *priv = mgr->priv;
 	void __iomem *fme_pr = priv->ioaddr;
 	u64 pr_ctrl, pr_status, pr_data;
-	int delay = 0, pr_credit, i = 0;
+	int ret = 0, delay = 0, pr_credit;
 
 	dev_dbg(dev, "start request\n");
 
@@ -181,9 +220,9 @@ static int fme_mgr_write(struct fpga_manager *mgr,
 
 	/*
 	 * driver can push data to PR hardware using PR_DATA register once HW
-	 * has enough pr_credit (> 1), pr_credit reduces one for every 32bit
-	 * pr data write to PR_DATA register. If pr_credit <= 1, driver needs
-	 * to wait for enough pr_credit from hardware by polling.
+	 * has enough pr_credit (> 1), pr_credit reduces one for every pr data
+	 * width write to PR_DATA register. If pr_credit <= 1, driver needs to
+	 * wait for enough pr_credit from hardware by polling.
 	 */
 	pr_status = readq(fme_pr + FME_PR_STS);
 	pr_credit = FIELD_GET(FME_PR_STS_PR_CREDIT, pr_status);
@@ -192,7 +231,8 @@ static int fme_mgr_write(struct fpga_manager *mgr,
 		while (pr_credit <= 1) {
 			if (delay++ > PR_WAIT_TIMEOUT) {
 				dev_err(dev, "PR_CREDIT timeout\n");
-				return -ETIMEDOUT;
+				ret = -ETIMEDOUT;
+				goto done;
 			}
 			udelay(1);
 
@@ -200,21 +240,27 @@ static int fme_mgr_write(struct fpga_manager *mgr,
 			pr_credit = FIELD_GET(FME_PR_STS_PR_CREDIT, pr_status);
 		}
 
-		if (count < 4) {
-			dev_err(dev, "Invalid PR bitstream size\n");
-			return -EINVAL;
+		WARN_ON(count < priv->pr_datawidth);
+
+		switch (priv->pr_datawidth) {
+		case 4:
+			pr_data = FIELD_PREP(FME_PR_DATA_PR_DATA_RAW,
+					     *(u32 *)buf);
+			writeq(pr_data, fme_pr + FME_PR_DATA);
+			break;
+		case 64:
+			copy512(buf, fme_pr + FME_PR_512_DATA);
+			break;
+		default:
+			WARN_ON_ONCE(1);
 		}
-
-		pr_data = 0;
-		pr_data |= FIELD_PREP(FME_PR_DATA_PR_DATA_RAW,
-				      *(((u32 *)buf) + i));
-		writeq(pr_data, fme_pr + FME_PR_DATA);
-		count -= 4;
+		buf += priv->pr_datawidth;
+		count -= priv->pr_datawidth;
 		pr_credit--;
-		i++;
 	}
 
-	return 0;
+done:
+	return ret;
 }
 
 static int fme_mgr_write_complete(struct fpga_manager *mgr,
@@ -279,6 +325,36 @@ static void fme_mgr_get_compat_id(void __iomem *fme_pr,
 	id->id_h = readq(fme_pr + FME_PR_INTFC_ID_H);
 }
 
+static u8 fme_mgr_get_pr_datawidth(struct device *dev, void __iomem *fme_pr)
+{
+	u8 revision = dfl_feature_revision(fme_pr);
+
+	if (revision < 2) {
+		/*
+		 * revision 0 and 1 only support 32bit data width partial
+		 * reconfiguration, so pr_datawidth is 4 (Byte).
+		 */
+		return 4;
+	} else if (revision == 2) {
+		/*
+		 * revision 2 hardware has optimization to support 512bit data
+		 * width partial reconfiguration with AVX512 instructions. So
+		 * pr_datawidth is 64 (Byte). As revision 2 hardware is only
+		 * used in integrated solution, CPU supports AVX512 instructions
+		 * for sure, but it still needs to check here as AVX512 could be
+		 * disabled in kernel (e.g. using clearcpuid boot option).
+		 */
+		if (is_cpu_avx512_enabled())
+			return 64;
+
+		dev_err(dev, "revision 2: AVX512 is disabled\n");
+		return 0;
+	}
+
+	dev_err(dev, "revision %d is not supported yet\n", revision);
+	return 0;
+}
+
 static int fme_mgr_probe(struct platform_device *pdev)
 {
 	struct dfl_fme_mgr_pdata *pdata = dev_get_platdata(&pdev->dev);
@@ -302,6 +378,10 @@ static int fme_mgr_probe(struct platform_device *pdev)
 			return PTR_ERR(priv->ioaddr);
 	}
 
+	priv->pr_datawidth = fme_mgr_get_pr_datawidth(dev, priv->ioaddr);
+	if (!priv->pr_datawidth)
+		return -ENODEV;
+
 	compat_id = devm_kzalloc(dev, sizeof(*compat_id), GFP_KERNEL);
 	if (!compat_id)
 		return -ENOMEM;
@@ -342,3 +422,4 @@ static int fme_mgr_remove(struct platform_device *pdev)
 MODULE_AUTHOR("Intel Corporation");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS("platform:dfl-fme-mgr");
+MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/fpga/dfl-fme-pr.c b/drivers/fpga/dfl-fme-pr.c
index 3c71dc3..cd94ba8 100644
--- a/drivers/fpga/dfl-fme-pr.c
+++ b/drivers/fpga/dfl-fme-pr.c
@@ -83,7 +83,7 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 	if (copy_from_user(&port_pr, argp, minsz))
 		return -EFAULT;
 
-	if (port_pr.argsz < minsz || port_pr.flags)
+	if (port_pr.argsz < minsz || port_pr.flags || !port_pr.buffer_size)
 		return -EINVAL;
 
 	/* get fme header region */
@@ -101,15 +101,25 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 		       port_pr.buffer_size))
 		return -EFAULT;
 
+	mutex_lock(&pdata->lock);
+	fme = dfl_fpga_pdata_get_private(pdata);
+	/* fme device has been unregistered. */
+	if (!fme) {
+		ret = -EINVAL;
+		goto unlock_exit;
+	}
+
 	/*
 	 * align PR buffer per PR bandwidth, as HW ignores the extra padding
 	 * data automatically.
 	 */
-	length = ALIGN(port_pr.buffer_size, 4);
+	length = ALIGN(port_pr.buffer_size, fme->pr_datawidth);
 
 	buf = vmalloc(length);
-	if (!buf)
-		return -ENOMEM;
+	if (!buf) {
+		ret = -ENOMEM;
+		goto unlock_exit;
+	}
 
 	if (copy_from_user(buf,
 			   (void __user *)(unsigned long)port_pr.buffer_address,
@@ -127,18 +137,10 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 
 	info->flags |= FPGA_MGR_PARTIAL_RECONFIG;
 
-	mutex_lock(&pdata->lock);
-	fme = dfl_fpga_pdata_get_private(pdata);
-	/* fme device has been unregistered. */
-	if (!fme) {
-		ret = -EINVAL;
-		goto unlock_exit;
-	}
-
 	region = dfl_fme_region_find(fme, port_pr.port_id);
 	if (!region) {
 		ret = -EINVAL;
-		goto unlock_exit;
+		goto free_exit;
 	}
 
 	fpga_image_info_free(region->info);
@@ -159,10 +161,10 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 		fpga_bridges_put(&region->bridge_list);
 
 	put_device(&region->dev);
-unlock_exit:
-	mutex_unlock(&pdata->lock);
 free_exit:
 	vfree(buf);
+unlock_exit:
+	mutex_unlock(&pdata->lock);
 	return ret;
 }
 
@@ -388,6 +390,17 @@ static int pr_mgmt_init(struct platform_device *pdev,
 	mutex_lock(&pdata->lock);
 	priv = dfl_fpga_pdata_get_private(pdata);
 
+	/*
+	 * Initialize PR data width.
+	 * Only revision 2 supports 512bit datawidth for better performance,
+	 * other revisions use default 32bit datawidth. This is used for
+	 * buffer alignment.
+	 */
+	if (dfl_feature_revision(feature->ioaddr) == 2)
+		priv->pr_datawidth = 64;
+	else
+		priv->pr_datawidth = 4;
+
 	/* Initialize the region and bridge sub device list */
 	INIT_LIST_HEAD(&priv->region_list);
 	INIT_LIST_HEAD(&priv->bridge_list);
diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
index 5394a21..de20755 100644
--- a/drivers/fpga/dfl-fme.h
+++ b/drivers/fpga/dfl-fme.h
@@ -21,12 +21,14 @@
 /**
  * struct dfl_fme - dfl fme private data
  *
+ * @pr_datawidth: data width for partial reconfiguration.
  * @mgr: FME's FPGA manager platform device.
  * @region_list: linked list of FME's FPGA regions.
  * @bridge_list: linked list of FME's FPGA bridges.
  * @pdata: fme platform device's pdata.
  */
 struct dfl_fme {
+	int pr_datawidth;
 	struct platform_device *mgr;
 	struct list_head region_list;
 	struct list_head bridge_list;
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index a8b869e..8851c6c 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -331,6 +331,11 @@ static inline bool dfl_feature_is_port(void __iomem *base)
 		(FIELD_GET(DFH_ID, v) == DFH_ID_FIU_PORT);
 }
 
+static inline u8 dfl_feature_revision(void __iomem *base)
+{
+	return (u8)FIELD_GET(DFH_REVISION, readq(base + DFH));
+}
+
 /**
  * struct dfl_fpga_enum_info - DFL FPGA enumeration information
  *
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 03/15] fpga: dfl: fme: align PR buffer size per PR datawidth
From: Wu Hao @ 2019-06-27  4:44 UTC (permalink / raw)
  To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>

Current driver checks if input bitstream file size is aligned or
not per PR data width (default 32bits). It requires one additional
step for end user when they generate the bitstream file, padding
extra zeros to bitstream file to align its size per PR data width,
but they don't have to as hardware will drop extra padding bytes
automatically.

In order to simplify the user steps, this patch aligns PR buffer
size per PR data width in driver, to allow user to pass unaligned
size bitstream files to driver.

Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Acked-by: Moritz Fischer <mdf@kernel.org>
---
 drivers/fpga/dfl-fme-pr.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/fpga/dfl-fme-pr.c b/drivers/fpga/dfl-fme-pr.c
index 6ec0f09..3c71dc3 100644
--- a/drivers/fpga/dfl-fme-pr.c
+++ b/drivers/fpga/dfl-fme-pr.c
@@ -74,6 +74,7 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 	struct dfl_fme *fme;
 	unsigned long minsz;
 	void *buf = NULL;
+	size_t length;
 	int ret = 0;
 	u64 v;
 
@@ -85,9 +86,6 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 	if (port_pr.argsz < minsz || port_pr.flags)
 		return -EINVAL;
 
-	if (!IS_ALIGNED(port_pr.buffer_size, 4))
-		return -EINVAL;
-
 	/* get fme header region */
 	fme_hdr = dfl_get_feature_ioaddr_by_id(&pdev->dev,
 					       FME_FEATURE_ID_HEADER);
@@ -103,7 +101,13 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 		       port_pr.buffer_size))
 		return -EFAULT;
 
-	buf = vmalloc(port_pr.buffer_size);
+	/*
+	 * align PR buffer per PR bandwidth, as HW ignores the extra padding
+	 * data automatically.
+	 */
+	length = ALIGN(port_pr.buffer_size, 4);
+
+	buf = vmalloc(length);
 	if (!buf)
 		return -ENOMEM;
 
@@ -140,7 +144,7 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 	fpga_image_info_free(region->info);
 
 	info->buf = buf;
-	info->count = port_pr.buffer_size;
+	info->count = length;
 	info->region_id = port_pr.port_id;
 	region->info = info;
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 02/15] fpga: dfl: fme: remove copy_to_user() in ioctl for PR
From: Wu Hao @ 2019-06-27  4:44 UTC (permalink / raw)
  To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>

This patch removes copy_to_user() code in partial reconfiguration
ioctl, as it's useless as user never needs to read the data
structure after ioctl.

Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Moritz Fischer <mdf@kernel.org>
Acked-by: Alan Tull <atull@kernel.org>
---
v2: clean up code split from patch 2 in v1 patchset.
v3: no change.
v4: no change.
---
 drivers/fpga/dfl-fme-pr.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/fpga/dfl-fme-pr.c b/drivers/fpga/dfl-fme-pr.c
index d9ca955..6ec0f09 100644
--- a/drivers/fpga/dfl-fme-pr.c
+++ b/drivers/fpga/dfl-fme-pr.c
@@ -159,9 +159,6 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 	mutex_unlock(&pdata->lock);
 free_exit:
 	vfree(buf);
-	if (copy_to_user((void __user *)arg, &port_pr, minsz))
-		return -EFAULT;
-
 	return ret;
 }
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 01/15] fpga: dfl-fme-mgr: fix FME_PR_INTFC_ID register address.
From: Wu Hao @ 2019-06-27  4:44 UTC (permalink / raw)
  To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>

FME_PR_INTFC_ID is used as compat_id for fpga manager and region,
but high 64 bits and low 64 bits of the compat_id are swapped by
mistake. This patch fixes this problem by fixing register address.

Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Acked-by: Moritz Fischer <mdf@kernel.org>
---
 drivers/fpga/dfl-fme-mgr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/fpga/dfl-fme-mgr.c b/drivers/fpga/dfl-fme-mgr.c
index 76f3770..b3f7eee 100644
--- a/drivers/fpga/dfl-fme-mgr.c
+++ b/drivers/fpga/dfl-fme-mgr.c
@@ -30,8 +30,8 @@
 #define FME_PR_STS		0x10
 #define FME_PR_DATA		0x18
 #define FME_PR_ERR		0x20
-#define FME_PR_INTFC_ID_H	0xA8
-#define FME_PR_INTFC_ID_L	0xB0
+#define FME_PR_INTFC_ID_L	0xA8
+#define FME_PR_INTFC_ID_H	0xB0

 /* FME PR Control Register Bitfield */
 #define FME_PR_CTRL_PR_RST	BIT_ULL(0)  /* Reset PR engine */
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 00/15] add new features for FPGA DFL drivers
From: Wu Hao @ 2019-06-27  4:44 UTC (permalink / raw)
  To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull

This patchset adds more features support for FPGA Device Feature List
(DFL) drivers, including PR enhancement, virtualization support based
on PCIe SRIOV, private features of Port, private features of FME, and
enhancement to DFL framework. Please refer to details in below list.

Main changes from v3:
 - split performance reporting support into another patchset for
   better review.

Main changes from v2:
 - move thermal/power management private feature support to another
   patchset, including hwmon patches and related documentation update.
 - update sysfs doc for kernel version and date.
 - replace scnprintf to sprintf for sysfs interfaces.
 - fix comments for performance reporting support. (patch #16)

Main changes from v1:
 - split the clean up code in a separated patch (patch #2)
 - add cpu_feature_enabled check for AVX512 code (patch #4)
 - improve sysfs return values and also sysfs doc (patch #12 #17)
 - create a hwmon for thermal management sysfs interfaces (patch #15)
 - create a hwmon for power management sysfs interfaces (patch #16)
 - update docmentation according to above changes (patch #5)
 - improve sysfs doc for performance reporting support (patch #18)

Wu Hao (15):
  fpga: dfl-fme-mgr: fix FME_PR_INTFC_ID register address.
  fpga: dfl: fme: remove copy_to_user() in ioctl for PR
  fpga: dfl: fme: align PR buffer size per PR datawidth
  fpga: dfl: fme: support 512bit data width PR
  Documentation: fpga: dfl: add descriptions for virtualization and new
    interfaces.
  fpga: dfl: fme: add DFL_FPGA_FME_PORT_RELEASE/ASSIGN ioctl support.
  fpga: dfl: pci: enable SRIOV support.
  fpga: dfl: afu: add AFU state related sysfs interfaces
  fpga: dfl: afu: add userclock sysfs interfaces.
  fpga: dfl: add id_table for dfl private feature driver
  fpga: dfl: afu: export __port_enable/disable function.
  fpga: dfl: afu: add error reporting support.
  fpga: dfl: afu: add STP (SignalTap) support
  fpga: dfl: fme: add capability sysfs interfaces
  fpga: dfl: fme: add global error reporting support

 Documentation/ABI/testing/sysfs-platform-dfl-fme  |  98 ++++++
 Documentation/ABI/testing/sysfs-platform-dfl-port | 104 ++++++
 Documentation/fpga/dfl.txt                        | 101 ++++++
 drivers/fpga/Makefile                             |   3 +-
 drivers/fpga/dfl-afu-error.c                      | 225 +++++++++++++
 drivers/fpga/dfl-afu-main.c                       | 330 ++++++++++++++++++-
 drivers/fpga/dfl-afu.h                            |   7 +
 drivers/fpga/dfl-fme-error.c                      | 385 ++++++++++++++++++++++
 drivers/fpga/dfl-fme-main.c                       | 120 ++++++-
 drivers/fpga/dfl-fme-mgr.c                        | 117 ++++++-
 drivers/fpga/dfl-fme-pr.c                         |  65 ++--
 drivers/fpga/dfl-fme.h                            |   7 +-
 drivers/fpga/dfl-pci.c                            |  40 +++
 drivers/fpga/dfl.c                                | 169 +++++++++-
 drivers/fpga/dfl.h                                |  54 ++-
 include/uapi/linux/fpga-dfl.h                     |  32 ++
 16 files changed, 1777 insertions(+), 80 deletions(-)
 create mode 100644 drivers/fpga/dfl-afu-error.c
 create mode 100644 drivers/fpga/dfl-fme-error.c

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH V34 19/29] Lock down module params that specify hardware parameters (eg. ioport)
From: Daniel Axtens @ 2019-06-27  1:49 UTC (permalink / raw)
  To: Matthew Garrett, jmorris
  Cc: linux-security-module, linux-kernel, linux-api, David Howells,
	Alan Cox, Matthew Garrett
In-Reply-To: <20190622000358.19895-20-matthewgarrett@google.com>

Matthew Garrett <matthewgarrett@google.com> writes:

> From: David Howells <dhowells@redhat.com>
>
> Provided an annotation for module parameters that specify hardware
> parameters (such as io ports, iomem addresses, irqs, dma channels, fixed
> dma buffers and other types).
>
> Suggested-by: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
> Signed-off-by: David Howells <dhowells@redhat.com>
> Signed-off-by: Matthew Garrett <mjg59@google.com>
> ---
>  include/linux/security.h     |  1 +
>  kernel/params.c              | 27 ++++++++++++++++++++++-----
>  security/lockdown/lockdown.c |  1 +
>  3 files changed, 24 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 61e3f4a62d16..88064d7f6827 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -92,6 +92,7 @@ enum lockdown_reason {
>  	LOCKDOWN_ACPI_TABLES,
>  	LOCKDOWN_PCMCIA_CIS,
>  	LOCKDOWN_TIOCSSERIAL,
> +	LOCKDOWN_MODULE_PARAMETERS,
>  	LOCKDOWN_INTEGRITY_MAX,
>  	LOCKDOWN_CONFIDENTIALITY_MAX,
>  };
> diff --git a/kernel/params.c b/kernel/params.c
> index ce89f757e6da..f94fe79e331d 100644
> --- a/kernel/params.c
> +++ b/kernel/params.c
> @@ -24,6 +24,7 @@
>  #include <linux/err.h>
>  #include <linux/slab.h>
>  #include <linux/ctype.h>
> +#include <linux/security.h>
>  
>  #ifdef CONFIG_SYSFS
>  /* Protects all built-in parameters, modules use their own param_lock */
> @@ -108,13 +109,19 @@ bool parameq(const char *a, const char *b)
>  	return parameqn(a, b, strlen(a)+1);
>  }
>  
> -static void param_check_unsafe(const struct kernel_param *kp)
> +static bool param_check_unsafe(const struct kernel_param *kp,
> +			       const char *doing)
>  {
>  	if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
>  		pr_notice("Setting dangerous option %s - tainting kernel\n",
>  			  kp->name);
>  		add_taint(TAINT_USER, LOCKDEP_STILL_OK);
>  	}
> +
> +	if (kp->flags & KERNEL_PARAM_FL_HWPARAM &&
> +	    security_locked_down(LOCKDOWN_MODULE_PARAMETERS))
> +		return false;
> +	return true;
>  }

Should this test occur before tainting the kernel?

Regards,
Daniel

>  
>  static int parse_one(char *param,
> @@ -144,8 +151,10 @@ static int parse_one(char *param,
>  			pr_debug("handling %s with %p\n", param,
>  				params[i].ops->set);
>  			kernel_param_lock(params[i].mod);
> -			param_check_unsafe(&params[i]);
> -			err = params[i].ops->set(val, &params[i]);
> +			if (param_check_unsafe(&params[i], doing))
> +				err = params[i].ops->set(val, &params[i]);
> +			else
> +				err = -EPERM;
>  			kernel_param_unlock(params[i].mod);
>  			return err;
>  		}
> @@ -553,6 +562,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
>  	return count;
>  }
>  
> +#ifdef CONFIG_MODULES
> +#define mod_name(mod) (mod)->name
> +#else
> +#define mod_name(mod) "unknown"
> +#endif
> +
>  /* sysfs always hands a nul-terminated string in buf.  We rely on that. */
>  static ssize_t param_attr_store(struct module_attribute *mattr,
>  				struct module_kobject *mk,
> @@ -565,8 +580,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
>  		return -EPERM;
>  
>  	kernel_param_lock(mk->mod);
> -	param_check_unsafe(attribute->param);
> -	err = attribute->param->ops->set(buf, attribute->param);
> +	if (param_check_unsafe(attribute->param, mod_name(mk->mod)))
> +		err = attribute->param->ops->set(buf, attribute->param);
> +	else
> +		err = -EPERM;
>  	kernel_param_unlock(mk->mod);
>  	if (!err)
>  		return len;
> diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c
> index c89046dc2155..d03c4c296af7 100644
> --- a/security/lockdown/lockdown.c
> +++ b/security/lockdown/lockdown.c
> @@ -28,6 +28,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
>  	[LOCKDOWN_ACPI_TABLES] = "modified ACPI tables",
>  	[LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
>  	[LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
> +	[LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
>  	[LOCKDOWN_INTEGRITY_MAX] = "integrity",
>  	[LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
>  };
> -- 
> 2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply

* Re: [PATCH V33 24/30] bpf: Restrict bpf when kernel lockdown is in confidentiality mode
From: Andy Lutomirski @ 2019-06-27  0:57 UTC (permalink / raw)
  To: James Morris
  Cc: Andy Lutomirski, Matthew Garrett, linux-security, LKML, Linux API,
	David Howells, Alexei Starovoitov, Matthew Garrett,
	Network Development, Chun-Yi Lee, Daniel Borkmann,
	linux-security-module
In-Reply-To: <alpine.LRH.2.21.1906270621080.28132@namei.org>



> On Jun 26, 2019, at 1:22 PM, James Morris <jmorris@namei.org> wrote:
> 
> [Adding the LSM mailing list: missed this patchset initially]
> 
>> On Thu, 20 Jun 2019, Andy Lutomirski wrote:
>> 
>> This patch exemplifies why I don't like this approach:
>> 
>>> @@ -97,6 +97,7 @@ enum lockdown_reason {
>>>        LOCKDOWN_INTEGRITY_MAX,
>>>        LOCKDOWN_KCORE,
>>>        LOCKDOWN_KPROBES,
>>> +       LOCKDOWN_BPF,
>>>        LOCKDOWN_CONFIDENTIALITY_MAX,
>> 
>>> --- a/security/lockdown/lockdown.c
>>> +++ b/security/lockdown/lockdown.c
>>> @@ -33,6 +33,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
>>>        [LOCKDOWN_INTEGRITY_MAX] = "integrity",
>>>        [LOCKDOWN_KCORE] = "/proc/kcore access",
>>>        [LOCKDOWN_KPROBES] = "use of kprobes",
>>> +       [LOCKDOWN_BPF] = "use of bpf",
>>>        [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
>> 
>> The text here says "use of bpf", but what this patch is *really* doing
>> is locking down use of BPF to read kernel memory.  If the details
>> change, then every LSM needs to get updated, and we risk breaking user
>> policies that are based on LSMs that offer excessively fine
>> granularity.
> 
> Can you give an example of how the details might change?
> 
>> I'd be more comfortable if the LSM only got to see "confidentiality"
>> or "integrity".
> 
> These are not sufficient for creating a useful policy for the SELinux 
> case.
> 
> 

I may have misunderstood, but I thought that SELinux mainly needed to allow certain privileged programs to bypass the policy.  Is there a real example of what SELinux wants to do that can’t be done in the simplified model?

The think that specifically makes me uneasy about exposing all of these precise actions to LSMs is that they will get exposed to userspace in a way that forces us to treat them as stable ABIs.

^ permalink raw reply

* Re: [PATCH 00/25] VFS: Introduce filesystem information query syscall [ver #14]
From: Ian Kent @ 2019-06-27  0:38 UTC (permalink / raw)
  To: Christian Brauner
  Cc: David Howells, viro, mszeredi, linux-api, linux-fsdevel,
	linux-kernel
In-Reply-To: <20190626104704.dwjd4urpsmuheirc@brauner.io>

On Wed, 2019-06-26 at 12:47 +0200, Christian Brauner wrote:
> On Wed, Jun 26, 2019 at 06:42:51PM +0800, Ian Kent wrote:
> > On Wed, 2019-06-26 at 12:05 +0200, Christian Brauner wrote:
> > > On Mon, Jun 24, 2019 at 03:08:45PM +0100, David Howells wrote:
> > > > Hi Al,
> > > > 
> > > > Here are a set of patches that adds a syscall, fsinfo(), that allows
> > > > attributes of a filesystem/superblock to be queried.  Attribute values
> > > > are
> > > > of four basic types:
> > > > 
> > > >  (1) Version dependent-length structure (size defined by type).
> > > > 
> > > >  (2) Variable-length string (up to PAGE_SIZE).
> > > > 
> > > >  (3) Array of fixed-length structures (up to INT_MAX size).
> > > > 
> > > >  (4) Opaque blob (up to INT_MAX size).
> > > > 
> > > > Attributes can have multiple values in up to two dimensions and all the
> > > > values of a particular attribute must have the same type.
> > > > 
> > > > Note that the attribute values *are* allowed to vary between dentries
> > > > within a single superblock, depending on the specific dentry that you're
> > > > looking at.
> > > > 
> > > > I've tried to make the interface as light as possible, so integer/enum
> > > > attribute selector rather than string and the core does all the
> > > > allocation
> > > > and extensibility support work rather than leaving that to the
> > > > filesystems.
> > > > That means that for the first two attribute types, sb->s_op->fsinfo()
> > > > may
> > > > assume that the provided buffer is always present and always big enough.
> > > > 
> > > > Further, this removes the possibility of the filesystem gaining access
> > > > to
> > > > the
> > > > userspace buffer.
> > > > 
> > > > 
> > > > fsinfo() allows a variety of information to be retrieved about a
> > > > filesystem
> > > > and the mount topology:
> > > > 
> > > >  (1) General superblock attributes:
> > > > 
> > > >       - The amount of space/free space in a filesystem (as statfs()).
> > > >       - Filesystem identifiers (UUID, volume label, device numbers, ...)
> > > >       - The limits on a filesystem's capabilities
> > > >       - Information on supported statx fields and attributes and IOC
> > > > flags.
> > > >       - A variety single-bit flags indicating supported capabilities.
> > > >       - Timestamp resolution and range.
> > > >       - Sources (as per mount(2), but fsconfig() allows multiple
> > > > sources).
> > > >       - In-filesystem filename format information.
> > > >       - Filesystem parameters ("mount -o xxx"-type things).
> > > >       - LSM parameters (again "mount -o xxx"-type things).
> > > > 
> > > >  (2) Filesystem-specific superblock attributes:
> > > > 
> > > >       - Server names and addresses.
> > > >       - Cell name.
> > > > 
> > > >  (3) Filesystem configuration metadata attributes:
> > > > 
> > > >       - Filesystem parameter type descriptions.
> > > >       - Name -> parameter mappings.
> > > >       - Simple enumeration name -> value mappings.
> > > > 
> > > >  (4) Mount topology:
> > > > 
> > > >       - General information about a mount object.
> > > >       - Mount device name(s).
> > > >       - Children of a mount object and their relative paths.
> > > > 
> > > >  (5) Information about what the fsinfo() syscall itself supports,
> > > > including
> > > >      the number of attibutes supported and the number of capability bits
> > > >      supported.
> > > 
> > > Phew, this patchset is a lot. It's good of course but can we please cut
> > > some of the more advanced features such as querying by mount id,
> > > submounts etc. pp. for now?
> > 
> > Did you mean the "vfs: Allow fsinfo() to look up a mount object by ID"
> > patch?
> > 
> > We would need to be very careful what was dropped.
> 
> Not dropped as in never implement but rather defer it by one merge
> window to give us a) more time to review and settle the interface while
> b) not stalling the overall patch.

Sure, and I'm not saying something like what you recommend shouldn't
be done.

I'm working on user space mount table improvements that I want to
get done ahead of the merge.

Well, I would be but there's still mount-api conversions that need
to be done so that fsinfo() patches don't end up with endless merge
conflicts. The fsinfo() super block method will result in changes
in the same area as the mount-api changes.

The mount-api changes are proving to be a bit of a challenge.

Anyway, the plan is to use the mount table handling improvements to
try and
locate bugs and missing or not quite right functionality.

> 
> > For example, I've found that the patch above is pretty much essential
> > for fsinfo() to be useful from user space.
> 
> Yeah, but that interface is not clearly defined yet as can be seen from
> the commit message and that's what's bothering me most.

Yeah, but updating my cloned branch hasn't been difficult.

There's a certain amount of functionality that I'd like to see
retained for when I get back to the user space development.

Using the notifications changes are something I'm not likely
to get to for quite some time so breaking those out into a
separate branch (like they were not so long ago) would be
more sensible IMHO.

There may be some other bits that David can identify too.

Ian

^ permalink raw reply

* Re: [PATCH V33 24/30] bpf: Restrict bpf when kernel lockdown is in confidentiality mode
From: James Morris @ 2019-06-26 20:22 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Matthew Garrett, linux-security, LKML, Linux API, David Howells,
	Alexei Starovoitov, Matthew Garrett, Network Development,
	Chun-Yi Lee, Daniel Borkmann, linux-security-module
In-Reply-To: <CALCETrVUwQP7roLnW6kFG80Cc5U6X_T6AW+BTAftLccYGp8+Ow@mail.gmail.com>

[Adding the LSM mailing list: missed this patchset initially]

On Thu, 20 Jun 2019, Andy Lutomirski wrote:

> This patch exemplifies why I don't like this approach:
> 
> > @@ -97,6 +97,7 @@ enum lockdown_reason {
> >         LOCKDOWN_INTEGRITY_MAX,
> >         LOCKDOWN_KCORE,
> >         LOCKDOWN_KPROBES,
> > +       LOCKDOWN_BPF,
> >         LOCKDOWN_CONFIDENTIALITY_MAX,
> 
> > --- a/security/lockdown/lockdown.c
> > +++ b/security/lockdown/lockdown.c
> > @@ -33,6 +33,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
> >         [LOCKDOWN_INTEGRITY_MAX] = "integrity",
> >         [LOCKDOWN_KCORE] = "/proc/kcore access",
> >         [LOCKDOWN_KPROBES] = "use of kprobes",
> > +       [LOCKDOWN_BPF] = "use of bpf",
> >         [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
> 
> The text here says "use of bpf", but what this patch is *really* doing
> is locking down use of BPF to read kernel memory.  If the details
> change, then every LSM needs to get updated, and we risk breaking user
> policies that are based on LSMs that offer excessively fine
> granularity.

Can you give an example of how the details might change?

> I'd be more comfortable if the LSM only got to see "confidentiality"
> or "integrity".

These are not sufficient for creating a useful policy for the SELinux 
case.

-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply

* Re: [PATCH V33 29/30] tracefs: Restrict tracefs when the kernel is locked down
From: Matthew Garrett @ 2019-06-26 19:39 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: James Morris, linux-security, Linux Kernel Mailing List,
	Linux API
In-Reply-To: <20190626090748.23eba868@gandalf.local.home>

On Wed, Jun 26, 2019 at 6:07 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 20 Jun 2019 18:19:40 -0700
> Matthew Garrett <matthewgarrett@google.com> wrote:
> > +static const struct file_operations tracefs_proxy_file_operations = {
> > +     .read =         default_read_file,
> > +     .write =        default_write_file,
> > +     .open =         default_open_file,
> > +     .llseek =       noop_llseek,
> > +};
>
> This appears to be unused.

Oops, yup - dropped.

> > +     dentry->d_fsdata = fops ? (void *)fops :
> > +             (void *)&tracefs_file_operations;
> > +     memcpy(proxy_fops, dentry->d_fsdata, sizeof(struct file_operations));
> > +     proxy_fops->open = default_open_file;
> >       inode->i_mode = mode;
> > -     inode->i_fop = fops ? fops : &tracefs_file_operations;
> > +     inode->i_fop = proxy_fops;
>
>
> I think the above would look cleaner as:
>
>
>         if (!fops)
>                 fops = &tracefs_file_operations;
>
>         dentry->d_fsdata = (void *)fops;
>         memcpy(proxy_fops, fops, sizeof(*proxy_fops);
>         proxy_fops->open = default_open_file;

ACK.

^ permalink raw reply

* Re: [PATCH v5 16/16] f2fs: add fs-verity support
From: Eric Biggers @ 2019-06-26 18:21 UTC (permalink / raw)
  To: Chao Yu
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fscrypt, linux-fsdevel, Jaegeuk Kim,
	linux-integrity, linux-ext4, Linus Torvalds, Christoph Hellwig,
	Victor Hsieh
In-Reply-To: <68c5a15f-f6a8-75e2-b485-0f1b51471995@huawei.com>

On Wed, Jun 26, 2019 at 03:34:35PM +0800, Chao Yu wrote:
> >>> +	err = f2fs_convert_inline_inode(inode);
> >>> +	if (err)
> >>> +		return err;
> >>> +
> >>> +	err = dquot_initialize(inode);
> >>> +	if (err)
> >>> +		return err;
> >>
> >> We can get rid of dquot_initialize() here, since f2fs_file_open() ->
> >> dquot_file_open() should has initialized quota entry previously, right?
> > 
> > We still need it because dquot_file_open() only calls dquot_initialize() if the
> > file is being opened for writing.  But here the file descriptor is readonly.
> > I'll add a comment explaining this here and in the ext4 equivalent.
> 
> Ah, you're right.
> 
> f2fs_convert_inline_inode() may grab one more block during conversion, so we
> need to call dquot_initialize() before inline conversion?
> 
> Thanks,
> 

Good point.  I'll fix that here and in ext4.

- Eric

^ permalink raw reply

* Re: [RFC PATCH 1/1] Revert "rseq/selftests: arm: use udf instruction for RSEQ_SIG"
From: Will Deacon @ 2019-06-26 18:00 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Will Deacon, shuah, linux-kernel, Peter Zijlstra, Thomas Gleixner,
	Joel Fernandes, Catalin Marinas, Dave Watson, Andi Kleen,
	linux-kselftest, H. Peter Anvin, Chris Lameter, Russell King,
	Michael Kerrisk, Paul E . McKenney, Paul Turner, Boqun Feng,
	Josh Triplett, rostedt, Ben Maurer, linux-api <linux-api>
In-Reply-To: <795143697.722.1561471732756.JavaMail.zimbra@efficios.com>

On Tue, Jun 25, 2019 at 10:08:52AM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 25, 2019, at 5:15 AM, Will Deacon will.deacon@arm.com wrote:
> > On Mon, Jun 24, 2019 at 02:20:26PM -0400, Mathieu Desnoyers wrote:
> >> ----- On Jun 24, 2019, at 1:24 PM, Will Deacon will.deacon@arm.com wrote:
> >> > On Mon, Jun 17, 2019 at 05:23:04PM +0200, Mathieu Desnoyers wrote:
> >> >> -#define RSEQ_SIG_CODE	0xe7f5def3
> >> >> -
> >> >> -#ifndef __ASSEMBLER__
> >> >> -
> >> >> -#define RSEQ_SIG_DATA							\
> >> >> -	({								\
> >> >> -		int sig;						\
> >> >> -		asm volatile ("b 2f\n\t"				\
> >> >> -			      "1: .inst " __rseq_str(RSEQ_SIG_CODE) "\n\t" \
> >> >> -			      "2:\n\t"					\
> >> >> -			      "ldr %[sig], 1b\n\t"			\
> >> >> -			      : [sig] "=r" (sig));			\
> >> >> -		sig;							\
> >> >> -	})
> >> >> -
> >> >> -#define RSEQ_SIG	RSEQ_SIG_DATA
> >> >> -
> >> >> -#endif
> >> >> +#define RSEQ_SIG	0x53053053
> >> > 
> >> > I don't get why you're reverting back to this old signature value, when the
> >> > one we came up with will work well when interpreted as an instruction in the
> >> > *vast* majority of scenarios that people care about (A32/T32 little-endian).
> >> > I think you might be under-estimating just how dead things like BE32 really
> >> > are.
> >> 
> >> My issue is that the current .instr approach is broken for programs or functions
> >> built in Thumb mode, and I received no feedback on the solutions I proposed for
> >> those issues, which led me to propose a patch reverting to a simple .word.
> > 
> > I understand why you're moving from .inst to .word, but I don't understand
> > why that necessitates a change in the value. Why not .word 0xe7f5def3 ? You
> > could also flip the bytes around in case of big-endian, which would keep the
> > instruction coding clean for BE8.
> 
> As long as we state and document that this should not be expected to generate
> valid instructions on big endian prior to ARMv6, I'm OK with that approach, e.g.:
> 
> /*
>  * - ARM little endian
>  *
>  * RSEQ_SIG uses the udf A32 instruction with an uncommon immediate operand
>  * value 0x5de3. This traps if user-space reaches this instruction by mistake,
>  * and the uncommon operand ensures the kernel does not move the instruction
>  * pointer to attacker-controlled code on rseq abort.
>  *
>  * The instruction pattern in the A32 instruction set is:
>  *
>  * e7f5def3    udf    #24035    ; 0x5de3
>  *
>  * This translates to the following instruction pattern in the T16 instruction
>  * set:
>  *
>  * little endian:
>  * def3        udf    #243      ; 0xf3
>  * e7f5        b.n    <7f5>
>  *
>  * - ARMv6+ big endian:

Maybe mention "(BE8)" here...

>  *
>  * ARMv6+ -mbig-endian generates mixed endianness code vs data: little-endian
>  * code and big-endian data. The data value of the signature needs to have its
>  * byte order reversed to generate the trap instruction:
>  *
>  * Data: 0xf3def5e7
>  *
>  * Translates to this A32 instruction pattern:
>  *
>  * e7f5def3    udf    #24035    ; 0x5de3
>  *
>  * Translates to this T16 instruction pattern:
>  *
>  * def3        udf    #243      ; 0xf3
>  * e7f5        b.n    <7f5>
>  *
>  * - Prior to ARMv6 big endian:

... and "(BE32)" here.

With that, this looks fine to me.

Will

^ permalink raw reply

* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Yu-cheng Yu @ 2019-06-26 17:30 UTC (permalink / raw)
  To: Andy Lutomirski, Dave Martin
  Cc: X86 ML, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, LKML,
	open list:DOCUMENTATION, Linux-MM, linux-arch, Linux API,
	Arnd Bergmann, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz,
	Nadav Amit <nadav.ami>
In-Reply-To: <CALCETrVZCzh+KFCF6ijuf4QEPn=R2gJ8FHLpyFd=n+pNOMMMjA@mail.gmail.com>

On Wed, 2019-06-26 at 10:14 -0700, Andy Lutomirski wrote:
> On Thu, May 2, 2019 at 4:10 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > 
> > On Wed, May 01, 2019 at 02:12:17PM -0700, Yu-cheng Yu wrote:
> > > An ELF file's .note.gnu.property indicates features the executable file
> > > can support.  For example, the property GNU_PROPERTY_X86_FEATURE_1_AND
> > > indicates the file supports GNU_PROPERTY_X86_FEATURE_1_IBT and/or
> > > GNU_PROPERTY_X86_FEATURE_1_SHSTK.
> > > 
[...]
> 
> Where did PT_GNU_PROPERTY come from?  Are there actual docs for it?
> Can someone here tell us what the actual semantics of this new ELF
> thingy are?  From some searching, it seems like it's kind of an ELF
> note but kind of not.  An actual description would be fantastic.
> 
> Also, I don't think there's any actual requirement that the upstream
> kernel recognize existing CET-enabled RHEL 8 binaries as being
> CET-enabled.  I tend to think that RHEL 8 jumped the gun here.  While
> the upstream kernel should make some reasonble effort to make sure
> that RHEL 8 binaries will continue to run, I don't see why we need to
> go out of our way to keep the full set of mitigations available for
> binaries that were developed against a non-upstream kernel.
> 
> In fact, if we handle the legacy bitmap differently from RHEL 8, we
> may *have* to make sure that we don't recognize existing RHEL 8
> binaries as CET-enabled.

We have worked out the issue.  Linux will look at only PT_GNU_PROPERTY, which is
a shortcut pointing directly to .note.gnu.property.  I have an updated patch,
and will send it out (although it is not yet perfect).

The Linux gABI extension draft is here: https://github.com/hjl-tools/linux-abi/w
iki/linux-abi-draft.pdf.

Yu-cheng

^ permalink raw reply

* Re: Detecting the availability of VSYSCALL
From: Andy Lutomirski @ 2019-06-26 17:14 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Andy Lutomirski, Thomas Gleixner, Linux API, Kernel Hardening,
	linux-x86_64, linux-arch, Kees Cook, Carlos O'Donell, X86 ML
In-Reply-To: <87ef3g1do3.fsf@oldenburg2.str.redhat.com>

On Wed, Jun 26, 2019 at 10:04 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Andy Lutomirski:
>
> > On Wed, Jun 26, 2019 at 9:45 AM Florian Weimer <fweimer@redhat.com> wrote:
> >>
> >> * Andy Lutomirski:
> >>
> >> > Can’t an ELF note be done with some more or less ordinary asm such
> >> > that any link editor will insert it correctly?
> >>
> >> We've just been over this for the CET enablement.  ELF PT_NOTE parsing
> >> was rejected there.
> >
> > No one told me this.  Unless I missed something, the latest kernel
> > patches still had PT_NOTE parsing.  Can you point me at an
> > enlightening thread or explain what happened?
>
> The ABI was changed rather late, and PT_GNU_PROPERTY has been added.
> But this is okay because the kernel only looks at the dynamic loader,
> which we can update fairly easily.

Ugh.  I replied there.  I don't consider any of that to have much
bearing on what we do for vsyscalls.  That being said, the
PT_GNU_PROPERTY thing sounds like maybe we could use it for a bit
saying "no vsyscalls needed".

>
> The thread is:
>
> Subject: Re: [PATCH v7 22/27] binfmt_elf: Extract .note.gnu.property from an ELF file
>
> <87blyu7ubf.fsf@oldenburg2.str.redhat.com> is a message reference in it.
>
> >> > The problem with a personality flag is that it needs to have some kind
> >> > of sensible behavior for setuid programs, and getting that right in a
> >> > way that doesn’t scream “exploit me” while preserving useful
> >> > compatibility may be tricky.
> >>
> >> Are restrictive personality flags still a problem with user namespaces?
> >> I think it would be fine to restrict this one to CAP_SYS_ADMIN.
> >
> > We could possibly get away with this, but now we're introducing a
> > whole new mechanism.  I'd rather just add proper per-namespace
> > sysctls, but this is a pretty big hammer.
>
> Oh, I wasn't aware of that.  I thought that this already existed in some
> form, e.g. prctl with PR_SET_SECCOMP requiring CAP_SYS_ADMIN unless
> PR_SET_NO_NEW_PRIVS was active as well.

We do have that, but I don't think we have it for personality.  The
whole personality mechanism scares me a bit due to a lack of this type
of thing, and I'd want to review it carefully before adding a new
personality bit.


--Andy

^ permalink raw reply

* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Andy Lutomirski @ 2019-06-26 17:14 UTC (permalink / raw)
  To: Dave Martin
  Cc: Yu-cheng Yu, X86 ML, H. Peter Anvin, Thomas Gleixner, Ingo Molnar,
	LKML, open list:DOCUMENTATION, Linux-MM, linux-arch, Linux API,
	Arnd Bergmann, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz <mike.krave>
In-Reply-To: <20190502111003.GO3567@e103592.cambridge.arm.com>

On Thu, May 2, 2019 at 4:10 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, May 01, 2019 at 02:12:17PM -0700, Yu-cheng Yu wrote:
> > An ELF file's .note.gnu.property indicates features the executable file
> > can support.  For example, the property GNU_PROPERTY_X86_FEATURE_1_AND
> > indicates the file supports GNU_PROPERTY_X86_FEATURE_1_IBT and/or
> > GNU_PROPERTY_X86_FEATURE_1_SHSTK.
> >
> > This patch was part of the Control-flow Enforcement series; the original
> > patch is here: https://lkml.org/lkml/2018/11/20/205.  Dave Martin responded
> > that ARM recently introduced new features to NT_GNU_PROPERTY_TYPE_0 with
> > properties closely modelled on GNU_PROPERTY_X86_FEATURE_1_AND, and it is
> > logical to split out the generic part.  Here it is.
> >
> > With this patch, if an arch needs to setup features from ELF properties,
> > it needs CONFIG_ARCH_USE_GNU_PROPERTY to be set, and a specific
> > arch_setup_property().
> >
> > For example, for X86_64:
> >
> > int arch_setup_property(void *ehdr, void *phdr, struct file *f, bool inter)
> > {
> >       int r;
> >       uint32_t property;
> >
> >       r = get_gnu_property(ehdr, phdr, f, GNU_PROPERTY_X86_FEATURE_1_AND,
> >                            &property);
> >       ...
> > }
>
> Thanks, this is timely for me.  I should be able to build the needed
> arm64 support pretty quickly around this now.
>
> [Cc'ing libc-alpha for the elf.h question -- see (2)]
>
>
> A couple of questions before I look in more detail:
>
> 1) Can we rely on PT_GNU_PROPERTY being present in the phdrs to describe
> the NT_GNU_PROPERTY_TYPE_0 note?  If so, we can avoid trying to parse
> irrelevant PT_NOTE segments.
>
>
> 2) Are there standard types for things like the program property header?
> If not, can we add something in elf.h?  We should try to coordinate with
> libc on that.  Something like
>

Where did PT_GNU_PROPERTY come from?  Are there actual docs for it?
Can someone here tell us what the actual semantics of this new ELF
thingy are?  From some searching, it seems like it's kind of an ELF
note but kind of not.  An actual description would be fantastic.

Also, I don't think there's any actual requirement that the upstream
kernel recognize existing CET-enabled RHEL 8 binaries as being
CET-enabled.  I tend to think that RHEL 8 jumped the gun here.  While
the upstream kernel should make some reasonble effort to make sure
that RHEL 8 binaries will continue to run, I don't see why we need to
go out of our way to keep the full set of mitigations available for
binaries that were developed against a non-upstream kernel.

In fact, if we handle the legacy bitmap differently from RHEL 8, we
may *have* to make sure that we don't recognize existing RHEL 8
binaries as CET-enabled.

Sigh.

^ permalink raw reply

* Re: Detecting the availability of VSYSCALL
From: Florian Weimer @ 2019-06-26 17:04 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Linux API, Kernel Hardening, linux-x86_64,
	linux-arch, Kees Cook, Carlos O'Donell, X86 ML
In-Reply-To: <CALCETrUG9yHf4D_fDEj054Bgo4zXpmK5UzME9mKNqD70U7vy5Q@mail.gmail.com>

* Andy Lutomirski:

> On Wed, Jun 26, 2019 at 9:45 AM Florian Weimer <fweimer@redhat.com> wrote:
>>
>> * Andy Lutomirski:
>>
>> > Can’t an ELF note be done with some more or less ordinary asm such
>> > that any link editor will insert it correctly?
>>
>> We've just been over this for the CET enablement.  ELF PT_NOTE parsing
>> was rejected there.
>
> No one told me this.  Unless I missed something, the latest kernel
> patches still had PT_NOTE parsing.  Can you point me at an
> enlightening thread or explain what happened?

The ABI was changed rather late, and PT_GNU_PROPERTY has been added.
But this is okay because the kernel only looks at the dynamic loader,
which we can update fairly easily.

The thread is:

Subject: Re: [PATCH v7 22/27] binfmt_elf: Extract .note.gnu.property from an ELF file

<87blyu7ubf.fsf@oldenburg2.str.redhat.com> is a message reference in it.

>> > The problem with a personality flag is that it needs to have some kind
>> > of sensible behavior for setuid programs, and getting that right in a
>> > way that doesn’t scream “exploit me” while preserving useful
>> > compatibility may be tricky.
>>
>> Are restrictive personality flags still a problem with user namespaces?
>> I think it would be fine to restrict this one to CAP_SYS_ADMIN.
>
> We could possibly get away with this, but now we're introducing a
> whole new mechanism.  I'd rather just add proper per-namespace
> sysctls, but this is a pretty big hammer.

Oh, I wasn't aware of that.  I thought that this already existed in some
form, e.g. prctl with PR_SET_SECCOMP requiring CAP_SYS_ADMIN unless
PR_SET_NO_NEW_PRIVS was active as well.

Thanks,
Florian

^ permalink raw reply

* Re: Detecting the availability of VSYSCALL
From: Andy Lutomirski @ 2019-06-26 16:52 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Andy Lutomirski, Thomas Gleixner, Linux API, Kernel Hardening,
	linux-x86_64, linux-arch, Kees Cook, Carlos O'Donell, X86 ML
In-Reply-To: <87sgrw1ejv.fsf@oldenburg2.str.redhat.com>

On Wed, Jun 26, 2019 at 9:45 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Andy Lutomirski:
>
> > Can’t an ELF note be done with some more or less ordinary asm such
> > that any link editor will insert it correctly?
>
> We've just been over this for the CET enablement.  ELF PT_NOTE parsing
> was rejected there.

No one told me this.  Unless I missed something, the latest kernel
patches still had PT_NOTE parsing.  Can you point me at an
enlightening thread or explain what happened?

> > The problem with a personality flag is that it needs to have some kind
> > of sensible behavior for setuid programs, and getting that right in a
> > way that doesn’t scream “exploit me” while preserving useful
> > compatibility may be tricky.
>
> Are restrictive personality flags still a problem with user namespaces?
> I think it would be fine to restrict this one to CAP_SYS_ADMIN.

We could possibly get away with this, but now we're introducing a
whole new mechanism.  I'd rather just add proper per-namespace
sysctls, but this is a pretty big hammer.

^ permalink raw reply

* Re: Detecting the availability of VSYSCALL
From: Florian Weimer @ 2019-06-26 16:45 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Thomas Gleixner, Linux API, Kernel Hardening,
	linux-x86_64, linux-arch, Kees Cook, Carlos O'Donell, X86 ML
In-Reply-To: <6CECE9DE-51AB-4A21-A257-8B85C4C94EB0@amacapital.net>

* Andy Lutomirski:

> Can’t an ELF note be done with some more or less ordinary asm such
> that any link editor will insert it correctly?

We've just been over this for the CET enablement.  ELF PT_NOTE parsing
was rejected there.

I don't think binutils ld has a way to set an ELF program header it
doesn't know anything about.

>>> Would enterprise distros consider backporting such a thing?
>> 
>> Enterprise distros aren't the problem here because they can't remove
>> vsyscall support for quite a while due to existing customer binaries.
>> For them, it would just be an additional (and welcome) hardening
>> opportunity.
>> 
>> The challenge here are container hosting platforms which have already
>> disabled vsyscall, presumably to protect the container host itself.
>> They would need to rebuild the container host userspace with the markup
>> to keep it protected, and then they could switch to a kernel which has
>> vsyscall-unless-opt-out logic.  That seems to be a bit of a stretch
>> because from their perspective, there's no problem today.
>> 
>> My guess is that it would be easier to have a personality flag.  Then
>> they could keep the host largely as-is, and would “only” need a
>> mechanism to pass through the flag from the image metadata to the actual
>> container creation.  It's still a change to the container host (and the
>> kernel change is required as well), but it would not require relinking
>> every statically linked binary.

> The problem with a personality flag is that it needs to have some kind
> of sensible behavior for setuid programs, and getting that right in a
> way that doesn’t scream “exploit me” while preserving useful
> compatibility may be tricky.

Are restrictive personality flags still a problem with user namespaces?
I think it would be fine to restrict this one to CAP_SYS_ADMIN.

Thanks,
Florian

^ permalink raw reply

* Re: Detecting the availability of VSYSCALL
From: Andy Lutomirski @ 2019-06-26 16:24 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Andy Lutomirski, Thomas Gleixner, Linux API, Kernel Hardening,
	linux-x86_64, linux-arch, Kees Cook, Carlos O'Donell, X86 ML
In-Reply-To: <87a7e4jr4s.fsf@oldenburg2.str.redhat.com>


> On Jun 26, 2019, at 8:36 AM, Florian Weimer <fweimer@redhat.com> wrote:
> 
> * Andy Lutomirski:
> 
>> I’m wondering if we can still do it: add a note or other ELF indicator
>> that says “I don’t need vsyscalls.”  Then we can change the default
>> mode to “no vsyscalls if the flag is there, else execute-only
>> vsyscalls”.
>> 
>> Would glibc go along with this?
> 
> I think we can make it happen, at least for relatively recent glibc
> linked with current binutils.  It's not trivial because it requires
> coordination among multiple projects.  We have three or four widely used
> link editors now, but we could make it happen.  (Although getting to
> PT_GNU_PROPERTY wasn't exactly easy.)

Can’t an ELF note be done with some more or less ordinary asm such that any link editor will insert it correctly?

> 
>> Would enterprise distros consider backporting such a thing?
> 
> Enterprise distros aren't the problem here because they can't remove
> vsyscall support for quite a while due to existing customer binaries.
> For them, it would just be an additional (and welcome) hardening
> opportunity.
> 
> The challenge here are container hosting platforms which have already
> disabled vsyscall, presumably to protect the container host itself.
> They would need to rebuild the container host userspace with the markup
> to keep it protected, and then they could switch to a kernel which has
> vsyscall-unless-opt-out logic.  That seems to be a bit of a stretch
> because from their perspective, there's no problem today.
> 
> My guess is that it would be easier to have a personality flag.  Then
> they could keep the host largely as-is, and would “only” need a
> mechanism to pass through the flag from the image metadata to the actual
> container creation.  It's still a change to the container host (and the
> kernel change is required as well), but it would not require relinking
> every statically linked binary.
> 
> 

The problem with a personality flag is that it needs to have some kind of sensible behavior for setuid programs, and getting that right in a way that doesn’t scream “exploit me” while preserving useful compatibility may be tricky.

^ permalink raw reply

* Re: New glibc-testfail io/tst-copy_file_range with kernel-next.
From: Amir Goldstein @ 2019-06-26 15:38 UTC (permalink / raw)
  To: Stefan Liebler
  Cc: GNU C Library, Linux API, linux-fsdevel, Darrick J. Wong,
	Dave Chinner
In-Reply-To: <987759a8-0b0b-f74e-4a0e-b3570d9a888f@linux.ibm.com>

On Wed, Jun 26, 2019 at 4:57 PM Stefan Liebler <stli@linux.ibm.com> wrote:
>
> Hi,
>
> as information, I had the chance to run the glibc-testsuite on a
> kernel-next from today on s390x and recognized a new failing test:
> io/tst-copy_file_range

Thanks for the detailed report!

[cc: linux-api and linux-fsdevel]

>
> It seems as the patches from Amir Golstein are changing the behaviour of
> copy_file_range. See two of the series:
> -"vfs: allow copy_file_range to copy across devices"
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=5dae222a5ff0c269730393018a5539cc970a4726
> -"vfs: add missing checks to copy_file_range"
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=96e6e8f4a68df2d94800311163faa67124df24e5
>
> A quick look into the verbose output (see attached file) shows at least
> the following changes:
>
> - writing at a position past the maximum allowed offset with "write"
> fails with EINVAL (22) whereas "copy_file_range" is now returning EFBIG
> (27). The test assumes that the same errno is set for "write" and
> "copy_file_range". (See <glibc>/io/tst-copy_file_range.c in
> delayed_write_failure_beginning() with current_size=1 or the second copy
> in delayed_write_failure_end())
> According to http://man7.org/linux/man-pages/man2/copy_file_range.2.html
> and http://man7.org/linux/man-pages/man2/write.2.html EFBIG seems to be
> the correct error.
> Should "write" also return EFBIG in those cases?

I'm not sure.
I think it makes sense for copy_file_range()  to behave very similarly to
"read"+"write" that is what I was aiming for. However, copy_file_range()
can be called with or without pos/offset. When called with offset, it would
be better to try and match its behavior with pread/pwrite. Note the EINVAL
case in http://man7.org/linux/man-pages/man2/pwritev.2.html
when offset+len overflows ssize_t.

Also, please see planned changes to copy_file_range() man page:
https://github.com/amir73il/man-pages/commit/ef24cb90797552eb0a2c55a1fb7f2c275e3b1bdb

>
> - For delayed_write_failure_beginning() with current_size>=2
> copy_file_range is started at a valid offset, but the length is beyond a
> valid range. copy_file_range is now copying the "valid" bytes and
> returning the number of copied bytes. The old behaviour was to return
> EINVAL without copying anything.
> In find_maximum_offset() a test sets maximum_offset_hard_limit to
> true/false depending on the behaviour of "write" in such a situation
> and the tests in delayed_write_failure_beginning() are assuming that
> "copy_file_range" behaves like "write".
> Should "write" perform the same partial copies as "copy_file_range" or
> how to determine the setting of maximum_offset_hard_limit?
>
> - In cross_device_failure it is assumed that copy_file_range always
> fails with EXDEV. Amirs patches are now allowing this case if possible.
> How could the testcase determine if the kernel supports cross device
> copies or not?
>
>

Florian's response:

> There's been a previous thread here:
>
>  <https://sourceware.org/ml/libc-alpha/2019-06/msg00039.html>
>
> I can back out the emulation, as discussed, then there wouldn't be
> anything left to test in theory (although I think our tests caught one
> or two bugs in XFS backports downstream, that as before fstests covered
> copy_file_range).

I agree that sounds like the best option.

Thanks,
Amir.

^ permalink raw reply

* Re: Detecting the availability of VSYSCALL
From: Florian Weimer @ 2019-06-26 15:36 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Thomas Gleixner, Linux API, Kernel Hardening,
	linux-x86_64, linux-arch, Kees Cook, Carlos O'Donell, X86 ML
In-Reply-To: <534B9F63-E949-4CF5-ACAC-71381190846F@amacapital.net>

* Andy Lutomirski:

> I’m wondering if we can still do it: add a note or other ELF indicator
> that says “I don’t need vsyscalls.”  Then we can change the default
> mode to “no vsyscalls if the flag is there, else execute-only
> vsyscalls”.
>
> Would glibc go along with this?

I think we can make it happen, at least for relatively recent glibc
linked with current binutils.  It's not trivial because it requires
coordination among multiple projects.  We have three or four widely used
link editors now, but we could make it happen.  (Although getting to
PT_GNU_PROPERTY wasn't exactly easy.)

> Would enterprise distros consider backporting such a thing?

Enterprise distros aren't the problem here because they can't remove
vsyscall support for quite a while due to existing customer binaries.
For them, it would just be an additional (and welcome) hardening
opportunity.

The challenge here are container hosting platforms which have already
disabled vsyscall, presumably to protect the container host itself.
They would need to rebuild the container host userspace with the markup
to keep it protected, and then they could switch to a kernel which has
vsyscall-unless-opt-out logic.  That seems to be a bit of a stretch
because from their perspective, there's no problem today.

My guess is that it would be easier to have a personality flag.  Then
they could keep the host largely as-is, and would “only” need a
mechanism to pass through the flag from the image metadata to the actual
container creation.  It's still a change to the container host (and the
kernel change is required as well), but it would not require relinking
every statically linked binary.

Thanks,
Florian

^ permalink raw reply

* Re: Detecting the availability of VSYSCALL
From: Andy Lutomirski @ 2019-06-26 15:21 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Andy Lutomirski, Thomas Gleixner, Linux API, Kernel Hardening,
	linux-x86_64, linux-arch, Kees Cook, Carlos O'Donell, X86 ML
In-Reply-To: <87r27gjss3.fsf@oldenburg2.str.redhat.com>



> On Jun 26, 2019, at 8:00 AM, Florian Weimer <fweimer@redhat.com> wrote:
> 
> * Andy Lutomirski:
> 
>> I didn’t add a flag because the vsyscall page was thoroughly obsolete
>> when all this happened, and I wanted to encourage all new code to just
>> parse the vDSO instead of piling on the hacks.
> 
> It turned out that the thorny cases just switched to system calls
> instead.  I think we finally completed the transition in glibc upstream
> in 2018 (for x86).
> 
>> Anyway, you may be the right person to ask: is there some credible way
>> that the kernel could detect new binaries that don’t need vsyscalls?
>> Maybe a new ELF note on a static binary or on the ELF interpreter? We
>> can dynamically switch it in principle.
> 
> For this kind of change, markup similar to PT_GNU_STACK would have been
> appropriate, I think: Old kernels and loaders would have ignored the
> program header and loaded the program anyway, but the vsyscall page
> still existed, so that would have been fine. The kernel would have
> needed to check the program interpreter or the main executable (without
> a program interpreter, i.e., the statically linked case).  Due the way
> the vsyscalls are concentrated in glibc, a dynamically linked executable
> would not have needed checking (or re-linking).  I don't think we would
> have implemented the full late enablement after dlopen we did for
> executable stacks.  In theory, any code could have jumped to the
> vsyscall area, but in practice, it's just dynamically linked glibc and
> static binaries.
> 
> But nowadays, unmarked glibcs which do not depend on vsyscall vastly
> outnumber unmarked glibcs which requrie it.  Therefore, markup of
> binaries does not seem to be reasonable to day.  I could imagine a
> personality flag you can set (if yoy have CAP_SYS_ADMIN) that re-enables
> vsyscall support for new subprocesses.  And a container runtime would do
> this based on metadata found in the image.  This way, the container host
> itself could be protected, and you could still run legacy images which
> require vsyscall.
> 
> For the non-container case, if you know that you'll run legacy
> workloads, you'd still have the boot parameter.  But I think it could
> default to vsyscall=none in many more cases.
> 

I’m wondering if we can still do it: add a note or other ELF indicator that says “I don’t need vsyscalls.”  Then we can change the default mode to “no vsyscalls if the flag is there, else execute-only vsyscalls”.

Would glibc go along with this?  Would enterprise distros consider backporting such a thing?

I

^ permalink raw reply

* Re: Detecting the availability of VSYSCALL
From: Florian Weimer @ 2019-06-26 15:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Thomas Gleixner, Linux API, Kernel Hardening,
	linux-x86_64, linux-arch, Kees Cook, Carlos O'Donell, X86 ML
In-Reply-To: <CA96B819-30A9-43D3-9FE3-2D551D35369E@amacapital.net>

* Andy Lutomirski:

> I didn’t add a flag because the vsyscall page was thoroughly obsolete
> when all this happened, and I wanted to encourage all new code to just
> parse the vDSO instead of piling on the hacks.

It turned out that the thorny cases just switched to system calls
instead.  I think we finally completed the transition in glibc upstream
in 2018 (for x86).

> Anyway, you may be the right person to ask: is there some credible way
> that the kernel could detect new binaries that don’t need vsyscalls?
> Maybe a new ELF note on a static binary or on the ELF interpreter? We
> can dynamically switch it in principle.

For this kind of change, markup similar to PT_GNU_STACK would have been
appropriate, I think: Old kernels and loaders would have ignored the
program header and loaded the program anyway, but the vsyscall page
still existed, so that would have been fine. The kernel would have
needed to check the program interpreter or the main executable (without
a program interpreter, i.e., the statically linked case).  Due the way
the vsyscalls are concentrated in glibc, a dynamically linked executable
would not have needed checking (or re-linking).  I don't think we would
have implemented the full late enablement after dlopen we did for
executable stacks.  In theory, any code could have jumped to the
vsyscall area, but in practice, it's just dynamically linked glibc and
static binaries.

But nowadays, unmarked glibcs which do not depend on vsyscall vastly
outnumber unmarked glibcs which requrie it.  Therefore, markup of
binaries does not seem to be reasonable to day.  I could imagine a
personality flag you can set (if yoy have CAP_SYS_ADMIN) that re-enables
vsyscall support for new subprocesses.  And a container runtime would do
this based on metadata found in the image.  This way, the container host
itself could be protected, and you could still run legacy images which
require vsyscall.

For the non-container case, if you know that you'll run legacy
workloads, you'd still have the boot parameter.  But I think it could
default to vsyscall=none in many more cases.

Thanks,
Florian

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox