Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v4 09/12] fpga: dfl: afu: add STP (SignalTap) support
From: Wu Hao @ 2019-08-04 10:20 UTC (permalink / raw)
  To: gregkh, mdf, linux-fpga
  Cc: linux-kernel, linux-api, linux-doc, atull, Wu Hao, Xu Yilun
In-Reply-To: <1564914022-3710-1-git-send-email-hao.wu@intel.com>

STP (SignalTap) is one of the private features under the port for
debugging. This patch adds private feature driver support for it
to allow userspace applications to mmap related mmio region and
provide STP service.

Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Moritz Fischer <mdf@kernel.org>
Acked-by: Alan Tull <atull@kernel.org>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
v4: remove uinit callback which does nothing.
    remove dev_dbg in init callback function.
---
 drivers/fpga/dfl-afu-main.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index b14df11..e44e31f 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -514,6 +514,27 @@ static void port_afu_uinit(struct platform_device *pdev,
 	.uinit = port_afu_uinit,
 };
 
+static int port_stp_init(struct platform_device *pdev,
+			 struct dfl_feature *feature)
+{
+	struct resource *res = &pdev->resource[feature->resource_index];
+
+	return afu_mmio_region_add(dev_get_platdata(&pdev->dev),
+				   DFL_PORT_REGION_INDEX_STP,
+				   resource_size(res), res->start,
+				   DFL_PORT_REGION_MMAP | DFL_PORT_REGION_READ |
+				   DFL_PORT_REGION_WRITE);
+}
+
+static const struct dfl_feature_id port_stp_id_table[] = {
+	{.id = PORT_FEATURE_ID_STP,},
+	{0,}
+};
+
+static const struct dfl_feature_ops port_stp_ops = {
+	.init = port_stp_init,
+};
+
 static struct dfl_feature_driver port_feature_drvs[] = {
 	{
 		.id_table = port_hdr_id_table,
@@ -528,6 +549,10 @@ static void port_afu_uinit(struct platform_device *pdev,
 		.ops = &port_err_ops,
 	},
 	{
+		.id_table = port_stp_id_table,
+		.ops = &port_stp_ops,
+	},
+	{
 		.ops = NULL,
 	}
 };
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 10/12] fpga: dfl: fme: add capability sysfs interfaces
From: Wu Hao @ 2019-08-04 10:20 UTC (permalink / raw)
  To: gregkh, mdf, linux-fpga
  Cc: linux-kernel, linux-api, linux-doc, atull, Wu Hao, Luwei Kang,
	Xu Yilun
In-Reply-To: <1564914022-3710-1-git-send-email-hao.wu@intel.com>

This patch adds 3 read-only sysfs interfaces for FPGA Management Engine
(FME) block for capabilities including cache_size, fabric_version and
socket_id.

Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
v2: rebased.
v3: update kernel version and date in sysfs doc
---
 Documentation/ABI/testing/sysfs-platform-dfl-fme | 23 ++++++++++++
 drivers/fpga/dfl-fme-main.c                      | 48 ++++++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
index 8fa4feb..65372aa 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
@@ -21,3 +21,26 @@ Contact:	Wu Hao <hao.wu@intel.com>
 Description:	Read-only. It returns Bitstream (static FPGA region) meta
 		data, which includes the synthesis date, seed and other
 		information of this static FPGA region.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/cache_size
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns cache size of this FPGA device.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/fabric_version
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns fabric version of this FPGA device.
+		Userspace applications need this information to select
+		best data channels per different fabric design.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/socket_id
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns socket_id to indicate which socket
+		this FPGA belongs to, only valid for integrated solution.
+		User only needs this information, in case standard numa node
+		can't provide correct information.
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 5fdce54..f033f1c 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -73,10 +73,58 @@ static ssize_t bitstream_metadata_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(bitstream_metadata);
 
+static ssize_t cache_size_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	void __iomem *base;
+	u64 v;
+
+	base = dfl_get_feature_ioaddr_by_id(dev, FME_FEATURE_ID_HEADER);
+
+	v = readq(base + FME_HDR_CAP);
+
+	return sprintf(buf, "%u\n",
+		       (unsigned int)FIELD_GET(FME_CAP_CACHE_SIZE, v));
+}
+static DEVICE_ATTR_RO(cache_size);
+
+static ssize_t fabric_version_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	void __iomem *base;
+	u64 v;
+
+	base = dfl_get_feature_ioaddr_by_id(dev, FME_FEATURE_ID_HEADER);
+
+	v = readq(base + FME_HDR_CAP);
+
+	return sprintf(buf, "%u\n",
+		       (unsigned int)FIELD_GET(FME_CAP_FABRIC_VERID, v));
+}
+static DEVICE_ATTR_RO(fabric_version);
+
+static ssize_t socket_id_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	void __iomem *base;
+	u64 v;
+
+	base = dfl_get_feature_ioaddr_by_id(dev, FME_FEATURE_ID_HEADER);
+
+	v = readq(base + FME_HDR_CAP);
+
+	return sprintf(buf, "%u\n",
+		       (unsigned int)FIELD_GET(FME_CAP_SOCKET_ID, v));
+}
+static DEVICE_ATTR_RO(socket_id);
+
 static struct attribute *fme_hdr_attrs[] = {
 	&dev_attr_ports_num.attr,
 	&dev_attr_bitstream_id.attr,
 	&dev_attr_bitstream_metadata.attr,
+	&dev_attr_cache_size.attr,
+	&dev_attr_fabric_version.attr,
+	&dev_attr_socket_id.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(fme_hdr);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 11/12] fpga: dfl: fme: add global error reporting support
From: Wu Hao @ 2019-08-04 10:20 UTC (permalink / raw)
  To: gregkh, mdf, linux-fpga
  Cc: linux-kernel, linux-api, linux-doc, atull, Wu Hao, Luwei Kang,
	Ananda Ravuri, Xu Yilun
In-Reply-To: <1564914022-3710-1-git-send-email-hao.wu@intel.com>

This patch adds support for global error reporting for FPGA
Management Engine (FME), it introduces sysfs interfaces to
report different error detected by the hardware, and allow
user to clear errors or inject error for testing purpose.

Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
v2: switch to device_add/remove_groups for sysfs.
v3: update kernel version and date in sysfs doc
v4: rebase, remove dev_dbg in init/uinit callback.
---
 Documentation/ABI/testing/sysfs-platform-dfl-fme |  75 +++++
 drivers/fpga/Makefile                            |   2 +-
 drivers/fpga/dfl-fme-error.c                     | 381 +++++++++++++++++++++++
 drivers/fpga/dfl-fme-main.c                      |   4 +
 drivers/fpga/dfl-fme.h                           |   2 +
 drivers/fpga/dfl.h                               |   2 +
 6 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 drivers/fpga/dfl-fme-error.c

diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
index 65372aa..6e9e7d4 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
@@ -44,3 +44,78 @@ Description:	Read-only. It returns socket_id to indicate which socket
 		this FPGA belongs to, only valid for integrated solution.
 		User only needs this information, in case standard numa node
 		can't provide correct information.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/revision
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. Read this file to get the revision of this global
+		error reporting private feature.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-Write. Read this file for errors detected on pcie0 link.
+		Write this file to clear errors logged in pcie0_errors. Write
+		fails with -EINVAL if input parsing fails or input error code
+		doesn't match.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-Write. Read this file for errors detected on pcie1 link.
+		Write this file to clear errors logged in pcie1_errors. Write
+		fails with -EINVAL if input parsing fails or input error code
+		doesn't match.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns non-fatal errors detected.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns catastrophic and fatal errors detected.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/inject_error
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-Write. Read this file to check errors injected. Write this
+		file to inject errors for testing purpose. Write fails with
+		-EINVAL if input parsing fails or input inject error code isn't
+		supported.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. Read this file to get errors detected by hardware.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. Read this file to get the first error detected by
+		hardware.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. Read this file to get the second error detected by
+		hardware.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear
+Date:		August 2019
+KernelVersion:  5.4
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Write-only. Write error code to this file to clear all errors
+		logged in errors, first_error and next_error. Write fails with
+		-EINVAL if input parsing fails or input error code doesn't
+		match.
diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index 7255891..4865b74 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_FPGA_DFL_FME_BRIDGE)	+= dfl-fme-br.o
 obj-$(CONFIG_FPGA_DFL_FME_REGION)	+= dfl-fme-region.o
 obj-$(CONFIG_FPGA_DFL_AFU)		+= dfl-afu.o
 
-dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
+dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
 dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
 dfl-afu-objs += dfl-afu-error.o
 
diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c
new file mode 100644
index 0000000..8b2df80
--- /dev/null
+++ b/drivers/fpga/dfl-fme-error.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for FPGA Management Engine Error Management
+ *
+ * Copyright 2019 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Kang Luwei <luwei.kang@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *   Wu Hao <hao.wu@intel.com>
+ *   Joseph Grecco <joe.grecco@intel.com>
+ *   Enno Luebbers <enno.luebbers@intel.com>
+ *   Tim Whisonant <tim.whisonant@intel.com>
+ *   Ananda Ravuri <ananda.ravuri@intel.com>
+ *   Mitchel, Henry <henry.mitchel@intel.com>
+ */
+
+#include <linux/uaccess.h>
+
+#include "dfl.h"
+#include "dfl-fme.h"
+
+#define FME_ERROR_MASK		0x8
+#define FME_ERROR		0x10
+#define MBP_ERROR		BIT_ULL(6)
+#define PCIE0_ERROR_MASK	0x18
+#define PCIE0_ERROR		0x20
+#define PCIE1_ERROR_MASK	0x28
+#define PCIE1_ERROR		0x30
+#define FME_FIRST_ERROR		0x38
+#define FME_NEXT_ERROR		0x40
+#define RAS_NONFAT_ERROR_MASK	0x48
+#define RAS_NONFAT_ERROR	0x50
+#define RAS_CATFAT_ERROR_MASK	0x58
+#define RAS_CATFAT_ERROR	0x60
+#define RAS_ERROR_INJECT	0x68
+#define INJECT_ERROR_MASK	GENMASK_ULL(2, 0)
+
+static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return sprintf(buf, "%u\n", dfl_feature_revision(base));
+}
+static DEVICE_ATTR_RO(revision);
+
+static ssize_t pcie0_errors_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return sprintf(buf, "0x%llx\n",
+		       (unsigned long long)readq(base + PCIE0_ERROR));
+}
+
+static ssize_t pcie0_errors_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	int ret = 0;
+	u64 v, val;
+
+	if (kstrtou64(buf, 0, &val))
+		return -EINVAL;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	mutex_lock(&pdata->lock);
+	writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK);
+
+	v = readq(base + PCIE0_ERROR);
+	if (val == v)
+		writeq(v, base + PCIE0_ERROR);
+	else
+		ret = -EINVAL;
+
+	writeq(0ULL, base + PCIE0_ERROR_MASK);
+	mutex_unlock(&pdata->lock);
+	return ret ? ret : count;
+}
+static DEVICE_ATTR_RW(pcie0_errors);
+
+static ssize_t pcie1_errors_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return sprintf(buf, "0x%llx\n",
+		       (unsigned long long)readq(base + PCIE1_ERROR));
+}
+
+static ssize_t pcie1_errors_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	int ret = 0;
+	u64 v, val;
+
+	if (kstrtou64(buf, 0, &val))
+		return -EINVAL;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	mutex_lock(&pdata->lock);
+	writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK);
+
+	v = readq(base + PCIE1_ERROR);
+	if (val == v)
+		writeq(v, base + PCIE1_ERROR);
+	else
+		ret = -EINVAL;
+
+	writeq(0ULL, base + PCIE1_ERROR_MASK);
+	mutex_unlock(&pdata->lock);
+	return ret ? ret : count;
+}
+static DEVICE_ATTR_RW(pcie1_errors);
+
+static ssize_t nonfatal_errors_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return sprintf(buf, "0x%llx\n",
+		       (unsigned long long)readq(base + RAS_NONFAT_ERROR));
+}
+static DEVICE_ATTR_RO(nonfatal_errors);
+
+static ssize_t catfatal_errors_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return sprintf(buf, "0x%llx\n",
+		       (unsigned long long)readq(base + RAS_CATFAT_ERROR));
+}
+static DEVICE_ATTR_RO(catfatal_errors);
+
+static ssize_t inject_error_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	u64 v;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	v = readq(base + RAS_ERROR_INJECT);
+
+	return sprintf(buf, "0x%llx\n",
+		       (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v));
+}
+
+static ssize_t inject_error_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	u8 inject_error;
+	u64 v;
+
+	if (kstrtou8(buf, 0, &inject_error))
+		return -EINVAL;
+
+	if (inject_error & ~INJECT_ERROR_MASK)
+		return -EINVAL;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	mutex_lock(&pdata->lock);
+	v = readq(base + RAS_ERROR_INJECT);
+	v &= ~INJECT_ERROR_MASK;
+	v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error);
+	writeq(v, base + RAS_ERROR_INJECT);
+	mutex_unlock(&pdata->lock);
+
+	return count;
+}
+static DEVICE_ATTR_RW(inject_error);
+
+static struct attribute *errors_attrs[] = {
+	&dev_attr_revision.attr,
+	&dev_attr_pcie0_errors.attr,
+	&dev_attr_pcie1_errors.attr,
+	&dev_attr_nonfatal_errors.attr,
+	&dev_attr_catfatal_errors.attr,
+	&dev_attr_inject_error.attr,
+	NULL,
+};
+
+static struct attribute_group errors_attr_group = {
+	.attrs	= errors_attrs,
+};
+
+static ssize_t errors_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return sprintf(buf, "0x%llx\n",
+		       (unsigned long long)readq(base + FME_ERROR));
+}
+static DEVICE_ATTR_RO(errors);
+
+static ssize_t first_error_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return sprintf(buf, "0x%llx\n",
+		       (unsigned long long)readq(base + FME_FIRST_ERROR));
+}
+static DEVICE_ATTR_RO(first_error);
+
+static ssize_t next_error_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return sprintf(buf, "0x%llx\n",
+		       (unsigned long long)readq(base + FME_NEXT_ERROR));
+}
+static DEVICE_ATTR_RO(next_error);
+
+static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t count)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	u64 v, val;
+	int ret = 0;
+
+	if (kstrtou64(buf, 0, &val))
+		return -EINVAL;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	mutex_lock(&pdata->lock);
+	writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK);
+
+	v = readq(base + FME_ERROR);
+	if (val == v) {
+		writeq(v, base + FME_ERROR);
+		v = readq(base + FME_FIRST_ERROR);
+		writeq(v, base + FME_FIRST_ERROR);
+		v = readq(base + FME_NEXT_ERROR);
+		writeq(v, base + FME_NEXT_ERROR);
+	} else {
+		ret = -EINVAL;
+	}
+
+	/* Workaround: disable MBP_ERROR if feature revision is 0 */
+	writeq(dfl_feature_revision(base) ? 0ULL : MBP_ERROR,
+	       base + FME_ERROR_MASK);
+	mutex_unlock(&pdata->lock);
+	return ret ? ret : count;
+}
+static DEVICE_ATTR_WO(clear);
+
+static struct attribute *fme_errors_attrs[] = {
+	&dev_attr_errors.attr,
+	&dev_attr_first_error.attr,
+	&dev_attr_next_error.attr,
+	&dev_attr_clear.attr,
+	NULL,
+};
+
+static struct attribute_group fme_errors_attr_group = {
+	.attrs	= fme_errors_attrs,
+	.name	= "fme-errors",
+};
+
+static const struct attribute_group *error_groups[] = {
+	&fme_errors_attr_group,
+	&errors_attr_group,
+	NULL
+};
+
+static void fme_error_enable(struct dfl_feature *feature)
+{
+	void __iomem *base = feature->ioaddr;
+
+	/* Workaround: disable MBP_ERROR if revision is 0 */
+	writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
+	       base + FME_ERROR_MASK);
+	writeq(0ULL, base + PCIE0_ERROR_MASK);
+	writeq(0ULL, base + PCIE1_ERROR_MASK);
+	writeq(0ULL, base + RAS_NONFAT_ERROR_MASK);
+	writeq(0ULL, base + RAS_CATFAT_ERROR_MASK);
+}
+
+static void err_dev_release(struct device *dev)
+{
+	kfree(dev);
+}
+
+static int fme_global_err_init(struct platform_device *pdev,
+			       struct dfl_feature *feature)
+{
+	struct device *dev;
+	int ret = 0;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->parent = &pdev->dev;
+	dev->release = err_dev_release;
+	dev_set_name(dev, "errors");
+
+	fme_error_enable(feature);
+
+	ret = device_register(dev);
+	if (ret) {
+		put_device(dev);
+		return ret;
+	}
+
+	ret = device_add_groups(dev, error_groups);
+	if (ret) {
+		device_unregister(dev);
+		return ret;
+	}
+
+	feature->priv = dev;
+
+	return ret;
+}
+
+static void fme_global_err_uinit(struct platform_device *pdev,
+				 struct dfl_feature *feature)
+{
+	struct device *dev = feature->priv;
+
+	device_remove_groups(dev, error_groups);
+	device_unregister(dev);
+}
+
+const struct dfl_feature_id fme_global_err_id_table[] = {
+	{.id = FME_FEATURE_ID_GLOBAL_ERR,},
+	{0,}
+};
+
+const struct dfl_feature_ops fme_global_err_ops = {
+	.init = fme_global_err_init,
+	.uinit = fme_global_err_uinit,
+};
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index f033f1c..59bc2cc 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -214,6 +214,10 @@ static long fme_hdr_ioctl(struct platform_device *pdev,
 		.ops = &fme_pr_mgmt_ops,
 	},
 	{
+		.id_table = fme_global_err_id_table,
+		.ops = &fme_global_err_ops,
+	},
+	{
 		.ops = NULL,
 	},
 };
diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
index e4131e8..a3a48c8 100644
--- a/drivers/fpga/dfl-fme.h
+++ b/drivers/fpga/dfl-fme.h
@@ -35,5 +35,7 @@ struct dfl_fme {
 
 extern const struct dfl_feature_ops fme_pr_mgmt_ops;
 extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
+extern const struct dfl_feature_ops fme_global_err_ops;
+extern const struct dfl_feature_id fme_global_err_id_table[];
 
 #endif /* __DFL_FME_H */
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 9f0e656..d312678 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -197,12 +197,14 @@ struct dfl_feature_driver {
  *		    feature dev (platform device)'s reources.
  * @ioaddr: mapped mmio resource address.
  * @ops: ops of this sub feature.
+ * @priv: priv data of this feature.
  */
 struct dfl_feature {
 	u64 id;
 	int resource_index;
 	void __iomem *ioaddr;
 	const struct dfl_feature_ops *ops;
+	void *priv;
 };
 
 #define DEV_STATUS_IN_USE	0
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 12/12] Documentation: fpga: dfl: add descriptions for virtualization and new interfaces.
From: Wu Hao @ 2019-08-04 10:20 UTC (permalink / raw)
  To: gregkh, mdf, linux-fpga
  Cc: linux-kernel, linux-api, linux-doc, atull, Wu Hao, Xu Yilun
In-Reply-To: <1564914022-3710-1-git-send-email-hao.wu@intel.com>

This patch adds virtualization support description for DFL based
FPGA devices (based on PCIe SRIOV), and introductions to new
interfaces added by new dfl private feature drivers.

[mdf@kernel.org: Fixed up to make it work with new reStructuredText docs]
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
 Documentation/fpga/dfl.rst | 105 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/Documentation/fpga/dfl.rst b/Documentation/fpga/dfl.rst
index 2f125ab..6fa483f 100644
--- a/Documentation/fpga/dfl.rst
+++ b/Documentation/fpga/dfl.rst
@@ -87,6 +87,8 @@ The following functions are exposed through ioctls:
 - Get driver API version (DFL_FPGA_GET_API_VERSION)
 - Check for extensions (DFL_FPGA_CHECK_EXTENSION)
 - Program bitstream (DFL_FPGA_FME_PORT_PR)
+- Assign port to PF (DFL_FPGA_FME_PORT_ASSIGN)
+- Release port from PF (DFL_FPGA_FME_PORT_RELEASE)
 
 More functions are exposed through sysfs
 (/sys/class/fpga_region/regionX/dfl-fme.n/):
@@ -102,6 +104,10 @@ More functions are exposed through sysfs
      one FPGA device may have more than one port, this sysfs interface indicates
      how many ports the FPGA device has.
 
+ Global error reporting management (errors/)
+     error reporting sysfs interfaces allow user to read errors detected by the
+     hardware, and clear the logged errors.
+
 
 FIU - PORT
 ==========
@@ -143,6 +149,10 @@ More functions are exposed through sysfs:
  Read Accelerator GUID (afu_id)
      afu_id indicates which PR bitstream is programmed to this AFU.
 
+ Error reporting (errors/)
+     error reporting sysfs interfaces allow user to read port/afu errors
+     detected by the hardware, and clear the logged errors.
+
 
 DFL Framework Overview
 ======================
@@ -218,6 +228,101 @@ the compat_id exposed by the target FPGA region. This check is usually done by
 userspace before calling the reconfiguration IOCTL.
 
 
+FPGA virtualization - PCIe SRIOV
+================================
+This section describes the virtualization support on DFL based FPGA device to
+enable accessing an accelerator from applications running in a virtual machine
+(VM). This section only describes the PCIe based FPGA device with SRIOV support.
+
+Features supported by the particular FPGA device are exposed through Device
+Feature Lists, as illustrated below:
+
+::
+
+    +-------------------------------+  +-------------+
+    |              PF               |  |     VF      |
+    +-------------------------------+  +-------------+
+        ^            ^         ^              ^
+        |            |         |              |
+  +-----|------------|---------|--------------|-------+
+  |     |            |         |              |       |
+  |  +-----+     +-------+ +-------+      +-------+   |
+  |  | FME |     | Port0 | | Port1 |      | Port2 |   |
+  |  +-----+     +-------+ +-------+      +-------+   |
+  |                  ^         ^              ^       |
+  |                  |         |              |       |
+  |              +-------+ +------+       +-------+   |
+  |              |  AFU  | |  AFU |       |  AFU  |   |
+  |              +-------+ +------+       +-------+   |
+  |                                                   |
+  |            DFL based FPGA PCIe Device             |
+  +---------------------------------------------------+
+
+FME is always accessed through the physical function (PF).
+
+Ports (and related AFUs) are accessed via PF by default, but could be exposed
+through virtual function (VF) devices via PCIe SRIOV. Each VF only contains
+1 Port and 1 AFU for isolation. Users could assign individual VFs (accelerators)
+created via PCIe SRIOV interface, to virtual machines.
+
+The driver organization in virtualization case is illustrated below:
+::
+
+    +-------++------++------+             |
+    | FME   || FME  || FME  |             |
+    | FPGA  || FPGA || FPGA |             |
+    |Manager||Bridge||Region|             |
+    +-------++------++------+             |
+    +-----------------------+  +--------+ |             +--------+
+    |          FME          |  |  AFU   | |             |  AFU   |
+    |         Module        |  | Module | |             | Module |
+    +-----------------------+  +--------+ |             +--------+
+          +-----------------------+       |       +-----------------------+
+          | FPGA Container Device |       |       | FPGA Container Device |
+          |  (FPGA Base Region)   |       |       |  (FPGA Base Region)   |
+          +-----------------------+       |       +-----------------------+
+            +------------------+          |         +------------------+
+            | FPGA PCIE Module |          | Virtual | FPGA PCIE Module |
+            +------------------+   Host   | Machine +------------------+
+   -------------------------------------- | ------------------------------
+             +---------------+            |          +---------------+
+             | PCI PF Device |            |          | PCI VF Device |
+             +---------------+            |          +---------------+
+
+FPGA PCIe device driver is always loaded first once a FPGA PCIe PF or VF device
+is detected. It:
+
+* Finishes enumeration on both FPGA PCIe PF and VF device using common
+  interfaces from DFL framework.
+* Supports SRIOV.
+
+The FME device driver plays a management role in this driver architecture, it
+provides ioctls to release Port from PF and assign Port to PF. After release
+a port from PF, then it's safe to expose this port through a VF via PCIe SRIOV
+sysfs interface.
+
+To enable accessing an accelerator from applications running in a VM, the
+respective AFU's port needs to be assigned to a VF using the following steps:
+
+#. The PF owns all AFU ports by default. Any port that needs to be
+   reassigned to a VF must first be released through the
+   DFL_FPGA_FME_PORT_RELEASE ioctl on the FME device.
+
+#. Once N ports are released from PF, then user can use command below
+   to enable SRIOV and VFs. Each VF owns only one Port with AFU.
+
+   ::
+
+      echo N > $PCI_DEVICE_PATH/sriov_numvfs
+
+#. Pass through the VFs to VMs
+
+#. The AFU under VF is accessible from applications in VM (using the
+   same driver inside the VF).
+
+Note that an FME can't be assigned to a VF, thus PR and other management
+functions are only available via the PF.
+
 Device enumeration
 ==================
 This section introduces how applications enumerate the fpga device from
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Andy Lutomirski @ 2019-08-04 22:16 UTC (permalink / raw)
  To: Song Liu
  Cc: Andy Lutomirski, Kees Cook, Networking, bpf, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, Lorenz Bauer, Jann Horn, Greg KH,
	Linux API, LSM List
In-Reply-To: <5A2FCD7E-7F54-41E5-BFAE-BB9494E74F2D@fb.com>

On Fri, Aug 2, 2019 at 12:22 AM Song Liu <songliubraving@fb.com> wrote:
>
> Hi Andy,
>
 >> I actually agree CAP_BPF_ADMIN makes sense. The hard part is to make
> >> existing tools (setcap, getcap, etc.) and libraries aware of the new CAP.
> >
> > It's been done before -- it's not that hard.  IMO the main tricky bit
> > would be try be somewhat careful about defining exactly what
> > CAP_BPF_ADMIN does.
>
> Agreed. I think defining CAP_BPF_ADMIN could be a good topic for the
> Plumbers conference.
>
> OTOH, I don't think we have to wait for CAP_BPF_ADMIN to allow daemons
> like systemd to do sys_bpf() without root.

I don't understand the use case here.  Are you talking about systemd
--user?  As far as I know, a user is expected to be able to fully
control their systemd --user process, so giving it unrestricted bpf
access is very close to giving it superuser access, and this doesn't
sound like a good idea.  I think that, if systemd --user needs bpf(),
it either needs real unprivileged bpf() or it needs a privileged
helper (SUID or a daemon) to intermediate this access.

>
> >
> >>> I don't see why you need to invent a whole new mechanism for this.
> >>> The entire cgroup ecosystem outside bpf() does just fine using the
> >>> write permission on files in cgroupfs to control access.  Why can't
> >>> bpf() do the same thing?
> >>
> >> It is easier to use write permission for BPF_PROG_ATTACH. But it is
> >> not easy to do the same for other bpf commands: BPF_PROG_LOAD and
> >> BPF_MAP_*. A lot of these commands don't have target concept. Maybe
> >> we should have target concept for all these commands. But that is a
> >> much bigger project. OTOH, "all or nothing" model allows all these
> >> commands at once.
> >
> > For BPF_PROG_LOAD, I admit I've never understood why permission is
> > required at all.  I think that CAP_SYS_ADMIN or similar should be
> > needed to get is_priv in the verifier, but I think that should mainly
> > be useful for tracing, and that requires lots of privilege anyway.
> > BPF_MAP_* is probably the trickiest part.  One solution would be some
> > kind of bpffs, but I'm sure other solutions are possible.
>
> Improving permission management of cgroup_bpf is another good topic to
> discuss. However, it is also an overkill for current use case.
>

I looked at the code some more, and I don't think this is so hard
after all.  As I understand it, all of the map..by_id stuff is, to
some extent, deprecated in favor of persistent maps.  As I see it, the
map..by_id calls should require privilege forever, although I can
imagine ways to scope that privilege to a namespace if the maps
themselves were to be scoped to a namespace.

Instead, unprivileged tools would use the persistent map interface
roughly like this:

$ bpftool map create /sys/fs/bpf/my_dir/filename type hash key 8 value
8 entries 64 name mapname

This would require that the caller have either CAP_DAC_OVERRIDE or
that the caller have permission to create files in /sys/fs/bpf/my_dir
(using the same rules as for any filesystem), and the resulting map
would end up owned by the creating user and have mode 0600 (or maybe
0666, or maybe a new bpf_attr parameter) modified by umask.  Then all
the various capable() checks that are currently involved in accessing
a persistent map would instead check FMODE_READ or FMODE_WRITE on the
map file as appropriate.

Half of this stuff already works.  I just set my system up like this:

$ ls -l /sys/fs/bpf
total 0
drwxr-xr-x. 3 luto luto 0 Aug  4 15:10 luto

$ mkdir /sys/fs/bpf/luto/test

$ ls -l /sys/fs/bpf/luto
total 0
drwxrwxr-x. 2 luto luto 0 Aug  4 15:10 test

I bet that making the bpf() syscalls work appropriately in this
context without privilege would only be a couple of hours of work.
The hard work, creating bpffs and making it function, is already done
:)

P.S. The docs for bpftool create are less than fantastic.  The
complete lack of any error message at all when the syscall returns
-EACCES is also not fantastic.

^ permalink raw reply

* Re: [RFC PATCH v1 1/5] fs: Add support for an O_MAYEXEC flag on sys_open()
From: Andy Lutomirski @ 2019-08-04 23:55 UTC (permalink / raw)
  To: Jan Kara, Song Liu, Alexei Starovoitov, Daniel Borkmann
  Cc: Mickaël Salaün, LKML, Al Viro, James Morris,
	Jonathan Corbet, Kees Cook, Matthew Garrett, Michael Kerrisk,
	Mickaël Salaün, Mimi Zohar, Philippe Trébuchet,
	Shuah Khan, Thibaut Sautereau, Vincent Strubel, Yves-Alexis Perez,
	Kernel Hardening, Linux API
In-Reply-To: <20181212144306.GA19945@quack2.suse.cz>

On Wed, Dec 12, 2018 at 6:43 AM Jan Kara <jack@suse.cz> wrote:
>
> On Wed 12-12-18 09:17:08, Mickaël Salaün wrote:
> > When the O_MAYEXEC flag is passed, sys_open() may be subject to
> > additional restrictions depending on a security policy implemented by an
> > LSM through the inode_permission hook.
> >
> > The underlying idea is to be able to restrict scripts interpretation
> > according to a policy defined by the system administrator.  For this to
> > be possible, script interpreters must use the O_MAYEXEC flag
> > appropriately.  To be fully effective, these interpreters also need to
> > handle the other ways to execute code (for which the kernel can't help):
> > command line parameters (e.g., option -e for Perl), module loading
> > (e.g., option -m for Python), stdin, file sourcing, environment
> > variables, configuration files...  According to the threat model, it may
> > be acceptable to allow some script interpreters (e.g. Bash) to interpret
> > commands from stdin, may it be a TTY or a pipe, because it may not be
> > enough to (directly) perform syscalls.
> >
> > A simple security policy implementation is available in a following
> > patch for Yama.
> >
> > This is an updated subset of the patch initially written by Vincent
> > Strubel for CLIP OS:
> > https://github.com/clipos-archive/src_platform_clip-patches/blob/f5cb330d6b684752e403b4e41b39f7004d88e561/1901_open_mayexec.patch
> > This patch has been used for more than 10 years with customized script
> > interpreters.  Some examples can be found here:
> > https://github.com/clipos-archive/clipos4_portage-overlay/search?q=O_MAYEXEC
> >
> > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > Signed-off-by: Thibaut Sautereau <thibaut.sautereau@ssi.gouv.fr>
> > Signed-off-by: Vincent Strubel <vincent.strubel@ssi.gouv.fr>
> > Reviewed-by: Philippe Trébuchet <philippe.trebuchet@ssi.gouv.fr>
> > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > Cc: Kees Cook <keescook@chromium.org>
> > Cc: Mickaël Salaün <mickael.salaun@ssi.gouv.fr>
>
> ...
>
> > diff --git a/fs/open.c b/fs/open.c
> > index 0285ce7dbd51..75479b79a58f 100644
> > --- a/fs/open.c
> > +++ b/fs/open.c
> > @@ -974,6 +974,10 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
> >       if (flags & O_APPEND)
> >               acc_mode |= MAY_APPEND;
> >
> > +     /* Check execution permissions on open. */
> > +     if (flags & O_MAYEXEC)
> > +             acc_mode |= MAY_OPENEXEC;
> > +
> >       op->acc_mode = acc_mode;
> >
> >       op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
>
> I don't feel experienced enough in security to tell whether we want this
> functionality or not. But if we do this, shouldn't we also set FMODE_EXEC
> on the resulting struct file? That way also security_file_open() can be
> used to arbitrate such executable opens and in particular
> fanotify permission event FAN_OPEN_EXEC will get properly generated which I
> guess is desirable (support for it is sitting in my tree waiting for the
> merge window) - adding some audit people involved in FAN_OPEN_EXEC to
> CC. Just an idea...
>

I would really like to land this patch.  I'm fiddling with making
bpffs handle permissions intelligently, and the lack of a way to say
"hey, I want to open this bpf program so that I can run it" is
annoying.

--Andy

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Andy Lutomirski @ 2019-08-05  0:08 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Song Liu, Kees Cook, Networking, bpf, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, Lorenz Bauer, Jann Horn, Greg KH,
	Linux API, LSM List
In-Reply-To: <CALCETrU7NbBnXXsw1B+DvTkfTVRBFWXuJ8cZERCCNvdFG6KqRw@mail.gmail.com>

On Sun, Aug 4, 2019 at 3:16 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Fri, Aug 2, 2019 at 12:22 AM Song Liu <songliubraving@fb.com> wrote:
> >
> > Hi Andy,
> >
>  >> I actually agree CAP_BPF_ADMIN makes sense. The hard part is to make
> > >> existing tools (setcap, getcap, etc.) and libraries aware of the new CAP.
> > >
> > > It's been done before -- it's not that hard.  IMO the main tricky bit
> > > would be try be somewhat careful about defining exactly what
> > > CAP_BPF_ADMIN does.
> >
> > Agreed. I think defining CAP_BPF_ADMIN could be a good topic for the
> > Plumbers conference.
> >
> > OTOH, I don't think we have to wait for CAP_BPF_ADMIN to allow daemons
> > like systemd to do sys_bpf() without root.
>
> I don't understand the use case here.  Are you talking about systemd
> --user?  As far as I know, a user is expected to be able to fully
> control their systemd --user process, so giving it unrestricted bpf
> access is very close to giving it superuser access, and this doesn't
> sound like a good idea.  I think that, if systemd --user needs bpf(),
> it either needs real unprivileged bpf() or it needs a privileged
> helper (SUID or a daemon) to intermediate this access.
>
> >
> > >
> > >>> I don't see why you need to invent a whole new mechanism for this.
> > >>> The entire cgroup ecosystem outside bpf() does just fine using the
> > >>> write permission on files in cgroupfs to control access.  Why can't
> > >>> bpf() do the same thing?
> > >>
> > >> It is easier to use write permission for BPF_PROG_ATTACH. But it is
> > >> not easy to do the same for other bpf commands: BPF_PROG_LOAD and
> > >> BPF_MAP_*. A lot of these commands don't have target concept. Maybe
> > >> we should have target concept for all these commands. But that is a
> > >> much bigger project. OTOH, "all or nothing" model allows all these
> > >> commands at once.
> > >
> > > For BPF_PROG_LOAD, I admit I've never understood why permission is
> > > required at all.  I think that CAP_SYS_ADMIN or similar should be
> > > needed to get is_priv in the verifier, but I think that should mainly
> > > be useful for tracing, and that requires lots of privilege anyway.
> > > BPF_MAP_* is probably the trickiest part.  One solution would be some
> > > kind of bpffs, but I'm sure other solutions are possible.
> >
> > Improving permission management of cgroup_bpf is another good topic to
> > discuss. However, it is also an overkill for current use case.
> >
>
> I looked at the code some more, and I don't think this is so hard
> after all.  As I understand it, all of the map..by_id stuff is, to
> some extent, deprecated in favor of persistent maps.  As I see it, the
> map..by_id calls should require privilege forever, although I can
> imagine ways to scope that privilege to a namespace if the maps
> themselves were to be scoped to a namespace.
>
> Instead, unprivileged tools would use the persistent map interface
> roughly like this:
>
> $ bpftool map create /sys/fs/bpf/my_dir/filename type hash key 8 value
> 8 entries 64 name mapname
>
> This would require that the caller have either CAP_DAC_OVERRIDE or
> that the caller have permission to create files in /sys/fs/bpf/my_dir
> (using the same rules as for any filesystem), and the resulting map
> would end up owned by the creating user and have mode 0600 (or maybe
> 0666, or maybe a new bpf_attr parameter) modified by umask.  Then all
> the various capable() checks that are currently involved in accessing
> a persistent map would instead check FMODE_READ or FMODE_WRITE on the
> map file as appropriate.
>
> Half of this stuff already works.  I just set my system up like this:
>
> $ ls -l /sys/fs/bpf
> total 0
> drwxr-xr-x. 3 luto luto 0 Aug  4 15:10 luto
>
> $ mkdir /sys/fs/bpf/luto/test
>
> $ ls -l /sys/fs/bpf/luto
> total 0
> drwxrwxr-x. 2 luto luto 0 Aug  4 15:10 test
>
> I bet that making the bpf() syscalls work appropriately in this
> context without privilege would only be a couple of hours of work.
> The hard work, creating bpffs and making it function, is already done
> :)
>
> P.S. The docs for bpftool create are less than fantastic.  The
> complete lack of any error message at all when the syscall returns
> -EACCES is also not fantastic.

This isn't remotely finished, but I spent a bit of time fiddling with this:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=bpf/perms

What do you think?  (It's obviously not done.  It doesn't compile, and
I haven't gotten to the permissions needed to do map operations.  I
also haven't touched the capable() checks.)

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Andy Lutomirski @ 2019-08-05  5:47 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Song Liu, Kees Cook, Networking, bpf, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, Lorenz Bauer, Jann Horn, Greg KH,
	Linux API, LSM List
In-Reply-To: <CALCETrUjh6DdgW1qSuSRd1_=0F9CqB8+sNj__e_6AHEvh_BaxQ@mail.gmail.com>

On Sun, Aug 4, 2019 at 5:08 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Sun, Aug 4, 2019 at 3:16 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > On Fri, Aug 2, 2019 at 12:22 AM Song Liu <songliubraving@fb.com> wrote:
> > >
> > > Hi Andy,
> > >
> >  >> I actually agree CAP_BPF_ADMIN makes sense. The hard part is to make
> > > >> existing tools (setcap, getcap, etc.) and libraries aware of the new CAP.
> > > >
> > > > It's been done before -- it's not that hard.  IMO the main tricky bit
> > > > would be try be somewhat careful about defining exactly what
> > > > CAP_BPF_ADMIN does.
> > >
> > > Agreed. I think defining CAP_BPF_ADMIN could be a good topic for the
> > > Plumbers conference.
> > >
> > > OTOH, I don't think we have to wait for CAP_BPF_ADMIN to allow daemons
> > > like systemd to do sys_bpf() without root.
> >
> > I don't understand the use case here.  Are you talking about systemd
> > --user?  As far as I know, a user is expected to be able to fully
> > control their systemd --user process, so giving it unrestricted bpf
> > access is very close to giving it superuser access, and this doesn't
> > sound like a good idea.  I think that, if systemd --user needs bpf(),
> > it either needs real unprivileged bpf() or it needs a privileged
> > helper (SUID or a daemon) to intermediate this access.
> >
> > >
> > > >
> > > >>> I don't see why you need to invent a whole new mechanism for this.
> > > >>> The entire cgroup ecosystem outside bpf() does just fine using the
> > > >>> write permission on files in cgroupfs to control access.  Why can't
> > > >>> bpf() do the same thing?
> > > >>
> > > >> It is easier to use write permission for BPF_PROG_ATTACH. But it is
> > > >> not easy to do the same for other bpf commands: BPF_PROG_LOAD and
> > > >> BPF_MAP_*. A lot of these commands don't have target concept. Maybe
> > > >> we should have target concept for all these commands. But that is a
> > > >> much bigger project. OTOH, "all or nothing" model allows all these
> > > >> commands at once.
> > > >
> > > > For BPF_PROG_LOAD, I admit I've never understood why permission is
> > > > required at all.  I think that CAP_SYS_ADMIN or similar should be
> > > > needed to get is_priv in the verifier, but I think that should mainly
> > > > be useful for tracing, and that requires lots of privilege anyway.
> > > > BPF_MAP_* is probably the trickiest part.  One solution would be some
> > > > kind of bpffs, but I'm sure other solutions are possible.
> > >
> > > Improving permission management of cgroup_bpf is another good topic to
> > > discuss. However, it is also an overkill for current use case.
> > >
> >
> > I looked at the code some more, and I don't think this is so hard
> > after all.  As I understand it, all of the map..by_id stuff is, to
> > some extent, deprecated in favor of persistent maps.  As I see it, the
> > map..by_id calls should require privilege forever, although I can
> > imagine ways to scope that privilege to a namespace if the maps
> > themselves were to be scoped to a namespace.
> >
> > Instead, unprivileged tools would use the persistent map interface
> > roughly like this:
> >
> > $ bpftool map create /sys/fs/bpf/my_dir/filename type hash key 8 value
> > 8 entries 64 name mapname
> >
> > This would require that the caller have either CAP_DAC_OVERRIDE or
> > that the caller have permission to create files in /sys/fs/bpf/my_dir
> > (using the same rules as for any filesystem), and the resulting map
> > would end up owned by the creating user and have mode 0600 (or maybe
> > 0666, or maybe a new bpf_attr parameter) modified by umask.  Then all
> > the various capable() checks that are currently involved in accessing
> > a persistent map would instead check FMODE_READ or FMODE_WRITE on the
> > map file as appropriate.
> >
> > Half of this stuff already works.  I just set my system up like this:
> >
> > $ ls -l /sys/fs/bpf
> > total 0
> > drwxr-xr-x. 3 luto luto 0 Aug  4 15:10 luto
> >
> > $ mkdir /sys/fs/bpf/luto/test
> >
> > $ ls -l /sys/fs/bpf/luto
> > total 0
> > drwxrwxr-x. 2 luto luto 0 Aug  4 15:10 test
> >
> > I bet that making the bpf() syscalls work appropriately in this
> > context without privilege would only be a couple of hours of work.
> > The hard work, creating bpffs and making it function, is already done
> > :)
> >
> > P.S. The docs for bpftool create are less than fantastic.  The
> > complete lack of any error message at all when the syscall returns
> > -EACCES is also not fantastic.
>
> This isn't remotely finished, but I spent a bit of time fiddling with this:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=bpf/perms
>
> What do you think?  (It's obviously not done.  It doesn't compile, and
> I haven't gotten to the permissions needed to do map operations.  I
> also haven't touched the capable() checks.)

I updated the branch.  It compiles, and basic map functionality works!

# mount -t bpf bpf /sys/fs/bpf
# cd /sys/fs/bpf
# mkdir luto
# chown luto: luto
# setpriv --euid=1000 --ruid=1000 bash
$ pwd
/sys/fs/bpf
bash-5.0$ ls -l
total 0
drwxr-xr-x 2 luto luto 0 Aug  4 22:41 luto
bash-5.0$ bpftool map create /sys/fs/bpf/luto/filename type hash key 8
value 8 entries 64 name mapname
bash-5.0$ bpftool map dump pinned /sys/fs/bpf/luto/filename
Found 0 elements

# chown root: /sys/fs/bpf/luto/filename

$ bpftool map dump pinned /sys/fs/bpf/luto/filename
Error: bpf obj get (/sys/fs/bpf/luto): Permission denied

So I think it's possible to get a respectable subset of bpf()
functionality working without privilege in short order :)

(FWIW, a decent fraction of this probably works even without my
patches, but it's going to have nonsensical semantics and may fail for
silly reasons.)

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Song Liu @ 2019-08-05  7:36 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Kees Cook, Networking, bpf, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Lorenz Bauer, Jann Horn, Greg KH, Linux API,
	LSM List
In-Reply-To: <CALCETrWtE2U4EvZVYeq8pSmQjBzF2PHH+KxYW8FSeF+W=1FYjw@mail.gmail.com>

Hi Andy, 

> On Aug 4, 2019, at 10:47 PM, Andy Lutomirski <luto@kernel.org> wrote:
> 
> On Sun, Aug 4, 2019 at 5:08 PM Andy Lutomirski <luto@kernel.org> wrote:
>> 
>> On Sun, Aug 4, 2019 at 3:16 PM Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>> On Fri, Aug 2, 2019 at 12:22 AM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> Hi Andy,
>>>> 
>>>>> I actually agree CAP_BPF_ADMIN makes sense. The hard part is to make
>>>>>> existing tools (setcap, getcap, etc.) and libraries aware of the new CAP.
>>>>> 
>>>>> It's been done before -- it's not that hard.  IMO the main tricky bit
>>>>> would be try be somewhat careful about defining exactly what
>>>>> CAP_BPF_ADMIN does.
>>>> 
>>>> Agreed. I think defining CAP_BPF_ADMIN could be a good topic for the
>>>> Plumbers conference.
>>>> 
>>>> OTOH, I don't think we have to wait for CAP_BPF_ADMIN to allow daemons
>>>> like systemd to do sys_bpf() without root.
>>> 
>>> I don't understand the use case here.  Are you talking about systemd
>>> --user?  As far as I know, a user is expected to be able to fully
>>> control their systemd --user process, so giving it unrestricted bpf
>>> access is very close to giving it superuser access, and this doesn't
>>> sound like a good idea.  I think that, if systemd --user needs bpf(),
>>> it either needs real unprivileged bpf() or it needs a privileged
>>> helper (SUID or a daemon) to intermediate this access.
>>> 
>>>> 
>>>>> 
>>>>>>> I don't see why you need to invent a whole new mechanism for this.
>>>>>>> The entire cgroup ecosystem outside bpf() does just fine using the
>>>>>>> write permission on files in cgroupfs to control access.  Why can't
>>>>>>> bpf() do the same thing?
>>>>>> 
>>>>>> It is easier to use write permission for BPF_PROG_ATTACH. But it is
>>>>>> not easy to do the same for other bpf commands: BPF_PROG_LOAD and
>>>>>> BPF_MAP_*. A lot of these commands don't have target concept. Maybe
>>>>>> we should have target concept for all these commands. But that is a
>>>>>> much bigger project. OTOH, "all or nothing" model allows all these
>>>>>> commands at once.
>>>>> 
>>>>> For BPF_PROG_LOAD, I admit I've never understood why permission is
>>>>> required at all.  I think that CAP_SYS_ADMIN or similar should be
>>>>> needed to get is_priv in the verifier, but I think that should mainly
>>>>> be useful for tracing, and that requires lots of privilege anyway.
>>>>> BPF_MAP_* is probably the trickiest part.  One solution would be some
>>>>> kind of bpffs, but I'm sure other solutions are possible.
>>>> 
>>>> Improving permission management of cgroup_bpf is another good topic to
>>>> discuss. However, it is also an overkill for current use case.
>>>> 
>>> 
>>> I looked at the code some more, and I don't think this is so hard
>>> after all.  As I understand it, all of the map..by_id stuff is, to
>>> some extent, deprecated in favor of persistent maps.  As I see it, the
>>> map..by_id calls should require privilege forever, although I can
>>> imagine ways to scope that privilege to a namespace if the maps
>>> themselves were to be scoped to a namespace.
>>> 
>>> Instead, unprivileged tools would use the persistent map interface
>>> roughly like this:
>>> 
>>> $ bpftool map create /sys/fs/bpf/my_dir/filename type hash key 8 value
>>> 8 entries 64 name mapname
>>> 
>>> This would require that the caller have either CAP_DAC_OVERRIDE or
>>> that the caller have permission to create files in /sys/fs/bpf/my_dir
>>> (using the same rules as for any filesystem), and the resulting map
>>> would end up owned by the creating user and have mode 0600 (or maybe
>>> 0666, or maybe a new bpf_attr parameter) modified by umask.  Then all
>>> the various capable() checks that are currently involved in accessing
>>> a persistent map would instead check FMODE_READ or FMODE_WRITE on the
>>> map file as appropriate.
>>> 
>>> Half of this stuff already works.  I just set my system up like this:
>>> 
>>> $ ls -l /sys/fs/bpf
>>> total 0
>>> drwxr-xr-x. 3 luto luto 0 Aug  4 15:10 luto
>>> 
>>> $ mkdir /sys/fs/bpf/luto/test
>>> 
>>> $ ls -l /sys/fs/bpf/luto
>>> total 0
>>> drwxrwxr-x. 2 luto luto 0 Aug  4 15:10 test
>>> 
>>> I bet that making the bpf() syscalls work appropriately in this
>>> context without privilege would only be a couple of hours of work.
>>> The hard work, creating bpffs and making it function, is already done
>>> :)
>>> 
>>> P.S. The docs for bpftool create are less than fantastic.  The
>>> complete lack of any error message at all when the syscall returns
>>> -EACCES is also not fantastic.
>> 
>> This isn't remotely finished, but I spent a bit of time fiddling with this:
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=bpf/perms
>> 
>> What do you think?  (It's obviously not done.  It doesn't compile, and
>> I haven't gotten to the permissions needed to do map operations.  I
>> also haven't touched the capable() checks.)
> 
> I updated the branch.  It compiles, and basic map functionality works!

Thanks a lot for trying this out. This is a very interesting direction
that we will explore. 

> 
> # mount -t bpf bpf /sys/fs/bpf
> # cd /sys/fs/bpf
> # mkdir luto
> # chown luto: luto
> # setpriv --euid=1000 --ruid=1000 bash
> $ pwd
> /sys/fs/bpf
> bash-5.0$ ls -l
> total 0
> drwxr-xr-x 2 luto luto 0 Aug  4 22:41 luto
> bash-5.0$ bpftool map create /sys/fs/bpf/luto/filename type hash key 8
> value 8 entries 64 name mapname
> bash-5.0$ bpftool map dump pinned /sys/fs/bpf/luto/filename
> Found 0 elements
> 
> # chown root: /sys/fs/bpf/luto/filename
> 
> $ bpftool map dump pinned /sys/fs/bpf/luto/filename
> Error: bpf obj get (/sys/fs/bpf/luto): Permission denied
> 
> So I think it's possible to get a respectable subset of bpf()
> functionality working without privilege in short order :)

I think we have two key questions to answer: 
  1. What subset of bpf() functionality will the users need?
  2. Who are the users? 

Different answers to these two questions lead to different directions.


In our use case, the answers are 
  1) almost all bpf() functionality
  2) highly trusted users (sudoers)

So our initial approach of /dev/bpf allows all bpf() functionality
in one bit in task_struct. (Yes, we can just sudo. But, we would 
rather not use sudo when possible.)


"cgroup management" use case may have answers like:
  1) cgroup_bpf only
  2) users in their own containers

For this case, getting cgroup_bpf related features (cgroup_bpf progs; 
some map types, etc.) work with unprivileged users would be the right 
direction. 


"USDT tracing" use case may have answers like:
  1) uprobe, stockmap, histogram, etc.
  2) unprivileged user, w/ or w/o containers

For this case, the first step is likely hacking sys_perf_event_open(). 


I guess we will need more discussions to decide how to make bpf() 
work better for all these (and more) use cases. 

Thanks,
Song

^ permalink raw reply

* Re: [PATCH v3 1/2] mm/page_idle: Add per-pid idle page tracking using virtual indexing
From: Minchan Kim @ 2019-08-05  7:55 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: linux-kernel, Alexey Dobriyan, Andrew Morton, Brendan Gregg,
	Christian Hansen, dancol, fmayer, joaodias, Jonathan Corbet,
	Kees Cook, kernel-team, linux-api, linux-doc, linux-fsdevel,
	linux-mm, Michal Hocko, Mike Rapoport, namhyung, Roman Gushchin,
	Stephen Rothwell, surenb, tkjos, Vladimir Davydov,
	Vlastimil Babka, wvw
In-Reply-To: <20190731171937.GA75376@google.com>

Hi Joel,

On Wed, Jul 31, 2019 at 01:19:37PM -0400, Joel Fernandes wrote:
> > > -static struct page *page_idle_get_page(unsigned long pfn)
> > > +static struct page *page_idle_get_page(struct page *page_in)
> > 
> > Looks weird function name after you changed the argument.
> > Maybe "bool check_valid_page(struct page *page)"?
> 
> 
> I don't think so, this function does a get_page_unless_zero() on the page as well.
> 
> > >  {
> > >  	struct page *page;
> > >  	pg_data_t *pgdat;
> > >  
> > > -	if (!pfn_valid(pfn))
> > > -		return NULL;
> > > -
> > > -	page = pfn_to_page(pfn);
> > > +	page = page_in;
> > >  	if (!page || !PageLRU(page) ||
> > >  	    !get_page_unless_zero(page))
> > >  		return NULL;
> > > @@ -51,6 +49,18 @@ static struct page *page_idle_get_page(unsigned long pfn)
> > >  	return page;
> > >  }
> > >  
> > > +/*
> > > + * This function tries to get a user memory page by pfn as described above.
> > > + */
> > > +static struct page *page_idle_get_page_pfn(unsigned long pfn)
> > 
> > So we could use page_idle_get_page name here.
> 
> 
> Based on above comment, I prefer to keep same name. Do you agree?

Yes, I agree. Just please add a comment about refcount in the description
on page_idle_get_page.

> 
> 
> > > +	return page_idle_get_page(pfn_to_page(pfn));
> > > +}
> > > +
> > >  static bool page_idle_clear_pte_refs_one(struct page *page,
> > >  					struct vm_area_struct *vma,
> > >  					unsigned long addr, void *arg)
> > > @@ -118,6 +128,47 @@ static void page_idle_clear_pte_refs(struct page *page)
> > >  		unlock_page(page);
> > >  }
> > >  
> > > +/* Helper to get the start and end frame given a pos and count */
> > > +static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
> > > +				unsigned long *start, unsigned long *end)
> > > +{
> > > +	unsigned long max_frame;
> > > +
> > > +	/* If an mm is not given, assume we want physical frames */
> > > +	max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
> > > +
> > > +	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
> > > +		return -EINVAL;
> > > +
> > > +	*start = pos * BITS_PER_BYTE;
> > > +	if (*start >= max_frame)
> > > +		return -ENXIO;
> > > +
> > > +	*end = *start + count * BITS_PER_BYTE;
> > > +	if (*end > max_frame)
> > > +		*end = max_frame;
> > > +	return 0;
> > > +}
> > > +
> > > +static bool page_really_idle(struct page *page)
> > 
> > Just minor:
> > Instead of creating new API, could we combine page_is_idle with
> > introducing furthere argument pte_check?
> 
> 
> I cannot see in the code where pte_check will be false when this is called? I
> could rename the function to page_idle_check_ptes() if that's Ok with you.

What I don't like is _*really*_ part of the funcion name.

I see several page_is_idle calls in huge_memory.c, migration.c, swap.c.
They could just check only page flag so they could use "false" with pte_check.

< snip >
 
> > > +ssize_t page_idle_proc_generic(struct file *file, char __user *ubuff,
> > > +			       size_t count, loff_t *pos,
> > > +			       struct task_struct *tsk, int write)
> > > +{
> > > +	int ret;
> > > +	char *buffer;
> > > +	u64 *out;
> > > +	unsigned long start_addr, end_addr, start_frame, end_frame;
> > > +	struct mm_struct *mm = file->private_data;
> > > +	struct mm_walk walk = { .pmd_entry = pte_page_idle_proc_range, };
> > > +	struct page_node *cur;
> > > +	struct page_idle_proc_priv priv;
> > > +	bool walk_error = false;
> > > +	LIST_HEAD(idle_page_list);
> > > +
> > > +	if (!mm || !mmget_not_zero(mm))
> > > +		return -EINVAL;
> > > +
> > > +	if (count > PAGE_SIZE)
> > > +		count = PAGE_SIZE;
> > > +
> > > +	buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);
> > > +	if (!buffer) {
> > > +		ret = -ENOMEM;
> > > +		goto out_mmput;
> > > +	}
> > > +	out = (u64 *)buffer;
> > > +
> > > +	if (write && copy_from_user(buffer, ubuff, count)) {
> > > +		ret = -EFAULT;
> > > +		goto out;
> > > +	}
> > > +
> > > +	ret = page_idle_get_frames(*pos, count, mm, &start_frame, &end_frame);
> > > +	if (ret)
> > > +		goto out;
> > > +
> > > +	start_addr = (start_frame << PAGE_SHIFT);
> > > +	end_addr = (end_frame << PAGE_SHIFT);
> > > +	priv.buffer = buffer;
> > > +	priv.start_addr = start_addr;
> > > +	priv.write = write;
> > > +
> > > +	priv.idle_page_list = &idle_page_list;
> > > +	priv.cur_page_node = 0;
> > > +	priv.page_nodes = kzalloc(sizeof(struct page_node) *
> > > +				  (end_frame - start_frame), GFP_KERNEL);
> > > +	if (!priv.page_nodes) {
> > > +		ret = -ENOMEM;
> > > +		goto out;
> > > +	}
> > > +
> > > +	walk.private = &priv;
> > > +	walk.mm = mm;
> > > +
> > > +	down_read(&mm->mmap_sem);
> > > +
> > > +	/*
> > > +	 * idle_page_list is needed because walk_page_vma() holds ptlock which
> > > +	 * deadlocks with page_idle_clear_pte_refs(). So we have to collect all
> > > +	 * pages first, and then call page_idle_clear_pte_refs().
> > > +	 */
> > 
> > Thanks for the comment, I was curious why you want to have
> > idle_page_list and the reason is here.
> > 
> > How about making this /proc/<pid>/page_idle per-process granuariy,
> > unlike system level /sys/xxx/page_idle? What I meant is not to check
> > rmap to see any reference from random process but just check only
> > access from the target process. It would be more proper as /proc/
> > <pid>/ interface and good for per-process tracking as well as
> > fast.
> 
> 
> I prefer not to do this for the following reasons:
> (1) It makes a feature lost, now accesses to shared pages will not be
> accounted properly. 

Do you really want to check global attribute by per-process interface?
That would be doable with existing idle page tracking feature and that's
the one of reasons page idle tracking was born(e.g. even, page cache
for non-mapped) unlike clear_refs.
Once we create a new interface by per-process, just checking the process
-granuariy access check sounds more reasonable to me.

With that, we could catch only idle pages of the target process even though
the page was touched by several other processes.
If the user want to know global level access point, they could use
exisint interface(If there is a concern(e.g., security) to use existing
idle page tracking, let's discuss it as other topic how we could make
existing feature more useful).

IOW, my point is that we already have global access check(1. from ptes
among several processes, 2. from page flag for non-mapped pages) feature
from from existing idle page tracking interface and now we are about to create
new interface for per-process wise so I wanted to create a particular
feature which cannot be covered by existing iterface.

> 
> (2) It makes it inconsistent with other idle page tracking mechanism. I

That's the my comment to create different idle page tracking we couldn't
do with existing interface.

> prefer if post per-process. At the heart of it, the tracking is always at the

What does it mean "post per-process"?

> physical page level -- I feel that is how it should be. Other drawback, is
> also we have to document this subtlety.

Sorry, Could you elaborate it a bit?

^ permalink raw reply

* Re: [PATCH v3 1/2] mm/page_idle: Add per-pid idle page tracking using virtual indexing
From: Joel Fernandes @ 2019-08-05 13:10 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-kernel, Alexey Dobriyan, Andrew Morton, Brendan Gregg,
	Christian Hansen, dancol, fmayer, joaodias, Jonathan Corbet,
	Kees Cook, kernel-team, linux-api, linux-doc, linux-fsdevel,
	linux-mm, Michal Hocko, Mike Rapoport, namhyung, Roman Gushchin,
	Stephen Rothwell, surenb, tkjos, Vladimir Davydov,
	Vlastimil Babka, wvw
In-Reply-To: <20190805075547.GA196934@google.com>

On Mon, Aug 05, 2019 at 04:55:47PM +0900, Minchan Kim wrote:
> Hi Joel,

Hi Minchan,

> On Wed, Jul 31, 2019 at 01:19:37PM -0400, Joel Fernandes wrote:
> > > > -static struct page *page_idle_get_page(unsigned long pfn)
> > > > +static struct page *page_idle_get_page(struct page *page_in)
> > > 
> > > Looks weird function name after you changed the argument.
> > > Maybe "bool check_valid_page(struct page *page)"?
> > 
> > 
> > I don't think so, this function does a get_page_unless_zero() on the page as well.
> > 
> > > >  {
> > > >  	struct page *page;
> > > >  	pg_data_t *pgdat;
> > > >  
> > > > -	if (!pfn_valid(pfn))
> > > > -		return NULL;
> > > > -
> > > > -	page = pfn_to_page(pfn);
> > > > +	page = page_in;
> > > >  	if (!page || !PageLRU(page) ||
> > > >  	    !get_page_unless_zero(page))
> > > >  		return NULL;
> > > > @@ -51,6 +49,18 @@ static struct page *page_idle_get_page(unsigned long pfn)
> > > >  	return page;
> > > >  }
> > > >  
> > > > +/*
> > > > + * This function tries to get a user memory page by pfn as described above.
> > > > + */
> > > > +static struct page *page_idle_get_page_pfn(unsigned long pfn)
> > > 
> > > So we could use page_idle_get_page name here.
> > 
> > 
> > Based on above comment, I prefer to keep same name. Do you agree?
> 
> Yes, I agree. Just please add a comment about refcount in the description
> on page_idle_get_page.

Ok.


> > > > +	return page_idle_get_page(pfn_to_page(pfn));
> > > > +}
> > > > +
> > > >  static bool page_idle_clear_pte_refs_one(struct page *page,
> > > >  					struct vm_area_struct *vma,
> > > >  					unsigned long addr, void *arg)
> > > > @@ -118,6 +128,47 @@ static void page_idle_clear_pte_refs(struct page *page)
> > > >  		unlock_page(page);
> > > >  }
> > > >  
> > > > +/* Helper to get the start and end frame given a pos and count */
> > > > +static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
> > > > +				unsigned long *start, unsigned long *end)
> > > > +{
> > > > +	unsigned long max_frame;
> > > > +
> > > > +	/* If an mm is not given, assume we want physical frames */
> > > > +	max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
> > > > +
> > > > +	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
> > > > +		return -EINVAL;
> > > > +
> > > > +	*start = pos * BITS_PER_BYTE;
> > > > +	if (*start >= max_frame)
> > > > +		return -ENXIO;
> > > > +
> > > > +	*end = *start + count * BITS_PER_BYTE;
> > > > +	if (*end > max_frame)
> > > > +		*end = max_frame;
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static bool page_really_idle(struct page *page)
> > > 
> > > Just minor:
> > > Instead of creating new API, could we combine page_is_idle with
> > > introducing furthere argument pte_check?
> > 
> > 
> > I cannot see in the code where pte_check will be false when this is called? I
> > could rename the function to page_idle_check_ptes() if that's Ok with you.
> 
> What I don't like is _*really*_ part of the funcion name.
> 
> I see several page_is_idle calls in huge_memory.c, migration.c, swap.c.
> They could just check only page flag so they could use "false" with pte_check.

I will rename it to page_idle_check_ptes(). If you want pte_check argument,
that can be a later patch if/when there are other users for it in other
files. Hope that's reasonable.


> > > > +ssize_t page_idle_proc_generic(struct file *file, char __user *ubuff,
> > > > +			       size_t count, loff_t *pos,
> > > > +			       struct task_struct *tsk, int write)
> > > > +{
> > > > +	int ret;
> > > > +	char *buffer;
> > > > +	u64 *out;
> > > > +	unsigned long start_addr, end_addr, start_frame, end_frame;
> > > > +	struct mm_struct *mm = file->private_data;
> > > > +	struct mm_walk walk = { .pmd_entry = pte_page_idle_proc_range, };
> > > > +	struct page_node *cur;
> > > > +	struct page_idle_proc_priv priv;
> > > > +	bool walk_error = false;
> > > > +	LIST_HEAD(idle_page_list);
> > > > +
> > > > +	if (!mm || !mmget_not_zero(mm))
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (count > PAGE_SIZE)
> > > > +		count = PAGE_SIZE;
> > > > +
> > > > +	buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);
> > > > +	if (!buffer) {
> > > > +		ret = -ENOMEM;
> > > > +		goto out_mmput;
> > > > +	}
> > > > +	out = (u64 *)buffer;
> > > > +
> > > > +	if (write && copy_from_user(buffer, ubuff, count)) {
> > > > +		ret = -EFAULT;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	ret = page_idle_get_frames(*pos, count, mm, &start_frame, &end_frame);
> > > > +	if (ret)
> > > > +		goto out;
> > > > +
> > > > +	start_addr = (start_frame << PAGE_SHIFT);
> > > > +	end_addr = (end_frame << PAGE_SHIFT);
> > > > +	priv.buffer = buffer;
> > > > +	priv.start_addr = start_addr;
> > > > +	priv.write = write;
> > > > +
> > > > +	priv.idle_page_list = &idle_page_list;
> > > > +	priv.cur_page_node = 0;
> > > > +	priv.page_nodes = kzalloc(sizeof(struct page_node) *
> > > > +				  (end_frame - start_frame), GFP_KERNEL);
> > > > +	if (!priv.page_nodes) {
> > > > +		ret = -ENOMEM;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	walk.private = &priv;
> > > > +	walk.mm = mm;
> > > > +
> > > > +	down_read(&mm->mmap_sem);
> > > > +
> > > > +	/*
> > > > +	 * idle_page_list is needed because walk_page_vma() holds ptlock which
> > > > +	 * deadlocks with page_idle_clear_pte_refs(). So we have to collect all
> > > > +	 * pages first, and then call page_idle_clear_pte_refs().
> > > > +	 */
> > > 
> > > Thanks for the comment, I was curious why you want to have
> > > idle_page_list and the reason is here.
> > > 
> > > How about making this /proc/<pid>/page_idle per-process granuariy,
> > > unlike system level /sys/xxx/page_idle? What I meant is not to check
> > > rmap to see any reference from random process but just check only
> > > access from the target process. It would be more proper as /proc/
> > > <pid>/ interface and good for per-process tracking as well as
> > > fast.
> > 
> > 
> > I prefer not to do this for the following reasons:
> > (1) It makes a feature lost, now accesses to shared pages will not be
> > accounted properly. 
> 
> Do you really want to check global attribute by per-process interface?

Pages are inherrently not per-process, they are global. A page does not
necessarily belong to a process. An anonymous page can be shared. We are
operating on pages in the end of the day.

I think you are confusing the per-process file interface with the core
mechanism. The core mechanism always operations on physical PAGES.


> That would be doable with existing idle page tracking feature and that's
> the one of reasons page idle tracking was born(e.g. even, page cache
> for non-mapped) unlike clear_refs.

I think you are misunderstanding the patch, the patch does not want to change
the core mechanism. That is a bit out of scope for the patch. Page
idle-tracking at the core of it looks at PTE of all processes. We are just
using the VFN (virtual frame) interface to skip the need for separate pagemap
look up -- that's it.


> Once we create a new interface by per-process, just checking the process
> -granuariy access check sounds more reasonable to me.

It sounds reasonable but there is no reason to not do the full and proper
page tracking for now, including shared pages. Otherwise it makes it
inconsistent with the existing mechanism and can confuse the user about what
to expect (especially for shared pages).


> With that, we could catch only idle pages of the target process even though
> the page was touched by several other processes.
> If the user want to know global level access point, they could use
> exisint interface(If there is a concern(e.g., security) to use existing
> idle page tracking, let's discuss it as other topic how we could make
> existing feature more useful).
> 
> IOW, my point is that we already have global access check(1. from ptes
> among several processes, 2. from page flag for non-mapped pages) feature
> from from existing idle page tracking interface and now we are about to create
> new interface for per-process wise so I wanted to create a particular
> feature which cannot be covered by existing iterface.

Yes, it sounds like you want to create a different feature. Then that can be
a follow-up different patch, and that is out of scope for this patch.


> > (2) It makes it inconsistent with other idle page tracking mechanism. I
> 
> That's the my comment to create different idle page tracking we couldn't
> do with existing interface.

Yes, sure. But that can be a different patch and we can weigh the benefits of
it at that time. I don't want to introduce a new page tracking mechanism, I
am just trying to reuse the existing one.


> > prefer if post per-process. At the heart of it, the tracking is always at the
> 
> What does it mean "post per-process"?

Sorry it was a typo, I meant "the core mechanism should not be a per-process
one, but a global one". We are just changing the interface in this patch, we
are not changing the existing core mechanism. That gives us all the benefits
of the existing code such as non-interference with page reclaim code, without
introducing any new bugs. By the way I did fix a bug in the existing original
code as well!


> > physical page level -- I feel that is how it should be. Other drawback, is
> > also we have to document this subtlety.
> 
> Sorry, Could you elaborate it a bit?

I meant, with a new mechanism as the one you are proposing, we have to
document that now shared pages will not be tracked properly. That is a
'subtle difference' and will have to be documented appropriated in the
'internals' section of the idle page tracking document.

thanks,

 - Joel

^ permalink raw reply

* Re: [PATCH v4 04/12] fpga: dfl: afu: add userclock sysfs interfaces.
From: Greg KH @ 2019-08-05 15:51 UTC (permalink / raw)
  To: Wu Hao
  Cc: mdf, linux-fpga, linux-kernel, linux-api, linux-doc, atull,
	Ananda Ravuri, Russ Weight, Xu Yilun
In-Reply-To: <1564914022-3710-5-git-send-email-hao.wu@intel.com>

On Sun, Aug 04, 2019 at 06:20:14PM +0800, Wu Hao wrote:
> This patch introduces userclock sysfs interfaces for AFU, user
> could use these interfaces for clock setting to AFU.
> 
> Please note that, this is only working for port header feature
> with revision 0, for later revisions, userclock setting is moved
> to a separated private feature, so one revision sysfs interface
> is exposed to userspace application for this purpose too.
> 
> Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
> Signed-off-by: Russ Weight <russell.h.weight@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@intel.com>
> Signed-off-by: Wu Hao <hao.wu@intel.com>
> Acked-by: Alan Tull <atull@kernel.org>
> Signed-off-by: Moritz Fischer <mdf@kernel.org>
> ---
> v2: rebased, and switched to use device_add/remove_groups for sysfs
> v3: update kernel version and date in sysfs doc
> v4: rebased.
> ---
>  Documentation/ABI/testing/sysfs-platform-dfl-port |  35 +++++++
>  drivers/fpga/dfl-afu-main.c                       | 114 +++++++++++++++++++++-
>  drivers/fpga/dfl.h                                |   9 ++
>  3 files changed, 157 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-port b/Documentation/ABI/testing/sysfs-platform-dfl-port
> index 1ab3e6f..5663441 100644
> --- a/Documentation/ABI/testing/sysfs-platform-dfl-port
> +++ b/Documentation/ABI/testing/sysfs-platform-dfl-port
> @@ -46,3 +46,38 @@ Contact:	Wu Hao <hao.wu@intel.com>
>  Description:	Read-write. Read or set AFU latency tolerance reporting value.
>  		Set ltr to 1 if the AFU can tolerate latency >= 40us or set it
>  		to 0 if it is latency sensitive.
> +
> +What:		/sys/bus/platform/devices/dfl-port.0/revision
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Read-only. Read this file to get the revision of port header
> +		feature.

What does "revision" mean?

It feels like you are creating a different set of sysfs files depending
on the revision field.  Which is fine, sysfs is one-value-per-file and
userspace needs to handle if the file is present or not.  So why not
just rely on that and not have to mess with 'revision' at all?  What is
userspace going to do with that information?

> +
> +What:		/sys/bus/platform/devices/dfl-port.0/userclk_freqcmd
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Write-only. User writes command to this interface to set
> +		userclock to AFU.
> +
> +What:		/sys/bus/platform/devices/dfl-port.0/userclk_freqsts
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Read-only. Read this file to get the status of issued command
> +		to userclck_freqcmd.
> +
> +What:		/sys/bus/platform/devices/dfl-port.0/userclk_freqcntrcmd
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Write-only. User writes command to this interface to set
> +		userclock counter.
> +
> +What:		/sys/bus/platform/devices/dfl-port.0/userclk_freqcntrsts
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Read-only. Read this file to get the status of issued command
> +		to userclck_freqcntrcmd.
> diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
> index 12175bb..407c97d 100644
> --- a/drivers/fpga/dfl-afu-main.c
> +++ b/drivers/fpga/dfl-afu-main.c
> @@ -142,6 +142,17 @@ static int port_get_id(struct platform_device *pdev)
>  static DEVICE_ATTR_RO(id);
>  
>  static ssize_t
> +revision_show(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> +	void __iomem *base;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
> +
> +	return sprintf(buf, "%x\n", dfl_feature_revision(base));
> +}
> +static DEVICE_ATTR_RO(revision);
> +
> +static ssize_t
>  ltr_show(struct device *dev, struct device_attribute *attr, char *buf)
>  {
>  	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> @@ -276,6 +287,7 @@ static int port_get_id(struct platform_device *pdev)
>  
>  static struct attribute *port_hdr_attrs[] = {
>  	&dev_attr_id.attr,
> +	&dev_attr_revision.attr,
>  	&dev_attr_ltr.attr,
>  	&dev_attr_ap1_event.attr,
>  	&dev_attr_ap2_event.attr,
> @@ -284,14 +296,113 @@ static int port_get_id(struct platform_device *pdev)
>  };
>  ATTRIBUTE_GROUPS(port_hdr);
>  
> +static ssize_t
> +userclk_freqcmd_store(struct device *dev, struct device_attribute *attr,
> +		      const char *buf, size_t count)
> +{
> +	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> +	u64 userclk_freq_cmd;
> +	void __iomem *base;
> +
> +	if (kstrtou64(buf, 0, &userclk_freq_cmd))
> +		return -EINVAL;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
> +
> +	mutex_lock(&pdata->lock);
> +	writeq(userclk_freq_cmd, base + PORT_HDR_USRCLK_CMD0);
> +	mutex_unlock(&pdata->lock);
> +
> +	return count;
> +}
> +static DEVICE_ATTR_WO(userclk_freqcmd);
> +
> +static ssize_t
> +userclk_freqcntrcmd_store(struct device *dev, struct device_attribute *attr,
> +			  const char *buf, size_t count)
> +{
> +	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> +	u64 userclk_freqcntr_cmd;
> +	void __iomem *base;
> +
> +	if (kstrtou64(buf, 0, &userclk_freqcntr_cmd))
> +		return -EINVAL;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
> +
> +	mutex_lock(&pdata->lock);
> +	writeq(userclk_freqcntr_cmd, base + PORT_HDR_USRCLK_CMD1);
> +	mutex_unlock(&pdata->lock);
> +
> +	return count;
> +}
> +static DEVICE_ATTR_WO(userclk_freqcntrcmd);
> +
> +static ssize_t
> +userclk_freqsts_show(struct device *dev, struct device_attribute *attr,
> +		     char *buf)
> +{
> +	u64 userclk_freqsts;
> +	void __iomem *base;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
> +
> +	userclk_freqsts = readq(base + PORT_HDR_USRCLK_STS0);
> +
> +	return sprintf(buf, "0x%llx\n", (unsigned long long)userclk_freqsts);
> +}
> +static DEVICE_ATTR_RO(userclk_freqsts);
> +
> +static ssize_t
> +userclk_freqcntrsts_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)
> +{
> +	u64 userclk_freqcntrsts;
> +	void __iomem *base;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
> +
> +	userclk_freqcntrsts = readq(base + PORT_HDR_USRCLK_STS1);
> +
> +	return sprintf(buf, "0x%llx\n",
> +		       (unsigned long long)userclk_freqcntrsts);
> +}
> +static DEVICE_ATTR_RO(userclk_freqcntrsts);
> +
> +static struct attribute *port_hdr_userclk_attrs[] = {
> +	&dev_attr_userclk_freqcmd.attr,
> +	&dev_attr_userclk_freqcntrcmd.attr,
> +	&dev_attr_userclk_freqsts.attr,
> +	&dev_attr_userclk_freqcntrsts.attr,
> +	NULL,
> +};
> +ATTRIBUTE_GROUPS(port_hdr_userclk);
> +
>  static int port_hdr_init(struct platform_device *pdev,
>  			 struct dfl_feature *feature)
>  {
> +	int ret;
> +
>  	dev_dbg(&pdev->dev, "PORT HDR Init.\n");
>  
>  	port_reset(pdev);
>  
> -	return device_add_groups(&pdev->dev, port_hdr_groups);
> +	ret = device_add_groups(&pdev->dev, port_hdr_groups);

This all needs to be reworked based on the ability for devices to
properly add groups when they are bound on probe (the core does it for
you, no need for the driver to do it.)  But until then, you should at
least consider:

> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * if revision > 0, the userclock will be moved from port hdr register
> +	 * region to a separated private feature.
> +	 */
> +	if (dfl_feature_revision(feature->ioaddr) > 0)
> +		return 0;
> +
> +	ret = device_add_groups(&pdev->dev, port_hdr_userclk_groups);
> +	if (ret)
> +		device_remove_groups(&pdev->dev, port_hdr_groups);

struct attribute_group has is_visible() as a callback to have the core
show or not show, individual attributes when they are created.  So no
need for a second group of attributes and you needing to add/remove
them, just add them all and let the callback handle the "is visible"
logic.  Makes cleanup _so_ much easier (i.e. you don't have to do it.)

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH v4 06/12] fpga: dfl: afu: export __port_enable/disable function.
From: Greg KH @ 2019-08-05 15:52 UTC (permalink / raw)
  To: Wu Hao; +Cc: mdf, linux-fpga, linux-kernel, linux-api, linux-doc, atull,
	Xu Yilun
In-Reply-To: <1564914022-3710-7-git-send-email-hao.wu@intel.com>

On Sun, Aug 04, 2019 at 06:20:16PM +0800, Wu Hao wrote:
> As these two functions are used by other private features. e.g.
> in error reporting private feature, it requires to check port status
> and reset port for error clearing.
> 
> Signed-off-by: Xu Yilun <yilun.xu@intel.com>
> Signed-off-by: Wu Hao <hao.wu@intel.com>
> Acked-by: Moritz Fischer <mdf@kernel.org>
> Acked-by: Alan Tull <atull@kernel.org>
> Signed-off-by: Moritz Fischer <mdf@kernel.org>
> ---
> v2: rebased
> ---
>  drivers/fpga/dfl-afu-main.c | 25 ++++++++++++++-----------
>  drivers/fpga/dfl-afu.h      |  3 +++
>  2 files changed, 17 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
> index e013afb..e312179 100644
> --- a/drivers/fpga/dfl-afu-main.c
> +++ b/drivers/fpga/dfl-afu-main.c
> @@ -22,14 +22,16 @@
>  #include "dfl-afu.h"
>  
>  /**
> - * port_enable - enable a port
> + * __port_enable - enable a port
>   * @pdev: port platform device.
>   *
>   * Enable Port by clear the port soft reset bit, which is set by default.
>   * The AFU is unable to respond to any MMIO access while in reset.
> - * port_enable function should only be used after port_disable function.
> + * __port_enable function should only be used after __port_disable function.
> + *
> + * The caller needs to hold lock for protection.
>   */
> -static void port_enable(struct platform_device *pdev)
> +void __port_enable(struct platform_device *pdev)

worst global function name ever.

Don't polute the global namespace like this for a single driver.  If you
REALLY need it, then use a prefix that shows it is your individual
dfl_special_sauce_platform_device_only type thing.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH v4 07/12] fpga: dfl: afu: add error reporting support.
From: Greg KH @ 2019-08-05 15:54 UTC (permalink / raw)
  To: Wu Hao; +Cc: mdf, linux-fpga, linux-kernel, linux-api, linux-doc, atull,
	Xu Yilun
In-Reply-To: <1564914022-3710-8-git-send-email-hao.wu@intel.com>

On Sun, Aug 04, 2019 at 06:20:17PM +0800, Wu Hao wrote:
> Error reporting is one important private feature, it reports error
> detected on port and accelerated function unit (AFU). It introduces
> several sysfs interfaces to allow userspace to check and clear
> errors detected by hardware.
> 
> Signed-off-by: Xu Yilun <yilun.xu@intel.com>
> Signed-off-by: Wu Hao <hao.wu@intel.com>
> Acked-by: Alan Tull <atull@kernel.org>
> Signed-off-by: Moritz Fischer <mdf@kernel.org>
> ---
> v2: switch to device_add/remove_group for sysfs.
> v3: update kernel version and date in sysfs doc
> v4: remove dev_dbg in init/uinit callback function.
> ---
>  Documentation/ABI/testing/sysfs-platform-dfl-port |  39 ++++
>  drivers/fpga/Makefile                             |   1 +
>  drivers/fpga/dfl-afu-error.c                      | 221 ++++++++++++++++++++++
>  drivers/fpga/dfl-afu-main.c                       |   4 +
>  drivers/fpga/dfl-afu.h                            |   4 +
>  5 files changed, 269 insertions(+)
>  create mode 100644 drivers/fpga/dfl-afu-error.c
> 
> diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-port b/Documentation/ABI/testing/sysfs-platform-dfl-port
> index 5663441..3b6580b 100644
> --- a/Documentation/ABI/testing/sysfs-platform-dfl-port
> +++ b/Documentation/ABI/testing/sysfs-platform-dfl-port
> @@ -81,3 +81,42 @@ KernelVersion:	5.4
>  Contact:	Wu Hao <hao.wu@intel.com>
>  Description:	Read-only. Read this file to get the status of issued command
>  		to userclck_freqcntrcmd.
> +
> +What:		/sys/bus/platform/devices/dfl-port.0/errors/revision
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Read-only. Read this file to get the revision of this error
> +		reporting private feature.

Same revision question here that I had on an earlier patch.


> +
> +What:		/sys/bus/platform/devices/dfl-port.0/errors/errors
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Read-only. Read this file to get errors detected on port and
> +		Accelerated Function Unit (AFU).
> +
> +What:		/sys/bus/platform/devices/dfl-port.0/errors/first_error
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Read-only. Read this file to get the first error detected by
> +		hardware.
> +
> +What:		/sys/bus/platform/devices/dfl-port.0/errors/first_malformed_req
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Read-only. Read this file to get the first malformed request
> +		captured by hardware.
> +
> +What:		/sys/bus/platform/devices/dfl-port.0/errors/clear
> +Date:		August 2019
> +KernelVersion:	5.4
> +Contact:	Wu Hao <hao.wu@intel.com>
> +Description:	Write-only. Write error code to this file to clear errors.
> +		Write fails with -EINVAL if input parsing fails or input error
> +		code doesn't match.
> +		Write fails with -EBUSY or -ETIMEDOUT if error can't be cleared
> +		as hardware is in low power state (-EBUSY) or not responding
> +		(-ETIMEDOUT).
> diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> index 312b937..7255891 100644
> --- a/drivers/fpga/Makefile
> +++ b/drivers/fpga/Makefile
> @@ -41,6 +41,7 @@ obj-$(CONFIG_FPGA_DFL_AFU)		+= dfl-afu.o
>  
>  dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
>  dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
> +dfl-afu-objs += dfl-afu-error.o
>  
>  # Drivers for FPGAs which implement DFL
>  obj-$(CONFIG_FPGA_DFL_PCI)		+= dfl-pci.o
> diff --git a/drivers/fpga/dfl-afu-error.c b/drivers/fpga/dfl-afu-error.c
> new file mode 100644
> index 0000000..c5e0efa
> --- /dev/null
> +++ b/drivers/fpga/dfl-afu-error.c
> @@ -0,0 +1,221 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Driver for FPGA Accelerated Function Unit (AFU) Error Reporting
> + *
> + * Copyright 2019 Intel Corporation, Inc.
> + *
> + * Authors:
> + *   Wu Hao <hao.wu@linux.intel.com>
> + *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
> + *   Joseph Grecco <joe.grecco@intel.com>
> + *   Enno Luebbers <enno.luebbers@intel.com>
> + *   Tim Whisonant <tim.whisonant@intel.com>
> + *   Ananda Ravuri <ananda.ravuri@intel.com>
> + *   Mitchel Henry <henry.mitchel@intel.com>
> + */
> +
> +#include <linux/uaccess.h>
> +
> +#include "dfl-afu.h"
> +
> +#define PORT_ERROR_MASK		0x8
> +#define PORT_ERROR		0x10
> +#define PORT_FIRST_ERROR	0x18
> +#define PORT_MALFORMED_REQ0	0x20
> +#define PORT_MALFORMED_REQ1	0x28
> +
> +#define ERROR_MASK		GENMASK_ULL(63, 0)
> +
> +/* mask or unmask port errors by the error mask register. */
> +static void __port_err_mask(struct device *dev, bool mask)
> +{
> +	void __iomem *base;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> +	writeq(mask ? ERROR_MASK : 0, base + PORT_ERROR_MASK);
> +}
> +
> +/* clear port errors. */
> +static int __port_err_clear(struct device *dev, u64 err)
> +{
> +	struct platform_device *pdev = to_platform_device(dev);
> +	void __iomem *base_err, *base_hdr;
> +	int ret;
> +	u64 v;
> +
> +	base_err = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +	base_hdr = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
> +
> +	/*
> +	 * clear Port Errors
> +	 *
> +	 * - Check for AP6 State
> +	 * - Halt Port by keeping Port in reset
> +	 * - Set PORT Error mask to all 1 to mask errors
> +	 * - Clear all errors
> +	 * - Set Port mask to all 0 to enable errors
> +	 * - All errors start capturing new errors
> +	 * - Enable Port by pulling the port out of reset
> +	 */
> +
> +	/* if device is still in AP6 power state, can not clear any error. */
> +	v = readq(base_hdr + PORT_HDR_STS);
> +	if (FIELD_GET(PORT_STS_PWR_STATE, v) == PORT_STS_PWR_STATE_AP6) {
> +		dev_err(dev, "Could not clear errors, device in AP6 state.\n");
> +		return -EBUSY;
> +	}
> +
> +	/* Halt Port by keeping Port in reset */
> +	ret = __port_disable(pdev);
> +	if (ret)
> +		return ret;
> +
> +	/* Mask all errors */
> +	__port_err_mask(dev, true);
> +
> +	/* Clear errors if err input matches with current port errors.*/
> +	v = readq(base_err + PORT_ERROR);
> +
> +	if (v == err) {
> +		writeq(v, base_err + PORT_ERROR);
> +
> +		v = readq(base_err + PORT_FIRST_ERROR);
> +		writeq(v, base_err + PORT_FIRST_ERROR);
> +	} else {
> +		ret = -EINVAL;
> +	}
> +
> +	/* Clear mask */
> +	__port_err_mask(dev, false);
> +
> +	/* Enable the Port by clear the reset */
> +	__port_enable(pdev);
> +
> +	return ret;
> +}
> +
> +static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
> +			     char *buf)
> +{
> +	void __iomem *base;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> +	return sprintf(buf, "%u\n", dfl_feature_revision(base));
> +}
> +static DEVICE_ATTR_RO(revision);
> +
> +static ssize_t errors_show(struct device *dev, struct device_attribute *attr,
> +			   char *buf)
> +{
> +	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> +	void __iomem *base;
> +	u64 error;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> +	mutex_lock(&pdata->lock);
> +	error = readq(base + PORT_ERROR);
> +	mutex_unlock(&pdata->lock);
> +
> +	return sprintf(buf, "0x%llx\n", (unsigned long long)error);
> +}
> +static DEVICE_ATTR_RO(errors);
> +
> +static ssize_t first_error_show(struct device *dev,
> +				struct device_attribute *attr, char *buf)
> +{
> +	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> +	void __iomem *base;
> +	u64 error;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> +	mutex_lock(&pdata->lock);
> +	error = readq(base + PORT_FIRST_ERROR);
> +	mutex_unlock(&pdata->lock);
> +
> +	return sprintf(buf, "0x%llx\n", (unsigned long long)error);
> +}
> +static DEVICE_ATTR_RO(first_error);
> +
> +static ssize_t first_malformed_req_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> +	void __iomem *base;
> +	u64 req0, req1;
> +
> +	base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> +	mutex_lock(&pdata->lock);
> +	req0 = readq(base + PORT_MALFORMED_REQ0);
> +	req1 = readq(base + PORT_MALFORMED_REQ1);
> +	mutex_unlock(&pdata->lock);
> +
> +	return sprintf(buf, "0x%016llx%016llx\n",
> +		       (unsigned long long)req1, (unsigned long long)req0);
> +}
> +static DEVICE_ATTR_RO(first_malformed_req);
> +
> +static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
> +			   const char *buff, size_t count)
> +{
> +	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> +	u64 value;
> +	int ret;
> +
> +	if (kstrtou64(buff, 0, &value))
> +		return -EINVAL;
> +
> +	mutex_lock(&pdata->lock);
> +	ret = __port_err_clear(dev, value);
> +	mutex_unlock(&pdata->lock);
> +
> +	return ret ? ret : count;
> +}
> +static DEVICE_ATTR_WO(clear);
> +
> +static struct attribute *port_err_attrs[] = {
> +	&dev_attr_revision.attr,
> +	&dev_attr_errors.attr,
> +	&dev_attr_first_error.attr,
> +	&dev_attr_first_malformed_req.attr,
> +	&dev_attr_clear.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group port_err_attr_group = {
> +	.attrs = port_err_attrs,
> +	.name = "errors",
> +};
> +
> +static int port_err_init(struct platform_device *pdev,
> +			 struct dfl_feature *feature)
> +{
> +	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
> +
> +	mutex_lock(&pdata->lock);
> +	__port_err_mask(&pdev->dev, false);
> +	mutex_unlock(&pdata->lock);

Locking one data structure and then modifying another one is up there
with "things never to do in the kernel unless you want a huge
headache!".

> +
> +	return device_add_group(&pdev->dev, &port_err_attr_group);

You raced userspace and lost :(

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH v4 11/12] fpga: dfl: fme: add global error reporting support
From: Greg KH @ 2019-08-05 15:56 UTC (permalink / raw)
  To: Wu Hao
  Cc: mdf, linux-fpga, linux-kernel, linux-api, linux-doc, atull,
	Luwei Kang, Ananda Ravuri, Xu Yilun
In-Reply-To: <1564914022-3710-12-git-send-email-hao.wu@intel.com>

On Sun, Aug 04, 2019 at 06:20:21PM +0800, Wu Hao wrote:
> +static int fme_global_err_init(struct platform_device *pdev,
> +			       struct dfl_feature *feature)
> +{
> +	struct device *dev;
> +	int ret = 0;
> +
> +	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> +	if (!dev)
> +		return -ENOMEM;
> +
> +	dev->parent = &pdev->dev;
> +	dev->release = err_dev_release;
> +	dev_set_name(dev, "errors");
> +
> +	fme_error_enable(feature);
> +
> +	ret = device_register(dev);
> +	if (ret) {
> +		put_device(dev);
> +		return ret;
> +	}
> +
> +	ret = device_add_groups(dev, error_groups);

cute, but no, you do not create a whole struct device for a subdir.  Use
the attribute group name like you did on earlier patches.

And again, you raced userspace and lost :(

thanks,

greg k-h

^ permalink raw reply

* [PATCH v8 00/20] fscrypt: key management improvements
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley

Hello,

[Note: I'd like to apply this for v5.4.  Additional review is greatly
 appreciated, especially of the API before it's set in stone.  Thanks!]

This patchset makes major improvements to how keys are added, removed,
and derived in fscrypt, aka ext4/f2fs/ubifs encryption.  It does this by
adding new ioctls that add and remove encryption keys directly to/from
the filesystem, and by adding a new encryption policy version ("v2")
where the user-provided keys are only used as input to HKDF-SHA512 and
are identified by their cryptographic hash.

All new APIs and all cryptosystem changes are documented in
Documentation/filesystems/fscrypt.rst.  Userspace can use the new key
management ioctls with existing encrypted directories, but migrating to
v2 encryption policies is needed for the full benefits.

These changes solve four interrelated problems:

(1) Providing fscrypt keys via process-subscribed keyrings is abusing
    encryption as an OS-level access control mechanism, causing many
    bugs where processes don't get access to the keys they need -- e.g.,
    when a 'sudo' command or a system service needs to access encrypted
    files.  It's also inconsistent with the filesystem/VFS "view" of
    encrypted files which is global, so sometimes things randomly happen
    to work anyway due to caching.  Regardless, currently almost all
    fscrypt users actually do need global keys, so they're having to use
    workarounds that heavily abuse the session or user keyrings, e.g.
    Android and Chromium OS both use a systemwide "session keyring" and
    the 'fscrypt' tool links all user keyrings into root's user keyring.

(2) Currently there's no way to securely and efficiently remove a
    fscrypt key such that not only is the original key wiped, but also
    all files and directories protected by that key are "locked" and
    their per-file keys wiped.  Many users want this and are using
    'echo 2 > /proc/sys/vm/drop_caches' as a workaround, but this is
    root-only, and also is overkill so can be a performance disaster.

(3) The key derivation function (KDF) that fscrypt uses to derive
    per-file keys is nonstandard, inflexible, and has some weaknesses
    such as being reversible and not evenly distributing the entropy
    from the user-provided keys.

(4) fscrypt doesn't check that the correct key was supplied.  This can
    be a security vulnerability, since it allows malicious local users
    to associate the wrong key with files to which they have read-only
    access, causing other users' processes to read/write the wrong data.

Ultimately, the solutions to these problems all tie into each other.  By
adding a filesystem-level encryption keyring with ioctls to add/remove
keys to/from it, the keys are made usable filesystem-wide (solves
problem #1).  It also becomes easy to track the inodes that were
"unlocked" with each key, so they can be evicted when the key is removed
(solves problem #2).  Moreover, the filesystem-level keyring is a
natural place to store an HMAC transform keyed by each key, thus making
it easy and efficient to switch the KDF to HKDF (solves problem #3).

Finally, to check that the correct key was supplied, I use HKDF to
derive a cryptographically secure key_identifier for each key (solves
problem #4).  This in combination with key quotas and other careful
precautions also makes it safe to allow non-root users to add and remove
keys to/from the filesystem-level keyring.  Thus, all problems are
solved without having to restrict the fscrypt API to root only.

The patchset is organized as follows:

- Patches 1-8 create a dedicated UAPI header for fscrypt and do various
  refactoring and cleanups in preparation for the later patches.

- Patches 9-11 add new ioctls FS_IOC_ADD_ENCRYPTION_KEY,
  FS_IOC_REMOVE_ENCRYPTION_KEY, and FS_IOC_GET_ENCRYPTION_KEY_STATUS.
  Adding a key logically "unlocks" all files on the filesystem that are
  protected by that key; removing a key "locks" them again.

- Patches 12-16 add support for v2 encryption policies.

- Patches 17-19 wire up the new ioctls to ext4, f2fs, and ubifs.

- Patch 20 updates the fscrypt documentation for all the changes.

This patchset applies to v5.3-rc3 with the pending fscrypt cleanup
patches applied (https://patchwork.kernel.org/patch/11057589/ and
https://patchwork.kernel.org/cover/11057583/).
You can also get it from git at:

	Repository:   https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git
	Branch:       fscrypt-key-mgmt-improvements-v8

I've written xfstests for the new APIs.  They test the APIs themselves
as well as verify the correctness of the ciphertext stored on-disk for
v2 encryption policies.  The tests can be found at:

	Repository:   https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/xfstests-dev.git
	Branch:       fscrypt-key-mgmt-improvements

The xfstests depend on new xfs_io commands which can be found at:

	Repository:   https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/xfsprogs-dev.git
	Branch:       fscrypt-key-mgmt-improvements

This patchset also passes all the existing encryption tests in xfstests,
including the ciphertext verification tests which verify that there are
no regressions in the crypto for any existing encryption settings.

I've also made proof-of-concept changes to the 'fscrypt' userspace
program (https://github.com/google/fscrypt) to make it support v2
encryption policies.  You can find these changes in git at:

	Repository:   https://github.com/ebiggers/fscrypt.git
	Branch:       fscrypt-key-mgmt-improvements

To make the 'fscrypt' userspace program experimentally use v2 encryption
policies on new encrypted directories, add the following to
/etc/fscrypt.conf within the "options" section:

	"policy_version": "2"

Finally, it's also planned for Android and Chromium OS to switch to the
new ioctls and eventually to v2 encryption policies.  Work-in-progress,
proof-of-concept changes by Satya Tangirala for AOSP can be found at
https://android-review.googlesource.com/q/topic:fscrypt-key-mgmt-improvements

Changes v7 => v8:
    - Replace -EUSERS and -EBUSY statuses for
      FS_IOC_REMOVE_ENCRYPTION_KEY with informational status flags.
    - Replace FSCRYPT_REMOVE_KEY_FLAG_ALL_USERS with a separate ioctl,
      FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS.
    - Improve the documentation.
    - Improve some comments.
    - Rename keysetup_legacy.c => keysetup_v1.c, and split the keyinfo.c
      refactoring into multiple patches to make it easier to review.
    - Avoid checks like 'if (v1 policy) { ... } else { ... }' even when
      the policy version was already validated.  Instead handle v1, v2,
      and default case explicitly.
    - In warning messages that refer to keys in the fs-level keyring,
      say "descriptor" or "identifier" instead of "description".
    - Restore a fscrypt_warn() that was accidentally lost when rebasing.
    - Rebase onto v5.3-rc3.
    - Other small cleanups.

Changes v6 => v7:
    - Rebase onto v5.3-rc1 and the pending fscrypt cleanups.
    - Work around false positive compile-time buffer overflow check in
      copy_from_user() in fscrypt_ioctl_set_policy() when building an
      i386 kernel in a specific config with an old gcc version.
    - A few very minor cleanups.

Changes v5 => v6:
    - Change HKDF to use the specification-defined default salt rather
      than a custom fixed salt, and prepend the string "fscrypt" to
      'info' instead.  This is arguably needed to match how RFC 5869 and
      SP 800-56C are worded.  Both ways are secure in this context, so
      prefer the "boring" way that clearly matches the standards.
    - Rebase onto v5.2-rc1.
    - A few small cleanups.

Changes v4 => v5:
    - Simplify shrink_dcache_inode(), as suggested by Al Viro;
      also move it into fs/crypto/.
    - Fix a build error on some architectures by calling
      copy_from_user() rather than get_user() with a __u64 pointer.

Changes v3 => v4:
    - Introduce fscrypt_sb_free() to avoid an extra #ifdef.
    - Fix UBIFS's ->drop_inode().
    - Add 'version' to union fscrypt_policy and union fscrypt_context.

Changes v2 => v3:
    - Use ->drop_inode() to trigger the inode eviction during/after
      FS_IOC_REMOVE_ENCRYPTION_KEY, as suggested by Dave Chinner.
    - A few small cleanups.

v1 of this patchset was sent in October 2017 with title "fscrypt:
filesystem-level keyring and v2 policy support".  This revived version
follows the same basic design but incorporates numerous improvements,
such as splitting keyinfo.c into multiple files for much better
understandability, and introducing "per-mode" encryption keys to
implement the semantics of the DIRECT_KEY encryption policy flag.

Eric Biggers (20):
  fs, fscrypt: move uapi definitions to new header <linux/fscrypt.h>
  fscrypt: use FSCRYPT_ prefix for uapi constants
  fscrypt: use FSCRYPT_* definitions, not FS_*
  fscrypt: add ->ci_inode to fscrypt_info
  fscrypt: rename fscrypt_master_key to fscrypt_direct_key
  fscrypt: refactor key setup code in preparation for v2 policies
  fscrypt: move v1 policy key setup to keysetup_v1.c
  fscrypt: rename keyinfo.c to keysetup.c
  fscrypt: add FS_IOC_ADD_ENCRYPTION_KEY ioctl
  fscrypt: add FS_IOC_REMOVE_ENCRYPTION_KEY ioctl
  fscrypt: add FS_IOC_GET_ENCRYPTION_KEY_STATUS ioctl
  fscrypt: add an HKDF-SHA512 implementation
  fscrypt: v2 encryption policy support
  fscrypt: allow unprivileged users to add/remove keys for v2 policies
  fscrypt: add FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS ioctl
  fscrypt: require that key be added when setting a v2 encryption policy
  ext4: wire up new fscrypt ioctls
  f2fs: wire up new fscrypt ioctls
  ubifs: wire up new fscrypt ioctls
  fscrypt: document the new ioctls and policy version

 Documentation/filesystems/fscrypt.rst | 755 ++++++++++++++++----
 MAINTAINERS                           |   1 +
 fs/crypto/Kconfig                     |   2 +
 fs/crypto/Makefile                    |  10 +-
 fs/crypto/crypto.c                    |  12 +-
 fs/crypto/fname.c                     |   5 +-
 fs/crypto/fscrypt_private.h           | 389 +++++++++-
 fs/crypto/hkdf.c                      | 181 +++++
 fs/crypto/keyinfo.c                   | 627 ----------------
 fs/crypto/keyring.c                   | 981 ++++++++++++++++++++++++++
 fs/crypto/keysetup.c                  | 591 ++++++++++++++++
 fs/crypto/keysetup_v1.c               | 340 +++++++++
 fs/crypto/policy.c                    | 434 +++++++++---
 fs/ext4/ioctl.c                       |  30 +
 fs/ext4/super.c                       |   3 +
 fs/f2fs/file.c                        |  58 ++
 fs/f2fs/super.c                       |   2 +
 fs/super.c                            |   2 +
 fs/ubifs/ioctl.c                      |  20 +
 fs/ubifs/super.c                      |  11 +
 include/linux/fs.h                    |   1 +
 include/linux/fscrypt.h               |  55 +-
 include/uapi/linux/fs.h               |  54 +-
 include/uapi/linux/fscrypt.h          | 181 +++++
 24 files changed, 3787 insertions(+), 958 deletions(-)
 create mode 100644 fs/crypto/hkdf.c
 delete mode 100644 fs/crypto/keyinfo.c
 create mode 100644 fs/crypto/keyring.c
 create mode 100644 fs/crypto/keysetup.c
 create mode 100644 fs/crypto/keysetup_v1.c
 create mode 100644 include/uapi/linux/fscrypt.h

-- 
2.22.0

^ permalink raw reply

* [PATCH v8 01/20] fs, fscrypt: move uapi definitions to new header <linux/fscrypt.h>
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

More fscrypt definitions are being added, and we shouldn't use a
disproportionate amount of space in <linux/fs.h> for fscrypt stuff.
So move the fscrypt definitions to a new header <linux/fscrypt.h>.

For source compatibility with existing userspace programs, <linux/fs.h>
still includes the new header.

Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 MAINTAINERS                  |  1 +
 include/linux/fscrypt.h      |  1 +
 include/uapi/linux/fs.h      | 54 ++-----------------------------
 include/uapi/linux/fscrypt.h | 61 ++++++++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 51 deletions(-)
 create mode 100644 include/uapi/linux/fscrypt.h

diff --git a/MAINTAINERS b/MAINTAINERS
index a2c343ee3b2ca1..714ffdaaa9204f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6603,6 +6603,7 @@ T:	git git://git.kernel.org/pub/scm/fs/fscrypt/fscrypt.git
 S:	Supported
 F:	fs/crypto/
 F:	include/linux/fscrypt*.h
+F:	include/uapi/linux/fscrypt.h
 F:	Documentation/filesystems/fscrypt.rst
 
 FSI SUBSYSTEM
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index bd8f207a2fb68b..81c0c754f8b21b 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -16,6 +16,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <uapi/linux/fscrypt.h>
 
 #define FS_CRYPTO_BLOCK_SIZE		16
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 59c71fa8c553a3..41bd84d25a9807 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -13,6 +13,9 @@
 #include <linux/limits.h>
 #include <linux/ioctl.h>
 #include <linux/types.h>
+#ifndef __KERNEL__
+#include <linux/fscrypt.h>
+#endif
 
 /* Use of MS_* flags within the kernel is restricted to core mount(2) code. */
 #if !defined(__KERNEL__)
@@ -212,57 +215,6 @@ struct fsxattr {
 #define FS_IOC_GETFSLABEL		_IOR(0x94, 49, char[FSLABEL_MAX])
 #define FS_IOC_SETFSLABEL		_IOW(0x94, 50, char[FSLABEL_MAX])
 
-/*
- * File system encryption support
- */
-/* Policy provided via an ioctl on the topmost directory */
-#define FS_KEY_DESCRIPTOR_SIZE	8
-
-#define FS_POLICY_FLAGS_PAD_4		0x00
-#define FS_POLICY_FLAGS_PAD_8		0x01
-#define FS_POLICY_FLAGS_PAD_16		0x02
-#define FS_POLICY_FLAGS_PAD_32		0x03
-#define FS_POLICY_FLAGS_PAD_MASK	0x03
-#define FS_POLICY_FLAG_DIRECT_KEY	0x04	/* use master key directly */
-#define FS_POLICY_FLAGS_VALID		0x07
-
-/* Encryption algorithms */
-#define FS_ENCRYPTION_MODE_INVALID		0
-#define FS_ENCRYPTION_MODE_AES_256_XTS		1
-#define FS_ENCRYPTION_MODE_AES_256_GCM		2
-#define FS_ENCRYPTION_MODE_AES_256_CBC		3
-#define FS_ENCRYPTION_MODE_AES_256_CTS		4
-#define FS_ENCRYPTION_MODE_AES_128_CBC		5
-#define FS_ENCRYPTION_MODE_AES_128_CTS		6
-#define FS_ENCRYPTION_MODE_SPECK128_256_XTS	7 /* Removed, do not use. */
-#define FS_ENCRYPTION_MODE_SPECK128_256_CTS	8 /* Removed, do not use. */
-#define FS_ENCRYPTION_MODE_ADIANTUM		9
-
-struct fscrypt_policy {
-	__u8 version;
-	__u8 contents_encryption_mode;
-	__u8 filenames_encryption_mode;
-	__u8 flags;
-	__u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
-};
-
-#define FS_IOC_SET_ENCRYPTION_POLICY	_IOR('f', 19, struct fscrypt_policy)
-#define FS_IOC_GET_ENCRYPTION_PWSALT	_IOW('f', 20, __u8[16])
-#define FS_IOC_GET_ENCRYPTION_POLICY	_IOW('f', 21, struct fscrypt_policy)
-
-/* Parameters for passing an encryption key into the kernel keyring */
-#define FS_KEY_DESC_PREFIX		"fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE		8
-
-/* Structure that userspace passes to the kernel keyring */
-#define FS_MAX_KEY_SIZE			64
-
-struct fscrypt_key {
-	__u32 mode;
-	__u8 raw[FS_MAX_KEY_SIZE];
-	__u32 size;
-};
-
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
  *
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
new file mode 100644
index 00000000000000..26f6d2c19afd3f
--- /dev/null
+++ b/include/uapi/linux/fscrypt.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * fscrypt user API
+ *
+ * These ioctls can be used on filesystems that support fscrypt.  See the
+ * "User API" section of Documentation/filesystems/fscrypt.rst.
+ */
+#ifndef _UAPI_LINUX_FSCRYPT_H
+#define _UAPI_LINUX_FSCRYPT_H
+
+#include <linux/types.h>
+
+#define FS_KEY_DESCRIPTOR_SIZE	8
+
+/* Encryption policy flags */
+#define FS_POLICY_FLAGS_PAD_4		0x00
+#define FS_POLICY_FLAGS_PAD_8		0x01
+#define FS_POLICY_FLAGS_PAD_16		0x02
+#define FS_POLICY_FLAGS_PAD_32		0x03
+#define FS_POLICY_FLAGS_PAD_MASK	0x03
+#define FS_POLICY_FLAG_DIRECT_KEY	0x04	/* use master key directly */
+#define FS_POLICY_FLAGS_VALID		0x07
+
+/* Encryption algorithms */
+#define FS_ENCRYPTION_MODE_INVALID		0
+#define FS_ENCRYPTION_MODE_AES_256_XTS		1
+#define FS_ENCRYPTION_MODE_AES_256_GCM		2
+#define FS_ENCRYPTION_MODE_AES_256_CBC		3
+#define FS_ENCRYPTION_MODE_AES_256_CTS		4
+#define FS_ENCRYPTION_MODE_AES_128_CBC		5
+#define FS_ENCRYPTION_MODE_AES_128_CTS		6
+#define FS_ENCRYPTION_MODE_SPECK128_256_XTS	7 /* Removed, do not use. */
+#define FS_ENCRYPTION_MODE_SPECK128_256_CTS	8 /* Removed, do not use. */
+#define FS_ENCRYPTION_MODE_ADIANTUM		9
+
+struct fscrypt_policy {
+	__u8 version;
+	__u8 contents_encryption_mode;
+	__u8 filenames_encryption_mode;
+	__u8 flags;
+	__u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define FS_IOC_SET_ENCRYPTION_POLICY	_IOR('f', 19, struct fscrypt_policy)
+#define FS_IOC_GET_ENCRYPTION_PWSALT	_IOW('f', 20, __u8[16])
+#define FS_IOC_GET_ENCRYPTION_POLICY	_IOW('f', 21, struct fscrypt_policy)
+
+/* Parameters for passing an encryption key into the kernel keyring */
+#define FS_KEY_DESC_PREFIX		"fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE		8
+
+/* Structure that userspace passes to the kernel keyring */
+#define FS_MAX_KEY_SIZE			64
+
+struct fscrypt_key {
+	__u32 mode;
+	__u8 raw[FS_MAX_KEY_SIZE];
+	__u32 size;
+};
+
+#endif /* _UAPI_LINUX_FSCRYPT_H */
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

* [PATCH v8 02/20] fscrypt: use FSCRYPT_ prefix for uapi constants
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Prefix all filesystem encryption UAPI constants except the ioctl numbers
with "FSCRYPT_" rather than with "FS_".  This namespaces the constants
more appropriately and makes it clear that they are related specifically
to the filesystem encryption feature, and to the 'fscrypt_*' structures.
With some of the old names like "FS_POLICY_FLAGS_VALID", it was not
immediately clear that the constant had anything to do with encryption.

This is also useful because we'll be adding more encryption-related
constants, e.g. for the policy version, and we'd otherwise have to
choose whether to use unclear names like FS_POLICY_V1 or inconsistent
names like FS_ENCRYPTION_POLICY_V1.

For source compatibility with existing userspace programs, keep the old
names defined as aliases to the new names.

Finally, as long as new names are being defined anyway, I skipped
defining new names for the fscrypt mode numbers that aren't actually
used: INVALID (0), AES_256_GCM (2), AES_256_CBC (3), SPECK128_256_XTS
(7), and SPECK128_256_CTS (8).

Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 Documentation/filesystems/fscrypt.rst | 40 ++++++++---------
 include/uapi/linux/fscrypt.h          | 65 +++++++++++++++++----------
 2 files changed, 62 insertions(+), 43 deletions(-)

diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 82efa41b0e6c02..d60b885c402401 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -225,9 +225,10 @@ a little endian number, except that:
   is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
   of the file's data encryption key.
 
-- In the "direct key" configuration (FS_POLICY_FLAG_DIRECT_KEY set in
-  the fscrypt_policy), the file's nonce is also appended to the IV.
-  Currently this is only allowed with the Adiantum encryption mode.
+- In the "direct key" configuration (FSCRYPT_POLICY_FLAG_DIRECT_KEY
+  set in the fscrypt_policy), the file's nonce is also appended to the
+  IV.  Currently this is only allowed with the Adiantum encryption
+  mode.
 
 Filenames encryption
 --------------------
@@ -274,14 +275,14 @@ empty directory or verifies that a directory or regular file already
 has the specified encryption policy.  It takes in a pointer to a
 :c:type:`struct fscrypt_policy`, defined as follows::
 
-    #define FS_KEY_DESCRIPTOR_SIZE  8
+    #define FSCRYPT_KEY_DESCRIPTOR_SIZE  8
 
     struct fscrypt_policy {
             __u8 version;
             __u8 contents_encryption_mode;
             __u8 filenames_encryption_mode;
             __u8 flags;
-            __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+            __u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
     };
 
 This structure must be initialized as follows:
@@ -289,19 +290,18 @@ This structure must be initialized as follows:
 - ``version`` must be 0.
 
 - ``contents_encryption_mode`` and ``filenames_encryption_mode`` must
-  be set to constants from ``<linux/fs.h>`` which identify the
-  encryption modes to use.  If unsure, use
-  FS_ENCRYPTION_MODE_AES_256_XTS (1) for ``contents_encryption_mode``
-  and FS_ENCRYPTION_MODE_AES_256_CTS (4) for
-  ``filenames_encryption_mode``.
+  be set to constants from ``<linux/fscrypt.h>`` which identify the
+  encryption modes to use.  If unsure, use FSCRYPT_MODE_AES_256_XTS
+  (1) for ``contents_encryption_mode`` and FSCRYPT_MODE_AES_256_CTS
+  (4) for ``filenames_encryption_mode``.
 
-- ``flags`` must contain a value from ``<linux/fs.h>`` which
+- ``flags`` must contain a value from ``<linux/fscrypt.h>`` which
   identifies the amount of NUL-padding to use when encrypting
-  filenames.  If unsure, use FS_POLICY_FLAGS_PAD_32 (0x3).
-  In addition, if the chosen encryption modes are both
-  FS_ENCRYPTION_MODE_ADIANTUM, this can contain
-  FS_POLICY_FLAG_DIRECT_KEY to specify that the master key should be
-  used directly, without key derivation.
+  filenames.  If unsure, use FSCRYPT_POLICY_FLAGS_PAD_32 (0x3).  In
+  addition, if the chosen encryption modes are both
+  FSCRYPT_MODE_ADIANTUM, this can contain
+  FSCRYPT_POLICY_FLAG_DIRECT_KEY to specify that the master key should
+  be used directly, without key derivation.
 
 - ``master_key_descriptor`` specifies how to find the master key in
   the keyring; see `Adding keys`_.  It is up to userspace to choose a
@@ -401,11 +401,11 @@ followed by the 16-character lower case hex representation of the
 ``master_key_descriptor`` that was set in the encryption policy.  The
 key payload must conform to the following structure::
 
-    #define FS_MAX_KEY_SIZE 64
+    #define FSCRYPT_MAX_KEY_SIZE 64
 
     struct fscrypt_key {
             u32 mode;
-            u8 raw[FS_MAX_KEY_SIZE];
+            u8 raw[FSCRYPT_MAX_KEY_SIZE];
             u32 size;
     };
 
@@ -574,7 +574,7 @@ much confusion if an encryption policy were to be added to or removed
 from anything other than an empty directory.)  The struct is defined
 as follows::
 
-    #define FS_KEY_DESCRIPTOR_SIZE  8
+    #define FSCRYPT_KEY_DESCRIPTOR_SIZE  8
     #define FS_KEY_DERIVATION_NONCE_SIZE 16
 
     struct fscrypt_context {
@@ -582,7 +582,7 @@ as follows::
             u8 contents_encryption_mode;
             u8 filenames_encryption_mode;
             u8 flags;
-            u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+            u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
             u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
     };
 
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
index 26f6d2c19afd3f..674b0452ef5759 100644
--- a/include/uapi/linux/fscrypt.h
+++ b/include/uapi/linux/fscrypt.h
@@ -10,35 +10,30 @@
 
 #include <linux/types.h>
 
-#define FS_KEY_DESCRIPTOR_SIZE	8
+#define FSCRYPT_KEY_DESCRIPTOR_SIZE	8
 
 /* Encryption policy flags */
-#define FS_POLICY_FLAGS_PAD_4		0x00
-#define FS_POLICY_FLAGS_PAD_8		0x01
-#define FS_POLICY_FLAGS_PAD_16		0x02
-#define FS_POLICY_FLAGS_PAD_32		0x03
-#define FS_POLICY_FLAGS_PAD_MASK	0x03
-#define FS_POLICY_FLAG_DIRECT_KEY	0x04	/* use master key directly */
-#define FS_POLICY_FLAGS_VALID		0x07
+#define FSCRYPT_POLICY_FLAGS_PAD_4		0x00
+#define FSCRYPT_POLICY_FLAGS_PAD_8		0x01
+#define FSCRYPT_POLICY_FLAGS_PAD_16		0x02
+#define FSCRYPT_POLICY_FLAGS_PAD_32		0x03
+#define FSCRYPT_POLICY_FLAGS_PAD_MASK		0x03
+#define FSCRYPT_POLICY_FLAG_DIRECT_KEY		0x04	/* use master key directly */
+#define FSCRYPT_POLICY_FLAGS_VALID		0x07
 
 /* Encryption algorithms */
-#define FS_ENCRYPTION_MODE_INVALID		0
-#define FS_ENCRYPTION_MODE_AES_256_XTS		1
-#define FS_ENCRYPTION_MODE_AES_256_GCM		2
-#define FS_ENCRYPTION_MODE_AES_256_CBC		3
-#define FS_ENCRYPTION_MODE_AES_256_CTS		4
-#define FS_ENCRYPTION_MODE_AES_128_CBC		5
-#define FS_ENCRYPTION_MODE_AES_128_CTS		6
-#define FS_ENCRYPTION_MODE_SPECK128_256_XTS	7 /* Removed, do not use. */
-#define FS_ENCRYPTION_MODE_SPECK128_256_CTS	8 /* Removed, do not use. */
-#define FS_ENCRYPTION_MODE_ADIANTUM		9
+#define FSCRYPT_MODE_AES_256_XTS		1
+#define FSCRYPT_MODE_AES_256_CTS		4
+#define FSCRYPT_MODE_AES_128_CBC		5
+#define FSCRYPT_MODE_AES_128_CTS		6
+#define FSCRYPT_MODE_ADIANTUM			9
 
 struct fscrypt_policy {
 	__u8 version;
 	__u8 contents_encryption_mode;
 	__u8 filenames_encryption_mode;
 	__u8 flags;
-	__u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	__u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
 };
 
 #define FS_IOC_SET_ENCRYPTION_POLICY	_IOR('f', 19, struct fscrypt_policy)
@@ -46,16 +41,40 @@ struct fscrypt_policy {
 #define FS_IOC_GET_ENCRYPTION_POLICY	_IOW('f', 21, struct fscrypt_policy)
 
 /* Parameters for passing an encryption key into the kernel keyring */
-#define FS_KEY_DESC_PREFIX		"fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE		8
+#define FSCRYPT_KEY_DESC_PREFIX		"fscrypt:"
+#define FSCRYPT_KEY_DESC_PREFIX_SIZE		8
 
 /* Structure that userspace passes to the kernel keyring */
-#define FS_MAX_KEY_SIZE			64
+#define FSCRYPT_MAX_KEY_SIZE			64
 
 struct fscrypt_key {
 	__u32 mode;
-	__u8 raw[FS_MAX_KEY_SIZE];
+	__u8 raw[FSCRYPT_MAX_KEY_SIZE];
 	__u32 size;
 };
+/**********************************************************************/
+
+/* old names; don't add anything new here! */
+#define FS_KEY_DESCRIPTOR_SIZE		FSCRYPT_KEY_DESCRIPTOR_SIZE
+#define FS_POLICY_FLAGS_PAD_4		FSCRYPT_POLICY_FLAGS_PAD_4
+#define FS_POLICY_FLAGS_PAD_8		FSCRYPT_POLICY_FLAGS_PAD_8
+#define FS_POLICY_FLAGS_PAD_16		FSCRYPT_POLICY_FLAGS_PAD_16
+#define FS_POLICY_FLAGS_PAD_32		FSCRYPT_POLICY_FLAGS_PAD_32
+#define FS_POLICY_FLAGS_PAD_MASK	FSCRYPT_POLICY_FLAGS_PAD_MASK
+#define FS_POLICY_FLAG_DIRECT_KEY	FSCRYPT_POLICY_FLAG_DIRECT_KEY
+#define FS_POLICY_FLAGS_VALID		FSCRYPT_POLICY_FLAGS_VALID
+#define FS_ENCRYPTION_MODE_INVALID	0	/* never used */
+#define FS_ENCRYPTION_MODE_AES_256_XTS	FSCRYPT_MODE_AES_256_XTS
+#define FS_ENCRYPTION_MODE_AES_256_GCM	2	/* never used */
+#define FS_ENCRYPTION_MODE_AES_256_CBC	3	/* never used */
+#define FS_ENCRYPTION_MODE_AES_256_CTS	FSCRYPT_MODE_AES_256_CTS
+#define FS_ENCRYPTION_MODE_AES_128_CBC	FSCRYPT_MODE_AES_128_CBC
+#define FS_ENCRYPTION_MODE_AES_128_CTS	FSCRYPT_MODE_AES_128_CTS
+#define FS_ENCRYPTION_MODE_SPECK128_256_XTS	7	/* removed */
+#define FS_ENCRYPTION_MODE_SPECK128_256_CTS	8	/* removed */
+#define FS_ENCRYPTION_MODE_ADIANTUM	FSCRYPT_MODE_ADIANTUM
+#define FS_KEY_DESC_PREFIX		FSCRYPT_KEY_DESC_PREFIX
+#define FS_KEY_DESC_PREFIX_SIZE		FSCRYPT_KEY_DESC_PREFIX_SIZE
+#define FS_MAX_KEY_SIZE			FSCRYPT_MAX_KEY_SIZE
 
 #endif /* _UAPI_LINUX_FSCRYPT_H */
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

* [PATCH v8 03/20] fscrypt: use FSCRYPT_* definitions, not FS_*
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Update fs/crypto/ to use the new names for the UAPI constants rather
than the old names, then make the old definitions conditional on
!__KERNEL__.

Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/crypto.c           |  2 +-
 fs/crypto/fname.c            |  2 +-
 fs/crypto/fscrypt_private.h  | 16 +++++------
 fs/crypto/keyinfo.c          | 53 ++++++++++++++++++------------------
 fs/crypto/policy.c           | 14 +++++-----
 include/uapi/linux/fscrypt.h |  2 ++
 6 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 3e4624cfe4b54d..7502c1f0ede9e9 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -141,7 +141,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
 	memset(iv, 0, ci->ci_mode->ivsize);
 	iv->lblk_num = cpu_to_le64(lblk_num);
 
-	if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY)
+	if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
 		memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
 
 	if (ci->ci_essiv_tfm != NULL)
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 5cab3bb2d1fc00..f4977d44d69b81 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -182,7 +182,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
 				  u32 max_len, u32 *encrypted_len_ret)
 {
 	int padding = 4 << (inode->i_crypt_info->ci_flags &
-			    FS_POLICY_FLAGS_PAD_MASK);
+			    FSCRYPT_POLICY_FLAGS_PAD_MASK);
 	u32 encrypted_len;
 
 	if (orig_len > max_len)
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 4d715708c6e1f7..fae411b2f78dcb 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -34,7 +34,7 @@ struct fscrypt_context {
 	u8 contents_encryption_mode;
 	u8 filenames_encryption_mode;
 	u8 flags;
-	u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
 	u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
 } __packed;
 
@@ -84,7 +84,7 @@ struct fscrypt_info {
 	u8 ci_data_mode;
 	u8 ci_filename_mode;
 	u8 ci_flags;
-	u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	u8 ci_master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
 	u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
 };
 
@@ -98,16 +98,16 @@ typedef enum {
 static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
 					   u32 filenames_mode)
 {
-	if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
-	    filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
+	if (contents_mode == FSCRYPT_MODE_AES_128_CBC &&
+	    filenames_mode == FSCRYPT_MODE_AES_128_CTS)
 		return true;
 
-	if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
-	    filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
+	if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
+	    filenames_mode == FSCRYPT_MODE_AES_256_CTS)
 		return true;
 
-	if (contents_mode == FS_ENCRYPTION_MODE_ADIANTUM &&
-	    filenames_mode == FS_ENCRYPTION_MODE_ADIANTUM)
+	if (contents_mode == FSCRYPT_MODE_ADIANTUM &&
+	    filenames_mode == FSCRYPT_MODE_ADIANTUM)
 		return true;
 
 	return false;
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 2129943002335c..22345ddede1199 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -20,7 +20,7 @@
 
 static struct crypto_shash *essiv_hash_tfm;
 
-/* Table of keys referenced by FS_POLICY_FLAG_DIRECT_KEY policies */
+/* Table of keys referenced by DIRECT_KEY policies */
 static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */
 static DEFINE_SPINLOCK(fscrypt_master_keys_lock);
 
@@ -77,7 +77,7 @@ static int derive_key_aes(const u8 *master_key,
  */
 static struct key *
 find_and_lock_process_key(const char *prefix,
-			  const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE],
+			  const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE],
 			  unsigned int min_keysize,
 			  const struct fscrypt_key **payload_ret)
 {
@@ -87,7 +87,7 @@ find_and_lock_process_key(const char *prefix,
 	const struct fscrypt_key *payload;
 
 	description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
-				FS_KEY_DESCRIPTOR_SIZE, descriptor);
+				FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
 	if (!description)
 		return ERR_PTR(-ENOMEM);
 
@@ -105,7 +105,7 @@ find_and_lock_process_key(const char *prefix,
 	payload = (const struct fscrypt_key *)ukp->data;
 
 	if (ukp->datalen != sizeof(struct fscrypt_key) ||
-	    payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) {
+	    payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
 		fscrypt_warn(NULL,
 			     "key with description '%s' has invalid payload",
 			     key->description);
@@ -129,32 +129,32 @@ find_and_lock_process_key(const char *prefix,
 }
 
 static struct fscrypt_mode available_modes[] = {
-	[FS_ENCRYPTION_MODE_AES_256_XTS] = {
+	[FSCRYPT_MODE_AES_256_XTS] = {
 		.friendly_name = "AES-256-XTS",
 		.cipher_str = "xts(aes)",
 		.keysize = 64,
 		.ivsize = 16,
 	},
-	[FS_ENCRYPTION_MODE_AES_256_CTS] = {
+	[FSCRYPT_MODE_AES_256_CTS] = {
 		.friendly_name = "AES-256-CTS-CBC",
 		.cipher_str = "cts(cbc(aes))",
 		.keysize = 32,
 		.ivsize = 16,
 	},
-	[FS_ENCRYPTION_MODE_AES_128_CBC] = {
+	[FSCRYPT_MODE_AES_128_CBC] = {
 		.friendly_name = "AES-128-CBC",
 		.cipher_str = "cbc(aes)",
 		.keysize = 16,
 		.ivsize = 16,
 		.needs_essiv = true,
 	},
-	[FS_ENCRYPTION_MODE_AES_128_CTS] = {
+	[FSCRYPT_MODE_AES_128_CTS] = {
 		.friendly_name = "AES-128-CTS-CBC",
 		.cipher_str = "cts(cbc(aes))",
 		.keysize = 16,
 		.ivsize = 16,
 	},
-	[FS_ENCRYPTION_MODE_ADIANTUM] = {
+	[FSCRYPT_MODE_ADIANTUM] = {
 		.friendly_name = "Adiantum",
 		.cipher_str = "adiantum(xchacha12,aes)",
 		.keysize = 32,
@@ -192,7 +192,7 @@ static int find_and_derive_key(const struct inode *inode,
 	const struct fscrypt_key *payload;
 	int err;
 
-	key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
+	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
 					ctx->master_key_descriptor,
 					mode->keysize, &payload);
 	if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
@@ -203,7 +203,7 @@ static int find_and_derive_key(const struct inode *inode,
 	if (IS_ERR(key))
 		return PTR_ERR(key);
 
-	if (ctx->flags & FS_POLICY_FLAG_DIRECT_KEY) {
+	if (ctx->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
 		if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
 			fscrypt_warn(inode,
 				     "Direct key mode not allowed with %s",
@@ -272,14 +272,14 @@ allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
 	return ERR_PTR(err);
 }
 
-/* Master key referenced by FS_POLICY_FLAG_DIRECT_KEY policy */
+/* Master key referenced by DIRECT_KEY policy */
 struct fscrypt_master_key {
 	struct hlist_node mk_node;
 	refcount_t mk_refcount;
 	const struct fscrypt_mode *mk_mode;
 	struct crypto_skcipher *mk_ctfm;
-	u8 mk_descriptor[FS_KEY_DESCRIPTOR_SIZE];
-	u8 mk_raw[FS_MAX_KEY_SIZE];
+	u8 mk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
+	u8 mk_raw[FSCRYPT_MAX_KEY_SIZE];
 };
 
 static void free_master_key(struct fscrypt_master_key *mk)
@@ -320,13 +320,13 @@ find_or_insert_master_key(struct fscrypt_master_key *to_insert,
 	 * raw key, and use crypto_memneq() when comparing raw keys.
 	 */
 
-	BUILD_BUG_ON(sizeof(hash_key) > FS_KEY_DESCRIPTOR_SIZE);
+	BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE);
 	memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
 
 	spin_lock(&fscrypt_master_keys_lock);
 	hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) {
 		if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor,
-			   FS_KEY_DESCRIPTOR_SIZE) != 0)
+			   FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
 			continue;
 		if (mode != mk->mk_mode)
 			continue;
@@ -370,7 +370,7 @@ fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
 		goto err_free_mk;
 	}
 	memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor,
-	       FS_KEY_DESCRIPTOR_SIZE);
+	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
 	memcpy(mk->mk_raw, raw_key, mode->keysize);
 
 	return find_or_insert_master_key(mk, raw_key, mode, ci);
@@ -448,8 +448,8 @@ static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
 
 /*
  * Given the encryption mode and key (normally the derived key, but for
- * FS_POLICY_FLAG_DIRECT_KEY mode it's the master key), set up the inode's
- * symmetric cipher transform object(s).
+ * DIRECT_KEY mode it's the master key), set up the inode's symmetric cipher
+ * transform object(s).
  */
 static int setup_crypto_transform(struct fscrypt_info *ci,
 				  struct fscrypt_mode *mode,
@@ -459,7 +459,7 @@ static int setup_crypto_transform(struct fscrypt_info *ci,
 	struct crypto_skcipher *ctfm;
 	int err;
 
-	if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) {
+	if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
 		mk = fscrypt_get_master_key(ci, mode, raw_key, inode);
 		if (IS_ERR(mk))
 			return PTR_ERR(mk);
@@ -476,7 +476,7 @@ static int setup_crypto_transform(struct fscrypt_info *ci,
 	if (mode->needs_essiv) {
 		/* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
 		WARN_ON(mode->ivsize != AES_BLOCK_SIZE);
-		WARN_ON(ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY);
+		WARN_ON(ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY);
 
 		err = init_essiv_generator(ci, raw_key, mode->keysize);
 		if (err) {
@@ -530,9 +530,10 @@ int fscrypt_get_encryption_info(struct inode *inode)
 		/* Fake up a context for an unencrypted directory */
 		memset(&ctx, 0, sizeof(ctx));
 		ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-		ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
-		ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
-		memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
+		ctx.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+		ctx.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+		memset(ctx.master_key_descriptor, 0x42,
+		       FSCRYPT_KEY_DESCRIPTOR_SIZE);
 	} else if (res != sizeof(ctx)) {
 		fscrypt_warn(inode,
 			     "Unknown encryption context size (%d bytes)", res);
@@ -545,7 +546,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
 		return -EINVAL;
 	}
 
-	if (ctx.flags & ~FS_POLICY_FLAGS_VALID) {
+	if (ctx.flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
 		fscrypt_warn(inode, "Unknown encryption context flags (0x%02x)",
 			     ctx.flags);
 		return -EINVAL;
@@ -559,7 +560,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
 	crypt_info->ci_data_mode = ctx.contents_encryption_mode;
 	crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
 	memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor,
-	       FS_KEY_DESCRIPTOR_SIZE);
+	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
 	memcpy(crypt_info->ci_nonce, ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
 
 	mode = select_encryption_mode(crypt_info, inode);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 4941fe8471ceff..da7ae9c8b4ad0b 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -22,7 +22,7 @@ static bool is_encryption_context_consistent_with_policy(
 				const struct fscrypt_policy *policy)
 {
 	return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
-		      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+		      FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
 		(ctx->flags == policy->flags) &&
 		(ctx->contents_encryption_mode ==
 		 policy->contents_encryption_mode) &&
@@ -37,13 +37,13 @@ static int create_encryption_context_from_policy(struct inode *inode,
 
 	ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
 	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
-					FS_KEY_DESCRIPTOR_SIZE);
+					FSCRYPT_KEY_DESCRIPTOR_SIZE);
 
 	if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
 				     policy->filenames_encryption_mode))
 		return -EINVAL;
 
-	if (policy->flags & ~FS_POLICY_FLAGS_VALID)
+	if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID)
 		return -EINVAL;
 
 	ctx.contents_encryption_mode = policy->contents_encryption_mode;
@@ -128,7 +128,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
 	policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
 	policy.flags = ctx.flags;
 	memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
-				FS_KEY_DESCRIPTOR_SIZE);
+				FSCRYPT_KEY_DESCRIPTOR_SIZE);
 
 	if (copy_to_user(arg, &policy, sizeof(policy)))
 		return -EFAULT;
@@ -202,7 +202,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 	if (parent_ci && child_ci) {
 		return memcmp(parent_ci->ci_master_key_descriptor,
 			      child_ci->ci_master_key_descriptor,
-			      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+			      FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
 			(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
 			(parent_ci->ci_filename_mode ==
 			 child_ci->ci_filename_mode) &&
@@ -219,7 +219,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 
 	return memcmp(parent_ctx.master_key_descriptor,
 		      child_ctx.master_key_descriptor,
-		      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+		      FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
 		(parent_ctx.contents_encryption_mode ==
 		 child_ctx.contents_encryption_mode) &&
 		(parent_ctx.filenames_encryption_mode ==
@@ -257,7 +257,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
 	ctx.filenames_encryption_mode = ci->ci_filename_mode;
 	ctx.flags = ci->ci_flags;
 	memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor,
-	       FS_KEY_DESCRIPTOR_SIZE);
+	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
 	get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
 	BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
 	res = parent->i_sb->s_cop->set_context(child, &ctx,
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
index 674b0452ef5759..29a945d165def3 100644
--- a/include/uapi/linux/fscrypt.h
+++ b/include/uapi/linux/fscrypt.h
@@ -55,6 +55,7 @@ struct fscrypt_key {
 /**********************************************************************/
 
 /* old names; don't add anything new here! */
+#ifndef __KERNEL__
 #define FS_KEY_DESCRIPTOR_SIZE		FSCRYPT_KEY_DESCRIPTOR_SIZE
 #define FS_POLICY_FLAGS_PAD_4		FSCRYPT_POLICY_FLAGS_PAD_4
 #define FS_POLICY_FLAGS_PAD_8		FSCRYPT_POLICY_FLAGS_PAD_8
@@ -76,5 +77,6 @@ struct fscrypt_key {
 #define FS_KEY_DESC_PREFIX		FSCRYPT_KEY_DESC_PREFIX
 #define FS_KEY_DESC_PREFIX_SIZE		FSCRYPT_KEY_DESC_PREFIX_SIZE
 #define FS_MAX_KEY_SIZE			FSCRYPT_MAX_KEY_SIZE
+#endif /* !__KERNEL__ */
 
 #endif /* _UAPI_LINUX_FSCRYPT_H */
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

* [PATCH v8 04/20] fscrypt: add ->ci_inode to fscrypt_info
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add an inode back-pointer to 'struct fscrypt_info', such that
inode->i_crypt_info->ci_inode == inode.

This will be useful for:

1. Evicting the inodes when a fscrypt key is removed, since we'll track
   the inodes using a given key by linking their fscrypt_infos together,
   rather than the inodes directly.  This avoids bloating 'struct inode'
   with a new list_head.

2. Simplifying the per-file key setup, since the inode pointer won't
   have to be passed around everywhere just in case something goes wrong
   and it's needed for fscrypt_warn().

Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/fscrypt_private.h | 3 +++
 fs/crypto/keyinfo.c         | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index fae411b2f78dcb..d345a7d28df8c2 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -73,6 +73,9 @@ struct fscrypt_info {
 	 */
 	struct fscrypt_mode *ci_mode;
 
+	/* Back-pointer to the inode */
+	struct inode *ci_inode;
+
 	/*
 	 * If non-NULL, then this inode uses a master key directly rather than a
 	 * derived key, and ci_ctfm will equal ci_master_key->mk_ctfm.
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 22345ddede1199..2d45a86f09db25 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -556,6 +556,8 @@ int fscrypt_get_encryption_info(struct inode *inode)
 	if (!crypt_info)
 		return -ENOMEM;
 
+	crypt_info->ci_inode = inode;
+
 	crypt_info->ci_flags = ctx.flags;
 	crypt_info->ci_data_mode = ctx.contents_encryption_mode;
 	crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

* [PATCH v8 05/20] fscrypt: rename fscrypt_master_key to fscrypt_direct_key
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

In preparation for introducing a filesystem-level keyring which will
contain fscrypt master keys, rename the existing 'struct
fscrypt_master_key' to 'struct fscrypt_direct_key'.  This is the
structure in the existing table of master keys that's maintained to
deduplicate the crypto transforms for v1 DIRECT_KEY policies.

I've chosen to keep this table as-is rather than make it automagically
add/remove the keys to/from the filesystem-level keyring, since that
would add a lot of extra complexity to the filesystem-level keyring.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/fscrypt_private.h |   7 +-
 fs/crypto/keyinfo.c         | 130 ++++++++++++++++++------------------
 2 files changed, 68 insertions(+), 69 deletions(-)

diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index d345a7d28df8c2..80d15a1bf60685 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -77,11 +77,10 @@ struct fscrypt_info {
 	struct inode *ci_inode;
 
 	/*
-	 * If non-NULL, then this inode uses a master key directly rather than a
-	 * derived key, and ci_ctfm will equal ci_master_key->mk_ctfm.
-	 * Otherwise, this inode uses a derived key.
+	 * If non-NULL, then encryption is done using the master key directly
+	 * and ci_ctfm will equal ci_direct_key->dk_ctfm.
 	 */
-	struct fscrypt_master_key *ci_master_key;
+	struct fscrypt_direct_key *ci_direct_key;
 
 	/* fields from the fscrypt_context */
 	u8 ci_data_mode;
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 2d45a86f09db25..c4650071df2772 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -21,8 +21,8 @@
 static struct crypto_shash *essiv_hash_tfm;
 
 /* Table of keys referenced by DIRECT_KEY policies */
-static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */
-static DEFINE_SPINLOCK(fscrypt_master_keys_lock);
+static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */
+static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);
 
 /*
  * Key derivation function.  This generates the derived key by encrypting the
@@ -273,46 +273,46 @@ allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
 }
 
 /* Master key referenced by DIRECT_KEY policy */
-struct fscrypt_master_key {
-	struct hlist_node mk_node;
-	refcount_t mk_refcount;
-	const struct fscrypt_mode *mk_mode;
-	struct crypto_skcipher *mk_ctfm;
-	u8 mk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
-	u8 mk_raw[FSCRYPT_MAX_KEY_SIZE];
+struct fscrypt_direct_key {
+	struct hlist_node		dk_node;
+	refcount_t			dk_refcount;
+	const struct fscrypt_mode	*dk_mode;
+	struct crypto_skcipher		*dk_ctfm;
+	u8				dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
+	u8				dk_raw[FSCRYPT_MAX_KEY_SIZE];
 };
 
-static void free_master_key(struct fscrypt_master_key *mk)
+static void free_direct_key(struct fscrypt_direct_key *dk)
 {
-	if (mk) {
-		crypto_free_skcipher(mk->mk_ctfm);
-		kzfree(mk);
+	if (dk) {
+		crypto_free_skcipher(dk->dk_ctfm);
+		kzfree(dk);
 	}
 }
 
-static void put_master_key(struct fscrypt_master_key *mk)
+static void put_direct_key(struct fscrypt_direct_key *dk)
 {
-	if (!refcount_dec_and_lock(&mk->mk_refcount, &fscrypt_master_keys_lock))
+	if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock))
 		return;
-	hash_del(&mk->mk_node);
-	spin_unlock(&fscrypt_master_keys_lock);
+	hash_del(&dk->dk_node);
+	spin_unlock(&fscrypt_direct_keys_lock);
 
-	free_master_key(mk);
+	free_direct_key(dk);
 }
 
 /*
- * Find/insert the given master key into the fscrypt_master_keys table.  If
- * found, it is returned with elevated refcount, and 'to_insert' is freed if
- * non-NULL.  If not found, 'to_insert' is inserted and returned if it's
- * non-NULL; otherwise NULL is returned.
+ * Find/insert the given key into the fscrypt_direct_keys table.  If found, it
+ * is returned with elevated refcount, and 'to_insert' is freed if non-NULL.  If
+ * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise
+ * NULL is returned.
  */
-static struct fscrypt_master_key *
-find_or_insert_master_key(struct fscrypt_master_key *to_insert,
+static struct fscrypt_direct_key *
+find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
 			  const u8 *raw_key, const struct fscrypt_mode *mode,
 			  const struct fscrypt_info *ci)
 {
 	unsigned long hash_key;
-	struct fscrypt_master_key *mk;
+	struct fscrypt_direct_key *dk;
 
 	/*
 	 * Careful: to avoid potentially leaking secret key bytes via timing
@@ -323,60 +323,60 @@ find_or_insert_master_key(struct fscrypt_master_key *to_insert,
 	BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE);
 	memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
 
-	spin_lock(&fscrypt_master_keys_lock);
-	hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) {
-		if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor,
+	spin_lock(&fscrypt_direct_keys_lock);
+	hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) {
+		if (memcmp(ci->ci_master_key_descriptor, dk->dk_descriptor,
 			   FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
 			continue;
-		if (mode != mk->mk_mode)
+		if (mode != dk->dk_mode)
 			continue;
-		if (crypto_memneq(raw_key, mk->mk_raw, mode->keysize))
+		if (crypto_memneq(raw_key, dk->dk_raw, mode->keysize))
 			continue;
 		/* using existing tfm with same (descriptor, mode, raw_key) */
-		refcount_inc(&mk->mk_refcount);
-		spin_unlock(&fscrypt_master_keys_lock);
-		free_master_key(to_insert);
-		return mk;
+		refcount_inc(&dk->dk_refcount);
+		spin_unlock(&fscrypt_direct_keys_lock);
+		free_direct_key(to_insert);
+		return dk;
 	}
 	if (to_insert)
-		hash_add(fscrypt_master_keys, &to_insert->mk_node, hash_key);
-	spin_unlock(&fscrypt_master_keys_lock);
+		hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key);
+	spin_unlock(&fscrypt_direct_keys_lock);
 	return to_insert;
 }
 
 /* Prepare to encrypt directly using the master key in the given mode */
-static struct fscrypt_master_key *
-fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
+static struct fscrypt_direct_key *
+fscrypt_get_direct_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
 		       const u8 *raw_key, const struct inode *inode)
 {
-	struct fscrypt_master_key *mk;
+	struct fscrypt_direct_key *dk;
 	int err;
 
 	/* Is there already a tfm for this key? */
-	mk = find_or_insert_master_key(NULL, raw_key, mode, ci);
-	if (mk)
-		return mk;
+	dk = find_or_insert_direct_key(NULL, raw_key, mode, ci);
+	if (dk)
+		return dk;
 
 	/* Nope, allocate one. */
-	mk = kzalloc(sizeof(*mk), GFP_NOFS);
-	if (!mk)
+	dk = kzalloc(sizeof(*dk), GFP_NOFS);
+	if (!dk)
 		return ERR_PTR(-ENOMEM);
-	refcount_set(&mk->mk_refcount, 1);
-	mk->mk_mode = mode;
-	mk->mk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
-	if (IS_ERR(mk->mk_ctfm)) {
-		err = PTR_ERR(mk->mk_ctfm);
-		mk->mk_ctfm = NULL;
-		goto err_free_mk;
+	refcount_set(&dk->dk_refcount, 1);
+	dk->dk_mode = mode;
+	dk->dk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
+	if (IS_ERR(dk->dk_ctfm)) {
+		err = PTR_ERR(dk->dk_ctfm);
+		dk->dk_ctfm = NULL;
+		goto err_free_dk;
 	}
-	memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor,
+	memcpy(dk->dk_descriptor, ci->ci_master_key_descriptor,
 	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
-	memcpy(mk->mk_raw, raw_key, mode->keysize);
+	memcpy(dk->dk_raw, raw_key, mode->keysize);
 
-	return find_or_insert_master_key(mk, raw_key, mode, ci);
+	return find_or_insert_direct_key(dk, raw_key, mode, ci);
 
-err_free_mk:
-	free_master_key(mk);
+err_free_dk:
+	free_direct_key(dk);
 	return ERR_PTR(err);
 }
 
@@ -455,22 +455,22 @@ static int setup_crypto_transform(struct fscrypt_info *ci,
 				  struct fscrypt_mode *mode,
 				  const u8 *raw_key, const struct inode *inode)
 {
-	struct fscrypt_master_key *mk;
+	struct fscrypt_direct_key *dk;
 	struct crypto_skcipher *ctfm;
 	int err;
 
 	if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
-		mk = fscrypt_get_master_key(ci, mode, raw_key, inode);
-		if (IS_ERR(mk))
-			return PTR_ERR(mk);
-		ctfm = mk->mk_ctfm;
+		dk = fscrypt_get_direct_key(ci, mode, raw_key, inode);
+		if (IS_ERR(dk))
+			return PTR_ERR(dk);
+		ctfm = dk->dk_ctfm;
 	} else {
-		mk = NULL;
+		dk = NULL;
 		ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
 		if (IS_ERR(ctfm))
 			return PTR_ERR(ctfm);
 	}
-	ci->ci_master_key = mk;
+	ci->ci_direct_key = dk;
 	ci->ci_ctfm = ctfm;
 
 	if (mode->needs_essiv) {
@@ -494,8 +494,8 @@ static void put_crypt_info(struct fscrypt_info *ci)
 	if (!ci)
 		return;
 
-	if (ci->ci_master_key) {
-		put_master_key(ci->ci_master_key);
+	if (ci->ci_direct_key) {
+		put_direct_key(ci->ci_direct_key);
 	} else {
 		crypto_free_skcipher(ci->ci_ctfm);
 		crypto_free_cipher(ci->ci_essiv_tfm);
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

* [PATCH v8 06/20] fscrypt: refactor key setup code in preparation for v2 policies
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Do some more refactoring of the key setup code, in preparation for
introducing a filesystem-level keyring and v2 encryption policies:

- Now that ci_inode exists, don't pass around the inode unnecessarily.

- Define a function setup_file_encryption_key() which handles the crypto
  key setup given an under-construction fscrypt_info.  Don't pass the
  fscrypt_context, since everything is in the fscrypt_info.
  [This will be extended for v2 policies and the fs-level keyring.]

- Define a function fscrypt_set_derived_key() which sets the per-file
  key, without depending on anything specific to v1 policies.
  [This will also be used for v2 policies.]

- Define a function fscrypt_setup_v1_file_key() which takes the raw
  master key, thus separating finding the key from using it.
  [This will also be used if the key is found in the fs-level keyring.]

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/fscrypt_private.h |  11 +-
 fs/crypto/keyinfo.c         | 247 ++++++++++++++++++++----------------
 2 files changed, 146 insertions(+), 112 deletions(-)

diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 80d15a1bf60685..56bac5c7ef408a 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -4,9 +4,8 @@
  *
  * Copyright (C) 2015, Google, Inc.
  *
- * This contains encryption key functions.
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
+ * Heavily modified since then.
  */
 
 #ifndef _FSCRYPT_PRIVATE_H
@@ -168,4 +167,10 @@ struct fscrypt_mode {
 	bool needs_essiv;
 };
 
+static inline bool
+fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode)
+{
+	return mode->ivsize >= offsetofend(union fscrypt_iv, nonce);
+}
+
 #endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index c4650071df2772..c6bf44d6411189 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -1,12 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * key management facility for FS encryption support.
+ * Key setup facility for FS encryption support.
  *
  * Copyright (C) 2015, Google, Inc.
  *
- * This contains encryption key functions.
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
+ * Heavily modified since then.
  */
 
 #include <keys/user-type.h>
@@ -25,14 +24,19 @@ static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */
 static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);
 
 /*
- * Key derivation function.  This generates the derived key by encrypting the
- * master key with AES-128-ECB using the inode's nonce as the AES key.
+ * v1 key derivation function.  This generates the derived key by encrypting the
+ * master key with AES-128-ECB using the nonce as the AES key.  This provides a
+ * unique derived key with sufficient entropy for each inode.  However, it's
+ * nonstandard, non-extensible, doesn't evenly distribute the entropy from the
+ * master key, and is trivially reversible: an attacker who compromises a
+ * derived key can "decrypt" it to get back to the master key, then derive any
+ * other key.  For all new code, use HKDF instead.
  *
  * The master key must be at least as long as the derived key.  If the master
  * key is longer, then only the first 'derived_keysize' bytes are used.
  */
 static int derive_key_aes(const u8 *master_key,
-			  const struct fscrypt_context *ctx,
+			  const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE],
 			  u8 *derived_key, unsigned int derived_keysize)
 {
 	int res = 0;
@@ -55,7 +59,7 @@ static int derive_key_aes(const u8 *master_key,
 	skcipher_request_set_callback(req,
 			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 			crypto_req_done, &wait);
-	res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce));
+	res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE);
 	if (res < 0)
 		goto out;
 
@@ -183,54 +187,10 @@ select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
 	return ERR_PTR(-EINVAL);
 }
 
-/* Find the master key, then derive the inode's actual encryption key */
-static int find_and_derive_key(const struct inode *inode,
-			       const struct fscrypt_context *ctx,
-			       u8 *derived_key, const struct fscrypt_mode *mode)
-{
-	struct key *key;
-	const struct fscrypt_key *payload;
-	int err;
-
-	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
-					ctx->master_key_descriptor,
-					mode->keysize, &payload);
-	if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
-		key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
-						ctx->master_key_descriptor,
-						mode->keysize, &payload);
-	}
-	if (IS_ERR(key))
-		return PTR_ERR(key);
-
-	if (ctx->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
-		if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
-			fscrypt_warn(inode,
-				     "Direct key mode not allowed with %s",
-				     mode->friendly_name);
-			err = -EINVAL;
-		} else if (ctx->contents_encryption_mode !=
-			   ctx->filenames_encryption_mode) {
-			fscrypt_warn(inode,
-				     "Direct key mode not allowed with different contents and filenames modes");
-			err = -EINVAL;
-		} else {
-			memcpy(derived_key, payload->raw, mode->keysize);
-			err = 0;
-		}
-	} else {
-		err = derive_key_aes(payload->raw, ctx, derived_key,
-				     mode->keysize);
-	}
-	up_read(&key->sem);
-	key_put(key);
-	return err;
-}
-
-/* Allocate and key a symmetric cipher object for the given encryption mode */
+/* Create a symmetric cipher object for the given encryption mode and key */
 static struct crypto_skcipher *
-allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
-			   const struct inode *inode)
+fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
+			  const struct inode *inode)
 {
 	struct crypto_skcipher *tfm;
 	int err;
@@ -308,8 +268,7 @@ static void put_direct_key(struct fscrypt_direct_key *dk)
  */
 static struct fscrypt_direct_key *
 find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
-			  const u8 *raw_key, const struct fscrypt_mode *mode,
-			  const struct fscrypt_info *ci)
+			  const u8 *raw_key, const struct fscrypt_info *ci)
 {
 	unsigned long hash_key;
 	struct fscrypt_direct_key *dk;
@@ -328,9 +287,9 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
 		if (memcmp(ci->ci_master_key_descriptor, dk->dk_descriptor,
 			   FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
 			continue;
-		if (mode != dk->dk_mode)
+		if (ci->ci_mode != dk->dk_mode)
 			continue;
-		if (crypto_memneq(raw_key, dk->dk_raw, mode->keysize))
+		if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize))
 			continue;
 		/* using existing tfm with same (descriptor, mode, raw_key) */
 		refcount_inc(&dk->dk_refcount);
@@ -346,14 +305,13 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
 
 /* Prepare to encrypt directly using the master key in the given mode */
 static struct fscrypt_direct_key *
-fscrypt_get_direct_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
-		       const u8 *raw_key, const struct inode *inode)
+fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
 {
 	struct fscrypt_direct_key *dk;
 	int err;
 
 	/* Is there already a tfm for this key? */
-	dk = find_or_insert_direct_key(NULL, raw_key, mode, ci);
+	dk = find_or_insert_direct_key(NULL, raw_key, ci);
 	if (dk)
 		return dk;
 
@@ -362,8 +320,9 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
 	if (!dk)
 		return ERR_PTR(-ENOMEM);
 	refcount_set(&dk->dk_refcount, 1);
-	dk->dk_mode = mode;
-	dk->dk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
+	dk->dk_mode = ci->ci_mode;
+	dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key,
+						ci->ci_inode);
 	if (IS_ERR(dk->dk_ctfm)) {
 		err = PTR_ERR(dk->dk_ctfm);
 		dk->dk_ctfm = NULL;
@@ -371,9 +330,9 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
 	}
 	memcpy(dk->dk_descriptor, ci->ci_master_key_descriptor,
 	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
-	memcpy(dk->dk_raw, raw_key, mode->keysize);
+	memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize);
 
-	return find_or_insert_direct_key(dk, raw_key, mode, ci);
+	return find_or_insert_direct_key(dk, raw_key, ci);
 
 err_free_dk:
 	free_direct_key(dk);
@@ -422,6 +381,9 @@ static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
 	struct crypto_cipher *essiv_tfm;
 	u8 salt[SHA256_DIGEST_SIZE];
 
+	if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE))
+		return -EINVAL;
+
 	essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(essiv_tfm))
 		return PTR_ERR(essiv_tfm);
@@ -446,41 +408,24 @@ static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
 	return err;
 }
 
-/*
- * Given the encryption mode and key (normally the derived key, but for
- * DIRECT_KEY mode it's the master key), set up the inode's symmetric cipher
- * transform object(s).
- */
-static int setup_crypto_transform(struct fscrypt_info *ci,
-				  struct fscrypt_mode *mode,
-				  const u8 *raw_key, const struct inode *inode)
+/* Given the per-file key, set up the file's crypto transform object(s) */
+static int fscrypt_set_derived_key(struct fscrypt_info *ci,
+				   const u8 *derived_key)
 {
-	struct fscrypt_direct_key *dk;
+	struct fscrypt_mode *mode = ci->ci_mode;
 	struct crypto_skcipher *ctfm;
 	int err;
 
-	if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
-		dk = fscrypt_get_direct_key(ci, mode, raw_key, inode);
-		if (IS_ERR(dk))
-			return PTR_ERR(dk);
-		ctfm = dk->dk_ctfm;
-	} else {
-		dk = NULL;
-		ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
-		if (IS_ERR(ctfm))
-			return PTR_ERR(ctfm);
-	}
-	ci->ci_direct_key = dk;
+	ctfm = fscrypt_allocate_skcipher(mode, derived_key, ci->ci_inode);
+	if (IS_ERR(ctfm))
+		return PTR_ERR(ctfm);
+
 	ci->ci_ctfm = ctfm;
 
 	if (mode->needs_essiv) {
-		/* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
-		WARN_ON(mode->ivsize != AES_BLOCK_SIZE);
-		WARN_ON(ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY);
-
-		err = init_essiv_generator(ci, raw_key, mode->keysize);
+		err = init_essiv_generator(ci, derived_key, mode->keysize);
 		if (err) {
-			fscrypt_warn(inode,
+			fscrypt_warn(ci->ci_inode,
 				     "Error initializing ESSIV generator: %d",
 				     err);
 			return err;
@@ -489,6 +434,105 @@ static int setup_crypto_transform(struct fscrypt_info *ci,
 	return 0;
 }
 
+/* v1 policy, DIRECT_KEY: use the master key directly */
+static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+				    const u8 *raw_master_key)
+{
+	const struct fscrypt_mode *mode = ci->ci_mode;
+	struct fscrypt_direct_key *dk;
+
+	if (!fscrypt_mode_supports_direct_key(mode)) {
+		fscrypt_warn(ci->ci_inode,
+			     "Direct key mode not allowed with %s",
+			     mode->friendly_name);
+		return -EINVAL;
+	}
+
+	if (ci->ci_data_mode != ci->ci_filename_mode) {
+		fscrypt_warn(ci->ci_inode,
+			     "Direct key mode not allowed with different contents and filenames modes");
+		return -EINVAL;
+	}
+
+	/* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
+	if (WARN_ON(mode->needs_essiv))
+		return -EINVAL;
+
+	dk = fscrypt_get_direct_key(ci, raw_master_key);
+	if (IS_ERR(dk))
+		return PTR_ERR(dk);
+	ci->ci_direct_key = dk;
+	ci->ci_ctfm = dk->dk_ctfm;
+	return 0;
+}
+
+/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
+static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+				     const u8 *raw_master_key)
+{
+	u8 *derived_key;
+	int err;
+
+	/*
+	 * This cannot be a stack buffer because it will be passed to the
+	 * scatterlist crypto API during derive_key_aes().
+	 */
+	derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
+	if (!derived_key)
+		return -ENOMEM;
+
+	err = derive_key_aes(raw_master_key, ci->ci_nonce,
+			     derived_key, ci->ci_mode->keysize);
+	if (err)
+		goto out;
+
+	err = fscrypt_set_derived_key(ci, derived_key);
+out:
+	kzfree(derived_key);
+	return err;
+}
+
+static int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+				     const u8 *raw_master_key)
+{
+	if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
+		return setup_v1_file_key_direct(ci, raw_master_key);
+	else
+		return setup_v1_file_key_derived(ci, raw_master_key);
+}
+
+static int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+						struct fscrypt_info *ci)
+{
+	struct key *key;
+	const struct fscrypt_key *payload;
+	int err;
+
+	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
+					ci->ci_master_key_descriptor,
+					ci->ci_mode->keysize, &payload);
+	if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
+		key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+						ci->ci_master_key_descriptor,
+						ci->ci_mode->keysize, &payload);
+	}
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	err = fscrypt_setup_v1_file_key(ci, payload->raw);
+	up_read(&key->sem);
+	key_put(key);
+	return err;
+}
+
+/*
+ * Find the master key, then set up the inode's actual encryption key.
+ */
+static int setup_file_encryption_key(struct fscrypt_info *ci)
+{
+	return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
+}
+
 static void put_crypt_info(struct fscrypt_info *ci)
 {
 	if (!ci)
@@ -508,7 +552,6 @@ int fscrypt_get_encryption_info(struct inode *inode)
 	struct fscrypt_info *crypt_info;
 	struct fscrypt_context ctx;
 	struct fscrypt_mode *mode;
-	u8 *raw_key = NULL;
 	int res;
 
 	if (fscrypt_has_encryption_key(inode))
@@ -573,20 +616,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
 	WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
 	crypt_info->ci_mode = mode;
 
-	/*
-	 * This cannot be a stack buffer because it may be passed to the
-	 * scatterlist crypto API as part of key derivation.
-	 */
-	res = -ENOMEM;
-	raw_key = kmalloc(mode->keysize, GFP_NOFS);
-	if (!raw_key)
-		goto out;
-
-	res = find_and_derive_key(inode, &ctx, raw_key, mode);
-	if (res)
-		goto out;
-
-	res = setup_crypto_transform(crypt_info, mode, raw_key, inode);
+	res = setup_file_encryption_key(crypt_info);
 	if (res)
 		goto out;
 
@@ -596,7 +626,6 @@ int fscrypt_get_encryption_info(struct inode *inode)
 	if (res == -ENOKEY)
 		res = 0;
 	put_crypt_info(crypt_info);
-	kzfree(raw_key);
 	return res;
 }
 EXPORT_SYMBOL(fscrypt_get_encryption_info);
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

* [PATCH v8 07/20] fscrypt: move v1 policy key setup to keysetup_v1.c
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

In preparation for introducing v2 encryption policies which will find
and derive encryption keys differently from the current v1 encryption
policies, move the v1 policy-specific key setup code from keyinfo.c into
keysetup_v1.c.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/Makefile          |   8 +-
 fs/crypto/fscrypt_private.h |  17 ++
 fs/crypto/keyinfo.c         | 328 +---------------------------------
 fs/crypto/keysetup_v1.c     | 338 ++++++++++++++++++++++++++++++++++++
 4 files changed, 369 insertions(+), 322 deletions(-)
 create mode 100644 fs/crypto/keysetup_v1.c

diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index 4f0df5e682e49f..1fba255c34ca56 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -1,5 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_FS_ENCRYPTION)	+= fscrypto.o
 
-fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o
+fscrypto-y := crypto.o \
+	      fname.o \
+	      hooks.o \
+	      keyinfo.o \
+	      keysetup_v1.o \
+	      policy.o
+
 fscrypto-$(CONFIG_BLOCK) += bio.o
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 56bac5c7ef408a..387b44b255f6ab 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -173,4 +173,21 @@ fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode)
 	return mode->ivsize >= offsetofend(union fscrypt_iv, nonce);
 }
 
+extern struct crypto_skcipher *
+fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
+			  const struct inode *inode);
+
+extern int fscrypt_set_derived_key(struct fscrypt_info *ci,
+				   const u8 *derived_key);
+
+/* keysetup_v1.c */
+
+extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
+
+extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+				     const u8 *raw_master_key);
+
+extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+					struct fscrypt_info *ci);
+
 #endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index c6bf44d6411189..f4a47448e9efa1 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -8,130 +8,15 @@
  * Heavily modified since then.
  */
 
-#include <keys/user-type.h>
-#include <linux/hashtable.h>
-#include <linux/scatterlist.h>
 #include <crypto/aes.h>
-#include <crypto/algapi.h>
 #include <crypto/sha.h>
 #include <crypto/skcipher.h>
+#include <linux/key.h>
+
 #include "fscrypt_private.h"
 
 static struct crypto_shash *essiv_hash_tfm;
 
-/* Table of keys referenced by DIRECT_KEY policies */
-static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */
-static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);
-
-/*
- * v1 key derivation function.  This generates the derived key by encrypting the
- * master key with AES-128-ECB using the nonce as the AES key.  This provides a
- * unique derived key with sufficient entropy for each inode.  However, it's
- * nonstandard, non-extensible, doesn't evenly distribute the entropy from the
- * master key, and is trivially reversible: an attacker who compromises a
- * derived key can "decrypt" it to get back to the master key, then derive any
- * other key.  For all new code, use HKDF instead.
- *
- * The master key must be at least as long as the derived key.  If the master
- * key is longer, then only the first 'derived_keysize' bytes are used.
- */
-static int derive_key_aes(const u8 *master_key,
-			  const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE],
-			  u8 *derived_key, unsigned int derived_keysize)
-{
-	int res = 0;
-	struct skcipher_request *req = NULL;
-	DECLARE_CRYPTO_WAIT(wait);
-	struct scatterlist src_sg, dst_sg;
-	struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
-
-	if (IS_ERR(tfm)) {
-		res = PTR_ERR(tfm);
-		tfm = NULL;
-		goto out;
-	}
-	crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
-	req = skcipher_request_alloc(tfm, GFP_NOFS);
-	if (!req) {
-		res = -ENOMEM;
-		goto out;
-	}
-	skcipher_request_set_callback(req,
-			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-			crypto_req_done, &wait);
-	res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE);
-	if (res < 0)
-		goto out;
-
-	sg_init_one(&src_sg, master_key, derived_keysize);
-	sg_init_one(&dst_sg, derived_key, derived_keysize);
-	skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
-				   NULL);
-	res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
-out:
-	skcipher_request_free(req);
-	crypto_free_skcipher(tfm);
-	return res;
-}
-
-/*
- * Search the current task's subscribed keyrings for a "logon" key with
- * description prefix:descriptor, and if found acquire a read lock on it and
- * return a pointer to its validated payload in *payload_ret.
- */
-static struct key *
-find_and_lock_process_key(const char *prefix,
-			  const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE],
-			  unsigned int min_keysize,
-			  const struct fscrypt_key **payload_ret)
-{
-	char *description;
-	struct key *key;
-	const struct user_key_payload *ukp;
-	const struct fscrypt_key *payload;
-
-	description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
-				FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
-	if (!description)
-		return ERR_PTR(-ENOMEM);
-
-	key = request_key(&key_type_logon, description, NULL);
-	kfree(description);
-	if (IS_ERR(key))
-		return key;
-
-	down_read(&key->sem);
-	ukp = user_key_payload_locked(key);
-
-	if (!ukp) /* was the key revoked before we acquired its semaphore? */
-		goto invalid;
-
-	payload = (const struct fscrypt_key *)ukp->data;
-
-	if (ukp->datalen != sizeof(struct fscrypt_key) ||
-	    payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
-		fscrypt_warn(NULL,
-			     "key with description '%s' has invalid payload",
-			     key->description);
-		goto invalid;
-	}
-
-	if (payload->size < min_keysize) {
-		fscrypt_warn(NULL,
-			     "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
-			     key->description, payload->size, min_keysize);
-		goto invalid;
-	}
-
-	*payload_ret = payload;
-	return key;
-
-invalid:
-	up_read(&key->sem);
-	key_put(key);
-	return ERR_PTR(-ENOKEY);
-}
-
 static struct fscrypt_mode available_modes[] = {
 	[FSCRYPT_MODE_AES_256_XTS] = {
 		.friendly_name = "AES-256-XTS",
@@ -188,9 +73,9 @@ select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
 }
 
 /* Create a symmetric cipher object for the given encryption mode and key */
-static struct crypto_skcipher *
-fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
-			  const struct inode *inode)
+struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
+						  const u8 *raw_key,
+						  const struct inode *inode)
 {
 	struct crypto_skcipher *tfm;
 	int err;
@@ -232,113 +117,6 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
 	return ERR_PTR(err);
 }
 
-/* Master key referenced by DIRECT_KEY policy */
-struct fscrypt_direct_key {
-	struct hlist_node		dk_node;
-	refcount_t			dk_refcount;
-	const struct fscrypt_mode	*dk_mode;
-	struct crypto_skcipher		*dk_ctfm;
-	u8				dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
-	u8				dk_raw[FSCRYPT_MAX_KEY_SIZE];
-};
-
-static void free_direct_key(struct fscrypt_direct_key *dk)
-{
-	if (dk) {
-		crypto_free_skcipher(dk->dk_ctfm);
-		kzfree(dk);
-	}
-}
-
-static void put_direct_key(struct fscrypt_direct_key *dk)
-{
-	if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock))
-		return;
-	hash_del(&dk->dk_node);
-	spin_unlock(&fscrypt_direct_keys_lock);
-
-	free_direct_key(dk);
-}
-
-/*
- * Find/insert the given key into the fscrypt_direct_keys table.  If found, it
- * is returned with elevated refcount, and 'to_insert' is freed if non-NULL.  If
- * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise
- * NULL is returned.
- */
-static struct fscrypt_direct_key *
-find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
-			  const u8 *raw_key, const struct fscrypt_info *ci)
-{
-	unsigned long hash_key;
-	struct fscrypt_direct_key *dk;
-
-	/*
-	 * Careful: to avoid potentially leaking secret key bytes via timing
-	 * information, we must key the hash table by descriptor rather than by
-	 * raw key, and use crypto_memneq() when comparing raw keys.
-	 */
-
-	BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE);
-	memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
-
-	spin_lock(&fscrypt_direct_keys_lock);
-	hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) {
-		if (memcmp(ci->ci_master_key_descriptor, dk->dk_descriptor,
-			   FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
-			continue;
-		if (ci->ci_mode != dk->dk_mode)
-			continue;
-		if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize))
-			continue;
-		/* using existing tfm with same (descriptor, mode, raw_key) */
-		refcount_inc(&dk->dk_refcount);
-		spin_unlock(&fscrypt_direct_keys_lock);
-		free_direct_key(to_insert);
-		return dk;
-	}
-	if (to_insert)
-		hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key);
-	spin_unlock(&fscrypt_direct_keys_lock);
-	return to_insert;
-}
-
-/* Prepare to encrypt directly using the master key in the given mode */
-static struct fscrypt_direct_key *
-fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
-{
-	struct fscrypt_direct_key *dk;
-	int err;
-
-	/* Is there already a tfm for this key? */
-	dk = find_or_insert_direct_key(NULL, raw_key, ci);
-	if (dk)
-		return dk;
-
-	/* Nope, allocate one. */
-	dk = kzalloc(sizeof(*dk), GFP_NOFS);
-	if (!dk)
-		return ERR_PTR(-ENOMEM);
-	refcount_set(&dk->dk_refcount, 1);
-	dk->dk_mode = ci->ci_mode;
-	dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key,
-						ci->ci_inode);
-	if (IS_ERR(dk->dk_ctfm)) {
-		err = PTR_ERR(dk->dk_ctfm);
-		dk->dk_ctfm = NULL;
-		goto err_free_dk;
-	}
-	memcpy(dk->dk_descriptor, ci->ci_master_key_descriptor,
-	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
-	memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize);
-
-	return find_or_insert_direct_key(dk, raw_key, ci);
-
-err_free_dk:
-	free_direct_key(dk);
-	return ERR_PTR(err);
-}
-
 static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
 {
 	struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
@@ -409,8 +187,7 @@ static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
 }
 
 /* Given the per-file key, set up the file's crypto transform object(s) */
-static int fscrypt_set_derived_key(struct fscrypt_info *ci,
-				   const u8 *derived_key)
+int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key)
 {
 	struct fscrypt_mode *mode = ci->ci_mode;
 	struct crypto_skcipher *ctfm;
@@ -434,97 +211,6 @@ static int fscrypt_set_derived_key(struct fscrypt_info *ci,
 	return 0;
 }
 
-/* v1 policy, DIRECT_KEY: use the master key directly */
-static int setup_v1_file_key_direct(struct fscrypt_info *ci,
-				    const u8 *raw_master_key)
-{
-	const struct fscrypt_mode *mode = ci->ci_mode;
-	struct fscrypt_direct_key *dk;
-
-	if (!fscrypt_mode_supports_direct_key(mode)) {
-		fscrypt_warn(ci->ci_inode,
-			     "Direct key mode not allowed with %s",
-			     mode->friendly_name);
-		return -EINVAL;
-	}
-
-	if (ci->ci_data_mode != ci->ci_filename_mode) {
-		fscrypt_warn(ci->ci_inode,
-			     "Direct key mode not allowed with different contents and filenames modes");
-		return -EINVAL;
-	}
-
-	/* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
-	if (WARN_ON(mode->needs_essiv))
-		return -EINVAL;
-
-	dk = fscrypt_get_direct_key(ci, raw_master_key);
-	if (IS_ERR(dk))
-		return PTR_ERR(dk);
-	ci->ci_direct_key = dk;
-	ci->ci_ctfm = dk->dk_ctfm;
-	return 0;
-}
-
-/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
-static int setup_v1_file_key_derived(struct fscrypt_info *ci,
-				     const u8 *raw_master_key)
-{
-	u8 *derived_key;
-	int err;
-
-	/*
-	 * This cannot be a stack buffer because it will be passed to the
-	 * scatterlist crypto API during derive_key_aes().
-	 */
-	derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
-	if (!derived_key)
-		return -ENOMEM;
-
-	err = derive_key_aes(raw_master_key, ci->ci_nonce,
-			     derived_key, ci->ci_mode->keysize);
-	if (err)
-		goto out;
-
-	err = fscrypt_set_derived_key(ci, derived_key);
-out:
-	kzfree(derived_key);
-	return err;
-}
-
-static int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
-				     const u8 *raw_master_key)
-{
-	if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
-		return setup_v1_file_key_direct(ci, raw_master_key);
-	else
-		return setup_v1_file_key_derived(ci, raw_master_key);
-}
-
-static int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
-						struct fscrypt_info *ci)
-{
-	struct key *key;
-	const struct fscrypt_key *payload;
-	int err;
-
-	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
-					ci->ci_master_key_descriptor,
-					ci->ci_mode->keysize, &payload);
-	if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
-		key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
-						ci->ci_master_key_descriptor,
-						ci->ci_mode->keysize, &payload);
-	}
-	if (IS_ERR(key))
-		return PTR_ERR(key);
-
-	err = fscrypt_setup_v1_file_key(ci, payload->raw);
-	up_read(&key->sem);
-	key_put(key);
-	return err;
-}
-
 /*
  * Find the master key, then set up the inode's actual encryption key.
  */
@@ -539,7 +225,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
 		return;
 
 	if (ci->ci_direct_key) {
-		put_direct_key(ci->ci_direct_key);
+		fscrypt_put_direct_key(ci->ci_direct_key);
 	} else {
 		crypto_free_skcipher(ci->ci_ctfm);
 		crypto_free_cipher(ci->ci_essiv_tfm);
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
new file mode 100644
index 00000000000000..631690bb6ed54c
--- /dev/null
+++ b/fs/crypto/keysetup_v1.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Key setup for v1 encryption policies
+ *
+ * Copyright 2015, 2019 Google LLC
+ */
+
+/*
+ * This file implements compatibility functions for the original encryption
+ * policy version ("v1"), including:
+ *
+ * - Deriving per-file keys using the AES-128-ECB based KDF
+ *   (rather than the new method of using HKDF-SHA512)
+ *
+ * - Retrieving fscrypt master keys from process-subscribed keyrings
+ *   (rather than the new method of using a filesystem-level keyring)
+ *
+ * - Handling policies with the DIRECT_KEY flag set using a master key table
+ *   (rather than the new method of implementing DIRECT_KEY with per-mode keys
+ *    managed alongside the master keys in the filesystem-level keyring)
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+#include <linux/hashtable.h>
+#include <linux/scatterlist.h>
+
+#include "fscrypt_private.h"
+
+/* Table of keys referenced by DIRECT_KEY policies */
+static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */
+static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);
+
+/*
+ * v1 key derivation function.  This generates the derived key by encrypting the
+ * master key with AES-128-ECB using the nonce as the AES key.  This provides a
+ * unique derived key with sufficient entropy for each inode.  However, it's
+ * nonstandard, non-extensible, doesn't evenly distribute the entropy from the
+ * master key, and is trivially reversible: an attacker who compromises a
+ * derived key can "decrypt" it to get back to the master key, then derive any
+ * other key.  For all new code, use HKDF instead.
+ *
+ * The master key must be at least as long as the derived key.  If the master
+ * key is longer, then only the first 'derived_keysize' bytes are used.
+ */
+static int derive_key_aes(const u8 *master_key,
+			  const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE],
+			  u8 *derived_key, unsigned int derived_keysize)
+{
+	int res = 0;
+	struct skcipher_request *req = NULL;
+	DECLARE_CRYPTO_WAIT(wait);
+	struct scatterlist src_sg, dst_sg;
+	struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
+
+	if (IS_ERR(tfm)) {
+		res = PTR_ERR(tfm);
+		tfm = NULL;
+		goto out;
+	}
+	crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+	req = skcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		res = -ENOMEM;
+		goto out;
+	}
+	skcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			crypto_req_done, &wait);
+	res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+	if (res < 0)
+		goto out;
+
+	sg_init_one(&src_sg, master_key, derived_keysize);
+	sg_init_one(&dst_sg, derived_key, derived_keysize);
+	skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
+				   NULL);
+	res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+out:
+	skcipher_request_free(req);
+	crypto_free_skcipher(tfm);
+	return res;
+}
+
+/*
+ * Search the current task's subscribed keyrings for a "logon" key with
+ * description prefix:descriptor, and if found acquire a read lock on it and
+ * return a pointer to its validated payload in *payload_ret.
+ */
+static struct key *
+find_and_lock_process_key(const char *prefix,
+			  const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE],
+			  unsigned int min_keysize,
+			  const struct fscrypt_key **payload_ret)
+{
+	char *description;
+	struct key *key;
+	const struct user_key_payload *ukp;
+	const struct fscrypt_key *payload;
+
+	description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+				FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
+	if (!description)
+		return ERR_PTR(-ENOMEM);
+
+	key = request_key(&key_type_logon, description, NULL);
+	kfree(description);
+	if (IS_ERR(key))
+		return key;
+
+	down_read(&key->sem);
+	ukp = user_key_payload_locked(key);
+
+	if (!ukp) /* was the key revoked before we acquired its semaphore? */
+		goto invalid;
+
+	payload = (const struct fscrypt_key *)ukp->data;
+
+	if (ukp->datalen != sizeof(struct fscrypt_key) ||
+	    payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
+		fscrypt_warn(NULL,
+			     "key with description '%s' has invalid payload",
+			     key->description);
+		goto invalid;
+	}
+
+	if (payload->size < min_keysize) {
+		fscrypt_warn(NULL,
+			     "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
+			     key->description, payload->size, min_keysize);
+		goto invalid;
+	}
+
+	*payload_ret = payload;
+	return key;
+
+invalid:
+	up_read(&key->sem);
+	key_put(key);
+	return ERR_PTR(-ENOKEY);
+}
+
+/* Master key referenced by DIRECT_KEY policy */
+struct fscrypt_direct_key {
+	struct hlist_node		dk_node;
+	refcount_t			dk_refcount;
+	const struct fscrypt_mode	*dk_mode;
+	struct crypto_skcipher		*dk_ctfm;
+	u8				dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
+	u8				dk_raw[FSCRYPT_MAX_KEY_SIZE];
+};
+
+static void free_direct_key(struct fscrypt_direct_key *dk)
+{
+	if (dk) {
+		crypto_free_skcipher(dk->dk_ctfm);
+		kzfree(dk);
+	}
+}
+
+void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
+{
+	if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock))
+		return;
+	hash_del(&dk->dk_node);
+	spin_unlock(&fscrypt_direct_keys_lock);
+
+	free_direct_key(dk);
+}
+
+/*
+ * Find/insert the given key into the fscrypt_direct_keys table.  If found, it
+ * is returned with elevated refcount, and 'to_insert' is freed if non-NULL.  If
+ * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise
+ * NULL is returned.
+ */
+static struct fscrypt_direct_key *
+find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
+			  const u8 *raw_key, const struct fscrypt_info *ci)
+{
+	unsigned long hash_key;
+	struct fscrypt_direct_key *dk;
+
+	/*
+	 * Careful: to avoid potentially leaking secret key bytes via timing
+	 * information, we must key the hash table by descriptor rather than by
+	 * raw key, and use crypto_memneq() when comparing raw keys.
+	 */
+
+	BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE);
+	memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
+
+	spin_lock(&fscrypt_direct_keys_lock);
+	hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) {
+		if (memcmp(ci->ci_master_key_descriptor, dk->dk_descriptor,
+			   FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
+			continue;
+		if (ci->ci_mode != dk->dk_mode)
+			continue;
+		if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize))
+			continue;
+		/* using existing tfm with same (descriptor, mode, raw_key) */
+		refcount_inc(&dk->dk_refcount);
+		spin_unlock(&fscrypt_direct_keys_lock);
+		free_direct_key(to_insert);
+		return dk;
+	}
+	if (to_insert)
+		hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key);
+	spin_unlock(&fscrypt_direct_keys_lock);
+	return to_insert;
+}
+
+/* Prepare to encrypt directly using the master key in the given mode */
+static struct fscrypt_direct_key *
+fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
+{
+	struct fscrypt_direct_key *dk;
+	int err;
+
+	/* Is there already a tfm for this key? */
+	dk = find_or_insert_direct_key(NULL, raw_key, ci);
+	if (dk)
+		return dk;
+
+	/* Nope, allocate one. */
+	dk = kzalloc(sizeof(*dk), GFP_NOFS);
+	if (!dk)
+		return ERR_PTR(-ENOMEM);
+	refcount_set(&dk->dk_refcount, 1);
+	dk->dk_mode = ci->ci_mode;
+	dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key,
+						ci->ci_inode);
+	if (IS_ERR(dk->dk_ctfm)) {
+		err = PTR_ERR(dk->dk_ctfm);
+		dk->dk_ctfm = NULL;
+		goto err_free_dk;
+	}
+	memcpy(dk->dk_descriptor, ci->ci_master_key_descriptor,
+	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
+	memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize);
+
+	return find_or_insert_direct_key(dk, raw_key, ci);
+
+err_free_dk:
+	free_direct_key(dk);
+	return ERR_PTR(err);
+}
+
+/* v1 policy, DIRECT_KEY: use the master key directly */
+static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+				    const u8 *raw_master_key)
+{
+	const struct fscrypt_mode *mode = ci->ci_mode;
+	struct fscrypt_direct_key *dk;
+
+	if (!fscrypt_mode_supports_direct_key(mode)) {
+		fscrypt_warn(ci->ci_inode,
+			     "Direct key mode not allowed with %s",
+			     mode->friendly_name);
+		return -EINVAL;
+	}
+
+	if (ci->ci_data_mode != ci->ci_filename_mode) {
+		fscrypt_warn(ci->ci_inode,
+			     "Direct key mode not allowed with different contents and filenames modes");
+		return -EINVAL;
+	}
+
+	/* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
+	if (WARN_ON(mode->needs_essiv))
+		return -EINVAL;
+
+	dk = fscrypt_get_direct_key(ci, raw_master_key);
+	if (IS_ERR(dk))
+		return PTR_ERR(dk);
+	ci->ci_direct_key = dk;
+	ci->ci_ctfm = dk->dk_ctfm;
+	return 0;
+}
+
+/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
+static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+				     const u8 *raw_master_key)
+{
+	u8 *derived_key;
+	int err;
+
+	/*
+	 * This cannot be a stack buffer because it will be passed to the
+	 * scatterlist crypto API during derive_key_aes().
+	 */
+	derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
+	if (!derived_key)
+		return -ENOMEM;
+
+	err = derive_key_aes(raw_master_key, ci->ci_nonce,
+			     derived_key, ci->ci_mode->keysize);
+	if (err)
+		goto out;
+
+	err = fscrypt_set_derived_key(ci, derived_key);
+out:
+	kzfree(derived_key);
+	return err;
+}
+
+int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
+{
+	if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
+		return setup_v1_file_key_direct(ci, raw_master_key);
+	else
+		return setup_v1_file_key_derived(ci, raw_master_key);
+}
+
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
+{
+	struct key *key;
+	const struct fscrypt_key *payload;
+	int err;
+
+	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
+					ci->ci_master_key_descriptor,
+					ci->ci_mode->keysize, &payload);
+	if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
+		key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+						ci->ci_master_key_descriptor,
+						ci->ci_mode->keysize, &payload);
+	}
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	err = fscrypt_setup_v1_file_key(ci, payload->raw);
+	up_read(&key->sem);
+	key_put(key);
+	return err;
+}
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

* [PATCH v8 08/20] fscrypt: rename keyinfo.c to keysetup.c
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Rename keyinfo.c to keysetup.c since this better describes what the file
does (sets up the key), and it matches the new file keysetup_v1.c.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/Makefile                  | 2 +-
 fs/crypto/fscrypt_private.h         | 2 +-
 fs/crypto/{keyinfo.c => keysetup.c} | 0
 include/linux/fscrypt.h             | 4 ++--
 4 files changed, 4 insertions(+), 4 deletions(-)
 rename fs/crypto/{keyinfo.c => keysetup.c} (100%)

diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index 1fba255c34ca56..ad14d4c29784a6 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_FS_ENCRYPTION)	+= fscrypto.o
 fscrypto-y := crypto.o \
 	      fname.o \
 	      hooks.o \
-	      keyinfo.o \
+	      keysetup.o \
 	      keysetup_v1.o \
 	      policy.o
 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 387b44b255f6ab..794dcba25ca826 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -156,7 +156,7 @@ extern bool fscrypt_fname_encrypted_size(const struct inode *inode,
 					 u32 orig_len, u32 max_len,
 					 u32 *encrypted_len_ret);
 
-/* keyinfo.c */
+/* keysetup.c */
 
 struct fscrypt_mode {
 	const char *friendly_name;
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keysetup.c
similarity index 100%
rename from fs/crypto/keyinfo.c
rename to fs/crypto/keysetup.c
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 81c0c754f8b21b..583802cb2e35d0 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -138,7 +138,7 @@ extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
 extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
 extern int fscrypt_inherit_context(struct inode *, struct inode *,
 					void *, bool);
-/* keyinfo.c */
+/* keysetup.c */
 extern int fscrypt_get_encryption_info(struct inode *);
 extern void fscrypt_put_encryption_info(struct inode *);
 extern void fscrypt_free_inode(struct inode *);
@@ -367,7 +367,7 @@ static inline int fscrypt_inherit_context(struct inode *parent,
 	return -EOPNOTSUPP;
 }
 
-/* keyinfo.c */
+/* keysetup.c */
 static inline int fscrypt_get_encryption_info(struct inode *inode)
 {
 	return -EOPNOTSUPP;
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

* [PATCH v8 09/20] fscrypt: add FS_IOC_ADD_ENCRYPTION_KEY ioctl
From: Eric Biggers @ 2019-08-05 16:25 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Satya Tangirala, Theodore Ts'o, linux-api, linux-f2fs-devel,
	keyrings, linux-mtd, linux-crypto, linux-fsdevel, Jaegeuk Kim,
	linux-ext4, Paul Crowley
In-Reply-To: <20190805162521.90882-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add a new fscrypt ioctl, FS_IOC_ADD_ENCRYPTION_KEY.  This ioctl adds an
encryption key to the filesystem's fscrypt keyring ->s_master_keys,
making any files encrypted with that key appear "unlocked".

Why we need this
~~~~~~~~~~~~~~~~

The main problem is that the "locked/unlocked" (ciphertext/plaintext)
status of encrypted files is global, but the fscrypt keys are not.
fscrypt only looks for keys in the keyring(s) the process accessing the
filesystem is subscribed to: the thread keyring, process keyring, and
session keyring, where the session keyring may contain the user keyring.

Therefore, userspace has to put fscrypt keys in the keyrings for
individual users or sessions.  But this means that when a process with a
different keyring tries to access encrypted files, whether they appear
"unlocked" or not is nondeterministic.  This is because it depends on
whether the files are currently present in the inode cache.

Fixing this by consistently providing each process its own view of the
filesystem depending on whether it has the key or not isn't feasible due
to how the VFS caches work.  Furthermore, while sometimes users expect
this behavior, it is misguided for two reasons.  First, it would be an
OS-level access control mechanism largely redundant with existing access
control mechanisms such as UNIX file permissions, ACLs, LSMs, etc.
Encryption is actually for protecting the data at rest.

Second, almost all users of fscrypt actually do need the keys to be
global.  The largest users of fscrypt, Android and Chromium OS, achieve
this by having PID 1 create a "session keyring" that is inherited by
every process.  This works, but it isn't scalable because it prevents
session keyrings from being used for any other purpose.

On general-purpose Linux distros, the 'fscrypt' userspace tool [1] can't
similarly abuse the session keyring, so to make 'sudo' work on all
systems it has to link all the user keyrings into root's user keyring
[2].  This is ugly and raises security concerns.  Moreover it can't make
the keys available to system services, such as sshd trying to access the
user's '~/.ssh' directory (see [3], [4]) or NetworkManager trying to
read certificates from the user's home directory (see [5]); or to Docker
containers (see [6], [7]).

By having an API to add a key to the *filesystem* we'll be able to fix
the above bugs, remove userspace workarounds, and clearly express the
intended semantics: the locked/unlocked status of an encrypted directory
is global, and encryption is orthogonal to OS-level access control.

Why not use the add_key() syscall
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We use an ioctl for this API rather than the existing add_key() system
call because the ioctl gives us the flexibility needed to implement
fscrypt-specific semantics that will be introduced in later patches:

- Supporting key removal with the semantics such that the secret is
  removed immediately and any unused inodes using the key are evicted;
  also, the eviction of any in-use inodes can be retried.

- Calculating a key-dependent cryptographic identifier and returning it
  to userspace.

- Allowing keys to be added and removed by non-root users, but only keys
  for v2 encryption policies; and to prevent denial-of-service attacks,
  users can only remove keys they themselves have added, and a key is
  only really removed after all users who added it have removed it.

Trying to shoehorn these semantics into the keyrings syscalls would be
very difficult, whereas the ioctls make things much easier.

However, to reuse code the implementation still uses the keyrings
service internally.  Thus we get lockless RCU-mode key lookups without
having to re-implement it, and the keys automatically show up in
/proc/keys for debugging purposes.

References:

    [1] https://github.com/google/fscrypt
    [2] https://goo.gl/55cCrI#heading=h.vf09isp98isb
    [3] https://github.com/google/fscrypt/issues/111#issuecomment-444347939
    [4] https://github.com/google/fscrypt/issues/116
    [5] https://bugs.launchpad.net/ubuntu/+source/fscrypt/+bug/1770715
    [6] https://github.com/google/fscrypt/issues/128
    [7] https://askubuntu.com/questions/1130306/cannot-run-docker-on-an-encrypted-filesystem

Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/Makefile           |   1 +
 fs/crypto/crypto.c           |  10 +-
 fs/crypto/fscrypt_private.h  |  62 +++++++-
 fs/crypto/keyring.c          | 286 +++++++++++++++++++++++++++++++++++
 fs/crypto/keysetup.c         |  35 ++++-
 fs/super.c                   |   2 +
 include/linux/fs.h           |   1 +
 include/linux/fscrypt.h      |  14 ++
 include/uapi/linux/fscrypt.h |  49 ++++--
 9 files changed, 447 insertions(+), 13 deletions(-)
 create mode 100644 fs/crypto/keyring.c

diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index ad14d4c29784a6..6b2485b4139335 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_FS_ENCRYPTION)	+= fscrypto.o
 fscrypto-y := crypto.o \
 	      fname.o \
 	      hooks.o \
+	      keyring.o \
 	      keysetup.o \
 	      keysetup_v1.o \
 	      policy.o
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 7502c1f0ede9e9..65ca077e8d585f 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -478,6 +478,8 @@ void fscrypt_msg(const struct inode *inode, const char *level,
  */
 static int __init fscrypt_init(void)
 {
+	int err = -ENOMEM;
+
 	/*
 	 * Use an unbound workqueue to allow bios to be decrypted in parallel
 	 * even when they happen to complete on the same CPU.  This sacrifices
@@ -500,13 +502,19 @@ static int __init fscrypt_init(void)
 	if (!fscrypt_info_cachep)
 		goto fail_free_ctx;
 
+	err = fscrypt_init_keyring();
+	if (err)
+		goto fail_free_info;
+
 	return 0;
 
+fail_free_info:
+	kmem_cache_destroy(fscrypt_info_cachep);
 fail_free_ctx:
 	kmem_cache_destroy(fscrypt_ctx_cachep);
 fail_free_queue:
 	destroy_workqueue(fscrypt_read_workqueue);
 fail:
-	return -ENOMEM;
+	return err;
 }
 late_initcall(fscrypt_init)
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 794dcba25ca826..0d9ebfd3bf3a54 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -14,9 +14,12 @@
 #include <linux/fscrypt.h>
 #include <crypto/hash.h>
 
-/* Encryption parameters */
+#define CONST_STRLEN(str)	(sizeof(str) - 1)
+
 #define FS_KEY_DERIVATION_NONCE_SIZE	16
 
+#define FSCRYPT_MIN_KEY_SIZE		16
+
 /**
  * Encryption context for inode
  *
@@ -156,6 +159,63 @@ extern bool fscrypt_fname_encrypted_size(const struct inode *inode,
 					 u32 orig_len, u32 max_len,
 					 u32 *encrypted_len_ret);
 
+/* keyring.c */
+
+/*
+ * fscrypt_master_key_secret - secret key material of an in-use master key
+ */
+struct fscrypt_master_key_secret {
+
+	/* Size of the raw key in bytes */
+	u32			size;
+
+	/* The raw key */
+	u8			raw[FSCRYPT_MAX_KEY_SIZE];
+
+} __randomize_layout;
+
+/*
+ * fscrypt_master_key - an in-use master key
+ *
+ * This represents a master encryption key which has been added to the
+ * filesystem and can be used to "unlock" the encrypted files which were
+ * encrypted with it.
+ */
+struct fscrypt_master_key {
+
+	/* The secret key material */
+	struct fscrypt_master_key_secret	mk_secret;
+
+	/* Arbitrary key descriptor which was assigned by userspace */
+	struct fscrypt_key_specifier		mk_spec;
+
+} __randomize_layout;
+
+static inline const char *master_key_spec_type(
+				const struct fscrypt_key_specifier *spec)
+{
+	switch (spec->type) {
+	case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+		return "descriptor";
+	}
+	return "[unknown]";
+}
+
+static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
+{
+	switch (spec->type) {
+	case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+		return FSCRYPT_KEY_DESCRIPTOR_SIZE;
+	}
+	return 0;
+}
+
+extern struct key *
+fscrypt_find_master_key(struct super_block *sb,
+			const struct fscrypt_key_specifier *mk_spec);
+
+extern int __init fscrypt_init_keyring(void);
+
 /* keysetup.c */
 
 struct fscrypt_mode {
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
new file mode 100644
index 00000000000000..bcd7d2836e1e4c
--- /dev/null
+++ b/fs/crypto/keyring.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Filesystem-level keyring for fscrypt
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * This file implements management of fscrypt master keys in the
+ * filesystem-level keyring, including the ioctls:
+ *
+ * - FS_IOC_ADD_ENCRYPTION_KEY
+ *
+ * See the "User API" section of Documentation/filesystems/fscrypt.rst for more
+ * information about these ioctls.
+ */
+
+#include <linux/key-type.h>
+#include <linux/seq_file.h>
+
+#include "fscrypt_private.h"
+
+static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
+{
+	memzero_explicit(secret, sizeof(*secret));
+}
+
+static void move_master_key_secret(struct fscrypt_master_key_secret *dst,
+				   struct fscrypt_master_key_secret *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+	memzero_explicit(src, sizeof(*src));
+}
+
+static void free_master_key(struct fscrypt_master_key *mk)
+{
+	wipe_master_key_secret(&mk->mk_secret);
+	kzfree(mk);
+}
+
+static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
+{
+	if (spec->__reserved)
+		return false;
+	return master_key_spec_len(spec) != 0;
+}
+
+static int fscrypt_key_instantiate(struct key *key,
+				   struct key_preparsed_payload *prep)
+{
+	key->payload.data[0] = (struct fscrypt_master_key *)prep->data;
+	return 0;
+}
+
+static void fscrypt_key_destroy(struct key *key)
+{
+	free_master_key(key->payload.data[0]);
+}
+
+static void fscrypt_key_describe(const struct key *key, struct seq_file *m)
+{
+	seq_puts(m, key->description);
+}
+
+/*
+ * Type of key in ->s_master_keys.  Each key of this type represents a master
+ * key which has been added to the filesystem.  Its payload is a
+ * 'struct fscrypt_master_key'.  The "." prefix in the key type name prevents
+ * users from adding keys of this type via the keyrings syscalls rather than via
+ * the intended method of FS_IOC_ADD_ENCRYPTION_KEY.
+ */
+static struct key_type key_type_fscrypt = {
+	.name			= "._fscrypt",
+	.instantiate		= fscrypt_key_instantiate,
+	.destroy		= fscrypt_key_destroy,
+	.describe		= fscrypt_key_describe,
+};
+
+/* Search ->s_master_keys */
+static struct key *search_fscrypt_keyring(struct key *keyring,
+					  struct key_type *type,
+					  const char *description)
+{
+	/*
+	 * We need to mark the keyring reference as "possessed" so that we
+	 * acquire permission to search it, via the KEY_POS_SEARCH permission.
+	 */
+	key_ref_t keyref = make_key_ref(keyring, true /* possessed */);
+
+	keyref = keyring_search(keyref, type, description, false);
+	if (IS_ERR(keyref)) {
+		if (PTR_ERR(keyref) == -EAGAIN || /* not found */
+		    PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
+			keyref = ERR_PTR(-ENOKEY);
+		return ERR_CAST(keyref);
+	}
+	return key_ref_to_ptr(keyref);
+}
+
+#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE	\
+	(CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id))
+
+#define FSCRYPT_MK_DESCRIPTION_SIZE	(2 * FSCRYPT_KEY_DESCRIPTOR_SIZE + 1)
+
+static void format_fs_keyring_description(
+			char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE],
+			const struct super_block *sb)
+{
+	sprintf(description, "fscrypt-%s", sb->s_id);
+}
+
+static void format_mk_description(
+			char description[FSCRYPT_MK_DESCRIPTION_SIZE],
+			const struct fscrypt_key_specifier *mk_spec)
+{
+	sprintf(description, "%*phN",
+		master_key_spec_len(mk_spec), (u8 *)&mk_spec->u);
+}
+
+/* Create ->s_master_keys if needed.  Synchronized by fscrypt_add_key_mutex. */
+static int allocate_filesystem_keyring(struct super_block *sb)
+{
+	char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE];
+	struct key *keyring;
+
+	if (sb->s_master_keys)
+		return 0;
+
+	format_fs_keyring_description(description, sb);
+	keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				current_cred(), KEY_POS_SEARCH |
+				  KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
+				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
+
+	/* Pairs with READ_ONCE() in fscrypt_find_master_key() */
+	smp_store_release(&sb->s_master_keys, keyring);
+	return 0;
+}
+
+void fscrypt_sb_free(struct super_block *sb)
+{
+	key_put(sb->s_master_keys);
+	sb->s_master_keys = NULL;
+}
+
+/*
+ * Find the specified master key in ->s_master_keys.
+ * Returns ERR_PTR(-ENOKEY) if not found.
+ */
+struct key *fscrypt_find_master_key(struct super_block *sb,
+				    const struct fscrypt_key_specifier *mk_spec)
+{
+	struct key *keyring;
+	char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+
+	/* pairs with smp_store_release() in allocate_filesystem_keyring() */
+	keyring = READ_ONCE(sb->s_master_keys);
+	if (keyring == NULL)
+		return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
+
+	format_mk_description(description, mk_spec);
+	return search_fscrypt_keyring(keyring, &key_type_fscrypt, description);
+}
+
+/*
+ * Allocate a new fscrypt_master_key which contains the given secret, set it as
+ * the payload of a new 'struct key' of type fscrypt, and link the 'struct key'
+ * into the given keyring.  Synchronized by fscrypt_add_key_mutex.
+ */
+static int add_new_master_key(struct fscrypt_master_key_secret *secret,
+			      const struct fscrypt_key_specifier *mk_spec,
+			      struct key *keyring)
+{
+	struct fscrypt_master_key *mk;
+	char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+	struct key *key;
+	int err;
+
+	mk = kzalloc(sizeof(*mk), GFP_KERNEL);
+	if (!mk)
+		return -ENOMEM;
+
+	mk->mk_spec = *mk_spec;
+
+	move_master_key_secret(&mk->mk_secret, secret);
+
+	format_mk_description(description, mk_spec);
+	key = key_alloc(&key_type_fscrypt, description,
+			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+			KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW,
+			KEY_ALLOC_NOT_IN_QUOTA, NULL);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		goto out_free_mk;
+	}
+	err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL);
+	key_put(key);
+	if (err)
+		goto out_free_mk;
+
+	return 0;
+
+out_free_mk:
+	free_master_key(mk);
+	return err;
+}
+
+static int add_master_key(struct super_block *sb,
+			  struct fscrypt_master_key_secret *secret,
+			  const struct fscrypt_key_specifier *mk_spec)
+{
+	static DEFINE_MUTEX(fscrypt_add_key_mutex);
+	struct key *key;
+	int err;
+
+	mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */
+	key = fscrypt_find_master_key(sb, mk_spec);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		if (err != -ENOKEY)
+			goto out_unlock;
+		/* Didn't find the key in ->s_master_keys.  Add it. */
+		err = allocate_filesystem_keyring(sb);
+		if (err)
+			goto out_unlock;
+		err = add_new_master_key(secret, mk_spec, sb->s_master_keys);
+	} else {
+		key_put(key);
+		err = 0;
+	}
+out_unlock:
+	mutex_unlock(&fscrypt_add_key_mutex);
+	return err;
+}
+
+/*
+ * Add a master encryption key to the filesystem, causing all files which were
+ * encrypted with it to appear "unlocked" (decrypted) when accessed.
+ *
+ * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of
+ * Documentation/filesystems/fscrypt.rst.
+ */
+int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg)
+{
+	struct super_block *sb = file_inode(filp)->i_sb;
+	struct fscrypt_add_key_arg __user *uarg = _uarg;
+	struct fscrypt_add_key_arg arg;
+	struct fscrypt_master_key_secret secret;
+	int err;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (!valid_key_spec(&arg.key_spec))
+		return -EINVAL;
+
+	if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE ||
+	    arg.raw_size > FSCRYPT_MAX_KEY_SIZE)
+		return -EINVAL;
+
+	if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
+		return -EINVAL;
+
+	memset(&secret, 0, sizeof(secret));
+	secret.size = arg.raw_size;
+	err = -EFAULT;
+	if (copy_from_user(secret.raw, uarg->raw, secret.size))
+		goto out_wipe_secret;
+
+	err = -EACCES;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out_wipe_secret;
+
+	err = add_master_key(sb, &secret, &arg.key_spec);
+out_wipe_secret:
+	wipe_master_key_secret(&secret);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);
+
+int __init fscrypt_init_keyring(void)
+{
+	return register_key_type(&key_type_fscrypt);
+}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index f4a47448e9efa1..1c6d18bcdc7b64 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -216,7 +216,40 @@ int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key)
  */
 static int setup_file_encryption_key(struct fscrypt_info *ci)
 {
-	return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
+	struct key *key;
+	struct fscrypt_master_key *mk = NULL;
+	struct fscrypt_key_specifier mk_spec;
+	int err;
+
+	mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
+	memcpy(mk_spec.u.descriptor, ci->ci_master_key_descriptor,
+	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
+
+	key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
+	if (IS_ERR(key)) {
+		if (key != ERR_PTR(-ENOKEY))
+			return PTR_ERR(key);
+
+		return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
+	}
+
+	mk = key->payload.data[0];
+
+	if (mk->mk_secret.size < ci->ci_mode->keysize) {
+		fscrypt_warn(NULL,
+			     "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+			     master_key_spec_type(&mk_spec),
+			     master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
+			     mk->mk_secret.size, ci->ci_mode->keysize);
+		err = -ENOKEY;
+		goto out_release_key;
+	}
+
+	err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
+
+out_release_key:
+	key_put(key);
+	return err;
 }
 
 static void put_crypt_info(struct fscrypt_info *ci)
diff --git a/fs/super.c b/fs/super.c
index 5960578a40760a..e486a442a61fb2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -32,6 +32,7 @@
 #include <linux/backing-dev.h>
 #include <linux/rculist_bl.h>
 #include <linux/cleancache.h>
+#include <linux/fscrypt.h>
 #include <linux/fsnotify.h>
 #include <linux/lockdep.h>
 #include <linux/user_namespace.h>
@@ -290,6 +291,7 @@ static void __put_super(struct super_block *s)
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(!list_empty(&s->s_mounts));
 		security_sb_free(s);
+		fscrypt_sb_free(s);
 		put_user_ns(s->s_user_ns);
 		kfree(s->s_subtype);
 		call_rcu(&s->rcu, destroy_super_rcu);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 997a530ff4e9d0..5dff77326cec60 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1427,6 +1427,7 @@ struct super_block {
 	const struct xattr_handler **s_xattr;
 #ifdef CONFIG_FS_ENCRYPTION
 	const struct fscrypt_operations	*s_cop;
+	struct key		*s_master_keys; /* master crypto keys in use */
 #endif
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 583802cb2e35d0..46bf66cf76ef88 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -138,6 +138,10 @@ extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
 extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
 extern int fscrypt_inherit_context(struct inode *, struct inode *,
 					void *, bool);
+/* keyring.c */
+extern void fscrypt_sb_free(struct super_block *sb);
+extern int fscrypt_ioctl_add_key(struct file *filp, void __user *arg);
+
 /* keysetup.c */
 extern int fscrypt_get_encryption_info(struct inode *);
 extern void fscrypt_put_encryption_info(struct inode *);
@@ -367,6 +371,16 @@ static inline int fscrypt_inherit_context(struct inode *parent,
 	return -EOPNOTSUPP;
 }
 
+/* keyring.c */
+static inline void fscrypt_sb_free(struct super_block *sb)
+{
+}
+
+static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
 /* keysetup.c */
 static inline int fscrypt_get_encryption_info(struct inode *inode)
 {
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
index 29a945d165def3..6aeca3cb0a2dec 100644
--- a/include/uapi/linux/fscrypt.h
+++ b/include/uapi/linux/fscrypt.h
@@ -36,22 +36,51 @@ struct fscrypt_policy {
 	__u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
 };
 
-#define FS_IOC_SET_ENCRYPTION_POLICY	_IOR('f', 19, struct fscrypt_policy)
-#define FS_IOC_GET_ENCRYPTION_PWSALT	_IOW('f', 20, __u8[16])
-#define FS_IOC_GET_ENCRYPTION_POLICY	_IOW('f', 21, struct fscrypt_policy)
-
-/* Parameters for passing an encryption key into the kernel keyring */
+/*
+ * Process-subscribed "logon" key description prefix and payload format.
+ * Deprecated; prefer FS_IOC_ADD_ENCRYPTION_KEY instead.
+ */
 #define FSCRYPT_KEY_DESC_PREFIX		"fscrypt:"
-#define FSCRYPT_KEY_DESC_PREFIX_SIZE		8
-
-/* Structure that userspace passes to the kernel keyring */
-#define FSCRYPT_MAX_KEY_SIZE			64
-
+#define FSCRYPT_KEY_DESC_PREFIX_SIZE	8
+#define FSCRYPT_MAX_KEY_SIZE		64
 struct fscrypt_key {
 	__u32 mode;
 	__u8 raw[FSCRYPT_MAX_KEY_SIZE];
 	__u32 size;
 };
+
+/*
+ * Keys are specified by an arbitrary 8-byte key "descriptor",
+ * matching fscrypt_policy::master_key_descriptor.
+ */
+#define FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR	1
+
+/*
+ * Specifies a key.  This doesn't contain the actual key itself; this is just
+ * the "name" of the key.
+ */
+struct fscrypt_key_specifier {
+	__u32 type;	/* one of FSCRYPT_KEY_SPEC_TYPE_* */
+	__u32 __reserved;
+	union {
+		__u8 __reserved[32]; /* reserve some extra space */
+		__u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
+	} u;
+};
+
+/* Struct passed to FS_IOC_ADD_ENCRYPTION_KEY */
+struct fscrypt_add_key_arg {
+	struct fscrypt_key_specifier key_spec;
+	__u32 raw_size;
+	__u32 __reserved[9];
+	__u8 raw[];
+};
+
+#define FS_IOC_SET_ENCRYPTION_POLICY		_IOR('f', 19, struct fscrypt_policy)
+#define FS_IOC_GET_ENCRYPTION_PWSALT		_IOW('f', 20, __u8[16])
+#define FS_IOC_GET_ENCRYPTION_POLICY		_IOW('f', 21, struct fscrypt_policy)
+#define FS_IOC_ADD_ENCRYPTION_KEY		_IOWR('f', 23, struct fscrypt_add_key_arg)
+
 /**********************************************************************/
 
 /* old names; don't add anything new here! */
-- 
2.22.0.770.g0f2c4a37fd-goog

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox