Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* [PATCHv5 6/8] fpga: add intel stratix10 soc fpga manager driver
From: richard.gong @ 2018-05-24 16:33 UTC (permalink / raw)
  To: catalin.marinas, will.deacon, dinguyen, robh+dt, mark.rutland,
	atull, mdf, arnd, gregkh, corbet
  Cc: linux-arm-kernel, linux-kernel, devicetree, linux-fpga, linux-doc,
	yves.vandervennet, richard.gong, richard.gong
In-Reply-To: <1527179600-26441-1-git-send-email-richard.gong@linux.intel.com>

From: Alan Tull <atull@kernel.org>

Add driver for reconfiguring Intel Stratix10 SoC FPGA devices.
This driver communicates through the Intel Service Driver which
does communication with privileged hardware (that does the
FPGA programming) through a secure mailbox.

Signed-off-by: Alan Tull <atull@kernel.org>
Signed-off-by: Richard Gong <richard.gong@intel.com>
---
v2: this patch is added in patch set version 2
v3: change to align to the update of service client APIs, and the
    update of fpga_mgr device node
v4: changes to align with stratix10-svc-client API updates
    add Richard's signed-off-by
v5: update to align changes at service layer to minimize service
    layer thread usages
---
 drivers/fpga/Kconfig         |   6 +
 drivers/fpga/Makefile        |   1 +
 drivers/fpga/stratix10-soc.c | 545 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 552 insertions(+)
 create mode 100644 drivers/fpga/stratix10-soc.c

diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig
index f47ef84..1624a73 100644
--- a/drivers/fpga/Kconfig
+++ b/drivers/fpga/Kconfig
@@ -57,6 +57,12 @@ config FPGA_MGR_ZYNQ_FPGA
 	help
 	  FPGA manager driver support for Xilinx Zynq FPGAs.
 
+config FPGA_MGR_STRATIX10_SOC
+	tristate "Intel Stratix10 SoC FPGA Manager"
+	depends on (ARCH_STRATIX10 && STRATIX10_SERVICE)
+	help
+	  FPGA manager driver support for the Intel Stratix10 SoC.
+
 config FPGA_MGR_XILINX_SPI
 	tristate "Xilinx Configuration over Slave Serial (SPI)"
 	depends on SPI
diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index 3cb276a..6eef670 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_FPGA_MGR_ALTERA_PS_SPI)	+= altera-ps-spi.o
 obj-$(CONFIG_FPGA_MGR_ICE40_SPI)	+= ice40-spi.o
 obj-$(CONFIG_FPGA_MGR_SOCFPGA)		+= socfpga.o
 obj-$(CONFIG_FPGA_MGR_SOCFPGA_A10)	+= socfpga-a10.o
+obj-$(CONFIG_FPGA_MGR_STRATIX10_SOC)	+= stratix10-soc.o
 obj-$(CONFIG_FPGA_MGR_TS73XX)		+= ts73xx-fpga.o
 obj-$(CONFIG_FPGA_MGR_XILINX_SPI)	+= xilinx-spi.o
 obj-$(CONFIG_FPGA_MGR_ZYNQ_FPGA)	+= zynq-fpga.o
diff --git a/drivers/fpga/stratix10-soc.c b/drivers/fpga/stratix10-soc.c
new file mode 100644
index 0000000..d645ef7
--- /dev/null
+++ b/drivers/fpga/stratix10-soc.c
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FPGA Manager Driver for Intel Stratix10 SoC
+ *
+ *  Copyright (C) 2018 Intel Corporation
+ */
+#include <linux/completion.h>
+#include <linux/fpga/fpga-mgr.h>
+#include <linux/stratix10-svc-client.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+/*
+ * FPGA programming requires a higher level of privilege (EL3), per the SoC
+ * design.
+ */
+#define NUM_SVC_BUFS	4
+#define SVC_BUF_SIZE	SZ_512K
+
+/* Indicates buffer is in use if set */
+#define SVC_BUF_LOCK	0
+
+/**
+ * struct s10_svc_buf
+ * @buf: virtual address of buf provided by service layer
+ * @lock: locked if buffer is in use
+ */
+struct s10_svc_buf {
+	char *buf;
+	unsigned long lock;
+};
+
+struct s10_priv {
+	struct stratix10_svc_chan *chan;
+	struct stratix10_svc_client client;
+	struct completion status_return_completion;
+	struct s10_svc_buf svc_bufs[NUM_SVC_BUFS];
+	unsigned long status;
+};
+
+static int s10_svc_send_msg(struct s10_priv *priv,
+			    enum stratix10_svc_command_code command,
+			    void *payload, u32 payload_length)
+{
+	struct stratix10_svc_chan *chan = priv->chan;
+	struct stratix10_svc_client_msg msg;
+	int ret;
+
+	pr_debug("%s cmd=%d payload=%p legnth=%d\n",
+		 __func__, command, payload, payload_length);
+
+	msg.command = command;
+	msg.payload = payload;
+	msg.payload_length = payload_length;
+
+	ret = stratix10_svc_send(chan, &msg);
+	pr_debug("stratix10_svc_send returned status %d\n", ret);
+
+	return ret;
+}
+
+/**
+ * s10_free_buffers
+ * Free buffers allocated from the service layer's pool that are not in use.
+ * @mgr: fpga manager struct
+ * Free all buffers that are not in use.
+ * Return true when all buffers are freed.
+ */
+static bool s10_free_buffers(struct fpga_manager *mgr)
+{
+	struct s10_priv *priv = mgr->priv;
+	uint num_free = 0;
+	uint i;
+
+	for (i = 0; i < NUM_SVC_BUFS; i++) {
+		if (!priv->svc_bufs[i].buf) {
+			num_free++;
+			continue;
+		}
+
+		if (!test_and_set_bit_lock(SVC_BUF_LOCK,
+					   &priv->svc_bufs[i].lock)) {
+			stratix10_svc_free_memory(priv->chan,
+					      priv->svc_bufs[i].buf);
+			priv->svc_bufs[i].buf = NULL;
+			num_free++;
+		}
+	}
+
+	return num_free == NUM_SVC_BUFS;
+}
+
+/**
+ * s10_free_buffer_count
+ * Count how many buffers are not in use.
+ * @mgr: fpga manager struct
+ * Return # of buffers that are not in use.
+ */
+static uint s10_free_buffer_count(struct fpga_manager *mgr)
+{
+	struct s10_priv *priv = mgr->priv;
+	uint num_free = 0;
+	uint i;
+
+	for (i = 0; i < NUM_SVC_BUFS; i++)
+		if (!priv->svc_bufs[i].buf)
+			num_free++;
+
+	return num_free;
+}
+
+/**
+ * s10_unlock_bufs
+ * Given the returned buffer address, match that address to our buffer struct
+ * and unlock that buffer.  This marks it as available to be refilled and sent
+ * (or freed).
+ * @priv: private data
+ * @kaddr: kernel address of buffer that was returned from service layer
+ */
+static void s10_unlock_bufs(struct s10_priv *priv, void *kaddr)
+{
+	uint i;
+
+	if (!kaddr)
+		return;
+
+	for (i = 0; i < NUM_SVC_BUFS; i++)
+		if (priv->svc_bufs[i].buf == kaddr) {
+			clear_bit_unlock(SVC_BUF_LOCK,
+					 &priv->svc_bufs[i].lock);
+			return;
+		}
+
+	WARN(1, "Unknown buffer returned from service layer %p\n", kaddr);
+}
+
+/**
+ * s10_receive_callback
+ * Callback for service layer to use to provide client (this driver) messages
+ * received through the mailbox.
+ * @client: service layer client struct
+ * @data: message
+ */
+static void s10_receive_callback(struct stratix10_svc_client *client,
+				 struct stratix10_svc_cb_data *data)
+{
+	struct s10_priv *priv = client->priv;
+	u32 status;
+	int i;
+
+	WARN_ONCE(!data, "%s: stratix10_svc_rc_data = NULL", __func__);
+
+	status = data->status;
+
+	/*
+	 * Here we set status bits as we receive them.  Elsewhere, we always use
+	 * test_and_clear_bit() to check status in priv->status
+	 */
+	for (i = 0; i <= SVC_STATUS_RECONFIG_ERROR; i++)
+		if (status & (1 << i))
+			set_bit(i, &priv->status);
+
+	if (status & BIT(SVC_STATUS_RECONFIG_BUFFER_DONE)) {
+		s10_unlock_bufs(priv, data->kaddr1);
+		s10_unlock_bufs(priv, data->kaddr2);
+		s10_unlock_bufs(priv, data->kaddr3);
+	}
+
+	complete(&priv->status_return_completion);
+}
+
+/**
+ * s10_ops_write_init
+ * Prepare for FPGA reconfiguration by requesting partial reconfig and
+ * allocating buffers from the service layer.
+ * @mgr: fpga manager
+ * @info: fpga image info
+ * @buf: fpga image buffer
+ * @count: size of buf in bytes
+ */
+static int s10_ops_write_init(struct fpga_manager *mgr,
+			      struct fpga_image_info *info,
+			      const char *buf, size_t count)
+{
+	struct s10_priv *priv = mgr->priv;
+	struct device *dev = priv->client.dev;
+	unsigned long timeout;
+	struct stratix10_svc_command_reconfig_payload payload;
+	char *kbuf;
+	uint i;
+	int ret;
+
+	if (info->flags & FPGA_MGR_PARTIAL_RECONFIG) {
+		dev_info(dev, "Requesting partial reconfiguration.\n");
+		payload.flags |= BIT(COMMAND_RECONFIG_FLAG_PARTIAL);
+	} else {
+		dev_info(dev, "Requesting full reconfiguration.\n");
+	}
+
+	reinit_completion(&priv->status_return_completion);
+	ret = s10_svc_send_msg(priv, COMMAND_RECONFIG,
+			       &payload, sizeof(payload));
+	if (ret)
+		goto init_done;
+
+	timeout = msecs_to_jiffies(SVC_RECONFIG_REQUEST_TIMEOUT_MS);
+	ret = wait_for_completion_interruptible_timeout(
+		&priv->status_return_completion, timeout);
+	if (!ret) {
+		dev_err(dev, "timeout waiting for RECONFIG_REQUEST\n");
+		ret = -ETIMEDOUT;
+		goto init_done;
+	}
+	if (ret < 0) {
+		dev_err(dev, "error (%d) waiting for RECONFIG_REQUEST\n", ret);
+		goto init_done;
+	}
+
+	ret = 0;
+	if (!test_and_clear_bit(SVC_STATUS_RECONFIG_REQUEST_OK,
+				&priv->status)) {
+		ret = -ETIMEDOUT;
+		goto init_done;
+	}
+
+	/* Allocate buffers from the service layer's pool. */
+	for (i = 0; i < NUM_SVC_BUFS; i++) {
+		kbuf = stratix10_svc_allocate_memory(priv->chan, SVC_BUF_SIZE);
+		if (!kbuf) {
+			s10_free_buffers(mgr);
+			ret = -ENOMEM;
+			goto init_done;
+		}
+
+		priv->svc_bufs[i].buf = kbuf;
+		priv->svc_bufs[i].lock = 0;
+	}
+
+init_done:
+	stratix10_svc_done(priv->chan);
+	return ret;
+}
+
+/**
+ * s10_send_buf
+ * Send a buffer to the service layer queue
+ * @mgr: fpga manager struct
+ * @buf_num: index of buffer in svc_bufs array
+ * @buf: fpga image buffer
+ * @count: size of buf in bytes
+ * Returns # of bytes transferred or -errno, never 0
+ */
+static int s10_send_buf(struct fpga_manager *mgr, uint buf_num,
+			const char *buf, size_t count)
+
+{
+	struct s10_priv *priv = mgr->priv;
+	struct device *dev = priv->client.dev;
+	void *svc_buf;
+	size_t xfer_sz;
+	int ret;
+
+	xfer_sz = count < SVC_BUF_SIZE ? count : SVC_BUF_SIZE;
+
+	svc_buf = priv->svc_bufs[buf_num].buf;
+	memcpy(svc_buf, buf, xfer_sz);
+	ret = s10_svc_send_msg(priv, COMMAND_RECONFIG_DATA_SUBMIT,
+			       svc_buf, xfer_sz);
+	if (ret) {
+		dev_err(dev,
+			"Error while sending data to service layer (%d)", ret);
+		return ret;
+	}
+
+	return xfer_sz;
+}
+
+/**
+ * s10_ops_write
+ * Send a FPGA image to privileged layers to write to the FPGA.  When done
+ * sending, free all service layer buffers we allocated in write_init.
+ * @mgr: fpga manager
+ * @buf: fpga image buffer
+ * @count: size of buf in bytes
+ * Returns 0 for success or negative errno.
+ */
+static int s10_ops_write(struct fpga_manager *mgr, const char *buf,
+			 size_t count)
+{
+	struct s10_priv *priv = mgr->priv;
+	struct device *dev = priv->client.dev;
+	unsigned long timeout;
+	size_t sent = 0;
+	int ret = 0;
+	uint i;
+
+	timeout = msecs_to_jiffies(SVC_RECONFIG_BUFFER_TIMEOUT_MS);
+
+	/* Buffer loop: either send buffers or free them. */
+	while (1) {
+		reinit_completion(&priv->status_return_completion);
+
+		if (count > 0) {
+			for (i = 0; i < NUM_SVC_BUFS; i++)
+				if (!test_and_set_bit_lock(
+					 SVC_BUF_LOCK, &priv->svc_bufs[i].lock))
+					break;
+
+			if (i == NUM_SVC_BUFS)
+				/* wait for a free buffer */
+				continue;
+
+			sent = s10_send_buf(mgr, i, buf, count);
+			/*
+			 * If service queue was full, we won't get a callback.
+			 * Wait and try again
+			 */
+			if (sent < 0)
+				continue;
+
+			count -= sent;
+			buf += sent;
+		} else {
+			s10_free_buffers(mgr);
+			if (s10_free_buffer_count(mgr) == NUM_SVC_BUFS)
+				return 0;
+
+			ret = s10_svc_send_msg(
+				priv, COMMAND_RECONFIG_DATA_CLAIM,
+				NULL, 0);
+			if (ret)
+				break;
+		}
+
+		/*
+		 * If callback hasn't already happened, wait for buffers to be
+		 * returned from service layer
+		 */
+		if (priv->status)
+			ret = 0;
+		else
+			ret = wait_for_completion_interruptible_timeout(
+				&priv->status_return_completion, timeout);
+
+		if (test_and_clear_bit(
+				SVC_STATUS_RECONFIG_BUFFER_DONE, &priv->status))
+			continue;
+
+		if (test_and_clear_bit(SVC_STATUS_RECONFIG_BUFFER_SUBMITTED,
+				       &priv->status))
+			continue;
+
+		if (test_and_clear_bit(SVC_STATUS_RECONFIG_ERROR,
+				       &priv->status)) {
+			dev_err(dev, "ERROR - giving up - SVC_STATUS_RECONFIG_ERROR\n");
+			ret = -EFAULT;
+			break;
+		}
+
+		if (!ret) {
+			dev_err(dev, "timeout waiting for svc layer buffers\n");
+			ret = -ETIMEDOUT;
+			break;
+		}
+		if (ret < 0) {
+			dev_err(dev,
+				"error (%d) waiting for svc layer buffers\n",
+				ret);
+			break;
+		}
+	}
+
+	s10_free_buffers(mgr);
+	if (s10_free_buffer_count(mgr) != NUM_SVC_BUFS)
+		dev_err(dev, "%s not all buffers were freed\n", __func__);
+
+	return ret;
+}
+
+/**
+ * s10_ops_write_complete
+ * Wait for FPGA configuration to be done
+ * @mgr: fpga manager
+ * @info: fpga image info
+ * Returns 0 for success negative errno.
+ */
+static int s10_ops_write_complete(struct fpga_manager *mgr,
+				  struct fpga_image_info *info)
+{
+	struct s10_priv *priv = mgr->priv;
+	struct device *dev = priv->client.dev;
+	unsigned long timeout;
+	int ret;
+
+	timeout = usecs_to_jiffies(info->config_complete_timeout_us);
+
+	do {
+		reinit_completion(&priv->status_return_completion);
+
+		ret = s10_svc_send_msg(priv, COMMAND_RECONFIG_STATUS, NULL, 0);
+		if (ret)
+			break;
+
+		ret = wait_for_completion_interruptible_timeout(
+			&priv->status_return_completion, timeout);
+		if (!ret) {
+			dev_err(dev,
+				"timeout waiting for RECONFIG_COMPLETED\n");
+			ret = -ETIMEDOUT;
+			break;
+		}
+		if (ret < 0) {
+			dev_err(dev,
+				"error (%d) waiting for RECONFIG_COMPLETED\n",
+				ret);
+			break;
+		}
+		/* Not error or timeout, so ret is # of jiffies until timeout */
+		timeout = ret;
+		ret = 0;
+
+		if (test_and_clear_bit(SVC_STATUS_RECONFIG_COMPLETED,
+				       &priv->status))
+			break;
+
+		if (test_and_clear_bit(SVC_STATUS_RECONFIG_ERROR,
+				       &priv->status)) {
+			dev_err(dev, "ERROR - giving up - SVC_STATUS_RECONFIG_ERROR\n");
+			ret = -EFAULT;
+			break;
+		}
+	} while (1);
+
+	stratix10_svc_done(priv->chan);
+	return ret;
+}
+
+static enum fpga_mgr_states s10_ops_state(struct fpga_manager *mgr)
+{
+	return FPGA_MGR_STATE_UNKNOWN;
+}
+
+static const struct fpga_manager_ops s10_ops = {
+	.state = s10_ops_state,
+	.write_init = s10_ops_write_init,
+	.write = s10_ops_write,
+	.write_complete = s10_ops_write_complete,
+};
+
+static int s10_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct s10_priv *priv;
+	int ret;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->client.dev = dev;
+	priv->client.receive_cb = s10_receive_callback;
+	priv->client.priv = priv;
+
+	priv->chan = stratix10_svc_request_channel_byname(&priv->client,
+						SVC_CLIENT_FPGA);
+	if (IS_ERR(priv->chan)) {
+		dev_err(dev, "couldn't get service channel (%s)\n",
+			SVC_CLIENT_FPGA);
+		return PTR_ERR(priv->chan);
+	}
+
+	init_completion(&priv->status_return_completion);
+
+	ret = fpga_mgr_register(dev, "Stratix10 SOC FPGA Manager",
+				&s10_ops, priv);
+
+	if (ret)
+		stratix10_svc_free_channel(priv->chan);
+
+	return ret;
+}
+
+static int s10_remove(struct platform_device *pdev)
+{
+	struct fpga_manager *mgr = platform_get_drvdata(pdev);
+	struct s10_priv *priv = mgr->priv;
+
+	fpga_mgr_unregister(&pdev->dev);
+	stratix10_svc_free_channel(priv->chan);
+
+	return 0;
+}
+
+static const struct of_device_id s10_of_match[] = {
+	{ .compatible = "intel,stratix10-soc-fpga-mgr", },
+	{},
+};
+
+MODULE_DEVICE_TABLE(of, s10_of_match);
+
+static struct platform_driver s10_driver = {
+	.probe = s10_probe,
+	.remove = s10_remove,
+	.driver = {
+		.name	= "Stratix10 SoC FPGA manager",
+		.of_match_table = of_match_ptr(s10_of_match),
+	},
+};
+
+static int __init s10_init(void)
+{
+	struct device_node *fw_np;
+	struct device_node *np;
+	int ret;
+
+	fw_np = of_find_node_by_name(NULL, "svc");
+	if (!fw_np)
+		return -ENODEV;
+
+	np = of_find_matching_node(fw_np, s10_of_match);
+	if (!np) {
+		of_node_put(fw_np);
+		return -ENODEV;
+	}
+
+	of_node_put(np);
+	ret = of_platform_populate(fw_np, s10_of_match, NULL, NULL);
+	of_node_put(fw_np);
+	if (ret)
+		return ret;
+
+	return platform_driver_register(&s10_driver);
+}
+
+static void __exit s10_exit(void)
+{
+	return platform_driver_unregister(&s10_driver);
+}
+
+module_init(s10_init);
+module_exit(s10_exit);
+
+MODULE_AUTHOR("Alan Tull <atull@kernel.org>");
+MODULE_DESCRIPTION("Intel Stratix 10 SOC FPGA Manager");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCHv5 7/8] defconfig: enable fpga and service layer
From: richard.gong @ 2018-05-24 16:33 UTC (permalink / raw)
  To: catalin.marinas, will.deacon, dinguyen, robh+dt, mark.rutland,
	atull, mdf, arnd, gregkh, corbet
  Cc: linux-arm-kernel, linux-kernel, devicetree, linux-fpga, linux-doc,
	yves.vandervennet, richard.gong, richard.gong
In-Reply-To: <1527179600-26441-1-git-send-email-richard.gong@linux.intel.com>

From: Richard Gong <richard.gong@intel.com>

Enable fpga framework, Stratix 10 SoC FPGA manager and Stratix10
Service Layer

Signed-off-by: Richard Gong <richard.gong@intel.com>
Signed-off-by: Alan Tull <atull@kernel.org>
---
v2: this patch is added in patch set version 2
v3: no change
v4: s/CONFIG_INTEL_SERVICE/CONFIG_STRATIX10_SERVICE/
    add CONFIG_OF_FPGA_REGION=y
    s/Intel/Stratix10/ in subject line
v5: no change
---
 arch/arm64/configs/defconfig | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index ecf6137..5f7a9b7 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -180,6 +180,7 @@ CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_NBD=m
 CONFIG_VIRTIO_BLK=y
 CONFIG_BLK_DEV_NVME=m
+CONFIG_STRATIX10_SERVICE=y
 CONFIG_SRAM=y
 CONFIG_EEPROM_AT25=m
 # CONFIG_SCSI_PROC_FS is not set
@@ -595,6 +596,11 @@ CONFIG_PHY_TEGRA_XUSB=y
 CONFIG_QCOM_L2_PMU=y
 CONFIG_QCOM_L3_PMU=y
 CONFIG_MESON_EFUSE=m
+CONFIG_FPGA=y
+CONFIG_FPGA_MGR_STRATIX10_SOC=y
+CONFIG_FPGA_REGION=y
+CONFIG_FPGA_BRIDGE=y
+CONFIG_OF_FPGA_REGION=y
 CONFIG_QCOM_QFPROM=y
 CONFIG_UNIPHIER_EFUSE=y
 CONFIG_TEE=y
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCHv5 8/8] Documentation: driver-api: add stratix10 service layer
From: richard.gong @ 2018-05-24 16:33 UTC (permalink / raw)
  To: catalin.marinas, will.deacon, dinguyen, robh+dt, mark.rutland,
	atull, mdf, arnd, gregkh, corbet
  Cc: linux-arm-kernel, linux-kernel, devicetree, linux-fpga, linux-doc,
	yves.vandervennet, richard.gong, richard.gong
In-Reply-To: <1527179600-26441-1-git-send-email-richard.gong@linux.intel.com>

From: Richard Gong <richard.gong@intel.com>

Add new file stratix10-svc.rst
Add stratix10-svc.rst to driver-api/index.rst

Signed-off-by: Richard Gong <richard.gong@intel.com>
Signed-off-by: Alan Tull <atull@kernel.org>
---
v5: this patch is added in patch set version 5
---
 Documentation/driver-api/index.rst         |  1 +
 Documentation/driver-api/stratix10-svc.rst | 32 ++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 Documentation/driver-api/stratix10-svc.rst

diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 6d8352c..4b31109 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -49,6 +49,7 @@ available subsections can be seen below.
    dmaengine/index
    slimbus
    soundwire/index
+   stratix10-svc
 
 .. only::  subproject and html
 
diff --git a/Documentation/driver-api/stratix10-svc.rst b/Documentation/driver-api/stratix10-svc.rst
new file mode 100644
index 0000000..ed361d8
--- /dev/null
+++ b/Documentation/driver-api/stratix10-svc.rst
@@ -0,0 +1,32 @@
+
+Intel Stratix10 SoC Service Layer
+=================================
+
+Some features of the Intel Stratix10 SoC require a level of privilege
+higher than the kernel is granted. Such secure features include
+FPGA programming. In terms of the ARMv8 architecture, the kernel runs
+at Exception Level 1 (EL1), access to the features requires
+Exception Level 3 (EL3).
+
+The Intel Stratix10 SoC service layer provides an in kernel API for
+drivers to request access to the secure features. The requests are queued
+and processed one by one. ARM’s SMCCC is used to pass the execution
+of the requests on to a secure monitor (EL3).
+
+.. kernel-doc:: include/linux/stratix10-svc-client.h
+   :functions: stratix10_svc_command_code
+
+.. kernel-doc:: include/linux/stratix10-svc-client.h
+   :functions: stratix10_svc_client_msg
+
+.. kernel-doc:: include/linux/stratix10-svc-client.h
+   :functions: stratix10_svc_command_reconfig_payload
+
+.. kernel-doc:: include/linux/stratix10-svc-client.h
+   :functions: stratix10_svc_cb_data
+
+.. kernel-doc:: include/linux/stratix10-svc-client.h
+   :functions: stratix10_svc_client
+
+.. kernel-doc:: drivers/misc/stratix10-svc.c
+   :export:
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCHv5 5/8] arm64: dts: stratix10: add fpga manager and region
From: richard.gong @ 2018-05-24 16:33 UTC (permalink / raw)
  To: catalin.marinas, will.deacon, dinguyen, robh+dt, mark.rutland,
	atull, mdf, arnd, gregkh, corbet
  Cc: linux-arm-kernel, linux-kernel, devicetree, linux-fpga, linux-doc,
	yves.vandervennet, richard.gong, richard.gong
In-Reply-To: <1527179600-26441-1-git-send-email-richard.gong@linux.intel.com>

From: Alan Tull <atull@kernel.org>

Add the Stratix10 FPGA manager and a FPGA region to the
device tree.

Signed-off-by: Alan Tull <atull@kernel.org>
Signed-off-by: Richard Gong <richard.gong@intel.com>
---
v2: this patch is added in patch set version 2
v3: change to put fpga_mgr node under firmware/svc node
v4: s/fpga-mgr@0/fpga-mgr/ to remove unit_address
    add Richard's signed-off-by
v5: no change
---
 arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
index c257287..8f8f409 100644
--- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
+++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
@@ -106,6 +106,14 @@
 		interrupt-parent = <&intc>;
 		ranges = <0 0 0 0xffffffff>;
 
+		base_fpga_region {
+			#address-cells = <0x1>;
+			#size-cells = <0x1>;
+
+			compatible = "fpga-region";
+			fpga-mgr = <&fpga_mgr>;
+		};
+
 		clkmgr: clock-controller@ffd10000 {
 			compatible = "intel,stratix10-clkmgr";
 			reg = <0xffd10000 0x1000>;
@@ -506,6 +514,10 @@
 				compatible = "intel,stratix10-svc";
 				method = "smc";
 				memory-region = <&service_reserved>;
+
+				fpga_mgr: fpga-mgr {
+					compatible = "intel,stratix10-soc-fpga-mgr";
+				};
 			};
 		};
 	};
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCHv5 0/8] Add Intel Stratix10 FPGA manager and service layer
From: richard.gong @ 2018-05-24 16:33 UTC (permalink / raw)
  To: catalin.marinas, will.deacon, dinguyen, robh+dt, mark.rutland,
	atull, mdf, arnd, gregkh, corbet
  Cc: linux-arm-kernel, linux-kernel, devicetree, linux-fpga, linux-doc,
	yves.vandervennet, richard.gong, richard.gong

From: Richard Gong <richard.gong@intel.com>

This is the 5th submission of Intel stratix10 service layer patches. Intel
Stratix10 FPGA manager, which is 1st Stratix10 service layer client, is
included in this submission.

Stratix10 service layer patches have been reviewed internally by Alan Tull
and other colleagues at Intel.

Some features of the Intel Stratix10 SoC require a level of privilege
higher than the kernel is granted. Such secure features include
FPGA programming. In terms of the ARMv8 architecture, the kernel runs
at Exception Level 1 (EL1), access to the features requires
Exception Level 3 (EL3).

The Intel Stratix10 service layer provides an in kernel API for drivers to
request access to the secure features. The requests are queued and
processed one by one. ARM’s SMCCC is used to pass the execution of the
requests on to a secure monitor (EL3).

Later the Intel Stratix10 service layer driver will be extended to provide
services for QSPI, Crypto and warm reset.

v2: add patches for FPGA manager, FPGA manager binding, dts and defconfig
    remove intel-service subdirectory and intel-service.h, move intel-smc.h
    and intel-service.c to driver/misc subdirectory
    remove global variables
    change service layer driver be 'default n'
    correct SPDX markers
    add timeout for do..while() loop
    add kernel-doc for the functions and structs, correct multiline comments
    replace kfifo_in/kfifo_out with kfifo_in_spinlocked/kfifo_out_spinlocked
    rename struct intel_svc_data (at client header) to intel_svc_client_msg
    rename struct intel_svc_private_mem to intel_svc_data
    other corrections/changes from Intel internal code reviews
v3: change all exported functions with "intel_svc_" as the prefix
    increase timeout values for claiming back submitted buffer(s)
    rename struct intel_command_reconfig_payload to
    struct intel_svc_command_reconfig_payload
    add pr_err() to provide the error return value
    change to put fpga_mgr node under firmware/svc node
    change to FPGA manager to align the update of service client APIs, and the
    update of fpga_mgr device node
    Other corrections/changes
v4: s/intel/stratix10/ on some variables, structs, functions, and file names
    intel-service.c -> stratix10-svc.c
    intel-smc.h -> stratix10-smc.h
    intel-service-client.h -> stratix10-svc-client.h
    remove non-kernel-doc formatting
    s/fpga-mgr@0/fpga-mgr/ to remove unit_address at fpga_mgr node
    add Rob's Reviewed-by
    add Richard's signed-off-by
v5: add a new API statix10_svc_done() which is called by service client
    when client request is completed or error occurs during request
    process. Which allows service layer to free its resources.
    remove dummy client from service layer client header and service layer
    source file.
    add Rob's Reviewed-by
    add a new file stratix10-svc.rst and add that to driver-api/index.rst
    kernel-doc fixes

Alan Tull (3):
  dt-bindings: fpga: add Stratix10 SoC FPGA manager binding
  arm64: dts: stratix10: add fpga manager and region
  fpga: add intel stratix10 soc fpga manager driver

Richard Gong (5):
  dt-bindings, firmware: add Intel Stratix10 service layer binding
  arm64: dts: stratix10: add stratix10 service driver binding to base
    dtsi
  driver, misc: add Intel Stratix10 service layer driver
  defconfig: enable fpga and service layer
  Documentation: driver-api: add stratix10 service layer

 .../bindings/firmware/intel,stratix10-svc.txt      |  57 ++
 .../bindings/fpga/intel-stratix10-soc-fpga-mgr.txt |  17 +
 Documentation/driver-api/index.rst                 |   1 +
 Documentation/driver-api/stratix10-svc.rst         |  32 +
 arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi  |  33 +
 arch/arm64/configs/defconfig                       |   6 +
 drivers/fpga/Kconfig                               |   6 +
 drivers/fpga/Makefile                              |   1 +
 drivers/fpga/stratix10-soc.c                       | 545 ++++++++++++
 drivers/misc/Kconfig                               |  12 +
 drivers/misc/Makefile                              |   1 +
 drivers/misc/stratix10-smc.h                       | 205 +++++
 drivers/misc/stratix10-svc.c                       | 984 +++++++++++++++++++++
 include/linux/stratix10-svc-client.h               | 199 +++++
 14 files changed, 2099 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/firmware/intel,stratix10-svc.txt
 create mode 100644 Documentation/devicetree/bindings/fpga/intel-stratix10-soc-fpga-mgr.txt
 create mode 100644 Documentation/driver-api/stratix10-svc.rst
 create mode 100644 drivers/fpga/stratix10-soc.c
 create mode 100644 drivers/misc/stratix10-smc.h
 create mode 100644 drivers/misc/stratix10-svc.c
 create mode 100644 include/linux/stratix10-svc-client.h

-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v3 6/9] trace_uprobe: Support SDT markers having reference count (semaphore)
From: Oleg Nesterov @ 2018-05-24 16:26 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, peterz, srikar, rostedt, acme, ananth, akpm,
	alexander.shishkin, alexis.berlemont, corbet, dan.j.williams,
	jolsa, kan.liang, kjlx, kstewart, linux-doc, linux-kernel,
	linux-mm, milian.wolff, mingo, namhyung, naveen.n.rao, pc, tglx,
	yao.jin, fengguang.wu, jglisse, Ravi Bangoria
In-Reply-To: <20180417043244.7501-7-ravi.bangoria@linux.vnet.ibm.com>

Hi Ravi,

sorry for delay!

I am trying to recall what this code should do ;) At first glance, I do
not see any serious problem in this version... except it doesn't apply
to Linus's tree. just one question for now.

On 04/17, Ravi Bangoria wrote:
>
> @@ -941,6 +1091,9 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
>  	if (ret)
>  		goto err_buffer;
>  
> +	if (tu->ref_ctr_offset)
> +		sdt_increment_ref_ctr(tu);
> +

iiuc, this is probe_event_enable()...

Looks racy, but afaics the race with uprobe_mmap() will be closed by the next
change. However, it seems that probe_event_disable() can race with trace_uprobe_mmap()
too and the next 7/9 patch won't help,

> +	if (tu->ref_ctr_offset)
> +		sdt_decrement_ref_ctr(tu);
> +
>  	uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
>  	tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;

so what if trace_uprobe_mmap() comes right after uprobe_unregister() ?
Note that trace_probe_is_enabled() is T until we update tp.flags.

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 3/6] cpuset: Add cpuset.sched.load_balance flag to v2
From: Peter Zijlstra @ 2018-05-24 15:43 UTC (permalink / raw)
  To: Waiman Long
  Cc: Tejun Heo, Li Zefan, Johannes Weiner, Ingo Molnar, cgroups,
	linux-kernel, linux-doc, kernel-team, pjt, luto, Mike Galbraith,
	torvalds, Roman Gushchin, Juri Lelli
In-Reply-To: <1526590545-3350-4-git-send-email-longman@redhat.com>

On Thu, May 17, 2018 at 04:55:42PM -0400, Waiman Long wrote:
> The sched.load_balance flag is needed to enable CPU isolation similar to
> what can be done with the "isolcpus" kernel boot parameter. Its value
> can only be changed in a scheduling domain with no child cpusets. On
> a non-scheduling domain cpuset, the value of sched.load_balance is
> inherited from its parent.
> 
> This flag is set by the parent and is not delegatable.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  Documentation/cgroup-v2.txt | 24 ++++++++++++++++++++
>  kernel/cgroup/cpuset.c      | 53 +++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 73 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
> index 54d9e22..071b634d 100644
> --- a/Documentation/cgroup-v2.txt
> +++ b/Documentation/cgroup-v2.txt
> @@ -1536,6 +1536,30 @@ Cpuset Interface Files
>  	CPUs of the parent cgroup. Once it is set, this flag cannot be
>  	cleared if there are any child cgroups with cpuset enabled.
>  
> +	A parent cgroup cannot distribute all its CPUs to child
> +	scheduling domain cgroups unless its load balancing flag is
> +	turned off.
> +
> +  cpuset.sched.load_balance
> +	A read-write single value file which exists on non-root
> +	cpuset-enabled cgroups.  It is a binary value flag that accepts
> +	either "0" (off) or a non-zero value (on).  This flag is set
> +	by the parent and is not delegatable.
> +
> +	When it is on, tasks within this cpuset will be load-balanced
> +	by the kernel scheduler.  Tasks will be moved from CPUs with
> +	high load to other CPUs within the same cpuset with less load
> +	periodically.
> +
> +	When it is off, there will be no load balancing among CPUs on
> +	this cgroup.  Tasks will stay in the CPUs they are running on
> +	and will not be moved to other CPUs.
> +
> +	The initial value of this flag is "1".	This flag is then
> +	inherited by child cgroups with cpuset enabled.  Its state
> +	can only be changed on a scheduling domain cgroup with no
> +	cpuset-enabled children.

I'm confused... why exactly do we have both domain and load_balance ?
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 2/6] cpuset: Add new v2 cpuset.sched.domain flag
From: Peter Zijlstra @ 2018-05-24 15:41 UTC (permalink / raw)
  To: Waiman Long
  Cc: Tejun Heo, Li Zefan, Johannes Weiner, Ingo Molnar, cgroups,
	linux-kernel, linux-doc, kernel-team, pjt, luto, Mike Galbraith,
	torvalds, Roman Gushchin, Juri Lelli
In-Reply-To: <1526590545-3350-3-git-send-email-longman@redhat.com>

On Thu, May 17, 2018 at 04:55:41PM -0400, Waiman Long wrote:
> A new cpuset.sched.domain boolean flag is added to cpuset v2. This new
> flag indicates that the CPUs in the current cpuset should be treated
> as a separate scheduling domain.

The traditional name for this is a partition.

>                                  This new flag is owned by the parent
> and will cause the CPUs in the cpuset to be removed from the effective
> CPUs of its parent.

This is a significant departure from existing behaviour, but one I can
appreciate. I don't immediately see something terribly wrong with it.

> This is implemented internally by adding a new isolated_cpus mask that
> holds the CPUs belonging to child scheduling domain cpusets so that:
> 
> 	isolated_cpus | effective_cpus = cpus_allowed
> 	isolated_cpus & effective_cpus = 0
> 
> This new flag can only be turned on in a cpuset if its parent is either
> root or a scheduling domain itself with non-empty cpu list. The state
> of this flag cannot be changed if the cpuset has children.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  Documentation/cgroup-v2.txt |  22 ++++
>  kernel/cgroup/cpuset.c      | 237 +++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 256 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
> index cf7bac6..54d9e22 100644
> --- a/Documentation/cgroup-v2.txt
> +++ b/Documentation/cgroup-v2.txt
> @@ -1514,6 +1514,28 @@ Cpuset Interface Files
>  	it is a subset of "cpuset.mems".  Its value will be affected
>  	by memory nodes hotplug events.
>  
> +  cpuset.sched.domain
> +	A read-write single value file which exists on non-root
> +	cpuset-enabled cgroups.  It is a binary value flag that accepts
> +	either "0" (off) or a non-zero value (on).

I would be conservative and only allow 0/1.

>                                                  This flag is set
> +	by the parent and is not delegatable.
> +
> +	If set, it indicates that the CPUs in the current cgroup will
> +	be the root of a scheduling domain.  The root cgroup is always
> +	a scheduling domain.  There are constraints on where this flag
> +	can be set.  It can only be set in a cgroup if all the following
> +	conditions are true.
> +
> +	1) The parent cgroup is also a scheduling domain with a non-empty
> +	   cpu list.

Ah, so initially I was confused by the requirement for root to have it
always set, but you'll allow child domains to steal _all_ CPUs, such
that root ends up with an empty effective set?

What about the (kernel) threads that cannot be moved out of the root
group?

> +	2) The list of CPUs are exclusive, i.e. they are not shared by
> +	   any of its siblings.

Right.

> +	3) There is no child cgroups with cpuset enabled.
> +
> +	Setting this flag will take the CPUs away from the effective
> +	CPUs of the parent cgroup. Once it is set, this flag cannot be
> +	cleared if there are any child cgroups with cpuset enabled.

This I'm not clear on. Why?


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 3/6] cpuset: Add cpuset.sched.load_balance flag to v2
From: Waiman Long @ 2018-05-24 15:22 UTC (permalink / raw)
  To: Juri Lelli
  Cc: Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	cgroups, linux-kernel, linux-doc, kernel-team, pjt, luto,
	Mike Galbraith, torvalds, Roman Gushchin
In-Reply-To: <20180524151656.GD3948@localhost.localdomain>

On 05/24/2018 11:16 AM, Juri Lelli wrote:
> On 24/05/18 11:09, Waiman Long wrote:
>> On 05/24/2018 10:36 AM, Juri Lelli wrote:
>>> On 17/05/18 16:55, Waiman Long wrote:
>>>
>>> [...]
>>>
>>>> +	A parent cgroup cannot distribute all its CPUs to child
>>>> +	scheduling domain cgroups unless its load balancing flag is
>>>> +	turned off.
>>>> +
>>>> +  cpuset.sched.load_balance
>>>> +	A read-write single value file which exists on non-root
>>>> +	cpuset-enabled cgroups.  It is a binary value flag that accepts
>>>> +	either "0" (off) or a non-zero value (on).  This flag is set
>>>> +	by the parent and is not delegatable.
>>>> +
>>>> +	When it is on, tasks within this cpuset will be load-balanced
>>>> +	by the kernel scheduler.  Tasks will be moved from CPUs with
>>>> +	high load to other CPUs within the same cpuset with less load
>>>> +	periodically.
>>>> +
>>>> +	When it is off, there will be no load balancing among CPUs on
>>>> +	this cgroup.  Tasks will stay in the CPUs they are running on
>>>> +	and will not be moved to other CPUs.
>>>> +
>>>> +	The initial value of this flag is "1".	This flag is then
>>>> +	inherited by child cgroups with cpuset enabled.  Its state
>>>> +	can only be changed on a scheduling domain cgroup with no
>>>> +	cpuset-enabled children.
>>> [...]
>>>
>>>> +	/*
>>>> +	 * On default hierachy, a load balance flag change is only allowed
>>>> +	 * in a scheduling domain with no child cpuset.
>>>> +	 */
>>>> +	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && balance_flag_changed &&
>>>> +	   (!is_sched_domain(cs) || css_has_online_children(&cs->css))) {
>>>> +		err = -EINVAL;
>>>> +		goto out;
>>>> +	}
>>> The rule is actually
>>>
>>>  - no child cpuset
>>>  - and it must be a scheduling domain
>>>
>>> Right?
>> Yes, because it doesn't make sense to have a cpu in one cpuset that has
>> loading balance off while, at the same time, in another cpuset with load
>> balancing turned on. This restriction is there to make sure that the
>> above condition will not happen. I may be wrong if there is a realistic
>> use case where the above condition is desired.
> Yep, makes sense to me.
>
> Maybe add the second condition to the comment and documentation.

Sure. Will do.

-Longman

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 3/6] cpuset: Add cpuset.sched.load_balance flag to v2
From: Juri Lelli @ 2018-05-24 15:16 UTC (permalink / raw)
  To: Waiman Long
  Cc: Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	cgroups, linux-kernel, linux-doc, kernel-team, pjt, luto,
	Mike Galbraith, torvalds, Roman Gushchin
In-Reply-To: <4bd31510-4f73-e263-8dc1-5edb0fe63b59@redhat.com>

On 24/05/18 11:09, Waiman Long wrote:
> On 05/24/2018 10:36 AM, Juri Lelli wrote:
> > On 17/05/18 16:55, Waiman Long wrote:
> >
> > [...]
> >
> >> +	A parent cgroup cannot distribute all its CPUs to child
> >> +	scheduling domain cgroups unless its load balancing flag is
> >> +	turned off.
> >> +
> >> +  cpuset.sched.load_balance
> >> +	A read-write single value file which exists on non-root
> >> +	cpuset-enabled cgroups.  It is a binary value flag that accepts
> >> +	either "0" (off) or a non-zero value (on).  This flag is set
> >> +	by the parent and is not delegatable.
> >> +
> >> +	When it is on, tasks within this cpuset will be load-balanced
> >> +	by the kernel scheduler.  Tasks will be moved from CPUs with
> >> +	high load to other CPUs within the same cpuset with less load
> >> +	periodically.
> >> +
> >> +	When it is off, there will be no load balancing among CPUs on
> >> +	this cgroup.  Tasks will stay in the CPUs they are running on
> >> +	and will not be moved to other CPUs.
> >> +
> >> +	The initial value of this flag is "1".	This flag is then
> >> +	inherited by child cgroups with cpuset enabled.  Its state
> >> +	can only be changed on a scheduling domain cgroup with no
> >> +	cpuset-enabled children.
> > [...]
> >
> >> +	/*
> >> +	 * On default hierachy, a load balance flag change is only allowed
> >> +	 * in a scheduling domain with no child cpuset.
> >> +	 */
> >> +	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && balance_flag_changed &&
> >> +	   (!is_sched_domain(cs) || css_has_online_children(&cs->css))) {
> >> +		err = -EINVAL;
> >> +		goto out;
> >> +	}
> > The rule is actually
> >
> >  - no child cpuset
> >  - and it must be a scheduling domain
> >
> > Right?
> 
> Yes, because it doesn't make sense to have a cpu in one cpuset that has
> loading balance off while, at the same time, in another cpuset with load
> balancing turned on. This restriction is there to make sure that the
> above condition will not happen. I may be wrong if there is a realistic
> use case where the above condition is desired.

Yep, makes sense to me.

Maybe add the second condition to the comment and documentation.
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 3/6] cpuset: Add cpuset.sched.load_balance flag to v2
From: Waiman Long @ 2018-05-24 15:09 UTC (permalink / raw)
  To: Juri Lelli
  Cc: Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	cgroups, linux-kernel, linux-doc, kernel-team, pjt, luto,
	Mike Galbraith, torvalds, Roman Gushchin
In-Reply-To: <20180524143614.GC3948@localhost.localdomain>

On 05/24/2018 10:36 AM, Juri Lelli wrote:
> On 17/05/18 16:55, Waiman Long wrote:
>
> [...]
>
>> +	A parent cgroup cannot distribute all its CPUs to child
>> +	scheduling domain cgroups unless its load balancing flag is
>> +	turned off.
>> +
>> +  cpuset.sched.load_balance
>> +	A read-write single value file which exists on non-root
>> +	cpuset-enabled cgroups.  It is a binary value flag that accepts
>> +	either "0" (off) or a non-zero value (on).  This flag is set
>> +	by the parent and is not delegatable.
>> +
>> +	When it is on, tasks within this cpuset will be load-balanced
>> +	by the kernel scheduler.  Tasks will be moved from CPUs with
>> +	high load to other CPUs within the same cpuset with less load
>> +	periodically.
>> +
>> +	When it is off, there will be no load balancing among CPUs on
>> +	this cgroup.  Tasks will stay in the CPUs they are running on
>> +	and will not be moved to other CPUs.
>> +
>> +	The initial value of this flag is "1".	This flag is then
>> +	inherited by child cgroups with cpuset enabled.  Its state
>> +	can only be changed on a scheduling domain cgroup with no
>> +	cpuset-enabled children.
> [...]
>
>> +	/*
>> +	 * On default hierachy, a load balance flag change is only allowed
>> +	 * in a scheduling domain with no child cpuset.
>> +	 */
>> +	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && balance_flag_changed &&
>> +	   (!is_sched_domain(cs) || css_has_online_children(&cs->css))) {
>> +		err = -EINVAL;
>> +		goto out;
>> +	}
> The rule is actually
>
>  - no child cpuset
>  - and it must be a scheduling domain
>
> Right?

Yes, because it doesn't make sense to have a cpu in one cpuset that has
loading balance off while, at the same time, in another cpuset with load
balancing turned on. This restriction is there to make sure that the
above condition will not happen. I may be wrong if there is a realistic
use case where the above condition is desired.

-Longman


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 3/6] cpuset: Add cpuset.sched.load_balance flag to v2
From: Juri Lelli @ 2018-05-24 14:36 UTC (permalink / raw)
  To: Waiman Long
  Cc: Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	cgroups, linux-kernel, linux-doc, kernel-team, pjt, luto,
	Mike Galbraith, torvalds, Roman Gushchin
In-Reply-To: <1526590545-3350-4-git-send-email-longman@redhat.com>

On 17/05/18 16:55, Waiman Long wrote:

[...]

> +	A parent cgroup cannot distribute all its CPUs to child
> +	scheduling domain cgroups unless its load balancing flag is
> +	turned off.
> +
> +  cpuset.sched.load_balance
> +	A read-write single value file which exists on non-root
> +	cpuset-enabled cgroups.  It is a binary value flag that accepts
> +	either "0" (off) or a non-zero value (on).  This flag is set
> +	by the parent and is not delegatable.
> +
> +	When it is on, tasks within this cpuset will be load-balanced
> +	by the kernel scheduler.  Tasks will be moved from CPUs with
> +	high load to other CPUs within the same cpuset with less load
> +	periodically.
> +
> +	When it is off, there will be no load balancing among CPUs on
> +	this cgroup.  Tasks will stay in the CPUs they are running on
> +	and will not be moved to other CPUs.
> +
> +	The initial value of this flag is "1".	This flag is then
> +	inherited by child cgroups with cpuset enabled.  Its state
> +	can only be changed on a scheduling domain cgroup with no
> +	cpuset-enabled children.

[...]

> +	/*
> +	 * On default hierachy, a load balance flag change is only allowed
> +	 * in a scheduling domain with no child cpuset.
> +	 */
> +	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && balance_flag_changed &&
> +	   (!is_sched_domain(cs) || css_has_online_children(&cs->css))) {
> +		err = -EINVAL;
> +		goto out;
> +	}

The rule is actually

 - no child cpuset
 - and it must be a scheduling domain

Right?
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 0/7] mm: pages for hugetlb's overcommit may be able to charge to memcg
From: Michal Hocko @ 2018-05-24 13:24 UTC (permalink / raw)
  To: TSUKADA Koutaro
  Cc: Mike Kravetz, Johannes Weiner, Vladimir Davydov, Jonathan Corbet,
	Luis R. Rodriguez, Kees Cook, Andrew Morton, Roman Gushchin,
	David Rientjes, Aneesh Kumar K.V, Naoya Horiguchi,
	Anshuman Khandual, Marc-Andre Lureau, Punit Agrawal, Dan Williams,
	Vlastimil Babka, linux-doc, linux-kernel, linux-fsdevel, linux-mm,
	cgroups
In-Reply-To: <b2afbff6-b59f-7105-3808-64d41bd4a3a8@ascade.co.jp>

On Thu 24-05-18 21:58:49, TSUKADA Koutaro wrote:
> On 2018/05/24 17:20, Michal Hocko wrote:
> > On Thu 24-05-18 13:39:59, TSUKADA Koutaro wrote:
> >> On 2018/05/23 3:54, Michal Hocko wrote:
> > [...]
> >>> I am also quite confused why you keep distinguishing surplus hugetlb
> >>> pages from regular preallocated ones. Being a surplus page is an
> >>> implementation detail that we use for an internal accounting rather than
> >>> something to exhibit to the userspace even more than we do currently.
> >>
> >> I apologize for having confused.
> >>
> >> The hugetlb pages obtained from the pool do not waste the buddy pool.
> > 
> > Because they have already allocated from the buddy allocator so the end
> > result is very same.
> > 
> >> On
> >> the other hand, surplus hugetlb pages waste the buddy pool. Due to this
> >> difference in property, I thought it could be distinguished.
> > 
> > But this is simply not correct. Surplus pages are fluid. If you increase
> > the hugetlb size they will become regular persistent hugetlb pages.
> 
> I really can not understand what's wrong with this. That page is obviously
> released before being added to the persistent pool, and at that time it is
> uncharged from memcg to which the task belongs(This assumes my patch-set).
> After that, the same page obtained from the pool is not surplus hugepage
> so it will not be charged to memcg again.

I do not see anything like that. adjust_pool_surplus is simply and
accounting thing. At least the last time I've checked. Maybe your
patchset handles that?
 
> >> Although my memcg knowledge is extremely limited, memcg is accounting for
> >> various kinds of pages obtained from the buddy pool by the task belonging
> >> to it. I would like to argue that surplus hugepage has specificity in
> >> terms of obtaining from the buddy pool, and that it is specially permitted
> >> charge requirements for memcg.
> > 
> > Not really. Memcg accounts primarily for reclaimable memory. We do
> > account for some non-reclaimable slabs but the life time should be at
> > least bound to a process life time. Otherwise the memcg oom killer
> > behavior is not guaranteed to unclutter the situation. Hugetlb pages are
> > simply persistent. Well, to be completely honest tmpfs pages have a
> > similar problem but lacking the swap space for them is kinda
> > configuration bug.
> 
> Absolutely you are saying the right thing, but, for example, can mlock(2)ed
> pages be swapped out by reclaim?(What is the difference between mlock(2)ed
> pages and hugetlb page?)

No mlocked pages cannot be reclaimed and that is why we restrict them to
a relatively small amount.
 
> >> It seems very strange that charge hugetlb page to memcg, but essentially
> >> it only charges the usage of the compound page obtained from the buddy pool,
> >> and even if that page is used as hugetlb page after that, memcg is not
> >> interested in that.
> > 
> > Ohh, it is very much interested. The primary goal of memcg is to enforce
> > the limit. How are you going to do that in an absence of the reclaimable
> > memory? And quite a lot of it because hugetlb pages usually consume a
> > lot of memory.
> 
> Simply kill any of the tasks belonging to that memcg. Maybe, no one wants
> reclaim at the time of account of with surplus hugepages.

But that will not release the hugetlb memory, does it?
 
> [...]
> >> I could not understand the intention of this question, sorry. When resize
> >> the pool, I think that the number of surplus hugepages in use does not
> >> change. Could you explain what you were concerned about?
> > 
> > It does change when you change the hugetlb pool size, migrate pages
> > between per-numa pools (have a look at adjust_pool_surplus).
> 
> As I looked at, what kind of fatal problem is caused by charging surplus
> hugepages to memcg by just manipulating counter of statistical information?

Fatal? Not sure. It simply tries to add an alien memory to the memcg
concept so I would pressume an unexpected behavior (e.g. not being able
to reclaim memcg or, over reclaim, trashing etc.).
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] libata: remove ata_sff_data_xfer_noirq()
From: Sebastian Andrzej Siewior @ 2018-05-24 13:58 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-ide, tglx, Bartlomiej Zolnierkiewicz, linux-doc,
	Jonathan Corbet
In-Reply-To: <20180507155216.g3ycg3e5pvnmhztz@linutronix.de>

On 2018-05-07 17:52:16 [+0200], To Tejun Heo wrote:
> On 2018-05-07 08:49:08 [-0700], Tejun Heo wrote:
> > Hello, Sebastian.
Hi Tejun,

> > On Fri, May 04, 2018 at 05:06:20PM +0200, Sebastian Andrzej Siewior wrote:
> > > ata_sff_data_xfer_noirq() is invoked via the ->sff_data_xfer hook. The
> > > latter is invoked by ata_pio_sector(), atapi_send_cdb() and
> > > __atapi_pio_bytes() which in turn is invoked by ata_sff_hsm_move().
> > > The latter function requires that the "ap->lock" lock is held which
> > > needs to be taken with disabled interrupts.
> > > 
> > > There is no need have to have ata_sff_data_xfer_noirq() which invokes
> > > ata_sff_data_xfer32() with disabled interrupts because at this point the
> > > interrupts are already disabled.
> > > Remove the function and its references to it and replace all callers
> > > with ata_sff_data_xfer32().
> > 
> > Can you please add irq disabled assert to ata_sff_data_xfer*()?
> 
> Why irq-disabled assert? Can we use lockdep_assert_held() instead?
That irq-disabled assert won't work on RT as expected that is why I
intend to remove the local_irq_save() (which is not needed). If we could
avoid the irq-disabled assert or use a lock instead, then it wouldn't
trigger another error on RT.

> > Thanks.

Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 0/7] mm: pages for hugetlb's overcommit may be able to charge to memcg
From: TSUKADA Koutaro @ 2018-05-24 12:58 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Mike Kravetz, Johannes Weiner, Vladimir Davydov, Jonathan Corbet,
	Luis R. Rodriguez, Kees Cook, Andrew Morton, Roman Gushchin,
	David Rientjes, Aneesh Kumar K.V, Naoya Horiguchi,
	Anshuman Khandual, Marc-Andre Lureau, Punit Agrawal, Dan Williams,
	Vlastimil Babka, linux-doc, linux-kernel, linux-fsdevel, linux-mm,
	cgroups
In-Reply-To: <20180524082044.GW20441@dhcp22.suse.cz>

On 2018/05/24 17:20, Michal Hocko wrote:
> On Thu 24-05-18 13:39:59, TSUKADA Koutaro wrote:
>> On 2018/05/23 3:54, Michal Hocko wrote:
> [...]
>>> I am also quite confused why you keep distinguishing surplus hugetlb
>>> pages from regular preallocated ones. Being a surplus page is an
>>> implementation detail that we use for an internal accounting rather than
>>> something to exhibit to the userspace even more than we do currently.
>>
>> I apologize for having confused.
>>
>> The hugetlb pages obtained from the pool do not waste the buddy pool.
> 
> Because they have already allocated from the buddy allocator so the end
> result is very same.
> 
>> On
>> the other hand, surplus hugetlb pages waste the buddy pool. Due to this
>> difference in property, I thought it could be distinguished.
> 
> But this is simply not correct. Surplus pages are fluid. If you increase
> the hugetlb size they will become regular persistent hugetlb pages.

I really can not understand what's wrong with this. That page is obviously
released before being added to the persistent pool, and at that time it is
uncharged from memcg to which the task belongs(This assumes my patch-set).
After that, the same page obtained from the pool is not surplus hugepage
so it will not be charged to memcg again.

>> Although my memcg knowledge is extremely limited, memcg is accounting for
>> various kinds of pages obtained from the buddy pool by the task belonging
>> to it. I would like to argue that surplus hugepage has specificity in
>> terms of obtaining from the buddy pool, and that it is specially permitted
>> charge requirements for memcg.
> 
> Not really. Memcg accounts primarily for reclaimable memory. We do
> account for some non-reclaimable slabs but the life time should be at
> least bound to a process life time. Otherwise the memcg oom killer
> behavior is not guaranteed to unclutter the situation. Hugetlb pages are
> simply persistent. Well, to be completely honest tmpfs pages have a
> similar problem but lacking the swap space for them is kinda
> configuration bug.

Absolutely you are saying the right thing, but, for example, can mlock(2)ed
pages be swapped out by reclaim?(What is the difference between mlock(2)ed
pages and hugetlb page?)

>> It seems very strange that charge hugetlb page to memcg, but essentially
>> it only charges the usage of the compound page obtained from the buddy pool,
>> and even if that page is used as hugetlb page after that, memcg is not
>> interested in that.
> 
> Ohh, it is very much interested. The primary goal of memcg is to enforce
> the limit. How are you going to do that in an absence of the reclaimable
> memory? And quite a lot of it because hugetlb pages usually consume a
> lot of memory.

Simply kill any of the tasks belonging to that memcg. Maybe, no one wants
reclaim at the time of account of with surplus hugepages.

[...]
>> I could not understand the intention of this question, sorry. When resize
>> the pool, I think that the number of surplus hugepages in use does not
>> change. Could you explain what you were concerned about?
> 
> It does change when you change the hugetlb pool size, migrate pages
> between per-numa pools (have a look at adjust_pool_surplus).

As I looked at, what kind of fatal problem is caused by charging surplus
hugepages to memcg by just manipulating counter of statistical information?

-- 
Thanks,
Tsukada

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 07/24] arm64: ilp32: add documentation on the ILP32 ABI for ARM64
From: Dr. Philipp Tomsich @ 2018-05-24 12:24 UTC (permalink / raw)
  To: Yury Norov
  Cc: Pavel Machek, Catalin Marinas, Arnd Bergmann, linux-arm-kernel,
	LKML, linux-doc, linux-arch, linux-api, Adam Borowski,
	Alexander Graf, Alexey Klimov, Andreas Schwab, Andrew Pinski,
	Bamvor Zhangjian, Chris Metcalf, Christoph Muellner, Dave Martin,
	David S . Miller, Florian Weimer, Geert Uytterhoeven,
	Heiko Carstens, James Hogan, James Morse, Joseph Myers,
	Lin Yongting, Manuel Montezelo, Mark Brown, Martin Schwidefsky,
	Maxim Kuvyrkov, Nathan_Lynch, Prasun Kapoor, Ramana Radhakrishnan,
	Steve Ellcey, Szabolcs Nagy
In-Reply-To: <20180524121524.GA3873@yury-thinkpad>

Yury & Pavel,

> On 24 May 2018, at 14:15, Yury Norov <ynorov@caviumnetworks.com> wrote:
> 
> Hi Pavel,
> 
> On Wed, May 23, 2018 at 04:06:20PM +0200, Pavel Machek wrote:
>> On Wed 2018-05-16 11:18:52, Yury Norov wrote:
>>> Based on Andrew Pinski's patch-series.
>>> 
>>> Signed-off-by: Yury Norov <ynorov@caviumnetworks.com>
>> 
>> So Andrew's signoff should be here?
> 
> Yes it should, but it lost since v4. I'll restore it.
> 
>>> ---
>>> Documentation/arm64/ilp32.txt | 45 +++++++++++++++++++++++++++++++++++
>>> 1 file changed, 45 insertions(+)
>>> create mode 100644 Documentation/arm64/ilp32.txt
>>> 
>>> diff --git a/Documentation/arm64/ilp32.txt b/Documentation/arm64/ilp32.txt
>>> new file mode 100644
>>> index 000000000000..d0fd5109c4b2
>>> --- /dev/null
>>> +++ b/Documentation/arm64/ilp32.txt
>>> @@ -0,0 +1,45 @@
>>> +ILP32 AARCH64 SYSCALL ABI
>>> +=========================
>>> +
>>> +This document describes the ILP32 syscall ABI and where it differs
>>> +from the generic compat linux syscall interface.
>> 
>> I was hoping to learn what ILP32 is / what is it good for, but no,
>> this does not tell me... it would be good to do a short explanation
>> here, and maybe reference it from cover letter of the series...
>> 								Pavel
> 
> ILP32 is ABI acronym that means "Integers, Longs and Pointers are 32-bit".
> And LP64 means "Longs and Pointers are 64-bit”.

Just a nitpick: ILP32 is in fact just the memory model, but calling from ILP32
code into the Linux kernel requires modifications to the syscall-ABI due to
datastructure layout changing (every time a pointer or a ‘long’ is present in
a structure). As such structures are passed between the userspace and the
kernel (and also due to the fact that time_t is an ‘unsigned long’ in the C
language standard), modifications to the syscall ABI in Linux are needed to
support ILP32 processes calling into the kernel.

Things get a bit more involved, as the final consensus was to pass 64bit
quantities in the lower half of 2 64bit registers instead of as a single register:
this makes the way (on AArch64) that an ILP32 process calls into the kernel
more dissimilar from a LP64 process calling the same syscall.

What this rambling boils down to is: “ILP32" is the memory model, whereas
this series deals with the “Linux/AArch64 syscall ABI for ILP32 processes”.

Thanks,
Phil.

> 
> There's AN490 - "ILP32 for AArch64 Whitepaper" from ARM which covers
> the topic:
> http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dai0490a/ar01s01.html
> 
> And some talks:
> http://connect.linaro.org/resource/bkk16/bkk16-305b/
> 
> Briefly, ILP32 is 32-bit ABI that works with AARCH64 instruction set. It looks
> better in some performance tests, and is useful for compatibility with 32-bit
> legacy code.
> 
> If you're more familiar with x86 terminology, in ARM world LP64 corresponds
> to x86_64, AARCH32_EL0 corresponds to x86_32, and ILP32 corresponds to x32
> ABI.
> 
> I'll add link to AN490 in next submission.
> 
> Yury
> 
>> -- 
>> (english) http://www.livejournal.com/~pavelmachek
>> (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 07/24] arm64: ilp32: add documentation on the ILP32 ABI for ARM64
From: Yury Norov @ 2018-05-24 12:15 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Catalin Marinas, Arnd Bergmann, linux-arm-kernel, linux-kernel,
	linux-doc, linux-arch, linux-api, Adam Borowski, Alexander Graf,
	Alexey Klimov, Andreas Schwab, Andrew Pinski, Bamvor Zhangjian,
	Chris Metcalf, Christoph Muellner, Dave Martin, David S . Miller,
	Florian Weimer, Geert Uytterhoeven, Heiko Carstens, James Hogan,
	James Morse, Joseph Myers, Lin Yongting, Manuel Montezelo,
	Mark Brown, Martin Schwidefsky, Maxim Kuvyrkov, Nathan_Lynch,
	Philipp Tomsich, Prasun Kapoor, Ramana Radhakrishnan,
	Steve Ellcey, Szabolcs Nagy
In-Reply-To: <20180523140620.GA27215@amd>

Hi Pavel,

On Wed, May 23, 2018 at 04:06:20PM +0200, Pavel Machek wrote:
> On Wed 2018-05-16 11:18:52, Yury Norov wrote:
> > Based on Andrew Pinski's patch-series.
> > 
> > Signed-off-by: Yury Norov <ynorov@caviumnetworks.com>
> 
> So Andrew's signoff should be here?

Yes it should, but it lost since v4. I'll restore it.
 
> > ---
> >  Documentation/arm64/ilp32.txt | 45 +++++++++++++++++++++++++++++++++++
> >  1 file changed, 45 insertions(+)
> >  create mode 100644 Documentation/arm64/ilp32.txt
> > 
> > diff --git a/Documentation/arm64/ilp32.txt b/Documentation/arm64/ilp32.txt
> > new file mode 100644
> > index 000000000000..d0fd5109c4b2
> > --- /dev/null
> > +++ b/Documentation/arm64/ilp32.txt
> > @@ -0,0 +1,45 @@
> > +ILP32 AARCH64 SYSCALL ABI
> > +=========================
> > +
> > +This document describes the ILP32 syscall ABI and where it differs
> > +from the generic compat linux syscall interface.
> 
> I was hoping to learn what ILP32 is / what is it good for, but no,
> this does not tell me... it would be good to do a short explanation
> here, and maybe reference it from cover letter of the series...
> 								Pavel

ILP32 is ABI acronym that means "Integers, Longs and Pointers are 32-bit".
And LP64 means "Longs and Pointers are 64-bit".

There's AN490 - "ILP32 for AArch64 Whitepaper" from ARM which covers
the topic:
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dai0490a/ar01s01.html

And some talks:
http://connect.linaro.org/resource/bkk16/bkk16-305b/

Briefly, ILP32 is 32-bit ABI that works with AARCH64 instruction set. It looks
better in some performance tests, and is useful for compatibility with 32-bit
legacy code.

If you're more familiar with x86 terminology, in ARM world LP64 corresponds
to x86_64, AARCH32_EL0 corresponds to x86_32, and ILP32 corresponds to x32
ABI.

I'll add link to AN490 in next submission.

Yury

> -- 
> (english) http://www.livejournal.com/~pavelmachek
> (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html



--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 4/6] cpuset: Make generate_sched_domains() recognize isolated_cpus
From: Juri Lelli @ 2018-05-24 10:39 UTC (permalink / raw)
  To: Patrick Bellasi
  Cc: Waiman Long, Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, cgroups, linux-kernel, linux-doc, kernel-team, pjt,
	luto, Mike Galbraith, torvalds, Roman Gushchin
In-Reply-To: <20180524090430.GZ30654@e110439-lin>

On 24/05/18 10:04, Patrick Bellasi wrote:

[...]

> From 84bb8137ce79f74849d97e30871cf67d06d8d682 Mon Sep 17 00:00:00 2001
> From: Patrick Bellasi <patrick.bellasi@arm.com>
> Date: Wed, 23 May 2018 16:33:06 +0100
> Subject: [PATCH 1/1] cgroup/cpuset: disable sched domain rebuild when not
>  required
> 
> The generate_sched_domains() already addresses the "special case for 99%
> of systems" which require a single full sched domain at the root,
> spanning all the CPUs. However, the current support is based on an
> expensive sequence of operations which destroy and recreate the exact
> same scheduling domain configuration.
> 
> If we notice that:
> 
>  1) CPUs in "cpuset.isolcpus" are excluded from load balancing by the
>     isolcpus= kernel boot option, and will never be load balanced
>     regardless of the value of "cpuset.sched_load_balance" in any
>     cpuset.
> 
>  2) the root cpuset has load_balance enabled by default at boot and
>     it's the only parameter which userspace can change at run-time.
> 
> we know that, by default, every system comes up with a complete and
> properly configured set of scheduling domains covering all the CPUs.
> 
> Thus, on every system, unless the user explicitly disables load balance
> for the top_cpuset, the scheduling domains already configured at boot
> time by the scheduler/topology code and updated in consequence of
> hotplug events, are already properly configured for cpuset too.
> 
> This configuration is the default one for 99% of the systems,
> and it's also the one used by most of the Android devices which never
> disable load balance from the top_cpuset.
> 
> Thus, while load balance is enabled for the top_cpuset,
> destroying/rebuilding the scheduling domains at every cpuset.cpus
> reconfiguration is a useless operation which will always produce the
> same result.
> 
> Let's anticipate the "special" optimization within:
> 
>    rebuild_sched_domains_locked()
> 
> thus completely skipping the expensive:
> 
>    generate_sched_domains()
>    partition_sched_domains()
> 
> for all the cases we know that the scheduling domains already defined
> will not be affected by whatsoever value of cpuset.cpus.

[...]

> +	/* Special case for the 99% of systems with one, full, sched domain */
> +	if (!top_cpuset.isolation_count &&
> +	    is_sched_load_balance(&top_cpuset))
> +		goto out;
> +

Mmm, looks like we still need to destroy e recreate if there is a
new_topology (see arch_update_cpu_topology() in partition_sched_
domains).

Maybe we could move the check you are proposing in update_cpumasks_
hier() ?
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 4/6] cpuset: Make generate_sched_domains() recognize isolated_cpus
From: Juri Lelli @ 2018-05-24 10:28 UTC (permalink / raw)
  To: Waiman Long
  Cc: Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	cgroups, linux-kernel, linux-doc, kernel-team, pjt, luto,
	Mike Galbraith, torvalds, Roman Gushchin
In-Reply-To: <1526590545-3350-5-git-send-email-longman@redhat.com>

On 17/05/18 16:55, Waiman Long wrote:

[...]

> @@ -849,7 +860,12 @@ static void rebuild_sched_domains_locked(void)
>  	 * passing doms with offlined cpu to partition_sched_domains().
>  	 * Anyways, hotplug work item will rebuild sched domains.
>  	 */
> -	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
> +	if (!top_cpuset.isolation_count &&
> +	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
> +		goto out;
> +
> +	if (top_cpuset.isolation_count &&
> +	   !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
>  		goto out;

Do we cover the case in which hotplug removed one of the isolated cpus
from cpu_active_mask?
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v8 4/6] cpuset: Make generate_sched_domains() recognize isolated_cpus
From: Patrick Bellasi @ 2018-05-24  9:04 UTC (permalink / raw)
  To: Waiman Long
  Cc: Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	cgroups, linux-kernel, linux-doc, kernel-team, pjt, luto,
	Mike Galbraith, torvalds, Roman Gushchin, Juri Lelli
In-Reply-To: <bf4cb72b-9ff3-eb8b-ca2c-f6c4fee5c123@redhat.com>

On 23-May 16:18, Waiman Long wrote:
> On 05/23/2018 01:34 PM, Patrick Bellasi wrote:
> > Hi Waiman,
> >
> > On 17-May 16:55, Waiman Long wrote:
> >
> > [...]
> >
> >> @@ -672,13 +672,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
> >>  	int ndoms = 0;		/* number of sched domains in result */
> >>  	int nslot;		/* next empty doms[] struct cpumask slot */
> >>  	struct cgroup_subsys_state *pos_css;
> >> +	bool root_load_balance = is_sched_load_balance(&top_cpuset);
> >>  
> >>  	doms = NULL;
> >>  	dattr = NULL;
> >>  	csa = NULL;
> >>  
> >>  	/* Special case for the 99% of systems with one, full, sched domain */
> >> -	if (is_sched_load_balance(&top_cpuset)) {
> >> +	if (root_load_balance && !top_cpuset.isolation_count) {
> > Perhaps I'm missing something but, it seems to me that, when the two
> > conditions above are true, then we are going to destroy and rebuild
> > the exact same scheduling domains.
> >
> > IOW, on 99% of systems where:
> >
> >    is_sched_load_balance(&top_cpuset)
> >    top_cpuset.isolation_count = 0
> >
> > since boot time and forever, then every time we update a value for
> > cpuset.cpus we keep rebuilding the same SDs.
> >
> > It's not strictly related to this patch, the same already happens in
> > mainline based just on the first condition, but since you are extending
> > that optimization, perhaps you can tell me where I'm possibly wrong or
> > which cases I'm not considering.
> >
> > I'm interested mainly because on Android systems those conditions
> > are always true and we see SDs rebuilds every time we write
> > something in cpuset.cpus, which ultimately accounts for almost all the
> > 6-7[ms] time required for the write to return, depending on the CPU
> > frequency.
> >
> > Cheers Patrick
> >
> Yes, that is true. I will look into how to further optimize this. Thanks
> for the suggestion.

FWIW, following is my take on top of your series.

With the following patch applied I see a reduction of the average
execution time for a rebuild_sched_domains_locked() from 1.4[ms] to
40[us] while running 60 /tg1/cpuset.cpus switches in a loop on an
JunoR2 Arm board using the performance cpufreq governor.

---8<---
From 84bb8137ce79f74849d97e30871cf67d06d8d682 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Wed, 23 May 2018 16:33:06 +0100
Subject: [PATCH 1/1] cgroup/cpuset: disable sched domain rebuild when not
 required

The generate_sched_domains() already addresses the "special case for 99%
of systems" which require a single full sched domain at the root,
spanning all the CPUs. However, the current support is based on an
expensive sequence of operations which destroy and recreate the exact
same scheduling domain configuration.

If we notice that:

 1) CPUs in "cpuset.isolcpus" are excluded from load balancing by the
    isolcpus= kernel boot option, and will never be load balanced
    regardless of the value of "cpuset.sched_load_balance" in any
    cpuset.

 2) the root cpuset has load_balance enabled by default at boot and
    it's the only parameter which userspace can change at run-time.

we know that, by default, every system comes up with a complete and
properly configured set of scheduling domains covering all the CPUs.

Thus, on every system, unless the user explicitly disables load balance
for the top_cpuset, the scheduling domains already configured at boot
time by the scheduler/topology code and updated in consequence of
hotplug events, are already properly configured for cpuset too.

This configuration is the default one for 99% of the systems,
and it's also the one used by most of the Android devices which never
disable load balance from the top_cpuset.

Thus, while load balance is enabled for the top_cpuset,
destroying/rebuilding the scheduling domains at every cpuset.cpus
reconfiguration is a useless operation which will always produce the
same result.

Let's anticipate the "special" optimization within:

   rebuild_sched_domains_locked()

thus completely skipping the expensive:

   generate_sched_domains()
   partition_sched_domains()

for all the cases we know that the scheduling domains already defined
will not be affected by whatsoever value of cpuset.cpus.

The proposed solution is the minimal variation to optimize the case for
systems with load balance enabled at the root level and without isolated
CPUs. As soon as one of these conditions is not more valid, we fall back
to the original behavior.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Tejun Heo <tj@kernel.org>,
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: kernel-team@fb.com
Cc: cgroups@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 kernel/cgroup/cpuset.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 8f586e8bdc98..cff14be94678 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -874,6 +874,11 @@ static void rebuild_sched_domains_locked(void)
 	   !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
 		goto out;
 
+	/* Special case for the 99% of systems with one, full, sched domain */
+	if (!top_cpuset.isolation_count &&
+	    is_sched_load_balance(&top_cpuset))
+		goto out;
+
 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);
 
-- 
2.15.1
---8<---


-- 
#include <best/regards.h>

Patrick Bellasi
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH v2 0/7] mm: pages for hugetlb's overcommit may be able to charge to memcg
From: Michal Hocko @ 2018-05-24  8:27 UTC (permalink / raw)
  To: TSUKADA Koutaro
  Cc: Johannes Weiner, Vladimir Davydov, Jonathan Corbet,
	Luis R. Rodriguez, Kees Cook, Andrew Morton, Roman Gushchin,
	David Rientjes, Mike Kravetz, Aneesh Kumar K.V, Naoya Horiguchi,
	Anshuman Khandual, Marc-Andre Lureau, Punit Agrawal, Dan Williams,
	Vlastimil Babka, linux-doc, linux-kernel, linux-fsdevel, linux-mm,
	cgroups
In-Reply-To: <af1a3050-7365-428a-dfb1-2f3da37dc9ff@ascade.co.jp>

On Thu 24-05-18 13:26:12, TSUKADA Koutaro wrote:
[...]
> I do not know if it is really a strong use case, but I will explain my
> motive in detail. English is not my native language, so please pardon
> my poor English.
> 
> I am one of the developers for software that managing the resource used
> from user job at HPC-Cluster with Linux. The resource is memory mainly.
> The HPC-Cluster may be shared by multiple people and used. Therefore, the
> memory used by each user must be strictly controlled, otherwise the
> user's job will runaway, not only will it hamper the other users, it will
> crash the entire system in OOM.
> 
> Some users of HPC are very nervous about performance. Jobs are executed
> while synchronizing with MPI communication using multiple compute nodes.
> Since CPU wait time will occur when synchronizing, they want to minimize
> the variation in execution time at each node to reduce waiting times as
> much as possible. We call this variation a noise.
> 
> THP does not guarantee to use the Huge Page, but may use the normal page.
> This mechanism is one cause of variation(noise).
> 
> The users who know this mechanism will be hesitant to use THP. However,
> the users also know the benefits of the Huge Page's TLB hit rate
> performance, and the Huge Page seems to be attractive. It seems natural
> that these users are interested in HugeTLBfs, I do not know at all
> whether it is the right approach or not.

Sure, asking for guarantee makes hugetlb pages attractive. But nothing
is really for free, especially any resource _guarantee_, and you have to
pay an additional configuration price usually.
 
> At the very least, our HPC system is pursuing high versatility and we
> have to consider whether we can provide it if users want to use HugeTLBfs.
> 
> In order to use HugeTLBfs we need to create a persistent pool, but in
> our use case sharing nodes, it would be impossible to create, delete or
> resize the pool.

Why? I can see this would be quite a PITA but not really impossible.

> One of the answers I have reached is to use HugeTLBfs by overcommitting
> without creating a pool(this is the surplus hugepage).
> 
> Surplus hugepages is hugetlb page, but I think at least that consuming
> buddy pool is a decisive difference from hugetlb page of persistent pool.
> If nr_overcommit_hugepages is assumed to be infinite, allocating pages for
> surplus hugepages from buddy pool is all unlimited even if being limited
> by memcg.

Not really, you can specify how much you can overcommit hugetlb pages.

> In extreme cases, overcommitment will allow users to exhaust
> the entire memory of the system. Of course, this can be prevented by the
> hugetlb cgroup, but even if we set the limit for memcg and hugetlb cgroup
> respectively, as I asked in the first mail(set limit to 10GB), the
> control will not work.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 0/7] mm: pages for hugetlb's overcommit may be able to charge to memcg
From: Michal Hocko @ 2018-05-24  8:20 UTC (permalink / raw)
  To: TSUKADA Koutaro
  Cc: Mike Kravetz, Johannes Weiner, Vladimir Davydov, Jonathan Corbet,
	Luis R. Rodriguez, Kees Cook, Andrew Morton, Roman Gushchin,
	David Rientjes, Aneesh Kumar K.V, Naoya Horiguchi,
	Anshuman Khandual, Marc-Andre Lureau, Punit Agrawal, Dan Williams,
	Vlastimil Babka, linux-doc, linux-kernel, linux-fsdevel, linux-mm,
	cgroups
In-Reply-To: <455b1a07-d7e3-102b-65e7-3892947b7675@ascade.co.jp>

On Thu 24-05-18 13:39:59, TSUKADA Koutaro wrote:
> On 2018/05/23 3:54, Michal Hocko wrote:
[...]
> > I am also quite confused why you keep distinguishing surplus hugetlb
> > pages from regular preallocated ones. Being a surplus page is an
> > implementation detail that we use for an internal accounting rather than
> > something to exhibit to the userspace even more than we do currently.
> 
> I apologize for having confused.
> 
> The hugetlb pages obtained from the pool do not waste the buddy pool.

Because they have already allocated from the buddy allocator so the end
result is very same.

> On
> the other hand, surplus hugetlb pages waste the buddy pool. Due to this
> difference in property, I thought it could be distinguished.

But this is simply not correct. Surplus pages are fluid. If you increase
the hugetlb size they will become regular persistent hugetlb pages.
 
> Although my memcg knowledge is extremely limited, memcg is accounting for
> various kinds of pages obtained from the buddy pool by the task belonging
> to it. I would like to argue that surplus hugepage has specificity in
> terms of obtaining from the buddy pool, and that it is specially permitted
> charge requirements for memcg.

Not really. Memcg accounts primarily for reclaimable memory. We do
account for some non-reclaimable slabs but the life time should be at
least bound to a process life time. Otherwise the memcg oom killer
behavior is not guaranteed to unclutter the situation. Hugetlb pages are
simply persistent. Well, to be completely honest tmpfs pages have a
similar problem but lacking the swap space for them is kinda
configuration bug.

> It seems very strange that charge hugetlb page to memcg, but essentially
> it only charges the usage of the compound page obtained from the buddy pool,
> and even if that page is used as hugetlb page after that, memcg is not
> interested in that.

Ohh, it is very much interested. The primary goal of memcg is to enforce
the limit. How are you going to do that in an absence of the reclaimable
memory? And quite a lot of it because hugetlb pages usually consume a
lot of memory.

> I will completely apologize if my way of thinking is wrong. It would be
> greatly appreciated if you could mention why we can not charge surplus
> hugepages to memcg.
> 
> > Just look at what [sw]hould when you need to adjust accounting - e.g.
> > due to the pool resize. Are you going to uncharge those surplus pages
> > ffrom memcg to reflect their persistence?
> > 
> 
> I could not understand the intention of this question, sorry. When resize
> the pool, I think that the number of surplus hugepages in use does not
> change. Could you explain what you were concerned about?

It does change when ou change the hugetlb pool size, migrate pages
between per-numa pools (have a look at adjust_pool_surplus).
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH bpf-next v2 0/3] bpf: add boot parameters for sysctl knobs
From: Jesper Dangaard Brouer @ 2018-05-24  7:41 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eugene Syromiatnikov, netdev, linux-kernel, linux-doc, Kees Cook,
	Kai-Heng Feng, Daniel Borkmann, Alexei Starovoitov,
	Jonathan Corbet, Jiri Olsa, brouer
In-Reply-To: <20180523220244.a4u25kapqbjnmpr4@ast-mbp>

On Wed, 23 May 2018 15:02:45 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Wed, May 23, 2018 at 02:18:19PM +0200, Eugene Syromiatnikov wrote:
> > Some BPF sysctl knobs affect the loading of BPF programs, and during
> > system boot/init stages these sysctls are not yet configured.
> > A concrete example is systemd, that has implemented loading of BPF
> > programs.
> > 
> > Thus, to allow controlling these setting at early boot, this patch set
> > adds the ability to change the default setting of these sysctl knobs
> > as well as option to override them via a boot-time kernel parameter
> > (in order to avoid rebuilding kernel each time a need of changing these
> > defaults arises).
> > 
> > The sysctl knobs in question are kernel.unprivileged_bpf_disable,
> > net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.  
> 
> - systemd is root. today it only uses cgroup-bpf progs which require root,
>   so disabling unpriv during boot time makes no difference to systemd.
>   what is the actual reason to present time?
> 
> - say in the future systemd wants to use so_reuseport+bpf for faster
>   networking. With unpriv disable during boot, it will force systemd
>   to do such networking from root, which will lower its security barrier.
>   How that make sense?
> 
> - bpf_jit_kallsyms sysctl has immediate effect on loaded programs.
>   Flipping it during the boot or right after or any time after
>   is the same thing. Why add such boot flag then?
> 
> - jit_harden can be turned on by systemd. so turning it during the boot
>   will make systemd progs to be constant blinded.
>   Constant blinding protects kernel from unprivileged JIT spraying.
>   Are you worried that systemd will attack the kernel with JIT spraying?


I think you are missing that, we want the ability to change these
defaults in-order to avoid depending on /etc/sysctl.conf settings, and
that the these sysctl.conf setting happen too late.

For example with jit_harden, there will be a difference between the
loaded BPF program that got loaded at boot-time with systemd (no
constant blinding) and when someone reloads that systemd service after
/etc/sysctl.conf have been evaluated and setting bpf_jit_harden (now
slower due to constant blinding).   This is inconsistent behavior.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 1/5] PCI/AER: Define and allocate aer_stats structure for AER capable devices
From: Greg Kroah-Hartman @ 2018-05-24  6:08 UTC (permalink / raw)
  To: Rajat Jain
  Cc: Bjorn Helgaas, Jonathan Corbet, Philippe Ombredanne, Kate Stewart,
	Thomas Gleixner, Frederick Lawler, Oza Pawandeep, Keith Busch,
	Gabriele Paoloni, Alexandru Gagniuc, Thomas Tai,
	Steven Rostedt (VMware), linux-pci, linux-doc, linux-kernel,
	Jes Sorensen, Kyle McMartin, rajatxjain
In-Reply-To: <20180523175808.28030-2-rajatja@google.com>

On Wed, May 23, 2018 at 10:58:04AM -0700, Rajat Jain wrote:
> ---
> v2: Fix the license header as per Greg's suggestions
>     (Since there is disagreement with using "//" vs "/* */" for license
>      I decided to keep the one preferred by Linus, also used by others
>      in this directory)

The rules are pretty simple for how to do this, and they are documented
in Documentation/process/license-rules.rst, please just follow that like
the rest of the kernel has done.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 0/7] mm: pages for hugetlb's overcommit may be able to charge to memcg
From: TSUKADA Koutaro @ 2018-05-24  4:39 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Mike Kravetz, Johannes Weiner, Vladimir Davydov, Jonathan Corbet,
	Luis R. Rodriguez, Kees Cook, Andrew Morton, Roman Gushchin,
	David Rientjes, Aneesh Kumar K.V, Naoya Horiguchi,
	Anshuman Khandual, Marc-Andre Lureau, Punit Agrawal, Dan Williams,
	Vlastimil Babka, linux-doc, linux-kernel, linux-fsdevel, linux-mm,
	cgroups
In-Reply-To: <20180522185407.GC20441@dhcp22.suse.cz>

On 2018/05/23 3:54, Michal Hocko wrote:
> On Tue 22-05-18 22:04:23, TSUKADA Koutaro wrote:
>> On 2018/05/22 3:07, Mike Kravetz wrote:
>>> On 05/17/2018 09:27 PM, TSUKADA Koutaro wrote:
>>>> Thanks to Mike Kravetz for comment on the previous version patch.
>>>>
>>>> The purpose of this patch-set is to make it possible to control whether or
>>>> not to charge surplus hugetlb pages obtained by overcommitting to memory
>>>> cgroup. In the future, I am trying to accomplish limiting the memory usage
>>>> of applications that use both normal pages and hugetlb pages by the memory
>>>> cgroup(not use the hugetlb cgroup).
>>>>
>>>> Applications that use shared libraries like libhugetlbfs.so use both normal
>>>> pages and hugetlb pages, but we do not know how much to use each. Please
>>>> suppose you want to manage the memory usage of such applications by cgroup
>>>> How do you set the memory cgroup and hugetlb cgroup limit when you want to
>>>> limit memory usage to 10GB?
>>>>
>>>> If you set a limit of 10GB for each, the user can use a total of 20GB of
>>>> memory and can not limit it well. Since it is difficult to estimate the
>>>> ratio used by user of normal pages and hugetlb pages, setting limits of 2GB
>>>> to memory cgroup and 8GB to hugetlb cgroup is not very good idea. In such a
>>>> case, I thought that by using my patch-set, we could manage resources just
>>>> by setting 10GB as the limit of memory cgoup(there is no limit to hugetlb
>>>> cgroup).
>>>>
>>>> In this patch-set, introduce the charge_surplus_huge_pages(boolean) to
>>>> struct hstate. If it is true, it charges to the memory cgroup to which the
>>>> task that obtained surplus hugepages belongs. If it is false, do nothing as
>>>> before, and the default value is false. The charge_surplus_huge_pages can
>>>> be controlled procfs or sysfs interfaces.
>>>>
>>>> Since THP is very effective in environments with kernel page size of 4KB,
>>>> such as x86, there is no reason to positively use HugeTLBfs, so I think
>>>> that there is no situation to enable charge_surplus_huge_pages. However, in
>>>> some distributions such as arm64, the page size of the kernel is 64KB, and
>>>> the size of THP is too huge as 512MB, making it difficult to use. HugeTLBfs
>>>> may support multiple huge page sizes, and in such a special environment
>>>> there is a desire to use HugeTLBfs.
>>>
>>> One of the basic questions/concerns I have is accounting for surplus huge
>>> pages in the default memory resource controller.  The existing huegtlb
>>> resource controller already takes hugetlbfs huge pages into account,
>>> including surplus pages.  This series would allow surplus pages to be
>>> accounted for in the default  memory controller, or the hugetlb controller
>>> or both.
>>>
>>> I understand that current mechanisms do not meet the needs of the above
>>> use case.  The question is whether this is an appropriate way to approach
>>> the issue.
> 
> I do share your view Mike!
> 
>>> My cgroup experience and knowledge is extremely limited, but
>>> it does not appear that any other resource can be controlled by multiple
>>> controllers.  Therefore, I am concerned that this may be going against
>>> basic cgroup design philosophy.
>>
>> Thank you for your feedback.
>> That makes sense, surplus hugepages are charged to both memcg and hugetlb
>> cgroup, which may be contrary to cgroup design philosophy.
>>
>> Based on the above advice, I have considered the following improvements,
>> what do you think about?
>>
>> The 'charge_surplus_hugepages' of v2 patch-set was an option to switch
>> "whether to charge memcg in addition to hugetlb cgroup", but it will be
>> abolished. Instead, change to "switch only to memcg instead of hugetlb
>> cgroup" option. This is called 'surplus_charge_to_memcg'.
> 
> This all looks so hackish and ad-hoc that I would be tempted to give it
> an outright nack, but let's here more about why do we need this fiddling
> at all. I've asked in other email so I guess I will get an answer there
> but let me just emphasize again that I absolutely detest a possibility
> to put hugetlb pages into the memcg mix. They just do not belong there.
> Try to look at previous discussions why it has been decided to have a
> separate hugetlb pages at all.
> 
> I am also quite confused why you keep distinguishing surplus hugetlb
> pages from regular preallocated ones. Being a surplus page is an
> implementation detail that we use for an internal accounting rather than
> something to exhibit to the userspace even more than we do currently.

I apologize for having confused.

The hugetlb pages obtained from the pool do not waste the buddy pool. On
the other hand, surplus hugetlb pages waste the buddy pool. Due to this
difference in property, I thought it could be distinguished.

Although my memcg knowledge is extremely limited, memcg is accounting for
various kinds of pages obtained from the buddy pool by the task belonging
to it. I would like to argue that surplus hugepage has specificity in
terms of obtaining from the buddy pool, and that it is specially permitted
charge requirements for memcg.

It seems very strange that charge hugetlb page to memcg, but essentially
it only charges the usage of the compound page obtained from the buddy pool,
and even if that page is used as hugetlb page after that, memcg is not
interested in that.

I will completely apologize if my way of thinking is wrong. It would be
greatly appreciated if you could mention why we can not charge surplus
hugepages to memcg.

> Just look at what [sw]hould when you need to adjust accounting - e.g.
> due to the pool resize. Are you going to uncharge those surplus pages
> ffrom memcg to reflect their persistence?
> 

I could not understand the intention of this question, sorry. When resize
the pool, I think that the number of surplus hugepages in use does not
change. Could you explain what you were concerned about?

-- 
Thanks,
Tsukada


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox