Linux Documentation
 help / color / mirror / Atom feed
* [PATCH v14 4/5] gpio: rpmsg: add generic rpmsg GPIO driver
From: Shenwei Wang @ 2026-06-25 15:54 UTC (permalink / raw)
  To: Linus Walleij, Bartosz Golaszewski, Jonathan Corbet, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Bjorn Andersson,
	Mathieu Poirier, Frank Li, Sascha Hauer
  Cc: Shuah Khan, linux-gpio, linux-doc, linux-kernel,
	Pengutronix Kernel Team, Fabio Estevam, Shenwei Wang, Peng Fan,
	devicetree, linux-remoteproc, imx, linux-arm-kernel, linux-imx,
	Arnaud POULIQUEN, b-padhi, Andrew Lunn, Bartosz Golaszewski
In-Reply-To: <20260625155432.815185-1-shenwei.wang@oss.nxp.com>

From: Shenwei Wang <shenwei.wang@nxp.com>

On an AMP platform, the system may include multiple processors:
	- MCUs running an RTOS
	- An MPU running Linux

These processors communicate via the RPMSG protocol.
The driver implements the standard GPIO interface, allowing
the Linux side to control GPIO controllers which reside in
the remote processor via RPMSG protocol.

Cc: Bartosz Golaszewski <brgl@bgdev.pl>
Cc: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
---
 drivers/gpio/Kconfig      |  17 ++
 drivers/gpio/Makefile     |   1 +
 drivers/gpio/gpio-rpmsg.c | 568 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 586 insertions(+)
 create mode 100644 drivers/gpio/gpio-rpmsg.c

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 020e51e30317..4ad299fe3c6f 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -1917,6 +1917,23 @@ config GPIO_SODAVILLE
 
 endmenu
 
+menu "RPMSG GPIO drivers"
+	depends on RPMSG
+
+config GPIO_RPMSG
+	tristate "Generic RPMSG GPIO support"
+	depends on OF && REMOTEPROC
+	select GPIOLIB_IRQCHIP
+	default REMOTEPROC
+	help
+	  Say yes here to support the generic GPIO functions over the RPMSG
+	  bus. Currently supported devices: i.MX7ULP, i.MX8ULP, i.MX8x, and
+	  i.MX9x.
+
+	  If unsure, say N.
+
+endmenu
+
 menu "SPI GPIO expanders"
 	depends on SPI_MASTER
 
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index b267598b517d..ee75c0e65b8b 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -157,6 +157,7 @@ obj-$(CONFIG_GPIO_RDC321X)		+= gpio-rdc321x.o
 obj-$(CONFIG_GPIO_REALTEK_OTTO)		+= gpio-realtek-otto.o
 obj-$(CONFIG_GPIO_REG)			+= gpio-reg.o
 obj-$(CONFIG_GPIO_ROCKCHIP)	+= gpio-rockchip.o
+obj-$(CONFIG_GPIO_RPMSG)		+= gpio-rpmsg.o
 obj-$(CONFIG_GPIO_RTD)			+= gpio-rtd.o
 obj-$(CONFIG_ARCH_SA1100)		+= gpio-sa1100.o
 obj-$(CONFIG_GPIO_SAMA5D2_PIOBU)	+= gpio-sama5d2-piobu.o
diff --git a/drivers/gpio/gpio-rpmsg.c b/drivers/gpio/gpio-rpmsg.c
new file mode 100644
index 000000000000..332e2925a830
--- /dev/null
+++ b/drivers/gpio/gpio-rpmsg.c
@@ -0,0 +1,568 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2026 NXP
+ *
+ * The driver exports a standard gpiochip interface to control
+ * the GPIO controllers via RPMSG on a remote processor.
+ */
+
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/gpio/driver.h>
+#include <linux/init.h>
+#include <linux/irqdomain.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/remoteproc.h>
+#include <linux/rpmsg.h>
+#include <linux/virtio_gpio.h>
+
+#define GPIOS_PER_PORT_DEFAULT		32
+#define RPMSG_TIMEOUT			1000
+
+/* Additional commands beyond virtio-gpio */
+#define VIRTIO_GPIO_MSG_SET_WAKEUP	0x0010
+
+/* GPIO Receive MSG Type */
+#define GPIO_RPMSG_REPLY	1
+#define GPIO_RPMSG_NOTIFY	2
+
+#define CHAN_NAME_PREFIX	"rpmsg-io-"
+#define GPIO_COMPAT_STR		"rpmsg-gpio"
+
+struct rpmsg_gpio_response {
+	__u8 type;
+	union {
+		/* command reply */
+		struct {
+			__u8 status;
+			__u8 value;
+		};
+
+		/* interrupt notification */
+		struct {
+			__u8 line;
+			__u8 trigger; /* rising/falling/high/low */
+		};
+	};
+};
+
+struct rpmsg_gpio_line {
+	u8 irq_shutdown;
+	u8 irq_unmask;
+	u8 irq_mask;
+	u32 irq_wake_enable;
+	u32 irq_type;
+};
+
+struct rpmsg_gpio_port {
+	struct gpio_chip gc;
+	struct rpmsg_device *rpdev;
+	struct virtio_gpio_request *send_msg;
+	struct rpmsg_gpio_response *recv_msg;
+	struct completion cmd_complete;
+	struct mutex lock;
+	u32 ngpios;
+	u32 idx;
+	struct rpmsg_gpio_line lines[GPIOS_PER_PORT_DEFAULT];
+};
+
+static int rpmsg_gpio_send_message(struct rpmsg_gpio_port *port)
+{
+	int ret;
+
+	reinit_completion(&port->cmd_complete);
+
+	ret = rpmsg_send(port->rpdev->ept, port->send_msg, sizeof(*port->send_msg));
+	if (ret) {
+		dev_err(&port->rpdev->dev, "rpmsg_send failed: cmd=%d ret=%d\n",
+			port->send_msg->type, ret);
+		return ret;
+	}
+
+	ret = wait_for_completion_timeout(&port->cmd_complete,
+					  msecs_to_jiffies(RPMSG_TIMEOUT));
+	if (ret == 0) {
+		dev_err(&port->rpdev->dev, "rpmsg_send timeout! cmd=%d\n",
+			port->send_msg->type);
+		return -ETIMEDOUT;
+	}
+
+	if (unlikely(port->recv_msg->status != VIRTIO_GPIO_STATUS_OK)) {
+		dev_err(&port->rpdev->dev, "remote core replies an error: cmd=%d!\n",
+			port->send_msg->type);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct virtio_gpio_request *
+rpmsg_gpio_msg_prepare(struct rpmsg_gpio_port *port, u16 line, u16 cmd, u32 val)
+{
+	struct virtio_gpio_request *msg = port->send_msg;
+
+	msg->type = cmd;
+	msg->gpio = line;
+	msg->value = val;
+
+	return msg;
+}
+
+static int rpmsg_gpio_get(struct gpio_chip *gc, unsigned int line)
+{
+	struct rpmsg_gpio_port *port = gpiochip_get_data(gc);
+	int ret;
+
+	guard(mutex)(&port->lock);
+
+	rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_GET_VALUE, 0);
+
+	ret = rpmsg_gpio_send_message(port);
+	return ret ? ret : port->recv_msg->value;
+}
+
+static int rpmsg_gpio_get_direction(struct gpio_chip *gc, unsigned int line)
+{
+	struct rpmsg_gpio_port *port = gpiochip_get_data(gc);
+	int ret;
+
+	guard(mutex)(&port->lock);
+
+	rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_GET_DIRECTION, 0);
+
+	ret = rpmsg_gpio_send_message(port);
+	if (ret)
+		return ret;
+
+	switch (port->recv_msg->value) {
+	case VIRTIO_GPIO_DIRECTION_IN:
+		return GPIO_LINE_DIRECTION_IN;
+	case VIRTIO_GPIO_DIRECTION_OUT:
+		return GPIO_LINE_DIRECTION_OUT;
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+static int rpmsg_gpio_direction_input(struct gpio_chip *gc, unsigned int line)
+{
+	struct rpmsg_gpio_port *port = gpiochip_get_data(gc);
+
+	guard(mutex)(&port->lock);
+
+	rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_SET_DIRECTION,
+			       VIRTIO_GPIO_DIRECTION_IN);
+
+	return rpmsg_gpio_send_message(port);
+}
+
+static int rpmsg_gpio_set(struct gpio_chip *gc, unsigned int line, int val)
+{
+	struct rpmsg_gpio_port *port = gpiochip_get_data(gc);
+
+	guard(mutex)(&port->lock);
+
+	rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_SET_VALUE, val);
+
+	return rpmsg_gpio_send_message(port);
+}
+
+static int rpmsg_gpio_direction_output(struct gpio_chip *gc, unsigned int line, int val)
+{
+	struct rpmsg_gpio_port *port = gpiochip_get_data(gc);
+	int ret;
+
+	guard(mutex)(&port->lock);
+
+	rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_SET_DIRECTION,
+			       VIRTIO_GPIO_DIRECTION_OUT);
+
+	ret = rpmsg_gpio_send_message(port);
+	if (ret)
+		return ret;
+
+	rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_SET_VALUE, val);
+
+	return rpmsg_gpio_send_message(port);
+}
+
+static int gpio_rpmsg_irq_set_type(struct irq_data *d, u32 type)
+{
+	struct rpmsg_gpio_port *port = irq_data_get_irq_chip_data(d);
+	u32 line = d->hwirq;
+
+	switch (type) {
+	case IRQ_TYPE_EDGE_RISING:
+		type = VIRTIO_GPIO_IRQ_TYPE_EDGE_RISING;
+		irq_set_handler_locked(d, handle_simple_irq);
+		break;
+	case IRQ_TYPE_EDGE_FALLING:
+		type = VIRTIO_GPIO_IRQ_TYPE_EDGE_FALLING;
+		irq_set_handler_locked(d, handle_simple_irq);
+		break;
+	case IRQ_TYPE_EDGE_BOTH:
+		type = VIRTIO_GPIO_IRQ_TYPE_EDGE_BOTH;
+		irq_set_handler_locked(d, handle_simple_irq);
+		break;
+	case IRQ_TYPE_LEVEL_LOW:
+		type = VIRTIO_GPIO_IRQ_TYPE_LEVEL_LOW;
+		irq_set_handler_locked(d, handle_level_irq);
+		break;
+	case IRQ_TYPE_LEVEL_HIGH:
+		type = VIRTIO_GPIO_IRQ_TYPE_LEVEL_HIGH;
+		irq_set_handler_locked(d, handle_level_irq);
+		break;
+	default:
+		dev_err(&port->rpdev->dev, "unsupported irq type: %u\n", type);
+		return -EINVAL;
+	}
+
+	port->lines[line].irq_type = type;
+
+	return 0;
+}
+
+static int gpio_rpmsg_irq_set_wake(struct irq_data *d, u32 enable)
+{
+	struct rpmsg_gpio_port *port = irq_data_get_irq_chip_data(d);
+	u32 line = d->hwirq;
+
+	port->lines[line].irq_wake_enable = enable;
+
+	return 0;
+}
+
+/*
+ * This unmask/mask function is invoked in two situations:
+ *   - when an interrupt is being set up, and
+ *   - after an interrupt has occurred.
+ *
+ * The GPIO driver does not access hardware registers directly.
+ * Instead, it caches all relevant information locally, and then sends
+ * the accumulated state to the remote system at this stage.
+ */
+static void gpio_rpmsg_unmask_irq(struct irq_data *d)
+{
+	struct rpmsg_gpio_port *port = irq_data_get_irq_chip_data(d);
+	u32 line = d->hwirq;
+
+	port->lines[line].irq_unmask = 1;
+}
+
+static void gpio_rpmsg_mask_irq(struct irq_data *d)
+{
+	struct rpmsg_gpio_port *port = irq_data_get_irq_chip_data(d);
+	u32 line = d->hwirq;
+
+	/*
+	 * When an interrupt occurs, the remote system masks the interrupt
+	 * and then sends a notification to Linux. After Linux processes
+	 * that notification, it sends an RPMsg command back to the remote
+	 * system to unmask the interrupt again.
+	 */
+	port->lines[line].irq_mask = 1;
+}
+
+static void gpio_rpmsg_irq_shutdown(struct irq_data *d)
+{
+	struct rpmsg_gpio_port *port = irq_data_get_irq_chip_data(d);
+	u32 line = d->hwirq;
+
+	port->lines[line].irq_shutdown = 1;
+}
+
+static void gpio_rpmsg_irq_bus_lock(struct irq_data *d)
+{
+	struct rpmsg_gpio_port *port = irq_data_get_irq_chip_data(d);
+
+	mutex_lock(&port->lock);
+}
+
+static void gpio_rpmsg_irq_bus_sync_unlock(struct irq_data *d)
+{
+	struct rpmsg_gpio_port *port = irq_data_get_irq_chip_data(d);
+	u32 line = d->hwirq;
+
+	rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_SET_WAKEUP,
+			       port->lines[line].irq_wake_enable);
+	rpmsg_gpio_send_message(port);
+
+	/*
+	 * For mask irq, do nothing here.
+	 * The remote system will mask interrupt after an interrupt occurs,
+	 * and then send a notification to Linux system. After Linux system
+	 * handles the notification, it sends an rpmsg back to the remote
+	 * system to unmask this interrupt again.
+	 */
+	if (port->lines[line].irq_mask && !port->lines[line].irq_unmask) {
+		port->lines[line].irq_mask = 0;
+		mutex_unlock(&port->lock);
+		return;
+	}
+
+	if (port->lines[line].irq_shutdown) {
+		rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_IRQ_TYPE,
+				       VIRTIO_GPIO_IRQ_TYPE_NONE);
+		port->lines[line].irq_shutdown = 0;
+	} else {
+		rpmsg_gpio_msg_prepare(port, line, VIRTIO_GPIO_MSG_IRQ_TYPE,
+				       port->lines[line].irq_type);
+
+		if (port->lines[line].irq_unmask)
+			port->lines[line].irq_unmask = 0;
+	}
+
+	rpmsg_gpio_send_message(port);
+	mutex_unlock(&port->lock);
+}
+
+static const struct irq_chip gpio_rpmsg_irq_chip = {
+	.irq_mask = gpio_rpmsg_mask_irq,
+	.irq_unmask = gpio_rpmsg_unmask_irq,
+	.irq_set_wake = gpio_rpmsg_irq_set_wake,
+	.irq_set_type = gpio_rpmsg_irq_set_type,
+	.irq_shutdown = gpio_rpmsg_irq_shutdown,
+	.irq_bus_lock = gpio_rpmsg_irq_bus_lock,
+	.irq_bus_sync_unlock = gpio_rpmsg_irq_bus_sync_unlock,
+	.flags = IRQCHIP_IMMUTABLE,
+};
+
+static int rpmsg_gpiochip_register(struct rpmsg_device *rpdev,
+				   struct device_node *np, const char *name)
+{
+	struct rpmsg_gpio_port *port;
+	struct gpio_irq_chip *girq;
+	struct gpio_chip *gc;
+	int ret;
+
+	port = devm_kzalloc(&rpdev->dev, sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return -ENOMEM;
+
+	ret = of_property_read_u32(np, "reg", &port->idx);
+	if (ret)
+		return ret;
+
+	ret = devm_mutex_init(&rpdev->dev, &port->lock);
+	if (ret)
+		return ret;
+
+	ret = of_property_read_u32(np, "ngpios", &port->ngpios);
+	if (ret || port->ngpios > GPIOS_PER_PORT_DEFAULT)
+		port->ngpios = GPIOS_PER_PORT_DEFAULT;
+
+	port->send_msg = devm_kzalloc(&rpdev->dev,
+				      sizeof(*port->send_msg),
+				      GFP_KERNEL);
+
+	port->recv_msg = devm_kzalloc(&rpdev->dev,
+				      sizeof(*port->recv_msg),
+				      GFP_KERNEL);
+	if (!port->send_msg || !port->recv_msg)
+		return -ENOMEM;
+
+	init_completion(&port->cmd_complete);
+	port->rpdev = rpdev;
+
+	gc = &port->gc;
+	gc->owner = THIS_MODULE;
+	gc->parent = &rpdev->dev;
+	gc->fwnode = of_fwnode_handle(np);
+	gc->ngpio = port->ngpios;
+	gc->base = -1;
+	gc->label = devm_kasprintf(&rpdev->dev, GFP_KERNEL, "%s-gpio%d",
+				   name, port->idx);
+
+	gc->direction_input = rpmsg_gpio_direction_input;
+	gc->direction_output = rpmsg_gpio_direction_output;
+	gc->get_direction = rpmsg_gpio_get_direction;
+	gc->get = rpmsg_gpio_get;
+	gc->set = rpmsg_gpio_set;
+
+	girq = &gc->irq;
+	gpio_irq_chip_set_chip(girq, &gpio_rpmsg_irq_chip);
+	girq->parent_handler = NULL;
+	girq->num_parents = 0;
+	girq->parents = NULL;
+	girq->chip->name = devm_kstrdup(&rpdev->dev, gc->label, GFP_KERNEL);
+
+	dev_set_drvdata(&rpdev->dev, port);
+
+	return devm_gpiochip_add_data(&rpdev->dev, gc, port);
+}
+
+static const char *rpmsg_get_rproc_node_name(struct rpmsg_device *rpdev)
+{
+	const char *name = NULL;
+	struct device_node *np;
+	struct rproc *rproc;
+
+	rproc = rproc_get_by_child(&rpdev->dev);
+	if (!rproc)
+		return NULL;
+
+	np = of_node_get(rproc->dev.of_node);
+	if (!np && rproc->dev.parent)
+		np = of_node_get(rproc->dev.parent->of_node);
+
+	if (np) {
+		name = devm_kstrdup(&rpdev->dev, np->name, GFP_KERNEL);
+		of_node_put(np);
+	}
+
+	return name;
+}
+
+static struct device_node *
+rpmsg_find_child_by_compat_reg(struct device_node *parent, const char *compat, u32 idx)
+{
+	struct device_node *child;
+	u32 reg;
+
+	for_each_available_child_of_node(parent, child) {
+		if (!of_device_is_compatible(child, compat))
+			continue;
+
+		if (of_property_read_u32(child, "reg", &reg))
+			continue;
+
+		if (reg == idx)
+			return child;
+	}
+
+	return NULL;
+}
+
+static struct device_node *
+rpmsg_get_channel_ofnode(struct rpmsg_device *rpdev, const char *compat, u32 idx)
+{
+	struct device_node *np_chan = NULL, *np;
+	struct rproc *rproc;
+
+	rproc = rproc_get_by_child(&rpdev->dev);
+	if (!rproc)
+		return NULL;
+
+	np = of_node_get(rproc->dev.of_node);
+	if (!np && rproc->dev.parent)
+		np = of_node_get(rproc->dev.parent->of_node);
+
+	if (np)
+		np_chan = rpmsg_find_child_by_compat_reg(np, compat, idx);
+
+	return np_chan;
+}
+
+static int rpmsg_get_gpio_index(const char *name, const char *prefix)
+{
+	const char *p;
+	int base = 10;
+	int val;
+
+	if (!name)
+		return -EINVAL;
+
+	/* Ensure correct prefix */
+	if (!str_has_prefix(name, prefix))
+		return -EINVAL;
+
+	/* Find last '-' */
+	p = strrchr(name, '-');
+
+	if (!p || *(p + 1) == '\0')
+		return -EINVAL;
+
+	if (p[1] == '0' && (p[2] == 'x' || p[2] == 'X'))
+		base = 16;
+
+	if (kstrtoint(p + 1, base, &val))
+		return -EINVAL;
+
+	return val;
+}
+
+static int rpmsg_gpio_channel_callback(struct rpmsg_device *rpdev, void *data,
+				       int len, void *priv, u32 src)
+{
+	struct rpmsg_gpio_response *msg = data;
+	struct rpmsg_gpio_port *port = NULL;
+
+	port = dev_get_drvdata(&rpdev->dev);
+
+	if (!port) {
+		dev_err(&rpdev->dev, "port is null\n");
+		return -EINVAL;
+	}
+
+	if (msg->type == GPIO_RPMSG_REPLY) {
+		*port->recv_msg = *msg;
+		complete(&port->cmd_complete);
+	} else if (msg->type == GPIO_RPMSG_NOTIFY) {
+		generic_handle_domain_irq_safe(port->gc.irq.domain, msg->line);
+	} else {
+		dev_err(&rpdev->dev, "wrong message type (0x%x)\n", msg->type);
+	}
+
+	return 0;
+}
+
+static int rpmsg_gpio_channel_probe(struct rpmsg_device *rpdev)
+{
+	struct device *dev = &rpdev->dev;
+	struct device_node *np;
+	const char *rproc_name;
+	int idx;
+
+	idx = rpmsg_get_gpio_index(rpdev->id.name, CHAN_NAME_PREFIX);
+	if (idx < 0)
+		return -EINVAL;
+
+	if (!dev->of_node) {
+		np = rpmsg_get_channel_ofnode(rpdev, GPIO_COMPAT_STR, idx);
+		if (!np)
+			return -ENODEV;
+
+		dev->of_node = np;
+		set_primary_fwnode(dev, of_fwnode_handle(np));
+		return -EPROBE_DEFER;
+	}
+
+	rproc_name = rpmsg_get_rproc_node_name(rpdev);
+
+	return rpmsg_gpiochip_register(rpdev, dev->of_node, rproc_name);
+}
+
+static const struct of_device_id rpmsg_gpio_dt_ids[] = {
+	{ .compatible = GPIO_COMPAT_STR },
+	{ /* sentinel */ }
+};
+
+static struct rpmsg_device_id rpmsg_gpio_channel_id_table[] = {
+	{ .name = CHAN_NAME_PREFIX },
+	{ },
+};
+MODULE_DEVICE_TABLE(rpmsg, rpmsg_gpio_channel_id_table);
+
+static struct rpmsg_driver rpmsg_gpio_channel_client = {
+	.callback	= rpmsg_gpio_channel_callback,
+	.id_table	= rpmsg_gpio_channel_id_table,
+	.probe		= rpmsg_gpio_channel_probe,
+	.drv		= {
+		.name	= KBUILD_MODNAME,
+		.of_match_table = rpmsg_gpio_dt_ids,
+	},
+};
+module_rpmsg_driver(rpmsg_gpio_channel_client);
+
+MODULE_AUTHOR("Shenwei Wang <shenwei.wang@nxp.com>");
+MODULE_DESCRIPTION("generic rpmsg gpio driver");
+MODULE_LICENSE("GPL");
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 3/5] rpmsg: core: match rpmsg device IDs by prefix
From: Shenwei Wang @ 2026-06-25 15:54 UTC (permalink / raw)
  To: Linus Walleij, Bartosz Golaszewski, Jonathan Corbet, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Bjorn Andersson,
	Mathieu Poirier, Frank Li, Sascha Hauer
  Cc: Shuah Khan, linux-gpio, linux-doc, linux-kernel,
	Pengutronix Kernel Team, Fabio Estevam, Shenwei Wang, Peng Fan,
	devicetree, linux-remoteproc, imx, linux-arm-kernel, linux-imx,
	Arnaud POULIQUEN, b-padhi, Andrew Lunn
In-Reply-To: <20260625155432.815185-1-shenwei.wang@oss.nxp.com>

From: Shenwei Wang <shenwei.wang@nxp.com>

The current rpmsg_id_match() implementation requires an exact
string match between the driver id_table entry and the rpmsg
device name using strncmp() with RPMSG_NAME_SIZE.

This makes it impossible for a driver to match a group of
rpmsg devices sharing a common prefix (e.g. dynamically
suffixed channel names).

Update the matching logic to compare only the length of the
id->name string, allowing id_table entries to act as prefixes.
This enables drivers to bind to devices whose names start with
the specified id->name.

The implementation is copied from a reply by Mathieu.

Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
---
 drivers/rpmsg/rpmsg_core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c
index e7f7831d37f8..f95bfc9965d4 100644
--- a/drivers/rpmsg/rpmsg_core.c
+++ b/drivers/rpmsg/rpmsg_core.c
@@ -414,7 +414,9 @@ ATTRIBUTE_GROUPS(rpmsg_dev);
 static inline int rpmsg_id_match(const struct rpmsg_device *rpdev,
 				  const struct rpmsg_device_id *id)
 {
-	return strncmp(id->name, rpdev->id.name, RPMSG_NAME_SIZE) == 0;
+	size_t len = strnlen(id->name, RPMSG_NAME_SIZE);
+
+	return strncmp(id->name, rpdev->id.name, len) == 0;
 }
 
 /* match rpmsg channel and rpmsg driver */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 2/5] dt-bindings: remoteproc: imx_rproc: Add "rpmsg" subnode support
From: Shenwei Wang @ 2026-06-25 15:54 UTC (permalink / raw)
  To: Linus Walleij, Bartosz Golaszewski, Jonathan Corbet, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Bjorn Andersson,
	Mathieu Poirier, Frank Li, Sascha Hauer
  Cc: Shuah Khan, linux-gpio, linux-doc, linux-kernel,
	Pengutronix Kernel Team, Fabio Estevam, Shenwei Wang, Peng Fan,
	devicetree, linux-remoteproc, imx, linux-arm-kernel, linux-imx,
	Arnaud POULIQUEN, b-padhi, Andrew Lunn
In-Reply-To: <20260625155432.815185-1-shenwei.wang@oss.nxp.com>

From: Shenwei Wang <shenwei.wang@nxp.com>

Remote processors may announce multiple GPIO controllers over an RPMSG
channel. These GPIO controllers may require corresponding device tree
nodes, especially when acting as providers, to supply phandles for their
consumers.

Define an RPMSG node to work as a container for a group of RPMSG channels
under the imx_rproc node. Each subnode within "rpmsg" represents an
individual RPMSG channel. The name of each subnode corresponds to the
channel name as defined by the remote processor.

All remote devices associated with a given channel are defined as child
nodes under the corresponding channel node.

Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
---
 .../devicetree/bindings/gpio/gpio-rpmsg.yaml  | 55 +++++++++++++++++++
 .../bindings/remoteproc/fsl,imx-rproc.yaml    | 53 ++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/gpio/gpio-rpmsg.yaml

diff --git a/Documentation/devicetree/bindings/gpio/gpio-rpmsg.yaml b/Documentation/devicetree/bindings/gpio/gpio-rpmsg.yaml
new file mode 100644
index 000000000000..6c78b6850321
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/gpio-rpmsg.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/gpio-rpmsg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Generic RPMSG GPIO Controller
+
+maintainers:
+  - Shenwei Wang <shenwei.wang@nxp.com>
+
+description:
+  On an AMP platform, some GPIO controllers are exposed by the remote processor
+  through the RPMSG bus. The RPMSG GPIO transport protocol defines the packet
+  structure and communication flow between Linux and the remote firmware. Those
+  controllers are managed via this transport protocol. For more details of the
+  protocol, check the document below.
+  Documentation/driver-api/gpio/gpio-rpmsg.rst
+
+properties:
+  compatible:
+    oneOf:
+      - items:
+          - enum:
+              - fsl,rpmsg-gpio
+          - const: rpmsg-gpio
+      - const: rpmsg-gpio
+
+  reg:
+    description:
+      The reg property represents the index of the GPIO controllers. Since
+      the driver manages controllers on a remote system, this index tells
+      the remote system which controller to operate.
+    maxItems: 1
+
+  "#gpio-cells":
+    const: 2
+
+  gpio-controller: true
+
+  interrupt-controller: true
+
+  "#interrupt-cells":
+    const: 2
+
+required:
+  - compatible
+  - reg
+  - "#gpio-cells"
+  - "#interrupt-cells"
+
+allOf:
+  - $ref: /schemas/gpio/gpio.yaml#
+
+unevaluatedProperties: false
diff --git a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml
index ce8ec0119469..aea33205a881 100644
--- a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml
+++ b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml
@@ -85,6 +85,34 @@ properties:
       This property is to specify the resource id of the remote processor in SoC
       which supports SCFW
 
+  rpmsg:
+    type: object
+    additionalProperties: false
+    description:
+      Represents the RPMSG bus between Linux and the remote system. Contains
+      a group of RPMSG channel devices running on the bus.
+
+    properties:
+      rpmsg-io:
+        type: object
+        additionalProperties: false
+        properties:
+          '#address-cells':
+            const: 1
+
+          '#size-cells':
+            const: 0
+
+        patternProperties:
+          "gpio@[0-9a-f]+$":
+            type: object
+            $ref: /schemas/gpio/gpio-rpmsg.yaml#
+            unevaluatedProperties: false
+
+        required:
+          - '#address-cells'
+          - '#size-cells'
+
 required:
   - compatible
 
@@ -147,5 +175,30 @@ examples:
                 &mu 3 1>;
       memory-region = <&vdev0buffer>, <&vdev0vring0>, <&vdev0vring1>, <&rsc_table>;
       syscon = <&src>;
+
+      rpmsg {
+        rpmsg-io {
+          #address-cells = <1>;
+          #size-cells = <0>;
+
+          gpio@0 {
+            compatible = "rpmsg-gpio";
+            reg = <0>;
+            gpio-controller;
+            #gpio-cells = <2>;
+            #interrupt-cells = <2>;
+            interrupt-controller;
+          };
+
+          gpio@1 {
+            compatible = "rpmsg-gpio";
+            reg = <1>;
+            gpio-controller;
+            #gpio-cells = <2>;
+            #interrupt-cells = <2>;
+            interrupt-controller;
+          };
+        };
+      };
     };
 ...
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 1/5] docs: driver-api: gpio: rpmsg gpio driver over rpmsg bus
From: Shenwei Wang @ 2026-06-25 15:54 UTC (permalink / raw)
  To: Linus Walleij, Bartosz Golaszewski, Jonathan Corbet, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Bjorn Andersson,
	Mathieu Poirier, Frank Li, Sascha Hauer
  Cc: Shuah Khan, linux-gpio, linux-doc, linux-kernel,
	Pengutronix Kernel Team, Fabio Estevam, Shenwei Wang, Peng Fan,
	devicetree, linux-remoteproc, imx, linux-arm-kernel, linux-imx,
	Arnaud POULIQUEN, b-padhi, Andrew Lunn
In-Reply-To: <20260625155432.815185-1-shenwei.wang@oss.nxp.com>

From: Shenwei Wang <shenwei.wang@nxp.com>

Describes the gpio rpmsg transport protocol over the rpmsg bus between
the remote system and Linux.

Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
---
 Documentation/driver-api/gpio/gpio-rpmsg.rst | 271 +++++++++++++++++++
 Documentation/driver-api/gpio/index.rst      |   1 +
 2 files changed, 272 insertions(+)
 create mode 100644 Documentation/driver-api/gpio/gpio-rpmsg.rst

diff --git a/Documentation/driver-api/gpio/gpio-rpmsg.rst b/Documentation/driver-api/gpio/gpio-rpmsg.rst
new file mode 100644
index 000000000000..7d351ff0adb0
--- /dev/null
+++ b/Documentation/driver-api/gpio/gpio-rpmsg.rst
@@ -0,0 +1,271 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+GPIO RPMSG (Remote Processor Messaging) Protocol
+================================================
+
+The GPIO RPMSG transport protocol is used for communication and interaction
+with GPIO controllers on remote processors via the RPMSG bus.
+
+Message Format
+--------------
+
+The RPMSG message consists of a 8-byte packet with the following layout:
+
+.. code-block:: none
+
+   +------+------+------+------+------+------+------+------+
+   | 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 |
+   |     cmd     |    line     |           value           |
+   +------+------+------+------+------+------+------+------+
+
+- **cmd**: Command code, used for GPIO_RPMSG_SEND messages.
+
+- **line**: The GPIO line (pin) index of the port.
+
+- **value**: See details in the command description below.
+
+
+GPIO Commands
+-------------
+
+Commands are specified in the **Cmd** field.
+
+The SEND message is always sent from Linux to the remote firmware. Each
+SEND corresponds to a single REPLY message. The GPIO driver should
+serialize messages and determine whether a REPLY message is required. If a
+REPLY message is expected but not received within the specified timeout
+period (currently 1 second in the Linux driver), the driver should return
+-ETIMEOUT.
+
+GET_DIRECTION (Cmd=2)
+~~~~~~~~~~~~~~~~~~~~~
+
+**Request:**
+
+.. code-block:: none
+
+   +------+------+------+------+------+------+------+------+
+   | 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 |
+   |      2      |    line     |             0             |
+   +------+------+------+------+------+------+------+------+
+
+**Reply:**
+
+.. code-block:: none
+
+   +------+--------+--------+
+   | 0x00 |  0x01  |  0x02  |
+   |   1  | status | value  |
+   +------+--------+--------+
+
+- **status**:
+
+  - 0: Ok
+  - 1: Error
+
+- **value**: Direction.
+
+  - 0: None
+  - 1: Output
+  - 2: Input
+
+
+SET_DIRECTION (Cmd=3)
+~~~~~~~~~~~~~~~~~~~~~
+
+**Request:**
+
+.. code-block:: none
+
+   +------+------+------+------+------+------+------+------+
+   | 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 |
+   |      3      |    line     |           value           |
+   +------+------+------+------+------+------+------+------+
+
+- **value**: Direction.
+
+  - 0: None
+  - 1: Output
+  - 2: Input
+
+**Reply:**
+
+.. code-block:: none
+
+   +------+--------+--------+
+   | 0x00 |  0x01  |  0x02  |
+   |   1  | status |    0   |
+   +------+--------+--------+
+
+- **status**:
+
+  - 0: Ok
+  - 1: Error
+
+
+GET_VALUE (Cmd=4)
+~~~~~~~~~~~~~~~~~
+
+**Request:**
+
+.. code-block:: none
+
+   +------+------+------+------+------+------+------+------+
+   | 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 |
+   |      4      |    line     |             0             |
+   +------+------+------+------+------+------+------+------+
+
+**Reply:**
+
+.. code-block:: none
+
+   +------+--------+--------+
+   | 0x00 |  0x01  |  0x02  |
+   |   1  | status | value  |
+   +------+--------+--------+
+
+- **status**:
+
+  - 0: Ok
+  - 1: Error
+
+- **value**: Level.
+
+  - 0: Low
+  - 1: High
+
+
+SET_VALUE (Cmd=5)
+~~~~~~~~~~~~~~~~~
+
+**Request:**
+
+.. code-block:: none
+
+   +------+------+------+------+------+------+------+------+
+   | 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 |
+   |      5      |    line     |           value           |
+   +------+------+------+------+------+------+------+------+
+
+- **value**: Output level.
+
+  - 0: Low
+  - 1: High
+
+**Reply:**
+
+.. code-block:: none
+
+   +------+--------+--------+
+   | 0x00 |  0x01  |  0x02  |
+   |   1  | status |    0   |
+   +------+--------+--------+
+
+- **status**:
+
+  - 0: Ok
+  - 1: Error
+
+
+SET_IRQ_TYPE (Cmd=6)
+~~~~~~~~~~~~~~~~~~~~
+
+**Request:**
+
+.. code-block:: none
+
+   +------+------+------+------+------+------+------+------+
+   | 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 |
+   |      6      |    line     |           value           |
+   +------+------+------+------+------+------+------+------+
+
+- **value**: IRQ types.
+
+  - 0: Interrupt disabled
+  - 1: Rising edge trigger
+  - 2: Falling edge trigger
+  - 3: Both edge trigger
+  - 4: High level trigger
+  - 8: Low level trigger
+
+**Reply:**
+
+.. code-block:: none
+
+   +------+--------+--------+
+   | 0x00 |  0x01  |  0x02  |
+   |   1  | status |    0   |
+   +------+--------+--------+
+
+- **status**:
+
+  - 0: Ok
+  - 1: Error
+
+SET_WAKEUP (Cmd=16)
+~~~~~~~~~~~~~~~~~~~
+
+**Request:**
+
+.. code-block:: none
+
+   +------+------+------+------+------+------+------+------+
+   | 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 |
+   |      1      |    line     |           value           |
+   +------+------+------+------+------+------+------+------+
+
+- **value**: Wakeup enable.
+
+  The remote system should always aim to stay in a power-efficient state by
+  shutting down or clock-gating the GPIO blocks that aren't in use. Since
+  the remoteproc driver is responsible for managing the power states of the
+  remote firmware, the GPIO driver does not require to know the firmware's
+  running states.
+
+  When the wakeup bit is set, the remote firmware should configure the line
+  as a wakeup source. The firmware should send the notification message to
+  Linux after it is woken from the GPIO line.
+
+  - 0: Disable wakeup from GPIO
+  - 1: Enable wakeup from GPIO
+
+**Reply:**
+
+.. code-block:: none
+
+   +------+--------+--------+
+   | 0x00 |  0x01  |  0x02  |
+   |   1  | status |    0   |
+   +------+--------+--------+
+
+- **status**:
+
+  - 0: Ok
+  - 1: Error
+
+Notification Message
+--------------------
+
+Notifications are sent by the remote core and they have
+**Type=2 (GPIO_RPMSG_NOTIFY)**:
+
+When a GPIO line asserts an interrupt on the remote processor, the firmware
+should immediately mask the corresponding interrupt source and send a
+notification message to the Linux. Upon completion of the interrupt
+handling on the Linux side, the driver should issue a
+command **SET_IRQ_TYPE** to the firmware to unmask the interrupt.
+
+A Notification message can arrive between a SEND and its REPLY message,
+and the driver is expected to handle this scenario.
+
+.. code-block:: none
+
+   +------+------+--------+
+   | 0x00 | 0x01 |  0x02  |
+   |   2  | line | trigger|
+   +------+------+--------+
+
+- **line**: The GPIO line (pin) index of the port.
+
+- **trigger**: Optional parameter to indicate the trigger event type.
+
diff --git a/Documentation/driver-api/gpio/index.rst b/Documentation/driver-api/gpio/index.rst
index bee58f709b9a..e5eb1f82f01f 100644
--- a/Documentation/driver-api/gpio/index.rst
+++ b/Documentation/driver-api/gpio/index.rst
@@ -16,6 +16,7 @@ Contents:
    drivers-on-gpio
    bt8xxgpio
    pca953x
+   gpio-rpmsg
 
 Core
 ====
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 0/5] Enable Remote GPIO over RPMSG on i.MX Platform
From: Shenwei Wang @ 2026-06-25 15:54 UTC (permalink / raw)
  To: Linus Walleij, Bartosz Golaszewski, Jonathan Corbet, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Bjorn Andersson,
	Mathieu Poirier, Frank Li, Sascha Hauer
  Cc: Shuah Khan, linux-gpio, linux-doc, linux-kernel,
	Pengutronix Kernel Team, Fabio Estevam, Shenwei Wang, Peng Fan,
	devicetree, linux-remoteproc, imx, linux-arm-kernel, linux-imx,
	Arnaud POULIQUEN, b-padhi, Andrew Lunn

From: Shenwei Wang <shenwei.wang@nxp.com>

Support the remote devices on the remote processor via the RPMSG bus on
i.MX platform.

Changes in v14:
 - Update gpio-rpmsg.rst per Mathieu’s feedback.
 - Align the rpmsg-gpio driver with the revised gpio-rpmsg.rst.
 - Modify rpmsg-core to enable prefix-based matching of RPMSG device IDs.

Changes in v13:
 - drop the support for legacy NXP firmware.
 - remove the fixed_up hooks from the rpmsg gpio driver.
 - code cleanup.

Changes in v12:
 - Fixed the "underline" warning reported by Randy.

Changes in v11:
 - Expand RPMSG for the first time per Shuah's review comment.

Changes in v10:
 - Update gpio-rpmsg.rst according to Daniel Baluta's review comments.
 - Add a kernel CONFIG for fixed up handlers and only enable it on
   i.MX products.
 - Fixed bugs reported by kernel test robot.

Changes in v9:
 - Reuse the gpio-virtio design for command and IRQ type definitions.
 - Remove msg_id, version, and vendor fields from the generic protocol.
 - Add fixed-up handlers to support legacy firmware.

Changes in v8:
 - Add "depends on REMOTEPROC" in Kconfig to fix the build error reported
   by the kernel test robot.
 - Move the .rst patch before the .yaml patch.
 - Handle the "ngpios" DT property based on Andrew's feedback.

Changes in v7:
 - Reworked the driver to use the rpmsg_driver framework instead of
   platform_driver, based on feedback from Bjorn and Arnaud.
 - Updated gpio-rpmsg.yaml and imx_rproc.yaml according to comments from
   Rob and Arnaud.
 - Further refinements to gpio-rpmsg.yaml per Arnaud's feedback.

Changes in v6:
 - make the driver more generic with the actions below:
     rename the driver file to gpio-rpmsg.c
     remove the imx related info in the function and variable names
     rename the imx_rpmsg.h to rpdev_info.h
     create a gpio-rpmsg.yaml and refer it in imx_rproc.yaml
 - update the gpio-rpmsg.rst according to the feedback from Andrew and
   move the source file to driver-api/gpio
 - fix the bug reported by Zhongqiu Han
 - remove the I2C related info

Changes in v5:
 - move the gpio-rpmsg.rst from admin-guide to staging directory after
   discussion with Randy Dunlap.
 - add include files with some code improvements per Bartosz's comments.

Changes in v4:
 - add a documentation to describe the transport protocol per Andrew's
   comments.
 - add a new handler to get the gpio direction.

Changes in v3:
 - fix various format issue and return value check per Peng 's review
   comments.
 - add the logic to also populate the subnodes which are not in the
   device map per Arnaud's request. (in imx_rproc.c)
 - update the yaml per Frank's review comments.

Changes in v2:
 - re-implemented the gpio driver per Linus Walleij's feedback by using
   GPIOLIB_IRQCHIP helper library.
 - fix various format issue per Mathieu/Peng 's review comments.
 - update the yaml doc per Rob's feedback

Shenwei Wang (5):
  docs: driver-api: gpio: rpmsg gpio driver over rpmsg bus
  dt-bindings: remoteproc: imx_rproc: Add "rpmsg" subnode support
  rpmsg: core: match rpmsg device IDs by prefix
  gpio: rpmsg: add generic rpmsg GPIO driver
  arm64: dts: imx8ulp: Add rpmsg node under imx_rproc

 .../devicetree/bindings/gpio/gpio-rpmsg.yaml  |  55 ++
 .../bindings/remoteproc/fsl,imx-rproc.yaml    |  53 ++
 Documentation/driver-api/gpio/gpio-rpmsg.rst  | 271 +++++++++
 Documentation/driver-api/gpio/index.rst       |   1 +
 arch/arm64/boot/dts/freescale/imx8ulp.dtsi    |  25 +
 drivers/gpio/Kconfig                          |  17 +
 drivers/gpio/Makefile                         |   1 +
 drivers/gpio/gpio-rpmsg.c                     | 568 ++++++++++++++++++
 drivers/rpmsg/rpmsg_core.c                    |   4 +-
 9 files changed, 994 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/devicetree/bindings/gpio/gpio-rpmsg.yaml
 create mode 100644 Documentation/driver-api/gpio/gpio-rpmsg.rst
 create mode 100644 drivers/gpio/gpio-rpmsg.c

--
2.43.0


^ permalink raw reply

* Re: [PATCH v4] coredump: Add /proc/<pid>/coredump_pre_exit for pre-exit before dumping
From: Xin Zhao @ 2026-06-25 15:45 UTC (permalink / raw)
  To: ljs
  Cc: akpm, alex.aring, allen.lkml, arnd, brauner, chuck.lever, corbet,
	david, ebiederm, j.granados, jack, jackzxcui1989, jlayton,
	juri.lelli, keescook, liam, linux-arch, linux-doc, linux-fsdevel,
	linux-kernel, linux-mm, mcgrof, mingo, mjguzik, peterz, pfalcato,
	vincent.guittot, viro
In-Reply-To: <aj0cUrwdXYKIicC-@lucifer>

On Thu, 25 Jun 2026 13:48:10 +0100 Lorenzo Stoakes <ljs@kernel.org> wrote:

> +cc missing maintainers, lists.
> 
> NAK.
> 
> This is un-upstreamable for numerous reasons.
> 
> The stuff you're doing in mm is broken, wrong and invasive and you've not
> even bothered to cc- mm people. I'm annoyed by this.
> 
> You're also doing incredibly silly mistakes at v4 of something that should have
> been an RFC.
> 
> You don't seem to understand the concept of patch _series_ (break it up into
> smaller patches!!!) and you haven't bothered cc'ing maintainers whose subsystems
> you're radically alterting.
> 
> I'm annoyed as you have a history where you were told not to add insane hacks
> before ([0], my reply at [1]).
> 
> [0]:https://lore.kernel.org/all/20260116042817.3790405-1-jackzxcui1989@163.com/
> [1]:https://lore.kernel.org/all/14110b70-19e7-474d-b0dd-ba80e8bed9b0@lucifer.local/
> 
> Was I wasting my time there? Am I wasting my time responding now?
> 
> And how hard is it to run a simple perl script?
> 
> Let me run it for you for _just_ the maintainers:

I probably shouldn't reply to this email to waste more of your time, but I
can't help but respond because your comments have been very beneficial to
me, and I enjoy the process.

The v4 version has changed too much compared to the v3 version. I should
have re-executed the "get maintainer" script, but I mistakenly copied the
previous email list and sent it out. I sincerely apologize for that.

There are quite a few issues now, and I haven't come up with a good
overall solution. I actually want to resolve the problems we encountered
in our project with minimal kernel modifications, but I can't think of a
good way to do it. It seems that the v4 version has turned out to be a
complete disaster of a patch, and I sincerely hope that my example won't
be used as a counterexample in the future. Thank you for that.

Suddenly, I have some thoughts about this issue, but I even question
whether I should have these ideas. Let me sit down and sort things out
properly. I hope the v5 version won't be a disaster.

Thanks
Xin Zhao


^ permalink raw reply

* Re: [PATCH net-next] Documentation: networking: Add a test plan for ethtool pause validation
From: Andrew Lunn @ 2026-06-25 15:46 UTC (permalink / raw)
  To: Maxime Chevallier
  Cc: Jakub Kicinski, davem, Eric Dumazet, Paolo Abeni, Simon Horman,
	Russell King, Heiner Kallweit, Jonathan Corbet, Shuah Khan,
	Oleksij Rempel, Vladimir Oltean, Florian Fainelli,
	thomas.petazzoni, netdev, linux-kernel, linux-doc
In-Reply-To: <7a88fee8-bbb3-480f-9c93-677b7270a940@bootlin.com>

On Thu, Jun 25, 2026 at 12:46:44PM +0200, Maxime Chevallier wrote:
> Hi Andrew,
> 
> On 5/29/26 14:59, Andrew Lunn wrote:
> 
> (This discussion was a while ago, but this bit of context should be enough)
> 
> > But we also need to consider that for some APIs, we have decided that
> > a configuration can be set now, which does not actually apply in our
> > current conditions, but it will be stored away for when conditions
> > change and it is applicable. The half duplex case could fit that. When
> > the link is currently half duplex, you can configure pause, but you
> > don't expect it to actually change the current behaviour. It only
> > kicks in when the link renegotiates to full duplex sometime in the
> > future. We have to also consider this the other way around. The link
> > is full duplex and pause is configured by the user. Something happens
> > with the LP and the link renegotiates to half duplex. The local end
> > should not throw away the configuration, it simply cannot apply it
> > given the current situation.
> 
> I'm writing the test description for HD with a better formatting, so the
> HD test wouldn't be about "are we using pause stuff while in HD" as it
> doesn't make sense, but rather "do we correctly store the pause settings
> aside for later".

O.K.

> I'm realising that we don't really have an API to report the *true* in-use pause
> settings. Taking HD as an example :
> 
> # ethtool -s eth2 duplex half
> 
> [588209.379363] mvpp2 f4000000.ethernet eth2: Link is Up - 100Mbps/Half - flow control off
> 
> # ethtool eth2
> 	[...]
> 	Supported pause frame use: Symmetric Receive-only
> 	Advertised pause frame use: Symmetric Receive-only
> 	Link partner advertised pause frame use: Symmetric Receive-only

Does it even make sense to advertise this when in HD? But i don't
think we need to consider this now. I consider HD low priority, i
doubt it is actually used very often. We should concentrate on FD
testing.

> # ethtool -a eth2
> Autonegotiate:	on
> RX:		off
> TX:		off
> RX negotiated: on
> TX negotiated: on
> 
> 
> Sure, pause and HD don't make sense, however what I find confusing to some
> extent is that the only place we have information about the *actual* pause
> settings is the "link is Up" log in dmesg.

Maybe we should extend ksetting get to return the resolved pause
parameters? But i'm not sure how much that actually gives us. Anything
using phylink will just ask phylink to fill in the ksettings
information, and it seems unlikely phylink gets it wrong. What we are
really trying to test is drivers which don't user phylink, those are
the ones which are generally broken, and they are not going to
implement anything new in ksettings. So i think the test has to look
at:

> 	Advertised pause frame use: Symmetric Receive-only
> 	Link partner advertised pause frame use: Symmetric Receive-only

and check these match what we expect.

    Andrew

^ permalink raw reply

* Re: [PATCH v8 18/46] KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
From: Sean Christopherson @ 2026-06-25 15:40 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Ackerley Tng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <6ed7d12a-c3a1-4572-8385-754e6d5b8b44@kernel.org>

On Thu, Jun 25, 2026, David Hildenbrand (Arm) wrote:
> On 6/25/26 02:35, Sean Christopherson wrote:
> > One thought I had, to avoid the IPIs that draining all per-CPU caches requires,
> > was to disallow putting guest_memfd pages in folio batches, e.g. by hacking
> > something into folio_may_be_lru_cached().  But due to taking a per-lru lock,
> > that would penalize the relatively hot path and definitely common operation of
> > faulting in guest memory.  On the other hand, memory conversion is already a
> > relatively slow operation and is relatively uncommon compared to page faults,
> > (and likely very uncommon for real world setups).  I.e. having to drain all
> > caches if conversion isn't safe penalizes a relatively slow, relatively uncommon
> > path.
> 
> Yeah, the lru_add_drain_all is rather messy.
> 
> We have similar code in
> 
> collect_longterm_unpinnable_folios(), where we first try a lru_add_drain(), to
> then escalate to a lru_add_drain_all().
> 
> Maybe we could factor that (suboptimal code) out to not have to reinvent the
> same thing multiple times?

As discussed in the guest_memfd call, we should do this straightaway, i.e. instead
of merging this series as-is, so that we don't export lru_add_drain_all() only to
drop the export a kernel or two later, and can instead export the helper to drain
any batches for a folio (or set of folios/pages).


^ permalink raw reply

* Re: [PATCH net-next] Documentation: networking: Add a test plan for ethtool pause validation
From: Maxime Chevallier @ 2026-06-25 15:29 UTC (permalink / raw)
  To: Andrew Lunn, Jakub Kicinski
  Cc: davem, Eric Dumazet, Paolo Abeni, Simon Horman, Russell King,
	Heiner Kallweit, Jonathan Corbet, Shuah Khan, Oleksij Rempel,
	Vladimir Oltean, Florian Fainelli, thomas.petazzoni, netdev,
	linux-kernel, linux-doc
In-Reply-To: <5cb8e2b4-8eb6-4446-9b90-1cd4c7964cd9@lunn.ch>

Hi Andrew,

On 5/27/26 04:47, Andrew Lunn wrote:
> On Tue, May 26, 2026 at 05:24:47PM -0700, Jakub Kicinski wrote:
>> On Fri, 22 May 2026 19:51:06 +0200 Maxime Chevallier (Netdev
>> Foundation) wrote:
>>>  Documentation/networking/pause_test_plan.rst | 556 +++++++++++++++++++
>>
>> It'd be great to hear from others but IMHO in the current form this is
>> not suitable for Documentation/networking/ We can commit the "knowledge"
>> part but enumerating the test cases seems odd for Documentation/.
> 
> Sorry, not looked too deeply at the actual content yet.
> 
> What i was thinking was a python file, which sphinx can ingest to
> produce documentation, and place holders were code would be added to
> implement the actual test during the next phase.
> 
> This is how i've done testing in the past. I would be the evil one who
> thought up the tests and described them in detail using sphinx markup
> in a python test template file. After some review they got passed off
> to a python developer for implementation. And when they got run and
> failed, sometimes the feature developer, the test developer and myself
> got together to figure who made the error.
> 
> I'm not sure we even need sphinx. What i find important is that the
> test is documented. What kAPI calls should be made with what
> parameters. What results we are expected and why? So that when a test
> fails, a developer has the information they need to fix their
> code. The Why? is important, and often missing from the kernel tests.

This isn't sphynx, but I've come-up with something like this for a
test definition :


@ksft_ethtool_needs_supported_anyof([Pause, Asym_Pause])
def test_ethtool_pause_advertising(cfg, peer) -> None:
    """Pause advertisement

    Validate that changing pause params through the ETHTOOL_MSG_PAUSE command
    translates to a change in the advertised pause params, and that these
    parameters are correct w.r.t the supported pause params and requested pause
    params.
    
    This exercises the .set_pauseparams() ethtool ops for MAC configuration,
    as well as the reconfiguration of the PHY's advertising and negociation.
    
    On non-phylink MACs, the MAC should call phy_set_sym_pause() to update the
    PHY's advertising, and restart a negotiation with phy_start_aneg() if
    need be. Failure to do so will result on the wrong advertising parameters.
    
    Pn phylink-enabled MACs, phylink deals with the PHY reconfiguration provided
    the MAC driver calls phylink_ethtool_set_pauseparam().
    
    Failing this test likely means that the PHY driver is not correctly advertising
    pause settings, either due to the MAC not triggering a PHY reconfiguration,
    a misconficonfiguration of the advertising registers by the PHY, or by
    mis-handling the phydev->advertising bitfield in the PHY driver directly.
    
    The validation is made by looking at the advertised modes locally, as well as
    what the peer's 'lp_advertising' values report.

    cfg -- local device's interface configuration
    peer -- peer device handle
    """

    # Initial conditions :
    # - Local interface is admin UP, and reports lowlayer link UP
    # - Remote interface is adming UP, and reports lowlayer link UP
    #
    # Test 1
    # - SKIP if supported doesn't contain "Pause"
    # - run 'ethtool -A ethX rx on tx on autoneg on'
    # - FAIL if the return isn't 0
    # - FAIL if ETHTOOL_A_LINKMODES_OURS's advertised values does not contain
    #   "Pause" or contains "Asym_Pause"
    # - FAIL if peer's lp_advertising doesn't contain "Pause" or contains
    #   "Asym_Pause"
    # - Succeed otherwise
    #
    # Test 2
    # - SKIP uif supported doesn't contain both "Pause" and "Asym_Pause"
    # - run 'ethtool -A ethX rx on tx on autoneg on'
    # - FAIL if the return isn't 0
    # - FAIL if ETHTOOL_A_LINKMODES_OURS's advertised values does not contain
    #   "Pause" or contains "Asym_Pause"
    # - FAIL if peer's lp_advertising doesn't contain "Pause" or contains
    #   "Asym_Pause"
    #
    # ...
   
The annotation defines the pre-requisites in terms of locally supported
linkmodes, we have a docstring containing information for developpers
to debug their drivers, what I'm unsure about is the commented-out part
below, so either one big function testing multiple adjacent scenarios
or indivitual functions.

We could also use annotations to enumerate the various combinations of
modes to test.

That's just an extract of the full test suite for Pause, but before
writing the whole thing down i figure it's better to iterate on a single
test's design.

What do you think ?

Maxime

^ permalink raw reply

* Re: [PATCH v8 24/46] KVM: guest_memfd: Make in-place conversion the default\
From: Sean Christopherson @ 2026-06-25 14:36 UTC (permalink / raw)
  To: Yan Zhao
  Cc: Ackerley Tng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <aj0Jf30PS2f7x1nt@yzhao56-desk.sh.intel.com>

On Thu, Jun 25, 2026, Yan Zhao wrote:
> On Thu, Jun 25, 2026 at 09:51:01AM +0800, Yan Zhao wrote:
> > On Wed, Jun 24, 2026 at 05:41:58PM -0700, Sean Christopherson wrote:
> > > On Wed, Jun 24, 2026, Ackerley Tng wrote:
> > > > Yan Zhao <yan.y.zhao@intel.com> writes:
> > > > > With gmem_in_place_conversion=true, userspace can create guest_memfd without the
> > > > > MMAP flag. In such cases, shared memory is allocated from different backends.
> > > > > This means this module parameter only enables per-gmem memory attribute and does
> > > > > not guarantee that gmem in-place conversion will actually occur.
> > > 
> > > KVM module params are pretty much always about what KVM supports, not what is
> > > guaranteed to happen.
> > > 
> > >   - enable_mmio_caching doesn't guarantee there will actually be MMIO SPTEs,
> > >     because maybe the guest never accesses emulated MMIO.
> > >   - enable_pmu doesn't guarantee VMs will get a PMU, because userspace may elect
> > >     not to advertise one.
> > >   - and so on and so forth...
> > > 
> > > Yes, there's a small mental jump to get from "KVM supports in-place conversion"
> > > to "I need to set memory attributes on the guest_memfd instance, not the VM",
> > > but I don't see that as a big hurdle, certainly not in the long term.  And once
> > > the VMM code is written, I really do think most people are going to care about
> > > whether or not KVM supports in-place conversion, not where PRIVATE is tracked.
> > Sorry, I just saw this mail after posting my reply in [1].
> > 
> > I'm ok with gmem_in_place_conversion=true just means KVM supports in-place
> > conversion, while we can still create VMs with shared memory not from gmem.
> Or what about "allow_gmem_in_place_conversion" ?

No, because turning on the param also disallows setting PRIVATE in the VM-scoped
KVM_SET_MEMORY_ATTRIBUTES ioctl.

> > Though it still feels a bit odd to require TDX huge pages to depend on
> > gmem_in_place_conversion=true when shared memory is not currently allocated
> > from gmem, 

I fully expect that to be a transient state, and in all likelihood not something
that is *ever* shipped in production.  Landing TDX hugepages without guest_memfd
hugepage support is all about avoiding unnecessary serialization of series and
features that aren't strictly dependent on each other.

> > it should become more natural over time once gmem supports in-place
> > conversions for huge page.

Yes, and I want to prioritize the steady state for end users, not the in-progress
state for developers.  Once all of this settles out, I fully expect the majority
of deployments to only support in-place conversion, at which point the end user
is only going to care whether or not in-place conversion is enabled in KVM, not
the subtle detail that it's still possible to do out-of-place conversions (and
that will always hold true, it's not like VMA-based memslots are being deprecated).

> > Besides my current usage, there may be other scenarios where gmem memory
> > attributes is preferred without allocating shared memory from gmem.
> > (e.g., PAGE.ADD from a temp extra shared source memory).
> > 
> > For such use cases, I'm concerns that the admins may find it confusing if they
> > enable gmem_in_place_conversion but still observe extra memory consumptions for
> > shared memory.

KVM can help with documentation, but beyond that, it's not KVM's problem to solve.
If a VMM *and* platform owner chooses to deploy a setup that utilizes out-of-place
conversions, then it's on the VMM and/or plaform owner to understand and communicate
the implications to the end user.

And I'm not remotely convinced that prepending allow_ to the param will help
end users diagnose "unexpected" memory consumption, in quotes because anyone that
is deploying a stack that utilizes out-of-place conversion absolutely needs to
understand and plan for the additional memory consumption.  I.e. if the memory
consumption is "unexpected" to the end user, they likely have far bigger problems.

^ permalink raw reply

* Re: [PATCH v6 03/12] PCI: liveupdate: Track incoming preserved PCI devices
From: Pratyush Yadav @ 2026-06-25 14:35 UTC (permalink / raw)
  To: David Matlack
  Cc: Pasha Tatashin, kexec, linux-doc, linux-kernel, linux-mm,
	linux-pci, Adithya Jayachandran, Alexander Graf, Alex Williamson,
	Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
	Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
	Lukas Wunner, Mike Rapoport, Parav Pandit, Pranjal Shrivastava,
	Pratyush Yadav, Saeed Mahameed, Samiullah Khawaja, Shuah Khan,
	Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <ajBgj_aSuzMZG47e@google.com>

Hi David,

On Mon, Jun 15 2026, David Matlack wrote:

> On 2026-06-14 01:38 PM, Pasha Tatashin wrote:
>> On Fri, 22 May 2026 20:24:01 +0000, David Matlack <dmatlack@google.com> wrote:
[...]
>> > +	}
>> > +
>> > +	pci_info(dev, "Device was preserved by previous kernel across Live Update\n");
>> > +	dev->liveupdate.incoming = dev_ser;
>> > +
>> > +	/*
>> > +	 * Hold the ref on the incoming FLB until pci_liveupdate_finish() so
>> > +	 * that dev->liveupdate.incoming does not get freed while it is in use.
>> > +	 */
>> 
>> How would that work? If finish is not called FLB stays around until the 
>> next reboot.
>
> True... I think if the PCI core trusts drivers to call
> pci_liveupdate_finish() then we don't need to hold onto the incoming
> reference here.

That was my point when I was arguing against refcounts on outgoing FLBs.
This is very easy to abuse, especially when we are talking about device
drivers. And this refcounting mechanism makes the FLB no longer
file-lifecycle-bound, since now it is entirely up to drivers to decide
the lifecycle of this data.

I have been thinking about this a bit more in the last couple days, and
I wonder if we are doing this right. Here's an idea I have been thinking
of.

We should make live update a first class citizen in PCI. Instead of
patching in liveupdate via the liveupdate.incoming field, and letting
drivers figure out when to use it, we should separate out probe and
retrieve paths entirely.

Probe and retrieve are fundamentally different operations. While they
may share some common initialization logic for the _software_ state, how
they interface with the hardware is completely different. I think mixing
the two will result in driver code being more spaghetti by having
liveupdate checks sprayed out all over.

This series doesn't add support for any drivers, but looking at some of
the code we have downstream, I see this problem. The liveupdate code is
all over the place in the driver and it is very hard to wrap one's head
around how the device is actually retrieved.

So I think PCI core should track preserved devices, and if the device is
preserved, it should skip the probe and wait for retrieve. Retrieve does
the full initialization of the device. This fits in with the LUO model
as well. You can make retrieve a callback of struct pci_driver and do
some wrappers to talk with LUO, so device drivers don't directly
interface with LUO at all.

We should do similar things on the shutdown path. Shutdown is a
fundamentally different operation from freeze, and so we should separate
them out as well.

This solves the lifetime problem as well. When PCI core is initializing,
it knows for sure that no retrievals are going to happen. That's because
none of the drivers have registered yet. So it can safely access the FLB
and initialize its state. After that, drivers can register themselves
and start accepting retrieve() calls. Once the last driver goes away,
the FLB is freed automatically.

I am sorry for suggesting a big refactor at v6, but the early versions
looked good to me at the time, and I only thought more deeply about this
when trying to figure out how we can make the lifetimes cleaner.

What do you think? Does this make sense?

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* [RFC PATCH v1.2 01/11] Docs/mm/damon/design: update for DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP
From: SeongJae Park @ 2026-06-25 14:23 UTC (permalink / raw)
  Cc: SeongJae Park, Liam R. Howlett, Andrew Morton, David Hildenbrand,
	Jonathan Corbet, Lorenzo Stoakes, Michal Hocko, Mike Rapoport,
	Shuah Khan, Suren Baghdasaryan, Vlastimil Babka, damon, linux-doc,
	linux-kernel, linux-mm
In-Reply-To: <20260625142357.103500-1-sj@kernel.org>

Commit 9138e27a3bc3 ("mm/damon: add node_eligible_mem_bp goal metric")
introduced DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP but forgot updating the
DAMON design document for that.  Update.

Signed-off-by: SeongJae Park <sj@kernel.org>
---
 Documentation/mm/damon/design.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 2da7ca0d3d17a..9dbace087a329 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -686,9 +686,11 @@ mechanism tries to make ``current_value`` of ``target_metric`` be same to
   (1/10,000).
 - ``inactive_mem_bp``: Inactive to active + inactive (LRU) memory size ratio in
   bp (1/10,000).
+- ``node_eligible_mem_bp``: Scheme target access pattern-eligible memory ratio
+  of a node in bp (1/10,000).
 
-``nid`` is optionally required for only ``node_mem_used_bp``,
-``node_mem_free_bp``, ``node_memcg_used_bp`` and ``node_memcg_free_bp`` to
+``nid`` is optionally required for ``node_mem_used_bp``, ``node_mem_free_bp``,
+``node_memcg_used_bp`,` ``node_memcg_free_bp`` and ``node_eligible_mem_bp`` to
 point the specific NUMA node.
 
 ``path`` is optionally required for only ``node_memcg_used_bp`` and
-- 
2.47.3

^ permalink raw reply related

* [RFC PATCH v1.2 00/11] mm/damon: update, optimize, and clean up doc, tests, and code
From: SeongJae Park @ 2026-06-25 14:23 UTC (permalink / raw)
  Cc: SeongJae Park, Liam R. Howlett, Andrew Morton, Brendan Higgins,
	David Gow, David Hildenbrand, Jonathan Corbet, Lorenzo Stoakes,
	Michal Hocko, Mike Rapoport, Shuah Khan, Shuah Khan,
	Suren Baghdasaryan, Vlastimil Babka, damon, kunit-dev, linux-doc,
	linux-kernel, linux-kselftest, linux-mm

Patches 1 and 2 update the design and ABI documents for recently added
DAMON features.  Patches 3-7 add or update more unit and self tests for
DAMON to cover recently changed or added functions and sysfs files.
Patch 8 optimizes damon_commit_target_regions() to skip unnecessary
adjacent ranges setup.  Patches 9-11 clean and fix up recently added
DAMON sysfs interface code for readability.

Changes from RFC v1.1
- RFC v1.1: https://lore.kernel.org/20260625050756.91115-1-sj@kernel.org
- Document nid requirement for node_eligible_mem_bp.
- Fix typos: s/memmcg/memcg/, s/geets/gets/.
- Drop damon_rnd() randomness test case; test boundness only.
- Fixup dests dir selftest to do real test with correct file permission checks.
Changes from RFC
- RFC: https://lore.kernel.org/20260624142008.87180-1-sj@kernel.org
- Rebase directly to latest mm-new.

SeongJae Park (11):
  Docs/mm/damon/design: update for DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP
  Docs/ABI/damon: document probe files
  mm/damon/tests/core-kunit: test damon_rand()
  selftests/damon/sysfs.sh: test multiple probe dirs creation
  selftests/damon/sysfs.sh: test {core,ops}_filters/ directories
  selftests/damon/sysfs.sh: test dests dir
  selftests/damon/sysfs.sh: test all files in quota goal dir
  mm/damon/core: reduce range setup in damon_commit_target_regions()
  mm/damon/sysfs: split probe setup function out
  mm/damon/sysfs: split out filters setup function
  mm/damon/sysfs: fix typos in probe_{add,rm}_dirs: s/attr/probe/

 .../ABI/testing/sysfs-kernel-mm-damon         |  40 +++++++
 Documentation/mm/damon/design.rst             |   6 +-
 mm/damon/core.c                               |  22 +++-
 mm/damon/sysfs.c                              | 102 ++++++++++--------
 mm/damon/tests/core-kunit.h                   |  17 +++
 tools/testing/selftests/damon/sysfs.sh        |  71 +++++++++++-
 6 files changed, 205 insertions(+), 53 deletions(-)


base-commit: ada7832345164eed1bbca10543b0c46f13738215
-- 
2.47.3

^ permalink raw reply

* Re: [RFC PATCH net-next v8 03/12] net: phylink: add phylink_release_pcs() to externally release a PCS
From: Maxime Chevallier @ 2026-06-25 14:13 UTC (permalink / raw)
  To: Christian Marangi, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Simon Horman, Jonathan Corbet, Shuah Khan,
	Lorenzo Bianconi, Heiner Kallweit, Russell King, Saravana Kannan,
	Philipp Zabel, Nathan Chancellor, Nick Desaulniers, Bill Wendling,
	Justin Stitt, netdev, devicetree, linux-kernel, linux-doc,
	linux-arm-kernel, linux-mediatek, llvm
In-Reply-To: <20260618125752.1223-4-ansuelsmth@gmail.com>

Hello Christian,

On 6/18/26 14:57, Christian Marangi wrote:
> Add phylink_release_pcs() to externally release a PCS from a phylink
> instance. This can be used to handle case when a single PCS needs to be
> removed and the phylink instance needs to be refreshed.
> 
> On calling phylink_release_pcs(), the PCS will be removed from the
> phylink internal PCS list and the phylink supported_interfaces value is
> reparsed with the remaining PCS interfaces.
> 
> Also a phylink resolve is triggered to handle the PCS removal.
> 
> The flag force_major_config is set to make phylink resolve reconfigure
> the interface (even if it didn't change).
> This is needed to handle the special case when the current PCS used
> by phylink is removed and a major_config is needed to propagae the
> configuration change. With this option enabled we also force mac_config
> even if the PHY link is not up for the in-band case.
> 
> Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
> ---
>  drivers/net/phy/phylink.c | 56 +++++++++++++++++++++++++++++++++++++++
>  include/linux/phylink.h   |  2 ++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
> index c38bcd43b8c8..064d6f5a06da 100644
> --- a/drivers/net/phy/phylink.c
> +++ b/drivers/net/phy/phylink.c
> @@ -158,6 +158,8 @@ static const phy_interface_t phylink_sfp_interface_preference[] = {
>  static DECLARE_PHY_INTERFACE_MASK(phylink_sfp_interfaces);
>  
>  static void phylink_run_resolve(struct phylink *pl);
> +static void phylink_link_down(struct phylink *pl);
> +static void phylink_pcs_disable(struct phylink_pcs *pcs);
>  
>  /**
>   * phylink_set_port_modes() - set the port type modes in the ethtool mask
> @@ -918,6 +920,60 @@ static void phylink_resolve_an_pause(struct phylink_link_state *state)
>  	}
>  }
>  
> +/**
> + * phylink_release_pcs - Removes a PCS from the phylink PCS available list
> + * @pcs: a pointer to the phylink_pcs struct to be released
> + *
> + * This function release a PCS from the phylink PCS available list if
> + * actually in use. It also refreshes the supported interfaces of the
> + * phylink instance by copying the supported interfaces from the phylink
> + * conf and merging the supported interfaces of the remaining available PCS
> + * in the list and trigger a resolve.
> + */
> +void phylink_release_pcs(struct phylink_pcs *pcs)
> +{
> +	struct phylink *pl;
> +
> +	ASSERT_RTNL();
> +
> +	pl = pcs->phylink;
> +	if (!pl)
> +		return;
> +
> +	mutex_lock(&pl->state_mutex);
> +
> +	list_del(&pcs->list);
> +	pcs->phylink = NULL;
> +
> +	/*
> +	 * Check if we are removing the PCS currently
> +	 * in use by phylink. If this is the case, tear down
> +	 * the link, force phylink resolve to reconfigure the
> +	 * interface mode, disable the current PCS and set the
> +	 * phylink PCS to NULL.
> +	 */
> +	if (pl->pcs == pcs) {
> +		phylink_link_down(pl);
> +		phylink_pcs_disable(pl->pcs);
> +
> +		pl->force_major_config = true;
> +		pl->pcs = NULL;
> +	}
> +
> +	mutex_unlock(&pl->state_mutex);
> +
> +	/* Refresh supported interfaces */
> +	phy_interface_copy(pl->supported_interfaces,
> +			   pl->config->supported_interfaces);
> +	list_for_each_entry(pcs, &pl->pcs_list, list)
> +		phy_interface_or(pl->supported_interfaces,
> +				 pl->supported_interfaces,
> +				 pcs->supported_interfaces);

I've given more thought to that 'supported_interfaces' thing. This
patchset redefines the meaning of

  pl->config->supported_interfaces

Currently, it's filled by the MAC driver and means "Every interface
we can support, including the ones provided by PCSs that we can use
with this MAC".

It now becomes "Every interface we support without needing a PCS", at
least the way I understand that.

It's not an error in your code, but I think this is worth documenting
somewhere as this changes one the things that's already fairly
error-prone in new drivers.

I don't know to what extent people use that, be we have a porting guide
that explains how to use phylink in a MAC driver, maybe an update in there
would be nice as well :

https://docs.kernel.org/networking/sfp-phylink.html#rough-guide-to-converting-a-network-driver-to-sfp-phylink

Maxime



^ permalink raw reply

* Re: [PATCH] cgroup/cpu: document cpu.stat.local
From: Tao Cui @ 2026-06-25 14:05 UTC (permalink / raw)
  To: Sun Shaojie, Tejun Heo, Johannes Weiner, Michal Koutný,
	Jonathan Corbet
  Cc: cui.tao, Shuah Khan, cgroups, linux-doc, linux-kernel
In-Reply-To: <20260625130723.1144463-1-sunshaojie@kylinos.cn>



在 2026/6/25 21:07, Sun Shaojie 写道:
> Add documentation for the cpu.stat.local interface file, which reports
> the throttled_usec stat -- the actual throttling time incurred by the
> cgroup's own runqueues, which may include throttling inherited from
> ancestor cgroup bandwidth limits. Unlike cpu.stat's throttled_usec
> which only accounts for throttling caused by the cgroup's own CFS
> bandwidth limit.
> 
> When the controller is not enabled, the stat is not reported.
> 
> Signed-off-by: Sun Shaojie <sunshaojie@kylinos.cn>
> ---
>  Documentation/admin-guide/cgroup-v2.rst | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 993446ab66d0..a7766f40ef65 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1160,6 +1160,23 @@ will be referred to. All time durations are in microseconds.
>  	- nr_bursts
>  	- burst_usec
>  
> +  cpu.stat.local
> +	A read-only flat-keyed file which exists on non-root cgroups.
> +	This file exists whether the controller is enabled or not.
> +

Hi Shaojie,

Thanks — the throttled_usec semantics are described correctly.

One fix needed: "exists on non-root cgroups" is inaccurate.
cpu.stat.local is registered without CFTYPE_NOT_ON_ROOT, so (like
cpu.stat) it exists on the root cgroup too:

  $ cat /sys/fs/cgroup/cpu.stat.local
  throttled_usec 0

Reviewed-by: Tao Cui <cuitao@kylinos.cn>

Thanks,
Tao

> +	It reports the following stat when the controller is enabled:
> +
> +	- throttled_usec
> +
> +	Unlike the ``throttled_usec`` reported by ``cpu.stat`` which
> +	accounts for throttling caused by this cgroup's own CFS
> +	bandwidth limit, ``cpu.stat.local`` reports the actual
> +	throttling time incurred by this cgroup's own runqueues,
> +	which may include throttling inherited from ancestor
> +	cgroup bandwidth limits.
> +
> +	When the controller is not enabled, this stat is not reported.
> +
>    cpu.weight
>  	A read-write single value file which exists on non-root
>  	cgroups.  The default is "100".


^ permalink raw reply

* Re: [PATCH v2] usbcore: Add quirk for 255-bytes initial config read
From: Greg KH @ 2026-06-25 13:56 UTC (permalink / raw)
  To: Nikhil Solanke
  Cc: linux-usb, linux-kernel, stern, michal.pecio, stable, corbet,
	skhan, linux-doc
In-Reply-To: <20260623161035.5792-1-nikhilsolanke5@gmail.com>

On Tue, Jun 23, 2026 at 09:40:35PM +0530, Nikhil Solanke wrote:
> @@ -912,6 +915,13 @@ int usb_get_configuration(struct usb_device *dev)
>  	unsigned char *bigbuffer;
>  	struct usb_config_descriptor *desc;
>  	int result;
> +	/*
> +	 * Devices with quirky firmware will stall or reset when asked only for
> +	 * the configuration header. This variable decides which size to use in
> +	 * that case, if the quirk for that device was set.
> +	 */
> +	size_t usb_config_req_size = (dev->quirks & USB_QUIRK_WINDOWS_CONFIG_REQ_SIZE)
> +		? USB_CONFIG_WINDOWS_REQ_SIZE : USB_DT_CONFIG_SIZE;

Please just use if () lines for code logic like this.  Don't abuse ?:
stuff as it's not needed.  Remember, we write code for people first,
compilers second, and in this case the compiler doesn't care either way
at all, but an if () line makes people much happier.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH v6 07/10] ACPI: APEI: introduce GHES helper
From: Ahmed Tiba @ 2026-06-25 13:12 UTC (permalink / raw)
  To: Julian Braha, Rafael J. Wysocki, Tony Luck, Borislav Petkov,
	Hanjun Guo, Mauro Carvalho Chehab, Shuai Xue, Len Brown,
	Saket Dumbre, Davidlohr Bueso, Jonathan Cameron, Dave Jiang,
	Alison Schofield, Vishal Verma, Ira Weiny, Dan Williams,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Jonathan Corbet,
	Shuah Khan
  Cc: linux-kernel, linux-acpi, acpica-devel, linux-cxl, devicetree,
	linux-edac, linux-doc, Dmitry.Lamerov
In-Reply-To: <84dabe49-8405-4213-8b73-433cf736ded9@gmail.com>

On 19/06/2026 18:46, Julian Braha wrote:
> On 6/19/26 16:45, Ahmed Tiba wrote:
>> GHES_CPER_HELPERS is intended for both the ACPI GHES path and the DT
>> firmware-first provider, so I do not want to tie it to ACPI.
> 
> So what's the plan to fix the build error when ACPI is disabled:
> https://lore.kernel.org/all/0f131ee4-d335-45d2-b6ae-49c18df1353b@gmail.com/
> 
> - Julian Braha

I can fix this by adding a !CONFIG_ACPI fallback for
arch_apei_report_x86_error() in drivers/firmware/efi/cper-x86.c.

Thanks,
Ahmed

^ permalink raw reply

* [PATCH] cgroup/cpu: document cpu.stat.local
From: Sun Shaojie @ 2026-06-25 13:07 UTC (permalink / raw)
  To: Tejun Heo, Johannes Weiner, Michal Koutný, Jonathan Corbet
  Cc: Shuah Khan, cgroups, linux-doc, linux-kernel, Sun Shaojie

Add documentation for the cpu.stat.local interface file, which reports
the throttled_usec stat -- the actual throttling time incurred by the
cgroup's own runqueues, which may include throttling inherited from
ancestor cgroup bandwidth limits. Unlike cpu.stat's throttled_usec
which only accounts for throttling caused by the cgroup's own CFS
bandwidth limit.

When the controller is not enabled, the stat is not reported.

Signed-off-by: Sun Shaojie <sunshaojie@kylinos.cn>
---
 Documentation/admin-guide/cgroup-v2.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 993446ab66d0..a7766f40ef65 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1160,6 +1160,23 @@ will be referred to. All time durations are in microseconds.
 	- nr_bursts
 	- burst_usec
 
+  cpu.stat.local
+	A read-only flat-keyed file which exists on non-root cgroups.
+	This file exists whether the controller is enabled or not.
+
+	It reports the following stat when the controller is enabled:
+
+	- throttled_usec
+
+	Unlike the ``throttled_usec`` reported by ``cpu.stat`` which
+	accounts for throttling caused by this cgroup's own CFS
+	bandwidth limit, ``cpu.stat.local`` reports the actual
+	throttling time incurred by this cgroup's own runqueues,
+	which may include throttling inherited from ancestor
+	cgroup bandwidth limits.
+
+	When the controller is not enabled, this stat is not reported.
+
   cpu.weight
 	A read-write single value file which exists on non-root
 	cgroups.  The default is "100".
-- 
2.25.1


^ permalink raw reply related

* Re: [GIT PULL] Documentation fixes for 7.2
From: Jonathan Corbet @ 2026-06-25 12:58 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-doc, linux-kernel, Shuah Khan
In-Reply-To: <874iiq6aps.fsf@trenco.lwn.net>

Due to insufficient coffee absorption, I failed to copy this beyond
Linus, so, for the record...

Jonathan Corbet <corbet@lwn.net> writes:

> The following changes since commit fa34b01aa0f59355206b0807f862cced06c2b7a1:
>
>   docs: pt_BR: Translate 3.Early-stage.rst into Portuguese (2026-06-12 13:34:26 -0600)
>
> are available in the Git repository at:
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/docs/linux.git tags/docs-7.2-2
>
> for you to fetch changes up to b13f724df35c4f1a69e20c965a2fc74fd2921e59:
>
>   docs: tools: Fix typo 'ackward' to 'awkward' in unittest.rst (2026-06-23 14:48:01 -0600)
>
> ----------------------------------------------------------------
> A handful of late-arriving docs fixes, along with one document update
> that fell through the cracks before.
>
> ----------------------------------------------------------------
> David Hildenbrand (Arm) (1):
>       docs/mm: clarify that we are not looking for LLM generated content
>
> Declan Wale (1):
>       docs: tools: Fix typo 'ackward' to 'awkward' in unittest.rst
>
> Doehyun Baek (1):
>       Docs/driver-api/uio-howto: document mmap_prepare callback
>
> Matthew Wilcox (Oracle) (1):
>       MAINTAINERS: Fix regex for kdoc
>
> Randy Dunlap (3):
>       kernel-doc: xforms: support __SYSFS_FUNCTION_ALTERNATIVE()
>       kdoc: xforms_lists: handle DECLARE_PER_CPU() in kernel-doc
>       kdoc: xforms: ignore special static/inline macros
>
> Yudistira Putra (1):
>       Documentation: tracing: fix typo in events documentation
>
> Zenghui Yu (1):
>       docs: kgdb: Fix path of driver options
>
>  Documentation/driver-api/uio-howto.rst   |  4 ++--
>  Documentation/mm/index.rst               | 13 +++++++++++++
>  Documentation/process/debugging/kgdb.rst |  2 +-
>  Documentation/tools/unittest.rst         |  2 +-
>  Documentation/trace/events.rst           |  2 +-
>  MAINTAINERS                              |  2 +-
>  tools/lib/python/kdoc/xforms_lists.py    |  4 ++++
>  7 files changed, 23 insertions(+), 6 deletions(-)

^ permalink raw reply

* Re: [PATCH v8 18/46] KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
From: David Hildenbrand (Arm) @ 2026-06-25 12:57 UTC (permalink / raw)
  To: Sean Christopherson, Ackerley Tng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, jmattson,
	jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <ajx3vmNPRf-M9kR6@google.com>

On 6/25/26 02:35, Sean Christopherson wrote:
> On Wed, Jun 24, 2026, Ackerley Tng wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>>
>>>
>>> Under what circumstances does this happen,
>>
>> It happened 100% of the time in selftests. Perhaps it's because in the
>> selftests the pages are almost always freshly allocated and so the
>> lru_add fbatch isn't full yet? (and that the host isn't super busy so
>> lru_add fbatch doesn't get drained yet).
> 
> I chatted with Ackerley about this.  What I wanted to understand is why guest_memfd
> pages were getting put onto per-CPU batches for lru_add(), given that guest_memfd
> pages are unevictable.  The answer (assuming I read the code right), is that
> lruvec_add_folio() updates stats and other per-lru metadata for the unevictable
> lru, and does so under a per-lru lock.  I.e. we don't want to skip that stuff
> entirely.

Hm. Our pages don't participate in any LRU activity (including
isolation+migration). Isolation+migration would only apply once we'd support
page migration.

But yes, secretmem also does it like that: filemap_add_folio() will call
folio_add_lru().

Traditionally we used the unevictable LRU only for mlock purposes.

But yeah, there are "unevictable" stats involved ....

> 
> One thought I had, to avoid the IPIs that draining all per-CPU caches requires,
> was to disallow putting guest_memfd pages in folio batches, e.g. by hacking
> something into folio_may_be_lru_cached().  But due to taking a per-lru lock,
> that would penalize the relatively hot path and definitely common operation of
> faulting in guest memory.  On the other hand, memory conversion is already a
> relatively slow operation and is relatively uncommon compared to page faults,
> (and likely very uncommon for real world setups).  I.e. having to drain all
> caches if conversion isn't safe penalizes a relatively slow, relatively uncommon
> path.

Yeah, the lru_add_drain_all is rather messy.

We have similar code in

collect_longterm_unpinnable_folios(), where we first try a lru_add_drain(), to
then escalate to a lru_add_drain_all().

Maybe we could factor that (suboptimal code) out to not have to reinvent the
same thing multiple times?

-- 
Cheers,

David

^ permalink raw reply

* [PATCH v5 24/24] virt/steal_monitor: Add design check of preferred subset of active
From: Shrikanth Hegde @ 2026-06-25 12:46 UTC (permalink / raw)
  To: linux-kernel, mingo, peterz, juri.lelli, vincent.guittot,
	yury.norov, kprateek.nayak, iii, corbet
  Cc: sshegde, tglx, gregkh, pbonzini, seanjc, vschneid, huschle,
	rostedt, dietmar.eggemann, maddy, srikar, hdanton, chleroy,
	vineeth, frederic, arighi, pauld, christian.loehle, tj,
	tommaso.cucinotta, maz, rafael, rdunlap, kernellwp, linux-doc
In-Reply-To: <20260625124648.802832-1-sshegde@linux.ibm.com>

One of the main design construct that CONFIG_PREFERRED_CPU maintains is
that preferred is always subset of active. Force that in any future arch
specific implementations.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
v4->v5:
- new patch

 drivers/virt/steal_monitor/sm_core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/virt/steal_monitor/sm_core.c b/drivers/virt/steal_monitor/sm_core.c
index f5b0e568eb32..82beb2b94083 100644
--- a/drivers/virt/steal_monitor/sm_core.c
+++ b/drivers/virt/steal_monitor/sm_core.c
@@ -80,6 +80,9 @@ static void compute_preferred_cpus_work(struct work_struct *work)
 	/* At least one core is kept as preferred */
 	WARN_ON(cpumask_empty(cpu_preferred_mask));
 
+	/* Maintain design construct */
+	WARN_ON(!cpumask_subset(cpu_preferred_mask, cpu_active_mask));
+
 	/* Warn if interval_ms is set to 0, that might cause lockup. */
 	if (unlikely(sm_core_ctx.interval_ms == 0)) {
 		WARN_ON(1);
-- 
2.47.3


^ permalink raw reply related

* [PATCH v5 23/24] virt/steal_monitor: Add direction control
From: Shrikanth Hegde @ 2026-06-25 12:46 UTC (permalink / raw)
  To: linux-kernel, mingo, peterz, juri.lelli, vincent.guittot,
	yury.norov, kprateek.nayak, iii, corbet
  Cc: sshegde, tglx, gregkh, pbonzini, seanjc, vschneid, huschle,
	rostedt, dietmar.eggemann, maddy, srikar, hdanton, chleroy,
	vineeth, frederic, arighi, pauld, christian.loehle, tj,
	tommaso.cucinotta, maz, rafael, rdunlap, kernellwp, linux-doc
In-Reply-To: <20260625124648.802832-1-sshegde@linux.ibm.com>

Cache the previous direction on steal time. So two consecutive values of
high values or low values are taken for decrease/increase of preferred
CPUs. This helps to avoid oscillations.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
v4->v5:
- Modified for steal_monitor

 drivers/virt/steal_monitor/sm_core.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/virt/steal_monitor/sm_core.c b/drivers/virt/steal_monitor/sm_core.c
index 641488a5a3b5..f5b0e568eb32 100644
--- a/drivers/virt/steal_monitor/sm_core.c
+++ b/drivers/virt/steal_monitor/sm_core.c
@@ -20,6 +20,12 @@ struct steal_monitor sm_core_ctx = {
 	.low_threshold = 200,	/* 2% */
 };
 
+enum sm_direction {
+	SM_DIR_INCREASE = -1,
+	SM_DIR_NONE	=  0,
+	SM_DIR_DECREASE	=  1,
+};
+
 module_param_named(interval_ms, sm_core_ctx.interval_ms, uint, 0644);
 MODULE_PARM_DESC(interval_ms,
 		 "Sampling frequency for steal values in milliseconds (default: 1000)");
@@ -54,13 +60,23 @@ static void compute_preferred_cpus_work(struct work_struct *work)
 		      (delta_ns * get_num_cpus_steal_ratio());
 
 	/* If the steal time values are high, reduce preferred CPUs */
-	if (steal_ratio > sm_core_ctx.high_threshold)
+	if (sm_core_ctx.prev_direction == SM_DIR_DECREASE &&
+	    steal_ratio > sm_core_ctx.high_threshold)
 		decrease_preferred_cpus(&sm_core_ctx);
 
 	/* If the steal time values are low, increase preferred CPUs */
-	if (steal_ratio <= sm_core_ctx.low_threshold)
+	if (sm_core_ctx.prev_direction == SM_DIR_INCREASE &&
+	    steal_ratio <= sm_core_ctx.low_threshold)
 		increase_preferred_cpus(&sm_core_ctx);
 
+	/* mark the direction. This helps to avoid ping-pongs */
+	if (steal_ratio > sm_core_ctx.high_threshold)
+		sm_core_ctx.prev_direction = SM_DIR_DECREASE;
+	else if (steal_ratio <= sm_core_ctx.low_threshold)
+		sm_core_ctx.prev_direction = SM_DIR_INCREASE;
+	else
+		sm_core_ctx.prev_direction = SM_DIR_NONE;
+
 	/* At least one core is kept as preferred */
 	WARN_ON(cpumask_empty(cpu_preferred_mask));
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH v5 22/24] virt/steal_monitor: Act on steal values at regular intervals
From: Shrikanth Hegde @ 2026-06-25 12:46 UTC (permalink / raw)
  To: linux-kernel, mingo, peterz, juri.lelli, vincent.guittot,
	yury.norov, kprateek.nayak, iii, corbet
  Cc: sshegde, tglx, gregkh, pbonzini, seanjc, vschneid, huschle,
	rostedt, dietmar.eggemann, maddy, srikar, hdanton, chleroy,
	vineeth, frederic, arighi, pauld, christian.loehle, tj,
	tommaso.cucinotta, maz, rafael, rdunlap, kernellwp, linux-doc
In-Reply-To: <20260625124648.802832-1-sshegde@linux.ibm.com>

This is the steal_monitor core functionality done in periodic work

- Calculate the steal_ratio. It is multiplied by 100 to consider the
  fractional values of steal time. I.e 10 means 0.1% steal time.
- If steal value is higher than high threshold, call the method to reduce
  the preferred CPUs.
- If steal value is lower or equal to low threshold, call the method to
  increase the preferred CPUs.
- If the steal value is in between, no action is taken.
- Save the values for next delta calculations.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
v4->v5:
- Modified for steal_monitor

 drivers/virt/steal_monitor/sm_core.c | 29 ++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/drivers/virt/steal_monitor/sm_core.c b/drivers/virt/steal_monitor/sm_core.c
index fac8f4d5dac7..641488a5a3b5 100644
--- a/drivers/virt/steal_monitor/sm_core.c
+++ b/drivers/virt/steal_monitor/sm_core.c
@@ -34,6 +34,33 @@ MODULE_PARM_DESC(low_threshold,
 
 static void compute_preferred_cpus_work(struct work_struct *work)
 {
+	u64 curr_steal, delta_steal, delta_ns, steal_ratio;
+	ktime_t now;
+
+	curr_steal = get_system_steal_time();
+	now = ktime_get();
+
+	/* get the deltas */
+	delta_steal = curr_steal > sm_core_ctx.prev_steal ?
+		      curr_steal - sm_core_ctx.prev_steal : 0;
+	delta_ns = max_t(u64, ktime_to_ns(ktime_sub(now, sm_core_ctx.prev_time)), 1);
+
+	/* Update for next calculation */
+	sm_core_ctx.prev_steal = curr_steal;
+	sm_core_ctx.prev_time = now;
+
+	/* Multiply by 100 to consider the fractional values of steal time */
+	steal_ratio = (delta_steal * 100 * 100) /
+		      (delta_ns * get_num_cpus_steal_ratio());
+
+	/* If the steal time values are high, reduce preferred CPUs */
+	if (steal_ratio > sm_core_ctx.high_threshold)
+		decrease_preferred_cpus(&sm_core_ctx);
+
+	/* If the steal time values are low, increase preferred CPUs */
+	if (steal_ratio <= sm_core_ctx.low_threshold)
+		increase_preferred_cpus(&sm_core_ctx);
+
 	/* At least one core is kept as preferred */
 	WARN_ON(cpumask_empty(cpu_preferred_mask));
 
@@ -54,6 +81,8 @@ static int __init steal_monitor_init(void)
 		sm_core_ctx.interval_ms, sm_core_ctx.high_threshold, sm_core_ctx.low_threshold);
 
 	INIT_DELAYED_WORK(&sm_core_ctx.work, compute_preferred_cpus_work);
+	sm_core_ctx.prev_steal = get_system_steal_time();
+	sm_core_ctx.prev_time = ktime_get();
 
 	schedule_delayed_work(&sm_core_ctx.work,
 			      msecs_to_jiffies(sm_core_ctx.interval_ms));
-- 
2.47.3


^ permalink raw reply related

* [PATCH v5 21/24] virt/steal_monitor: Provide default method to get num of CPUs for steal ratio
From: Shrikanth Hegde @ 2026-06-25 12:46 UTC (permalink / raw)
  To: linux-kernel, mingo, peterz, juri.lelli, vincent.guittot,
	yury.norov, kprateek.nayak, iii, corbet
  Cc: sshegde, tglx, gregkh, pbonzini, seanjc, vschneid, huschle,
	rostedt, dietmar.eggemann, maddy, srikar, hdanton, chleroy,
	vineeth, frederic, arighi, pauld, christian.loehle, tj,
	tommaso.cucinotta, maz, rafael, rdunlap, kernellwp, linux-doc
In-Reply-To: <20260625124648.802832-1-sshegde@linux.ibm.com>

This method informs the steal_monitor core, how many CPUs it needs to
consider for steal ratio calculations.
steal_ratio = (delta_steal * 100 * 100) / (delta_ns * number_of_cpus);

Default method returns number of Active CPUs since it calculates steal
time across active CPUs.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
v4->v5:
- new patch

 drivers/virt/steal_monitor/defaults.c | 10 ++++++++++
 drivers/virt/steal_monitor/sm_core.h  |  1 +
 2 files changed, 11 insertions(+)

diff --git a/drivers/virt/steal_monitor/defaults.c b/drivers/virt/steal_monitor/defaults.c
index 90ede838491f..3232fa8f4032 100644
--- a/drivers/virt/steal_monitor/defaults.c
+++ b/drivers/virt/steal_monitor/defaults.c
@@ -26,6 +26,16 @@ u64 __weak get_system_steal_time(void)
 	return total_steal;
 }
 
+/*
+ * Return number of CPUs to consider for steal ratio calculation
+ *
+ * Default returns number of active CPUs.
+ */
+unsigned int __weak get_num_cpus_steal_ratio(void)
+{
+	return num_active_cpus();
+}
+
 /*
  * Default implementation of decrementing the preferred CPUs based on steal
  * time. This is simple logic and decrease the preferred CPUs by 1 core.
diff --git a/drivers/virt/steal_monitor/sm_core.h b/drivers/virt/steal_monitor/sm_core.h
index 1857d6a9a295..17732c8bc136 100644
--- a/drivers/virt/steal_monitor/sm_core.h
+++ b/drivers/virt/steal_monitor/sm_core.h
@@ -26,6 +26,7 @@ struct steal_monitor {
 extern struct steal_monitor sm_core_ctx;
 
 u64 get_system_steal_time(void);
+unsigned int get_num_cpus_steal_ratio(void);
 void increase_preferred_cpus(struct steal_monitor *ctx);
 void decrease_preferred_cpus(struct steal_monitor *ctx);
 #endif /* __VIRT_STEAL_CORE_H */
-- 
2.47.3


^ permalink raw reply related

* [PATCH v5 20/24] virt/steal_monitor: Provide default method to inc/dec preferred CPUs
From: Shrikanth Hegde @ 2026-06-25 12:46 UTC (permalink / raw)
  To: linux-kernel, mingo, peterz, juri.lelli, vincent.guittot,
	yury.norov, kprateek.nayak, iii, corbet
  Cc: sshegde, tglx, gregkh, pbonzini, seanjc, vschneid, huschle,
	rostedt, dietmar.eggemann, maddy, srikar, hdanton, chleroy,
	vineeth, frederic, arighi, pauld, christian.loehle, tj,
	tommaso.cucinotta, maz, rafael, rdunlap, kernellwp, linux-doc
In-Reply-To: <20260625124648.802832-1-sshegde@linux.ibm.com>

These methods will be used by the steal_monitor core in subsequent
patches. Default implementation are likely good enough for most archs.

decrease_preferred_cpus() - Called when there is high steal time. It needs
to decide which CPUs to mark as non-preferred and set that state.
increase_preferred_cpus() - Called when there is low steal time. It needs
to decide which CPUs to mark as preferred and set that state.

Default Implementations:
decrease_preferred_cpus()
- Get the last CPU in cpu_preferred_mask.
- Check if that last CPU belong to first housekeeping core. If so there
  is nothing to do. This helps to keep at least one core as preferred.
  This is to be safe under non-normal cases.
- If it is not first housekeeping core, get its sibling and mark them as
  non-preferred. If they are nohz_full, enable the tick. push mechanism
  relies on sched_tick.

increase_preferred_cpus()
- Get the first active non-preferred CPUs. This likely is the last
  set of CPUs being marked as non-preferred.
- If there is no such CPU, i.e preferred is same as active. Nothing
  todo further.
- If not, get the siblings of that core and mark them as preferred.
  Note that clearing the tick isn't needed as that would be handled via
  sched_can_stop_tick.

Using core instead of individual CPUs give better numbers as SMT is
quite common and some hypervisor such as powerVM does core scheduling.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
v4->v5:
- Modified for steal_monitor

 drivers/virt/steal_monitor/defaults.c | 68 +++++++++++++++++++++++++++
 drivers/virt/steal_monitor/sm_core.h  |  4 ++
 2 files changed, 72 insertions(+)

diff --git a/drivers/virt/steal_monitor/defaults.c b/drivers/virt/steal_monitor/defaults.c
index 17f57afacbe6..90ede838491f 100644
--- a/drivers/virt/steal_monitor/defaults.c
+++ b/drivers/virt/steal_monitor/defaults.c
@@ -25,3 +25,71 @@ u64 __weak get_system_steal_time(void)
 
 	return total_steal;
 }
+
+/*
+ * Default implementation of decrementing the preferred CPUs based on steal
+ * time. This is simple logic and decrease the preferred CPUs by 1 core.
+ * It takes out the last core in the active & preferred.
+ *
+ * Ensure at least one housekeeping core is always kept as preferred
+ *
+ * Could be overwritten by arch specific handling. Arch must ensure
+ * preferred is always subset of active.
+ */
+
+#define get_core_mask(cpu) topology_sibling_cpumask(cpu)
+
+void __weak decrease_preferred_cpus(struct steal_monitor *ctx)
+{
+	int last_cpu, tmp_cpu;
+	int first_hk_cpu;
+
+	guard(cpus_read_lock)();
+
+	last_cpu = cpumask_last(cpu_preferred_mask);
+	first_hk_cpu = cpumask_first_and(housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
+					 cpu_active_mask);
+	/*
+	 * If the core belongs to the first housekeeping CPUs, no action is
+	 * taken. This leaves at least one core preferred always.
+	 * This ensures at least some CPUs are available to run.
+	 */
+	if (cpumask_equal(get_core_mask(last_cpu), get_core_mask(first_hk_cpu)))
+		return;
+
+	/*
+	 * set tick bit for nohz_full CPU to push the task out. Once the tasks
+	 * are pushed out, bit will be cleared if there are no tasks.
+	 */
+
+	for_each_cpu_and(tmp_cpu, get_core_mask(last_cpu), cpu_active_mask) {
+		set_cpu_preferred(tmp_cpu, false);
+		if (tick_nohz_full_cpu(tmp_cpu))
+			tick_nohz_dep_set_cpu(tmp_cpu, TICK_DEP_BIT_SCHED);
+	}
+}
+
+/*
+ * Default implementation of incrementing preferred CPUs based on steal
+ * time. This is simple logic and increases the preferred CPUs by 1 core.
+ * It adds the first core in active & !preferred
+ *
+ * Nothing to do if active == preferred
+ *
+ * Could be overwritten by arch specific handling. Arch must ensure
+ * preferred is subset of active.
+ */
+void __weak increase_preferred_cpus(struct steal_monitor *ctx)
+{
+	int first_cpu, tmp_cpu;
+
+	guard(cpus_read_lock)();
+
+	first_cpu = cpumask_first_andnot(cpu_active_mask, cpu_preferred_mask);
+	/* All CPUs are preferred. Nothing to increase further */
+	if (first_cpu >= nr_cpu_ids)
+		return;
+
+	for_each_cpu_and(tmp_cpu, get_core_mask(first_cpu), cpu_active_mask)
+		set_cpu_preferred(tmp_cpu, true);
+}
diff --git a/drivers/virt/steal_monitor/sm_core.h b/drivers/virt/steal_monitor/sm_core.h
index e09745a2b813..1857d6a9a295 100644
--- a/drivers/virt/steal_monitor/sm_core.h
+++ b/drivers/virt/steal_monitor/sm_core.h
@@ -10,6 +10,8 @@
 #include <linux/cpumask.h>
 #include <linux/workqueue.h>
 #include <linux/kernel_stat.h>
+#include <linux/tick.h>
+#include <linux/sched/isolation.h>
 
 struct steal_monitor {
 	struct delayed_work	work;
@@ -24,4 +26,6 @@ struct steal_monitor {
 extern struct steal_monitor sm_core_ctx;
 
 u64 get_system_steal_time(void);
+void increase_preferred_cpus(struct steal_monitor *ctx);
+void decrease_preferred_cpus(struct steal_monitor *ctx);
 #endif /* __VIRT_STEAL_CORE_H */
-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox