Linux Documentation
 help / color / mirror / Atom feed
* [PATCH v6 4/6] alloc_tag: add accuracy based filtering to ioctl
From: Abhishek Bapat @ 2026-06-18 17:36 UTC (permalink / raw)
  To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
  Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
	Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781803482.git.abhishekbapat@google.com>

Extend the allocinfo filtering mechanism to allow users to filter tags
based on their accuracy.

Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
Acked-by: Hao Ge <hao.ge@linux.dev>
Acked-by: Suren Baghdasaryan <surenb@google.com>
---
 include/uapi/linux/alloc_tag.h | 4 ++++
 lib/alloc_tag.c                | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
index 0de5fc180790..270f693b1822 100644
--- a/include/uapi/linux/alloc_tag.h
+++ b/include/uapi/linux/alloc_tag.h
@@ -31,6 +31,8 @@ struct allocinfo_tag {
 	char function[ALLOCINFO_STR_SIZE];
 	char filename[ALLOCINFO_STR_SIZE];
 	__u64 lineno;
+	/* filter criteria only; see allocinfo_counter.accurate for actual accuracy */
+	__u64 inaccurate;
 };
 
 /* The alignment ensures 32-bit compatible interfaces are not broken */
@@ -50,6 +52,7 @@ enum {
 	ALLOCINFO_FILTER_FUNCTION,
 	ALLOCINFO_FILTER_FILENAME,
 	ALLOCINFO_FILTER_LINENO,
+	ALLOCINFO_FILTER_INACCURATE,
 	ALLOCINFO_FILTER_MIN_SIZE,
 	ALLOCINFO_FILTER_MAX_SIZE,
 	__ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_MAX_SIZE
@@ -59,6 +62,7 @@ enum {
 #define ALLOCINFO_FILTER_MASK_FUNCTION		(1 << ALLOCINFO_FILTER_FUNCTION)
 #define ALLOCINFO_FILTER_MASK_FILENAME		(1 << ALLOCINFO_FILTER_FILENAME)
 #define ALLOCINFO_FILTER_MASK_LINENO		(1 << ALLOCINFO_FILTER_LINENO)
+#define ALLOCINFO_FILTER_MASK_INACCURATE	(1 << ALLOCINFO_FILTER_INACCURATE)
 #define ALLOCINFO_FILTER_MASK_MIN_SIZE		(1 << ALLOCINFO_FILTER_MIN_SIZE)
 #define ALLOCINFO_FILTER_MASK_MAX_SIZE		(1 << ALLOCINFO_FILTER_MAX_SIZE)
 
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index ad33d63ef7b4..32ac0674d8bf 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -249,6 +249,8 @@ static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter,
 			   struct alloc_tag_counters *counters,
 			   bool *fetched_counters)
 {
+	bool inaccurate;
+
 	if (!filter || !filter->mask)
 		return true;
 
@@ -274,6 +276,12 @@ static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter,
 	    ct->lineno != filter->fields.lineno)
 		return false;
 
+	if (filter->mask & ALLOCINFO_FILTER_MASK_INACCURATE) {
+		inaccurate = !!(ct->flags & CODETAG_FLAG_INACCURATE);
+		if (inaccurate != !!(filter->fields.inaccurate))
+			return false;
+	}
+
 	if (filter->mask & (ALLOCINFO_FILTER_MASK_MIN_SIZE | ALLOCINFO_FILTER_MASK_MAX_SIZE)) {
 		if (!*fetched_counters) {
 			*counters = allocinfo_prefetch_counters(ct);
-- 
2.55.0.rc0.786.g65d90a0328-goog


^ permalink raw reply related

* [PATCH v6 3/6] alloc_tag: add size-based filtering to ioctl
From: Abhishek Bapat @ 2026-06-18 17:36 UTC (permalink / raw)
  To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
  Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
	Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781803482.git.abhishekbapat@google.com>

Extend the allocinfo filtering mechanism to allow users to filter tags
based on the total number of bytes allocated [min_size, max_size]. The
size range is inclusive.

Filtering by size involves retrieving allocinfo per-CPU counters, which
is an expensive operation. Hence, the performance of size-based
filtering will be worse than other filters.

Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
Acked-by: Hao Ge <hao.ge@linux.dev>
---
 include/uapi/linux/alloc_tag.h |  8 ++++-
 lib/alloc_tag.c                | 64 +++++++++++++++++++++++++++-------
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
index 13e9b5916bf5..0de5fc180790 100644
--- a/include/uapi/linux/alloc_tag.h
+++ b/include/uapi/linux/alloc_tag.h
@@ -50,13 +50,17 @@ enum {
 	ALLOCINFO_FILTER_FUNCTION,
 	ALLOCINFO_FILTER_FILENAME,
 	ALLOCINFO_FILTER_LINENO,
-	__ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_LINENO
+	ALLOCINFO_FILTER_MIN_SIZE,
+	ALLOCINFO_FILTER_MAX_SIZE,
+	__ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_MAX_SIZE
 };
 
 #define ALLOCINFO_FILTER_MASK_MODNAME		(1 << ALLOCINFO_FILTER_MODNAME)
 #define ALLOCINFO_FILTER_MASK_FUNCTION		(1 << ALLOCINFO_FILTER_FUNCTION)
 #define ALLOCINFO_FILTER_MASK_FILENAME		(1 << ALLOCINFO_FILTER_FILENAME)
 #define ALLOCINFO_FILTER_MASK_LINENO		(1 << ALLOCINFO_FILTER_LINENO)
+#define ALLOCINFO_FILTER_MASK_MIN_SIZE		(1 << ALLOCINFO_FILTER_MIN_SIZE)
+#define ALLOCINFO_FILTER_MASK_MAX_SIZE		(1 << ALLOCINFO_FILTER_MAX_SIZE)
 
 #define ALLOCINFO_FILTER_MASKS \
 	((1 << (__ALLOCINFO_FILTER_LAST + 1)) - 1)
@@ -64,6 +68,8 @@ enum {
 struct allocinfo_filter {
 	__u64 mask; /* bitmask of the filter fields used */
 	struct allocinfo_tag fields;
+	__u64 min_size;
+	__u64 max_size;
 };
 
 struct allocinfo_get_at {
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index f00d731b81cf..ad33d63ef7b4 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -198,16 +198,20 @@ static int allocinfo_cmp_str(const char *str, const char *template)
 	return strncmp(allocinfo_str(str), template, ALLOCINFO_STR_SIZE);
 }
 
+/* Fetch the per-CPU counters */
+static inline struct alloc_tag_counters allocinfo_prefetch_counters(struct codetag *ct)
+{
+	return alloc_tag_read(ct_to_alloc_tag(ct));
+}
+
 /*
  * Populates the UAPI allocinfo_tag_data structure with active runtime
  * profiling counters extracted from the given kernel codetag.
  */
 static void allocinfo_to_params(struct codetag *ct,
-				struct allocinfo_tag_data *data)
+				struct allocinfo_tag_data *data,
+				struct alloc_tag_counters *counters)
 {
-	struct alloc_tag *tag = ct_to_alloc_tag(ct);
-	struct alloc_tag_counters counter = alloc_tag_read(tag);
-
 	if (ct->modname)
 		allocinfo_copy_str(data->tag.modname, ct->modname);
 	else
@@ -215,9 +219,9 @@ static void allocinfo_to_params(struct codetag *ct,
 	allocinfo_copy_str(data->tag.function, ct->function);
 	allocinfo_copy_str(data->tag.filename, ct->filename);
 	data->tag.lineno = ct->lineno;
-	data->counter.bytes = counter.bytes;
-	data->counter.calls = counter.calls;
-	data->counter.accurate = !alloc_tag_is_inaccurate(tag);
+	data->counter.bytes = counters->bytes;
+	data->counter.calls = counters->calls;
+	data->counter.accurate = !alloc_tag_is_inaccurate(ct_to_alloc_tag(ct));
 }
 
 /*
@@ -241,7 +245,9 @@ static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
  * Verifies whether a given codetag satisfies the active filtering criteria by
  * matching its characteristics against the specified filter.
  */
-static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter)
+static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter,
+			   struct alloc_tag_counters *counters,
+			   bool *fetched_counters)
 {
 	if (!filter || !filter->mask)
 		return true;
@@ -268,6 +274,19 @@ static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter)
 	    ct->lineno != filter->fields.lineno)
 		return false;
 
+	if (filter->mask & (ALLOCINFO_FILTER_MASK_MIN_SIZE | ALLOCINFO_FILTER_MASK_MAX_SIZE)) {
+		if (!*fetched_counters) {
+			*counters = allocinfo_prefetch_counters(ct);
+			*fetched_counters = true;
+		}
+		if ((filter->mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) &&
+		    counters->bytes < filter->min_size)
+			return false;
+		if ((filter->mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) &&
+		    counters->bytes > filter->max_size)
+			return false;
+	}
+
 	return true;
 }
 
@@ -281,6 +300,8 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
 	struct codetag *ct;
 	struct allocinfo_get_at params = {0};
 	__u64 skip_count;
+	struct alloc_tag_counters counters;
+	bool fetched_counters;
 
 	if (copy_from_user(&params, arg, sizeof(params)))
 		return -EFAULT;
@@ -288,6 +309,11 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
 	if (params.filter.mask & ~ALLOCINFO_FILTER_MASKS)
 		return -EINVAL;
 
+	if ((params.filter.mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) &&
+	    (params.filter.mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) &&
+	    params.filter.min_size > params.filter.max_size)
+		return -EINVAL;
+
 	priv = m->private;
 
 	mutex_lock(&priv->ioctl_lock);
@@ -311,7 +337,8 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
 	ct = codetag_next_ct(&priv->ioctl_iter);
 
 	while (ct) {
-		if (matches_filter(ct, &priv->filter)) {
+		fetched_counters = false;
+		if (matches_filter(ct, &priv->filter, &counters, &fetched_counters)) {
 			if (skip_count == 0)
 				break;
 			skip_count--;
@@ -320,7 +347,9 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
 	}
 
 	if (ct) {
-		allocinfo_to_params(ct, &params.data);
+		if (!fetched_counters)
+			counters = allocinfo_prefetch_counters(ct);
+		allocinfo_to_params(ct, &params.data, &counters);
 		priv->positioned = true;
 	}
 
@@ -346,6 +375,8 @@ static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
 	struct codetag *ct;
 	struct allocinfo_tag_data params;
 	int ret = 0;
+	struct alloc_tag_counters counters;
+	bool fetched_counters;
 
 	memset(&params, 0, sizeof(params));
 	priv = m->private;
@@ -359,11 +390,18 @@ static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
 	}
 
 	ct = codetag_next_ct(&priv->ioctl_iter);
-	while (ct && !matches_filter(ct, &priv->filter))
+	while (ct) {
+		fetched_counters = false;
+		if (matches_filter(ct, &priv->filter, &counters, &fetched_counters))
+			break;
 		ct = codetag_next_ct(&priv->ioctl_iter);
-	if (ct)
-		allocinfo_to_params(ct, &params);
+	}
 
+	if (ct) {
+		if (!fetched_counters)
+			counters = allocinfo_prefetch_counters(ct);
+		allocinfo_to_params(ct, &params, &counters);
+	}
 	if (!ct) {
 		priv->positioned = false;
 		ret = -ENOENT;
-- 
2.55.0.rc0.786.g65d90a0328-goog


^ permalink raw reply related

* [PATCH v6 2/6] alloc_tag: add ioctl filters to /proc/allocinfo
From: Abhishek Bapat @ 2026-06-18 17:36 UTC (permalink / raw)
  To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
  Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
	Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781803482.git.abhishekbapat@google.com>

Extend the capability of the IOCTL mechanism to filter allocations based
on a tag's module name, function name, file name and line number.

Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
Acked-by: Hao Ge <hao.ge@linux.dev>
Acked-by: Suren Baghdasaryan <surenb@google.com>
---
 include/uapi/linux/alloc_tag.h | 26 ++++++++++++-
 lib/alloc_tag.c                | 68 ++++++++++++++++++++++++++++++++--
 2 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
index ee6a023cbaf4..13e9b5916bf5 100644
--- a/include/uapi/linux/alloc_tag.h
+++ b/include/uapi/linux/alloc_tag.h
@@ -45,8 +45,32 @@ struct allocinfo_tag_data {
 	struct allocinfo_counter counter;
 };
 
+enum {
+	ALLOCINFO_FILTER_MODNAME,
+	ALLOCINFO_FILTER_FUNCTION,
+	ALLOCINFO_FILTER_FILENAME,
+	ALLOCINFO_FILTER_LINENO,
+	__ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_LINENO
+};
+
+#define ALLOCINFO_FILTER_MASK_MODNAME		(1 << ALLOCINFO_FILTER_MODNAME)
+#define ALLOCINFO_FILTER_MASK_FUNCTION		(1 << ALLOCINFO_FILTER_FUNCTION)
+#define ALLOCINFO_FILTER_MASK_FILENAME		(1 << ALLOCINFO_FILTER_FILENAME)
+#define ALLOCINFO_FILTER_MASK_LINENO		(1 << ALLOCINFO_FILTER_LINENO)
+
+#define ALLOCINFO_FILTER_MASKS \
+	((1 << (__ALLOCINFO_FILTER_LAST + 1)) - 1)
+
+struct allocinfo_filter {
+	__u64 mask; /* bitmask of the filter fields used */
+	struct allocinfo_tag fields;
+};
+
 struct allocinfo_get_at {
-	__u64 pos;	/* input */
+	/* inputs */
+	__u64 pos;
+	struct allocinfo_filter filter;
+	/* output */
 	struct allocinfo_tag_data data;
 };
 
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index c73195000830..f00d731b81cf 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -49,6 +49,7 @@ struct allocinfo_private {
 	struct codetag_iterator iter;
 	struct codetag_iterator reported_iter;
 	bool print_header;
+	struct allocinfo_filter filter;
 	/* ioctl uses a separate iterator not to interfere with reads */
 	struct codetag_iterator ioctl_iter;
 	bool positioned; /* seq_open_private() sets to 0 */
@@ -191,6 +192,12 @@ static void allocinfo_copy_str(char *dest, const char *src)
 	strscpy_pad(dest, allocinfo_str(src), ALLOCINFO_STR_SIZE);
 }
 
+/* Compare str against template, using only the trimmed suffix of str if it is too long */
+static int allocinfo_cmp_str(const char *str, const char *template)
+{
+	return strncmp(allocinfo_str(str), template, ALLOCINFO_STR_SIZE);
+}
+
 /*
  * Populates the UAPI allocinfo_tag_data structure with active runtime
  * profiling counters extracted from the given kernel codetag.
@@ -230,6 +237,40 @@ static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
 	return 0;
 }
 
+/*
+ * Verifies whether a given codetag satisfies the active filtering criteria by
+ * matching its characteristics against the specified filter.
+ */
+static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter)
+{
+	if (!filter || !filter->mask)
+		return true;
+
+	if (filter->mask & ALLOCINFO_FILTER_MASK_MODNAME) {
+		/* user wants to filter by modname but ct->modname is NULL */
+		if (!ct->modname) {
+			/* validate if user was attempting to filter for built-in allocations */
+			if (filter->fields.modname[0] != '\0')
+				return false;
+		} else if (allocinfo_cmp_str(ct->modname, filter->fields.modname))
+			return false;
+	}
+
+	if ((filter->mask & ALLOCINFO_FILTER_MASK_FUNCTION) &&
+	    ct->function && allocinfo_cmp_str(ct->function, filter->fields.function))
+		return false;
+
+	if ((filter->mask & ALLOCINFO_FILTER_MASK_FILENAME) &&
+	    ct->filename && allocinfo_cmp_str(ct->filename, filter->fields.filename))
+		return false;
+
+	if ((filter->mask & ALLOCINFO_FILTER_MASK_LINENO) &&
+	    ct->lineno != filter->fields.lineno)
+		return false;
+
+	return true;
+}
+
 /*
  * Seeks the ioctl iterator to the specified 0-indexed tag position, reads its
  * profiling data and returns it to userspace.
@@ -238,29 +279,46 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
 {
 	struct allocinfo_private *priv;
 	struct codetag *ct;
-	__u64 pos;
 	struct allocinfo_get_at params = {0};
+	__u64 skip_count;
 
 	if (copy_from_user(&params, arg, sizeof(params)))
 		return -EFAULT;
 
+	if (params.filter.mask & ~ALLOCINFO_FILTER_MASKS)
+		return -EINVAL;
+
 	priv = m->private;
-	pos = params.pos;
 
 	mutex_lock(&priv->ioctl_lock);
 	codetag_lock_module_list(alloc_tag_cttype);
 
-	if (pos >= codetag_get_count(alloc_tag_cttype)) {
+	if (params.pos >= codetag_get_count(alloc_tag_cttype)) {
 		codetag_unlock_module_list(alloc_tag_cttype);
 		mutex_unlock(&priv->ioctl_lock);
 		return -ENOENT;
 	}
 
+	skip_count = params.pos;
+
+	if (params.filter.mask)
+		priv->filter = params.filter;
+	else
+		priv->filter.mask = 0;
+
 	/* Find the codetag */
 	priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
 	ct = codetag_next_ct(&priv->ioctl_iter);
-	while (ct && pos--)
+
+	while (ct) {
+		if (matches_filter(ct, &priv->filter)) {
+			if (skip_count == 0)
+				break;
+			skip_count--;
+		}
 		ct = codetag_next_ct(&priv->ioctl_iter);
+	}
+
 	if (ct) {
 		allocinfo_to_params(ct, &params.data);
 		priv->positioned = true;
@@ -301,6 +359,8 @@ static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
 	}
 
 	ct = codetag_next_ct(&priv->ioctl_iter);
+	while (ct && !matches_filter(ct, &priv->filter))
+		ct = codetag_next_ct(&priv->ioctl_iter);
 	if (ct)
 		allocinfo_to_params(ct, &params);
 
-- 
2.55.0.rc0.786.g65d90a0328-goog


^ permalink raw reply related

* [PATCH v6 1/6] alloc_tag: add ioctl to /proc/allocinfo
From: Abhishek Bapat @ 2026-06-18 17:36 UTC (permalink / raw)
  To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
  Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
	Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781803482.git.abhishekbapat@google.com>

From: Suren Baghdasaryan <surenb@google.com>

Add the following ioctl commands for /proc/allocinfo file:

ALLOCINFO_IOC_CONTENT_ID - gets content identifier which can be used
to check whether the file content has changed specifically due to module
load/unload. Every time a module is loaded / unloaded, the returned
value will be different. By comparing the identifier value at the
beginning and at the end of the content retrieval operation, users can
validate retrieved information for consistency.

ALLOCINFO_IOC_GET_AT - gets the record at the specified position. This
is the position of a record in /proc/allocinfo.

ALLOCINFO_IOC_GET_NEXT - gets the record following the last retrieved
one. If no records were previously retrieved, returns the first
record.

Note, function, file and module names often have the same prefixes,
therefore when filtering for them, we compare the last 64 characters to
minimize the chances of name collisions.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
Acked-by: Hao Ge <hao.ge@linux.dev>
---
 Documentation/mm/allocation-profiling.rst     |   5 +
 .../userspace-api/ioctl/ioctl-number.rst      |   2 +
 MAINTAINERS                                   |   1 +
 include/linux/codetag.h                       |   2 +
 include/uapi/linux/alloc_tag.h                |  65 +++++
 lib/alloc_tag.c                               | 238 +++++++++++++++++-
 lib/codetag.c                                 |  18 ++
 7 files changed, 329 insertions(+), 2 deletions(-)
 create mode 100644 include/uapi/linux/alloc_tag.h

diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst
index 5389d241176a..c3a28467955f 100644
--- a/Documentation/mm/allocation-profiling.rst
+++ b/Documentation/mm/allocation-profiling.rst
@@ -46,6 +46,11 @@ sysctl:
 Runtime info:
   /proc/allocinfo
 
+  Profiling data can be retrieved either by reading `/proc/allocinfo` directly as
+  text or programmatically via `ioctl()` calls defined in `<uapi/linux/alloc_tag.h>`.
+  The ioctl interface supports structured binary data extraction as well as filtering
+  by module name, function, file, line number, accuracy, or allocation size limits.
+
 Example output::
 
   root@moria-kvm:~# sort -g /proc/allocinfo|tail|numfmt --to=iec
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 331223761fff..84f6808a8578 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -349,6 +349,8 @@ Code  Seq#    Include File                                             Comments
                                                                        <mailto:luzmaximilian@gmail.com>
 0xA5  20-2F  linux/surface_aggregator/dtx.h                            Microsoft Surface DTX driver
                                                                        <mailto:luzmaximilian@gmail.com>
+0xA6  00-0F  uapi/linux/alloc_tag.h                                    Memory allocation profiling
+                                                                       <mailto:surenb@google.com>
 0xAA  00-3F  linux/uapi/linux/userfaultfd.h
 0xAB  00-1F  linux/nbd.h
 0xAC  00-1F  linux/raw.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 65bd4328fe05..019cc4c285a3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16713,6 +16713,7 @@ S:	Maintained
 F:	Documentation/mm/allocation-profiling.rst
 F:	include/linux/alloc_tag.h
 F:	include/linux/pgalloc_tag.h
+F:	include/uapi/linux/alloc_tag.h
 F:	lib/alloc_tag.c
 
 MEMORY CONTROLLER DRIVERS
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index ddae7484ca45..a25a085c2df1 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -77,6 +77,8 @@ struct codetag_iterator {
 void codetag_lock_module_list(struct codetag_type *cttype);
 bool codetag_trylock_module_list(struct codetag_type *cttype);
 void codetag_unlock_module_list(struct codetag_type *cttype);
+unsigned long codetag_get_content_id(struct codetag_type *cttype);
+unsigned int codetag_get_count(struct codetag_type *cttype);
 struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
 struct codetag *codetag_next_ct(struct codetag_iterator *iter);
 
diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
new file mode 100644
index 000000000000..ee6a023cbaf4
--- /dev/null
+++ b/include/uapi/linux/alloc_tag.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * alloc_tag IOCTL API definition
+ *
+ * Copyright (C) 2026 Google, LLC.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _UAPI_ALLOC_TAG_H
+#define _UAPI_ALLOC_TAG_H
+
+#include <linux/types.h>
+
+/*
+ * Function, file and module names often have the same prefixes, therefore
+ * when filtering by these criteria, we compare the last 64 characters to
+ * minimize the chances of name collisions
+ */
+#define ALLOCINFO_STR_SIZE	64
+
+struct allocinfo_content_id {
+	__u64 id;
+};
+
+struct allocinfo_tag {
+	/* Longer names are trimmed */
+	char modname[ALLOCINFO_STR_SIZE];
+	char function[ALLOCINFO_STR_SIZE];
+	char filename[ALLOCINFO_STR_SIZE];
+	__u64 lineno;
+};
+
+/* The alignment ensures 32-bit compatible interfaces are not broken */
+struct allocinfo_counter {
+	__u64 bytes;
+	__u64 calls;
+	__u8 accurate;
+} __attribute__((aligned(8)));
+
+struct allocinfo_tag_data {
+	struct allocinfo_tag tag;
+	struct allocinfo_counter counter;
+};
+
+struct allocinfo_get_at {
+	__u64 pos;	/* input */
+	struct allocinfo_tag_data data;
+};
+
+#define _ALLOCINFO_IOC_CONTENT_ID	0
+#define _ALLOCINFO_IOC_GET_AT		1
+#define _ALLOCINFO_IOC_GET_NEXT		2
+
+#define ALLOCINFO_IOC_BASE		0xA6
+#define ALLOCINFO_IOC_CONTENT_ID	_IOR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_CONTENT_ID,	\
+					     struct allocinfo_content_id)
+#define ALLOCINFO_IOC_GET_AT		_IOWR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_GET_AT,	\
+					      struct allocinfo_get_at)
+#define ALLOCINFO_IOC_GET_NEXT		_IOR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_GET_NEXT,	\
+					     struct allocinfo_tag_data)
+
+#endif /* _UAPI_ALLOC_TAG_H */
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index d9be1cf5187d..c73195000830 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -5,6 +5,7 @@
 #include <linux/gfp.h>
 #include <linux/kallsyms.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/page_ext.h>
 #include <linux/pgalloc_tag.h>
 #include <linux/proc_fs.h>
@@ -14,6 +15,7 @@
 #include <linux/string_choices.h>
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
+#include <uapi/linux/alloc_tag.h>
 
 #define ALLOCINFO_FILE_NAME		"allocinfo"
 #define MODULE_ALLOC_TAG_VMAP_SIZE	(100000UL * sizeof(struct alloc_tag))
@@ -47,6 +49,10 @@ struct allocinfo_private {
 	struct codetag_iterator iter;
 	struct codetag_iterator reported_iter;
 	bool print_header;
+	/* ioctl uses a separate iterator not to interfere with reads */
+	struct codetag_iterator ioctl_iter;
+	bool positioned; /* seq_open_private() sets to 0 */
+	struct mutex ioctl_lock;
 };
 
 static void *allocinfo_start(struct seq_file *m, loff_t *pos)
@@ -130,6 +136,235 @@ static const struct seq_operations allocinfo_seq_op = {
 	.show	= allocinfo_show,
 };
 
+/*
+ * Initializes seq_file operations and allocates private state when opening
+ * the /proc/allocinfo procfs entry.
+ */
+static int allocinfo_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open_private(file, &allocinfo_seq_op,
+			       sizeof(struct allocinfo_private));
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		struct allocinfo_private *priv = m->private;
+
+		mutex_init(&priv->ioctl_lock);
+	}
+	return ret;
+}
+
+/*
+ * Cleans up the seq_file state and frees up the private state allocated in
+ * allocinfo_open() when closing the /proc/allocinfo file descriptor.
+ */
+static int allocinfo_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct allocinfo_private *priv = m->private;
+
+	mutex_destroy(&priv->ioctl_lock);
+	return seq_release_private(inode, file);
+}
+
+/*
+ * Returns a pointer to the suffix of a string so that its length fits within
+ * ALLOCINFO_STR_SIZE, preserving the trailing characters.
+ * Function, file and module names often have the same prefixes, therefore
+ * when filtering by these criteria, we compare the last 64 characters to
+ * minimize the chances of name collisions
+ */
+static const char *allocinfo_str(const char *str)
+{
+	size_t len = strlen(str);
+
+	/* Keep an extra space for the trailing NUL. */
+	if (len >= ALLOCINFO_STR_SIZE)
+		str += (len - ALLOCINFO_STR_SIZE) + 1;
+	return str;
+}
+
+/* Copy a string and trim from the beginning if it's too long */
+static void allocinfo_copy_str(char *dest, const char *src)
+{
+	strscpy_pad(dest, allocinfo_str(src), ALLOCINFO_STR_SIZE);
+}
+
+/*
+ * Populates the UAPI allocinfo_tag_data structure with active runtime
+ * profiling counters extracted from the given kernel codetag.
+ */
+static void allocinfo_to_params(struct codetag *ct,
+				struct allocinfo_tag_data *data)
+{
+	struct alloc_tag *tag = ct_to_alloc_tag(ct);
+	struct alloc_tag_counters counter = alloc_tag_read(tag);
+
+	if (ct->modname)
+		allocinfo_copy_str(data->tag.modname, ct->modname);
+	else
+		data->tag.modname[0] = '\0';
+	allocinfo_copy_str(data->tag.function, ct->function);
+	allocinfo_copy_str(data->tag.filename, ct->filename);
+	data->tag.lineno = ct->lineno;
+	data->counter.bytes = counter.bytes;
+	data->counter.calls = counter.calls;
+	data->counter.accurate = !alloc_tag_is_inaccurate(tag);
+}
+
+/*
+ * Retrieves the unique content ID representing the current allocation tag module
+ * layout, allowing userspace to detect if modules were loaded / unloaded.
+ */
+static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
+{
+	struct allocinfo_content_id params;
+
+	codetag_lock_module_list(alloc_tag_cttype);
+	params.id = codetag_get_content_id(alloc_tag_cttype);
+	codetag_unlock_module_list(alloc_tag_cttype);
+	if (copy_to_user(arg, &params, sizeof(params)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Seeks the ioctl iterator to the specified 0-indexed tag position, reads its
+ * profiling data and returns it to userspace.
+ */
+static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
+{
+	struct allocinfo_private *priv;
+	struct codetag *ct;
+	__u64 pos;
+	struct allocinfo_get_at params = {0};
+
+	if (copy_from_user(&params, arg, sizeof(params)))
+		return -EFAULT;
+
+	priv = m->private;
+	pos = params.pos;
+
+	mutex_lock(&priv->ioctl_lock);
+	codetag_lock_module_list(alloc_tag_cttype);
+
+	if (pos >= codetag_get_count(alloc_tag_cttype)) {
+		codetag_unlock_module_list(alloc_tag_cttype);
+		mutex_unlock(&priv->ioctl_lock);
+		return -ENOENT;
+	}
+
+	/* Find the codetag */
+	priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
+	ct = codetag_next_ct(&priv->ioctl_iter);
+	while (ct && pos--)
+		ct = codetag_next_ct(&priv->ioctl_iter);
+	if (ct) {
+		allocinfo_to_params(ct, &params.data);
+		priv->positioned = true;
+	}
+
+	codetag_unlock_module_list(alloc_tag_cttype);
+	mutex_unlock(&priv->ioctl_lock);
+
+	if (!ct)
+		return -ENOENT;
+
+	if (copy_to_user(arg, &params, sizeof(params)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Advances the ioctl iterator to the next allocation tag in the sequence and
+ * returns its profiling data to userspace.
+ */
+static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
+{
+	struct allocinfo_private *priv;
+	struct codetag *ct;
+	struct allocinfo_tag_data params;
+	int ret = 0;
+
+	memset(&params, 0, sizeof(params));
+	priv = m->private;
+
+	mutex_lock(&priv->ioctl_lock);
+	codetag_lock_module_list(alloc_tag_cttype);
+
+	if (!priv->positioned) {
+		priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
+		priv->positioned = true;
+	}
+
+	ct = codetag_next_ct(&priv->ioctl_iter);
+	if (ct)
+		allocinfo_to_params(ct, &params);
+
+	if (!ct) {
+		priv->positioned = false;
+		ret = -ENOENT;
+	}
+	codetag_unlock_module_list(alloc_tag_cttype);
+	mutex_unlock(&priv->ioctl_lock);
+
+	if (ret == 0) {
+		if (copy_to_user(arg, &params, sizeof(params)))
+			return -EFAULT;
+	}
+	return ret;
+}
+
+/*
+ * Entry point ioctl function for /proc/allocinfo routing requests to fetch the
+ * layout content ID, seek to a specific tag, or read sequential tags.
+ */
+static long allocinfo_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long __arg)
+{
+	void __user *arg = (void __user *)__arg;
+	int ret;
+
+	switch (cmd) {
+	case ALLOCINFO_IOC_CONTENT_ID:
+		ret = allocinfo_ioctl_get_content_id(file->private_data, arg);
+		break;
+	case ALLOCINFO_IOC_GET_AT:
+		ret = allocinfo_ioctl_get_at(file->private_data, arg);
+		break;
+	case ALLOCINFO_IOC_GET_NEXT:
+		ret = allocinfo_ioctl_get_next(file->private_data, arg);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long allocinfo_compat_ioctl(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	return allocinfo_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct proc_ops allocinfo_proc_ops = {
+	.proc_open		= allocinfo_open,
+	.proc_read_iter		= seq_read_iter,
+	.proc_lseek		= seq_lseek,
+	.proc_release		= allocinfo_release,
+	.proc_ioctl		= allocinfo_ioctl,
+#ifdef CONFIG_COMPAT
+	.proc_compat_ioctl	= allocinfo_compat_ioctl,
+#endif
+};
+
 size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep)
 {
 	struct codetag_iterator iter;
@@ -993,8 +1228,7 @@ static int __init alloc_tag_init(void)
 		return 0;
 	}
 
-	if (!proc_create_seq_private(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op,
-				     sizeof(struct allocinfo_private), NULL)) {
+	if (!proc_create(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_proc_ops)) {
 		pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
 		shutdown_mem_profiling(false);
 		return -ENOMEM;
diff --git a/lib/codetag.c b/lib/codetag.c
index 4001a7ea6675..a9cda4c962a3 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -19,6 +19,8 @@ struct codetag_type {
 	struct codetag_type_desc desc;
 	/* generates unique sequence number for module load */
 	unsigned long next_mod_seq;
+	/* bumped on every module load and unload */
+	unsigned long content_id;
 };
 
 struct codetag_range {
@@ -50,6 +52,20 @@ void codetag_unlock_module_list(struct codetag_type *cttype)
 	up_read(&cttype->mod_lock);
 }
 
+unsigned long codetag_get_content_id(struct codetag_type *cttype)
+{
+	lockdep_assert_held(&cttype->mod_lock);
+
+	return cttype->content_id;
+}
+
+unsigned int codetag_get_count(struct codetag_type *cttype)
+{
+	lockdep_assert_held(&cttype->mod_lock);
+
+	return cttype->count;
+}
+
 struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
 {
 	struct codetag_iterator iter = {
@@ -204,6 +220,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 
 	down_write(&cttype->mod_lock);
 	cmod->mod_seq = ++cttype->next_mod_seq;
+	++cttype->content_id;
 	mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
 	if (mod_id >= 0) {
 		if (cttype->desc.module_load) {
@@ -368,6 +385,7 @@ void codetag_unload_module(struct module *mod)
 			cttype->count -= range_size(cttype, &cmod->range);
 			idr_remove(&cttype->mod_idr, mod_id);
 			kfree(cmod);
+			++cttype->content_id;
 		}
 		up_write(&cttype->mod_lock);
 		if (found && cttype->desc.free_section_mem)
-- 
2.55.0.rc0.786.g65d90a0328-goog


^ permalink raw reply related

* [PATCH v6 0/6] alloc_tag: introduce IOCTL-based filtering for MAP
From: Abhishek Bapat @ 2026-06-18 17:36 UTC (permalink / raw)
  To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
  Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
	Sourav Panda, Abhishek Bapat

Currently, memory allocation profiling data is primarily exposed through
/proc/allocinfo. While useful for manual inspection, this text-based
interface poses challenges for production monitoring and large-scale
analysis:

1. Userspace must parse large amounts of text to extract specific
fields.
2. To find specific tags, userspace must read the entire dataset,
requiring many context switches and a large amount of data copying.
3. The kernel currently aggregates per-CPU counters for every allocation
size, even those the user intends to filter out immediately.

This series introduces a new IOCTL-based binary interface for allocinfo
that supports kernel-side filtering. By allowing the user to specify a
filter mask, we significantly reduce the work performed in-kernel and
the amount of data transferred to userspace. The IOCTL mechanism was
chosen for allocinfo to address the per-CPU counter aggregation
bottleneck. A traditional read() operation must report the total
allocation count and sizes for every code tag in the system. Doing so
requires iterating across all CPUs to sum their per-CPU counters for
thousands of tags, which introduces substantial runtime overhead.

The IOCTL interface allows userspace to push selective filtering
criteria directly into the kernel before the per-CPU counter
aggregation. The kernel aggregates per-CPU counters only for a small
subset of tags that match the filter. This results in significant
performance improvement.

Beyond fast filtered retrieval, the IOCTL foundation allows introducing
a context capture mechanism in the future to capture the context for
specific allocations.

Performance measurements were conducted on an Intel Xeon Platinum 8481C
(224 CPUs) with caches dropped before each run.

The IOCTL mechanism shows a ~20x performance improvement for
filtered queries. The kernel avoids the expensive per-CPU counter
aggregation (alloc_tag_read) for any tags that fail the initial string
or location filters.

Scenario 1: Specific File Filtering (arch/x86/events/rapl.c)
1. Traditional (cat /proc/allocinfo | grep): 22ms (sys)
2. IOCTL Interface: 1ms (sys)

Scenario 2: Compound Filtering (Filename + Size)
1. Traditional: (cat ... | grep | awk): 21ms (sys)
2. IOCTL Interface: 1ms (sys)

Scenario 3: Size-Based Filtering (min_size = 1MB)
1. Traditional: (cat ... | awk): 21ms (sys)
2. IOCTL Interface: 14ms (sys)

v6 changes:
- Patch 1/6: Added comments explaining why the last 64 characters are
  compared in the filter.
- Patch 3/6: Moved allocinfo_prefetch_counters outside of
  allocinfo_to_params
- Patch 5/6: Fixed fd leak in get_filtered_ioctl_entries() function.
  Added alloc_tag selftest to the top-level Makefile.
- Patch 6/6: Moved include for errno.h to this patch.

v5 changes:
- Patch 1/6: Added explicit mutex_destroy.
- Patch 5/6: Self-contained file descriptors to avoid wrap-around errors
  in retry loops.
- Patch 6/6: Fixed minor issues raised by sashiko in v4.

v4 changes:
- Patch 1/6: Fixed a copyright comment inside
  include/uapi/linux/alloc_tag.h
- Patch 3/6: Among other nits, fixed the inadvertent build failure
  introduced in v3.
- Patch 4/6: Included a comment stating that the inaccurate field in
  struct allocinfo_tag is only used for filtering.
- Patch 5/6: Modified test to trim prefix and keep suffix for entries
  with filenames exceeding the size limit.
- Patch 6/6: Modified test_size_filter such that if content_id changes
  between the moment when procfs and ioctl entries are read, both
entries are invalidated and re-fetched. Removed the tags->count == 0
check from test_lineno_filter as it's virtually unreachable.

v3 changes:
- Patch 1/6: Modified Documentation to indicate that map supports
  ioctl(). Modified struct allocinfo_count to use
__attribute__((aligned(8))) instead of manual padding. Removed
redundant type-casting. Added comments for static functions in
lib/alloc_tag.c. Introduced a new seq counter for content_id that gets
bumped every time module is loaded / unloaded. Introduced logic to
validate user specified position is not greater than number of
allocation tags and return early if it is. Changed strscpy to
strscpy_pad to not echo arbitrary user data back to the user.
- Patch 2/6: Handled the case where user wants to specifically filter
  for built-in modules. Included some comments for static functions.
- Patch 3/6: Modified logic to only fetch per-CPU counters for codetags
  that satisfy other filters. Included some comments for static
functions.

v2 changes:
- Patch 1/6: Introduced locking for m->private. Also included the new uapi
header file in MAINTAINERS list.
- Patch 2/6: Handled the case where ALLOCINFO_FILTER_MASK_MODNAME is
passed but ct->modname is NULL.
- Patch 3/6: Moved min_size and max_size outside of struct allocinfo_tag
into struct allocinfo_filter. Added validation that min_size <=
max_size. Prefetched alloc_tag_counters if size based filter masks are
provided to avoid assimilating per-cpu counters twice.
- Patch 5/6: Removed the hardcoded logic to skip the header, instead the
test will skip lines that don't match the format. Also included the
newly added alloc_tag selftests directory in MAINTAINERS list.

Abhishek Bapat (5):
  alloc_tag: add ioctl filters to /proc/allocinfo
  alloc_tag: add size-based filtering to ioctl
  alloc_tag: add accuracy based filtering to ioctl
  kselftest: alloc_tag: add kselftest for ioctl interface
  kselftest: alloc_tag: extend the allocinfo ioctl kselftest

Suren Baghdasaryan (1):
  alloc_tag: add ioctl to /proc/allocinfo

 Documentation/mm/allocation-profiling.rst     |   5 +
 .../userspace-api/ioctl/ioctl-number.rst      |   2 +
 MAINTAINERS                                   |   2 +
 include/linux/codetag.h                       |   2 +
 include/uapi/linux/alloc_tag.h                |  99 ++++
 lib/alloc_tag.c                               | 344 +++++++++++-
 lib/codetag.c                                 |  18 +
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/alloc_tag/Makefile    |   9 +
 .../alloc_tag/allocinfo_ioctl_test.c          | 531 ++++++++++++++++++
 10 files changed, 1011 insertions(+), 2 deletions(-)
 create mode 100644 include/uapi/linux/alloc_tag.h
 create mode 100644 tools/testing/selftests/alloc_tag/Makefile
 create mode 100644 tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c

-- 
2.55.0.rc0.786.g65d90a0328-goog


^ permalink raw reply

* Re: [PATCH v3 06/13] tick/nohz, context_tracking: Prepare for runtime nohz_full updates
From: Thomas Gleixner @ 2026-06-18 17:27 UTC (permalink / raw)
  To: Jing Wu, Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Mathieu Desnoyers, Lai Jiangshan, Zqiang,
	Anna-Maria Behnsen, Tejun Heo, Jonathan Corbet, Shuah Khan,
	Shuah Khan
  Cc: linux-kernel, rcu, cgroups, linux-doc, linux-kselftest, Jing Wu,
	Qiliang Yuan
In-Reply-To: <20260618-wujing-dhm-v3-6-28f1a4d83b68@gmail.com>

On Thu, Jun 18 2026 at 11:11, Jing Wu wrote:
> Remove __init from ct_cpu_track_user() and __initdata from the
> initialized flag so context tracking can be activated on CPUs that
> join nohz_full at runtime.  Drop the __ro_after_init attribute from
> the context_tracking_key static key, allowing static_branch_dec()
> when a CPU leaves nohz_full.
>
> Add ct_cpu_untrack_user() to reverse ct_cpu_track_user(), decrementing
> the static key and clearing the per-CPU tracking state.

Please do not enumerate WHAT the patch is doing. Explain the context and
the WHY

  https://docs.kernel.org/process/maintainer-tip.html#changelog


>  
>  #include <asm/irq_regs.h>
> @@ -653,11 +654,6 @@ void __init tick_nohz_init(void)
>  	if (!tick_nohz_full_running)
>  		return;
>  
> -	/*
> -	 * Full dynticks uses IRQ work to drive the tick rescheduling on safe
> -	 * locking contexts. But then we need IRQ work to raise its own
> -	 * interrupts to avoid circular dependency on the tick.
> -	 */

This comment is removed because it's no longer correct? How is this
related to $Subject?

>  	if (!arch_irq_work_has_interrupt()) {
>  		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
>  		cpumask_clear(tick_nohz_full_mask);
> @@ -676,6 +672,16 @@ void __init tick_nohz_init(void)
>  		}
>  	}
>  
> +	/*
> +	 * Pre-initialize context tracking for all possible CPUs so
> +	 * ctx tracking is already active when a CPU is later added to
> +	 * nohz_full at runtime.  The tracking overhead is negligible
> +	 * because the static key is not incremented yet — only per-CPU
> +	 * tracking state is set up.
> +	 */
> +	if (IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER_FORCE))
> +		context_tracking_init();

Seriously? Care to look where and when context_tracking_init() is invoked?

>  	for_each_cpu(cpu, tick_nohz_full_mask)
>  		ct_cpu_track_user(cpu);
>  
> @@ -686,6 +692,147 @@ void __init tick_nohz_init(void)
>  	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
>  		cpumask_pr_args(tick_nohz_full_mask));
>  }
> +
> +static int tick_nohz_hk_validate(enum hk_type type,
> +				 const struct cpumask *cur_mask,
> +				 const struct cpumask *new_mask)
> +{
> +	if (!IS_ENABLED(CONFIG_NO_HZ_FULL))
> +		return -EOPNOTSUPP;
> +	return 0;
> +}

Why is this code even compiled when CONFIG_NO_HZ_FULL is not enabled?

> +
> +static void tick_nohz_hk_apply(enum hk_type type)
> +{
> +	static DEFINE_SPINLOCK(tick_nohz_lock);
> +	cpumask_var_t nohz_full, added, removed;
> +	bool was_running;
> +	int cpu;
> +
> +	if (!alloc_cpumask_var(&nohz_full, GFP_KERNEL))
> +		return;

This looks more than wrong. If this fails then the core code will
happily proceed with the completely wrong state.

> +	if (!alloc_cpumask_var(&added, GFP_KERNEL)) {
> +		free_cpumask_var(nohz_full);
> +		return;
> +	}
> +	if (!alloc_cpumask_var(&removed, GFP_KERNEL)) {
> +		free_cpumask_var(added);
> +		free_cpumask_var(nohz_full);
> +		return;
> +	}

        cpumask_var_t __free(free_cpumask_var) a = CPUMASK_VAR_NULL;
        cpumask_var_t __free(free_cpumask_var) b = CPUMASK_VAR_NULL;
        cpumask_var_t __free(free_cpumask_var) c = CPUMASK_VAR_NULL;

        if (!alloc_cpumask_var(&a, GFP_KERNEL))
        	return -ENOMEM;
        ....

> +
> +	/*
> +	 * Snapshot the new HK_TYPE_KERNEL_NOISE mask under an RCU read lock.
> +	 * housekeeping_update_types() completes synchronize_rcu() before
> +	 * invoking apply(), so the new pointer is stable; however the lockdep
> +	 * annotation in housekeeping_cpumask() still requires an RCU read-side
> +	 * critical section for runtime-mutable types.

This comment is explaining the obvious: housekeeping_cpumask_rcu()

> +	 */
> +	rcu_read_lock();

        scoped_guard(rcu)


> +	cpumask_andnot(nohz_full, cpu_possible_mask,
> +		       housekeeping_cpumask_rcu(HK_TYPE_KERNEL_NOISE));
> +	rcu_read_unlock();
> +
> +	/*
> +	 * When "nohz_full=" was not passed at boot, tick_nohz_full_running is
> +	 * false and the full dynticks infrastructure (sched_tick_offload_init,
> +	 * RCU nohz quiescent-state reporting, context-tracking bootstrap) was
> +	 * never initialised.  In that case restrict the update to
> +	 * tick_nohz_full_mask so the /sys/devices/system/cpu/nohz_full sysfs
> +	 * attribute reflects DHM-isolated CPUs without enabling tick
> +	 * suppression, context tracking, or timer migration – all of which
> +	 * require boot-time setup and would deadlock on the first
> +	 * synchronize_rcu() call after CPUs are offlined.

What? You tell user space that the CPUs are nohz_full by updating the
mask, which is exposed in sysfs, which is blatantly wrong.

> +	 */
> +	was_running = READ_ONCE(tick_nohz_full_running);

Q: This READ_ONCE() pairs with which WRITE_ONCE()? 
A: With none, so it's just voodoo programming.

> +	spin_lock(&tick_nohz_lock);

This lock protects against the housekeeping core code invoking the apply
callback multiple times in parallel, right?

If that happens then there are bigger problems than corrupted masks.

> +	/*
> +	 * When nohz_full= was active at boot, compute the delta and update
> +	 * context tracking for CPUs joining or leaving the nohz_full set.
> +	 * Skip when !was_running: ct_cpu_track_user() calls
> +	 * static_branch_inc() which may sleep (jump_label_update on the
> +	 * 0→1 transition) – illegal inside a spinlock.

If you remove the pointless voodoo lock then this nonsense goes away too.

> +	 */
> +	if (IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER) &&
> +	    was_running &&
> +	    cpumask_available(tick_nohz_full_mask)) {

Why is this stuff even invoked when the mask is not available? If it's
not there then NOHZ full is not functional, period.

> +		cpumask_andnot(added, nohz_full, tick_nohz_full_mask);
> +		cpumask_andnot(removed, tick_nohz_full_mask, nohz_full);
> +		for_each_cpu(cpu, added)
> +			ct_cpu_track_user(cpu);
> +		for_each_cpu(cpu, removed)
> +			ct_cpu_untrack_user(cpu);
> +	}
> +
> +	/*
> +	 * Update tick_nohz_full_mask unconditionally: this is the snapshot
> +	 * read by the /sys/devices/system/cpu/nohz_full sysfs attribute and
> +	 * must reflect the current isolation set even in the DHM runtime case.
> +	 */
> +	if (cpumask_available(tick_nohz_full_mask))
> +		cpumask_copy(tick_nohz_full_mask, nohz_full);

Seriously?

> +	/*
> +	 * Only modify tick_nohz_full_running and migrate the global tick when
> +	 * nohz_full= was set at boot; without boot-time setup, setting
> +	 * tick_nohz_full_running would suppress ticks on isolated CPUs and
> +	 * prevent RCU quiescent-state reporting, causing synchronize_rcu()
> +	 * to stall permanently when a CPU is subsequently offlined.
> +	 */
> +	if (was_running) {

Again, why is any of this invoked when NOHZ full was never enabled and
initialized?

> +		tick_nohz_full_running = !cpumask_empty(nohz_full);

Brilliant. When NOHZ full was enabled on the command line, then changing
the mask can disable "running" and that makes it disabled forever. There
is no way to reenable it.

This 'was_running' check is just wrong. What you need is a
'tick_nohz_full_initialized' boolean, which is only true when nohz_full
was setup early on including the mask.

If that's not the case, then none of this code is supposed to run
ever. I.e. the callback is not installed in the first place.

> +	/*
> +	 * Ensure tick_nohz_full_mask is allocated so that tick_nohz_hk_apply()
> +	 * can update it (and the /sys/devices/system/cpu/nohz_full sysfs
> +	 * attribute) when CPUs are isolated at runtime via DHM.  If "nohz_full="
> +	 * was passed at boot the mask is already allocated; allocate an empty
> +	 * one here for the runtime-only case.

What's the runtime only case? The fake exposure in sysfs which is just
misleading the user? Not going to happen. If it's not enabled on the
command line then it's disabled, end of story.

> +	 */
> +	if (!cpumask_available(tick_nohz_full_mask) &&
> +	    !zalloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL))
> +		pr_warn("tick/nohz: failed to allocate nohz_full_mask for DHM\n");

ROTFL. If the allocation fails, then the apply callback becomes a
complete noop doing magic cpumask operations for nothing and pretending
to be successful.

Thanks,

        tglx

^ permalink raw reply

* Re: [PATCH v4 00/31] Introduce SCMI Telemetry FS support
From: David Hildenbrand (Arm) @ 2026-06-18 17:27 UTC (permalink / raw)
  To: Cristian Marussi, Christian Brauner
  Cc: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
	linux-doc, sudeep.holla, james.quinlan, f.fainelli,
	vincent.guittot, etienne.carriere, peng.fan, michal.simek, d-gole,
	jic23, elif.topuz, lukasz.luba, philip.radford,
	souvik.chakravarty, leitao, kas, puranjay, usama.arif,
	kernel-team
In-Reply-To: <29a304f0-1e62-418a-b84f-aabdc4c0de8d@kernel.org>

> Maybe you have it in some of the patches here, but what does the typical
> directory + file structure look like in the current implementation?
> 
> Do you have an example?

Found it in patch #20! :)

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v4 00/31] Introduce SCMI Telemetry FS support
From: David Hildenbrand (Arm) @ 2026-06-18 17:22 UTC (permalink / raw)
  To: Cristian Marussi, Christian Brauner
  Cc: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
	linux-doc, sudeep.holla, james.quinlan, f.fainelli,
	vincent.guittot, etienne.carriere, peng.fan, michal.simek, d-gole,
	jic23, elif.topuz, lukasz.luba, philip.radford,
	souvik.chakravarty, leitao, kas, puranjay, usama.arif,
	kernel-team
In-Reply-To: <ajLVW1eHzbGDm4yn@pluto>

Hi,

asking some clarifying questions that I assume also Christian might want to know.

>>> In a nutshell, the SCMI Telemetry protocol allows an agent to discover at
>>> runtime the set of Telemetry Data Events (DEs) available on a specific
>>> platform and provides the means to configure the set of DEs that a user is

Is the configuration aspect limited to enabling selected events, or is there
more that can be configured?

>>> interested in, while reading them back using the collection method that
>>> is deemed more suitable for the usecase at hand. (...amongst the various
>>> possible collection methods allowed by SCMI specification)
>>>
>>> Without delving into the gory details of the whole SCMI Telemetry protocol
>>> let's just say that the SCMI platform/server firmware advertises a number
>>> of Telemetry Data Events, each one identified by a 32bit unique ID, and an
>>> SCMI agent/client, like Linux, can discover them and read back at will the
>>> associated data value in a number of ways.
>>> Data collection is mainly intended to happen on demand via shared memory
>>> areas exposed by the platform firmware, discovered dynamically via SCMI
>>> Telemetry and accessed by Linux on-demand, but some DE can also be reported
>>> via SCMI Notifications asynchronous messages or via direct dedicated
>>> FastChannels (another kind of SCMI memory based access): all of this
>>> underlying mechanism is anyway hidden to the user since it is mediated by
>>> the kernel driver which will return the proper data value when queried.
>>>
>>> Anyway, the set of well-known architected DE IDs defined by the spec is
>>> limited to a dozen IDs, which means that the vast majority of DE IDs are
>>> customizable per-platform: as a consequence, though, the same ID, say
>>> '0x1234', could represent completely different things on different systems.
>>>
>>> Precise definitions and semantic of such custom Data Event IDs are out of
>>> the scope of the SCMI Telemetry specification and of this implementation:
>>> they are supposed to be provided using some kind of JSON-like description
>>> file that will have to be consumed by a userspace tool which would be
>>> finally in charge of making sense of the set of available DEs.

You mention json here ... but I assume the data we are getting fed by the
protocol is not in some default format? (e.g., json)

>>>
>>> IOW, in turn, this means that even though the DEs enumerated via SCMI come
>>> with some sort of topological and qualitative description provided by the
>>> protocol (like unit of measurements, name, topology info etc), kernel-wise
>>> we CANNOT be completely sure of "what is what" without being fed-back some
>>> sort of information about the DEs by the afore mentioned userspace tool.

Maybe you have it in some of the patches here, but what does the typical
directory + file structure look like in the current implementation?

Do you have an example?

Also, is everything in that filesystem read-only, or are there some writable
files (IOW, how is stuff configured?).

>>>
>>> For these reasons, currently this series does NOT attempt to register any
>>> of these DEs with any of the usual in-kernel subsystems (like HWMON, IIO,
>>> PERF etc), simply because we cannot be sure which DE is suitable, or even
>>> desirable, for a given subsystem. This also means there are NO in-kernel
>>> users of these Telemetry data events as of now.

Okay, so you really only feed this data to user space, exposing all the data you
have easily available as part of the protocol.

>>>
>>> So, while we do not exclude, for the future, to feed/register some of the
>>> discovered DEs to/with some of the above mentioned Kernel subsystems, as
>>> of now we have ONLY modeled a custom userspace API to make SCMI Telemetry
>>> available to userspace tools.

It's a good question how that could be done, if you need more information about
these events from user space.

>>>
>>> In deciding which kind of interface to expose SCMI Telemetry data to a
>>> user, this new SCMI Telemetry driver aims at satisfying 2 main reqs:
>>>
>>>  - exposing an FS-based human-readable interface that can be used to
>>>    discover, configure and access our Telemetry data directly also from
>>>    the shell without special tools
>>>
>>>  - exposing alternative machine-friendly, more-performant, binary
>>>    interfaces that can be used to avoid the overhead of multiple accesses
>>>    to the VFS and that can be more suitable to access with custom tools
[...]

>>>
>>> Due to the above reasoning, since V1 we opted for a new approach with the
>>> proposed interfaces now based on a full fledged, unified, virtual pseudo
>>> filesystem implemented from scratch, so that we can:
>>>
>>>  - expose all the DEs property we like as before with SysFS, but without
>>>    any of the constraint imposed by the usage of SysFs or kernfs.
>>>
>>>  - easily expose additional alternative views of the same set of DEs
>>>    using symlinking capabilities (e.g. alternative topological view)

That sounds reasonable.

[...]

> ...I would not say that this was the kind of feedback I was hoping for,
> but I am NOT gonna argue, given that you shot down already what I thought
> were all my best selling points :P
> 
> At this point my understanding is that the way forward must be to use
> a custom tool to configure/extract/translate the raw Telemetry data and
> move up into userspace the whole human readable FS layer via FUSE, if
> really needed.
> 
> I suppose that the new kernel/user interface has to be some dedicated char
> device implementing proper fops. (like I did previously in early versions
> of this series and then abandoned...)
> 
> Is this what you have in mind? Dedicated character device(s) with enough fops
> to be able to configure/extract Telemetry data with a custom tool ?

I cannot speak for Christian, but I guess you could have some kind of libscmi in
user space that can obtain the information (as you say, probably char device,
not sure which alternatives we have), to expose the data through a nice ABI, to
then either make tools build upon that directly, or have a fuse server in user
space that mimics what you currently do with the file system.

One thing that is not clear to me yet is how stuff would be configured, and how
multiple users of libscmi would possibly interact.

> 
> Should/could such a tool live in the kernel tree (tools/) at least for
> ease of development/deployment ?

I think OOT.

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v3 06/12] fs/resctrl: Initialize the global kernel-mode policy at subsystem init
From: Babu Moger @ 2026-06-18 17:14 UTC (permalink / raw)
  To: Reinette Chatre, corbet, tony.luck, Dave.Martin, james.morse,
	tglx, bp, dave.hansen
  Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
	feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
	seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
	kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
	linux-kernel, eranian, peternewman, sos-linux-ext-patches
In-Reply-To: <ffa4f5c5-9512-41fc-9354-803a182a85cd@intel.com>

Hi Reinette,


On 6/16/26 18:36, Reinette Chatre wrote:
> Hi Babu,
> 
> On 4/30/26 4:24 PM, Babu Moger wrote:
>> kernel_mode feature needs to add the interface that lets user space
>> choose between INHERIT_CTRL_AND_MON, GLOBAL_ASSIGN_CTRL_INHERIT_MON_PER_CPU
>> and GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU.  Both the generic resctrl
>> code and the architecture layer need a single shared snapshot of the
>> supported and effective policy plus the resource group that backs the
>> global-assign modes; that snapshot is struct resctrl_kmode_cfg.
> 
> This does not seem to match implementation since this implementation does
> not actually share struct resctrl_kmode_cfg as described above. Only
> resctrl_arch_get_kmode_support() exchanges this struct between fs and
> arch and as already mentioned that usage looks unnecessary. The other
> arch/fs touch points use either individual members or their properties
> (like closid/rmid).
> 
> As described in response to previous patch I think this can be simplified
> while also making it more robust.
> 

Ack.

>>
>> Add the file-local resctrl_kcfg and a helper resctrl_kmode_init() that:
>>
>>    - Adds kmode and kmode_cur with BIT(INHERIT_CTRL_AND_MON), the
>>      universally supported mode and today's behaviour;
>>    - points k_rdtgrp at rdtgroup_default so global-assign modes have a
>>      valid backing group from boot;
> 
> If the default mode is INHERIT_CTRL_AND_MON then should the default group
> not be NULL?

It will be initialized to NULL.

> 
>>    - calls resctrl_arch_get_kmode_support() so each architecture ORs
>>      BIT(<mode>) into kmode for the policies its hardware supports
>>      (on x86, AMD PLZA contributes the two global-assign modes).
>>
>> resctrl_kmode_init() runs from resctrl_init() once the default group
> 
> resctrl_kmode_init() can be dropped after changes described in response
> to previous patch. Apart from no longer being necessary I also find that
> having the kernel mode fully initialized *before* the hotplug handlers run
> to be simpler.

That means resctrl_set_kmode_support() will be called from the 
architecture layer, likely from core.c within get_rdt_alloc_resources().

The resctrl_set_kmode_support() handler would need to initialize both 
the default mode and all supported modes.

I see that this is where the hotplug handler gets registered. Therefore, 
the modes are already initialized before the hotplug handler is set up.

> 
>> has been set up.  No user-visible behaviour changes yet; later patches
> 
> (drop "later patches ...")
> 

Sure.

Thanks
Babu

^ permalink raw reply

* Re: [PATCH] kselftest docs: remove reference to obsolete/archived wiki
From: Shuah Khan @ 2026-06-18 17:02 UTC (permalink / raw)
  To: Brett Sheffield, Rafael Passos, shuah, corbet
  Cc: linux-kselftest, workflows, linux-doc, linux-kernel, Shuah Khan
In-Reply-To: <ajOvQKne74gN-7Y2@karahi.librecast.net>

On 6/18/26 02:41, Brett Sheffield wrote:
>> On 6/17/26 19:03, Shuah Khan wrote:
>>> On 6/17/26 17:57, Rafael Passos wrote:
>>>> This link in the docs points to a wiki that is no longer active.
>>>>
>>>> The wiki was moved to archive.kernel.org, and there is a warning:
>>>> "OBSOLETE CONTENT This wiki has been archived and the content is
>>>> no longer updated."
>>>>
>>>> Signed-off-by: Rafael Passos <rafael@rcpassos.me>
>>>> ---
>>>>
>>>>    Documentation/dev-tools/kselftest.rst | 5 -----
>>>>    1 file changed, 5 deletions(-)
>>>>
>>>> diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst
>>>> index d7bfe320338c..64c0ec7428a2 100644
>>>> --- a/Documentation/dev-tools/kselftest.rst
>>>> +++ b/Documentation/dev-tools/kselftest.rst
>>>> @@ -15,11 +15,6 @@ able to run that test on an older kernel. Hence, it is important to keep
>>>>    code that can still test an older kernel and make sure it skips the test
>>>>    gracefully on newer releases.
>>>> -You can find additional information on Kselftest framework, how to
>>>> -write new tests using the framework on Kselftest wiki:
>>>> -
>>>> -https://kselftest.wiki.kernel.org/
>>>> -
>>>>    On some systems, hot-plug tests could hang forever waiting for cpu and
>>>>    memory to be ready to be offlined. A special hot-plug target is created
>>>>    to run the full range of hot-plug tests. In default mode, hot-plug tests run
>>>
>>>
>>> Looks good to me.
>>>
>>> Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
>>
>> Jon,
>>
>> I can take this through kselftest tree as I usually do.
>>
>> thanks,
>> -- Shuah
> 
> Hi Shuah, Jon et al,
> 
> I've been trying to get the same change merged since August 2025:
> 
> https://lore.kernel.org/linux-doc/20250824075007.13901-2-bacs@librecast.net/
> 
> resent in January:
> 
> https://lore.kernel.org/linux-doc/20260115172817.7120-1-bacs@librecast.net/
> 
> It's great that this trivial fix is finally getting merged, but can someone
> explain why this patch was accepted in preference to the one I sent in August?
> 

Brett,

My apologies for not taking your patch earlier. Considering the effort
you put in with re-sending the patch and following up here, it is
only fair for me to take yours instead. Hope it will apply cleanly on
top of kselftest-next.

Rafael, I am going to take Brett's patch instead of yours.

Apologies to both of you for the mix up.

thanks,
-- Shuah



^ permalink raw reply

* Re: [PATCH v6 00/10] ACPI: APEI: share GHES CPER helpers and add DT FFH provider
From: Borislav Petkov @ 2026-06-18 16:48 UTC (permalink / raw)
  To: Ahmed Tiba
  Cc: Rafael J. Wysocki, Tony Luck, Hanjun Guo, Mauro Carvalho Chehab,
	Shuai Xue, Len Brown, Saket Dumbre, Davidlohr Bueso,
	Jonathan Cameron, Dave Jiang, Alison Schofield, Vishal Verma,
	Ira Weiny, Dan Williams, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Jonathan Corbet, Shuah Khan, linux-kernel,
	linux-acpi, acpica-devel, linux-cxl, devicetree, linux-edac,
	linux-doc, Dmitry.Lamerov
In-Reply-To: <20260617-topics-ahmtib01-ras_ffh_arm_internal_review-v6-0-91f725174aa0@arm.com>

On Wed, Jun 17, 2026 at 02:54:38PM +0100, Ahmed Tiba wrote:
> This is v6 of the GHES refactor series. Compared to v5, it addresses
> the latest review comments and tightens the DT CPER provider and
> related helper wiring.

Sashiko has comments:

https://sashiko.dev/#/patchset/20260617-topics-ahmtib01-ras_ffh_arm_internal_review-v6-0-91f725174aa0%40arm.com

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply

* Re: [PATCH v5 3/6] alloc_tag: add size-based filtering to ioctl
From: Abhishek Bapat @ 2026-06-18 16:38 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Andrew Morton, Kent Overstreet, Hao Ge, Shuah Khan,
	Jonathan Corbet, linux-doc, linux-kernel, linux-mm, Sourav Panda
In-Reply-To: <CAJuCfpFrgKBGFWoca=XuKh1p22vdfE_uSz_nt2Kj4UvnjvSUJQ@mail.gmail.com>

On Wed, Jun 17, 2026 at 4:01 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Jun 17, 2026 at 3:41 PM Abhishek Bapat <abhishekbapat@google.com> wrote:
> >
> > On Wed, Jun 17, 2026 at 3:35 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Wed, Jun 17, 2026 at 1:55 PM Abhishek Bapat <abhishekbapat@google.com> wrote:
> > > >
> > > > On Wed, Jun 17, 2026 at 9:29 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > On Mon, Jun 15, 2026 at 4:04 PM Abhishek Bapat <abhishekbapat@google.com> wrote:
> > > > > >
> > > > > > Extend the allocinfo filtering mechanism to allow users to filter tags
> > > > > > based on the total number of bytes allocated [min_size, max_size]. The
> > > > > > size range is inclusive.
> > > > > >
> > > > > > Filtering by size involves retrieving allocinfo per-CPU counters, which
> > > > > > is an expensive operation. Hence, the performance of size-based
> > > > > > filtering will be worse than other filters.
> > > > > >
> > > > > > Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
> > > > > > Acked-by: Hao Ge <hao.ge@linux.dev>
> > > > > > ---
> > > > > >  include/uapi/linux/alloc_tag.h |  8 ++++-
> > > > > >  lib/alloc_tag.c                | 63 ++++++++++++++++++++++++++++------
> > > > > >  2 files changed, 59 insertions(+), 12 deletions(-)
> > > > > >
> > > > > > diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
> > > > > > index 3b11877955b9..7f5acbb44c14 100644
> > > > > > --- a/include/uapi/linux/alloc_tag.h
> > > > > > +++ b/include/uapi/linux/alloc_tag.h
> > > > > > @@ -45,13 +45,17 @@ enum {
> > > > > >         ALLOCINFO_FILTER_FUNCTION,
> > > > > >         ALLOCINFO_FILTER_FILENAME,
> > > > > >         ALLOCINFO_FILTER_LINENO,
> > > > > > -       __ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_LINENO
> > > > > > +       ALLOCINFO_FILTER_MIN_SIZE,
> > > > > > +       ALLOCINFO_FILTER_MAX_SIZE,
> > > > > > +       __ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_MAX_SIZE
> > > > > >  };
> > > > > >
> > > > > >  #define ALLOCINFO_FILTER_MASK_MODNAME          (1 << ALLOCINFO_FILTER_MODNAME)
> > > > > >  #define ALLOCINFO_FILTER_MASK_FUNCTION         (1 << ALLOCINFO_FILTER_FUNCTION)
> > > > > >  #define ALLOCINFO_FILTER_MASK_FILENAME         (1 << ALLOCINFO_FILTER_FILENAME)
> > > > > >  #define ALLOCINFO_FILTER_MASK_LINENO           (1 << ALLOCINFO_FILTER_LINENO)
> > > > > > +#define ALLOCINFO_FILTER_MASK_MIN_SIZE         (1 << ALLOCINFO_FILTER_MIN_SIZE)
> > > > > > +#define ALLOCINFO_FILTER_MASK_MAX_SIZE         (1 << ALLOCINFO_FILTER_MAX_SIZE)
> > > > > >
> > > > > >  #define ALLOCINFO_FILTER_MASKS \
> > > > > >         ((1 << (__ALLOCINFO_FILTER_LAST + 1)) - 1)
> > > > > > @@ -59,6 +63,8 @@ enum {
> > > > > >  struct allocinfo_filter {
> > > > > >         __u64 mask; /* bitmask of the filter fields used */
> > > > > >         struct allocinfo_tag fields;
> > > > > > +       __u64 min_size;
> > > > > > +       __u64 max_size;
> > > > > >  };
> > > > > >
> > > > > >  struct allocinfo_get_at {
> > > > > > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > > > > > index 5feb61d9fb92..b3d21834b61e 100644
> > > > > > --- a/lib/alloc_tag.c
> > > > > > +++ b/lib/alloc_tag.c
> > > > > > @@ -195,15 +195,26 @@ static int allocinfo_cmp_str(const char *str, const char *template)
> > > > > >         return strncmp(allocinfo_str(str), template, ALLOCINFO_STR_SIZE);
> > > > > >  }
> > > > > >
> > > > > > +/* Fetch the per-CPU counters */
> > > > > > +static inline struct alloc_tag_counters allocinfo_prefetch_counters(struct codetag *ct)
> > > > > > +{
> > > > > > +       return alloc_tag_read(ct_to_alloc_tag(ct));
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * Populates the UAPI allocinfo_tag_data structure with active runtime
> > > > > >   * profiling counters extracted from the given kernel codetag.
> > > > > >   */
> > > > > >  static void allocinfo_to_params(struct codetag *ct,
> > > > > > -                               struct allocinfo_tag_data *data)
> > > > > > +                               struct allocinfo_tag_data *data,
> > > > > > +                               struct alloc_tag_counters *counters)
> > > > > >  {
> > > > > > -       struct alloc_tag *tag = ct_to_alloc_tag(ct);
> > > > > > -       struct alloc_tag_counters counter = alloc_tag_read(tag);
> > > > > > +       struct alloc_tag_counters local_counters;
> > > > > > +
> > > > > > +       if (!counters) {
> > > > > > +               local_counters = allocinfo_prefetch_counters(ct);
> > > > > > +               counters = &local_counters;
> > > > > > +       }
> > > > > >
> > > > > >         if (ct->modname)
> > > > > >                 allocinfo_copy_str(data->tag.modname, ct->modname);
> > > > > > @@ -212,9 +223,9 @@ static void allocinfo_to_params(struct codetag *ct,
> > > > > >         allocinfo_copy_str(data->tag.function, ct->function);
> > > > > >         allocinfo_copy_str(data->tag.filename, ct->filename);
> > > > > >         data->tag.lineno = ct->lineno;
> > > > > > -       data->counter.bytes = counter.bytes;
> > > > > > -       data->counter.calls = counter.calls;
> > > > > > -       data->counter.accurate = !alloc_tag_is_inaccurate(tag);
> > > > > > +       data->counter.bytes = counters->bytes;
> > > > > > +       data->counter.calls = counters->calls;
> > > > > > +       data->counter.accurate = !alloc_tag_is_inaccurate(ct_to_alloc_tag(ct));
> > > > > >  }
> > > > > >
> > > > > >  /*
> > > > > > @@ -238,7 +249,9 @@ static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
> > > > > >   * Verifies whether a given codetag satisfies the active filtering criteria by
> > > > > >   * matching its characteristics against the specified filter.
> > > > > >   */
> > > > > > -static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter)
> > > > > > +static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter,
> > > > > > +                          struct alloc_tag_counters *counters,
> > > > > > +                          bool *fetched_counters)
> > > > > >  {
> > > > > >         if (!filter || !filter->mask)
> > > > > >                 return true;
> > > > > > @@ -265,6 +278,19 @@ static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter)
> > > > > >             ct->lineno != filter->fields.lineno)
> > > > > >                 return false;
> > > > > >
> > > > > > +       if (filter->mask & (ALLOCINFO_FILTER_MASK_MIN_SIZE | ALLOCINFO_FILTER_MASK_MAX_SIZE)) {
> > > > > > +               if (!*fetched_counters) {
> > > > > > +                       *counters = allocinfo_prefetch_counters(ct);
> > > > > > +                       *fetched_counters = true;
> > > > > > +               }
> > > > > > +               if ((filter->mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) &&
> > > > > > +                   counters->bytes < filter->min_size)
> > > > > > +                       return false;
> > > > > > +               if ((filter->mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) &&
> > > > > > +                   counters->bytes > filter->max_size)
> > > > > > +                       return false;
> > > > > > +       }
> > > > > > +
> > > > > >         return true;
> > > > > >  }
> > > > > >
> > > > > > @@ -278,6 +304,8 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
> > > > > >         struct codetag *ct;
> > > > > >         struct allocinfo_get_at params = {0};
> > > > > >         __u64 skip_count;
> > > > > > +       struct alloc_tag_counters counters;
> > > > > > +       bool fetched_counters;
> > > > > >
> > > > > >         if (copy_from_user(&params, arg, sizeof(params)))
> > > > > >                 return -EFAULT;
> > > > > > @@ -285,6 +313,11 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
> > > > > >         if (params.filter.mask & ~ALLOCINFO_FILTER_MASKS)
> > > > > >                 return -EINVAL;
> > > > > >
> > > > > > +       if ((params.filter.mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) &&
> > > > > > +           (params.filter.mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) &&
> > > > > > +           params.filter.min_size > params.filter.max_size)
> > > > > > +               return -EINVAL;
> > > > > > +
> > > > > >         priv = m->private;
> > > > > >
> > > > > >         mutex_lock(&priv->ioctl_lock);
> > > > > > @@ -308,7 +341,8 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
> > > > > >         ct = codetag_next_ct(&priv->ioctl_iter);
> > > > > >
> > > > > >         while (ct) {
> > > > > > -               if (matches_filter(ct, &priv->filter)) {
> > > > > > +               fetched_counters = false;
> > > > > > +               if (matches_filter(ct, &priv->filter, &counters, &fetched_counters)) {
> > > > >
> > > > > Do we really need this "fetched_counters" parameter? Here are the
> > > > > possible cases:
> > > > > 1. If the filter does not include ALLOCINFO_FILTER_MASK_MIN_SIZE |
> > > > > ALLOCINFO_FILTER_MASK_MAX_SIZE then counters would not be fetched.
> > > > > 2. If the filter includes ALLOCINFO_FILTER_MASK_MIN_SIZE |
> > > > > ALLOCINFO_FILTER_MASK_MAX_SIZE and
> > > > > 2.1. matches_filter() returns true then we know counters were fetched
> > > > > because they had to be validated.
> > > > > 2.2. matches_filter() returns false then we don't care if the counters
> > > > > were fetched. We do not report that tag anyway.
> > > > >
> > > > > So, instead of passing fetched_counters to matches_filter() we could do this:
> > > > >
> > > > > bool filter_by_size = (params.filter.mask &
> > > > > (ALLOCINFO_FILTER_MASK_MIN_SIZE | ALLOCINFO_FILTER_MASK_MAX_SIZE)) !=
> > > > > 0;
> > > > > while (ct) {
> > > > >            if (matches_filter(ct, &priv->filter, &counters)) {
> > > > > ...
> > > > > }
> > > > > if (ct) {
> > > > >            allocinfo_to_params(ct, &params.data, filter_by_size ?
> > > > > &counters : NULL);
> > > > > ...
> > > > > }
> > > > >
> > > > > Wouldn't that work?
> > > > >
> > > >
> > > > While we can deduce whether counters were fetched outside the
> > > > matches_filter function, I think the current implementation is more
> > > > intuitive from a readability perspective. I believe it should be kept
> > > > as is for that reason. If we extract the logic, we'll first have to
> > > > replicate the boolean logic at two places. Second, we'd need to add a
> > > > comment explaining the boolean calculation, and the reader might have
> > > > a higher cognitive load trying to determine which function populates
> > > > the counters. The current implementation makes it easy for the reader
> > > > to deduce the original intention. Let me know what you think.
> > >
> > > Ok, I guess you have a point.
> > >
> > > I was also thinking why we are passing NULL to allocinfo_to_params()
> > > to fetch the counters into a local variable? Why can't we simply call
> > > allocinfo_prefetch_counters() before calling allocinfo_to_params()
> > > when fetched_counters==false? Basically:
> > >
> > > if (!fetched_counters)
> > >     counters = allocinfo_prefetch_counters(ct);
> > > allocinfo_to_params(ct, &params.data, &counters);
> > >
> > > This would simplify allocinfo_to_params() because counter will never
> > > be NULL and it would not need local counters.
> > >
> >
> > The only reason I did it that way was to avoid repeating the code at
> > two places i.e. allocinfo_ioctl_get_at and allocinfo_ioctl_get_next.
> > Either way, the per-CPU counters are assimilated only once. I can
> > include this change if you still want me to, but personally I like the
> > way it currently is implemented.
>
> Yeah, I think repeating 2 lines is preferable to passing NULL and
> fetching into a local variable. Please include that change.
>

Ack, I will change this in the next patchset version.

> >
> > > >
> > > > > >                         if (skip_count == 0)
> > > > > >                                 break;
> > > > > >                         skip_count--;
> > > > > > @@ -317,7 +351,7 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
> > > > > >         }
> > > > > >
> > > > > >         if (ct) {
> > > > > > -               allocinfo_to_params(ct, &params.data);
> > > > > > +               allocinfo_to_params(ct, &params.data, fetched_counters ? &counters : NULL);
> > > > > >                 priv->positioned = true;
> > > > > >         }
> > > > > >
> > > > > > @@ -343,6 +377,8 @@ static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
> > > > > >         struct codetag *ct;
> > > > > >         struct allocinfo_tag_data params;
> > > > > >         int ret = 0;
> > > > > > +       struct alloc_tag_counters counters;
> > > > > > +       bool fetched_counters;
> > > > > >
> > > > > >         memset(&params, 0, sizeof(params));
> > > > > >         priv = m->private;
> > > > > > @@ -356,10 +392,15 @@ static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
> > > > > >         }
> > > > > >
> > > > > >         ct = codetag_next_ct(&priv->ioctl_iter);
> > > > > > -       while (ct && !matches_filter(ct, &priv->filter))
> > > > > > +       while (ct) {
> > > > > > +               fetched_counters = false;
> > > > > > +               if (matches_filter(ct, &priv->filter, &counters, &fetched_counters))
> > > > > > +                       break;
> > > > > >                 ct = codetag_next_ct(&priv->ioctl_iter);
> > > > > > +       }
> > > > > > +
> > > > > >         if (ct)
> > > > > > -               allocinfo_to_params(ct, &params);
> > > > > > +               allocinfo_to_params(ct, &params, fetched_counters ? &counters : NULL);
> > > > > >
> > > > > >         if (!ct) {
> > > > > >                 priv->positioned = false;
> > > > > > --
> > > > > > 2.54.0.1136.gdb2ca164c4-goog
> > > > > >

^ permalink raw reply

* Re: [PATCH v6 01/12] PCI: liveupdate: Set up FLB handler for the PCI core
From: Pratyush Yadav @ 2026-06-18 16:35 UTC (permalink / raw)
  To: Pranjal Shrivastava
  Cc: David Matlack, Pasha Tatashin, Mike Rapoport, kexec, linux-doc,
	linux-kernel, linux-mm, linux-pci, Adithya Jayachandran,
	Alexander Graf, Alex Williamson, Bjorn Helgaas, Chris Li,
	David Rientjes, Jacob Pan, Jason Gunthorpe, Jonathan Corbet,
	Josh Hilke, Leon Romanovsky, Lukas Wunner, Parav Pandit,
	Pratyush Yadav, Saeed Mahameed, Samiullah Khawaja, Shuah Khan,
	Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <ajPzC2Xh1NMbfokP@google.com>

On Thu, Jun 18 2026, Pranjal Shrivastava wrote:

> On Mon, Jun 15, 2026 at 10:19:03PM +0000, David Matlack wrote:
>> On 2026-06-12 10:47 AM, Pasha Tatashin wrote:
>> > On 2026-06-12 09:54:44+03:00, Mike Rapoport wrote:
>> > > On Fri, Jun 12, 2026 at 05:15:02AM +0000, Pasha Tatashin wrote:
>> > > 
>> > > > On Fri, 22 May 2026 20:23:59 +0000, David Matlack <dmatlack@google.com> wrote:
>> > > > 
>> > > > Please add Pratyush, Mike, and myself so we are notified directly of 
>> > > > incoming patches, the same as with other areas where the liveupdate/ 
>> > > > tree is specified.
>> > > 
>> > > Or we can add PCI liveupdate files to LIVEUPDATE entry.
>> > 
>> > That will not work, as we cannot serve as maintainers for 
>> > PCI/VFIO/IOMMU/KVM, etc. David Matlack will be the maintainer for the 
>> > PCI components, and we will accept patches once they have been approved 
>> > by him.
>> > 
>> > The simplification we could do is to create an email alias 
>> > for the live-update tree maintainers. This would allow us to use a 
>> > single entry instead of listing all three of us individually.
>> 
>> We could create a Live Update mailing list for all code that can be CCed
>> on all patches that must be merged through the Live Update tree. I would
>> also be interested in subscribing to that list.
>
> +1. I'd like if there's a specific Live Update mailing list for
> submissions & discussion about the Live Update tree.

We treat kexec@lists.infradead.org as the "live update mailing list". We
considered getting a separate one, but I reckon the traffic is low
enough on kexec@ already that we can re-use it for live update.

So perhaps we just Cc kexec@? Is there anything to be gained by creating
an alias?

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v4 3/5] rpmsg: virtio_rpmsg_bus: get buffer size from config space
From: Shah, Tanmay @ 2026-06-18 16:31 UTC (permalink / raw)
  To: Arnaud POULIQUEN, tanmay.shah, andersson, mathieu.poirier, corbet,
	skhan
  Cc: linux-remoteproc, linux-doc, linux-kernel
In-Reply-To: <a32b579f-232c-452e-abef-585a97b32839@foss.st.com>



On 6/18/2026 3:32 AM, Arnaud POULIQUEN wrote:
> 
> 
> On 6/17/26 19:41, Shah, Tanmay wrote:
>>
>>
>> On 6/17/2026 4:15 AM, Arnaud POULIQUEN wrote:
>>> Hi Tanmay,
>>>
>>> On 6/15/26 22:20, Tanmay Shah wrote:
>>>> 512 bytes isn't always suitable for all cases; let the firmware
>>>> maker decide the best value from the resource table.
>>>> Enabled by the VIRTIO_RPMSG_F_BUFSZ feature bit.
>>>>
>>>> Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
>>>> ---
>>>>
>>>> Changes in v4: squash to virtio rpmsg config patch
>>>>     - Introduce new patch to modify rpmsg.rst documentation
>>>>     - check version is always 1.
>>>>     - check size field is same as size of struct virtio_rpmsg_config
>>>>     - introduce alignment field
>>>>     - check alignment field is power of 2
>>>>     - check tx and rx buf size is aligned with alignment passed in the
>>>>       structure
>>>>
>>>> Changes in v3:
>>>>     - change version field from u16 to u8
>>>>     - introduce size field in the rpmsg_virtio_config structure
>>>>     - check version field is set to any non-zero value.
>>>>     - check size field is not 0.
>>>>     - Remove field for private config, as not needed for now.
>>>>     - add documentation of rpmsg_virtio_config structure
>>>>
>>>>    drivers/rpmsg/virtio_rpmsg_bus.c   | 129 +++++++++++++++++++++++
>>>> +-----
>>>>    include/linux/rpmsg/virtio_rpmsg.h |  50 +++++++++++
>>>>    2 files changed, 160 insertions(+), 19 deletions(-)
>>>>    create mode 100644 include/linux/rpmsg/virtio_rpmsg.h
>>>>
>>>> diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/
>>>> virtio_rpmsg_bus.c
>>>> index 99df1ae07055..a59925f870a4 100644
>>>> --- a/drivers/rpmsg/virtio_rpmsg_bus.c
>>>> +++ b/drivers/rpmsg/virtio_rpmsg_bus.c
>>>> @@ -15,11 +15,13 @@
>>>>    #include <linux/idr.h>
>>>>    #include <linux/jiffies.h>
>>>>    #include <linux/kernel.h>
>>>> +#include <linux/log2.h>
>>>>    #include <linux/module.h>
>>>>    #include <linux/mutex.h>
>>>>    #include <linux/rpmsg.h>
>>>>    #include <linux/rpmsg/byteorder.h>
>>>>    #include <linux/rpmsg/ns.h>
>>>> +#include <linux/rpmsg/virtio_rpmsg.h>
>>>>    #include <linux/scatterlist.h>
>>>>    #include <linux/slab.h>
>>>>    #include <linux/sched.h>
>>>> @@ -39,7 +41,8 @@
>>>>     * @tx_bufs:    kernel address of tx buffers
>>>>     * @num_rx_buf: total number of rx buffers
>>>>     * @num_tx_buf: total number of tx buffers
>>>> - * @buf_size:   size of one rx or tx buffer
>>>> + * @rx_buf_size: size of one rx buffer
>>>> + * @tx_buf_size: size of one tx buffer
>>>>     * @last_tx_buf: index of last tx buffer used
>>>>     * @bufs_dma:    dma base addr of the buffers
>>>>     * @tx_lock:    protects svq and tx_bufs, to allow concurrent
>>>> senders.
>>>> @@ -59,7 +62,8 @@ struct virtproc_info {
>>>>        void *rx_bufs, *tx_bufs;
>>>>        unsigned int num_rx_buf;
>>>>        unsigned int num_tx_buf;
>>>> -    unsigned int buf_size;
>>>> +    unsigned int rx_buf_size;
>>>> +    unsigned int tx_buf_size;
>>>>        int last_tx_buf;
>>>>        dma_addr_t bufs_dma;
>>>>        struct mutex tx_lock;
>>>> @@ -68,9 +72,6 @@ struct virtproc_info {
>>>>        wait_queue_head_t sendq;
>>>>    };
>>>>    -/* The feature bitmap for virtio rpmsg */
>>>> -#define VIRTIO_RPMSG_F_NS    0 /* RP supports name service
>>>> notifications */
>>>> -
>>>>    /**
>>>>     * struct rpmsg_hdr - common header for all rpmsg messages
>>>>     * @src: source address
>>>> @@ -128,7 +129,7 @@ struct virtio_rpmsg_channel {
>>>>     * processor.
>>>>     */
>>>>    #define MAX_RPMSG_NUM_BUFS    (256)
>>>> -#define MAX_RPMSG_BUF_SIZE    (512)
>>>> +#define DEFAULT_RPMSG_BUF_SIZE    (512)
>>>>      /*
>>>>     * Local addresses are dynamically allocated on-demand.
>>>> @@ -444,7 +445,7 @@ static void *get_a_tx_buf(struct virtproc_info
>>>> *vrp)
>>>>          /* either pick the next unused tx buffer */
>>>>        if (vrp->last_tx_buf < vrp->num_tx_buf)
>>>> -        ret = vrp->tx_bufs + vrp->buf_size * vrp->last_tx_buf++;
>>>> +        ret = vrp->tx_bufs + vrp->tx_buf_size * vrp->last_tx_buf++;
>>>>        /* or recycle a used one */
>>>>        else
>>>>            ret = virtqueue_get_buf(vrp->svq, &len);
>>>> @@ -514,7 +515,7 @@ static int rpmsg_send_offchannel_raw(struct
>>>> rpmsg_device *rpdev,
>>>>         * messaging), or to improve the buffer allocator, to support
>>>>         * variable-length buffer sizes.
>>>>         */
>>>> -    if (len > vrp->buf_size - sizeof(struct rpmsg_hdr)) {
>>>> +    if (len > vrp->tx_buf_size - sizeof(struct rpmsg_hdr)) {
>>>>            dev_err(dev, "message is too big (%d)\n", len);
>>>>            return -EMSGSIZE;
>>>>        }
>>>> @@ -647,7 +648,7 @@ static ssize_t virtio_rpmsg_get_mtu(struct
>>>> rpmsg_endpoint *ept)
>>>>        struct rpmsg_device *rpdev = ept->rpdev;
>>>>        struct virtio_rpmsg_channel *vch =
>>>> to_virtio_rpmsg_channel(rpdev);
>>>>    -    return vch->vrp->buf_size - sizeof(struct rpmsg_hdr);
>>>> +    return vch->vrp->tx_buf_size - sizeof(struct rpmsg_hdr);
>>>>    }
>>>>      static int rpmsg_recv_single(struct virtproc_info *vrp, struct
>>>> device *dev,
>>>> @@ -673,7 +674,7 @@ static int rpmsg_recv_single(struct virtproc_info
>>>> *vrp, struct device *dev,
>>>>         * We currently use fixed-sized buffers, so trivially sanitize
>>>>         * the reported payload length.
>>>>         */
>>>> -    if (len > vrp->buf_size ||
>>>> +    if (len > vrp->rx_buf_size ||
>>>>            msg_len > (len - sizeof(struct rpmsg_hdr))) {
>>>>            dev_warn(dev, "inbound msg too big: (%d, %d)\n", len,
>>>> msg_len);
>>>>            return -EINVAL;
>>>> @@ -706,7 +707,7 @@ static int rpmsg_recv_single(struct virtproc_info
>>>> *vrp, struct device *dev,
>>>>            dev_warn_ratelimited(dev, "msg received with no
>>>> recipient\n");
>>>>          /* publish the real size of the buffer */
>>>> -    rpmsg_sg_init(&sg, msg, vrp->buf_size);
>>>> +    rpmsg_sg_init(&sg, msg, vrp->rx_buf_size);
>>>>          /* add the buffer back to the remote processor's virtqueue */
>>>>        err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
>>>> @@ -820,10 +821,13 @@ static int rpmsg_probe(struct virtio_device
>>>> *vdev)
>>>>        struct virtproc_info *vrp;
>>>>        struct virtio_rpmsg_channel *vch = NULL;
>>>>        struct rpmsg_device *rpdev_ns, *rpdev_ctrl;
>>>> +    u16 rpmsg_buf_align = 0;
>>>>        void *bufs_va;
>>>>        int err = 0, i;
>>>>        size_t total_buf_space;
>>>>        bool notify;
>>>> +    u8 version;
>>>> +    u16 size;
>>>>          vrp = kzalloc_obj(*vrp);
>>>>        if (!vrp)
>>>> @@ -855,9 +859,90 @@ static int rpmsg_probe(struct virtio_device *vdev)
>>>>        else
>>>>            vrp->num_tx_buf = MAX_RPMSG_NUM_BUFS;
>>>>    -    vrp->buf_size = MAX_RPMSG_BUF_SIZE;
>>>> +    /*
>>>> +     * If VIRTIO_RPMSG_F_BUFSZ feature is supported, then configure
>>>> buf
>>>> +     * size from virtio device config space from the resource table.
>>>> +     * If the feature is not supported, then assign default buf size.
>>>> +     */
>>>> +    if (virtio_has_feature(vdev, VIRTIO_RPMSG_F_BUFSZ)) {
>>>> +        virtio_cread(vdev, struct virtio_rpmsg_config,
>>>> +                 version, &version);
>>>> +
>>>> +        /* for now we support only v1 */
>>>> +        if (version != RPMSG_VDEV_CONFIG_V1) {
>>>> +            dev_err(&vdev->dev,
>>>> +                "unsupported vdev config version %u\n", version);
>>>> +            err = -EINVAL;
>>>> +            goto vqs_del;
>>>> +        }
>>>> +
>>>> +        /* size of the config space must match */
>>>> +        virtio_cread(vdev, struct virtio_rpmsg_config,
>>>> +                 size, &size);
>>>> +        if (size != sizeof(struct virtio_rpmsg_config)) {
>>>> +            dev_err(&vdev->dev, "invalid size of vdev config %u\n",
>>>> +                size);
>>>> +            err = -EINVAL;
>>>> +            goto vqs_del;
>>>> +        }
>>>>    -    total_buf_space = (vrp->num_rx_buf + vrp->num_tx_buf) * vrp-
>>>>> buf_size;
>>>> +        /*
>>>> +         * Optional alignment applied to each buffer size and to
>>>> the TX
>>>> +         * buffer base address (e.g. to align buffers on a cache
>>>> line).
>>>> +         * It must be a power of two; zero means no extra alignment.
>>>> +         */
>>>> +        virtio_cread(vdev, struct virtio_rpmsg_config,
>>>> +                 rpmsg_buf_align, &rpmsg_buf_align);
>>>> +        if (rpmsg_buf_align && !is_power_of_2(rpmsg_buf_align)) {
>>>> +            dev_err(&vdev->dev,
>>>> +                "bad vdev config: rpmsg_buf_align %u is not a power
>>>> of two\n",
>>>> +                rpmsg_buf_align);
>>>> +            err = -EINVAL;
>>>> +            goto vqs_del;
>>>> +        }
>>>> +
>>>> +        /* note: tx and rx are defined from remote view */
>>>> +        virtio_cread(vdev, struct virtio_rpmsg_config,
>>>> +                 txbuf_size, &vrp->rx_buf_size);
>>>> +        virtio_cread(vdev, struct virtio_rpmsg_config,
>>>> +                 rxbuf_size, &vrp->tx_buf_size);
>>>> +
>>>> +        /* The buffers must hold at least the rpmsg header */
>>>> +        if (vrp->rx_buf_size < sizeof(struct rpmsg_hdr) ||
>>>> +            vrp->tx_buf_size < sizeof(struct rpmsg_hdr)) {
>>>> +            dev_err(&vdev->dev,
>>>> +                "bad vdev config: rx buf sz = %u, tx buf sz = %u\n",
>>>> +                vrp->rx_buf_size, vrp->tx_buf_size);
>>>> +            err = -EINVAL;
>>>> +            goto vqs_del;
>>>> +        }
>>>> +
>>>> +        /*
>>>> +         * The buffer size must be aligned to the provided
>>>> alignment for
>>>> +         * so that the start address of tx bufs can be aligned.
>>>> +         */
>>>
>>> 'tx' should be removed, as it also concerns Rx buffers
>>>
>>
>> Ack.
>>
>>>
>>> What about removing this check to manage alignment during buffer
>>> allocation?
>>>
>>> For example, if the alignment is on a 64-bit address and the tx_buffer
>>> and rx_buffer sizes are 40 bytes, 48 bytes can be allocated in memory
>>> for each buffer, and the virtio descriptor can be filled with aligned
>>> addresses.
>>>
>>> In other words, the rpmsg_buf_align field contains the alignment
>>> constraint from the remote processor. If the Linux kernel wants to
>>> impose another alignment constraint, it must test or update
>>> rpmsg_buf_align, but it must not impose alignment on the buffer size.
>>>
>>>
>>
>> This part I don't understand. `rpmsg_buf_align` is alignment for only
>> single buffer size. The linux kernel is checking that single rx buf size
>> and tx buf size is aligned with `rpmsg_buf_align` as firmware has
>> claimed.
>>
>> For reference the openamp-system-reference PR:
>> https://github.com/OpenAMP/openamp-system-reference/pull/106/changes
>>
>>     .vdev_config = {
>>         .version = 1,
>>         .reserved = 0,
>>         .size = (uint16_t)(sizeof(struct rpmsg_virtio_config) -
>> sizeof(bool)),
>>         .alignment = RPMSG_BUF_ALIGN,
>>         .reserved1 = 0,
>>         /* Tx for host */
>>         .h2r_buf_size = metal_align_up(4096, RPMSG_BUF_ALIGN),
>>         /* Rx for host */
>>         .r2h_buf_size = metal_align_up(4096, RPMSG_BUF_ALIGN),
>>     },
>>
>> IIUC, The linux kernel is not really supposed to modify
>> `rpmsg_buf_align`. It only uses it to check that firmware has assigned
>> correct size of single rx and tx buffer.
>>
>>
>> When the linux kernel uses dma_alloc_coherent() API it aligns total
>> buffer size with page size. That is different than single tx buf size
>> and single rx buf size. The total buf size alignment to page size is
>> irrelevant to `rpmsg_buf_align` field.
>>
>> Please let me know if I am missing something or didn't understand your
>> comment. I prefer that `rpmsg_buf_align` should be only modified by the
>> firmware and not the linux kernel.
> 
> 
> Sorry it was unclear, let me try to re-explain my suggestion:
> 
> Two alignment constraints can apply:
> - The remote processor can require an alignment through
>   vdev_config::alignment.
> - The main processor, which runs Linux or another operating system (OS),
>   can require a different alignment, for example, for cache alignment.
> The current Linux implementation imposes no such constraint.
> Nevertheless, I would be in favor of taking such a future constraint
> into account without imposing a constraint on the buffer sizes.

Is this ever going to be true? Is it ever possible that Linux and the
remote have different cache alignment? IIUC, both will be using the same
cache, so the same alignment will be applicable. That is why only a
single alignment is required.

> Based on that, in the short term the local 'rpmsg_buf_align' would still be
> computed
> only from vdev_config::alignment (not update of vdev_config::alignment).
> 
> virtio_cread(vdev, struct virtio_rpmsg_config,
>                  rpmsg_buf_align, &rpmsg_buf_align);
> 
> Then you could use the ALIGN() helper:
> 
> unsigned int rx_buf_align_size = ALIGN(vrp->rx_buf_size,
>                        rpmsg_buf_align);
> unsigned int tx_buf_align_size = ALIGN(vrp->tx_buf_size,
>                        rpmsg_buf_align);
> 

This is where I have a different opinion. Instead of Linux using ALIGN()
macro, can we expect that firmware must assign the aligned buffer size
with vdev_config::rpmsg_buf_align? And so Linux will fail if the buffer
size is not aligned already from the firmware side. That is why I had
introduced checks instead of doing alignment by linux.

> total_buf_space = (vrp->num_rx_buf * rx_buf_align_size) +
>           (vrp->num_tx_buf * tx_buf_align_size);
> 
> vrp->tx_bufs = bufs_va + vrp->num_rx_buf * rx_buf_align_size;
> 
> Apply the same rule to cpu_addr in the vring descriptor:
> 
> void *cpu_addr = vrp->rx_bufs + i * rx_buf_align_size;
> 
> rpmsg_sg_init(&sg, cpu_addr, vrp->rx_buf_size);
> 
> With this approach, the buffer addresses remain aligned
> independently of vdev_config::Rxbuf_size and vdev_config::txbuf_size.
> Don't hesitate if it is still not clear!

How do they remain aligned independent of tx/rx_buf_size? tx_bufs address
is still calculated based on rx_buf_align_size, so its alignment still
depends on rx_buf_align_size which is derived using
vdev_config::rpmsg_buf_align.

I think we are trying to achieve the same thing, but the implementation is
different. We just need to decide where the alignment should be done?

Either on the linux side? Or in the firmware resource table?

I prefer that the firmware should already provide aligned buffer size,
and Linux should only check it. If alignment is not done, then simply
fail with error. That way, firmware also knows the correct size of the
buffer. If Linux does the alignment, then the firmware is not aware of
the correct size that is used by the linux.

I am open to moving the alignment operation to the Linux side given a
reasonable justification.

Thank You,
Tanmay

>>
>>
>>>> +        if (rpmsg_buf_align &&
>>>> +            (!IS_ALIGNED(vrp->rx_buf_size, rpmsg_buf_align) ||
>>>> +             !IS_ALIGNED(vrp->tx_buf_size, rpmsg_buf_align))) {
>>>> +            dev_err(&vdev->dev,
>>>> +                "bad vdev config: buf sizes (rx %u, tx %u) not
>>>> aligned to %u\n",
>>>> +                vrp->rx_buf_size, vrp->tx_buf_size,
>>>> +                rpmsg_buf_align);
>>>> +            err = -EINVAL;
>>>> +            goto vqs_del;
>>>> +        }
>>>> +
>>>> +        dev_dbg(&vdev->dev,
>>>> +            "vdev config: ver=%u, align=0x%x, rx sz = 0x%x, tx sz =
>>>> 0x%x\n",
>>>> +            version, rpmsg_buf_align, vrp->rx_buf_size,
>>>> +            vrp->tx_buf_size);
>>>> +    } else {
>>>> +        vrp->rx_buf_size = DEFAULT_RPMSG_BUF_SIZE;
>>>> +        vrp->tx_buf_size = DEFAULT_RPMSG_BUF_SIZE;
>>>> +    }
>>>> +
>>>> +    total_buf_space = (vrp->num_rx_buf * vrp->rx_buf_size) +
>>>> +              (vrp->num_tx_buf * vrp->tx_buf_size);
>>>>          /* allocate coherent memory for the buffers */
>>>>        bufs_va = dma_alloc_coherent(vdev->dev.parent,
>>>> @@ -874,15 +959,20 @@ static int rpmsg_probe(struct virtio_device
>>>> *vdev)
>>>>        /* first part of the buffers is dedicated for RX */
>>>>        vrp->rx_bufs = bufs_va;
>>>>    -    /* and second part is dedicated for TX */
>>>> -    vrp->tx_bufs = bufs_va + vrp->num_rx_buf * vrp->buf_size;
>>>> +    /*
>>>> +     * Here buf_va is aligned to a page. Also rx buf size is aligned
>>>> with
>>>> +     * cache line alignment provided by the firmware, so tx buf's
>>>> start
>>>> +     * address is guranteed to be aligned with the alignment
>>>> provided by
>>>> +     * the firmware.
>>>> +     */
>>>> +    vrp->tx_bufs = bufs_va + (vrp->num_rx_buf * vrp->rx_buf_size);
>>>>          /* set up the receive buffers */
>>>>        for (i = 0; i < vrp->num_rx_buf; i++) {
>>>>            struct scatterlist sg;
>>>> -        void *cpu_addr = vrp->rx_bufs + i * vrp->buf_size;
>>>> +        void *cpu_addr = vrp->rx_bufs + i * vrp->rx_buf_size;
>>>>    -        rpmsg_sg_init(&sg, cpu_addr, vrp->buf_size);
>>>> +        rpmsg_sg_init(&sg, cpu_addr, vrp->rx_buf_size);
>>>>              err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr,
>>>>                          GFP_KERNEL);
>>>> @@ -965,8 +1055,8 @@ static int rpmsg_remove_device(struct device
>>>> *dev, void *data)
>>>>    static void rpmsg_remove(struct virtio_device *vdev)
>>>>    {
>>>>        struct virtproc_info *vrp = vdev->priv;
>>>> -    unsigned int num_bufs = vrp->num_rx_buf + vrp->num_tx_buf;
>>>> -    size_t total_buf_space = num_bufs * vrp->buf_size;
>>>> +    size_t total_buf_space = (vrp->num_rx_buf * vrp->rx_buf_size) +
>>>> +                 (vrp->num_tx_buf * vrp->tx_buf_size);
>>>>        int ret;
>>>>          virtio_reset_device(vdev);
>>>> @@ -992,6 +1082,7 @@ static struct virtio_device_id id_table[] = {
>>>>      static unsigned int features[] = {
>>>>        VIRTIO_RPMSG_F_NS,
>>>> +    VIRTIO_RPMSG_F_BUFSZ,
>>>>    };
>>>>      static struct virtio_driver virtio_ipc_driver = {
>>>> diff --git a/include/linux/rpmsg/virtio_rpmsg.h b/include/linux/rpmsg/
>>>> virtio_rpmsg.h
>>>> new file mode 100644
>>>> index 000000000000..7e14da68fd17
>>>> --- /dev/null
>>>> +++ b/include/linux/rpmsg/virtio_rpmsg.h
>>>> @@ -0,0 +1,50 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>>> +/*
>>>> + * Copyright (C) Pinecone Inc. 2019
>>>> + * Copyright (C) Xiang Xiao <xiaoxiang@pinecone.net>
>>>> + * Copyright (C) Advanced Micro Devices, Inc. 2026
>>>> + */
>>>> +
>>>> +#ifndef _LINUX_VIRTIO_RPMSG_H
>>>> +#define _LINUX_VIRTIO_RPMSG_H
>>>> +
>>>> +#include <linux/types.h>
>>>> +#include <linux/virtio_types.h>
>>>> +
>>>> +/* The feature bitmap for virtio rpmsg */
>>>> +#define VIRTIO_RPMSG_F_NS    0 /* RP supports name service
>>>> notifications */
>>>> +#define VIRTIO_RPMSG_F_BUFSZ    1 /* RP get buffer size from config
>>>> space */
>>>> +
>>>> +/* Version of struct virtio_rpmsg_config understood by this driver */
>>>> +#define RPMSG_VDEV_CONFIG_V1    1
>>>> +
>>>> +/**
>>>> + * struct virtio_rpmsg_config - config space for rpmsg virtio device
>>>> + *
>>>> + * @version:    version of this structure, currently
>>>> %RPMSG_VDEV_CONFIG_V1.
>>>> + * @reserved:    reserved for padding, must be zero.
>>>> + * @size:    size of this structure in bytes.
>>>> + * @rpmsg_buf_align:    required alignment in bytes for each buffer.
>>>> Must be a
>>>> + *        power of two so that both the buffer sizes and the TX buffer
>>>> + *        base address can be aligned (e.g. to a cache line).
>>>> + * @reserved1:    reserved for padding, must be zero. Keeps the
>>>> following 32-bit
>>>> + *        fields naturally aligned.
>>>> + * @txbuf_size:    Tx buf size from remote's view. For Linux this is
>>>> rx buf size.
>>>> + * @rxbuf_size:    Rx buf size from remote's view. For Linux this is
>>>> tx buf size.
>>>> + *
>>>> + * This is the configuration structure shared by the device and the
>>>> driver,
>>>> + * read when %VIRTIO_RPMSG_F_BUFSZ is negotiated. The fields are laid
>>>> out so
>>>> + * the structure is naturally 32-bit aligned.
>>>> + */
>>>> +struct virtio_rpmsg_config {
>>>> +    u8 version;
>>>> +    u8 reserved;
>>>
>>> What about defining the version type as u16 to avoid the reserved field?
>>>
>>>> +    __virtio16 size;
>>>> +    __virtio16 rpmsg_buf_align;
>>>> +    __virtio16 reserved1;
>>>
>>> Seems useless if __packed prevents the compiler from inserting extra
>>> padding
>>> bytes between fields,
>>>
>>>> +    /* The tx/rx individual buffer size (if VIRTIO_RPMSG_F_BUFSZ) */
>>>> +    __virtio32 txbuf_size;
>>>> +    __virtio32 rxbuf_size;
>>>> +} __packed;
>>>
>>> proposal
>>>
>>> +struct virtio_rpmsg_config {
>>> +    __virtio16 version;
>>> +    __virtio16 size;
>>> +    /* The tx/rx individual buffer size (if VIRTIO_RPMSG_F_BUFSZ) */
>>> +    __virtio32 txbuf_size;
>>> +    __virtio32 rxbuf_size;
>>> +    __virtio16 rpmsg_buf_align;
>>> +} __packed;
>>> +
>>>
>>
>> I am okay with the above proposal with minor difference:
>>
>> My proposal:
>>
>> +struct virtio_rpmsg_config {
>> +    u8 version;
>> +    __virtio16 size;
>> +    __virtio16 rpmsg_buf_align;
>> +    /* The tx/rx individual buffer size (if VIRTIO_RPMSG_F_BUFSZ) */
>> +    __virtio32 txbuf_size;
>> +    __virtio32 rxbuf_size;
>> +} __packed;
>>
>> I just want to keep version field 8-bit, as we will probably never use
>> upper byte of that field if we use 16-bit. Rest is okay. If the
>> structure is packed then reserved bytes are not needed.
>>
>> Please let me know your view.
> 
> No strong opinion on that. In the end, this structure is read only one
> time.
> If it is acceptable to Mathieu, it is acceptable to me.
> 
> Thanks,
> Arnaud
> 
>>
>> Thanks,
>> Tanmay
>>
>>
>>> Regards,
>>> Arnaud
>>>
>>>> +
>>>> +#endif /* _LINUX_VIRTIO_RPMSG_H */
>>>
>>
> 


^ permalink raw reply

* Re: [PATCH v3 05/12] x86/resctrl: Initialize supported kernel modes for PLZA
From: Babu Moger @ 2026-06-18 16:20 UTC (permalink / raw)
  To: Reinette Chatre, corbet, tony.luck, Dave.Martin, james.morse,
	tglx, bp, dave.hansen
  Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
	feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
	seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
	kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
	linux-kernel, eranian, peternewman
In-Reply-To: <283777e6-679f-4f02-8342-47b0349e92db@intel.com>

Hi Reinette,

On 6/16/26 18:35, Reinette Chatre wrote:
> Hi Babu,
> 
> On 4/30/26 4:24 PM, Babu Moger wrote:
>> Resctrl subsystem tracks which kernel-mode CLOSID/RMID policies the
>> platform can offer via struct resctrl_kmode_cfg and
>> resctrl_arch_get_kmode_support(). AMD PLZA (Privilege Level Zero
>> Association) is the x86 feature that allows kernel traffic to use an
>> assigned CLOSID alone or CLOSID and RMID together.
>>
>> Report the available kernel-modes when x86 PLZA is enabled.
>>
>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>> ---
>> v3: New patch to report all the supported kernel mode by arch.
>> ---
>>   arch/x86/kernel/cpu/resctrl/core.c | 15 +++++++++++++++
>>   1 file changed, 15 insertions(+)
>>
>> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
>> index 4a8717157e3e..699d8bb82875 100644
>> --- a/arch/x86/kernel/cpu/resctrl/core.c
>> +++ b/arch/x86/kernel/cpu/resctrl/core.c
>> @@ -894,6 +894,21 @@ bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
>>   	}
>>   }
>>   
>> +/**
>> + * resctrl_arch_get_kmode_support() - x86: record which kernel-mode policies hardware supports
>> + * @kcfg:	Cumulative snapshot; OR bits into @kcfg->kmode (see &struct resctrl_kmode_cfg).
> 
> If this is intended to be a cumulative snapshot this is a very subtle requirement
> for architectures to "do the right thing" here. To make this more robust I think it will be
> simpler if resctrl fs boots with resctrl_kcfg initialized to expected defaults.
> Instead of this callback resctrl can add resctrl_set_kmode_support(u32 kmodes)
> that the architecture *may* use to further initialize the kmodes supported by it. This
> function is implemented by resctrl fs, instead of architecture, and it can fail if
> architecture does not support INHERIT_CTRL_AND_MON. This will help to keep
> struct resctrl_kmode_cfg private to resctrl fs while enforcing any assumptions about
> which modes are required to be supported.

Yes, agreed. I will move resctrl_set_kmode_support() to the FS layer and 
have the architecture code invoke it when setting the kmodes.

That will make the struct resctrl_kmode_cfg private to FS layer.

Thanks
Babu


^ permalink raw reply

* Re: [PATCH v2 07/11] hugetlb: replace filemap_lock_hugetlb_folio with filemap_lock_folio
From: Usama Arif @ 2026-06-18 16:16 UTC (permalink / raw)
  To: Jane Chu
  Cc: Usama Arif, akpm, willy, jack, viro, brauner, muchun.song,
	osalvador, david, hughd, baolin.wang, linmiaohe, nao.horiguchi,
	lorenzo, rppt, peterx, corbet, linux-doc, linux-mm, linux-kernel,
	linux-fsdevel
In-Reply-To: <20260617172534.1740152-8-jane.chu@oracle.com>

On Wed, 17 Jun 2026 11:25:28 -0600 Jane Chu <jane.chu@oracle.com> wrote:

> The problem with filemap_lock_hugetlb_folio() is redundancy, replace
> it with the generic filemap_lock_folio().
> 
> Suggested-by: David Hildenbrand <david@kernel.org>
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
>  fs/hugetlbfs/inode.c    |  3 +--
>  include/linux/hugetlb.h | 12 ------------
>  mm/hugetlb.c            |  4 ++--
>  3 files changed, 3 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 02cb265a580e..6c883478f7e7 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -518,10 +518,9 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
>  					loff_t start,
>  					loff_t end)
>  {
> -	pgoff_t idx = start >> huge_page_shift(h);
>  	struct folio *folio;
>  
> -	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
> +	folio = filemap_lock_folio(mapping, start);

Do you need to do start >> PAGE_SHIFT over here?

>  	if (IS_ERR(folio))
>  		return;
>  
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index cae5cdd3ea00..e78d0f706681 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -824,12 +824,6 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
>  	return huge_page_size(h) / 512;
>  }
>  
> -static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
> -				struct address_space *mapping, pgoff_t idx)
> -{
> -	return filemap_lock_folio(mapping, idx << huge_page_order(h));
> -}
> -
>  #include <asm/hugetlb.h>
>  
>  #ifndef is_hugepage_only_range
> @@ -1096,12 +1090,6 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
>  	return NULL;
>  }
>  
> -static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
> -				struct address_space *mapping, pgoff_t idx)
> -{
> -	return NULL;
> -}
> -
>  static inline int isolate_or_dissolve_huge_folio(struct folio *folio,
>  						struct list_head *list)
>  {
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ecd1d1322fda..5484e78fe72e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5715,7 +5715,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
>  	 * before we get page_table_lock.
>  	 */
>  	new_folio = false;
> -	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
> +	folio = filemap_lock_folio(mapping, vmf->pgoff);
>  	if (IS_ERR(folio)) {
>  		size = i_size_read(mapping->host) >> PAGE_SHIFT;
>  		if (vmf->pgoff >= size)
> @@ -6201,7 +6201,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
>  
>  	if (is_continue) {
>  		ret = -EFAULT;
> -		folio = filemap_lock_hugetlb_folio(h, mapping, idx);
> +		folio = filemap_lock_folio(mapping, idx << huge_page_order(h));
>  		if (IS_ERR(folio))
>  			goto out;
>  		folio_in_pagecache = true;
> -- 
> 2.43.5
> 
> 

^ permalink raw reply

* Re: [PATCH v6 06/16] iio: core: create local __iio_chan_prefix_emit() for reuse
From: Rodrigo Alencar @ 2026-06-18 16:14 UTC (permalink / raw)
  To: Nuno Sá, rodrigo.alencar
  Cc: linux-iio, devicetree, linux-kernel, linux-doc, linux-hardening,
	Lars-Peter Clausen, Michael Hennerich, Jonathan Cameron,
	David Lechner, Andy Shevchenko, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Philipp Zabel, Jonathan Corbet, Shuah Khan,
	Kees Cook, Gustavo A. R. Silva
In-Reply-To: <ajQGTQ1_qcOwfzne@nsa>

On 18/06/26 16:06, Nuno Sá wrote:
> On Thu, Jun 18, 2026 at 02:27:22PM +0100, Rodrigo Alencar via B4 Relay wrote:
> > From: Rodrigo Alencar <rodrigo.alencar@analog.com>
> > 
> > Move logic to create a channel prefix for naming attribute files into a
> > separate __iio_chan_prefix_emit() function for reuse.

...

> > +static int __iio_chan_prefix_emit(const struct iio_chan_spec *chan,
> > +				  enum iio_shared_by shared_by,
> > +				  char *buf, size_t len)
> > +{
> > +	const char *dir = iio_direction[chan->output];
> > +	const char *type = iio_chan_type_name_spec[chan->type];
> > +	int n = 0;
> > +
> > +	switch (shared_by) {
> > +	case IIO_SHARED_BY_ALL:
> > +		buf[0] = '\0'; /* empty channel prefix */
> > +		break;
> > +	case IIO_SHARED_BY_DIR:
> > +		n = scnprintf(buf, len, "%s", dir);
> > +		break;
> > +	case IIO_SHARED_BY_TYPE:
> > +		n = scnprintf(buf, len, "%s_%s", dir, type);
> > +		if (chan->differential)
> > +			n += scnprintf(buf + n, len - n, "-%s", type);
> > +		break;
> > +	case IIO_SEPARATE:
> > +		if (chan->indexed) {
> > +			n = scnprintf(buf, len, "%s_%s%d", dir, type,
> > +				      chan->channel);
> > +			if (chan->differential)
> > +				n += scnprintf(buf + n, len - n, "-%s%d", type,
> > +					       chan->channel2);
> > +		} else {
> > +			if (chan->differential) {
> > +				WARN(1, "Differential channels must be indexed\n");
> > +				return -EINVAL;
> > +			}
> > +			n = scnprintf(buf, len, "%s_%s", dir, type);
> > +		}
> > +
> > +		if (chan->modified) {
> > +			if (chan->differential) {
> > +				WARN(1, "Differential channels can not have modifier\n");
> > +				return -EINVAL;
> 
> WARN() looks too much to me. dev_err() as we're treating it as such. I
> guess you don't want to pass struct device but not really an issue IMHO.

__iio_device_attr_init() also used WARN(), probably because it didn't have
access to a dev pointer. It would not be a problem to add an extra param.
 
> 
> > +			}
> > +			n += scnprintf(buf + n, len - n, "_%s",
> > +				       iio_modifier_names[chan->channel2]);
> > +		}
> > +
> > +		if (chan->extend_name)
> > +			n += scnprintf(buf + n, len - n, "_%s", chan->extend_name);
> > +		break;
> > +	}
> > +
> > +	if (n > 0 && n < len - 1) { /* prefix termination if not empty */
> > +		buf[n++] = '_';
> > +		buf[n] = '\0';
> > +	}
> > +
> 
> Can't we handle the above in the caller on kasprintf()? Then we could
> simplify and return in place.

I felt like doing this here would get a cleaner logic in the caller, which
would have to add the '_' conditionally.

> 
> > +	return n;
> > +}
> > +
> >  /**
> >   * iio_device_id() - query the unique ID for the device
> >   * @indio_dev:		Device structure whose ID is being queried
> > @@ -1100,106 +1159,19 @@ int __iio_device_attr_init(struct device_attribute *dev_attr,
> >  						size_t len),
> >  			   enum iio_shared_by shared_by)
> >  {
> > -	int ret = 0;
> > -	char *name = NULL;
> > -	char *full_postfix;
> > +	char prefix[NAME_MAX + 1];
> > +	int ret;
> >  
> >  	sysfs_attr_init(&dev_attr->attr);
> >  
> > -	/* Build up postfix of <extend_name>_<modifier>_postfix */
> > -	if (chan->modified && (shared_by == IIO_SEPARATE)) {
> > -		if (chan->extend_name)
> > -			full_postfix = kasprintf(GFP_KERNEL, "%s_%s_%s",
> > -						 iio_modifier_names[chan->channel2],
> > -						 chan->extend_name,
> > -						 postfix);
> > -		else
> > -			full_postfix = kasprintf(GFP_KERNEL, "%s_%s",
> > -						 iio_modifier_names[chan->channel2],
> > -						 postfix);
> > -	} else {
> > -		if (chan->extend_name == NULL || shared_by != IIO_SEPARATE)
> > -			full_postfix = kstrdup(postfix, GFP_KERNEL);
> > -		else
> > -			full_postfix = kasprintf(GFP_KERNEL,
> > -						 "%s_%s",
> > -						 chan->extend_name,
> > -						 postfix);
> > -	}
> > -	if (full_postfix == NULL)
> > +	ret = __iio_chan_prefix_emit(chan, shared_by, prefix, sizeof(prefix));
> > +	if (ret < 0)
> > +		return ret;
> > +
> > +	dev_attr->attr.name = kasprintf(GFP_KERNEL, "%s%s", prefix, postfix);
> > +	if (!dev_attr->attr.name)
> >  		return -ENOMEM;
> 
> I don't oppose the change. Looks like a nice cleanup. But bear in mind
> this is very sensitive as any subtle mistake means ABI breakage.

Yes! I tried to be careful... this is dangerous stuff!

-- 
Kind regards,

Rodrigo Alencar

^ permalink raw reply

* Re: [PATCH v3 05/13] cpu/hotplug: Reserve CPUHP states for nohz_full and managed IRQ down-paths
From: Thomas Gleixner @ 2026-06-18 16:06 UTC (permalink / raw)
  To: Jing Wu, Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Mathieu Desnoyers, Lai Jiangshan, Zqiang,
	Anna-Maria Behnsen, Tejun Heo, Jonathan Corbet, Shuah Khan,
	Shuah Khan
  Cc: linux-kernel, rcu, cgroups, linux-doc, linux-kselftest, Jing Wu,
	Qiliang Yuan
In-Reply-To: <20260618-wujing-dhm-v3-5-28f1a4d83b68@gmail.com>

On Thu, Jun 18 2026 at 11:11, Jing Wu wrote:
> Add CPUHP_AP_NO_HZ_FULL_DYING and CPUHP_AP_IRQ_AFFINITY_DYING to the
> cpuhp_state enum.  These dying callbacks are invoked during CPU offline
> before the tick is stopped, enabling clean tick handover and managed
> IRQ migration when a CPU transitions between isolated and housekeeping
> states.
>
> The existing CPUHP_AP_IRQ_AFFINITY_ONLINE already handles managed IRQ
> restoration on CPU online.  The new dying callback completes the pair,
> migrating managed interrupts away from the CPU before it goes down.

What? They are migrated away today already when the CPU goes down unless
the CPU is the last one in the affinity set of the interrupt. So why do
you need a new step for something which already exists?

> Subsequent patches register handlers for these states.
>
> Signed-off-by: Jing Wu <realwujing@gmail.com>
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>

This SOB chain is broken (in all patches). See Documentation/process/...

Thanks,

        tglx

^ permalink raw reply

* Re: [PATCH v6 06/16] iio: core: create local __iio_chan_prefix_emit() for reuse
From: Nuno Sá @ 2026-06-18 15:06 UTC (permalink / raw)
  To: rodrigo.alencar
  Cc: linux-iio, devicetree, linux-kernel, linux-doc, linux-hardening,
	Lars-Peter Clausen, Michael Hennerich, Jonathan Cameron,
	David Lechner, Andy Shevchenko, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Philipp Zabel, Jonathan Corbet, Shuah Khan,
	Kees Cook, Gustavo A. R. Silva
In-Reply-To: <20260618-ad9910-iio-driver-v6-6-79125ffbe430@analog.com>

On Thu, Jun 18, 2026 at 02:27:22PM +0100, Rodrigo Alencar via B4 Relay wrote:
> From: Rodrigo Alencar <rodrigo.alencar@analog.com>
> 
> Move logic to create a channel prefix for naming attribute files into a
> separate __iio_chan_prefix_emit() function for reuse.
> 
> Signed-off-by: Rodrigo Alencar <rodrigo.alencar@analog.com>
> ---
>  drivers/iio/industrialio-core.c | 167 ++++++++++++++++------------------------
>  1 file changed, 68 insertions(+), 99 deletions(-)
> 
> diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
> index 03019bf9327b..9373006235c8 100644
> --- a/drivers/iio/industrialio-core.c
> +++ b/drivers/iio/industrialio-core.c
> @@ -26,6 +26,7 @@
>  #include <linux/property.h>
>  #include <linux/sched.h>
>  #include <linux/slab.h>
> +#include <linux/sprintf.h>
>  #include <linux/wait.h>
>  
>  #include <linux/iio/buffer.h>
> @@ -199,6 +200,64 @@ static const char * const iio_chan_info_postfix[] = {
>  	[IIO_CHAN_INFO_CONVDELAY] = "convdelay",
>  	[IIO_CHAN_INFO_POWERFACTOR] = "powerfactor",
>  };
> +
> +static int __iio_chan_prefix_emit(const struct iio_chan_spec *chan,
> +				  enum iio_shared_by shared_by,
> +				  char *buf, size_t len)
> +{
> +	const char *dir = iio_direction[chan->output];
> +	const char *type = iio_chan_type_name_spec[chan->type];
> +	int n = 0;
> +
> +	switch (shared_by) {
> +	case IIO_SHARED_BY_ALL:
> +		buf[0] = '\0'; /* empty channel prefix */
> +		break;
> +	case IIO_SHARED_BY_DIR:
> +		n = scnprintf(buf, len, "%s", dir);
> +		break;
> +	case IIO_SHARED_BY_TYPE:
> +		n = scnprintf(buf, len, "%s_%s", dir, type);
> +		if (chan->differential)
> +			n += scnprintf(buf + n, len - n, "-%s", type);
> +		break;
> +	case IIO_SEPARATE:
> +		if (chan->indexed) {
> +			n = scnprintf(buf, len, "%s_%s%d", dir, type,
> +				      chan->channel);
> +			if (chan->differential)
> +				n += scnprintf(buf + n, len - n, "-%s%d", type,
> +					       chan->channel2);
> +		} else {
> +			if (chan->differential) {
> +				WARN(1, "Differential channels must be indexed\n");
> +				return -EINVAL;
> +			}
> +			n = scnprintf(buf, len, "%s_%s", dir, type);
> +		}
> +
> +		if (chan->modified) {
> +			if (chan->differential) {
> +				WARN(1, "Differential channels can not have modifier\n");
> +				return -EINVAL;

WARN() looks too much to me. dev_err() as we're treating it as such. I
guess you don't want to pass struct device but not really an issue IMHO.


> +			}
> +			n += scnprintf(buf + n, len - n, "_%s",
> +				       iio_modifier_names[chan->channel2]);
> +		}
> +
> +		if (chan->extend_name)
> +			n += scnprintf(buf + n, len - n, "_%s", chan->extend_name);
> +		break;
> +	}
> +
> +	if (n > 0 && n < len - 1) { /* prefix termination if not empty */
> +		buf[n++] = '_';
> +		buf[n] = '\0';
> +	}
> +

Can't we handle the above in the caller on kasprintf()? Then we could
simplify and return in place.

> +	return n;
> +}
> +
>  /**
>   * iio_device_id() - query the unique ID for the device
>   * @indio_dev:		Device structure whose ID is being queried
> @@ -1100,106 +1159,19 @@ int __iio_device_attr_init(struct device_attribute *dev_attr,
>  						size_t len),
>  			   enum iio_shared_by shared_by)
>  {
> -	int ret = 0;
> -	char *name = NULL;
> -	char *full_postfix;
> +	char prefix[NAME_MAX + 1];
> +	int ret;
>  
>  	sysfs_attr_init(&dev_attr->attr);
>  
> -	/* Build up postfix of <extend_name>_<modifier>_postfix */
> -	if (chan->modified && (shared_by == IIO_SEPARATE)) {
> -		if (chan->extend_name)
> -			full_postfix = kasprintf(GFP_KERNEL, "%s_%s_%s",
> -						 iio_modifier_names[chan->channel2],
> -						 chan->extend_name,
> -						 postfix);
> -		else
> -			full_postfix = kasprintf(GFP_KERNEL, "%s_%s",
> -						 iio_modifier_names[chan->channel2],
> -						 postfix);
> -	} else {
> -		if (chan->extend_name == NULL || shared_by != IIO_SEPARATE)
> -			full_postfix = kstrdup(postfix, GFP_KERNEL);
> -		else
> -			full_postfix = kasprintf(GFP_KERNEL,
> -						 "%s_%s",
> -						 chan->extend_name,
> -						 postfix);
> -	}
> -	if (full_postfix == NULL)
> +	ret = __iio_chan_prefix_emit(chan, shared_by, prefix, sizeof(prefix));
> +	if (ret < 0)
> +		return ret;
> +
> +	dev_attr->attr.name = kasprintf(GFP_KERNEL, "%s%s", prefix, postfix);
> +	if (!dev_attr->attr.name)
>  		return -ENOMEM;

I don't oppose the change. Looks like a nice cleanup. But bear in mind
this is very sensitive as any subtle mistake means ABI breakage.

- Nuno Sá

>  
> -	if (chan->differential) { /* Differential can not have modifier */
> -		switch (shared_by) {
> -		case IIO_SHARED_BY_ALL:
> -			name = kasprintf(GFP_KERNEL, "%s", full_postfix);
> -			break;
> -		case IIO_SHARED_BY_DIR:
> -			name = kasprintf(GFP_KERNEL, "%s_%s",
> -						iio_direction[chan->output],
> -						full_postfix);
> -			break;
> -		case IIO_SHARED_BY_TYPE:
> -			name = kasprintf(GFP_KERNEL, "%s_%s-%s_%s",
> -					    iio_direction[chan->output],
> -					    iio_chan_type_name_spec[chan->type],
> -					    iio_chan_type_name_spec[chan->type],
> -					    full_postfix);
> -			break;
> -		case IIO_SEPARATE:
> -			if (!chan->indexed) {
> -				WARN(1, "Differential channels must be indexed\n");
> -				ret = -EINVAL;
> -				goto error_free_full_postfix;
> -			}
> -			name = kasprintf(GFP_KERNEL,
> -					    "%s_%s%d-%s%d_%s",
> -					    iio_direction[chan->output],
> -					    iio_chan_type_name_spec[chan->type],
> -					    chan->channel,
> -					    iio_chan_type_name_spec[chan->type],
> -					    chan->channel2,
> -					    full_postfix);
> -			break;
> -		}
> -	} else { /* Single ended */
> -		switch (shared_by) {
> -		case IIO_SHARED_BY_ALL:
> -			name = kasprintf(GFP_KERNEL, "%s", full_postfix);
> -			break;
> -		case IIO_SHARED_BY_DIR:
> -			name = kasprintf(GFP_KERNEL, "%s_%s",
> -						iio_direction[chan->output],
> -						full_postfix);
> -			break;
> -		case IIO_SHARED_BY_TYPE:
> -			name = kasprintf(GFP_KERNEL, "%s_%s_%s",
> -					    iio_direction[chan->output],
> -					    iio_chan_type_name_spec[chan->type],
> -					    full_postfix);
> -			break;
> -
> -		case IIO_SEPARATE:
> -			if (chan->indexed)
> -				name = kasprintf(GFP_KERNEL, "%s_%s%d_%s",
> -						    iio_direction[chan->output],
> -						    iio_chan_type_name_spec[chan->type],
> -						    chan->channel,
> -						    full_postfix);
> -			else
> -				name = kasprintf(GFP_KERNEL, "%s_%s_%s",
> -						    iio_direction[chan->output],
> -						    iio_chan_type_name_spec[chan->type],
> -						    full_postfix);
> -			break;
> -		}
> -	}
> -	if (name == NULL) {
> -		ret = -ENOMEM;
> -		goto error_free_full_postfix;
> -	}
> -	dev_attr->attr.name = name;
> -
>  	if (readfunc) {
>  		dev_attr->attr.mode |= 0444;
>  		dev_attr->show = readfunc;
> @@ -1210,10 +1182,7 @@ int __iio_device_attr_init(struct device_attribute *dev_attr,
>  		dev_attr->store = writefunc;
>  	}
>  
> -error_free_full_postfix:
> -	kfree(full_postfix);
> -
> -	return ret;
> +	return 0;
>  }
>  
>  static void __iio_device_attr_deinit(struct device_attribute *dev_attr)
> 
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH v6 05/16] iio: core: support 64-bit register through debugfs
From: Nuno Sá @ 2026-06-18 14:45 UTC (permalink / raw)
  To: rodrigo.alencar
  Cc: linux-iio, devicetree, linux-kernel, linux-doc, linux-hardening,
	Lars-Peter Clausen, Michael Hennerich, Jonathan Cameron,
	David Lechner, Andy Shevchenko, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Philipp Zabel, Jonathan Corbet, Shuah Khan,
	Kees Cook, Gustavo A. R. Silva
In-Reply-To: <20260618-ad9910-iio-driver-v6-5-79125ffbe430@analog.com>

On Thu, Jun 18, 2026 at 02:27:21PM +0100, Rodrigo Alencar via B4 Relay wrote:
> From: Rodrigo Alencar <rodrigo.alencar@analog.com>
> 
> Add debugfs_reg64_access function pointer field into iio_info and modify
> file operation callbacks to favor 64-bit variant when it is available.
> 
> Signed-off-by: Rodrigo Alencar <rodrigo.alencar@analog.com>
> ---
>  drivers/iio/industrialio-core.c | 33 ++++++++++++++++++++++++---------
>  include/linux/iio/iio-opaque.h  |  2 +-
>  include/linux/iio/iio.h         |  4 ++++
>  3 files changed, 29 insertions(+), 10 deletions(-)
> 
> @@ -471,7 +485,8 @@ static void iio_device_register_debugfs(struct iio_dev *indio_dev)
>  {
>  	struct iio_dev_opaque *iio_dev_opaque;
>  
> -	if (indio_dev->info->debugfs_reg_access == NULL)
> +	if (!indio_dev->info->debugfs_reg_access &&
> +	    !indio_dev->info->debugfs_reg64_access)
>  		return;

Not really that important but should we dev_warn() in case someone gives
both callbacks? Can't use both anyways.

(We now have agentic help reviewing the code so maybe even if someone
does it for some reason it won't pass review :))

- Nuno Sá
>  
>  	if (!iio_debugfs_dentry)
> diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
> index b87841a355f8..98330385e08d 100644
> --- a/include/linux/iio/iio-opaque.h
> +++ b/include/linux/iio/iio-opaque.h
> @@ -73,7 +73,7 @@ struct iio_dev_opaque {
>  #if defined(CONFIG_DEBUG_FS)
>  	struct dentry			*debugfs_dentry;
>  	unsigned int			cached_reg_addr;
> -	char				read_buf[20];
> +	char				read_buf[24];
>  	unsigned int			read_buf_len;
>  #endif
>  };
> diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
> index 711c00f67371..1c7d12af22da 100644
> --- a/include/linux/iio/iio.h
> +++ b/include/linux/iio/iio.h
> @@ -484,6 +484,7 @@ struct iio_trigger; /* forward declaration */
>   * @update_scan_mode:	function to configure device and scan buffer when
>   *			channels have changed
>   * @debugfs_reg_access:	function to read or write register value of device
> + * @debugfs_reg64_access: function to read or write 64-bit register value of device
>   * @fwnode_xlate:	fwnode based function pointer to obtain channel specifier index.
>   * @hwfifo_set_watermark: function pointer to set the current hardware
>   *			fifo watermark level; see hwfifo_* entries in
> @@ -572,6 +573,9 @@ struct iio_info {
>  	int (*debugfs_reg_access)(struct iio_dev *indio_dev,
>  				  unsigned int reg, unsigned int writeval,
>  				  unsigned int *readval);
> +	int (*debugfs_reg64_access)(struct iio_dev *indio_dev,
> +				    unsigned int reg, u64 writeval,
> +				    u64 *readval);
>  	int (*fwnode_xlate)(struct iio_dev *indio_dev,
>  			    const struct fwnode_reference_args *iiospec);
>  	int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned int val);
> 
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [RFC PATCH 0/2] kasan: hw_tags: Add option to tag only at allocation time
From: Harry Yoo @ 2026-06-18 14:05 UTC (permalink / raw)
  To: Dev Jain, ryabinin.a.a, akpm, corbet
  Cc: glider, andreyknvl, dvyukov, vincenzo.frascino, kasan-dev,
	linux-mm, linux-kernel, skhan, workflows, linux-doc,
	linux-arm-kernel, ryan.roberts, anshuman.khandual, kaleshsingh,
	21cnbao, david, will, catalin.marinas
In-Reply-To: <b1502a60-09a1-4699-886b-93d041de7023@kernel.org>



On 6/18/26 10:35 PM, Harry Yoo wrote:
> 
> Hi Dev,
> 
> On 6/12/26 1:44 PM, Dev Jain wrote:
>> Introduce a boot option to tag only at allocation time of the objects. This
>> reduces KASAN MTE overhead, the tradeoff being reduced ability of
>> catching bugs.
> 
> I think most of overhead when enabling MTE comes from loading and
> validating tags for every memory access (either in SYNC or ASYNC mode),
> rather than from storing tags.

Is there any reason not to use STGM instead of STG + DC GVA when
setting/clearing tags for large sizes when we know they are properly
aligned?

>> Now, when a memory object will be freed, it will retain the random tag it
>> had at allocation time. This compromises on catching UAF bugs, till the
>> time the object is not reallocated, at which point it will have a new
>> random tag.
>>
>> Hence, not catching "use-after-free-before-reallocation" and not catching
>> "double-free" will be the compromise for reduced KASAN overhead.
> 
> I doubt users who care about security enough to enable HW_TAGS KASAN
> are willing to compromise on security just to save a few instructions
> to store tags in the free path.
> 
> To me, it looks like too much of a compromise on security for little
> performance gain.
> 
>> This is an RFC because we are not clear about the performance benefit.
>>
>> Android folks, please help with testing!
>>
>> ---
>> Applies on Linus master (9716c086c8e8).
>>
>> Dev Jain (2):
>>   kasan: hw_tags: Use KASAN_PAGE_REDZONE for vmalloc redzoning
>>   kasan: hw_tags: Add boot option to elide free time poisoning
>>
>>  Documentation/dev-tools/kasan.rst |  4 +++
>>  mm/kasan/hw_tags.c                | 45 +++++++++++++++++++++++++++++--
>>  mm/kasan/kasan.h                  | 23 +++++++++++++++-
>>  3 files changed, 69 insertions(+), 3 deletions(-)
>>
> 

-- 
Cheers,
Harry / Hyeonggon


^ permalink raw reply

* Re: [RFC PATCH 0/2] kasan: hw_tags: Add option to tag only at allocation time
From: Ryan Roberts @ 2026-06-18 13:48 UTC (permalink / raw)
  To: Dev Jain, ryabinin.a.a, akpm, corbet
  Cc: glider, andreyknvl, dvyukov, vincenzo.frascino, kasan-dev,
	linux-mm, linux-kernel, skhan, workflows, linux-doc,
	linux-arm-kernel, anshuman.khandual, kaleshsingh, 21cnbao, david,
	will, catalin.marinas
In-Reply-To: <20260612044425.763060-1-dev.jain@arm.com>

On 12/06/2026 05:44, Dev Jain wrote:
> Introduce a boot option to tag only at allocation time of the objects. This
> reduces KASAN MTE overhead, the tradeoff being reduced ability of
> catching bugs.
> 
> Now, when a memory object will be freed, it will retain the random tag it
> had at allocation time. This compromises on catching UAF bugs, till the
> time the object is not reallocated, at which point it will have a new
> random tag.
> 
> Hence, not catching "use-after-free-before-reallocation" and not catching
> "double-free" will be the compromise for reduced KASAN overhead.

Does standard KASAN with HW_TAGS really detect double-free? How does it do that?
I could imagine it testing the tags of memory being freed to see if they are set
to the poison tag, but that would lead to false positives for the GFP_SKIP_KASAN
case, surely?

If I'm right, then the only downgrade this new mode causes is that if
freed-but-not-yet-reallocated memory is accessed via its dangling pointer, then
that bad access is not detected. I think that would be benign in all the cases I
can think of, so while it would be a problem for a debugging use case, it is
unlikely to be a problem for security enforcement?

Thanks,
Ryan


> 
> This is an RFC because we are not clear about the performance benefit.
> 
> Android folks, please help with testing!
> 
> ---
> Applies on Linus master (9716c086c8e8).
> 
> Dev Jain (2):
>   kasan: hw_tags: Use KASAN_PAGE_REDZONE for vmalloc redzoning
>   kasan: hw_tags: Add boot option to elide free time poisoning
> 
>  Documentation/dev-tools/kasan.rst |  4 +++
>  mm/kasan/hw_tags.c                | 45 +++++++++++++++++++++++++++++--
>  mm/kasan/kasan.h                  | 23 +++++++++++++++-
>  3 files changed, 69 insertions(+), 3 deletions(-)
> 


^ permalink raw reply

* Re: [RFC PATCH 0/2] kasan: hw_tags: Add option to tag only at allocation time
From: Harry Yoo @ 2026-06-18 13:35 UTC (permalink / raw)
  To: Dev Jain, ryabinin.a.a, akpm, corbet
  Cc: glider, andreyknvl, dvyukov, vincenzo.frascino, kasan-dev,
	linux-mm, linux-kernel, skhan, workflows, linux-doc,
	linux-arm-kernel, ryan.roberts, anshuman.khandual, kaleshsingh,
	21cnbao, david, will, catalin.marinas
In-Reply-To: <20260612044425.763060-1-dev.jain@arm.com>


[-- Attachment #1.1: Type: text/plain, Size: 1655 bytes --]


Hi Dev,

On 6/12/26 1:44 PM, Dev Jain wrote:
> Introduce a boot option to tag only at allocation time of the objects. This
> reduces KASAN MTE overhead, the tradeoff being reduced ability of
> catching bugs.

I think most of the overhead when enabling MTE comes from loading and
validating tags for every memory access (either in SYNC or ASYNC mode),
rather than from storing tags.

> Now, when a memory object will be freed, it will retain the random tag it
> had at allocation time. This compromises on catching UAF bugs, till the
> time the object is not reallocated, at which point it will have a new
> random tag.
> 
> Hence, not catching "use-after-free-before-reallocation" and not catching
> "double-free" will be the compromise for reduced KASAN overhead.

I doubt users who care about security enough to enable HW_TAGS KASAN
are willing to compromise on security just to save a few instructions
to store tags in the free path.

To me, it looks like too much of a compromise on security for little
performance gain.

> This is an RFC because we are not clear about the performance benefit.
>
> Android folks, please help with testing!
> 
> ---
> Applies on Linus master (9716c086c8e8).
> 
> Dev Jain (2):
>   kasan: hw_tags: Use KASAN_PAGE_REDZONE for vmalloc redzoning
>   kasan: hw_tags: Add boot option to elide free time poisoning
> 
>  Documentation/dev-tools/kasan.rst |  4 +++
>  mm/kasan/hw_tags.c                | 45 +++++++++++++++++++++++++++++--
>  mm/kasan/kasan.h                  | 23 +++++++++++++++-
>  3 files changed, 69 insertions(+), 3 deletions(-)
> 

-- 
Cheers,
Harry / Hyeonggon

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v6 01/12] PCI: liveupdate: Set up FLB handler for the PCI core
From: Pranjal Shrivastava @ 2026-06-18 13:30 UTC (permalink / raw)
  To: David Matlack
  Cc: Pasha Tatashin, Mike Rapoport, kexec, linux-doc, linux-kernel,
	linux-mm, linux-pci, Adithya Jayachandran, Alexander Graf,
	Alex Williamson, Bjorn Helgaas, Chris Li, David Rientjes,
	Jacob Pan, Jason Gunthorpe, Jonathan Corbet, Josh Hilke,
	Leon Romanovsky, Lukas Wunner, Parav Pandit, Pratyush Yadav,
	Saeed Mahameed, Samiullah Khawaja, Shuah Khan, Vipin Sharma,
	William Tu, Yi Liu
In-Reply-To: <ajB6V6yBHOjgK5ew@google.com>

On Mon, Jun 15, 2026 at 10:19:03PM +0000, David Matlack wrote:
> On 2026-06-12 10:47 AM, Pasha Tatashin wrote:
> > On 2026-06-12 09:54:44+03:00, Mike Rapoport wrote:
> > > On Fri, Jun 12, 2026 at 05:15:02AM +0000, Pasha Tatashin wrote:
> > > 
> > > > On Fri, 22 May 2026 20:23:59 +0000, David Matlack <dmatlack@google.com> wrote:
> > > > 
> > > > Please add Pratyush, Mike, and myself so we are notified directly of 
> > > > incoming patches, the same as with other areas where the liveupdate/ 
> > > > tree is specified.
> > > 
> > > Or we can add PCI liveupdate files to LIVEUPDATE entry.
> > 
> > That will not work, as we cannot serve as maintainers for 
> > PCI/VFIO/IOMMU/KVM, etc. David Matlack will be the maintainer for the 
> > PCI components, and we will accept patches once they have been approved 
> > by him.
> > 
> > The simplification we could do is to create an email alias 
> > for the live-update tree maintainers. This would allow us to use a 
> > single entry instead of listing all three of us individually.
> 
> We could create a Live Update mailing list for all code that can be CCed
> on all patches that must be merged through the Live Update tree. I would
> also be interested in subscribing to that list.

+1. I'd like it if there were a specific Live Update mailing list for
submissions & discussion about the Live Update tree.

Thanks,
Praan

^ permalink raw reply

* Re: [RFC PATCH net-next v8 11/12] net: pcs: airoha: add PCS driver for Airoha AN7581 SoC
From: Benjamin Larsson @ 2026-06-18 13:30 UTC (permalink / raw)
  To: Christian Marangi, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Simon Horman, Jonathan Corbet, Shuah Khan,
	Lorenzo Bianconi, Heiner Kallweit, Russell King, Saravana Kannan,
	Philipp Zabel, Nathan Chancellor, Nick Desaulniers, Bill Wendling,
	Justin Stitt, netdev, devicetree, linux-kernel, linux-doc,
	linux-arm-kernel, linux-mediatek, llvm, Maxime Chevallier
In-Reply-To: <20260618125752.1223-12-ansuelsmth@gmail.com>

Hi.

On 18/06/2026 14:57, Christian Marangi wrote:
> Add PCS driver for Airoha AN7581 SoC for Ethernet/PON/PCIe/USB SERDES
> and permit usage of external PHY or connected SFP cage. Supported modes
> are USXGMII, 10G-BASER, 2500BASE-X, 1000BASE-X and SGMII.
> 
> The driver probe and register the various needed registers and register as
> a PCS provider for fwnode usage.
> 
> Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
> ---
>   drivers/net/pcs/Kconfig                    |    2 +
>   drivers/net/pcs/Makefile                   |    2 +
>   drivers/net/pcs/airoha/Kconfig             |   12 +
>   drivers/net/pcs/airoha/Makefile            |    7 +
>   drivers/net/pcs/airoha/pcs-airoha-common.c | 1324 +++++++++++++
>   drivers/net/pcs/airoha/pcs-airoha.h        | 1311 ++++++++++++
>   drivers/net/pcs/airoha/pcs-an7581.c        | 2093 ++++++++++++++++++++
>   7 files changed, 4751 insertions(+)
>   create mode 100644 drivers/net/pcs/airoha/Kconfig
>   create mode 100644 drivers/net/pcs/airoha/Makefile
>   create mode 100644 drivers/net/pcs/airoha/pcs-airoha-common.c
>   create mode 100644 drivers/net/pcs/airoha/pcs-airoha.h
>   create mode 100644 drivers/net/pcs/airoha/pcs-an7581.c
My comment that the files should be renamed now instead of later when 
support for other Airoha platforms is added still stands. The common 
code is not common among other platforms (EN7523, for example).

MvH
Benjamin Larsson

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox