public inbox for linux-perf-users@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] tools: perf: add comm_ignore_digit column
@ 2026-03-16 17:56 Stephen Brennan
  2026-03-17 16:14 ` Ian Rogers
  0 siblings, 1 reply; 5+ messages in thread
From: Stephen Brennan @ 2026-03-16 17:56 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim
  Cc: Adrian Hunter, Ricky Ringler, Alexander Shishkin, Stephen Brennan,
	linux-perf-users, Ian Rogers, Jiri Olsa, Mark Rutland,
	linux-kernel, Tianyou Li, James Clark

The "comm" column allows grouping events by the process command. It is
intended to group like programs, despite having different PIDs. But some
workloads may adjust their own command, so that a unique identifier
(e.g. a PID or some other numeric value) is part of the command name.
This destroys the utility of "comm", forcing perf to place each unique
process name into its own bucket, which can contribute to a
combinatorial explosion of memory use in perf report.

Create a less strict version of this column, which ignores digits when
comparing command names. This allows "similar looking" processes to
again be placed in the same bucket.

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
---

Changes from v1:
- Rebased on linux-tools-next. Resolved conflict with commit cbd41c6d4c26c
  ("perf report: Update sort key state from -F option").

v1: https://lore.kernel.org/linux-perf-users/20260305181847.3249498-1-stephen.s.brennan@oracle.com/

 tools/perf/util/hist.c |  1 +
 tools/perf/util/hist.h |  1 +
 tools/perf/util/sort.c | 92 +++++++++++++++++++++++++++++++++++++++++-
 tools/perf/util/sort.h |  1 +
 4 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 7ffaa3d9851b4..a6e566dda3a15 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -110,6 +110,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 	len = thread__comm_len(h->thread);
 	if (hists__new_col_len(hists, HISTC_COMM, len))
 		hists__set_col_len(hists, HISTC_THREAD, len + 8);
+	hists__new_col_len(hists, HISTC_COMM_IGNORE_DIGIT, len);
 
 	if (h->ms.map) {
 		len = dso__name_len(map__dso(h->ms.map));
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 1d5ea632ca4e1..ae7e98bd9e46d 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -44,6 +44,7 @@ enum hist_column {
 	HISTC_THREAD,
 	HISTC_TGID,
 	HISTC_COMM,
+	HISTC_COMM_IGNORE_DIGIT,
 	HISTC_CGROUP_ID,
 	HISTC_CGROUP,
 	HISTC_PARENT,
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 42d5cd7ef4e23..45662302ed5ba 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <ctype.h>
 #include <errno.h>
 #include <inttypes.h>
 #include <regex.h>
@@ -265,6 +266,93 @@ struct sort_entry sort_comm = {
 	.se_width_idx	= HISTC_COMM,
 };
 
+/* --sort comm_ignore_digit */
+
+static int64_t strcmp_nodigit(const char *left, const char *right)
+{
+	for (;;) {
+		while (*left && isdigit(*left))
+			left++;
+		while (*right && isdigit(*right))
+			right++;
+		if (*left == *right && !*left) {
+			return 0;
+		} else if (*left == *right) {
+			left++;
+			right++;
+		} else {
+			return (int64_t)*left - (int64_t)*right;
+		}
+	}
+}
+
+static int64_t
+sort__comm_ignore_digit_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
+}
+
+static int64_t
+sort__comm_ignore_digit_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
+}
+
+static int64_t
+sort__comm_ignore_digit_sort(struct hist_entry *left, struct hist_entry *right)
+{
+	return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
+}
+
+static int hist_entry__comm_ignore_digit_snprintf(struct hist_entry *he, char *bf,
+						size_t size, unsigned int width)
+{
+	int ret = 0;
+	unsigned int print_len, printed = 0, start = 0, end = 0;
+	bool in_digit;
+	const char *comm = comm__str(he->comm), *print;
+
+	while (printed < width && printed < size && comm[start]) {
+		in_digit = !!isdigit(comm[start]);
+		end = start + 1;
+		while (comm[end] && !!isdigit(comm[end]) == in_digit)
+			end++;
+		if (in_digit) {
+			print_len = 3; /* <N> */
+			print = "<N>";
+		} else {
+			print_len = end - start;
+			print = &comm[start];
+		}
+		print_len = min(print_len, width - printed);
+		ret = repsep_snprintf(bf + printed, size - printed, "%-.*s",
+					print_len, print);
+		if (ret < 0)
+			return ret;
+		start = end;
+		printed += ret;
+	}
+	/* Pad to width if necessary */
+	if (printed < width && printed < size) {
+		ret = repsep_snprintf(bf + printed, size - printed, "%-*.*s",
+				       width - printed, width - printed, "");
+		if (ret < 0)
+			return ret;
+		printed += ret;
+	}
+	return printed;
+}
+
+struct sort_entry sort_comm_ignore_digit = {
+	.se_header	= "CommandIgnoreDigit",
+	.se_cmp		= sort__comm_ignore_digit_cmp,
+	.se_collapse	= sort__comm_ignore_digit_collapse,
+	.se_sort	= sort__comm_ignore_digit_sort,
+	.se_snprintf	= hist_entry__comm_ignore_digit_snprintf,
+	.se_filter	= hist_entry__thread_filter,
+	.se_width_idx	= HISTC_COMM_IGNORE_DIGIT,
+};
+
 /* --sort dso */
 
 static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
@@ -2583,6 +2671,7 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_PID, "pid", sort_thread),
 	DIM(SORT_TGID, "tgid", sort_tgid),
 	DIM(SORT_COMM, "comm", sort_comm),
+	DIM(SORT_COMM_IGNORE_DIGIT, "comm_ignore_digit", sort_comm_ignore_digit),
 	DIM(SORT_DSO, "dso", sort_dso),
 	DIM(SORT_SYM, "symbol", sort_sym),
 	DIM(SORT_PARENT, "parent", sort_parent),
@@ -3577,7 +3666,7 @@ static int __sort_dimension__update(struct sort_dimension *sd,
 		list->socket = 1;
 	} else if (sd->entry == &sort_thread) {
 		list->thread = 1;
-	} else if (sd->entry == &sort_comm) {
+	} else if (sd->entry == &sort_comm || sd->entry == &sort_comm_ignore_digit) {
 		list->comm = 1;
 	} else if (sd->entry == &sort_type_offset) {
 		symbol_conf.annotate_data_member = true;
@@ -4040,6 +4129,7 @@ static bool get_elide(int idx, FILE *output)
 	case HISTC_DSO:
 		return __get_elide(symbol_conf.dso_list, "dso", output);
 	case HISTC_COMM:
+	case HISTC_COMM_IGNORE_DIGIT:
 		return __get_elide(symbol_conf.comm_list, "comm", output);
 	default:
 		break;
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index d7787958e06b9..6819934b4d48a 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -43,6 +43,7 @@ enum sort_type {
 	/* common sort keys */
 	SORT_PID,
 	SORT_COMM,
+	SORT_COMM_IGNORE_DIGIT,
 	SORT_DSO,
 	SORT_SYM,
 	SORT_PARENT,
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] tools: perf: add comm_ignore_digit column
  2026-03-16 17:56 [PATCH v2] tools: perf: add comm_ignore_digit column Stephen Brennan
@ 2026-03-17 16:14 ` Ian Rogers
  2026-03-19 21:29   ` Namhyung Kim
  2026-03-20 22:04   ` Stephen Brennan
  0 siblings, 2 replies; 5+ messages in thread
From: Ian Rogers @ 2026-03-17 16:14 UTC (permalink / raw)
  To: Stephen Brennan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Adrian Hunter, Ricky Ringler, Alexander Shishkin,
	linux-perf-users, Jiri Olsa, Mark Rutland, linux-kernel,
	Tianyou Li, James Clark

On Mon, Mar 16, 2026 at 10:56 AM Stephen Brennan
<stephen.s.brennan@oracle.com> wrote:
>
> The "comm" column allows grouping events by the process command. It is
> intended to group like programs, despite having different PIDs. But some
> workloads may adjust their own command, so that a unique identifier
> (e.g. a PID or some other numeric value) is part of the command name.
> This destroys the utility of "comm", forcing perf to place each unique
> process name into its own bucket, which can contribute to a
> combinatorial explosion of memory use in perf report.
>
> Create a less strict version of this column, which ignores digits when
> comparing command names. This allows "similar looking" processes to
> again be placed in the same bucket.
>
> Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
> ---
>
> Changes from v1:
> - Rebased on linux-tools-next. Resolved conflict with commit cbd41c6d4c26c
>   ("perf report: Update sort key state from -F option").
>
> v1: https://lore.kernel.org/linux-perf-users/20260305181847.3249498-1-stephen.s.brennan@oracle.com/
>
>  tools/perf/util/hist.c |  1 +
>  tools/perf/util/hist.h |  1 +
>  tools/perf/util/sort.c | 92 +++++++++++++++++++++++++++++++++++++++++-
>  tools/perf/util/sort.h |  1 +
>  4 files changed, 94 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> index 7ffaa3d9851b4..a6e566dda3a15 100644
> --- a/tools/perf/util/hist.c
> +++ b/tools/perf/util/hist.c
> @@ -110,6 +110,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
>         len = thread__comm_len(h->thread);
>         if (hists__new_col_len(hists, HISTC_COMM, len))
>                 hists__set_col_len(hists, HISTC_THREAD, len + 8);
> +       hists__new_col_len(hists, HISTC_COMM_IGNORE_DIGIT, len);

Thanks for working on this! Sashiko is noting some nits in its review:
https://sashiko.dev/#/patchset/20260316175631.3169955-1-stephen.s.brennan%40oracle.com

Because hist_entry__comm_ignore_digit_snprintf() replaces consecutive digits
with the 3-character string <N>, the formatted string is often longer than
the original string (e.g., a command named worker_1 expands from 8 to 10
characters: worker_<N>).

Since the snprintf logic strictly bounds the output to the calculated width,
will the expanded output be prematurely truncated in reports?

>
>         if (h->ms.map) {
>                 len = dso__name_len(map__dso(h->ms.map));
> diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> index 1d5ea632ca4e1..ae7e98bd9e46d 100644
> --- a/tools/perf/util/hist.h
> +++ b/tools/perf/util/hist.h
> @@ -44,6 +44,7 @@ enum hist_column {
>         HISTC_THREAD,
>         HISTC_TGID,
>         HISTC_COMM,
> +       HISTC_COMM_IGNORE_DIGIT,
>         HISTC_CGROUP_ID,
>         HISTC_CGROUP,
>         HISTC_PARENT,
> diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
> index 42d5cd7ef4e23..45662302ed5ba 100644
> --- a/tools/perf/util/sort.c
> +++ b/tools/perf/util/sort.c
> @@ -1,4 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0
> +#include <ctype.h>
>  #include <errno.h>
>  #include <inttypes.h>
>  #include <regex.h>
> @@ -265,6 +266,93 @@ struct sort_entry sort_comm = {
>         .se_width_idx   = HISTC_COMM,
>  };
>
> +/* --sort comm_ignore_digit */
> +
> +static int64_t strcmp_nodigit(const char *left, const char *right)
> +{
> +       for (;;) {
> +               while (*left && isdigit(*left))
> +                       left++;
> +               while (*right && isdigit(*right))
> +                       right++;
> +               if (*left == *right && !*left) {
> +                       return 0;
> +               } else if (*left == *right) {
> +                       left++;
> +                       right++;
> +               } else {
> +                       return (int64_t)*left - (int64_t)*right;
> +               }
> +       }
> +}

Sashiko also notes what's below, but I sent out a patch to make all
chars unsigned (like the kernel):
https://lore.kernel.org/lkml/20260306191908.2065682-1-irogers@google.com/
but it has no reviews.

Process command strings can contain arbitrary non-ASCII bytes (e.g., UTF-8
characters). On architectures where char is signed, these bytes evaluate as
negative integers.

Passing a negative value (other than EOF) to ctype.h's isdigit() causes
undefined behavior. In glibc, it results in an out-of-bounds read on the
__ctype_b_loc array, which can lead to a segmentation fault.

Additionally, if char is signed, returning (int64_t)*left - (int64_t)*right
means non-ASCII characters will mathematically sort before ASCII characters.

Does this code need to cast the characters to unsigned char before passing
them to isdigit() and subtracting them?

Thanks,
Ian


> +
> +static int64_t
> +sort__comm_ignore_digit_cmp(struct hist_entry *left, struct hist_entry *right)
> +{
> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
> +}
> +
> +static int64_t
> +sort__comm_ignore_digit_collapse(struct hist_entry *left, struct hist_entry *right)
> +{
> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
> +}
> +
> +static int64_t
> +sort__comm_ignore_digit_sort(struct hist_entry *left, struct hist_entry *right)
> +{
> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
> +}
> +
> +static int hist_entry__comm_ignore_digit_snprintf(struct hist_entry *he, char *bf,
> +                                               size_t size, unsigned int width)
> +{
> +       int ret = 0;
> +       unsigned int print_len, printed = 0, start = 0, end = 0;
> +       bool in_digit;
> +       const char *comm = comm__str(he->comm), *print;
> +
> +       while (printed < width && printed < size && comm[start]) {
> +               in_digit = !!isdigit(comm[start]);
> +               end = start + 1;
> +               while (comm[end] && !!isdigit(comm[end]) == in_digit)
> +                       end++;
> +               if (in_digit) {
> +                       print_len = 3; /* <N> */
> +                       print = "<N>";
> +               } else {
> +                       print_len = end - start;
> +                       print = &comm[start];
> +               }
> +               print_len = min(print_len, width - printed);
> +               ret = repsep_snprintf(bf + printed, size - printed, "%-.*s",
> +                                       print_len, print);
> +               if (ret < 0)
> +                       return ret;
> +               start = end;
> +               printed += ret;
> +       }
> +       /* Pad to width if necessary */
> +       if (printed < width && printed < size) {
> +               ret = repsep_snprintf(bf + printed, size - printed, "%-*.*s",
> +                                      width - printed, width - printed, "");
> +               if (ret < 0)
> +                       return ret;
> +               printed += ret;
> +       }
> +       return printed;
> +}
> +
> +struct sort_entry sort_comm_ignore_digit = {
> +       .se_header      = "CommandIgnoreDigit",
> +       .se_cmp         = sort__comm_ignore_digit_cmp,
> +       .se_collapse    = sort__comm_ignore_digit_collapse,
> +       .se_sort        = sort__comm_ignore_digit_sort,
> +       .se_snprintf    = hist_entry__comm_ignore_digit_snprintf,
> +       .se_filter      = hist_entry__thread_filter,
> +       .se_width_idx   = HISTC_COMM_IGNORE_DIGIT,
> +};
> +
>  /* --sort dso */
>
>  static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
> @@ -2583,6 +2671,7 @@ static struct sort_dimension common_sort_dimensions[] = {
>         DIM(SORT_PID, "pid", sort_thread),
>         DIM(SORT_TGID, "tgid", sort_tgid),
>         DIM(SORT_COMM, "comm", sort_comm),
> +       DIM(SORT_COMM_IGNORE_DIGIT, "comm_ignore_digit", sort_comm_ignore_digit),
>         DIM(SORT_DSO, "dso", sort_dso),
>         DIM(SORT_SYM, "symbol", sort_sym),
>         DIM(SORT_PARENT, "parent", sort_parent),
> @@ -3577,7 +3666,7 @@ static int __sort_dimension__update(struct sort_dimension *sd,
>                 list->socket = 1;
>         } else if (sd->entry == &sort_thread) {
>                 list->thread = 1;
> -       } else if (sd->entry == &sort_comm) {
> +       } else if (sd->entry == &sort_comm || sd->entry == &sort_comm_ignore_digit) {
>                 list->comm = 1;
>         } else if (sd->entry == &sort_type_offset) {
>                 symbol_conf.annotate_data_member = true;
> @@ -4040,6 +4129,7 @@ static bool get_elide(int idx, FILE *output)
>         case HISTC_DSO:
>                 return __get_elide(symbol_conf.dso_list, "dso", output);
>         case HISTC_COMM:
> +       case HISTC_COMM_IGNORE_DIGIT:
>                 return __get_elide(symbol_conf.comm_list, "comm", output);
>         default:
>                 break;
> diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
> index d7787958e06b9..6819934b4d48a 100644
> --- a/tools/perf/util/sort.h
> +++ b/tools/perf/util/sort.h
> @@ -43,6 +43,7 @@ enum sort_type {
>         /* common sort keys */
>         SORT_PID,
>         SORT_COMM,
> +       SORT_COMM_IGNORE_DIGIT,
>         SORT_DSO,
>         SORT_SYM,
>         SORT_PARENT,
> --
> 2.47.3
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] tools: perf: add comm_ignore_digit column
  2026-03-17 16:14 ` Ian Rogers
@ 2026-03-19 21:29   ` Namhyung Kim
  2026-03-20 22:07     ` Stephen Brennan
  2026-03-20 22:04   ` Stephen Brennan
  1 sibling, 1 reply; 5+ messages in thread
From: Namhyung Kim @ 2026-03-19 21:29 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Stephen Brennan, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Adrian Hunter, Ricky Ringler,
	Alexander Shishkin, linux-perf-users, Jiri Olsa, Mark Rutland,
	linux-kernel, Tianyou Li, James Clark

Hello,

On Tue, Mar 17, 2026 at 09:14:50AM -0700, Ian Rogers wrote:
> On Mon, Mar 16, 2026 at 10:56 AM Stephen Brennan
> <stephen.s.brennan@oracle.com> wrote:
> >
> > The "comm" column allows grouping events by the process command. It is
> > intended to group like programs, despite having different PIDs. But some
> > workloads may adjust their own command, so that a unique identifier
> > (e.g. a PID or some other numeric value) is part of the command name.
> > This destroys the utility of "comm", forcing perf to place each unique
> > process name into its own bucket, which can contribute to a
> > combinatorial explosion of memory use in perf report.
> >
> > Create a less strict version of this column, which ignores digits when
> > comparing command names. This allows "similar looking" processes to
> > again be placed in the same bucket.

Can you please include an example of this change so that others can
notice what's going on?

Actually I'm not happy with the name "comm_ignore_digit".  While it's
intuitive, it would be nice if we have a shorter name.  But maybe we
can just go with it unless someone has a better idea.

> >
> > Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
> > ---
> >
> > Changes from v1:
> > - Rebased on linux-tools-next. Resolved conflict with commit cbd41c6d4c26c
> >   ("perf report: Update sort key state from -F option").
> >
> > v1: https://lore.kernel.org/linux-perf-users/20260305181847.3249498-1-stephen.s.brennan@oracle.com/
> >
> >  tools/perf/util/hist.c |  1 +
> >  tools/perf/util/hist.h |  1 +
> >  tools/perf/util/sort.c | 92 +++++++++++++++++++++++++++++++++++++++++-
> >  tools/perf/util/sort.h |  1 +
> >  4 files changed, 94 insertions(+), 1 deletion(-)

Also please update the man page in the Documentation.

> >
> > diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> > index 7ffaa3d9851b4..a6e566dda3a15 100644
> > --- a/tools/perf/util/hist.c
> > +++ b/tools/perf/util/hist.c
> > @@ -110,6 +110,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
> >         len = thread__comm_len(h->thread);
> >         if (hists__new_col_len(hists, HISTC_COMM, len))
> >                 hists__set_col_len(hists, HISTC_THREAD, len + 8);
> > +       hists__new_col_len(hists, HISTC_COMM_IGNORE_DIGIT, len);
> 
> Thanks for working on this! Sashiko is noting some nits in its review:
> https://sashiko.dev/#/patchset/20260316175631.3169955-1-stephen.s.brennan%40oracle.com
> 
> Because hist_entry__comm_ignore_digit_snprintf() replaces consecutive digits
> with the 3-character string <N>, the formatted string is often longer than
> the original string (e.g., a command named worker_1 expands from 8 to 10
> characters: worker_<N>).
> 
> Since the snprintf logic strictly bounds the output to the calculated width,
> will the expanded output be prematurely truncated in reports?

Right.  You can add perf_hpp_list.comm_ignore_digit field and update it
the column length properly only if it's set.

Thanks,
Namhyung

> 
> >
> >         if (h->ms.map) {
> >                 len = dso__name_len(map__dso(h->ms.map));
> > diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> > index 1d5ea632ca4e1..ae7e98bd9e46d 100644
> > --- a/tools/perf/util/hist.h
> > +++ b/tools/perf/util/hist.h
> > @@ -44,6 +44,7 @@ enum hist_column {
> >         HISTC_THREAD,
> >         HISTC_TGID,
> >         HISTC_COMM,
> > +       HISTC_COMM_IGNORE_DIGIT,
> >         HISTC_CGROUP_ID,
> >         HISTC_CGROUP,
> >         HISTC_PARENT,
> > diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
> > index 42d5cd7ef4e23..45662302ed5ba 100644
> > --- a/tools/perf/util/sort.c
> > +++ b/tools/perf/util/sort.c
> > @@ -1,4 +1,5 @@
> >  // SPDX-License-Identifier: GPL-2.0
> > +#include <ctype.h>
> >  #include <errno.h>
> >  #include <inttypes.h>
> >  #include <regex.h>
> > @@ -265,6 +266,93 @@ struct sort_entry sort_comm = {
> >         .se_width_idx   = HISTC_COMM,
> >  };
> >
> > +/* --sort comm_ignore_digit */
> > +
> > +static int64_t strcmp_nodigit(const char *left, const char *right)
> > +{
> > +       for (;;) {
> > +               while (*left && isdigit(*left))
> > +                       left++;
> > +               while (*right && isdigit(*right))
> > +                       right++;
> > +               if (*left == *right && !*left) {
> > +                       return 0;
> > +               } else if (*left == *right) {
> > +                       left++;
> > +                       right++;
> > +               } else {
> > +                       return (int64_t)*left - (int64_t)*right;
> > +               }
> > +       }
> > +}
> 
> Sashiko also notes what's below, but I sent out a patch to make all
> chars unsigned (like the kernel):
> https://lore.kernel.org/lkml/20260306191908.2065682-1-irogers@google.com/
> but it has no reviews.
> 
> Process command strings can contain arbitrary non-ASCII bytes (e.g., UTF-8
> characters). On architectures where char is signed, these bytes evaluate as
> negative integers.
> 
> Passing a negative value (other than EOF) to ctype.h's isdigit() causes
> undefined behavior. In glibc, it results in an out-of-bounds read on the
> __ctype_b_loc array, which can lead to a segmentation fault.
> 
> Additionally, if char is signed, returning (int64_t)*left - (int64_t)*right
> means non-ASCII characters will mathematically sort before ASCII characters.
> 
> Does this code need to cast the characters to unsigned char before passing
> them to isdigit() and subtracting them?
> 
> Thanks,
> Ian
> 
> 
> > +
> > +static int64_t
> > +sort__comm_ignore_digit_cmp(struct hist_entry *left, struct hist_entry *right)
> > +{
> > +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
> > +}
> > +
> > +static int64_t
> > +sort__comm_ignore_digit_collapse(struct hist_entry *left, struct hist_entry *right)
> > +{
> > +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
> > +}
> > +
> > +static int64_t
> > +sort__comm_ignore_digit_sort(struct hist_entry *left, struct hist_entry *right)
> > +{
> > +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
> > +}
> > +
> > +static int hist_entry__comm_ignore_digit_snprintf(struct hist_entry *he, char *bf,
> > +                                               size_t size, unsigned int width)
> > +{
> > +       int ret = 0;
> > +       unsigned int print_len, printed = 0, start = 0, end = 0;
> > +       bool in_digit;
> > +       const char *comm = comm__str(he->comm), *print;
> > +
> > +       while (printed < width && printed < size && comm[start]) {
> > +               in_digit = !!isdigit(comm[start]);
> > +               end = start + 1;
> > +               while (comm[end] && !!isdigit(comm[end]) == in_digit)
> > +                       end++;
> > +               if (in_digit) {
> > +                       print_len = 3; /* <N> */
> > +                       print = "<N>";
> > +               } else {
> > +                       print_len = end - start;
> > +                       print = &comm[start];
> > +               }
> > +               print_len = min(print_len, width - printed);
> > +               ret = repsep_snprintf(bf + printed, size - printed, "%-.*s",
> > +                                       print_len, print);
> > +               if (ret < 0)
> > +                       return ret;
> > +               start = end;
> > +               printed += ret;
> > +       }
> > +       /* Pad to width if necessary */
> > +       if (printed < width && printed < size) {
> > +               ret = repsep_snprintf(bf + printed, size - printed, "%-*.*s",
> > +                                      width - printed, width - printed, "");
> > +               if (ret < 0)
> > +                       return ret;
> > +               printed += ret;
> > +       }
> > +       return printed;
> > +}
> > +
> > +struct sort_entry sort_comm_ignore_digit = {
> > +       .se_header      = "CommandIgnoreDigit",
> > +       .se_cmp         = sort__comm_ignore_digit_cmp,
> > +       .se_collapse    = sort__comm_ignore_digit_collapse,
> > +       .se_sort        = sort__comm_ignore_digit_sort,
> > +       .se_snprintf    = hist_entry__comm_ignore_digit_snprintf,
> > +       .se_filter      = hist_entry__thread_filter,
> > +       .se_width_idx   = HISTC_COMM_IGNORE_DIGIT,
> > +};
> > +
> >  /* --sort dso */
> >
> >  static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
> > @@ -2583,6 +2671,7 @@ static struct sort_dimension common_sort_dimensions[] = {
> >         DIM(SORT_PID, "pid", sort_thread),
> >         DIM(SORT_TGID, "tgid", sort_tgid),
> >         DIM(SORT_COMM, "comm", sort_comm),
> > +       DIM(SORT_COMM_IGNORE_DIGIT, "comm_ignore_digit", sort_comm_ignore_digit),
> >         DIM(SORT_DSO, "dso", sort_dso),
> >         DIM(SORT_SYM, "symbol", sort_sym),
> >         DIM(SORT_PARENT, "parent", sort_parent),
> > @@ -3577,7 +3666,7 @@ static int __sort_dimension__update(struct sort_dimension *sd,
> >                 list->socket = 1;
> >         } else if (sd->entry == &sort_thread) {
> >                 list->thread = 1;
> > -       } else if (sd->entry == &sort_comm) {
> > +       } else if (sd->entry == &sort_comm || sd->entry == &sort_comm_ignore_digit) {
> >                 list->comm = 1;
> >         } else if (sd->entry == &sort_type_offset) {
> >                 symbol_conf.annotate_data_member = true;
> > @@ -4040,6 +4129,7 @@ static bool get_elide(int idx, FILE *output)
> >         case HISTC_DSO:
> >                 return __get_elide(symbol_conf.dso_list, "dso", output);
> >         case HISTC_COMM:
> > +       case HISTC_COMM_IGNORE_DIGIT:
> >                 return __get_elide(symbol_conf.comm_list, "comm", output);
> >         default:
> >                 break;
> > diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
> > index d7787958e06b9..6819934b4d48a 100644
> > --- a/tools/perf/util/sort.h
> > +++ b/tools/perf/util/sort.h
> > @@ -43,6 +43,7 @@ enum sort_type {
> >         /* common sort keys */
> >         SORT_PID,
> >         SORT_COMM,
> > +       SORT_COMM_IGNORE_DIGIT,
> >         SORT_DSO,
> >         SORT_SYM,
> >         SORT_PARENT,
> > --
> > 2.47.3
> >

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] tools: perf: add comm_ignore_digit column
  2026-03-17 16:14 ` Ian Rogers
  2026-03-19 21:29   ` Namhyung Kim
@ 2026-03-20 22:04   ` Stephen Brennan
  1 sibling, 0 replies; 5+ messages in thread
From: Stephen Brennan @ 2026-03-20 22:04 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Adrian Hunter, Ricky Ringler, Alexander Shishkin,
	linux-perf-users, Jiri Olsa, Mark Rutland, linux-kernel,
	Tianyou Li, James Clark



On 3/17/26 9:14 AM, Ian Rogers wrote:
> On Mon, Mar 16, 2026 at 10:56 AM Stephen Brennan
> <stephen.s.brennan@oracle.com> wrote:
>>
>> The "comm" column allows grouping events by the process command. It is
>> intended to group like programs, despite having different PIDs. But some
>> workloads may adjust their own command, so that a unique identifier
>> (e.g. a PID or some other numeric value) is part of the command name.
>> This destroys the utility of "comm", forcing perf to place each unique
>> process name into its own bucket, which can contribute to a
>> combinatorial explosion of memory use in perf report.
>>
>> Create a less strict version of this column, which ignores digits when
>> comparing command names. This allows "similar looking" processes to
>> again be placed in the same bucket.
>>
>> Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
>> ---
>>
>> Changes from v1:
>> - Rebased on linux-tools-next. Resolved conflict with commit cbd41c6d4c26c
>>   ("perf report: Update sort key state from -F option").
>>
>> v1: https://lore.kernel.org/linux-perf-users/20260305181847.3249498-1-stephen.s.brennan@oracle.com/
>>
>>  tools/perf/util/hist.c |  1 +
>>  tools/perf/util/hist.h |  1 +
>>  tools/perf/util/sort.c | 92 +++++++++++++++++++++++++++++++++++++++++-
>>  tools/perf/util/sort.h |  1 +
>>  4 files changed, 94 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
>> index 7ffaa3d9851b4..a6e566dda3a15 100644
>> --- a/tools/perf/util/hist.c
>> +++ b/tools/perf/util/hist.c
>> @@ -110,6 +110,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
>>         len = thread__comm_len(h->thread);
>>         if (hists__new_col_len(hists, HISTC_COMM, len))
>>                 hists__set_col_len(hists, HISTC_THREAD, len + 8);
>> +       hists__new_col_len(hists, HISTC_COMM_IGNORE_DIGIT, len);
> 
> Thanks for working on this! Sashiko is noting some nits in its review:
> https://sashiko.dev/#/patchset/20260316175631.3169955-1-stephen.s.brennan%40oracle.com
> 
> Because hist_entry__comm_ignore_digit_snprintf() replaces consecutive digits
> with the 3-character string <N>, the formatted string is often longer than
> the original string (e.g., a command named worker_1 expands from 8 to 10
> characters: worker_<N>).
> 
> Since the snprintf logic strictly bounds the output to the calculated width,
> will the expanded output be prematurely truncated in reports?

Yeah, thanks for this. I think I didn't really notice because most of
the numeric commands I saw were shorter than the common max of 15.

>>
>>         if (h->ms.map) {
>>                 len = dso__name_len(map__dso(h->ms.map));
>> diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
>> index 1d5ea632ca4e1..ae7e98bd9e46d 100644
>> --- a/tools/perf/util/hist.h
>> +++ b/tools/perf/util/hist.h
>> @@ -44,6 +44,7 @@ enum hist_column {
>>         HISTC_THREAD,
>>         HISTC_TGID,
>>         HISTC_COMM,
>> +       HISTC_COMM_IGNORE_DIGIT,
>>         HISTC_CGROUP_ID,
>>         HISTC_CGROUP,
>>         HISTC_PARENT,
>> diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
>> index 42d5cd7ef4e23..45662302ed5ba 100644
>> --- a/tools/perf/util/sort.c
>> +++ b/tools/perf/util/sort.c
>> @@ -1,4 +1,5 @@
>>  // SPDX-License-Identifier: GPL-2.0
>> +#include <ctype.h>
>>  #include <errno.h>
>>  #include <inttypes.h>
>>  #include <regex.h>
>> @@ -265,6 +266,93 @@ struct sort_entry sort_comm = {
>>         .se_width_idx   = HISTC_COMM,
>>  };
>>
>> +/* --sort comm_ignore_digit */
>> +
>> +static int64_t strcmp_nodigit(const char *left, const char *right)
>> +{
>> +       for (;;) {
>> +               while (*left && isdigit(*left))
>> +                       left++;
>> +               while (*right && isdigit(*right))
>> +                       right++;
>> +               if (*left == *right && !*left) {
>> +                       return 0;
>> +               } else if (*left == *right) {
>> +                       left++;
>> +                       right++;
>> +               } else {
>> +                       return (int64_t)*left - (int64_t)*right;
>> +               }
>> +       }
>> +}
> 
> Sashiko also notes what's below, but I sent out a patch to make all
> chars unsigned (like the kernel):
> https://lore.kernel.org/lkml/20260306191908.2065682-1-irogers@google.com/
> but it has no reviews.

That makes sense. I've encountered the unsigned char nature of the
kernel in drgn development [1]. It's so strange that such a fundamental
concept is left unspecified in C! :/

I'd be happy to test that patch out and share a reviewed-by assuming I
don't encounter any issues. For whatever that's worth.

[1]: https://github.com/osandov/drgn/issues/569

> Process command strings can contain arbitrary non-ASCII bytes (e.g., UTF-8
> characters). On architectures where char is signed, these bytes evaluate as
> negative integers.
> 
> Passing a negative value (other than EOF) to ctype.h's isdigit() causes
> undefined behavior. In glibc, it results in an out-of-bounds read on the
> __ctype_b_loc array, which can lead to a segmentation fault.

I was curious and tried some simple programs that call isdigit() with a
negative number. Large magnitude negative numbers do indeed trigger a
segfault.

From what I can see in the glibc source code, the table is explicitly
sized and offset to allow the values -128 to -1 to be indexed safely, to
make the call safe for signed chars.

Of course at the end of the day this was just to satisfy my curiosity.
It is UB and should be avoided.

> Additionally, if char is signed, returning (int64_t)*left - (int64_t)*right
> means non-ASCII characters will mathematically sort before ASCII characters.
> 
> Does this code need to cast the characters to unsigned char before passing
> them to isdigit() and subtracting them?

Yeah, that seems right too. It's not great but it matches the use of
unsigned characters documented in strcmp(3).

> Thanks,
> Ian

Thank you!
Stephen

> 
>> +
>> +static int64_t
>> +sort__comm_ignore_digit_cmp(struct hist_entry *left, struct hist_entry *right)
>> +{
>> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
>> +}
>> +
>> +static int64_t
>> +sort__comm_ignore_digit_collapse(struct hist_entry *left, struct hist_entry *right)
>> +{
>> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
>> +}
>> +
>> +static int64_t
>> +sort__comm_ignore_digit_sort(struct hist_entry *left, struct hist_entry *right)
>> +{
>> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
>> +}
>> +
>> +static int hist_entry__comm_ignore_digit_snprintf(struct hist_entry *he, char *bf,
>> +                                               size_t size, unsigned int width)
>> +{
>> +       int ret = 0;
>> +       unsigned int print_len, printed = 0, start = 0, end = 0;
>> +       bool in_digit;
>> +       const char *comm = comm__str(he->comm), *print;
>> +
>> +       while (printed < width && printed < size && comm[start]) {
>> +               in_digit = !!isdigit(comm[start]);
>> +               end = start + 1;
>> +               while (comm[end] && !!isdigit(comm[end]) == in_digit)
>> +                       end++;
>> +               if (in_digit) {
>> +                       print_len = 3; /* <N> */
>> +                       print = "<N>";
>> +               } else {
>> +                       print_len = end - start;
>> +                       print = &comm[start];
>> +               }
>> +               print_len = min(print_len, width - printed);
>> +               ret = repsep_snprintf(bf + printed, size - printed, "%-.*s",
>> +                                       print_len, print);
>> +               if (ret < 0)
>> +                       return ret;
>> +               start = end;
>> +               printed += ret;
>> +       }
>> +       /* Pad to width if necessary */
>> +       if (printed < width && printed < size) {
>> +               ret = repsep_snprintf(bf + printed, size - printed, "%-*.*s",
>> +                                      width - printed, width - printed, "");
>> +               if (ret < 0)
>> +                       return ret;
>> +               printed += ret;
>> +       }
>> +       return printed;
>> +}
>> +
>> +struct sort_entry sort_comm_ignore_digit = {
>> +       .se_header      = "CommandIgnoreDigit",
>> +       .se_cmp         = sort__comm_ignore_digit_cmp,
>> +       .se_collapse    = sort__comm_ignore_digit_collapse,
>> +       .se_sort        = sort__comm_ignore_digit_sort,
>> +       .se_snprintf    = hist_entry__comm_ignore_digit_snprintf,
>> +       .se_filter      = hist_entry__thread_filter,
>> +       .se_width_idx   = HISTC_COMM_IGNORE_DIGIT,
>> +};
>> +
>>  /* --sort dso */
>>
>>  static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
>> @@ -2583,6 +2671,7 @@ static struct sort_dimension common_sort_dimensions[] = {
>>         DIM(SORT_PID, "pid", sort_thread),
>>         DIM(SORT_TGID, "tgid", sort_tgid),
>>         DIM(SORT_COMM, "comm", sort_comm),
>> +       DIM(SORT_COMM_IGNORE_DIGIT, "comm_ignore_digit", sort_comm_ignore_digit),
>>         DIM(SORT_DSO, "dso", sort_dso),
>>         DIM(SORT_SYM, "symbol", sort_sym),
>>         DIM(SORT_PARENT, "parent", sort_parent),
>> @@ -3577,7 +3666,7 @@ static int __sort_dimension__update(struct sort_dimension *sd,
>>                 list->socket = 1;
>>         } else if (sd->entry == &sort_thread) {
>>                 list->thread = 1;
>> -       } else if (sd->entry == &sort_comm) {
>> +       } else if (sd->entry == &sort_comm || sd->entry == &sort_comm_ignore_digit) {
>>                 list->comm = 1;
>>         } else if (sd->entry == &sort_type_offset) {
>>                 symbol_conf.annotate_data_member = true;
>> @@ -4040,6 +4129,7 @@ static bool get_elide(int idx, FILE *output)
>>         case HISTC_DSO:
>>                 return __get_elide(symbol_conf.dso_list, "dso", output);
>>         case HISTC_COMM:
>> +       case HISTC_COMM_IGNORE_DIGIT:
>>                 return __get_elide(symbol_conf.comm_list, "comm", output);
>>         default:
>>                 break;
>> diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
>> index d7787958e06b9..6819934b4d48a 100644
>> --- a/tools/perf/util/sort.h
>> +++ b/tools/perf/util/sort.h
>> @@ -43,6 +43,7 @@ enum sort_type {
>>         /* common sort keys */
>>         SORT_PID,
>>         SORT_COMM,
>> +       SORT_COMM_IGNORE_DIGIT,
>>         SORT_DSO,
>>         SORT_SYM,
>>         SORT_PARENT,
>> --
>> 2.47.3
>>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] tools: perf: add comm_ignore_digit column
  2026-03-19 21:29   ` Namhyung Kim
@ 2026-03-20 22:07     ` Stephen Brennan
  0 siblings, 0 replies; 5+ messages in thread
From: Stephen Brennan @ 2026-03-20 22:07 UTC (permalink / raw)
  To: Namhyung Kim, Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Adrian Hunter, Ricky Ringler, Alexander Shishkin,
	linux-perf-users, Jiri Olsa, Mark Rutland, linux-kernel,
	Tianyou Li, James Clark



On 3/19/26 2:29 PM, Namhyung Kim wrote:
> Hello,
> 
> On Tue, Mar 17, 2026 at 09:14:50AM -0700, Ian Rogers wrote:
>> On Mon, Mar 16, 2026 at 10:56 AM Stephen Brennan
>> <stephen.s.brennan@oracle.com> wrote:
>>>
>>> The "comm" column allows grouping events by the process command. It is
>>> intended to group like programs, despite having different PIDs. But some
>>> workloads may adjust their own command, so that a unique identifier
>>> (e.g. a PID or some other numeric value) is part of the command name.
>>> This destroys the utility of "comm", forcing perf to place each unique
>>> process name into its own bucket, which can contribute to a
>>> combinatorial explosion of memory use in perf report.
>>>
>>> Create a less strict version of this column, which ignores digits when
>>> comparing command names. This allows "similar looking" processes to
>>> again be placed in the same bucket.
> 
> Can you please include an example of this change so that others can
> notice what's going on?
> 
> Actually I'm not happy with the name "comm_ignore_digit".  While it's
> intuitive, it would be nice if we have a shorter name.  But maybe we
> can just go with it unless someone has a better idea.

Hi Namhyung,

That's totally fair. Would "comm_nodigit" work for you? It's at least a
bit smaller and still clear. I wouldn't want to do something terribly
abbreviated like "comm_nd" or something, because I don't think it would
be very intuitive for readers unfamiliar with this patch.

Acknowledged on your other points, I'll send an update with these
changes. Worst case we can bikeshed on the name again on that revision :D

Thanks,
Stephen

>>>
>>> Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
>>> ---
>>>
>>> Changes from v1:
>>> - Rebased on linux-tools-next. Resolved conflict with commit cbd41c6d4c26c
>>>   ("perf report: Update sort key state from -F option").
>>>
>>> v1: https://lore.kernel.org/linux-perf-users/20260305181847.3249498-1-stephen.s.brennan@oracle.com/
>>>
>>>  tools/perf/util/hist.c |  1 +
>>>  tools/perf/util/hist.h |  1 +
>>>  tools/perf/util/sort.c | 92 +++++++++++++++++++++++++++++++++++++++++-
>>>  tools/perf/util/sort.h |  1 +
>>>  4 files changed, 94 insertions(+), 1 deletion(-)
> 
> Also please update the man page in the Documentation.
> 
>>>
>>> diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
>>> index 7ffaa3d9851b4..a6e566dda3a15 100644
>>> --- a/tools/perf/util/hist.c
>>> +++ b/tools/perf/util/hist.c
>>> @@ -110,6 +110,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
>>>         len = thread__comm_len(h->thread);
>>>         if (hists__new_col_len(hists, HISTC_COMM, len))
>>>                 hists__set_col_len(hists, HISTC_THREAD, len + 8);
>>> +       hists__new_col_len(hists, HISTC_COMM_IGNORE_DIGIT, len);
>>
>> Thanks for working on this! Sashiko is noting some nits in its review:
>> https://sashiko.dev/#/patchset/20260316175631.3169955-1-stephen.s.brennan%40oracle.com
>>
>> Because hist_entry__comm_ignore_digit_snprintf() replaces consecutive digits
>> with the 3-character string <N>, the formatted string is often longer than
>> the original string (e.g., a command named worker_1 expands from 8 to 10
>> characters: worker_<N>).
>>
>> Since the snprintf logic strictly bounds the output to the calculated width,
>> will the expanded output be prematurely truncated in reports?
> 
> Right.  You can add perf_hpp_list.comm_ignore_digit field and update it
> the column length properly only if it's set.
> 
> Thanks,
> Namhyung
> 
>>
>>>
>>>         if (h->ms.map) {
>>>                 len = dso__name_len(map__dso(h->ms.map));
>>> diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
>>> index 1d5ea632ca4e1..ae7e98bd9e46d 100644
>>> --- a/tools/perf/util/hist.h
>>> +++ b/tools/perf/util/hist.h
>>> @@ -44,6 +44,7 @@ enum hist_column {
>>>         HISTC_THREAD,
>>>         HISTC_TGID,
>>>         HISTC_COMM,
>>> +       HISTC_COMM_IGNORE_DIGIT,
>>>         HISTC_CGROUP_ID,
>>>         HISTC_CGROUP,
>>>         HISTC_PARENT,
>>> diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
>>> index 42d5cd7ef4e23..45662302ed5ba 100644
>>> --- a/tools/perf/util/sort.c
>>> +++ b/tools/perf/util/sort.c
>>> @@ -1,4 +1,5 @@
>>>  // SPDX-License-Identifier: GPL-2.0
>>> +#include <ctype.h>
>>>  #include <errno.h>
>>>  #include <inttypes.h>
>>>  #include <regex.h>
>>> @@ -265,6 +266,93 @@ struct sort_entry sort_comm = {
>>>         .se_width_idx   = HISTC_COMM,
>>>  };
>>>
>>> +/* --sort comm_ignore_digit */
>>> +
>>> +static int64_t strcmp_nodigit(const char *left, const char *right)
>>> +{
>>> +       for (;;) {
>>> +               while (*left && isdigit(*left))
>>> +                       left++;
>>> +               while (*right && isdigit(*right))
>>> +                       right++;
>>> +               if (*left == *right && !*left) {
>>> +                       return 0;
>>> +               } else if (*left == *right) {
>>> +                       left++;
>>> +                       right++;
>>> +               } else {
>>> +                       return (int64_t)*left - (int64_t)*right;
>>> +               }
>>> +       }
>>> +}
>>
>> Sashiko also notes what's below, but I sent out a patch to make all
>> chars unsigned (like the kernel):
>> https://lore.kernel.org/lkml/20260306191908.2065682-1-irogers@google.com/
>> but it has no reviews.
>>
>> Process command strings can contain arbitrary non-ASCII bytes (e.g., UTF-8
>> characters). On architectures where char is signed, these bytes evaluate as
>> negative integers.
>>
>> Passing a negative value (other than EOF) to ctype.h's isdigit() causes
>> undefined behavior. In glibc, it results in an out-of-bounds read on the
>> __ctype_b_loc array, which can lead to a segmentation fault.
>>
>> Additionally, if char is signed, returning (int64_t)*left - (int64_t)*right
>> means non-ASCII characters will mathematically sort before ASCII characters.
>>
>> Does this code need to cast the characters to unsigned char before passing
>> them to isdigit() and subtracting them?
>>
>> Thanks,
>> Ian
>>
>>
>>> +
>>> +static int64_t
>>> +sort__comm_ignore_digit_cmp(struct hist_entry *left, struct hist_entry *right)
>>> +{
>>> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
>>> +}
>>> +
>>> +static int64_t
>>> +sort__comm_ignore_digit_collapse(struct hist_entry *left, struct hist_entry *right)
>>> +{
>>> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
>>> +}
>>> +
>>> +static int64_t
>>> +sort__comm_ignore_digit_sort(struct hist_entry *left, struct hist_entry *right)
>>> +{
>>> +       return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
>>> +}
>>> +
>>> +static int hist_entry__comm_ignore_digit_snprintf(struct hist_entry *he, char *bf,
>>> +                                               size_t size, unsigned int width)
>>> +{
>>> +       int ret = 0;
>>> +       unsigned int print_len, printed = 0, start = 0, end = 0;
>>> +       bool in_digit;
>>> +       const char *comm = comm__str(he->comm), *print;
>>> +
>>> +       while (printed < width && printed < size && comm[start]) {
>>> +               in_digit = !!isdigit(comm[start]);
>>> +               end = start + 1;
>>> +               while (comm[end] && !!isdigit(comm[end]) == in_digit)
>>> +                       end++;
>>> +               if (in_digit) {
>>> +                       print_len = 3; /* <N> */
>>> +                       print = "<N>";
>>> +               } else {
>>> +                       print_len = end - start;
>>> +                       print = &comm[start];
>>> +               }
>>> +               print_len = min(print_len, width - printed);
>>> +               ret = repsep_snprintf(bf + printed, size - printed, "%-.*s",
>>> +                                       print_len, print);
>>> +               if (ret < 0)
>>> +                       return ret;
>>> +               start = end;
>>> +               printed += ret;
>>> +       }
>>> +       /* Pad to width if necessary */
>>> +       if (printed < width && printed < size) {
>>> +               ret = repsep_snprintf(bf + printed, size - printed, "%-*.*s",
>>> +                                      width - printed, width - printed, "");
>>> +               if (ret < 0)
>>> +                       return ret;
>>> +               printed += ret;
>>> +       }
>>> +       return printed;
>>> +}
>>> +
>>> +struct sort_entry sort_comm_ignore_digit = {
>>> +       .se_header      = "CommandIgnoreDigit",
>>> +       .se_cmp         = sort__comm_ignore_digit_cmp,
>>> +       .se_collapse    = sort__comm_ignore_digit_collapse,
>>> +       .se_sort        = sort__comm_ignore_digit_sort,
>>> +       .se_snprintf    = hist_entry__comm_ignore_digit_snprintf,
>>> +       .se_filter      = hist_entry__thread_filter,
>>> +       .se_width_idx   = HISTC_COMM_IGNORE_DIGIT,
>>> +};
>>> +
>>>  /* --sort dso */
>>>
>>>  static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
>>> @@ -2583,6 +2671,7 @@ static struct sort_dimension common_sort_dimensions[] = {
>>>         DIM(SORT_PID, "pid", sort_thread),
>>>         DIM(SORT_TGID, "tgid", sort_tgid),
>>>         DIM(SORT_COMM, "comm", sort_comm),
>>> +       DIM(SORT_COMM_IGNORE_DIGIT, "comm_ignore_digit", sort_comm_ignore_digit),
>>>         DIM(SORT_DSO, "dso", sort_dso),
>>>         DIM(SORT_SYM, "symbol", sort_sym),
>>>         DIM(SORT_PARENT, "parent", sort_parent),
>>> @@ -3577,7 +3666,7 @@ static int __sort_dimension__update(struct sort_dimension *sd,
>>>                 list->socket = 1;
>>>         } else if (sd->entry == &sort_thread) {
>>>                 list->thread = 1;
>>> -       } else if (sd->entry == &sort_comm) {
>>> +       } else if (sd->entry == &sort_comm || sd->entry == &sort_comm_ignore_digit) {
>>>                 list->comm = 1;
>>>         } else if (sd->entry == &sort_type_offset) {
>>>                 symbol_conf.annotate_data_member = true;
>>> @@ -4040,6 +4129,7 @@ static bool get_elide(int idx, FILE *output)
>>>         case HISTC_DSO:
>>>                 return __get_elide(symbol_conf.dso_list, "dso", output);
>>>         case HISTC_COMM:
>>> +       case HISTC_COMM_IGNORE_DIGIT:
>>>                 return __get_elide(symbol_conf.comm_list, "comm", output);
>>>         default:
>>>                 break;
>>> diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
>>> index d7787958e06b9..6819934b4d48a 100644
>>> --- a/tools/perf/util/sort.h
>>> +++ b/tools/perf/util/sort.h
>>> @@ -43,6 +43,7 @@ enum sort_type {
>>>         /* common sort keys */
>>>         SORT_PID,
>>>         SORT_COMM,
>>> +       SORT_COMM_IGNORE_DIGIT,
>>>         SORT_DSO,
>>>         SORT_SYM,
>>>         SORT_PARENT,
>>> --
>>> 2.47.3
>>>


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-03-20 22:08 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-16 17:56 [PATCH v2] tools: perf: add comm_ignore_digit column Stephen Brennan
2026-03-17 16:14 ` Ian Rogers
2026-03-19 21:29   ` Namhyung Kim
2026-03-20 22:07     ` Stephen Brennan
2026-03-20 22:04   ` Stephen Brennan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox