linux-perf-users.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term
@ 2024-07-18  0:30 Ian Rogers
  2024-07-18  0:30 ` [PATCH v2 1/6] perf pmu: Merge boolean sysfs event option parsing Ian Rogers
                   ` (6 more replies)
  0 siblings, 7 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-18  0:30 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, Kan Liang, Bjorn Helgaas,
	Jonathan Corbet, James Clark, Ravi Bangoria, Dominique Martinet,
	linux-kernel, linux-perf-users, Dhananjay Ugwekar, ananth.narayan,
	gautham.shenoy, kprateek.nayak, sandipan.das

The need for a sysfs event.cpus file is discussed here:
https://lore.kernel.org/lkml/CAP-5=fXXuWchzUK0n5KTH8kamr=DQoEni+bUoo8f-4j8Y+eMBg@mail.gmail.com/
following Dhananjay Ugwekar's work on the RAPL /sys/devices/power PMU.
These changes add support for the event.cpus file in sysfs and also a
cpu event term allowing events to have differing CPUs. This was added
in order to test the parsing and map propagation for the sysfs case.

v2: Add support for multiple cpu terms on an event that are
    merged. For example, an event of "l1d-misses/cpu=4,cpu=5/" will
    now be opened on both CPU 4 and 5 rather than just CPU 4.

Ian Rogers (6):
  perf pmu: Merge boolean sysfs event option parsing
  perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event
  perf pmu: Add support for event.cpus files in sysfs
  libperf cpumap: Add ability to create CPU from a single CPU number
  perf parse-events: Set is_pmu_core for legacy hardware events
  perf parse-events: Add "cpu" term to set the CPU an event is recorded
    on

 .../sysfs-bus-event_source-devices-events     |  14 ++
 tools/lib/perf/cpumap.c                       |  10 ++
 tools/lib/perf/include/perf/cpumap.h          |   2 +
 tools/perf/Documentation/perf-list.txt        |   9 +
 tools/perf/util/evsel_config.h                |   1 +
 tools/perf/util/parse-events.c                | 162 ++++++++++++------
 tools/perf/util/parse-events.h                |   3 +-
 tools/perf/util/parse-events.l                |   1 +
 tools/perf/util/pmu.c                         |  92 +++++++---
 tools/perf/util/pmu.h                         |   1 +
 10 files changed, 221 insertions(+), 74 deletions(-)

-- 
2.45.2.1089.g2a221341d9-goog


^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v2 1/6] perf pmu: Merge boolean sysfs event option parsing
  2024-07-18  0:30 [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Ian Rogers
@ 2024-07-18  0:30 ` Ian Rogers
  2024-07-18  0:30 ` [PATCH v2 2/6] perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event Ian Rogers
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-18  0:30 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, Kan Liang, Bjorn Helgaas,
	Jonathan Corbet, James Clark, Ravi Bangoria, Dominique Martinet,
	linux-kernel, linux-perf-users, Dhananjay Ugwekar, ananth.narayan,
	gautham.shenoy, kprateek.nayak, sandipan.das

Merge perf_pmu__parse_per_pkg and perf_pmu__parse_snapshot that do the
same parsing except for the file suffix used.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/util/pmu.c | 47 +++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 986166bc7c78..5148b6639dd3 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -367,8 +367,8 @@ static int perf_pmu__parse_unit(struct perf_pmu *pmu, struct perf_pmu_alias *ali
 	return -1;
 }
 
-static int
-perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
+static bool perf_pmu__parse_event_source_bool(const char *pmu_name, const char *event_name,
+					      const char *suffix)
 {
 	char path[PATH_MAX];
 	size_t len;
@@ -376,37 +376,36 @@ perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
 
 	len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path));
 	if (!len)
-		return 0;
-	scnprintf(path + len, sizeof(path) - len, "%s/events/%s.per-pkg", pmu->name, alias->name);
+		return false;
+
+	scnprintf(path + len, sizeof(path) - len, "%s/events/%s.%s", pmu_name, event_name, suffix);
 
 	fd = open(path, O_RDONLY);
 	if (fd == -1)
-		return -1;
+		return false;
 
-	close(fd);
+#ifndef NDEBUG
+	{
+		char buf[8];
 
-	alias->per_pkg = true;
-	return 0;
+		len = read(fd, buf, sizeof(buf));
+		assert(len == 1 || len == 2);
+		assert(buf[0] == '1');
+	}
+#endif
+
+	close(fd);
+	return true;
 }
 
-static int perf_pmu__parse_snapshot(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
+static void perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
 {
-	char path[PATH_MAX];
-	size_t len;
-	int fd;
-
-	len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path));
-	if (!len)
-		return 0;
-	scnprintf(path + len, sizeof(path) - len, "%s/events/%s.snapshot", pmu->name, alias->name);
-
-	fd = open(path, O_RDONLY);
-	if (fd == -1)
-		return -1;
+	alias->per_pkg = perf_pmu__parse_event_source_bool(pmu->name, alias->name, "per-pkg");
+}
 
-	alias->snapshot = true;
-	close(fd);
-	return 0;
+static void perf_pmu__parse_snapshot(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
+{
+	alias->snapshot = perf_pmu__parse_event_source_bool(pmu->name, alias->name, "snapshot");
 }
 
 /* Delete an alias entry. */
-- 
2.45.2.1089.g2a221341d9-goog


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 2/6] perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event
  2024-07-18  0:30 [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Ian Rogers
  2024-07-18  0:30 ` [PATCH v2 1/6] perf pmu: Merge boolean sysfs event option parsing Ian Rogers
@ 2024-07-18  0:30 ` Ian Rogers
  2024-07-18  0:30 ` [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs Ian Rogers
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-18  0:30 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, Kan Liang, Bjorn Helgaas,
	Jonathan Corbet, James Clark, Ravi Bangoria, Dominique Martinet,
	linux-kernel, linux-perf-users, Dhananjay Ugwekar, ananth.narayan,
	gautham.shenoy, kprateek.nayak, sandipan.das

Previously the cpu_list was a string and typically no cpu_list was
passed to __add_event. Wanting to make events have their cpus distinct
from the PMU means that on more occasions we want to pass a
cpu_list. If we're reading this from sysfs it is easier to read a
perf_cpu_map than allocate and pass around strings that will later be
parsed.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/util/parse-events.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 321586fb5556..43501eb56336 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -227,12 +227,12 @@ __add_event(struct list_head *list, int *idx,
 	    bool init_attr,
 	    const char *name, const char *metric_id, struct perf_pmu *pmu,
 	    struct list_head *config_terms, bool auto_merge_stats,
-	    const char *cpu_list)
+	    struct perf_cpu_map *cpu_list)
 {
 	struct evsel *evsel;
-	struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) :
-			       cpu_list ? perf_cpu_map__new(cpu_list) : NULL;
+	struct perf_cpu_map *cpus = perf_cpu_map__is_empty(cpu_list) && pmu ? pmu->cpus : cpu_list;
 
+	cpus = perf_cpu_map__get(cpus);
 	if (pmu)
 		perf_pmu__warn_invalid_formats(pmu);
 
@@ -305,16 +305,17 @@ static int add_event_tool(struct list_head *list, int *idx,
 		.type = PERF_TYPE_SOFTWARE,
 		.config = PERF_COUNT_SW_DUMMY,
 	};
-	const char *cpu_list = NULL;
+	struct perf_cpu_map *cpu_list = NULL;
 
 	if (tool_event == PERF_TOOL_DURATION_TIME) {
 		/* Duration time is gathered globally, pretend it is only on CPU0. */
-		cpu_list = "0";
+		cpu_list = perf_cpu_map__new("0");
 	}
 	evsel = __add_event(list, idx, &attr, /*init_attr=*/true, /*name=*/NULL,
 			    /*metric_id=*/NULL, /*pmu=*/NULL,
 			    /*config_terms=*/NULL, /*auto_merge_stats=*/false,
 			    cpu_list);
+	perf_cpu_map__put(cpu_list);
 	if (!evsel)
 		return -ENOMEM;
 	evsel->tool_event = tool_event;
-- 
2.45.2.1089.g2a221341d9-goog


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-18  0:30 [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Ian Rogers
  2024-07-18  0:30 ` [PATCH v2 1/6] perf pmu: Merge boolean sysfs event option parsing Ian Rogers
  2024-07-18  0:30 ` [PATCH v2 2/6] perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event Ian Rogers
@ 2024-07-18  0:30 ` Ian Rogers
  2024-07-18 14:33   ` Liang, Kan
  2024-07-18  0:30 ` [PATCH v2 4/6] libperf cpumap: Add ability to create CPU from a single CPU number Ian Rogers
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-18  0:30 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, Kan Liang, Bjorn Helgaas,
	Jonathan Corbet, James Clark, Ravi Bangoria, Dominique Martinet,
	linux-kernel, linux-perf-users, Dhananjay Ugwekar, ananth.narayan,
	gautham.shenoy, kprateek.nayak, sandipan.das

If an event file exists in sysfs, check if an event.cpus file exists
and read a perf_cpu_map from it if it does. This allows particular
events to have a different set of CPUs compared to the PMU.

One scenario where this could be useful is when a PMU is set up with a
cpumask/events per SMT thread but some events record for all SMT
threads. Programming an event on each SMT thread will cause
unnecessary counters to be programmed and the aggregate value to be
too large.

Another scenario where this could be useful is when a PMU has
historically had a cpumask at the package level, but now newer per
die, core or CPU information is available.

Additional context for the motivation is in these patches and
conversation:
https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/

Signed-off-by: Ian Rogers <irogers@google.com>
---
 .../sysfs-bus-event_source-devices-events     | 14 ++++++
 tools/perf/util/parse-events.c                | 45 ++++++++++---------
 tools/perf/util/pmu.c                         | 44 +++++++++++++++++-
 tools/perf/util/pmu.h                         |  1 +
 4 files changed, 82 insertions(+), 22 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
index e7efeab2ee83..d8e3a4dd3ba7 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
@@ -70,6 +70,20 @@ Description:	Per-pmu performance monitoring events specific to the running syste
 		This is referred to as "event parameterization". Event
 		parameters have the format 'param=?'.
 
+What: /sys/bus/event_source/devices/<pmu>/events/<event>.cpus
+Date: 2024/07/17
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Perf event CPUs
+
+		A list of CPUs on which the perf event should be
+		opened by default. This is an event specific variant
+		of /sys/bus/event_source/devices/<pmu>/cpumask.
+
+		Examples (each of these lines would be in a separate file):
+
+			0,2,4,6
+			0-7
+
 What: /sys/bus/event_source/devices/<pmu>/events/<event>.unit
 Date: 2014/02/24
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 43501eb56336..b181f83c9678 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1425,12 +1425,13 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 				bool auto_merge_stats)
 {
 	struct perf_event_attr attr;
-	struct perf_pmu_info info;
+	struct perf_pmu_info info = {};
 	struct evsel *evsel;
 	struct parse_events_error *err = parse_state->error;
 	LIST_HEAD(config_terms);
 	struct parse_events_terms parsed_terms;
 	bool alias_rewrote_terms = false;
+	int ret = 0;
 
 	if (verbose > 1) {
 		struct strbuf sb;
@@ -1465,8 +1466,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 
 	parse_events_terms__init(&parsed_terms);
 	if (const_parsed_terms) {
-		int ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms);
-
+		ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms);
 		if (ret)
 			return ret;
 	}
@@ -1474,15 +1474,15 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 
 	/* Configure attr/terms with a known PMU, this will set hardcoded terms. */
 	if (config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) {
-		parse_events_terms__exit(&parsed_terms);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out_err;
 	}
 
 	/* Look for event names in the terms and rewrite into format based terms. */
 	if (!parse_state->fake_pmu && perf_pmu__check_alias(pmu, &parsed_terms,
 							    &info, &alias_rewrote_terms, err)) {
-		parse_events_terms__exit(&parsed_terms);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out_err;
 	}
 
 	if (verbose > 1) {
@@ -1497,13 +1497,13 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 	/* Configure attr/terms again if an alias was expanded. */
 	if (alias_rewrote_terms &&
 	    config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) {
-		parse_events_terms__exit(&parsed_terms);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out_err;
 	}
 
 	if (get_config_terms(&parsed_terms, &config_terms)) {
-		parse_events_terms__exit(&parsed_terms);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_err;
 	}
 
 	/*
@@ -1512,24 +1512,23 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 	 */
 	if (pmu->perf_event_attr_init_default &&
 	    get_config_chgs(pmu, &parsed_terms, &config_terms)) {
-		parse_events_terms__exit(&parsed_terms);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_err;
 	}
 
 	if (!parse_state->fake_pmu &&
 	    perf_pmu__config(pmu, &attr, &parsed_terms, parse_state->error)) {
-		free_config_terms(&config_terms);
-		parse_events_terms__exit(&parsed_terms);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out_err;
 	}
 
 	evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true,
 			    get_config_name(&parsed_terms),
 			    get_config_metric_id(&parsed_terms), pmu,
-			    &config_terms, auto_merge_stats, /*cpu_list=*/NULL);
+			    &config_terms, auto_merge_stats, info.cpus);
 	if (!evsel) {
-		parse_events_terms__exit(&parsed_terms);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_err;
 	}
 
 	if (evsel->name)
@@ -1542,13 +1541,17 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 		return 0;
 	}
 
-	parse_events_terms__exit(&parsed_terms);
 	free((char *)evsel->unit);
 	evsel->unit = strdup(info.unit);
 	evsel->scale = info.scale;
 	evsel->per_pkg = info.per_pkg;
 	evsel->snapshot = info.snapshot;
-	return 0;
+out_err:
+	parse_events_terms__exit(&parsed_terms);
+	if (ret)
+		free_config_terms(&config_terms);
+	perf_cpu_map__put(info.cpus);
+	return ret;
 }
 
 int parse_events_multi_pmu_add(struct parse_events_state *parse_state,
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 5148b6639dd3..280b2499c861 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -73,6 +73,11 @@ struct perf_pmu_alias {
 	 * differ from the PMU name as it won't have suffixes.
 	 */
 	char *pmu_name;
+	/**
+	 * @cpus: A possible per-event cpumap that overrides that given for the
+	 * PMU.
+	 */
+	struct perf_cpu_map *cpus;
 	/** @unit: Units for the event, such as bytes or cache lines. */
 	char unit[UNIT_MAX_LEN+1];
 	/** @scale: Value to scale read counter values by. */
@@ -332,6 +337,32 @@ static int perf_pmu__parse_scale(struct perf_pmu *pmu, struct perf_pmu_alias *al
 	return ret;
 }
 
+static void perf_pmu__parse_cpus(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
+{
+	char path[PATH_MAX];
+	size_t len;
+	FILE *file;
+	int fd;
+
+	len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path));
+	if (!len)
+		return;
+	scnprintf(path + len, sizeof(path) - len, "%s/events/%s.cpus", pmu->name, alias->name);
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1)
+		return; /* Expected common case. */
+
+	file = fdopen(fd, "r");
+	if (!file) {
+		close(fd);
+		return;
+	}
+
+	alias->cpus = perf_cpu_map__read(file);
+	fclose(file);
+}
+
 static int perf_pmu__parse_unit(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
 {
 	char path[PATH_MAX];
@@ -493,6 +524,7 @@ static void read_alias_info(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
 	/*
 	 * load unit name and scale if available
 	 */
+	perf_pmu__parse_cpus(pmu, alias);
 	perf_pmu__parse_unit(pmu, alias);
 	perf_pmu__parse_scale(pmu, alias);
 	perf_pmu__parse_per_pkg(pmu, alias);
@@ -618,7 +650,7 @@ static inline bool pmu_alias_info_file(const char *name)
 	size_t len;
 
 	len = strlen(name);
-	if (len > 5 && !strcmp(name + len - 5, ".unit"))
+	if (len > 5 && (!strcmp(name + len - 5, ".cpus") || !strcmp(name + len - 5, ".unit")))
 		return true;
 	if (len > 6 && !strcmp(name + len - 6, ".scale"))
 		return true;
@@ -1560,6 +1592,12 @@ static int check_info_data(struct perf_pmu *pmu,
 	 * define unit, scale and snapshot, fail
 	 * if there's more than one.
 	 */
+	if (!perf_cpu_map__is_empty(info->cpus) && !perf_cpu_map__is_empty(alias->cpus)) {
+		parse_events_error__handle(err, column,
+					strdup("Attempt to set event's cpus twice"),
+					NULL);
+		return -EINVAL;
+	}
 	if (info->unit && alias->unit[0]) {
 		parse_events_error__handle(err, column,
 					strdup("Attempt to set event's unit twice"),
@@ -1579,6 +1617,9 @@ static int check_info_data(struct perf_pmu *pmu,
 		return -EINVAL;
 	}
 
+	if (!perf_cpu_map__is_empty(alias->cpus))
+		info->cpus = perf_cpu_map__get(alias->cpus);
+
 	if (alias->unit[0])
 		info->unit = alias->unit;
 
@@ -1610,6 +1651,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_
 	 * Mark unit and scale as not set
 	 * (different from default values, see below)
 	 */
+	info->cpus     = NULL;
 	info->unit     = NULL;
 	info->scale    = 0.0;
 	info->snapshot = false;
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index b2d3fd291f02..b1ccfe8d3df4 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -177,6 +177,7 @@ struct perf_pmu {
 extern struct perf_pmu perf_pmu__fake;
 
 struct perf_pmu_info {
+	struct perf_cpu_map *cpus;
 	const char *unit;
 	double scale;
 	bool per_pkg;
-- 
2.45.2.1089.g2a221341d9-goog


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 4/6] libperf cpumap: Add ability to create CPU from a single CPU number
  2024-07-18  0:30 [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Ian Rogers
                   ` (2 preceding siblings ...)
  2024-07-18  0:30 ` [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs Ian Rogers
@ 2024-07-18  0:30 ` Ian Rogers
  2024-07-18  0:30 ` [PATCH v2 5/6] perf parse-events: Set is_pmu_core for legacy hardware events Ian Rogers
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-18  0:30 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, Kan Liang, Bjorn Helgaas,
	Jonathan Corbet, James Clark, Ravi Bangoria, Dominique Martinet,
	linux-kernel, linux-perf-users, Dhananjay Ugwekar, ananth.narayan,
	gautham.shenoy, kprateek.nayak, sandipan.das

Add perf_cpu_map__new_int so that a CPU map can be created from a
single integer.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/lib/perf/cpumap.c              | 10 ++++++++++
 tools/lib/perf/include/perf/cpumap.h |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c
index cae799ad44e1..2c8e36d0efaa 100644
--- a/tools/lib/perf/cpumap.c
+++ b/tools/lib/perf/cpumap.c
@@ -293,6 +293,16 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
 	return cpus;
 }
 
+struct perf_cpu_map *perf_cpu_map__new_int(int cpu)
+{
+	struct perf_cpu_map *cpus = perf_cpu_map__alloc(1);
+
+	if (cpus)
+		RC_CHK_ACCESS(cpus)->map[0].cpu = cpu;
+
+	return cpus;
+}
+
 static int __perf_cpu_map__nr(const struct perf_cpu_map *cpus)
 {
 	return RC_CHK_ACCESS(cpus)->nr;
diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h
index 90457d17fb2f..79ed3449d288 100644
--- a/tools/lib/perf/include/perf/cpumap.h
+++ b/tools/lib/perf/include/perf/cpumap.h
@@ -37,6 +37,8 @@ LIBPERF_API struct perf_cpu_map *perf_cpu_map__new_online_cpus(void);
  *                     perf_cpu_map__new_online_cpus is returned.
  */
 LIBPERF_API struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list);
+/** perf_cpu_map__new_int - create a map with the one given cpu. */
+LIBPERF_API struct perf_cpu_map *perf_cpu_map__new_int(int cpu);
 LIBPERF_API struct perf_cpu_map *perf_cpu_map__read(FILE *file);
 LIBPERF_API struct perf_cpu_map *perf_cpu_map__get(struct perf_cpu_map *map);
 LIBPERF_API struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig,
-- 
2.45.2.1089.g2a221341d9-goog


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 5/6] perf parse-events: Set is_pmu_core for legacy hardware events
  2024-07-18  0:30 [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Ian Rogers
                   ` (3 preceding siblings ...)
  2024-07-18  0:30 ` [PATCH v2 4/6] libperf cpumap: Add ability to create CPU from a single CPU number Ian Rogers
@ 2024-07-18  0:30 ` Ian Rogers
  2024-07-18  0:30 ` [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on Ian Rogers
  2024-07-18 13:42 ` [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Dhananjay Ugwekar
  6 siblings, 0 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-18  0:30 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, Kan Liang, Bjorn Helgaas,
	Jonathan Corbet, James Clark, Ravi Bangoria, Dominique Martinet,
	linux-kernel, linux-perf-users, Dhananjay Ugwekar, ananth.narayan,
	gautham.shenoy, kprateek.nayak, sandipan.das

Also set the CPU map to all online CPUs. This is done so the
behavior of legacy hardware and hardware cache events better matches
that of sysfs and json events during __perf_evlist__propagate_maps.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/util/parse-events.c | 37 +++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index b181f83c9678..8c0c33361c5e 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -230,21 +230,30 @@ __add_event(struct list_head *list, int *idx,
 	    struct perf_cpu_map *cpu_list)
 {
 	struct evsel *evsel;
-	struct perf_cpu_map *cpus = perf_cpu_map__is_empty(cpu_list) && pmu ? pmu->cpus : cpu_list;
+	bool is_pmu_core;
+	struct perf_cpu_map *cpus;
 
-	cpus = perf_cpu_map__get(cpus);
-	if (pmu)
+	if (pmu) {
+		is_pmu_core = pmu->is_core;
+		cpus = perf_cpu_map__get(perf_cpu_map__is_empty(cpu_list) ? pmu->cpus : cpu_list);
 		perf_pmu__warn_invalid_formats(pmu);
-
-	if (pmu && (attr->type == PERF_TYPE_RAW || attr->type >= PERF_TYPE_MAX)) {
-		perf_pmu__warn_invalid_config(pmu, attr->config, name,
-					      PERF_PMU_FORMAT_VALUE_CONFIG, "config");
-		perf_pmu__warn_invalid_config(pmu, attr->config1, name,
-					      PERF_PMU_FORMAT_VALUE_CONFIG1, "config1");
-		perf_pmu__warn_invalid_config(pmu, attr->config2, name,
-					      PERF_PMU_FORMAT_VALUE_CONFIG2, "config2");
-		perf_pmu__warn_invalid_config(pmu, attr->config3, name,
-					      PERF_PMU_FORMAT_VALUE_CONFIG3, "config3");
+		if (attr->type == PERF_TYPE_RAW || attr->type >= PERF_TYPE_MAX) {
+			perf_pmu__warn_invalid_config(pmu, attr->config, name,
+						PERF_PMU_FORMAT_VALUE_CONFIG, "config");
+			perf_pmu__warn_invalid_config(pmu, attr->config1, name,
+						PERF_PMU_FORMAT_VALUE_CONFIG1, "config1");
+			perf_pmu__warn_invalid_config(pmu, attr->config2, name,
+						PERF_PMU_FORMAT_VALUE_CONFIG2, "config2");
+			perf_pmu__warn_invalid_config(pmu, attr->config3, name,
+						PERF_PMU_FORMAT_VALUE_CONFIG3, "config3");
+		}
+	} else {
+		is_pmu_core = (attr->type == PERF_TYPE_HARDWARE ||
+			       attr->type == PERF_TYPE_HW_CACHE);
+		if (perf_cpu_map__is_empty(cpu_list))
+			cpus = is_pmu_core ? perf_cpu_map__new_online_cpus() : NULL;
+		else
+			cpus = perf_cpu_map__get(cpu_list);
 	}
 	if (init_attr)
 		event_attr_init(attr);
@@ -259,7 +268,7 @@ __add_event(struct list_head *list, int *idx,
 	evsel->core.cpus = cpus;
 	evsel->core.own_cpus = perf_cpu_map__get(cpus);
 	evsel->core.requires_cpu = pmu ? pmu->is_uncore : false;
-	evsel->core.is_pmu_core = pmu ? pmu->is_core : false;
+	evsel->core.is_pmu_core = is_pmu_core;
 	evsel->auto_merge_stats = auto_merge_stats;
 	evsel->pmu = pmu;
 	evsel->pmu_name = pmu ? strdup(pmu->name) : NULL;
-- 
2.45.2.1089.g2a221341d9-goog


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-18  0:30 [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Ian Rogers
                   ` (4 preceding siblings ...)
  2024-07-18  0:30 ` [PATCH v2 5/6] perf parse-events: Set is_pmu_core for legacy hardware events Ian Rogers
@ 2024-07-18  0:30 ` Ian Rogers
  2024-07-18 14:09   ` James Clark
  2024-07-18 14:41   ` Liang, Kan
  2024-07-18 13:42 ` [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Dhananjay Ugwekar
  6 siblings, 2 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-18  0:30 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, Kan Liang, Bjorn Helgaas,
	Jonathan Corbet, James Clark, Ravi Bangoria, Dominique Martinet,
	linux-kernel, linux-perf-users, Dhananjay Ugwekar, ananth.narayan,
	gautham.shenoy, kprateek.nayak, sandipan.das

The -C option allows the CPUs for a list of events to be specified but
it's not possible to set the CPU for a single event. Add a term to
allow this. The term isn't a general CPU list due to ',' already being
a special character in event parsing; instead, multiple cpu= terms may
be provided and they will be merged/unioned together.

An example of mixing different types of events counted on different CPUs:
```
$ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1

 Performance counter stats for 'system wide':

CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
CPU4        <not counted>      instructions/cpu=0/
CPU5        <not counted>      instructions/cpu=0/
CPU8        <not counted>      instructions/cpu=0/
CPU0        <not counted>      l1d-misses [cpu]
CPU4              203,377      l1d-misses [cpu]
CPU5              138,231      l1d-misses [cpu]
CPU8        <not counted>      l1d-misses [cpu]
CPU0        <not counted>      cpu/cpu=8/
CPU4        <not counted>      cpu/cpu=8/
CPU5        <not counted>      cpu/cpu=8/
CPU8              943,861      cpu/cpu=8/
CPU0            1,412,071      cycles
CPU4           20,362,900      cycles
CPU5           10,172,725      cycles
CPU8            2,406,081      cycles

       0.102925309 seconds time elapsed
```

Note, the event name of inst_retired.any is missing, reported as
cpu/cpu=8/, as there are unmerged uniquify fixes:
https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/

An example of spreading uncore overhead across two CPUs:
```
$ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1

 Performance counter stats for 'system wide':

CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
```

Manually fixing the output it should be:
```
CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
```

That is data_read from 2 PMUs was counted on CPU0 and data_write was
counted on CPU1.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/Documentation/perf-list.txt |  9 ++++
 tools/perf/util/evsel_config.h         |  1 +
 tools/perf/util/parse-events.c         | 73 ++++++++++++++++++++++----
 tools/perf/util/parse-events.h         |  3 +-
 tools/perf/util/parse-events.l         |  1 +
 tools/perf/util/pmu.c                  |  1 +
 6 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 6bf2468f59d3..15511afe94a1 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -273,6 +273,15 @@ Sums up the event counts for all hardware threads in a core, e.g.:
 
   perf stat -e cpu/event=0,umask=0x3,percore=1/
 
+cpu:
+
+Specifies the CPU to open the event upon. The value may be repeated to
+specify opening the event on multiple CPUs:
+
+
+  perf stat -e instructions/cpu=0,cpu=2/,cycles/cpu=1,cpu=2/ -a sleep 1
+  perf stat -e data_read/cpu=0/,data_write/cpu=1/ -a sleep 1
+
 
 EVENT GROUPS
 ------------
diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
index aee6f808b512..9630c4a24721 100644
--- a/tools/perf/util/evsel_config.h
+++ b/tools/perf/util/evsel_config.h
@@ -47,6 +47,7 @@ struct evsel_config_term {
 		u32	      aux_sample_size;
 		u64	      cfg_chg;
 		char	      *str;
+		int	      cpu;
 	} val;
 	bool weak;
 };
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 8c0c33361c5e..85faef85b8de 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -7,6 +7,7 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/param.h>
+#include "cpumap.h"
 #include "term.h"
 #include "evlist.h"
 #include "evsel.h"
@@ -177,6 +178,26 @@ static char *get_config_name(const struct parse_events_terms *head_terms)
 	return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_NAME);
 }
 
+static struct perf_cpu_map *get_config_cpu(const struct parse_events_terms *head_terms)
+{
+	struct parse_events_term *term;
+	struct perf_cpu_map *cpus = NULL;
+
+	if (!head_terms)
+		return NULL;
+
+	list_for_each_entry(term, &head_terms->terms, list) {
+		if (term->type_term == PARSE_EVENTS__TERM_TYPE_CPU) {
+			struct perf_cpu_map *cpu = perf_cpu_map__new_int(term->val.num);
+
+			cpus = perf_cpu_map__merge(cpus, cpu);
+			perf_cpu_map__put(cpu);
+		}
+	}
+
+	return cpus;
+}
+
 /**
  * fix_raw - For each raw term see if there is an event (aka alias) in pmu that
  *           matches the raw's string value. If the string value matches an
@@ -468,11 +489,12 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
 	bool found_supported = false;
 	const char *config_name = get_config_name(parsed_terms);
 	const char *metric_id = get_config_metric_id(parsed_terms);
+	struct perf_cpu_map *cpus = get_config_cpu(parsed_terms);
+	int ret = 0;
 
 	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
 		LIST_HEAD(config_terms);
 		struct perf_event_attr attr;
-		int ret;
 
 		if (parse_events__filter_pmu(parse_state, pmu))
 			continue;
@@ -486,7 +508,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
 						   parsed_terms,
 						   perf_pmu__auto_merge_stats(pmu));
 			if (ret)
-				return ret;
+				goto out_err;
 			continue;
 		}
 
@@ -506,20 +528,27 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
 
 		if (parsed_terms) {
 			if (config_attr(&attr, parsed_terms, parse_state->error,
-					config_term_common))
-				return -EINVAL;
-
-			if (get_config_terms(parsed_terms, &config_terms))
-				return -ENOMEM;
+					config_term_common)) {
+				ret = -EINVAL;
+				goto out_err;
+			}
+			if (get_config_terms(parsed_terms, &config_terms)) {
+				ret = -ENOMEM;
+				goto out_err;
+			}
 		}
 
 		if (__add_event(list, idx, &attr, /*init_attr*/true, config_name ?: name,
 				metric_id, pmu, &config_terms, /*auto_merge_stats=*/false,
-				/*cpu_list=*/NULL) == NULL)
-			return -ENOMEM;
+				cpus) == NULL)
+			ret = -ENOMEM;
 
 		free_config_terms(&config_terms);
+		if (ret)
+			goto out_err;
 	}
+out_err:
+	perf_cpu_map__put(cpus);
 	return found_supported ? 0 : -EINVAL;
 }
 
@@ -814,6 +843,7 @@ static const char *config_term_name(enum parse_events__term_type term_type)
 		[PARSE_EVENTS__TERM_TYPE_RAW]                   = "raw",
 		[PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE]          = "legacy-cache",
 		[PARSE_EVENTS__TERM_TYPE_HARDWARE]              = "hardware",
+		[PARSE_EVENTS__TERM_TYPE_CPU]			= "cpu",
 	};
 	if ((unsigned int)term_type >= __PARSE_EVENTS__TERM_TYPE_NR)
 		return "unknown term";
@@ -843,6 +873,7 @@ config_term_avail(enum parse_events__term_type term_type, struct parse_events_er
 	case PARSE_EVENTS__TERM_TYPE_METRIC_ID:
 	case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
 	case PARSE_EVENTS__TERM_TYPE_PERCORE:
+	case PARSE_EVENTS__TERM_TYPE_CPU:
 		return true;
 	case PARSE_EVENTS__TERM_TYPE_USER:
 	case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
@@ -986,6 +1017,15 @@ do {									   \
 			return -EINVAL;
 		}
 		break;
+	case PARSE_EVENTS__TERM_TYPE_CPU:
+		CHECK_TYPE_VAL(NUM);
+		if (term->val.num >= (u64)cpu__max_present_cpu().cpu) {
+			parse_events_error__handle(err, term->err_val,
+						strdup("too big"),
+						NULL);
+			return -EINVAL;
+		}
+		break;
 	case PARSE_EVENTS__TERM_TYPE_DRV_CFG:
 	case PARSE_EVENTS__TERM_TYPE_USER:
 	case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
@@ -1112,6 +1152,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr,
 	case PARSE_EVENTS__TERM_TYPE_RAW:
 	case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
 	case PARSE_EVENTS__TERM_TYPE_HARDWARE:
+	case PARSE_EVENTS__TERM_TYPE_CPU:
 	default:
 		if (err) {
 			parse_events_error__handle(err, term->err_term,
@@ -1243,6 +1284,7 @@ do {								\
 		case PARSE_EVENTS__TERM_TYPE_RAW:
 		case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
 		case PARSE_EVENTS__TERM_TYPE_HARDWARE:
+		case PARSE_EVENTS__TERM_TYPE_CPU:
 		default:
 			break;
 		}
@@ -1296,6 +1338,7 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head
 		case PARSE_EVENTS__TERM_TYPE_RAW:
 		case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
 		case PARSE_EVENTS__TERM_TYPE_HARDWARE:
+		case PARSE_EVENTS__TERM_TYPE_CPU:
 		default:
 			break;
 		}
@@ -1350,6 +1393,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state,
 	struct perf_event_attr attr;
 	LIST_HEAD(config_terms);
 	const char *name, *metric_id;
+	struct perf_cpu_map *cpus;
 	int ret;
 
 	memset(&attr, 0, sizeof(attr));
@@ -1371,9 +1415,11 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state,
 
 	name = get_config_name(head_config);
 	metric_id = get_config_metric_id(head_config);
+	cpus = get_config_cpu(head_config);
 	ret = __add_event(list, &parse_state->idx, &attr, /*init_attr*/true, name,
 			metric_id, pmu, &config_terms, /*auto_merge_stats=*/false,
-			/*cpu_list=*/NULL) ? 0 : -ENOMEM;
+			cpus) ? 0 : -ENOMEM;
+	perf_cpu_map__put(cpus);
 	free_config_terms(&config_terms);
 	return ret;
 }
@@ -1440,6 +1486,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 	LIST_HEAD(config_terms);
 	struct parse_events_terms parsed_terms;
 	bool alias_rewrote_terms = false;
+	struct perf_cpu_map *term_cpu = NULL;
 	int ret = 0;
 
 	if (verbose > 1) {
@@ -1531,6 +1578,12 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 		goto out_err;
 	}
 
+	term_cpu = get_config_cpu(&parsed_terms);
+	if (!perf_cpu_map__is_empty(term_cpu)) {
+		perf_cpu_map__put(info.cpus);
+		info.cpus = term_cpu;
+		term_cpu = NULL;
+	}
 	evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true,
 			    get_config_name(&parsed_terms),
 			    get_config_metric_id(&parsed_terms), pmu,
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index e13de2c8b706..b03857499030 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -79,7 +79,8 @@ enum parse_events__term_type {
 	PARSE_EVENTS__TERM_TYPE_RAW,
 	PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE,
 	PARSE_EVENTS__TERM_TYPE_HARDWARE,
-#define	__PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_HARDWARE + 1)
+	PARSE_EVENTS__TERM_TYPE_CPU,
+#define	__PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_CPU + 1)
 };
 
 struct parse_events_term {
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 16045c383ada..e06097a62796 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -330,6 +330,7 @@ percore			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); }
 aux-output		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); }
 aux-sample-size		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); }
 metric-id		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); }
+cpu			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CPU); }
 cpu-cycles|cycles				{ return hw_term(yyscanner, PERF_COUNT_HW_CPU_CYCLES); }
 stalled-cycles-frontend|idle-cycles-frontend	{ return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); }
 stalled-cycles-backend|idle-cycles-backend	{ return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); }
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 280b2499c861..27e2ff23799e 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1767,6 +1767,7 @@ int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_call
 		"percore",
 		"aux-output",
 		"aux-sample-size=number",
+		"cpu=number",
 	};
 	struct perf_pmu_format *format;
 	int ret;
-- 
2.45.2.1089.g2a221341d9-goog


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term
  2024-07-18  0:30 [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Ian Rogers
                   ` (5 preceding siblings ...)
  2024-07-18  0:30 ` [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on Ian Rogers
@ 2024-07-18 13:42 ` Dhananjay Ugwekar
  2024-07-18 15:00   ` Ian Rogers
  6 siblings, 1 reply; 34+ messages in thread
From: Dhananjay Ugwekar @ 2024-07-18 13:42 UTC (permalink / raw)
  To: Ian Rogers, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Kan Liang, Bjorn Helgaas, Jonathan Corbet,
	James Clark, Ravi Bangoria, Dominique Martinet, linux-kernel,
	linux-perf-users, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

Hello Ian,

On 7/18/2024 6:00 AM, Ian Rogers wrote:
> The need for a sysfs event.cpus file is discussed here:
> https://lore.kernel.org/lkml/CAP-5=fXXuWchzUK0n5KTH8kamr=DQoEni+bUoo8f-4j8Y+eMBg@mail.gmail.com/
> following Dhananjay Ugwekar's work on the RAPL /sys/devices/power PMU.
> These changes add support for the event.cpus file in sysfs and also a
> cpu event term allowing events to have differing CPUs. This was added
> in order to test the parsing and map propagation for the sysfs case.

Tried applying the patchset on top of v6.10, torvalds/master and tip/master, 
it is failing at patch#2, what is the base commit/branch for this patchset?

Thanks,
Dhananjay

> 
> v2: Add support for multiple cpu terms on an event that are
>     merged. For example, an event of "l1d-misses/cpu=4,cpu=5/" will
>     now be opened on both CPU 4 and 5 rather than just CPU 4.
> 
> Ian Rogers (6):
>   perf pmu: Merge boolean sysfs event option parsing
>   perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event
>   perf pmu: Add support for event.cpus files in sysfs
>   libperf cpumap: Add ability to create CPU from a single CPU number
>   perf parse-events: Set is_pmu_core for legacy hardware events
>   perf parse-events: Add "cpu" term to set the CPU an event is recorded
>     on
> 
>  .../sysfs-bus-event_source-devices-events     |  14 ++
>  tools/lib/perf/cpumap.c                       |  10 ++
>  tools/lib/perf/include/perf/cpumap.h          |   2 +
>  tools/perf/Documentation/perf-list.txt        |   9 +
>  tools/perf/util/evsel_config.h                |   1 +
>  tools/perf/util/parse-events.c                | 162 ++++++++++++------
>  tools/perf/util/parse-events.h                |   3 +-
>  tools/perf/util/parse-events.l                |   1 +
>  tools/perf/util/pmu.c                         |  92 +++++++---
>  tools/perf/util/pmu.h                         |   1 +
>  10 files changed, 221 insertions(+), 74 deletions(-)
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-18  0:30 ` [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on Ian Rogers
@ 2024-07-18 14:09   ` James Clark
  2024-07-18 15:07     ` Ian Rogers
  2024-07-18 14:41   ` Liang, Kan
  1 sibling, 1 reply; 34+ messages in thread
From: James Clark @ 2024-07-18 14:09 UTC (permalink / raw)
  To: Ian Rogers, Dhananjay Ugwekar
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Kan Liang, Bjorn Helgaas, Jonathan Corbet,
	James Clark, Ravi Bangoria, Dominique Martinet, linux-kernel,
	linux-perf-users, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 18/07/2024 1:30 am, Ian Rogers wrote:
> The -C option allows the CPUs for a list of events to be specified but
> it's not possible to set the CPU for a single event. Add a term to
> allow this. The term isn't a general CPU list due to ',' already being
> a special character in event parsing; instead, multiple cpu= terms may
> be provided and they will be merged/unioned together.
> 
> An example of mixing different types of events counted on different CPUs:
> ```
> $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
> 

I ran into cpu=N having no effect unless -C is also used. For example:

  $ perf stat -vv -e branch-misses/cpu=1/ -- true

  sys_perf_event_open: pid 10233  cpu -1  group_fd -1  flags 0x8 = 3

Vs:

  $ perf stat -C 0,1 -vv -e branch-misses/cpu=1/ -- true

  sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 3

Is it possible to have a warning or error in that case? Seems like it's 
quite likely to confuse a user.

Other than that it seems to work ok.

WRT the RAPL issue [1], I didn't quite follow how these two things will 
join up. Doesn't this only end up setting the cpu argument to 
perf_event_open(), which already aggregates per package rather than per 
core, even if the cpu is set? And the fix for that behavior was rejected 
because it would break Intel metrics [2].

Maybe I'm missing a piece.

[1]: 
https://lore.kernel.org/lkml/CAP-5=fXXuWchzUK0n5KTH8kamr=DQoEni+bUoo8f-4j8Y+eMBg@mail.gmail.com/
[2]: 
https://lore.kernel.org/lkml/3e766f0e-37d4-0f82-3868-31b14228868d@linux.intel.com/

James

>   Performance counter stats for 'system wide':
> 
> CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
> CPU4        <not counted>      instructions/cpu=0/
> CPU5        <not counted>      instructions/cpu=0/
> CPU8        <not counted>      instructions/cpu=0/
> CPU0        <not counted>      l1d-misses [cpu]
> CPU4              203,377      l1d-misses [cpu]
> CPU5              138,231      l1d-misses [cpu]
> CPU8        <not counted>      l1d-misses [cpu]
> CPU0        <not counted>      cpu/cpu=8/
> CPU4        <not counted>      cpu/cpu=8/
> CPU5        <not counted>      cpu/cpu=8/
> CPU8              943,861      cpu/cpu=8/
> CPU0            1,412,071      cycles
> CPU4           20,362,900      cycles
> CPU5           10,172,725      cycles
> CPU8            2,406,081      cycles
> 
>         0.102925309 seconds time elapsed
> ```
> 
> Note, the event name of inst_retired.any is missing, reported as
> cpu/cpu=8/, as there are unmerged uniquify fixes:
> https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
> 
> An example of spreading uncore overhead across two CPUs:
> ```
> $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
> 
>   Performance counter stats for 'system wide':
> 
> CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
> CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
> CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
> CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
> CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
> CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
> ```
> 
> Manually fixing the output it should be:
> ```
> CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
> CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
> CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
> CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
> ```
> 
> That is data_read from 2 PMUs was counted on CPU0 and data_write was
> counted on CPU1.
> 
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>   tools/perf/Documentation/perf-list.txt |  9 ++++
>   tools/perf/util/evsel_config.h         |  1 +
>   tools/perf/util/parse-events.c         | 73 ++++++++++++++++++++++----
>   tools/perf/util/parse-events.h         |  3 +-
>   tools/perf/util/parse-events.l         |  1 +
>   tools/perf/util/pmu.c                  |  1 +
>   6 files changed, 77 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
> index 6bf2468f59d3..15511afe94a1 100644
> --- a/tools/perf/Documentation/perf-list.txt
> +++ b/tools/perf/Documentation/perf-list.txt
> @@ -273,6 +273,15 @@ Sums up the event counts for all hardware threads in a core, e.g.:
>   
>     perf stat -e cpu/event=0,umask=0x3,percore=1/
>   
> +cpu:
> +
> +Specifies the CPU to open the event upon. The value may be repeated to
> +specify opening the event on multiple CPUs:
> +
> +
> +  perf stat -e instructions/cpu=0,cpu=2/,cycles/cpu=1,cpu=2/ -a sleep 1
> +  perf stat -e data_read/cpu=0/,data_write/cpu=1/ -a sleep 1
> +
>   
>   EVENT GROUPS
>   ------------
> diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
> index aee6f808b512..9630c4a24721 100644
> --- a/tools/perf/util/evsel_config.h
> +++ b/tools/perf/util/evsel_config.h
> @@ -47,6 +47,7 @@ struct evsel_config_term {
>   		u32	      aux_sample_size;
>   		u64	      cfg_chg;
>   		char	      *str;
> +		int	      cpu;
>   	} val;
>   	bool weak;
>   };
> diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> index 8c0c33361c5e..85faef85b8de 100644
> --- a/tools/perf/util/parse-events.c
> +++ b/tools/perf/util/parse-events.c
> @@ -7,6 +7,7 @@
>   #include <errno.h>
>   #include <sys/ioctl.h>
>   #include <sys/param.h>
> +#include "cpumap.h"
>   #include "term.h"
>   #include "evlist.h"
>   #include "evsel.h"
> @@ -177,6 +178,26 @@ static char *get_config_name(const struct parse_events_terms *head_terms)
>   	return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_NAME);
>   }
>   
> +static struct perf_cpu_map *get_config_cpu(const struct parse_events_terms *head_terms)
> +{
> +	struct parse_events_term *term;
> +	struct perf_cpu_map *cpus = NULL;
> +
> +	if (!head_terms)
> +		return NULL;
> +
> +	list_for_each_entry(term, &head_terms->terms, list) {
> +		if (term->type_term == PARSE_EVENTS__TERM_TYPE_CPU) {
> +			struct perf_cpu_map *cpu = perf_cpu_map__new_int(term->val.num);
> +
> +			cpus = perf_cpu_map__merge(cpus, cpu);
> +			perf_cpu_map__put(cpu);
> +		}
> +	}
> +
> +	return cpus;
> +}
> +
>   /**
>    * fix_raw - For each raw term see if there is an event (aka alias) in pmu that
>    *           matches the raw's string value. If the string value matches an
> @@ -468,11 +489,12 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
>   	bool found_supported = false;
>   	const char *config_name = get_config_name(parsed_terms);
>   	const char *metric_id = get_config_metric_id(parsed_terms);
> +	struct perf_cpu_map *cpus = get_config_cpu(parsed_terms);
> +	int ret = 0;
>   
>   	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
>   		LIST_HEAD(config_terms);
>   		struct perf_event_attr attr;
> -		int ret;
>   
>   		if (parse_events__filter_pmu(parse_state, pmu))
>   			continue;
> @@ -486,7 +508,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
>   						   parsed_terms,
>   						   perf_pmu__auto_merge_stats(pmu));
>   			if (ret)
> -				return ret;
> +				goto out_err;
>   			continue;
>   		}
>   
> @@ -506,20 +528,27 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
>   
>   		if (parsed_terms) {
>   			if (config_attr(&attr, parsed_terms, parse_state->error,
> -					config_term_common))
> -				return -EINVAL;
> -
> -			if (get_config_terms(parsed_terms, &config_terms))
> -				return -ENOMEM;
> +					config_term_common)) {
> +				ret = -EINVAL;
> +				goto out_err;
> +			}
> +			if (get_config_terms(parsed_terms, &config_terms)) {
> +				ret = -ENOMEM;
> +				goto out_err;
> +			}
>   		}
>   
>   		if (__add_event(list, idx, &attr, /*init_attr*/true, config_name ?: name,
>   				metric_id, pmu, &config_terms, /*auto_merge_stats=*/false,
> -				/*cpu_list=*/NULL) == NULL)
> -			return -ENOMEM;
> +				cpus) == NULL)
> +			ret = -ENOMEM;
>   
>   		free_config_terms(&config_terms);
> +		if (ret)
> +			goto out_err;
>   	}
> +out_err:
> +	perf_cpu_map__put(cpus);
>   	return found_supported ? 0 : -EINVAL;
>   }
>   
> @@ -814,6 +843,7 @@ static const char *config_term_name(enum parse_events__term_type term_type)
>   		[PARSE_EVENTS__TERM_TYPE_RAW]                   = "raw",
>   		[PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE]          = "legacy-cache",
>   		[PARSE_EVENTS__TERM_TYPE_HARDWARE]              = "hardware",
> +		[PARSE_EVENTS__TERM_TYPE_CPU]			= "cpu",
>   	};
>   	if ((unsigned int)term_type >= __PARSE_EVENTS__TERM_TYPE_NR)
>   		return "unknown term";
> @@ -843,6 +873,7 @@ config_term_avail(enum parse_events__term_type term_type, struct parse_events_er
>   	case PARSE_EVENTS__TERM_TYPE_METRIC_ID:
>   	case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
>   	case PARSE_EVENTS__TERM_TYPE_PERCORE:
> +	case PARSE_EVENTS__TERM_TYPE_CPU:
>   		return true;
>   	case PARSE_EVENTS__TERM_TYPE_USER:
>   	case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
> @@ -986,6 +1017,15 @@ do {									   \
>   			return -EINVAL;
>   		}
>   		break;
> +	case PARSE_EVENTS__TERM_TYPE_CPU:
> +		CHECK_TYPE_VAL(NUM);
> +		if (term->val.num >= (u64)cpu__max_present_cpu().cpu) {
> +			parse_events_error__handle(err, term->err_val,
> +						strdup("too big"),
> +						NULL);
> +			return -EINVAL;
> +		}
> +		break;
>   	case PARSE_EVENTS__TERM_TYPE_DRV_CFG:
>   	case PARSE_EVENTS__TERM_TYPE_USER:
>   	case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
> @@ -1112,6 +1152,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr,
>   	case PARSE_EVENTS__TERM_TYPE_RAW:
>   	case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
>   	case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> +	case PARSE_EVENTS__TERM_TYPE_CPU:
>   	default:
>   		if (err) {
>   			parse_events_error__handle(err, term->err_term,
> @@ -1243,6 +1284,7 @@ do {								\
>   		case PARSE_EVENTS__TERM_TYPE_RAW:
>   		case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
>   		case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> +		case PARSE_EVENTS__TERM_TYPE_CPU:
>   		default:
>   			break;
>   		}
> @@ -1296,6 +1338,7 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head
>   		case PARSE_EVENTS__TERM_TYPE_RAW:
>   		case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
>   		case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> +		case PARSE_EVENTS__TERM_TYPE_CPU:
>   		default:
>   			break;
>   		}
> @@ -1350,6 +1393,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state,
>   	struct perf_event_attr attr;
>   	LIST_HEAD(config_terms);
>   	const char *name, *metric_id;
> +	struct perf_cpu_map *cpus;
>   	int ret;
>   
>   	memset(&attr, 0, sizeof(attr));
> @@ -1371,9 +1415,11 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state,
>   
>   	name = get_config_name(head_config);
>   	metric_id = get_config_metric_id(head_config);
> +	cpus = get_config_cpu(head_config);
>   	ret = __add_event(list, &parse_state->idx, &attr, /*init_attr*/true, name,
>   			metric_id, pmu, &config_terms, /*auto_merge_stats=*/false,
> -			/*cpu_list=*/NULL) ? 0 : -ENOMEM;
> +			cpus) ? 0 : -ENOMEM;
> +	perf_cpu_map__put(cpus);
>   	free_config_terms(&config_terms);
>   	return ret;
>   }
> @@ -1440,6 +1486,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>   	LIST_HEAD(config_terms);
>   	struct parse_events_terms parsed_terms;
>   	bool alias_rewrote_terms = false;
> +	struct perf_cpu_map *term_cpu = NULL;
>   	int ret = 0;
>   
>   	if (verbose > 1) {
> @@ -1531,6 +1578,12 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>   		goto out_err;
>   	}
>   
> +	term_cpu = get_config_cpu(&parsed_terms);
> +	if (!perf_cpu_map__is_empty(term_cpu)) {
> +		perf_cpu_map__put(info.cpus);
> +		info.cpus = term_cpu;
> +		term_cpu = NULL;
> +	}
>   	evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true,
>   			    get_config_name(&parsed_terms),
>   			    get_config_metric_id(&parsed_terms), pmu,
> diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
> index e13de2c8b706..b03857499030 100644
> --- a/tools/perf/util/parse-events.h
> +++ b/tools/perf/util/parse-events.h
> @@ -79,7 +79,8 @@ enum parse_events__term_type {
>   	PARSE_EVENTS__TERM_TYPE_RAW,
>   	PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE,
>   	PARSE_EVENTS__TERM_TYPE_HARDWARE,
> -#define	__PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_HARDWARE + 1)
> +	PARSE_EVENTS__TERM_TYPE_CPU,
> +#define	__PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_CPU + 1)
>   };
>   
>   struct parse_events_term {
> diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
> index 16045c383ada..e06097a62796 100644
> --- a/tools/perf/util/parse-events.l
> +++ b/tools/perf/util/parse-events.l
> @@ -330,6 +330,7 @@ percore			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); }
>   aux-output		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); }
>   aux-sample-size		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); }
>   metric-id		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); }
> +cpu			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CPU); }
>   cpu-cycles|cycles				{ return hw_term(yyscanner, PERF_COUNT_HW_CPU_CYCLES); }
>   stalled-cycles-frontend|idle-cycles-frontend	{ return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); }
>   stalled-cycles-backend|idle-cycles-backend	{ return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); }
> diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
> index 280b2499c861..27e2ff23799e 100644
> --- a/tools/perf/util/pmu.c
> +++ b/tools/perf/util/pmu.c
> @@ -1767,6 +1767,7 @@ int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_call
>   		"percore",
>   		"aux-output",
>   		"aux-sample-size=number",
> +		"cpu=number",
>   	};
>   	struct perf_pmu_format *format;
>   	int ret;

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-18  0:30 ` [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs Ian Rogers
@ 2024-07-18 14:33   ` Liang, Kan
  2024-07-18 15:39     ` Ian Rogers
  0 siblings, 1 reply; 34+ messages in thread
From: Liang, Kan @ 2024-07-18 14:33 UTC (permalink / raw)
  To: Ian Rogers, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> If an event file exists in sysfs, check if an event.cpus file exists
> and read a perf_cpu_map from it if it does. This allows particular
> events to have a different set of CPUs compared to the PMU.
> 
> One scenario where this could be useful is when a PMU is set up with a
> cpumask/events per SMT thread but some events record for all SMT
> threads. Programming an event on each SMT thread will cause
> unnecessary counters to be programmed and the aggregate value to be
> too large.

If I understand the scenario correctly, I think the issue should have
been addressed since ICX, with the introduction of the "SMT-aware
events". Is there a specific event which still has the issue on newer
platforms?

> 
> Another scenario where this could be useful if when a PMU has
> historically had a cpumask at the package level, but now newer per
> die, core or CPU information is available.

The traditional way to support new counters with a different scope is to
add a new PMU.

> 
> Additional context for the motivation is in these patches and
> conversation:
> https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/

I don't see the advantages of the new event.cpus way.
The aggregation should be the same.
The RAPL counters are free-running counters. So there is no difference
whether grouping or not grouping.

But it makes the kernel driver complex, since it has to maintain at
least two different cpumasks.
The tool becomes complex too, since it has to take care of the
per-event CPU override case. The json file must also be updated to add a
new field cpumask.

Thanks,
Kan
> 
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  .../sysfs-bus-event_source-devices-events     | 14 ++++++
>  tools/perf/util/parse-events.c                | 45 ++++++++++---------
>  tools/perf/util/pmu.c                         | 44 +++++++++++++++++-
>  tools/perf/util/pmu.h                         |  1 +
>  4 files changed, 82 insertions(+), 22 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
> index e7efeab2ee83..d8e3a4dd3ba7 100644
> --- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
> +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
> @@ -70,6 +70,20 @@ Description:	Per-pmu performance monitoring events specific to the running syste
>  		This is referred to as "event parameterization". Event
>  		parameters have the format 'param=?'.
>  
> +What: /sys/bus/event_source/devices/<pmu>/events/<event>.cpus
> +Date: 2024/07/17
> +Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
> +Description:	Perf event CPUs
> +
> +		A list of CPUs on which the perf event should be
> +		opened by default. This is an event specific variant
> +		of /sys/bus/event_source/devices/<pmu>/cpumask.
> +
> +		Examples (each of these lines would be in a separate file):
> +
> +			0,2,4,6
> +			0-7
> +
>  What: /sys/bus/event_source/devices/<pmu>/events/<event>.unit
>  Date: 2014/02/24
>  Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
> diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> index 43501eb56336..b181f83c9678 100644
> --- a/tools/perf/util/parse-events.c
> +++ b/tools/perf/util/parse-events.c
> @@ -1425,12 +1425,13 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>  				bool auto_merge_stats)
>  {
>  	struct perf_event_attr attr;
> -	struct perf_pmu_info info;
> +	struct perf_pmu_info info = {};
>  	struct evsel *evsel;
>  	struct parse_events_error *err = parse_state->error;
>  	LIST_HEAD(config_terms);
>  	struct parse_events_terms parsed_terms;
>  	bool alias_rewrote_terms = false;
> +	int ret = 0;
>  
>  	if (verbose > 1) {
>  		struct strbuf sb;
> @@ -1465,8 +1466,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>  
>  	parse_events_terms__init(&parsed_terms);
>  	if (const_parsed_terms) {
> -		int ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms);
> -
> +		ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms);
>  		if (ret)
>  			return ret;
>  	}
> @@ -1474,15 +1474,15 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>  
>  	/* Configure attr/terms with a known PMU, this will set hardcoded terms. */
>  	if (config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) {
> -		parse_events_terms__exit(&parsed_terms);
> -		return -EINVAL;
> +		ret = -EINVAL;
> +		goto out_err;
>  	}
>  
>  	/* Look for event names in the terms and rewrite into format based terms. */
>  	if (!parse_state->fake_pmu && perf_pmu__check_alias(pmu, &parsed_terms,
>  							    &info, &alias_rewrote_terms, err)) {
> -		parse_events_terms__exit(&parsed_terms);
> -		return -EINVAL;
> +		ret = -EINVAL;
> +		goto out_err;
>  	}
>  
>  	if (verbose > 1) {
> @@ -1497,13 +1497,13 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>  	/* Configure attr/terms again if an alias was expanded. */
>  	if (alias_rewrote_terms &&
>  	    config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) {
> -		parse_events_terms__exit(&parsed_terms);
> -		return -EINVAL;
> +		ret = -EINVAL;
> +		goto out_err;
>  	}
>  
>  	if (get_config_terms(&parsed_terms, &config_terms)) {
> -		parse_events_terms__exit(&parsed_terms);
> -		return -ENOMEM;
> +		ret = -ENOMEM;
> +		goto out_err;
>  	}
>  
>  	/*
> @@ -1512,24 +1512,23 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>  	 */
>  	if (pmu->perf_event_attr_init_default &&
>  	    get_config_chgs(pmu, &parsed_terms, &config_terms)) {
> -		parse_events_terms__exit(&parsed_terms);
> -		return -ENOMEM;
> +		ret = -ENOMEM;
> +		goto out_err;
>  	}
>  
>  	if (!parse_state->fake_pmu &&
>  	    perf_pmu__config(pmu, &attr, &parsed_terms, parse_state->error)) {
> -		free_config_terms(&config_terms);
> -		parse_events_terms__exit(&parsed_terms);
> -		return -EINVAL;
> +		ret = -EINVAL;
> +		goto out_err;
>  	}
>  
>  	evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true,
>  			    get_config_name(&parsed_terms),
>  			    get_config_metric_id(&parsed_terms), pmu,
> -			    &config_terms, auto_merge_stats, /*cpu_list=*/NULL);
> +			    &config_terms, auto_merge_stats, info.cpus);
>  	if (!evsel) {
> -		parse_events_terms__exit(&parsed_terms);
> -		return -ENOMEM;
> +		ret = -ENOMEM;
> +		goto out_err;
>  	}
>  
>  	if (evsel->name)
> @@ -1542,13 +1541,17 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>  		return 0;
>  	}
>  
> -	parse_events_terms__exit(&parsed_terms);
>  	free((char *)evsel->unit);
>  	evsel->unit = strdup(info.unit);
>  	evsel->scale = info.scale;
>  	evsel->per_pkg = info.per_pkg;
>  	evsel->snapshot = info.snapshot;
> -	return 0;
> +out_err:
> +	parse_events_terms__exit(&parsed_terms);
> +	if (ret)
> +		free_config_terms(&config_terms);
> +	perf_cpu_map__put(info.cpus);
> +	return ret;
>  }
>  
>  int parse_events_multi_pmu_add(struct parse_events_state *parse_state,
> diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
> index 5148b6639dd3..280b2499c861 100644
> --- a/tools/perf/util/pmu.c
> +++ b/tools/perf/util/pmu.c
> @@ -73,6 +73,11 @@ struct perf_pmu_alias {
>  	 * differ from the PMU name as it won't have suffixes.
>  	 */
>  	char *pmu_name;
> +	/**
> +	 * @cpus: A possible per-event cpumap that overrides that given for the
> +	 * PMU.
> +	 */
> +	struct perf_cpu_map *cpus;
>  	/** @unit: Units for the event, such as bytes or cache lines. */
>  	char unit[UNIT_MAX_LEN+1];
>  	/** @scale: Value to scale read counter values by. */
> @@ -332,6 +337,32 @@ static int perf_pmu__parse_scale(struct perf_pmu *pmu, struct perf_pmu_alias *al
>  	return ret;
>  }
>  
> +static void perf_pmu__parse_cpus(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
> +{
> +	char path[PATH_MAX];
> +	size_t len;
> +	FILE *file;
> +	int fd;
> +
> +	len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path));
> +	if (!len)
> +		return;
> +	scnprintf(path + len, sizeof(path) - len, "%s/events/%s.cpus", pmu->name, alias->name);
> +
> +	fd = open(path, O_RDONLY);
> +	if (fd == -1)
> +		return; /* Expected common case. */
> +
> +	file = fdopen(fd, "r");
> +	if (!file) {
> +		close(fd);
> +		return;
> +	}
> +
> +	alias->cpus = perf_cpu_map__read(file);
> +	fclose(file);
> +}
> +
>  static int perf_pmu__parse_unit(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
>  {
>  	char path[PATH_MAX];
> @@ -493,6 +524,7 @@ static void read_alias_info(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
>  	/*
>  	 * load unit name and scale if available
>  	 */
> +	perf_pmu__parse_cpus(pmu, alias);
>  	perf_pmu__parse_unit(pmu, alias);
>  	perf_pmu__parse_scale(pmu, alias);
>  	perf_pmu__parse_per_pkg(pmu, alias);
> @@ -618,7 +650,7 @@ static inline bool pmu_alias_info_file(const char *name)
>  	size_t len;
>  
>  	len = strlen(name);
> -	if (len > 5 && !strcmp(name + len - 5, ".unit"))
> +	if (len > 5 && (!strcmp(name + len - 5, ".cpus") || !strcmp(name + len - 5, ".unit")))
>  		return true;
>  	if (len > 6 && !strcmp(name + len - 6, ".scale"))
>  		return true;
> @@ -1560,6 +1592,12 @@ static int check_info_data(struct perf_pmu *pmu,
>  	 * define unit, scale and snapshot, fail
>  	 * if there's more than one.
>  	 */
> +	if (!perf_cpu_map__is_empty(info->cpus) && !perf_cpu_map__is_empty(alias->cpus)) {
> +		parse_events_error__handle(err, column,
> +					strdup("Attempt to set event's cpus twice"),
> +					NULL);
> +		return -EINVAL;
> +	}
>  	if (info->unit && alias->unit[0]) {
>  		parse_events_error__handle(err, column,
>  					strdup("Attempt to set event's unit twice"),
> @@ -1579,6 +1617,9 @@ static int check_info_data(struct perf_pmu *pmu,
>  		return -EINVAL;
>  	}
>  
> +	if (!perf_cpu_map__is_empty(alias->cpus))
> +		info->cpus = perf_cpu_map__get(alias->cpus);
> +
>  	if (alias->unit[0])
>  		info->unit = alias->unit;
>  
> @@ -1610,6 +1651,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_
>  	 * Mark unit and scale as not set
>  	 * (different from default values, see below)
>  	 */
> +	info->cpus     = NULL;
>  	info->unit     = NULL;
>  	info->scale    = 0.0;
>  	info->snapshot = false;
> diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
> index b2d3fd291f02..b1ccfe8d3df4 100644
> --- a/tools/perf/util/pmu.h
> +++ b/tools/perf/util/pmu.h
> @@ -177,6 +177,7 @@ struct perf_pmu {
>  extern struct perf_pmu perf_pmu__fake;
>  
>  struct perf_pmu_info {
> +	struct perf_cpu_map *cpus;
>  	const char *unit;
>  	double scale;
>  	bool per_pkg;

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-18  0:30 ` [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on Ian Rogers
  2024-07-18 14:09   ` James Clark
@ 2024-07-18 14:41   ` Liang, Kan
  2024-07-18 15:12     ` Ian Rogers
  1 sibling, 1 reply; 34+ messages in thread
From: Liang, Kan @ 2024-07-18 14:41 UTC (permalink / raw)
  To: Ian Rogers, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> The -C option allows the CPUs for a list of events to be specified but
> it's not possible to set the CPU for a single event. Add a term to
> allow this. The term isn't a general CPU list due to ',' already being
> a special character in event parsing; instead, multiple cpu= terms may
> be provided and they will be merged/unioned together.
> 
> An example of mixing different types of events counted on different CPUs:
> ```
> $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
> 
>  Performance counter stats for 'system wide':
> 
> CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
> CPU4        <not counted>      instructions/cpu=0/
> CPU5        <not counted>      instructions/cpu=0/
> CPU8        <not counted>      instructions/cpu=0/
> CPU0        <not counted>      l1d-misses [cpu]
> CPU4              203,377      l1d-misses [cpu]
> CPU5              138,231      l1d-misses [cpu]
> CPU8        <not counted>      l1d-misses [cpu]
> CPU0        <not counted>      cpu/cpu=8/
> CPU4        <not counted>      cpu/cpu=8/
> CPU5        <not counted>      cpu/cpu=8/
> CPU8              943,861      cpu/cpu=8/
> CPU0            1,412,071      cycles
> CPU4           20,362,900      cycles
> CPU5           10,172,725      cycles
> CPU8            2,406,081      cycles
> 
>        0.102925309 seconds time elapsed
> ```
> 
> Note, the event name of inst_retired.any is missing, reported as
> cpu/cpu=8/, as there are unmerged uniquify fixes:
> https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
> 
> An example of spreading uncore overhead across two CPUs:
> ```
> $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
> 
>  Performance counter stats for 'system wide':
> 
> CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
> CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
> CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
> CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
> CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
> CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
> ```
> 
> Manually fixing the output it should be:
> ```
> CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
> CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
> CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
> CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
> ```
> 
> That is data_read from 2 PMUs was counted on CPU0 and data_write was
> counted on CPU1.

There was an effort to make the counter accessible from any CPU of the package.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6a2f9035bfc27d0e9d78b13635dda9fb017ac01

But now it limits the access to specific CPUs. It sounds like a
regression.

Thanks,
Kan

> 
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  tools/perf/Documentation/perf-list.txt |  9 ++++
>  tools/perf/util/evsel_config.h         |  1 +
>  tools/perf/util/parse-events.c         | 73 ++++++++++++++++++++++----
>  tools/perf/util/parse-events.h         |  3 +-
>  tools/perf/util/parse-events.l         |  1 +
>  tools/perf/util/pmu.c                  |  1 +
>  6 files changed, 77 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
> index 6bf2468f59d3..15511afe94a1 100644
> --- a/tools/perf/Documentation/perf-list.txt
> +++ b/tools/perf/Documentation/perf-list.txt
> @@ -273,6 +273,15 @@ Sums up the event counts for all hardware threads in a core, e.g.:
>  
>    perf stat -e cpu/event=0,umask=0x3,percore=1/
>  
> +cpu:
> +
> +Specifies the CPU to open the event upon. The value may be repeated to
> +specify opening the event on multiple CPUs:
> +
> +
> +  perf stat -e instructions/cpu=0,cpu=2/,cycles/cpu=1,cpu=2/ -a sleep 1
> +  perf stat -e data_read/cpu=0/,data_write/cpu=1/ -a sleep 1
> +
>  
>  EVENT GROUPS
>  ------------
> diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
> index aee6f808b512..9630c4a24721 100644
> --- a/tools/perf/util/evsel_config.h
> +++ b/tools/perf/util/evsel_config.h
> @@ -47,6 +47,7 @@ struct evsel_config_term {
>  		u32	      aux_sample_size;
>  		u64	      cfg_chg;
>  		char	      *str;
> +		int	      cpu;
>  	} val;
>  	bool weak;
>  };
> diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> index 8c0c33361c5e..85faef85b8de 100644
> --- a/tools/perf/util/parse-events.c
> +++ b/tools/perf/util/parse-events.c
> @@ -7,6 +7,7 @@
>  #include <errno.h>
>  #include <sys/ioctl.h>
>  #include <sys/param.h>
> +#include "cpumap.h"
>  #include "term.h"
>  #include "evlist.h"
>  #include "evsel.h"
> @@ -177,6 +178,26 @@ static char *get_config_name(const struct parse_events_terms *head_terms)
>  	return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_NAME);
>  }
>  
> +static struct perf_cpu_map *get_config_cpu(const struct parse_events_terms *head_terms)
> +{
> +	struct parse_events_term *term;
> +	struct perf_cpu_map *cpus = NULL;
> +
> +	if (!head_terms)
> +		return NULL;
> +
> +	list_for_each_entry(term, &head_terms->terms, list) {
> +		if (term->type_term == PARSE_EVENTS__TERM_TYPE_CPU) {
> +			struct perf_cpu_map *cpu = perf_cpu_map__new_int(term->val.num);
> +
> +			cpus = perf_cpu_map__merge(cpus, cpu);
> +			perf_cpu_map__put(cpu);
> +		}
> +	}
> +
> +	return cpus;
> +}
> +
>  /**
>   * fix_raw - For each raw term see if there is an event (aka alias) in pmu that
>   *           matches the raw's string value. If the string value matches an
> @@ -468,11 +489,12 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
>  	bool found_supported = false;
>  	const char *config_name = get_config_name(parsed_terms);
>  	const char *metric_id = get_config_metric_id(parsed_terms);
> +	struct perf_cpu_map *cpus = get_config_cpu(parsed_terms);
> +	int ret = 0;
>  
>  	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
>  		LIST_HEAD(config_terms);
>  		struct perf_event_attr attr;
> -		int ret;
>  
>  		if (parse_events__filter_pmu(parse_state, pmu))
>  			continue;
> @@ -486,7 +508,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
>  						   parsed_terms,
>  						   perf_pmu__auto_merge_stats(pmu));
>  			if (ret)
> -				return ret;
> +				goto out_err;
>  			continue;
>  		}
>  
> @@ -506,20 +528,27 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
>  
>  		if (parsed_terms) {
>  			if (config_attr(&attr, parsed_terms, parse_state->error,
> -					config_term_common))
> -				return -EINVAL;
> -
> -			if (get_config_terms(parsed_terms, &config_terms))
> -				return -ENOMEM;
> +					config_term_common)) {
> +				ret = -EINVAL;
> +				goto out_err;
> +			}
> +			if (get_config_terms(parsed_terms, &config_terms)) {
> +				ret = -ENOMEM;
> +				goto out_err;
> +			}
>  		}
>  
>  		if (__add_event(list, idx, &attr, /*init_attr*/true, config_name ?: name,
>  				metric_id, pmu, &config_terms, /*auto_merge_stats=*/false,
> -				/*cpu_list=*/NULL) == NULL)
> -			return -ENOMEM;
> +				cpus) == NULL)
> +			ret = -ENOMEM;
>  
>  		free_config_terms(&config_terms);
> +		if (ret)
> +			goto out_err;
>  	}
> +out_err:
> +	perf_cpu_map__put(cpus);
>  	return found_supported ? 0 : -EINVAL;
>  }
>  
> @@ -814,6 +843,7 @@ static const char *config_term_name(enum parse_events__term_type term_type)
>  		[PARSE_EVENTS__TERM_TYPE_RAW]                   = "raw",
>  		[PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE]          = "legacy-cache",
>  		[PARSE_EVENTS__TERM_TYPE_HARDWARE]              = "hardware",
> +		[PARSE_EVENTS__TERM_TYPE_CPU]			= "cpu",
>  	};
>  	if ((unsigned int)term_type >= __PARSE_EVENTS__TERM_TYPE_NR)
>  		return "unknown term";
> @@ -843,6 +873,7 @@ config_term_avail(enum parse_events__term_type term_type, struct parse_events_er
>  	case PARSE_EVENTS__TERM_TYPE_METRIC_ID:
>  	case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
>  	case PARSE_EVENTS__TERM_TYPE_PERCORE:
> +	case PARSE_EVENTS__TERM_TYPE_CPU:
>  		return true;
>  	case PARSE_EVENTS__TERM_TYPE_USER:
>  	case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
> @@ -986,6 +1017,15 @@ do {									   \
>  			return -EINVAL;
>  		}
>  		break;
> +	case PARSE_EVENTS__TERM_TYPE_CPU:
> +		CHECK_TYPE_VAL(NUM);
> +		if (term->val.num >= (u64)cpu__max_present_cpu().cpu) {
> +			parse_events_error__handle(err, term->err_val,
> +						strdup("too big"),
> +						NULL);
> +			return -EINVAL;
> +		}
> +		break;
>  	case PARSE_EVENTS__TERM_TYPE_DRV_CFG:
>  	case PARSE_EVENTS__TERM_TYPE_USER:
>  	case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
> @@ -1112,6 +1152,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr,
>  	case PARSE_EVENTS__TERM_TYPE_RAW:
>  	case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
>  	case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> +	case PARSE_EVENTS__TERM_TYPE_CPU:
>  	default:
>  		if (err) {
>  			parse_events_error__handle(err, term->err_term,
> @@ -1243,6 +1284,7 @@ do {								\
>  		case PARSE_EVENTS__TERM_TYPE_RAW:
>  		case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
>  		case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> +		case PARSE_EVENTS__TERM_TYPE_CPU:
>  		default:
>  			break;
>  		}
> @@ -1296,6 +1338,7 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head
>  		case PARSE_EVENTS__TERM_TYPE_RAW:
>  		case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
>  		case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> +		case PARSE_EVENTS__TERM_TYPE_CPU:
>  		default:
>  			break;
>  		}
> @@ -1350,6 +1393,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state,
>  	struct perf_event_attr attr;
>  	LIST_HEAD(config_terms);
>  	const char *name, *metric_id;
> +	struct perf_cpu_map *cpus;
>  	int ret;
>  
>  	memset(&attr, 0, sizeof(attr));
> @@ -1371,9 +1415,11 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state,
>  
>  	name = get_config_name(head_config);
>  	metric_id = get_config_metric_id(head_config);
> +	cpus = get_config_cpu(head_config);
>  	ret = __add_event(list, &parse_state->idx, &attr, /*init_attr*/true, name,
>  			metric_id, pmu, &config_terms, /*auto_merge_stats=*/false,
> -			/*cpu_list=*/NULL) ? 0 : -ENOMEM;
> +			cpus) ? 0 : -ENOMEM;
> +	perf_cpu_map__put(cpus);
>  	free_config_terms(&config_terms);
>  	return ret;
>  }
> @@ -1440,6 +1486,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>  	LIST_HEAD(config_terms);
>  	struct parse_events_terms parsed_terms;
>  	bool alias_rewrote_terms = false;
> +	struct perf_cpu_map *term_cpu = NULL;
>  	int ret = 0;
>  
>  	if (verbose > 1) {
> @@ -1531,6 +1578,12 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
>  		goto out_err;
>  	}
>  
> +	term_cpu = get_config_cpu(&parsed_terms);
> +	if (!perf_cpu_map__is_empty(term_cpu)) {
> +		perf_cpu_map__put(info.cpus);
> +		info.cpus = term_cpu;
> +		term_cpu = NULL;
> +	}
>  	evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true,
>  			    get_config_name(&parsed_terms),
>  			    get_config_metric_id(&parsed_terms), pmu,
> diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
> index e13de2c8b706..b03857499030 100644
> --- a/tools/perf/util/parse-events.h
> +++ b/tools/perf/util/parse-events.h
> @@ -79,7 +79,8 @@ enum parse_events__term_type {
>  	PARSE_EVENTS__TERM_TYPE_RAW,
>  	PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE,
>  	PARSE_EVENTS__TERM_TYPE_HARDWARE,
> -#define	__PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_HARDWARE + 1)
> +	PARSE_EVENTS__TERM_TYPE_CPU,
> +#define	__PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_CPU + 1)
>  };
>  
>  struct parse_events_term {
> diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
> index 16045c383ada..e06097a62796 100644
> --- a/tools/perf/util/parse-events.l
> +++ b/tools/perf/util/parse-events.l
> @@ -330,6 +330,7 @@ percore			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); }
>  aux-output		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); }
>  aux-sample-size		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); }
>  metric-id		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); }
> +cpu			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CPU); }
>  cpu-cycles|cycles				{ return hw_term(yyscanner, PERF_COUNT_HW_CPU_CYCLES); }
>  stalled-cycles-frontend|idle-cycles-frontend	{ return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); }
>  stalled-cycles-backend|idle-cycles-backend	{ return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); }
> diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
> index 280b2499c861..27e2ff23799e 100644
> --- a/tools/perf/util/pmu.c
> +++ b/tools/perf/util/pmu.c
> @@ -1767,6 +1767,7 @@ int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_call
>  		"percore",
>  		"aux-output",
>  		"aux-sample-size=number",
> +		"cpu=number",
>  	};
>  	struct perf_pmu_format *format;
>  	int ret;

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term
  2024-07-18 13:42 ` [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Dhananjay Ugwekar
@ 2024-07-18 15:00   ` Ian Rogers
  0 siblings, 0 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-18 15:00 UTC (permalink / raw)
  To: Dhananjay Ugwekar
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Kan Liang, Bjorn Helgaas, Jonathan Corbet,
	James Clark, Ravi Bangoria, Dominique Martinet, linux-kernel,
	linux-perf-users, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Thu, Jul 18, 2024 at 6:42 AM Dhananjay Ugwekar
<Dhananjay.Ugwekar@amd.com> wrote:
>
> Hello Ian,
>
> On 7/18/2024 6:00 AM, Ian Rogers wrote:
> > The need for a sysfs event.cpus file is discussed here:
> > https://lore.kernel.org/lkml/CAP-5=fXXuWchzUK0n5KTH8kamr=DQoEni+bUoo8f-4j8Y+eMBg@mail.gmail.com/
> > following Dhananjay Ugwekar's work on the RAPL /sys/devices/power PMU.
> > These changes add support for the event.cpus file in sysfs and also a
> > cpu event term allowing events to have differing CPUs. This was added
> > in order to test the parsing and map propagation for the sysfs case.
>
> Tried applying the patchset on top of v6.10, torvalds/master and tip/master,
> it is failing at patch#2, what is the base commit/branch for this patchset?

Hi Dhananjay,

The latest perf tool work is on the perf-tool-next branch of the
perf-tool-next.git:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/

That said, Kan isn't keen on event.cpus, so let's have a discussion about it.

Thanks,
Ian

> Thanks,
> Dhananjay
>
> >
> > v2: Add support for multiple cpu terms on an event that are
> >     merged. For example, an event of "l1d-misses/cpu=4,cpu=5/" will
> >     now be opened on both CPU 4 and 5 rather than just CPU 4.
> >
> > Ian Rogers (6):
> >   perf pmu: Merge boolean sysfs event option parsing
> >   perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event
> >   perf pmu: Add support for event.cpus files in sysfs
> >   libperf cpumap: Add ability to create CPU from a single CPU number
> >   perf parse-events: Set is_pmu_core for legacy hardware events
> >   perf parse-events: Add "cpu" term to set the CPU an event is recorded
> >     on
> >
> >  .../sysfs-bus-event_source-devices-events     |  14 ++
> >  tools/lib/perf/cpumap.c                       |  10 ++
> >  tools/lib/perf/include/perf/cpumap.h          |   2 +
> >  tools/perf/Documentation/perf-list.txt        |   9 +
> >  tools/perf/util/evsel_config.h                |   1 +
> >  tools/perf/util/parse-events.c                | 162 ++++++++++++------
> >  tools/perf/util/parse-events.h                |   3 +-
> >  tools/perf/util/parse-events.l                |   1 +
> >  tools/perf/util/pmu.c                         |  92 +++++++---
> >  tools/perf/util/pmu.h                         |   1 +
> >  10 files changed, 221 insertions(+), 74 deletions(-)
> >

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-18 14:09   ` James Clark
@ 2024-07-18 15:07     ` Ian Rogers
  0 siblings, 0 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-18 15:07 UTC (permalink / raw)
  To: James Clark
  Cc: Dhananjay Ugwekar, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Adrian Hunter, Kan Liang,
	Bjorn Helgaas, Jonathan Corbet, James Clark, Ravi Bangoria,
	Dominique Martinet, linux-kernel, linux-perf-users,
	ananth.narayan, gautham.shenoy, kprateek.nayak, sandipan.das

On Thu, Jul 18, 2024 at 7:10 AM James Clark <james.clark@linaro.org> wrote:
>
>
>
> On 18/07/2024 1:30 am, Ian Rogers wrote:
> > The -C option allows the CPUs for a list of events to be specified but
> > it's not possible to set the CPU for a single event. Add a term to
> > allow this. The term isn't a general CPU list due to ',' already being
> > a special character in event parsing; instead, multiple cpu= terms may
> > be provided and they will be merged/unioned together.
> >
> > An example of mixing different types of events counted on different CPUs:
> > ```
> > $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
> >
>
> I ran into cpu=N having no effect unless -C is also used. For example:
>
>   $ perf stat -vv -e branch-misses/cpu=1/ -- true
>
>   sys_perf_event_open: pid 10233  cpu -1  group_fd -1  flags 0x8 = 3
>
> Vs:
>
>   $ perf stat -C 0,1 -vv -e branch-misses/cpu=1/ -- true
>
>   sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 3
>
> Is it possible to have a warning or error in that case? Seems like it's
> quite likely to confuse a user.

Agreed. We have similar warnings for "-A". Tbh, there's no reason we
can't support this; perf_event_open does. The evlist propagate maps
logic in my mind is crazy. I agree with a need to summarize the cpu
maps in the evlist, but the code changes evsel cpu maps in ways that
are brittle - for example, in this series I try to fix the fact that
legacy hardware and hardware cache events aren't considered core.
Rather than the propagate-maps fudge of processing command line
options, I just wish event parsing would do the parsing and the events
be pretty immutable after that.
our command line processing is ordering dependent.

Thanks,
Ian

> Other than that it seems to work ok.
>
> WRT the RAPL issue [1], I didn't quite follow how these two things will
> join up. Doesn't this only end up setting the cpu argument to
> perf_event_open(), which already aggregates per package rather than per
> core, even if the cpu is set? And the fix for that behavior was rejected
> because it would break Intel metrics [2].
>
> Maybe I'm missing a piece.
>
> [1]:
> https://lore.kernel.org/lkml/CAP-5=fXXuWchzUK0n5KTH8kamr=DQoEni+bUoo8f-4j8Y+eMBg@mail.gmail.com/
> [2]:
> https://lore.kernel.org/lkml/3e766f0e-37d4-0f82-3868-31b14228868d@linux.intel.com/
>
> James
>
> >   Performance counter stats for 'system wide':
> >
> > CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
> > CPU4        <not counted>      instructions/cpu=0/
> > CPU5        <not counted>      instructions/cpu=0/
> > CPU8        <not counted>      instructions/cpu=0/
> > CPU0        <not counted>      l1d-misses [cpu]
> > CPU4              203,377      l1d-misses [cpu]
> > CPU5              138,231      l1d-misses [cpu]
> > CPU8        <not counted>      l1d-misses [cpu]
> > CPU0        <not counted>      cpu/cpu=8/
> > CPU4        <not counted>      cpu/cpu=8/
> > CPU5        <not counted>      cpu/cpu=8/
> > CPU8              943,861      cpu/cpu=8/
> > CPU0            1,412,071      cycles
> > CPU4           20,362,900      cycles
> > CPU5           10,172,725      cycles
> > CPU8            2,406,081      cycles
> >
> >         0.102925309 seconds time elapsed
> > ```
> >
> > Note, the event name of inst_retired.any is missing, reported as
> > cpu/cpu=8/, as there are unmerged uniquify fixes:
> > https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
> >
> > An example of spreading uncore overhead across two CPUs:
> > ```
> > $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
> >
> >   Performance counter stats for 'system wide':
> >
> > CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
> > CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
> > CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
> > CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
> > CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
> > CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
> > ```
> >
> > Manually fixing the output it should be:
> > ```
> > CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
> > CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
> > CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
> > CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
> > ```
> >
> > That is data_read from 2 PMUs was counted on CPU0 and data_write was
> > counted on CPU1.
> >
> > Signed-off-by: Ian Rogers <irogers@google.com>
> > ---
> >   tools/perf/Documentation/perf-list.txt |  9 ++++
> >   tools/perf/util/evsel_config.h         |  1 +
> >   tools/perf/util/parse-events.c         | 73 ++++++++++++++++++++++----
> >   tools/perf/util/parse-events.h         |  3 +-
> >   tools/perf/util/parse-events.l         |  1 +
> >   tools/perf/util/pmu.c                  |  1 +
> >   6 files changed, 77 insertions(+), 11 deletions(-)
> >
> > diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
> > index 6bf2468f59d3..15511afe94a1 100644
> > --- a/tools/perf/Documentation/perf-list.txt
> > +++ b/tools/perf/Documentation/perf-list.txt
> > @@ -273,6 +273,15 @@ Sums up the event counts for all hardware threads in a core, e.g.:
> >
> >     perf stat -e cpu/event=0,umask=0x3,percore=1/
> >
> > +cpu:
> > +
> > +Specifies the CPU to open the event upon. The value may be repeated to
> > +specify opening the event on multiple CPUs:
> > +
> > +
> > +  perf stat -e instructions/cpu=0,cpu=2/,cycles/cpu=1,cpu=2/ -a sleep 1
> > +  perf stat -e data_read/cpu=0/,data_write/cpu=1/ -a sleep 1
> > +
> >
> >   EVENT GROUPS
> >   ------------
> > diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
> > index aee6f808b512..9630c4a24721 100644
> > --- a/tools/perf/util/evsel_config.h
> > +++ b/tools/perf/util/evsel_config.h
> > @@ -47,6 +47,7 @@ struct evsel_config_term {
> >               u32           aux_sample_size;
> >               u64           cfg_chg;
> >               char          *str;
> > +             int           cpu;
> >       } val;
> >       bool weak;
> >   };
> > diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> > index 8c0c33361c5e..85faef85b8de 100644
> > --- a/tools/perf/util/parse-events.c
> > +++ b/tools/perf/util/parse-events.c
> > @@ -7,6 +7,7 @@
> >   #include <errno.h>
> >   #include <sys/ioctl.h>
> >   #include <sys/param.h>
> > +#include "cpumap.h"
> >   #include "term.h"
> >   #include "evlist.h"
> >   #include "evsel.h"
> > @@ -177,6 +178,26 @@ static char *get_config_name(const struct parse_events_terms *head_terms)
> >       return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_NAME);
> >   }
> >
> > +static struct perf_cpu_map *get_config_cpu(const struct parse_events_terms *head_terms)
> > +{
> > +     struct parse_events_term *term;
> > +     struct perf_cpu_map *cpus = NULL;
> > +
> > +     if (!head_terms)
> > +             return NULL;
> > +
> > +     list_for_each_entry(term, &head_terms->terms, list) {
> > +             if (term->type_term == PARSE_EVENTS__TERM_TYPE_CPU) {
> > +                     struct perf_cpu_map *cpu = perf_cpu_map__new_int(term->val.num);
> > +
> > +                     cpus = perf_cpu_map__merge(cpus, cpu);
> > +                     perf_cpu_map__put(cpu);
> > +             }
> > +     }
> > +
> > +     return cpus;
> > +}
> > +
> >   /**
> >    * fix_raw - For each raw term see if there is an event (aka alias) in pmu that
> >    *           matches the raw's string value. If the string value matches an
> > @@ -468,11 +489,12 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
> >       bool found_supported = false;
> >       const char *config_name = get_config_name(parsed_terms);
> >       const char *metric_id = get_config_metric_id(parsed_terms);
> > +     struct perf_cpu_map *cpus = get_config_cpu(parsed_terms);
> > +     int ret = 0;
> >
> >       while ((pmu = perf_pmus__scan(pmu)) != NULL) {
> >               LIST_HEAD(config_terms);
> >               struct perf_event_attr attr;
> > -             int ret;
> >
> >               if (parse_events__filter_pmu(parse_state, pmu))
> >                       continue;
> > @@ -486,7 +508,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
> >                                                  parsed_terms,
> >                                                  perf_pmu__auto_merge_stats(pmu));
> >                       if (ret)
> > -                             return ret;
> > +                             goto out_err;
> >                       continue;
> >               }
> >
> > @@ -506,20 +528,27 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name,
> >
> >               if (parsed_terms) {
> >                       if (config_attr(&attr, parsed_terms, parse_state->error,
> > -                                     config_term_common))
> > -                             return -EINVAL;
> > -
> > -                     if (get_config_terms(parsed_terms, &config_terms))
> > -                             return -ENOMEM;
> > +                                     config_term_common)) {
> > +                             ret = -EINVAL;
> > +                             goto out_err;
> > +                     }
> > +                     if (get_config_terms(parsed_terms, &config_terms)) {
> > +                             ret = -ENOMEM;
> > +                             goto out_err;
> > +                     }
> >               }
> >
> >               if (__add_event(list, idx, &attr, /*init_attr*/true, config_name ?: name,
> >                               metric_id, pmu, &config_terms, /*auto_merge_stats=*/false,
> > -                             /*cpu_list=*/NULL) == NULL)
> > -                     return -ENOMEM;
> > +                             cpus) == NULL)
> > +                     ret = -ENOMEM;
> >
> >               free_config_terms(&config_terms);
> > +             if (ret)
> > +                     goto out_err;
> >       }
> > +out_err:
> > +     perf_cpu_map__put(cpus);
> >       return found_supported ? 0 : -EINVAL;
> >   }
> >
> > @@ -814,6 +843,7 @@ static const char *config_term_name(enum parse_events__term_type term_type)
> >               [PARSE_EVENTS__TERM_TYPE_RAW]                   = "raw",
> >               [PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE]          = "legacy-cache",
> >               [PARSE_EVENTS__TERM_TYPE_HARDWARE]              = "hardware",
> > +             [PARSE_EVENTS__TERM_TYPE_CPU]                   = "cpu",
> >       };
> >       if ((unsigned int)term_type >= __PARSE_EVENTS__TERM_TYPE_NR)
> >               return "unknown term";
> > @@ -843,6 +873,7 @@ config_term_avail(enum parse_events__term_type term_type, struct parse_events_er
> >       case PARSE_EVENTS__TERM_TYPE_METRIC_ID:
> >       case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
> >       case PARSE_EVENTS__TERM_TYPE_PERCORE:
> > +     case PARSE_EVENTS__TERM_TYPE_CPU:
> >               return true;
> >       case PARSE_EVENTS__TERM_TYPE_USER:
> >       case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
> > @@ -986,6 +1017,15 @@ do {                                                                       \
> >                       return -EINVAL;
> >               }
> >               break;
> > +     case PARSE_EVENTS__TERM_TYPE_CPU:
> > +             CHECK_TYPE_VAL(NUM);
> > +             if (term->val.num >= (u64)cpu__max_present_cpu().cpu) {
> > +                     parse_events_error__handle(err, term->err_val,
> > +                                             strdup("too big"),
> > +                                             NULL);
> > +                     return -EINVAL;
> > +             }
> > +             break;
> >       case PARSE_EVENTS__TERM_TYPE_DRV_CFG:
> >       case PARSE_EVENTS__TERM_TYPE_USER:
> >       case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
> > @@ -1112,6 +1152,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr,
> >       case PARSE_EVENTS__TERM_TYPE_RAW:
> >       case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
> >       case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> > +     case PARSE_EVENTS__TERM_TYPE_CPU:
> >       default:
> >               if (err) {
> >                       parse_events_error__handle(err, term->err_term,
> > @@ -1243,6 +1284,7 @@ do {                                                            \
> >               case PARSE_EVENTS__TERM_TYPE_RAW:
> >               case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
> >               case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> > +             case PARSE_EVENTS__TERM_TYPE_CPU:
> >               default:
> >                       break;
> >               }
> > @@ -1296,6 +1338,7 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head
> >               case PARSE_EVENTS__TERM_TYPE_RAW:
> >               case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE:
> >               case PARSE_EVENTS__TERM_TYPE_HARDWARE:
> > +             case PARSE_EVENTS__TERM_TYPE_CPU:
> >               default:
> >                       break;
> >               }
> > @@ -1350,6 +1393,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state,
> >       struct perf_event_attr attr;
> >       LIST_HEAD(config_terms);
> >       const char *name, *metric_id;
> > +     struct perf_cpu_map *cpus;
> >       int ret;
> >
> >       memset(&attr, 0, sizeof(attr));
> > @@ -1371,9 +1415,11 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state,
> >
> >       name = get_config_name(head_config);
> >       metric_id = get_config_metric_id(head_config);
> > +     cpus = get_config_cpu(head_config);
> >       ret = __add_event(list, &parse_state->idx, &attr, /*init_attr*/true, name,
> >                       metric_id, pmu, &config_terms, /*auto_merge_stats=*/false,
> > -                     /*cpu_list=*/NULL) ? 0 : -ENOMEM;
> > +                     cpus) ? 0 : -ENOMEM;
> > +     perf_cpu_map__put(cpus);
> >       free_config_terms(&config_terms);
> >       return ret;
> >   }
> > @@ -1440,6 +1486,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
> >       LIST_HEAD(config_terms);
> >       struct parse_events_terms parsed_terms;
> >       bool alias_rewrote_terms = false;
> > +     struct perf_cpu_map *term_cpu = NULL;
> >       int ret = 0;
> >
> >       if (verbose > 1) {
> > @@ -1531,6 +1578,12 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
> >               goto out_err;
> >       }
> >
> > +     term_cpu = get_config_cpu(&parsed_terms);
> > +     if (!perf_cpu_map__is_empty(term_cpu)) {
> > +             perf_cpu_map__put(info.cpus);
> > +             info.cpus = term_cpu;
> > +             term_cpu = NULL;
> > +     }
> >       evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true,
> >                           get_config_name(&parsed_terms),
> >                           get_config_metric_id(&parsed_terms), pmu,
> > diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
> > index e13de2c8b706..b03857499030 100644
> > --- a/tools/perf/util/parse-events.h
> > +++ b/tools/perf/util/parse-events.h
> > @@ -79,7 +79,8 @@ enum parse_events__term_type {
> >       PARSE_EVENTS__TERM_TYPE_RAW,
> >       PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE,
> >       PARSE_EVENTS__TERM_TYPE_HARDWARE,
> > -#define      __PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_HARDWARE + 1)
> > +     PARSE_EVENTS__TERM_TYPE_CPU,
> > +#define      __PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_CPU + 1)
> >   };
> >
> >   struct parse_events_term {
> > diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
> > index 16045c383ada..e06097a62796 100644
> > --- a/tools/perf/util/parse-events.l
> > +++ b/tools/perf/util/parse-events.l
> > @@ -330,6 +330,7 @@ percore                   { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); }
> >   aux-output          { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); }
> >   aux-sample-size             { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); }
> >   metric-id           { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); }
> > +cpu                  { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CPU); }
> >   cpu-cycles|cycles                           { return hw_term(yyscanner, PERF_COUNT_HW_CPU_CYCLES); }
> >   stalled-cycles-frontend|idle-cycles-frontend        { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); }
> >   stalled-cycles-backend|idle-cycles-backend  { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); }
> > diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
> > index 280b2499c861..27e2ff23799e 100644
> > --- a/tools/perf/util/pmu.c
> > +++ b/tools/perf/util/pmu.c
> > @@ -1767,6 +1767,7 @@ int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_call
> >               "percore",
> >               "aux-output",
> >               "aux-sample-size=number",
> > +             "cpu=number",
> >       };
> >       struct perf_pmu_format *format;
> >       int ret;

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-18 14:41   ` Liang, Kan
@ 2024-07-18 15:12     ` Ian Rogers
  2024-07-18 18:02       ` Liang, Kan
  0 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-18 15:12 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Thu, Jul 18, 2024 at 7:41 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> > The -C option allows the CPUs for a list of events to be specified but
> > its not possible to set the CPU for a single event. Add a term to
> > allow this. The term isn't a general CPU list due to ',' already being
> > a special character in event parsing instead multiple cpu= terms may
> > be provided and they will be merged/unioned together.
> >
> > An example of mixing different types of events counted on different CPUs:
> > ```
> > $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
> >
> >  Performance counter stats for 'system wide':
> >
> > CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
> > CPU4        <not counted>      instructions/cpu=0/
> > CPU5        <not counted>      instructions/cpu=0/
> > CPU8        <not counted>      instructions/cpu=0/
> > CPU0        <not counted>      l1d-misses [cpu]
> > CPU4              203,377      l1d-misses [cpu]
> > CPU5              138,231      l1d-misses [cpu]
> > CPU8        <not counted>      l1d-misses [cpu]
> > CPU0        <not counted>      cpu/cpu=8/
> > CPU4        <not counted>      cpu/cpu=8/
> > CPU5        <not counted>      cpu/cpu=8/
> > CPU8              943,861      cpu/cpu=8/
> > CPU0            1,412,071      cycles
> > CPU4           20,362,900      cycles
> > CPU5           10,172,725      cycles
> > CPU8            2,406,081      cycles
> >
> >        0.102925309 seconds time elapsed
> > ```
> >
> > Note, the event name of inst_retired.any is missing, reported as
> > cpu/cpu=8/, as there are unmerged uniquify fixes:
> > https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
> >
> > An example of spreading uncore overhead across two CPUs:
> > ```
> > $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
> >
> >  Performance counter stats for 'system wide':
> >
> > CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
> > CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
> > CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
> > CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
> > CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
> > CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
> > ```
> >
> > Manually fixing the output it should be:
> > ```
> > CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
> > CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
> > CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
> > CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
> > ```
> >
> > That is data_read from 2 PMUs was counted on CPU0 and data_write was
> > counted on CPU1.
>
> There was an effort to make the counter access from any CPU of the package.
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6a2f9035bfc27d0e9d78b13635dda9fb017ac01
>
> But now it limits the access from specific CPUs. It sounds like a
> regression.

Thanks Kan, I'm not sure I understand the comment. The overhead I was
thinking of here is more along the lines of cgroup context switches
(although that isn't in my example). There may be a large number of
say memory controller events just by having 2 events for each PMU and
then there are 10s of PMUs. By putting half of the events on 1 CPU and
half on another, the context switch overhead is shared. That said, the
counters don't care what cgroup is accessing memory, and users doing
this are likely making some kind of error.

Thanks,
Ian

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-18 14:33   ` Liang, Kan
@ 2024-07-18 15:39     ` Ian Rogers
  2024-07-18 17:47       ` Liang, Kan
  0 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-18 15:39 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Thu, Jul 18, 2024 at 7:33 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> > If an event file exists in sysfs, check if a event.cpus file exists
> > and read a perf_cpu_map from it if it does. This allows particular
> > events to have a different set of CPUs compared to the PMU.
> >
> > One scenario where this could be useful is when a PMU is set up with a
> > cpumask/events per SMT thread but some events record for all SMT
> > threads. Programming an event on each SMT thread will cause
> > unnecessary counters to be programmed and the aggregate value to be
> > too large.
>
> If I understand the scenario correctly, I think the issue should have
> been addressed since ICX, with the introduction of  the "SMT-aware
> events". Is there a specific event which still has the issue on newer
> platforms?

Nothing comes to mind, but it is there on popular machines like Skylake.

> >
> > Another scenario where this could be useful if when a PMU has
> > historically had a cpumask at the package level, but now newer per
> > die, core or CPU information is available.
>
> The traditional way to support new counters with a different scope is to
> add a new PMU.
>
> >
> > Additional context for the motivation is in these patches and
> > conversation:
> > https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
>
> I don't see the advantages of the new event.cpus way.
> The aggregation should be the same.

Agreed. My concern is that we may end up with a pattern of
<pmu>_per_pkg, <pmu>_per_core, <pmu>_per_cpu, <pmu>_per_l3, etc. just
for the sake of the cpumask.

> The RAPL counters are free-running counters. So there is no difference
> whether grouping or not grouping.

Should the RAPL counters return true for perf_pmu__is_software? In the
tool we currently return false and won't allow grouping these events
with other hardware events. The intent in perf_pmu__is_software was to
emulate the way the kernel allows/disallows groups - software context
events can be in a hardware context but otherwise I don't believe the
grouping is allowed.

> But it makes the kernel driver complex, since it has to maintain at
> least two different cpumasks.

Two drivers each maintaining a cpumask or 1 driver maintaining 2
cpumasks, it seems like there is a chance for code reuse in both cases.
I'm not seeing how it adds to complexity particularly.

> The tool becomes complex either, since it has to take care of the
> per-event CPU override case.

I'm not sure I agree with this. What we need for perf_event_open is a
perf_event_attr, we dress these up as evsels which also have the
ability to be for >1 CPU or thread. Longer term I think evsels can
also have >1 PMU, for the wildcard cases like uncore memory
controllers - this would remove the need for resorting evsels except
for topdown events which have thrown us a giant bundle of challenges.
Anyway, so an evsel is perf_event_attr information paired with CPU
information, it makes sense to me that the parser should do this
pairing. What's harder in the evsel/evlist setup is trying to fix CPU
maps up not in parsing, like with __perf_evlist__propagate_maps where
the parsing is trying to leave crumbs around (like system_wide,
has_user_cpus, is_pmu_core) so the map propagation works properly.

> The json file must also be updated to add a
> new field cpumask.

Yeah, I don't like this as it means we end up putting CPU information
into the json that isn't the same for every CPU variant of the same
architecture model. Maybe we can have some kind of "scope" enum value
in the json and then when the scope differs from the PMU's, core scope
vs the PMU's hyperthread scope, then in the tool we can figure out the
cpumask from the topology in sysfs. Maybe we should just always use
the topology and get rid of cpumask files in sysfs, replacing them
with "scope" files. Will Deacon pushed back on having ARM PMUs
supporting hot plugging
(https://lore.kernel.org/lkml/20240701142222.GA2691@willie-the-truck/)
where the main thing hot plugging handler needs to maintain is set the
cpumask.

Thanks,
Ian

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-18 15:39     ` Ian Rogers
@ 2024-07-18 17:47       ` Liang, Kan
  2024-07-18 20:50         ` Ian Rogers
  0 siblings, 1 reply; 34+ messages in thread
From: Liang, Kan @ 2024-07-18 17:47 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-18 11:39 a.m., Ian Rogers wrote:
> On Thu, Jul 18, 2024 at 7:33 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>>
>>
>> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
>>> If an event file exists in sysfs, check if a event.cpus file exists
>>> and read a perf_cpu_map from it if it does. This allows particular
>>> events to have a different set of CPUs compared to the PMU.
>>>
>>> One scenario where this could be useful is when a PMU is set up with a
>>> cpumask/events per SMT thread but some events record for all SMT
>>> threads. Programming an event on each SMT thread will cause
>>> unnecessary counters to be programmed and the aggregate value to be
>>> too large.
>>
>> If I understand the scenario correctly, I think the issue should have
>> been addressed since ICX, with the introduction of  the "SMT-aware
>> events". Is there a specific event which still has the issue on newer
>> platforms?
> 
> Nothing comes to mind, but it is there on popular machines like Skylake.
> 
>>>
>>> Another scenario where this could be useful if when a PMU has
>>> historically had a cpumask at the package level, but now newer per
>>> die, core or CPU information is available.
>>
>> The traditional way to support new counters with a different scope is to
>> add a new PMU.
>>
>>>
>>> Additional context for the motivation is in these patches and
>>> conversation:
>>> https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
>>
>> I don't see the advantages of the new event.cpus way.
>> The aggregation should be the same.
> 
> Agreed. My concern is that we may end up with a pattern of
> <pmu>_per_pkg, <pmu>_per_core, <pmu>_per_cpu, <pmu>_per_l3, etc. just
> for the sake of the cpumask.

The cstate PMUs already do so, e.g., cstate_core, cstate_pkg.

From another perspective, it discloses the scope information in a PMU
name. If a user only cares about the information of a package, they just
need to focus on everything under <pmu>_pkg. Otherwise, they have to go
through all the events.

> 
>> The RAPL counters are free-running counters. So there is no difference
>> whether grouping or not grouping.
> 
> Should the RAPL counters return true for perf_pmu__is_software? In the
> tool we currently return false and won't allow grouping, these events
> with other hardware events. The intent in perf_pmu__is_software was to
> emulate the way the kernel allows/disallows groups - software context
> events can be in a hardware context but otherwise I don't believe the
> grouping is allowed.

No, it's not allowed for the RAPL counters.

If the motivation is to find another way to group counters with
different scope, it may not work.

We once tried to mix the perf_invalid_context PMUs in a group. But it's
denied.
https://yhbt.net/lore/all/20150415172856.GA5029@twins.programming.kicks-ass.net/

The event.cpus still faces the same issues.

> 
>> But it makes the kernel driver complex, since it has to maintain at
>> least two different cpumasks.
> 
> Two drivers each maintaining a cpumask or 1 driver maintaining 2
> cpumasks, it seems like there is chance for code reuse in both cases.
> I'm not seeing how it adds to complexity particularly.

Yes, there are some cleanup opportunities for the cpumask/hotplug codes.

We may even abstract some generic interfaces for pkg- or core-level PMUs.

Eventually, the complexity/duplication should be able to be reduced.

> 
>> The tool becomes complex either, since it has to take care of the
>> per-event CPU override case.
> 
> I'm not sure I agree with this. What we need for perf_event_open is a
> perf_event_attr, we dress these up as evsels which also have the
> ability to be for >1 CPU or thread. Longer term I think evsels can
> also have >1 PMU, for the wildcard cases like uncore memory
> controllers - this would remove the need for resorting evsels except
> for topdown events which have thrown us a giant bundle of challenges.
> Anyway, so an evsel is perf_event_attr information paired with CPU
> information, it makes sense to me that the parser should do this
> pairing. What's harder in the evsel/evlist setup is trying to fix CPU
> maps up not in parsing, like with __perf_evlist__propagate_maps where
> the parsing is trying to leave crumbs around (like system_wide,
> has_user_cpus, is_pmu_core) so the map propagation works properly.
> 
>> The json file must also be updated to add a
>> new field cpumask.
> 
> Yeah, I don't like this as it means we end up putting CPU information
> into the json that isn't the same for every CPU variant of the same
> architecture model. Maybe we can have some kind of "scope" enum value
> in the json and then when the scope differs from the PMU's, core scope
> vs the PMU's hyperthread scope, then in the tool we can figure out the
> cpumask from the topology in sysfs. Maybe we should just always use
> the topology and get rid of cpumask files in sysfs, replacing them
> with "scope" files. Will Deacon pushed back on having ARM PMUs
> supporting hot plugging
> (https://lore.kernel.org/lkml/20240701142222.GA2691@willie-the-truck/)
> where the main thing hot plugging handler needs to maintain is set the
> cpumask.

Not just the cpumask but also migrate the context for some PMUs, see
perf_pmu_migrate_context().

It seems we really need a shared cpumask in the generic code, so the
drivers don't need to handle the hotplug everywhere. I will think about it.

Thanks,
Kan


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-18 15:12     ` Ian Rogers
@ 2024-07-18 18:02       ` Liang, Kan
  2024-07-18 21:06         ` Ian Rogers
  0 siblings, 1 reply; 34+ messages in thread
From: Liang, Kan @ 2024-07-18 18:02 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-18 11:12 a.m., Ian Rogers wrote:
> On Thu, Jul 18, 2024 at 7:41 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>>
>>
>> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
>>> The -C option allows the CPUs for a list of events to be specified but
>>> its not possible to set the CPU for a single event. Add a term to
>>> allow this. The term isn't a general CPU list due to ',' already being
>>> a special character in event parsing instead multiple cpu= terms may
>>> be provided and they will be merged/unioned together.
>>>
>>> An example of mixing different types of events counted on different CPUs:
>>> ```
>>> $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
>>>
>>>  Performance counter stats for 'system wide':
>>>
>>> CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
>>> CPU4        <not counted>      instructions/cpu=0/
>>> CPU5        <not counted>      instructions/cpu=0/
>>> CPU8        <not counted>      instructions/cpu=0/
>>> CPU0        <not counted>      l1d-misses [cpu]
>>> CPU4              203,377      l1d-misses [cpu]
>>> CPU5              138,231      l1d-misses [cpu]
>>> CPU8        <not counted>      l1d-misses [cpu]
>>> CPU0        <not counted>      cpu/cpu=8/
>>> CPU4        <not counted>      cpu/cpu=8/
>>> CPU5        <not counted>      cpu/cpu=8/
>>> CPU8              943,861      cpu/cpu=8/
>>> CPU0            1,412,071      cycles
>>> CPU4           20,362,900      cycles
>>> CPU5           10,172,725      cycles
>>> CPU8            2,406,081      cycles
>>>
>>>        0.102925309 seconds time elapsed
>>> ```
>>>
>>> Note, the event name of inst_retired.any is missing, reported as
>>> cpu/cpu=8/, as there are unmerged uniquify fixes:
>>> https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
>>>
>>> An example of spreading uncore overhead across two CPUs:
>>> ```
>>> $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
>>>
>>>  Performance counter stats for 'system wide':
>>>
>>> CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
>>> CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
>>> CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
>>> CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
>>> CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
>>> CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
>>> ```
>>>
>>> Manually fixing the output it should be:
>>> ```
>>> CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
>>> CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
>>> CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
>>> CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
>>> ```
>>>
>>> That is data_read from 2 PMUs was counted on CPU0 and data_write was
>>> counted on CPU1.
>>
>> There was an effort to make the counter access from any CPU of the package.
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6a2f9035bfc27d0e9d78b13635dda9fb017ac01
>>
>> But now it limits the access from specific CPUs. It sounds like a
>> regression.
> 
> Thanks Kan, I'm not sure I understand the comment. 

The flag is also applied for the uncore and RAPL.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/x86/events/intel/uncore.c?&id=e64cd6f73ff5a7eb4f8f759049ee24a3fe55e731

So specifying a CPU to an uncore event doesn't make sense. If the
current CPU is in the same package as the asked CPU, the kernel will
always choose the current CPU.

Thanks,
Kan
> The overhead I was
> thinking of here is more along the lines of cgroup context switches
> (although that isn't in my example). There may be a large number of
> say memory controller events just by having 2 events for each PMU and
> then there are 10s of PMUs. By putting half of the events on 1 CPU and
> half on another, the context switch overhead is shared. That said, the
> counters don't care what cgroup is accessing memory, and users doing
> this are likely making some kind of error.
> 
> Thanks,
> Ian
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-18 17:47       ` Liang, Kan
@ 2024-07-18 20:50         ` Ian Rogers
  2024-07-19 13:55           ` Liang, Kan
  0 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-18 20:50 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Thu, Jul 18, 2024 at 10:48 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2024-07-18 11:39 a.m., Ian Rogers wrote:
> > On Thu, Jul 18, 2024 at 7:33 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>
> >>
> >>
> >> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> >>> If an event file exists in sysfs, check if a event.cpus file exists
> >>> and read a perf_cpu_map from it if it does. This allows particular
> >>> events to have a different set of CPUs compared to the PMU.
> >>>
> >>> One scenario where this could be useful is when a PMU is set up with a
> >>> cpumask/events per SMT thread but some events record for all SMT
> >>> threads. Programming an event on each SMT thread will cause
> >>> unnecessary counters to be programmed and the aggregate value to be
> >>> too large.
> >>
> >> If I understand the scenario correctly, I think the issue should have
> >> been addressed since ICX, with the introduction of  the "SMT-aware
> >> events". Is there a specific event which still has the issue on newer
> >> platforms?
> >
> > Nothing comes to mind, but it is there on popular machines like Skylake.
> >
> >>>
> >>> Another scenario where this could be useful if when a PMU has
> >>> historically had a cpumask at the package level, but now newer per
> >>> die, core or CPU information is available.
> >>
> >> The traditional way to support new counters with a different scope is to
> >> add a new PMU.
> >>
> >>>
> >>> Additional context for the motivation is in these patches and
> >>> conversation:
> >>> https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
> >>
> >> I don't see the advantages of the new event.cpus way.
> >> The aggregation should be the same.
> >
> > Agreed. My concern is that we may end up with a pattern of
> > <pmu>_per_pkg, <pmu>_per_core, <pmu>_per_cpu, <pmu>_per_l3, etc. just
> > for the sake of the cpumask.
>
> The cstate PMUs already do so, e.g., cstate_core, cstate_pkg.
>
> From another perspective, it discloses the scope information in a PMU
> name. If a user only cares about the information of a package, they just
> need to focus on everything under <pmu>_pkg. Otherwise, they have to go
> through all the events.
>
> >
> >> The RAPL counters are free-running counters. So there is no difference
> >> whether grouping or not grouping.
> >
> > Should the RAPL counters return true for perf_pmu__is_software? In the
> > tool we currently return false and won't allow grouping, these events
> > with other hardware events. The intent in perf_pmu__is_software was to
> > emulate the way the kernel allows/disallows groups - software context
> > events can be in a hardware context but otherwise I don't believe the
> > grouping is allowed.
>
> No, it's not allowed for the RAPL counters.
>
> If the motivation is to find another way to group counters with
> different scope, it may not work.
>
> We once tried to mix the perf_invalid_context PMUs in a group. But it's
> denied.
> https://yhbt.net/lore/all/20150415172856.GA5029@twins.programming.kicks-ass.net/
>
> The event.cpus still faces the same issues.

Why so? The events would share the same perf_event_context, I thought
the perf_event_open would succeed.

> >
> >> But it makes the kernel driver complex, since it has to maintain at
> >> least two different cpumasks.
> >
> > Two drivers each maintaining a cpumask or 1 driver maintaining 2
> > cpumasks, it seems like there is chance for code reuse in both cases.
> > I'm not seeing how it adds to complexity particularly.
>
> Yes, there are some cleanup opportunities for the cpumask/hotplug codes.
>
> We may even abstracts some generic interfaces for pkg or core level PMUs.
>
> Eventually, the complexity/duplication should be able to be reduced.
>
> >
> >> The tool becomes complex either, since it has to take care of the
> >> per-event CPU override case.
> >
> > I'm not sure I agree with this. What we need for perf_event_open is a
> > perf_event_attr, we dress these up as evsels which also have the
> > ability to be for >1 CPU or thread. Longer term I think evsels can
> > also have >1 PMU, for the wildcard cases like uncore memory
> > controllers - this would remove the need for resorting evsels except
> > for topdown events which have thrown us a giant bundle of challenges.
> > Anyway, so an evsel is perf_event_attr information paired with CPU
> > information, it makes sense to me that the parser should do this
> > pairing. What's harder in the evsel/evlist setup is trying to fix CPU
> > maps up not in parsing, like with __perf_evlist__propagate_maps where
> > the parsing is trying to leave crumbs around (like system_wide,
> > has_user_cpus, is_pmu_core) so the map propagation works properly.
> >
> >> The json file must also be updated to add a
> >> new field cpumask.
> >
> > Yeah, I don't like this as it means we end up putting CPU information
> > into the json that isn't the same for every CPU variant of the same
> > architecture model. Maybe we can have some kind of "scope" enum value
> > in the json and then when the scope differs from the PMU's, core scope
> > vs the PMU's hyperthread scope, then in the tool we can figure out the
> > cpumask from the topology in sysfs. Maybe we should just always use
> > the topology and get rid of cpumask files in sysfs, replacing them
> > with "scope" files. Will Deacon pushed back on having ARM PMUs
> > supporting hot plugging
> > (https://lore.kernel.org/lkml/20240701142222.GA2691@willie-the-truck/)
> > where the main thing hot plugging handler needs to maintain is set the
> > cpumask.
>
> Not just the cpumask but also migrate the context for some PMUs, see
> perf_pmu_migrate_context().

Will do, thanks.

> It seems we really need a shared cpumask in the generic code, so the
> drivers don't need to handle the hotplug everywhere. I will think about it.

Thanks. There are other problems on ARM PMUs like having no or empty
cpumasks, which the tool takes to mean opening the event on every online
CPU (there is no cpus or cpumask file on core PMUs historically, so we
adopt this behavior when the cpumask is empty or not present). I've
been thinking to expand `tools/perf/tests/pmu.c` with basic PMU sanity
tests. Some ideas:

1) some kind of cpumask sanity check - we could open an event with the
cpumask and see if it yields multiplexing, which would highlight the
ARM no cpumask bug
2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
files parse correctly and have a corresponding event.
3) keep adding opening events on the PMU to a group to make sure that
when counters are exhausted the perf_event_open fails (I've seen this
bug on AMD)
4) are the values in the type file unique

Thanks,
Ian

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-18 18:02       ` Liang, Kan
@ 2024-07-18 21:06         ` Ian Rogers
  2024-07-19 14:14           ` Liang, Kan
  0 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-18 21:06 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Thu, Jul 18, 2024 at 11:03 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2024-07-18 11:12 a.m., Ian Rogers wrote:
> > On Thu, Jul 18, 2024 at 7:41 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>
> >>
> >>
> >> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> >>> The -C option allows the CPUs for a list of events to be specified but
> >>>>> it's not possible to set the CPU for a single event. Add a term to
> >>> allow this. The term isn't a general CPU list due to ',' already being
> >>> a special character in event parsing instead multiple cpu= terms may
> >>> be provided and they will be merged/unioned together.
> >>>
> >>> An example of mixing different types of events counted on different CPUs:
> >>> ```
> >>> $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
> >>>
> >>>  Performance counter stats for 'system wide':
> >>>
> >>> CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
> >>> CPU4        <not counted>      instructions/cpu=0/
> >>> CPU5        <not counted>      instructions/cpu=0/
> >>> CPU8        <not counted>      instructions/cpu=0/
> >>> CPU0        <not counted>      l1d-misses [cpu]
> >>> CPU4              203,377      l1d-misses [cpu]
> >>> CPU5              138,231      l1d-misses [cpu]
> >>> CPU8        <not counted>      l1d-misses [cpu]
> >>> CPU0        <not counted>      cpu/cpu=8/
> >>> CPU4        <not counted>      cpu/cpu=8/
> >>> CPU5        <not counted>      cpu/cpu=8/
> >>> CPU8              943,861      cpu/cpu=8/
> >>> CPU0            1,412,071      cycles
> >>> CPU4           20,362,900      cycles
> >>> CPU5           10,172,725      cycles
> >>> CPU8            2,406,081      cycles
> >>>
> >>>        0.102925309 seconds time elapsed
> >>> ```
> >>>
> >>> Note, the event name of inst_retired.any is missing, reported as
> >>> cpu/cpu=8/, as there are unmerged uniquify fixes:
> >>> https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
> >>>
> >>> An example of spreading uncore overhead across two CPUs:
> >>> ```
> >>> $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
> >>>
> >>>  Performance counter stats for 'system wide':
> >>>
> >>> CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
> >>> CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
> >>> CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
> >>> CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
> >>> CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
> >>> CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
> >>> ```
> >>>
> >>> Manually fixing the output it should be:
> >>> ```
> >>> CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
> >>> CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
> >>> CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
> >>> CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
> >>> ```
> >>>
> >>> That is data_read from 2 PMUs was counted on CPU0 and data_write was
> >>> counted on CPU1.
> >>
> >> There was an effort to make the counter access from any CPU of the package.
> >> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6a2f9035bfc27d0e9d78b13635dda9fb017ac01
> >>
> >> But now it limits the access from specific CPUs. It sounds like a
> >> regression.
> >
> > Thanks Kan, I'm not sure I understand the comment.
>
> The flag is also applied for the uncore and RAPL.
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/x86/events/intel/uncore.c?&id=e64cd6f73ff5a7eb4f8f759049ee24a3fe55e731
>
> So specifying a CPU to an uncore event doesn't make sense. If the
> current CPU is in the same package as the asked CPU. The kernel will
> always choose the current CPU.

Ugh, that sounds sub-optimal. If I'm monitoring uncore events with
cgroups CPU0 (or the first CPU in a package) is going to be loaded up
with all the events and have all of the rdmsr/wrmsrs in its context
switch. Perhaps we should warn and say to use BPF events.

Is there a way through say ioctls to get the CPU an event is on? That
way we could update the `perf stat -A` to accurately report cpus.
There's also the issue that the affinity stuff is going to be off.

Thanks,
Ian


> Thanks,
> Kan
> > The overhead I was
> > thinking of here is more along the lines of cgroup context switches
> > (although that isn't in my example). There may be a large number of
> > say memory controller events just by having 2 events for each PMU and
> > then there are 10s of PMUs. By putting half of the events on 1 CPU and
> > half on another, the context switch overhead is shared. That said, the
> > counters don't care what cgroup is accessing memory, and users doing
> > this are likely making some kind of error.
> >
> > Thanks,
> > Ian
> >

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-18 20:50         ` Ian Rogers
@ 2024-07-19 13:55           ` Liang, Kan
  2024-07-19 14:59             ` Ian Rogers
  0 siblings, 1 reply; 34+ messages in thread
From: Liang, Kan @ 2024-07-19 13:55 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-18 4:50 p.m., Ian Rogers wrote:
> On Thu, Jul 18, 2024 at 10:48 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>>
>>
>> On 2024-07-18 11:39 a.m., Ian Rogers wrote:
>>> On Thu, Jul 18, 2024 at 7:33 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
>>>>> If an event file exists in sysfs, check if an event.cpus file exists
>>>>> and read a perf_cpu_map from it if it does. This allows particular
>>>>> events to have a different set of CPUs compared to the PMU.
>>>>>
>>>>> One scenario where this could be useful is when a PMU is set up with a
>>>>> cpumask/events per SMT thread but some events record for all SMT
>>>>> threads. Programming an event on each SMT thread will cause
>>>>> unnecessary counters to be programmed and the aggregate value to be
>>>>> too large.
>>>>
>>>> If I understand the scenario correctly, I think the issue should have
>>>> been addressed since ICX, with the introduction of the "SMT-aware
>>>> events". Is there a specific event which still has the issue on newer
>>>> platforms?
>>>
>>> Nothing comes to mind, but it is there on popular machines like Skylake.
>>>
>>>>>
>>>>> Another scenario where this could be useful is when a PMU has
>>>>> historically had a cpumask at the package level, but now newer per
>>>>> die, core or CPU information is available.
>>>>
>>>> The traditional way to support new counters with a different scope is to
>>>> add a new PMU.
>>>>
>>>>>
>>>>> Additional context for the motivation is in these patches and
>>>>> conversation:
>>>>> https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
>>>>
>>>> I don't see the advantages of the new event.cpus way.
>>>> The aggregation should be the same.
>>>
>>> Agreed. My concern is that we may end up with a pattern of
>>> <pmu>_per_pkg, <pmu>_per_core, <pmu>_per_cpu, <pmu>_per_l3, etc. just
>>> for the sake of the cpumask.
>>
>> The cstate PMUs already do so, e.g., cstate_core, cstate_pkg.
>>
>> From another perspective, it discloses the scope information in a PMU
>> name. If a user only cares about the information of a package, they just
>> need to focus on everything under <pmu>_pkg. Otherwise, they have to go
>> through all the events.
>>
>>>
>>>> The RAPL counters are free-running counters. So there is no difference
>>>> whether grouping or not grouping.
>>>
>>> Should the RAPL counters return true for perf_pmu__is_software? In the
>>> tool we currently return false and won't allow grouping, these events
>>> with other hardware events. The intent in perf_pmu__is_software was to
>>> emulate the way the kernel allows/disallows groups - software context
>>> events can be in a hardware context but otherwise I don't believe the
>>> grouping is allowed.
>>
>> No, it's not allowed for the RAPL counters.
>>
>> If the motivation is to find another way to group counters with
>> different scope, it may not work.
>>
>> We once tried to mix the perf_invalid_context PMUs in a group. But it's
>> denied.
>> https://yhbt.net/lore/all/20150415172856.GA5029@twins.programming.kicks-ass.net/
>>
>> The event.cpus still faces the same issues.
> 
> Why so? The events would share the same perf_event_context, I thought
> the perf_event_open would succeed.

I think it breaks what groups are as well. At least, you cannot
guarantee that two events can be co-scheduled on the same CPU. Even
worse, there could be no events on some CPU.
The first thing that pops up is the sample read feature. On CPU_A, the
event_A is the leader event, but on CPU_B, the event_B could be the
leader event if event_A's cpumask doesn't include the CPU_B.
There could be more similar features we have to special handle.

> 
>>>
>>>> But it makes the kernel driver complex, since it has to maintain at
>>>> least two different cpumasks.
>>>
>>> Two drivers each maintaining a cpumask or 1 driver maintaining 2
>>> cpumasks, it seems like there is chance for code reuse in both cases.
>>> I'm not seeing how it adds to complexity particularly.
>>
>> Yes, there are some cleanup opportunities for the cpumask/hotplug codes.
>>
>> We may even abstracts some generic interfaces for pkg or core level PMUs.
>>
>> Eventually, the complexity/duplication should be able to be reduced.
>>
>>>
>>>> The tool becomes complex as well, since it has to take care of the
>>>> per-event CPU override case.
>>>
>>> I'm not sure I agree with this. What we need for perf_event_open is a
>>> perf_event_attr, we dress these up as evsels which also have the
>>> ability to be for >1 CPU or thread. Longer term I think evsels can
>>> also have >1 PMU, for the wildcard cases like uncore memory
>>> controllers - this would remove the need for resorting evsels except
>>> for topdown events which have thrown us a giant bundle of challenges.
>>> Anyway, so an evsel is perf_event_attr information paired with CPU
>>> information, it makes sense to me that the parser should do this
>>> pairing. What's harder in the evsel/evlist setup is trying to fix CPU
>>> maps up not in parsing, like with __perf_evlist__propagate_maps where
>>> the parsing is trying to leave crumbs around (like system_wide,
>>> has_user_cpus, is_pmu_core) so the map propagation works properly.
>>>
>>>> The json file must also be updated to add a
>>>> new field cpumask.
>>>
>>> Yeah, I don't like this as it means we end up putting CPU information
>>> into the json that isn't the same for every CPU variant of the same
>>> architecture model. Maybe we can have some kind of "scope" enum value
>>> in the json and then when the scope differs from the PMU's, core scope
>>> vs the PMU's hyperthread scope, then in the tool we can figure out the
>>> cpumask from the topology in sysfs. Maybe we should just always use
>>> the topology and get rid of cpumask files in sysfs, replacing them
>>> with "scope" files. Will Deacon pushed back on having ARM PMUs
>>> supporting hot plugging
>>> (https://lore.kernel.org/lkml/20240701142222.GA2691@willie-the-truck/)
>>> where the main thing hot plugging handler needs to maintain is set the
>>> cpumask.
>>
>> Not just the cpumask but also migrate the context for some PMUs, see
>> perf_pmu_migrate_context().
> 
> Will do, thanks.
> 
>> It seems we really need a shared cpumask in the generic code, so the
>> drivers don't need to handle the hotplug everywhere. I will think about it.
> 
> Thanks. There are other problems on ARM PMUs like having no or empty
> cpumasks, which the tool take to mean open the event on every online
> CPU (there is no cpus or cpumask file on core PMUs historically, so we
> adopt this behavior when the cpumask is empty or not present). 

A missing cpus/cpumask file and an empty cpumask file should be treated differently.
No cpus/cpumasks file means that the counters/events are available for
all the CPUs.
But if it's an empty cpus/cpumasks file, it means that there are no
proper CPUs. It may happen on a hybrid machine when all e-core CPUs are
hot-removed. Since it's possible that the CPUs can be hot-added later,
the kernel driver doesn't unregister the cpu_atom PMU.

> I've
> been thinking to expand `tools/perf/tests/pmu.c` with basic PMU sanity
> tests. Some ideas:
> 

Thanks.

> 1) some kind of cpumask sanity check - we could open an event with the
> cpumask and see if it yields multiplexing.. which would highlight the
> ARM no cpumask bug

The multiplexing is triggered when there are not enough counters. It
should not be related to the cpumask.

For the PMU without cpumask, I think the test case should try to open an
event on all CPUs and check if the open succeeds.
For the PMU with cpumask, the test case should try to open an event on
the masked CPUs and check if the open succeeds.

The behavior of opening an event on unmasked CPUs seems to be undefined.
For uncore, it's OK. The kernel treats the CPUs from the same socket
exactly the same. But I'm not sure about the other PMUs.

> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
> files parse correctly and have a corresponding event.
> 3) keep adding opening events on the PMU to a group to make sure that
> when counters are exhausted the perf_event_open fails (I've seen this
> bug on AMD)
> 4) are the values in the type file unique
> 

The rest sounds good to me.

Thanks,
Kan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-18 21:06         ` Ian Rogers
@ 2024-07-19 14:14           ` Liang, Kan
  2024-07-19 15:01             ` Ian Rogers
  0 siblings, 1 reply; 34+ messages in thread
From: Liang, Kan @ 2024-07-19 14:14 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-18 5:06 p.m., Ian Rogers wrote:
> On Thu, Jul 18, 2024 at 11:03 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>>
>>
>> On 2024-07-18 11:12 a.m., Ian Rogers wrote:
>>> On Thu, Jul 18, 2024 at 7:41 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
>>>>> The -C option allows the CPUs for a list of events to be specified but
>>>>> its not possible to set the CPU for a single event. Add a term to
>>>>> allow this. The term isn't a general CPU list due to ',' already being
>>>>> a special character in event parsing instead multiple cpu= terms may
>>>>> be provided and they will be merged/unioned together.
>>>>>
>>>>> An example of mixing different types of events counted on different CPUs:
>>>>> ```
>>>>> $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
>>>>>
>>>>>  Performance counter stats for 'system wide':
>>>>>
>>>>> CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
>>>>> CPU4        <not counted>      instructions/cpu=0/
>>>>> CPU5        <not counted>      instructions/cpu=0/
>>>>> CPU8        <not counted>      instructions/cpu=0/
>>>>> CPU0        <not counted>      l1d-misses [cpu]
>>>>> CPU4              203,377      l1d-misses [cpu]
>>>>> CPU5              138,231      l1d-misses [cpu]
>>>>> CPU8        <not counted>      l1d-misses [cpu]
>>>>> CPU0        <not counted>      cpu/cpu=8/
>>>>> CPU4        <not counted>      cpu/cpu=8/
>>>>> CPU5        <not counted>      cpu/cpu=8/
>>>>> CPU8              943,861      cpu/cpu=8/
>>>>> CPU0            1,412,071      cycles
>>>>> CPU4           20,362,900      cycles
>>>>> CPU5           10,172,725      cycles
>>>>> CPU8            2,406,081      cycles
>>>>>
>>>>>        0.102925309 seconds time elapsed
>>>>> ```
>>>>>
>>>>> Note, the event name of inst_retired.any is missing, reported as
>>>>> cpu/cpu=8/, as there are unmerged uniquify fixes:
>>>>> https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
>>>>>
>>>>> An example of spreading uncore overhead across two CPUs:
>>>>> ```
>>>>> $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
>>>>>
>>>>>  Performance counter stats for 'system wide':
>>>>>
>>>>> CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
>>>>> CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
>>>>> CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
>>>>> CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
>>>>> CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
>>>>> CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
>>>>> ```
>>>>>
>>>>> Manually fixing the output it should be:
>>>>> ```
>>>>> CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
>>>>> CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
>>>>> CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
>>>>> CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
>>>>> ```
>>>>>
>>>>> That is data_read from 2 PMUs was counted on CPU0 and data_write was
>>>>> counted on CPU1.
>>>>
>>>> There was an effort to make the counter access from any CPU of the package.
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6a2f9035bfc27d0e9d78b13635dda9fb017ac01
>>>>
>>>> But now it limits the access from specific CPUs. It sounds like a
>>>> regression.
>>>
>>> Thanks Kan, I'm not sure I understand the comment.
>>
>> The flag is also applied for the uncore and RAPL.
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/x86/events/intel/uncore.c?&id=e64cd6f73ff5a7eb4f8f759049ee24a3fe55e731
>>
>> So specifying a CPU to an uncore event doesn't make sense. If the
>> current CPU is in the same package as the asked CPU. The kernel will
>> always choose the current CPU.
> 
> Ugh, that sounds sub-optimal. If I'm monitoring uncore events with
> cgroups CPU0 (or the first CPU in a package) is going to be loaded up
> with all the events and have all of the rdmsr/wrmsrs in its context
> switch. Perhaps we should warn and say to use BPF events.
> 
> Is there a way through say ioctls to get the CPU an event is on? That
> way we could update the `perf stat -A` to accurately report cpus.
> There's also the issue that the affinity stuff is going to be off.
>

I don't think there is such ioctl.

Emphasizing the CPU ID for an uncore event seems misleading. The uncore
only supports per-socket counters, not per-core counters.
Opening/reading a counter from any CPU on a package should be identical.
An accurate report of the `perf stat -A` for an uncore should use "S0".

Thanks,
Kan

> Thanks,
> Ian
> 
> 
>> Thanks,
>> Kan
>>> The overhead I was
>>> thinking of here is more along the lines of cgroup context switches
>>> (although that isn't in my example). There may be a large number of
>>> say memory controller events just by having 2 events for each PMU and
>>> then there are 10s of PMUs. By putting half of the events on 1 CPU and
>>> half on another, the context switch overhead is shared. That said, the
>>> counters don't care what cgroup is accessing memory, and users doing
>>> this are likely making some kind of error.
>>>
>>> Thanks,
>>> Ian
>>>
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-19 13:55           ` Liang, Kan
@ 2024-07-19 14:59             ` Ian Rogers
  2024-07-19 16:35               ` Liang, Kan
  0 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-19 14:59 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Fri, Jul 19, 2024 at 6:56 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2024-07-18 4:50 p.m., Ian Rogers wrote:
> > On Thu, Jul 18, 2024 at 10:48 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>
> >>
> >>
> >> On 2024-07-18 11:39 a.m., Ian Rogers wrote:
> >>> On Thu, Jul 18, 2024 at 7:33 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> >>>>> If an event file exists in sysfs, check if a event.cpus file exists
> >>>>> and read a perf_cpu_map from it if it does. This allows particular
> >>>>> events to have a different set of CPUs compared to the PMU.
> >>>>>
> >>>>> One scenario where this could be useful is when a PMU is set up with a
> >>>>> cpumask/events per SMT thread but some events record for all SMT
> >>>>> threads. Programming an event on each SMT thread will cause
> >>>>> unnecessary counters to be programmed and the aggregate value to be
> >>>>> too large.
> >>>>
> >>>> If I understand the scenario correctly, I think the issue should have
> >>>> been addressed since ICX, with the introduction of  the "SMT-aware
> >>>> events". Is there a specific event which still has the issue on newer
> >>>> platforms?
> >>>
> >>> Nothing comes to mind, but it is there on popular machines like Skylake.
> >>>
> >>>>>
> >>>>> Another scenario where this could be useful if when a PMU has
> >>>>> historically had a cpumask at the package level, but now newer per
> >>>>> die, core or CPU information is available.
> >>>>
> >>>> The traditional way to support new counters with a different scope is to
> >>>> add a new PMU.
> >>>>
> >>>>>
> >>>>> Additional context for the motivation is in these patches and
> >>>>> conversation:
> >>>>> https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
> >>>>
> >>>> I don't see the advantages of the new event.cpus way.
> >>>> The aggregation should be the same.
> >>>
> >>> Agreed. My concern is that we may end up with a pattern of
> >>> <pmu>_per_pkg, <pmu>_per_core, <pmu>_per_cpu, <pmu>_per_l3, etc. just
> >>> for the sake of the cpumask.
> >>
> >> The cstate PMUs already do so, e.g., cstate_core, cstate_pkg.
> >>
> >> From another perspective, it discloses the scope information in a PMU
> >> name. If a user only cares about the information of a package, they just
> >> need to focus on everything under <pmu>_pkg. Otherwise, they have to go
> >> through all the events.
> >>
> >>>
> >>>> The RAPL counters are free-running counters. So there is no difference
> >>>> whether grouping or not grouping.
> >>>
> >>> Should the RAPL counters return true for perf_pmu__is_software? In the
> >>> tool we currently return false and won't allow grouping, these events
> >>> with other hardware events. The intent in perf_pmu__is_software was to
> >>> emulate the way the kernel allows/disallows groups - software context
> >>> events can be in a hardware context but otherwise I don't believe the
> >>> grouping is allowed.
> >>
> >> No, it's not allowed for the RAPL counters.
> >>
> >> If the motivation is to find another way to group counters with
> >> different scope, it may not work.
> >>
> >> We once tried to mix the perf_invalid_context PMUs in a group. But it's
> >> denied.
> >> https://yhbt.net/lore/all/20150415172856.GA5029@twins.programming.kicks-ass.net/
> >>
> >> The event.cpus still faces the same issues.
> >
> > Why so? The events would share the same perf_event_context, I thought
> > the perf_event_open would succeed.
>
> I think it breaks what groups are as well. At least, you cannot
> guarantee that two events can be co-scheduled on the same CPU. Even
> worse, there could be no events on some CPU.
> The first thing that pops up is the sample read feature. On CPU_A, the
> event_A is the leader event, but on CPU_B, the event_B could be the
> leader event if event_A's cpumask doesn't include the CPU_B.
> There could be more similar features we have to special handle.

Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
solely its perf event context, I want to know its core power and
package power as a group so I never record one without the other. That
grouping wouldn't be possible with 2 PMUs.

> >
> >>>
> >>>> But it makes the kernel driver complex, since it has to maintain at
> >>>> least two different cpumasks.
> >>>
> >>> Two drivers each maintaining a cpumask or 1 driver maintaining 2
> >>> cpumasks, it seems like there is chance for code reuse in both cases.
> >>> I'm not seeing how it adds to complexity particularly.
> >>
> >> Yes, there are some cleanup opportunities for the cpumask/hotplug codes.
> >>
> >> We may even abstracts some generic interfaces for pkg or core level PMUs.
> >>
> >> Eventually, the complexity/duplication should be able to be reduced.
> >>
> >>>
> >>>> The tool becomes complex either, since it has to take care of the
> >>>> per-event CPU override case.
> >>>
> >>> I'm not sure I agree with this. What we need for perf_event_open is a
> >>> perf_event_attr, we dress these up as evsels which also have the
> >>> ability to be for >1 CPU or thread. Longer term I think evsels can
> >>> also have >1 PMU, for the wildcard cases like uncore memory
> >>> controllers - this would remove the need for resorting evsels except
> >>> for topdown events which have thrown us a giant bundle of challenges.
> >>> Anyway, so an evsel is perf_event_attr information paired with CPU
> >>> information, it makes sense to me that the parser should do this
> >>> pairing. What's harder in the evsel/evlist setup is trying to fix CPU
> >>> maps up not in parsing, like with __perf_evlist__propagate_maps where
> >>> the parsing is trying to leave crumbs around (like system_wide,
> >>> has_user_cpus, is_pmu_core) so the map propagation works properly.
> >>>
> >>>> The json file must also be updated to add a
> >>>> new field cpumask.
> >>>
> >>> Yeah, I don't like this as it means we end up putting CPU information
> >>> into the json that isn't the same for every CPU variant of the same
> >>> architecture model. Maybe we can have some kind of "scope" enum value
> >>> in the json and then when the scope differs from the PMU's, core scope
> >>> vs the PMU's hyperthread scope, then in the tool we can figure out the
> >>> cpumask from the topology in sysfs. Maybe we should just always use
> >>> the topology and get rid of cpumask files in sysfs, replacing them
> >>> with "scope" files. Will Deacon pushed back on having ARM PMUs
> >>> supporting hot plugging
> >>> (https://lore.kernel.org/lkml/20240701142222.GA2691@willie-the-truck/)
> >>> where the main thing hot plugging handler needs to maintain is set the
> >>> cpumask.
> >>
> >> Not just the cpumask but also migrate the context for some PMUs, see
> >> perf_pmu_migrate_context().
> >
> > Will do, thanks.
> >
> >> It seems we really need a shared cpumask in the generic code, so the
> >> drivers don't need to handle the hotplug everywhere. I will think about it.
> >
> > Thanks. There are other problems on ARM PMUs like having no or empty
> > cpumasks, which the tool take to mean open the event on every online
> > CPU (there is no cpus or cpumask file on core PMUs historically, so we
> > adopt this behavior when the cpumask is empty or not present).
>
> The no cpus/cpumasks and empty cpumasks should be different.
> No cpus/cpumasks file means that the counters/events are available for
> all the CPUs.
> But if it's an empty cpus/cpumasks file, it means that there are no
> proper CPUs. It may happen on a hybrid machine when all e-core CPUs are
> hot-removed. Since it's possible that the CPUs can be hot-added later,
> the kernel driver doesn't unregister the cpu_atom PMU.
>
> > I've
> > been thinking to expand `tools/perf/tests/pmu.c` with basic PMU sanity
> > tests. Some ideas:
> >
>
> Thanks.
>
> > 1) some kind of cpumask sanity check - we could open an event with the
> > cpumask and see if it yields multiplexing.. which would highlight the
> > ARM no cpumask bug
>
> The multiplexing is triggered when there are not enough counters. It
> should not related to the cpumask.

Agreed. Here I'm thinking about the bugs I see in PMUs. One of them is
to always succeed in opening siblings within a group and for the
unavailable counters to just report "not counted". This breaks weak
groups used by metrics.

> For the PMU without cpumask, I think the test case should try to open an
> event on all CPUs and check if the open succeeds.
> For the PMU with cpumask, the test case should try to open an event on
> the masked CPUs and check if the open succeeds.

I agree without cpumask means all CPUs, the bug I see on ARM PMUs is
that they have uncore PMUs with no or empty cpumasks leading to the
uncore events being programmed on every CPU and over counting,
multiplexing counters and so on. I'm trying to devise tests so that
they can see they are broken.

> The behavior of opening an event on unmasked CPUs seems not defined.
> For uncore, it's OK. The kernel treats the CPUs from the same socket
> exactly the same. But I'm not sure about the other PMUs.

My understanding had been that for core PMUs a "perf stat -C" option
would choose the particular CPU to count the event on, for an uncore
PMU the -C option would override the cpumask's "default" value. We
have code to validate this:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
But it seems now that overriding an uncore PMU's default CPU is
ignored. If you did:
$ perf stat -C 1 -e data_read -a sleep 0.1
Then the tool thinks data_read is on CPU1 and will set its thread
affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
throw away the -C option.

> > 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
> > files parse correctly and have a corresponding event.
> > 3) keep adding opening events on the PMU to a group to make sure that
> > when counters are exhausted the perf_event_open fails (I've seen this
> > bug on AMD)
> > 4) are the values in the type file unique
> >
>
> The rest sounds good to me.

Cool. Let me know if you can think of more.

Thanks,
Ian

> Thanks,
> Kan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-19 14:14           ` Liang, Kan
@ 2024-07-19 15:01             ` Ian Rogers
  2024-07-19 16:42               ` Liang, Kan
  0 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-19 15:01 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Fri, Jul 19, 2024 at 7:14 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2024-07-18 5:06 p.m., Ian Rogers wrote:
> > On Thu, Jul 18, 2024 at 11:03 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>
> >>
> >>
> >> On 2024-07-18 11:12 a.m., Ian Rogers wrote:
> >>> On Thu, Jul 18, 2024 at 7:41 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> >>>>> The -C option allows the CPUs for a list of events to be specified but
> >>>>> it's not possible to set the CPU for a single event. Add a term to
> >>>>> allow this. The term isn't a general CPU list due to ',' already being
> >>>>> a special character in event parsing instead multiple cpu= terms may
> >>>>> be provided and they will be merged/unioned together.
> >>>>>
> >>>>> An example of mixing different types of events counted on different CPUs:
> >>>>> ```
> >>>>> $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
> >>>>>
> >>>>>  Performance counter stats for 'system wide':
> >>>>>
> >>>>> CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
> >>>>> CPU4        <not counted>      instructions/cpu=0/
> >>>>> CPU5        <not counted>      instructions/cpu=0/
> >>>>> CPU8        <not counted>      instructions/cpu=0/
> >>>>> CPU0        <not counted>      l1d-misses [cpu]
> >>>>> CPU4              203,377      l1d-misses [cpu]
> >>>>> CPU5              138,231      l1d-misses [cpu]
> >>>>> CPU8        <not counted>      l1d-misses [cpu]
> >>>>> CPU0        <not counted>      cpu/cpu=8/
> >>>>> CPU4        <not counted>      cpu/cpu=8/
> >>>>> CPU5        <not counted>      cpu/cpu=8/
> >>>>> CPU8              943,861      cpu/cpu=8/
> >>>>> CPU0            1,412,071      cycles
> >>>>> CPU4           20,362,900      cycles
> >>>>> CPU5           10,172,725      cycles
> >>>>> CPU8            2,406,081      cycles
> >>>>>
> >>>>>        0.102925309 seconds time elapsed
> >>>>> ```
> >>>>>
> >>>>> Note, the event name of inst_retired.any is missing, reported as
> >>>>> cpu/cpu=8/, as there are unmerged uniquify fixes:
> >>>>> https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
> >>>>>
> >>>>> An example of spreading uncore overhead across two CPUs:
> >>>>> ```
> >>>>> $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
> >>>>>
> >>>>>  Performance counter stats for 'system wide':
> >>>>>
> >>>>> CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
> >>>>> CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
> >>>>> CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
> >>>>> CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
> >>>>> CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
> >>>>> CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
> >>>>> ```
> >>>>>
> >>>>> Manually fixing the output it should be:
> >>>>> ```
> >>>>> CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
> >>>>> CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
> >>>>> CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
> >>>>> CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
> >>>>> ```
> >>>>>
> >>>>> That is data_read from 2 PMUs was counted on CPU0 and data_write was
> >>>>> counted on CPU1.
> >>>>
> >>>> There was an effort to make the counter access from any CPU of the package.
> >>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6a2f9035bfc27d0e9d78b13635dda9fb017ac01
> >>>>
> >>>> But now it limits the access from specific CPUs. It sounds like a
> >>>> regression.
> >>>
> >>> Thanks Kan, I'm not sure I understand the comment.
> >>
> >> The flag is also applied for the uncore and RAPL.
> >> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/x86/events/intel/uncore.c?&id=e64cd6f73ff5a7eb4f8f759049ee24a3fe55e731
> >>
> >> So specifying a CPU to an uncore event doesn't make sense. If the
> >> current CPU is in the same package as the asked CPU. The kernel will
> >> always choose the current CPU.
> >
> > Ugh, that sounds sub-optimal. If I'm monitoring uncore events with
> > cgroups CPU0 (or the first CPU in a package) is going to be loaded up
> > with all the events and have all of the rdmsr/wrmsrs in its context
> > switch. Perhaps we should warn and say to use BPF events.
> >
> > Is there a way through say ioctls to get the CPU an event is on? That
> > way we could update the `perf stat -A` to accurately report cpus.
> > There's also the issue that the affinity stuff is going to be off.
> >
>
> I don't think there is such ioctl.
>
> Emphasizing the CPU ID for an uncore event seems misleading. The uncore
> only supports per-socket counter, not per-core counter.
> Opening/reading a counter from any CPU on a package should be identical.
> An accurate report of the `perf stat -A` for an uncore should use "S0".

I think it is an "elite" level trick to try to do things like balance
context switch overhead. Putting everything on the first CPU feels
like a scalability issue for cgroup events. BPF events work around it
to some degree, but they are still going to put all the work on CPU0
which could cause performance issues, latency spikes, etc. on it.

Thanks,
Ian

> Thanks,
> Kan
>
> > Thanks,
> > Ian
> >
> >
> >> Thanks,
> >> Kan
> >>> The overhead I was
> >>> thinking of here is more along the lines of cgroup context switches
> >>> (although that isn't in my example). There may be a large number of
> >>> say memory controller events just by having 2 events for each PMU and
> >>> then there are 10s of PMUs. By putting half of the events on 1 CPU and
> >>> half on another, the context switch overhead is shared. That said, the
> >>> counters don't care what cgroup is accessing memory, and users doing
> >>> this are likely making some kind of error.
> >>>
> >>> Thanks,
> >>> Ian
> >>>
> >

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-19 14:59             ` Ian Rogers
@ 2024-07-19 16:35               ` Liang, Kan
  2024-07-19 22:02                 ` Ian Rogers
  0 siblings, 1 reply; 34+ messages in thread
From: Liang, Kan @ 2024-07-19 16:35 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-19 10:59 a.m., Ian Rogers wrote:
> On Fri, Jul 19, 2024 at 6:56 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>>
>>
>> On 2024-07-18 4:50 p.m., Ian Rogers wrote:
>>> On Thu, Jul 18, 2024 at 10:48 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2024-07-18 11:39 a.m., Ian Rogers wrote:
>>>>> On Thu, Jul 18, 2024 at 7:33 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> >>>>>>> If an event file exists in sysfs, check if an event.cpus file exists
>>>>>>> and read a perf_cpu_map from it if it does. This allows particular
>>>>>>> events to have a different set of CPUs compared to the PMU.
>>>>>>>
>>>>>>> One scenario where this could be useful is when a PMU is set up with a
>>>>>>> cpumask/events per SMT thread but some events record for all SMT
>>>>>>> threads. Programming an event on each SMT thread will cause
>>>>>>> unnecessary counters to be programmed and the aggregate value to be
>>>>>>> too large.
>>>>>>
>>>>>> If I understand the scenario correctly, I think the issue should have
>>>>>> been addressed since ICX, with the introduction of  the "SMT-aware
>>>>>> events". Is there a specific event which still has the issue on newer
>>>>>> platforms?
>>>>>
>>>>> Nothing comes to mind, but it is there on popular machines like Skylake.
>>>>>
>>>>>>>
>>>>>>> Another scenario where this could be useful if when a PMU has
>>>>>>> historically had a cpumask at the package level, but now newer per
>>>>>>> die, core or CPU information is available.
>>>>>>
>>>>>> The traditional way to support new counters with a different scope is to
>>>>>> add a new PMU.
>>>>>>
>>>>>>>
>>>>>>> Additional context for the motivation is in these patches and
>>>>>>> conversation:
>>>>>>> https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
>>>>>>
>>>>>> I don't see the advantages of the new event.cpus way.
>>>>>> The aggregation should be the same.
>>>>>
>>>>> Agreed. My concern is that we may end up with a pattern of
>>>>> <pmu>_per_pkg, <pmu>_per_core, <pmu>_per_cpu, <pmu>_per_l3, etc. just
>>>>> for the sake of the cpumask.
>>>>
>>>> The cstate PMUs already do so, e.g., cstate_core, cstate_pkg.
>>>>
>>>> From another perspective, it discloses the scope information in a PMU
>>>> name. If a user only cares about the information of a package, they just
>>>> need to focus on everything under <pmu>_pkg. Otherwise, they have to go
>>>> through all the events.
>>>>
>>>>>
>>>>>> The RAPL counters are free-running counters. So there is no difference
>>>>>> whether grouping or not grouping.
>>>>>
>>>>> Should the RAPL counters return true for perf_pmu__is_software? In the
>>>>> tool we currently return false and won't allow grouping, these events
>>>>> with other hardware events. The intent in perf_pmu__is_software was to
>>>>> emulate the way the kernel allows/disallows groups - software context
>>>>> events can be in a hardware context but otherwise I don't believe the
>>>>> grouping is allowed.
>>>>
>>>> No, it's not allowed for the RAPL counters.
>>>>
>>>> If the motivation is to find another way to group counters with
>>>> different scope, it may not work.
>>>>
>>>> We once tried to mix the perf_invalid_context PMUs in a group. But it's
>>>> denied.
>>>> https://yhbt.net/lore/all/20150415172856.GA5029@twins.programming.kicks-ass.net/
>>>>
>>>> The event.cpus still faces the same issues.
>>>
>>> Why so? The events would share the same perf_event_context, I thought
>>> the perf_event_open would succeed.
>>
>> I think it breaks what groups are as well. At least, you cannot
>> guarantee that two events can be co-scheduled on the same CPU. Even
>> worse, there could be no events on some CPU.
>> The first thing that pops up is the sample read feature. On CPU_A, the
>> event_A is the leader event, but on CPU_B, the event_B could be the
>> leader event if event_A's cpumask doesn't include the CPU_B.
>> There could be more similar features we have to special handle.
> 
> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
> solely its perf event context, I want to know its core power and
> package power as a group so I never record one without the other. That
> grouping wouldn't be possible with 2 PMUs.


For power, to be honest, I don't think it improves anything. It gives
users a false image that perf can group these counters.
But the truth is that perf cannot. The power counters are all
free-running counters. It's impossible to co-schedule them (which
requires a global mechanism to disable/enable all counters, e.g.,
GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
by one while the counters keep running. There are no differences with or
without a group for the power events.

> 
>>>
>>>>>
>>>>>> But it makes the kernel driver complex, since it has to maintain at
>>>>>> least two different cpumasks.
>>>>>
>>>>> Two drivers each maintaining a cpumask or 1 driver maintaining 2
>>>>> cpumasks, it seems like there is chance for code reuse in both cases.
>>>>> I'm not seeing how it adds to complexity particularly.
>>>>
>>>> Yes, there are some cleanup opportunities for the cpumask/hotplug codes.
>>>>
>>>> We may even abstracts some generic interfaces for pkg or core level PMUs.
>>>>
>>>> Eventually, the complexity/duplication should be able to be reduced.
>>>>
>>>>>
>>>>>> The tool becomes complex either, since it has to take care of the
>>>>>> per-event CPU override case.
>>>>>
>>>>> I'm not sure I agree with this. What we need for perf_event_open is a
>>>>> perf_event_attr, we dress these up as evsels which also have the
>>>>> ability to be for >1 CPU or thread. Longer term I think evsels can
>>>>> also have >1 PMU, for the wildcard cases like uncore memory
>>>>> controllers - this would remove the need for resorting evsels except
>>>>> for topdown events which have thrown us a giant bundle of challenges.
>>>>> Anyway, so an evsel is perf_event_attr information paired with CPU
>>>>> information, it makes sense to me that the parser should do this
>>>>> pairing. What's harder in the evsel/evlist setup is trying to fix CPU
>>>>> maps up not in parsing, like with __perf_evlist__propagate_maps where
>>>>> the parsing is trying to leave crumbs around (like system_wide,
>>>>> has_user_cpus, is_pmu_core) so the map propagation works properly.
>>>>>
>>>>>> The json file must also be updated to add a
>>>>>> new field cpumask.
>>>>>
>>>>> Yeah, I don't like this as it means we end up putting CPU information
>>>>> into the json that isn't the same for every CPU variant of the same
>>>>> architecture model. Maybe we can have some kind of "scope" enum value
>>>>> in the json and then when the scope differs from the PMU's, core scope
>>>>> vs the PMU's hyperthread scope, then in the tool we can figure out the
>>>>> cpumask from the topology in sysfs. Maybe we should just always use
>>>>> the topology and get rid of cpumask files in sysfs, replacing them
>>>>> with "scope" files. Will Deacon pushed back on having ARM PMUs
>>>>> supporting hot plugging
>>>>> (https://lore.kernel.org/lkml/20240701142222.GA2691@willie-the-truck/)
>>>>> where the main thing hot plugging handler needs to maintain is set the
>>>>> cpumask.
>>>>
>>>> Not just the cpumask but also migrate the context for some PMUs, see
>>>> perf_pmu_migrate_context().
>>>
>>> Will do, thanks.
>>>
>>>> It seems we really need a shared cpumask in the generic code, so the
>>>> drivers don't need to handle the hotplug everywhere. I will think about it.
>>>
>>> Thanks. There are other problems on ARM PMUs like having no or empty
>>> cpumasks, which the tool take to mean open the event on every online
>>> CPU (there is no cpus or cpumask file on core PMUs historically, so we
>>> adopt this behavior when the cpumask is empty or not present).
>>
>> The no cpus/cpumasks and empty cpumasks should be different.
>> No cpus/cpumasks file means that the counters/events are available for
>> all the CPUs.
>> But if it's an empty cpus/cpumasks file, it means that there are no
>> proper CPUs. It may happen on a hybrid machine when all e-core CPUs are
>> hot-removed. Since it's possible that the CPUs can be hot-added later,
>> the kernel driver doesn't unregister the cpu_atom PMU.
>>
>>> I've
>>> been thinking to expand `tools/perf/tests/pmu.c` with basic PMU sanity
>>> tests. Some ideas:
>>>
>>
>> Thanks.
>>
>>> 1) some kind of cpumask sanity check - we could open an event with the
>>> cpumask and see if it yields multiplexing.. which would highlight the
>>> ARM no cpumask bug
>>
>> The multiplexing is triggered when there are not enough counters. It
>> should not related to the cpumask.
> 
> Agreed. Here I'm thinking about the bugs I see in PMUs. One of them is
> to always succeed in opening siblings within a group and for the
> unavailable counters to just report "not counted". This breaks weak
> groups used by metrics.
> 
>> For the PMU without cpumask, I think the test case should try to open an
>> event on all CPUs and check if the open succeeds.
>> For the PMU with cpumask, the test case should try to open an event on
>> the masked CPUs and check if the open succeeds.
> 
> I agree without cpumask means all CPUs, the bug I see on ARM PMUs is
> that they have uncore PMUs with no or empty cpumasks leading to the
> uncore events being programmed on every CPU and over counting,
> multiplexing counters and so on. I'm trying to devise tests so that
> they can see they are broken.
> 
>> The behavior of opening an event on unmasked CPUs seems not defined.
>> For uncore, it's OK. The kernel treats the CPUs from the same socket
>> exactly the same. But I'm not sure about the other PMUs.
> 
> My understanding had been that for core PMUs a "perf stat -C" option
> would choose the particular CPU to count the event on, for an uncore
> PMU the -C option would override the cpumask's "default" value. We
> have code to validate this:
> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
> But it seems now that overriding an uncore PMU's default CPU is
> ignored. 

For the uncore driver, no matter what -C set, it writes the default CPU
back.
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760

> If you did:
> $ perf stat -C 1 -e data_read -a sleep 0.1
> Then the tool thinks data_read is on CPU1 and will set its thread
> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
> throw away the -C option.
The perf tool can still read the counter from CPU1 with no IPIs
because of the PMU_EV_CAP_READ_ACTIVE_PKG().

Not quite sure, but it seems only the open and close may be impacted and
silently changed to CPU0.

Thanks,
Kan
> 
>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>>> files parse correctly and have a corresponding event.
>>> 3) keep adding opening events on the PMU to a group to make sure that
>>> when counters are exhausted the perf_event_open fails (I've seen this
>>> bug on AMD)
>>> 4) are the values in the type file unique
>>>
>>
>> The rest sounds good to me.
> 
> Cool. Let me know if you can think of more.
> 
> Thanks,
> Ian
> 
>> Thanks,
>> Kan
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on
  2024-07-19 15:01             ` Ian Rogers
@ 2024-07-19 16:42               ` Liang, Kan
  0 siblings, 0 replies; 34+ messages in thread
From: Liang, Kan @ 2024-07-19 16:42 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-19 11:01 a.m., Ian Rogers wrote:
> On Fri, Jul 19, 2024 at 7:14 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>>
>>
>> On 2024-07-18 5:06 p.m., Ian Rogers wrote:
>>> On Thu, Jul 18, 2024 at 11:03 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2024-07-18 11:12 a.m., Ian Rogers wrote:
>>>>> On Thu, Jul 18, 2024 at 7:41 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2024-07-17 8:30 p.m., Ian Rogers wrote:
>>>>>>> The -C option allows the CPUs for a list of events to be specified but
>>>>>>> it's not possible to set the CPU for a single event. Add a term to
>>>>>>> allow this. The term isn't a general CPU list due to ',' already being
>>>>>>> a special character in event parsing instead multiple cpu= terms may
>>>>>>> be provided and they will be merged/unioned together.
>>>>>>>
>>>>>>> An example of mixing different types of events counted on different CPUs:
>>>>>>> ```
>>>>>>> $ perf stat -A -C 0,4-5,8 -e "instructions/cpu=0/,l1d-misses/cpu=4,cpu=5/,inst_retired.any/cpu=8/,cycles" -a sleep 0.1
>>>>>>>
>>>>>>>  Performance counter stats for 'system wide':
>>>>>>>
>>>>>>> CPU0              368,647      instructions/cpu=0/              #    0.26  insn per cycle
>>>>>>> CPU4        <not counted>      instructions/cpu=0/
>>>>>>> CPU5        <not counted>      instructions/cpu=0/
>>>>>>> CPU8        <not counted>      instructions/cpu=0/
>>>>>>> CPU0        <not counted>      l1d-misses [cpu]
>>>>>>> CPU4              203,377      l1d-misses [cpu]
>>>>>>> CPU5              138,231      l1d-misses [cpu]
>>>>>>> CPU8        <not counted>      l1d-misses [cpu]
>>>>>>> CPU0        <not counted>      cpu/cpu=8/
>>>>>>> CPU4        <not counted>      cpu/cpu=8/
>>>>>>> CPU5        <not counted>      cpu/cpu=8/
>>>>>>> CPU8              943,861      cpu/cpu=8/
>>>>>>> CPU0            1,412,071      cycles
>>>>>>> CPU4           20,362,900      cycles
>>>>>>> CPU5           10,172,725      cycles
>>>>>>> CPU8            2,406,081      cycles
>>>>>>>
>>>>>>>        0.102925309 seconds time elapsed
>>>>>>> ```
>>>>>>>
>>>>>>> Note, the event name of inst_retired.any is missing, reported as
>>>>>>> cpu/cpu=8/, as there are unmerged uniquify fixes:
>>>>>>> https://lore.kernel.org/lkml/20240510053705.2462258-3-irogers@google.com/
>>>>>>>
>>>>>>> An example of spreading uncore overhead across two CPUs:
>>>>>>> ```
>>>>>>> $ perf stat -A -e "data_read/cpu=0/,data_write/cpu=1/" -a sleep 0.1
>>>>>>>
>>>>>>>  Performance counter stats for 'system wide':
>>>>>>>
>>>>>>> CPU0               223.65 MiB  uncore_imc_free_running_0/cpu=0/
>>>>>>> CPU0               223.66 MiB  uncore_imc_free_running_1/cpu=0/
>>>>>>> CPU0        <not counted> MiB  uncore_imc_free_running_0/cpu=1/
>>>>>>> CPU1                 5.78 MiB  uncore_imc_free_running_0/cpu=1/
>>>>>>> CPU0        <not counted> MiB  uncore_imc_free_running_1/cpu=1/
>>>>>>> CPU1                 5.74 MiB  uncore_imc_free_running_1/cpu=1/
>>>>>>> ```
>>>>>>>
>>>>>>> Manually fixing the output it should be:
>>>>>>> ```
>>>>>>> CPU0               223.65 MiB  uncore_imc_free_running_0/data_read,cpu=0/
>>>>>>> CPU0               223.66 MiB  uncore_imc_free_running_1/data_read,cpu=0/
>>>>>>> CPU1                 5.78 MiB  uncore_imc_free_running_0/data_write,cpu=1/
>>>>>>> CPU1                 5.74 MiB  uncore_imc_free_running_1/data_write,cpu=1/
>>>>>>> ```
>>>>>>>
>>>>>>> That is data_read from 2 PMUs was counted on CPU0 and data_write was
>>>>>>> counted on CPU1.
>>>>>>
>>>>>> There was an effort to make the counter access from any CPU of the package.
>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6a2f9035bfc27d0e9d78b13635dda9fb017ac01
>>>>>>
>>>>>> But now it limits the access from specific CPUs. It sounds like a
>>>>>> regression.
>>>>>
>>>>> Thanks Kan, I'm not sure I understand the comment.
>>>>
>>>> The flag is also applied for the uncore and RAPL.
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/x86/events/intel/uncore.c?&id=e64cd6f73ff5a7eb4f8f759049ee24a3fe55e731
>>>>
>>>> So specifying a CPU to an uncore event doesn't make sense. If the
>>>> current CPU is in the same package as the asked CPU. The kernel will
>>>> always choose the current CPU.
>>>
>>> Ugh, that sounds sub-optimal. If I'm monitoring uncore events with
>>> cgroups CPU0 (or the first CPU in a package) is going to be loaded up
>>> with all the events and have all of the rdmsr/wrmsrs in its context
>>> switch. Perhaps we should warn and say to use BPF events.
>>>
>>> Is there a way through say ioctls to get the CPU an event is on? That
>>> way we could update the `perf stat -A` to accurately report cpus.
>>> There's also the issue that the affinity stuff is going to be off.
>>>
>>
>> I don't think there is such ioctl.
>>
>> Emphasizing the CPU ID for an uncore event seems misleading. The uncore
>> only supports per-socket counter, not per-core counter.
>> Opening/reading a counter from any CPU on a package should be identical.
>> An accurate report of the `perf stat -A` for an uncore should use "S0".
> 
> I think it is an "elite" level trick to try to do things like balance
> context switch overhead. Putting everything on the first CPU feels
> like a scalability issue for cgroup events. BPF events work around it
> to some degree, but they are still going to put all the work on CPU0
> which could cause performance issues, latency spikes, etc. on it.

Yes, that's why the PERF_EV_CAP_READ_ACTIVE_PKG was introduced. So the
read would not be on the same CPU0.
But I'm not sure how bad it is if perf tool open/close all the uncore
events on the same default CPU0.

Thanks,
Kan
> 
> Thanks,
> Ian
> 
>> Thanks,
>> Kan
>>
>>> Thanks,
>>> Ian
>>>
>>>
>>>> Thanks,
>>>> Kan
>>>>> The overhead I was
>>>>> thinking of here is more along the lines of cgroup context switches
>>>>> (although that isn't in my example). There may be a large number of
>>>>> say memory controller events just by having 2 events for each PMU and
>>>>> then there are 10s of PMUs. By putting half of the events on 1 CPU and
>>>>> half on another, the context switch overhead is shared. That said, the
>>>>> counters don't care what cgroup is accessing memory, and users doing
>>>>> this are likely making some kind of error.
>>>>>
>>>>> Thanks,
>>>>> Ian
>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-19 16:35               ` Liang, Kan
@ 2024-07-19 22:02                 ` Ian Rogers
  2024-07-22 13:57                   ` Liang, Kan
  2024-07-26  7:06                   ` Dhananjay Ugwekar
  0 siblings, 2 replies; 34+ messages in thread
From: Ian Rogers @ 2024-07-19 22:02 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
> > Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
> > solely its perf event context, I want to know its core power and
> > package power as a group so I never record one without the other. That
> > grouping wouldn't be possible with 2 PMUs.
>
> For power, to be honest, I don't think it improves anything. It gives
> users a false image that perf can group these counters.
> But the truth is that perf cannot. The power counters are all
> free-running counters. It's impossible to co-schedule them (which
> requires a global mechanism to disable/enable all counters, e.g.,
> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
> by one while the counters keep running. There are no differences with or
> without a group for the power events.

Ok, so power should copy cstate with _core, _pkg, etc. I agree the
difference is small and I like the idea of being consistent. Do we
want to add "event.cpus" support to the tool anyway for potential
future uses? This would at least avoid problems with newer kernels and
older perf tools were we to find a good use for it.

> > My understanding had been that for core PMUs a "perf stat -C" option
> > would choose the particular CPU to count the event on, for an uncore
> > PMU the -C option would override the cpumask's "default" value. We
> > have code to validate this:
> > https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
> > But it seems now that overriding an uncore PMU's default CPU is
> > ignored.
>
> For the uncore driver, no matter what -C set, it writes the default CPU
> back.
> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
>
> > If you did:
> > $ perf stat -C 1 -e data_read -a sleep 0.1
> > Then the tool thinks data_read is on CPU1 and will set its thread
> > affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
> > throw away the -C option.
> The perf tool can still read the counter from CPU1 with no IPIs
> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
>
> Not quite sure, but it seems only the open and close may be impacted and
> silently changed to CPU0.

There's also enable/disable. Andi did the work and there were some
notable gains but likely more on core events. Ultimately it'd be nice
to be opening, closing.. everything in parallel given the calls are
slow and the work is embarrassingly parallel.
It feels like the cpumasks for uncore could still do with some cleanup
wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
tempted to rewrite evlist propagate maps as someone may look at it and
think I believe in what it is doing. The parallel stuff we should grab
Riccardo's past work.

Thanks,
Ian


> Thanks,
> Kan
> >
> >>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
> >>> files parse correctly and have a corresponding event.
> >>> 3) keep adding opening events on the PMU to a group to make sure that
> >>> when counters are exhausted the perf_event_open fails (I've seen this
> >>> bug on AMD)
> >>> 4) are the values in the type file unique
> >>>
> >>
> >> The rest sounds good to me.
> >
> > Cool. Let me know if you can think of more.
> >
> > Thanks,
> > Ian
> >
> >> Thanks,
> >> Kan
> >

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-19 22:02                 ` Ian Rogers
@ 2024-07-22 13:57                   ` Liang, Kan
  2024-07-22 15:43                     ` Ian Rogers
  2024-07-26  7:06                   ` Dhananjay Ugwekar
  1 sibling, 1 reply; 34+ messages in thread
From: Liang, Kan @ 2024-07-22 13:57 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-19 6:02 p.m., Ian Rogers wrote:
> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
>>> solely its perf event context, I want to know its core power and
>>> package power as a group so I never record one without the other. That
>>> grouping wouldn't be possible with 2 PMUs.
>>
>> For power, to be honest, I don't think it improves anything. It gives
>> users a false image that perf can group these counters.
>> But the truth is that perf cannot. The power counters are all
>> free-running counters. It's impossible to co-schedule them (which
>> requires a global mechanism to disable/enable all counters, e.g.,
>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
>> by one while the counters keep running. There are no differences with or
>> without a group for the power events.
> 
> Ok, so power should copy cstate with _core, _pkg, etc. I agree the
> difference is small and I like the idea of being consistent. Do we
> want to add "event.cpus" support to the tool anyway for potential
> future uses? 

The only thing I can imagine is that it may be used to disclose the
event constraint information, or even more to update/override the event
constraint information (which requires a kernel update). But what I'm
worried about is that it may be abused. It's very easy to confuse an
event and a counter in a PMU.

> This would at least avoid problems with newer kernels and
> older perf tools were we to find a good use for it.
> 
>>> My understanding had been that for core PMUs a "perf stat -C" option
>>> would choose the particular CPU to count the event on, for an uncore
>>> PMU the -C option would override the cpumask's "default" value. We
>>> have code to validate this:
>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
>>> But it seems now that overriding an uncore PMU's default CPU is
>>> ignored.
>>
>> For the uncore driver, no matter what -C set, it writes the default CPU
>> back.
>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
>>
>>> If you did:
>>> $ perf stat -C 1 -e data_read -a sleep 0.1
>>> Then the tool thinks data_read is on CPU1 and will set its thread
>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
>>> throw away the -C option.
> >> The perf tool can still read the counter from CPU1 and no IPIs
>> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
>>
>> Not quite sure, but it seems only the open and close may be impacted and
>> silently changed to CPU0.
> 
> There's also enable/disable. Andi did the work and there were some
> notable gains but likely more on core events. Ultimately it'd be nice
> to be opening, closing.. everything in parallel given the calls are
> slow and the work is embarrassingly parallel.
> It feels like the cpumasks for uncore could still do with some cleanup
> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
> tempted to rewrite evlist propagate maps as someone may look at it and
> think I believe in what it is doing. The parallel stuff we should grab
> Riccardo's past work.

For core PMU, the parallel may not work, because the core PMU is usually
MSR based. Perf has to access the MSRs on the specific CPU. IPIs may be
triggered if the users try to operate them from the other CPUs.

But the parallel is good for the counters in the MMIO space. The
counters can be accessed from any CPU. There are more and more counters
which are moved to the MMIO space, e.g., new uncore PMUs, IOMMU PMU,
TMPI (for power), etc.

Thanks,
Kan
> 
> Thanks,
> Ian
> 
> 
>> Thanks,
>> Kan
>>>
>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>>>>> files parse correctly and have a corresponding event.
>>>>> 3) keep adding opening events on the PMU to a group to make sure that
>>>>> when counters are exhausted the perf_event_open fails (I've seen this
>>>>> bug on AMD)
>>>>> 4) are the values in the type file unique
>>>>>
>>>>
>>>> The rest sounds good to me.
>>>
>>> Cool. Let me know if you can think of more.
>>>
>>> Thanks,
>>> Ian
>>>
>>>> Thanks,
>>>> Kan
>>>
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-22 13:57                   ` Liang, Kan
@ 2024-07-22 15:43                     ` Ian Rogers
  2024-07-22 16:45                       ` Liang, Kan
  0 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-22 15:43 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das

On Mon, Jul 22, 2024 at 6:57 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2024-07-19 6:02 p.m., Ian Rogers wrote:
> > On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
> >>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
> >>> solely its perf event context, I want to know its core power and
> >>> package power as a group so I never record one without the other. That
> >>> grouping wouldn't be possible with 2 PMUs.
> >>
> >> For power, to be honest, I don't think it improves anything. It gives
> >> users a false image that perf can group these counters.
> >> But the truth is that perf cannot. The power counters are all
> >> free-running counters. It's impossible to co-schedule them (which
> >> requires a global mechanism to disable/enable all counters, e.g.,
> >> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
> >> by one while the counters keep running. There are no differences with or
> >> without a group for the power events.
> >
> > Ok, so power should copy cstate with _core, _pkg, etc. I agree the
> > difference is small and I like the idea of being consistent. Do we
> > want to add "event.cpus" support to the tool anyway for potential
> > future uses?
>
> The only thing I can imagine is that it may be used to disclose the
> event constraint information, or even more to update/override the event
> constraint information (which requires a kernel update). But what I'm
> worried about is that it may be abused. It's very easy to confuse an
> event and a counter in a PMU.

So you mean if you have a dual socket machine and an uncore PMU with a
cpumask of "0,48" you worry that someone setting an event on CPU 47
may think they are getting a CPU on the second socket? Perhaps if the
user can express an intent to the tool, say "perf stat
-randomly-select-uncore-cpus ...", then this can be avoided. I'm not
sure I'm worried about this as specifying the CPU for an event to use
is already something of a more niche/advanced thing to be doing.

> > This would at least avoid problems with newer kernels and
> > older perf tools were we to find a good use for it.
> >
> >>> My understanding had been that for core PMUs a "perf stat -C" option
> >>> would choose the particular CPU to count the event on, for an uncore
> >>> PMU the -C option would override the cpumask's "default" value. We
> >>> have code to validate this:
> >>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
> >>> But it seems now that overriding an uncore PMU's default CPU is
> >>> ignored.
> >>
> >> For the uncore driver, no matter what -C set, it writes the default CPU
> >> back.
> >> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
> >>
> >>> If you did:
> >>> $ perf stat -C 1 -e data_read -a sleep 0.1
> >>> Then the tool thinks data_read is on CPU1 and will set its thread
> >>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
> >>> throw away the -C option.
> >> The perf tool can still read the counter from CPU1 and no IPIs
> >> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
> >>
> >> Not quite sure, but it seems only the open and close may be impacted and
> >> silently changed to CPU0.
> >
> > There's also enable/disable. Andi did the work and there were some
> > notable gains but likely more on core events. Ultimately it'd be nice
> > to be opening, closing.. everything in parallel given the calls are
> > slow and the work is embarrassingly parallel.
> > It feels like the cpumasks for uncore could still do with some cleanup
> > wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
> > tempted to rewrite evlist propagate maps as someone may look at it and
> > think I believe in what it is doing. The parallel stuff we should grab
> > Riccardo's past work.
>
> For core PMU, the parallel may not work, because the core PMU is usually
> MSR based. Perf has to access the MSRs on the specific CPU. IPIs may be
> triggered if the users try to operate them from the other CPUs.

Right, I think the idea would be to have as many threads as you have
CPUs then give each thread affinity to a different CPU. The work done
on the thread would match the CPU they have affinity with to avoid the
IPIs. Because of the use of RCU in the kernel perf code it is possible
to hit RCU synchronize where IPIs are sent after 200ms IIRC. If you
get an RCU synchronize needing an IPI then "200ms x num CPUs" can mean
seconds of delay in a serial implementation (500 CPUs would be 100
seconds). With parallel code a worst case slow down shouldn't increase
with the number of CPUs. On laptops, .. this doesn't matter much.

> But the parallel is good for the counters in the MMIO space. The
> counters can be accessed from any CPU. There are more and more counters
> which are moved to the MMIO space, e.g., new uncore PMUs, IOMMU PMU,
> TMPI (for power), etc.

Sounds good but I'm wondering how we can get the tool in the right
place for doing affinity games. For the MMIO case life's good and we
don't care. How can the tool know one case from another? Should the
tool always be just following the cpumask? What about on ARM where the
cpumasks are broken?

Thanks,
Ian

> Thanks,
> Kan
> >
> > Thanks,
> > Ian
> >
> >
> >> Thanks,
> >> Kan
> >>>
> >>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
> >>>>> files parse correctly and have a corresponding event.
> >>>>> 3) keep adding opening events on the PMU to a group to make sure that
> >>>>> when counters are exhausted the perf_event_open fails (I've seen this
> >>>>> bug on AMD)
> >>>>> 4) are the values in the type file unique
> >>>>>
> >>>>
> >>>> The rest sounds good to me.
> >>>
> >>> Cool. Let me know if you can think of more.
> >>>
> >>> Thanks,
> >>> Ian
> >>>
> >>>> Thanks,
> >>>> Kan
> >>>
> >

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-22 15:43                     ` Ian Rogers
@ 2024-07-22 16:45                       ` Liang, Kan
  0 siblings, 0 replies; 34+ messages in thread
From: Liang, Kan @ 2024-07-22 16:45 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	Dhananjay Ugwekar, ananth.narayan, gautham.shenoy, kprateek.nayak,
	sandipan.das



On 2024-07-22 11:43 a.m., Ian Rogers wrote:
> On Mon, Jul 22, 2024 at 6:57 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>>
>>
>> On 2024-07-19 6:02 p.m., Ian Rogers wrote:
>>> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
>>>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
>>>>> solely its perf event context, I want to know its core power and
>>>>> package power as a group so I never record one without the other. That
>>>>> grouping wouldn't be possible with 2 PMUs.
>>>>
>>>> For power, to be honest, I don't think it improves anything. It gives
>>>> users a false image that perf can group these counters.
>>>> But the truth is that perf cannot. The power counters are all
>>>> free-running counters. It's impossible to co-schedule them (which
>>>> requires a global mechanism to disable/enable all counters, e.g.,
>>>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
>>>> by one while the counters keep running. There are no differences with or
>>>> without a group for the power events.
>>>
>>> Ok, so power should copy cstate with _core, _pkg, etc. I agree the
>>> difference is small and I like the idea of being consistent. Do we
>>> want to add "event.cpus" support to the tool anyway for potential
>>> future uses?
>>
>> The only thing I can imagine is that it may be used to disclose the
>> event constraint information, or even more to update/override the event
>> constraint information (which requires a kernel update). But what I'm
>> worried about is that it may be abused. It's very easy to confuse an
>> event and a counter in a PMU.
> 
> So you mean if you have a dual socket machine and an uncore PMU with a
> cpumask of "0,48" you worry that someone setting an event on CPU 47
> may think they are getting a CPU on the second socket? Perhaps if the
> user can express an intent to the tool, say "perf stat
> -randomly-select-uncore-cpus ...", then this can be avoided. I'm not
> sure I'm worried about this as specifying the CPU for an event to use
> is already something of a more niche/advanced thing to be doing.
> 

The perf tool can specify any CPU the users want, as long as the kernel
can respond properly. I'm not worried about it. The "event.cpus" is
exposed by the kernel. The main concern is also from the kernel side.
Some drivers may use it to distinguish different scopes of counters. So
they can combine various types of counters into a single PMU, which may
break some rules they don't realize. An example is the above 'group' rule.

>>> This would at least avoid problems with newer kernels and
>>> older perf tools were we to find a good use for it.
>>>
>>>>> My understanding had been that for core PMUs a "perf stat -C" option
>>>>> would choose the particular CPU to count the event on, for an uncore
>>>>> PMU the -C option would override the cpumask's "default" value. We
>>>>> have code to validate this:
>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
>>>>> But it seems now that overriding an uncore PMU's default CPU is
>>>>> ignored.
>>>>
>>>> For the uncore driver, no matter what -C set, it writes the default CPU
>>>> back.
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
>>>>
>>>>> If you did:
>>>>> $ perf stat -C 1 -e data_read -a sleep 0.1
>>>>> Then the tool thinks data_read is on CPU1 and will set its thread
>>>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
>>>>> throw away the -C option.
>>>> The perf tool can still read the counter from CPU1 and no IPIs
>>>> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
>>>>
>>>> Not quite sure, but it seems only the open and close may be impacted and
>>>> silently changed to CPU0.
>>>
>>> There's also enable/disable. Andi did the work and there were some
>>> notable gains but likely more on core events. Ultimately it'd be nice
>>> to be opening, closing.. everything in parallel given the calls are
>>> slow and the work is embarrassingly parallel.
>>> It feels like the cpumasks for uncore could still do with some cleanup
>>> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
>>> tempted to rewrite evlist propagate maps as someone may look at it and
>>> think I believe in what it is doing. The parallel stuff we should grab
>>> Riccardo's past work.
>>
>> For core PMU, the parallel may not work, because the core PMU is usually
>> MSR based. Perf has to access the MSRs on the specific CPU. IPIs may be
>> triggered if the users try to operate them from the other CPUs.
> 
> Right, I think the idea would be to have as many threads as you have
> CPUs then give each thread affinity to a different CPU. The work done
> on the thread would match the CPU they have affinity with to avoid the
> IPIs. Because of the use of RCU in the kernel perf code it is possible
> to hit RCU synchronize where IPIs are sent after 200ms IIRC. If you
> get an RCU synchronize needing an IPI then "200ms x num CPUs" can mean
> seconds of delay in a serial implementation (500 CPUs would be 100
> seconds). With parallel code a worst case slow down shouldn't increase
> with the number of CPUs. On laptops, .. this doesn't matter much.
> 
>> But the parallel is good for the counters in the MMIO space. The
>> counters can be accessed from any CPU. There are more and more counters
>> which are moved to the MMIO space, e.g., new uncore PMUs, IOMMU PMU,
>> TMPI (for power), etc.
> 
> Sounds good but I'm wondering how we can get the tool in the right
> place for doing affinity games. For the MMIO case life's good and we
> don't care. How can the tool know one case from another? Should the
> tool always be just following the cpumask? What about on ARM where the
> cpumasks are broken?

The cpumask cannot tell such information. For example, the counters of
the uncore PMUs can be located in three different places, MSRs, the PCI
config space, and the MMIO space. But the scope of the uncore counters
is the same. So the cpumask is the same for various uncore PMUs.
To get the information, the kernel has to be updated, e.g., add a
caps/counter_type in sysfs.

Thanks,
Kan
> 
> Thanks,
> Ian
> 
>> Thanks,
>> Kan
>>>
>>> Thanks,
>>> Ian
>>>
>>>
>>>> Thanks,
>>>> Kan
>>>>>
>>>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>>>>>>> files parse correctly and have a corresponding event.
>>>>>>> 3) keep adding opening events on the PMU to a group to make sure that
>>>>>>> when counters are exhausted the perf_event_open fails (I've seen this
>>>>>>> bug on AMD)
>>>>>>> 4) are the values in the type file unique
>>>>>>>
>>>>>>
>>>>>> The rest sounds good to me.
>>>>>
>>>>> Cool. Let me know if you can think of more.
>>>>>
>>>>> Thanks,
>>>>> Ian
>>>>>
>>>>>> Thanks,
>>>>>> Kan
>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-19 22:02                 ` Ian Rogers
  2024-07-22 13:57                   ` Liang, Kan
@ 2024-07-26  7:06                   ` Dhananjay Ugwekar
  2024-07-26  7:09                     ` Dhananjay Ugwekar
  1 sibling, 1 reply; 34+ messages in thread
From: Dhananjay Ugwekar @ 2024-07-26  7:06 UTC (permalink / raw)
  To: Ian Rogers, Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	ananth.narayan, gautham.shenoy, kprateek.nayak, sandipan.das

Hello, Ian, Kan,

On 7/20/2024 3:32 AM, Ian Rogers wrote:
> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
>>> solely its perf event context, I want to know its core power and
>>> package power as a group so I never record one without the other. That
>>> grouping wouldn't be possible with 2 PMUs.
>>
>> For power, to be honest, I don't think it improves anything. It gives
>> users a false image that perf can group these counters.
>> But the truth is that perf cannot. The power counters are all
>> free-running counters. It's impossible to co-schedule them (which
>> requires a global mechanism to disable/enable all counters, e.g.,
>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
>> by one while the counters keep running. There are no differences with or
>> without a group for the power events.
> 
> Ok, so power should copy cstate with _core, _pkg, etc. I agree the
> difference is small and I like the idea of being consistent.

So, it seems we want to follow the new PMU addition approach for RAPL 
being consistent with Intel cstate driver, should I revive my "power_per_core" 
PMU thread now?

Thanks,
Dhananjay

 Do we
> want to add "event.cpus" support to the tool anyway for potential
> future uses? This would at least avoid problems with newer kernels and
> older perf tools were we to find a good use for it.
> 
>>> My understanding had been that for core PMUs a "perf stat -C" option
>>> would choose the particular CPU to count the event on, for an uncore
>>> PMU the -C option would override the cpumask's "default" value. We
>>> have code to validate this:
>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
>>> But it seems now that overriding an uncore PMU's default CPU is
>>> ignored.
>>
>> For the uncore driver, no matter what -C set, it writes the default CPU
>> back.
>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
>>
>>> If you did:
>>> $ perf stat -C 1 -e data_read -a sleep 0.1
>>> Then the tool thinks data_read is on CPU1 and will set its thread
>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
>>> throw away the -C option.
>> The perf tool can still read the counter from CPU1 and no IPIs
>> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
>>
>> Not quite sure, but it seems only the open and close may be impacted and
>> silently changed to CPU0.
> 
> There's also enable/disable. Andi did the work and there were some
> notable gains but likely more on core events. Ultimately it'd be nice
> to be opening, closing.. everything in parallel given the calls are
> slow and the work is embarrassingly parallel.
> It feels like the cpumasks for uncore could still do with some cleanup
> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
> tempted to rewrite evlist propagate maps as someone may look at it and
> think I believe in what it is doing. The parallel stuff we should grab
> Riccardo's past work.
> 
> Thanks,
> Ian
> 
> 
>> Thanks,
>> Kan
>>>
>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>>>>> files parse correctly and have a corresponding event.
>>>>> 3) keep adding opening events on the PMU to a group to make sure that
>>>>> when counters are exhausted the perf_event_open fails (I've seen this
>>>>> bug on AMD)
>>>>> 4) are the values in the type file unique
>>>>>
>>>>
>>>> The rest sounds good to me.
>>>
>>> Cool. Let me know if you can think of more.
>>>
>>> Thanks,
>>> Ian
>>>
>>>> Thanks,
>>>> Kan
>>>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-26  7:06                   ` Dhananjay Ugwekar
@ 2024-07-26  7:09                     ` Dhananjay Ugwekar
  2024-07-26  7:52                       ` Ian Rogers
  0 siblings, 1 reply; 34+ messages in thread
From: Dhananjay Ugwekar @ 2024-07-26  7:09 UTC (permalink / raw)
  To: Ian Rogers, Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	ananth.narayan, gautham.shenoy, kprateek.nayak, sandipan.das



On 7/26/2024 12:36 PM, Dhananjay Ugwekar wrote:
> Hello, Ian, Kan,
> 
> On 7/20/2024 3:32 AM, Ian Rogers wrote:
>> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
>>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
>>>> solely its perf event context, I want to know its core power and
>>>> package power as a group so I never record one without the other. That
>>>> grouping wouldn't be possible with 2 PMUs.
>>>
>>> For power, to be honest, I don't think it improves anything. It gives
>>> users a false image that perf can group these counters.
>>> But the truth is that perf cannot. The power counters are all
>>> free-running counters. It's impossible to co-schedule them (which
>>> requires a global mechanism to disable/enable all counters, e.g.,
>>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
>>> by one while the counters keep running. There are no differences with or
>>> without a group for the power events.
>>
>> Ok, so power should copy cstate with _core, _pkg, etc. I agree the
>> difference is small and I like the idea of being consistent.
> 
> So, it seems we want to follow the new PMU addition approach for RAPL 
> being consistent with Intel cstate driver, should I revive my "power_per_core" 
> PMU thread now?

The power_per_core PMU thread link for reference,

https://lore.kernel.org/all/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/

> 
> Thanks,
> Dhananjay
> 
>  Do we
>> want to add "event.cpus" support to the tool anyway for potential
>> future uses? This would at least avoid problems with newer kernels and
>> older perf tools were we to find a good use for it.
>>
>>>> My understanding had been that for core PMUs a "perf stat -C" option
>>>> would choose the particular CPU to count the event on, for an uncore
>>>> PMU the -C option would override the cpumask's "default" value. We
>>>> have code to validate this:
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
>>>> But it seems now that overriding an uncore PMU's default CPU is
>>>> ignored.
>>>
>>> For the uncore driver, no matter what -C set, it writes the default CPU
>>> back.
>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
>>>
>>>> If you did:
>>>> $ perf stat -C 1 -e data_read -a sleep 0.1
>>>> Then the tool thinks data_read is on CPU1 and will set its thread
>>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
>>>> throw away the -C option.
>>> The perf tool can still read the counter from CPU1 and no IPIs
>>> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
>>>
>>> Not quite sure, but it seems only the open and close may be impacted and
>>> silently changed to CPU0.
>>
>> There's also enable/disable. Andi did the work and there were some
>> notable gains but likely more on core events. Ultimately it'd be nice
>> to be opening, closing.. everything in parallel given the calls are
>> slow and the work is embarrassingly parallel.
>> It feels like the cpumasks for uncore could still do with some cleanup
>> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
>> tempted to rewrite evlist propagate maps as someone may look at it and
>> think I believe in what it is doing. The parallel stuff we should grab
>> Riccardo's past work.
>>
>> Thanks,
>> Ian
>>
>>
>>> Thanks,
>>> Kan
>>>>
>>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>>>>>> files parse correctly and have a corresponding event.
>>>>>> 3) keep adding opening events on the PMU to a group to make sure that
>>>>>> when counters are exhausted the perf_event_open fails (I've seen this
>>>>>> bug on AMD)
>>>>>> 4) are the values in the type file unique
>>>>>>
>>>>>
>>>>> The rest sounds good to me.
>>>>
>>>> Cool. Let me know if you can think of more.
>>>>
>>>> Thanks,
>>>> Ian
>>>>
>>>>> Thanks,
>>>>> Kan
>>>>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-26  7:09                     ` Dhananjay Ugwekar
@ 2024-07-26  7:52                       ` Ian Rogers
  2024-07-26  8:17                         ` Dhananjay Ugwekar
  0 siblings, 1 reply; 34+ messages in thread
From: Ian Rogers @ 2024-07-26  7:52 UTC (permalink / raw)
  To: Dhananjay Ugwekar
  Cc: Liang, Kan, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	ananth.narayan, gautham.shenoy, kprateek.nayak, sandipan.das

On Fri, Jul 26, 2024 at 12:10 AM Dhananjay Ugwekar
<Dhananjay.Ugwekar@amd.com> wrote:
>
>
>
> On 7/26/2024 12:36 PM, Dhananjay Ugwekar wrote:
> > Hello, Ian, Kan,
> >
> > On 7/20/2024 3:32 AM, Ian Rogers wrote:
> >> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
> >>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
> >>>> solely its perf event context, I want to know its core power and
> >>>> package power as a group so I never record one without the other. That
> >>>> grouping wouldn't be possible with 2 PMUs.
> >>>
> >>> For power, to be honest, I don't think it improves anything. It gives
> >>> users a false image that perf can group these counters.
> >>> But the truth is that perf cannot. The power counters are all
> >>> free-running counters. It's impossible to co-schedule them (which
> >>> requires a global mechanism to disable/enable all counters, e.g.,
> >>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
> >>> by one while the counters keep running. There are no differences with or
> >>> without a group for the power events.
> >>
> >> Ok, so power should copy cstate with _core, _pkg, etc. I agree the
> >> difference is small and I like the idea of being consistent.
> >
> > So, it seems we want to follow the new PMU addition approach for RAPL
> > being consistent with Intel cstate driver, should I revive my "power_per_core"
> > PMU thread now?
>
> The power_per_core PMU thread link for reference,
>
> https://lore.kernel.org/all/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/

I think so. Would it be possible to follow the same naming convention
as cstate, where there is cstate_pkg and cstate_core? (ie no "_per" in
the name)

Thanks,
Ian

> >
> > Thanks,
> > Dhananjay
> >
> >  Do we
> >> want to add "event.cpus" support to the tool anyway for potential
> >> future uses? This would at least avoid problems with newer kernels and
> >> older perf tools were we to find a good use for it.
> >>
> >>>> My understanding had been that for core PMUs a "perf stat -C" option
> >>>> would choose the particular CPU to count the event on, for an uncore
> >>>> PMU the -C option would override the cpumask's "default" value. We
> >>>> have code to validate this:
> >>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
> >>>> But it seems now that overriding an uncore PMU's default CPU is
> >>>> ignored.
> >>>
> >>> For the uncore driver, no matter what -C set, it writes the default CPU
> >>> back.
> >>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
> >>>
> >>>> If you did:
> >>>> $ perf stat -C 1 -e data_read -a sleep 0.1
> >>>> Then the tool thinks data_read is on CPU1 and will set its thread
> >>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
> >>>> throw away the -C option.
> >>> The perf tool can still read the counter from CPU1 and no IPIs
> >>> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
> >>>
> >>> Not quite sure, but it seems only the open and close may be impacted and
> >>> silently changed to CPU0.
> >>
> >> There's also enable/disable. Andi did the work and there were some
> >> notable gains but likely more on core events. Ultimately it'd be nice
> >> to be opening, closing.. everything in parallel given the calls are
> >> slow and the work is embarrassingly parallel.
> >> It feels like the cpumasks for uncore could still do with some cleanup
> >> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
> >> tempted to rewrite evlist propagate maps as someone may look at it and
> >> think I believe in what it is doing. The parallel stuff we should grab
> >> Riccardo's past work.
> >>
> >> Thanks,
> >> Ian
> >>
> >>
> >>> Thanks,
> >>> Kan
> >>>>
> >>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
> >>>>>> files parse correctly and have a corresponding event.
> >>>>>> 3) keep adding opening events on the PMU to a group to make sure that
> >>>>>> when counters are exhausted the perf_event_open fails (I've seen this
> >>>>>> bug on AMD)
> >>>>>> 4) are the values in the type file unique
> >>>>>>
> >>>>>
> >>>>> The rest sounds good to me.
> >>>>
> >>>> Cool. Let me know if you can think of more.
> >>>>
> >>>> Thanks,
> >>>> Ian
> >>>>
> >>>>> Thanks,
> >>>>> Kan
> >>>>
>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-26  7:52                       ` Ian Rogers
@ 2024-07-26  8:17                         ` Dhananjay Ugwekar
  2024-07-26 14:07                           ` Liang, Kan
  0 siblings, 1 reply; 34+ messages in thread
From: Dhananjay Ugwekar @ 2024-07-26  8:17 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Liang, Kan, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	ananth.narayan, gautham.shenoy, kprateek.nayak, sandipan.das



On 7/26/2024 1:22 PM, Ian Rogers wrote:
> On Fri, Jul 26, 2024 at 12:10 AM Dhananjay Ugwekar
> <Dhananjay.Ugwekar@amd.com> wrote:
>>
>>
>>
>> On 7/26/2024 12:36 PM, Dhananjay Ugwekar wrote:
>>> Hello, Ian, Kan,
>>>
>>> On 7/20/2024 3:32 AM, Ian Rogers wrote:
>>>> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
>>>>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
>>>>>> solely its perf event context, I want to know its core power and
>>>>>> package power as a group so I never record one without the other. That
>>>>>> grouping wouldn't be possible with 2 PMUs.
>>>>>
>>>>> For power, to be honest, I don't think it improves anything. It gives
>>>>> users a false image that perf can group these counters.
>>>>> But the truth is that perf cannot. The power counters are all
>>>>> free-running counters. It's impossible to co-schedule them (which
>>>>> requires a global mechanism to disable/enable all counters, e.g.,
>>>>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
>>>>> by one while the counters keep running. There are no differences with or
>>>>> without a group for the power events.
>>>>
>>>> Ok, so power should copy cstate with _core, _pkg, etc. I agree the
>>>> difference is small and I like the idea of being consistent.
>>>
>>> So, it seems we want to follow the new PMU addition approach for RAPL
>>> being consistent with Intel cstate driver, should I revive my "power_per_core"
>>> PMU thread now?
>>
>> The power_per_core PMU thread link for reference,
>>
>> https://lore.kernel.org/all/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
> 
> I think so. Would it be possible to follow the same naming convention
> as cstate, where there is cstate_pkg and cstate_core? (ie no "_per" in
> the name)

Makes sense, we should probably rename the original "power" PMU to "power_pkg" 
as well.

Thanks,
Dhananjay

> 
> Thanks,
> Ian
> 
>>>
>>> Thanks,
>>> Dhananjay
>>>
>>>  Do we
>>>> want to add "event.cpus" support to the tool anyway for potential
>>>> future uses? This would at least avoid problems with newer kernels and
>>>> older perf tools were we to find a good use for it.
>>>>
>>>>>> My understanding had been that for core PMUs a "perf stat -C" option
>>>>>> would choose the particular CPU to count the event on, for an uncore
>>>>>> PMU the -C option would override the cpumask's "default" value. We
>>>>>> have code to validate this:
>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
>>>>>> But it seems now that overriding an uncore PMU's default CPU is
>>>>>> ignored.
>>>>>
>>>>> For the uncore driver, no matter what -C set, it writes the default CPU
>>>>> back.
>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
>>>>>
>>>>>> If you did:
>>>>>> $ perf stat -C 1 -e data_read -a sleep 0.1
>>>>>> Then the tool thinks data_read is on CPU1 and will set its thread
>>>>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
>>>>>> throw away the -C option.
> >>>>>> The perf tool can still read the counter from CPU1 and no IPIs
> >>>>>> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
>>>>>
>>>>> Not quite sure, but it seems only the open and close may be impacted and
>>>>> silently changed to CPU0.
>>>>
>>>> There's also enable/disable. Andi did the work and there were some
>>>> notable gains but likely more on core events. Ultimately it'd be nice
>>>> to be opening, closing.. everything in parallel given the calls are
>>>> slow and the work is embarrassingly parallel.
>>>> It feels like the cpumasks for uncore could still do with some cleanup
>>>> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
>>>> tempted to rewrite evlist propagate maps as someone may look at it and
>>>> think I believe in what it is doing. The parallel stuff we should grab
>>>> Riccardo's past work.
>>>>
>>>> Thanks,
>>>> Ian
>>>>
>>>>
>>>>> Thanks,
>>>>> Kan
>>>>>>
>>>>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>>>>>>>> files parse correctly and have a corresponding event.
>>>>>>>> 3) keep adding opening events on the PMU to a group to make sure that
>>>>>>>> when counters are exhausted the perf_event_open fails (I've seen this
>>>>>>>> bug on AMD)
>>>>>>>> 4) are the values in the type file unique
>>>>>>>>
>>>>>>>
>>>>>>> The rest sounds good to me.
>>>>>>
>>>>>> Cool. Let me know if you can think of more.
>>>>>>
>>>>>> Thanks,
>>>>>> Ian
>>>>>>
>>>>>>> Thanks,
>>>>>>> Kan
>>>>>>
>>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs
  2024-07-26  8:17                         ` Dhananjay Ugwekar
@ 2024-07-26 14:07                           ` Liang, Kan
  0 siblings, 0 replies; 34+ messages in thread
From: Liang, Kan @ 2024-07-26 14:07 UTC (permalink / raw)
  To: Dhananjay Ugwekar, Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Adrian Hunter, Bjorn Helgaas, Jonathan Corbet, James Clark,
	Ravi Bangoria, Dominique Martinet, linux-kernel, linux-perf-users,
	ananth.narayan, gautham.shenoy, kprateek.nayak, sandipan.das



On 2024-07-26 4:17 a.m., Dhananjay Ugwekar wrote:
> 
> 
> On 7/26/2024 1:22 PM, Ian Rogers wrote:
>> On Fri, Jul 26, 2024 at 12:10 AM Dhananjay Ugwekar
>> <Dhananjay.Ugwekar@amd.com> wrote:
>>>
>>>
>>>
>>> On 7/26/2024 12:36 PM, Dhananjay Ugwekar wrote:
>>>> Hello, Ian, Kan,
>>>>
>>>> On 7/20/2024 3:32 AM, Ian Rogers wrote:
>>>>> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>>> On 2024-07-19 10:59 a.m., Ian Rogers wrote:
>>>>>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and
>>>>>>> solely its perf event context, I want to know its core power and
>>>>>>> package power as a group so I never record one without the other. That
>>>>>>> grouping wouldn't be possible with 2 PMUs.
>>>>>>
>>>>>> For power, to be honest, I don't think it improves anything. It gives
>>>>>> users a false image that perf can group these counters.
>>>>>> But the truth is that perf cannot. The power counters are all
>>>>>> free-running counters. It's impossible to co-schedule them (which
>>>>>> requires a global mechanism to disable/enable all counters, e.g.,
>>>>>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one
>>>>>> by one while the counters keep running. There are no differences with or
>>>>>> without a group for the power events.
>>>>>
>>>>> Ok, so power should copy cstate with _core, _pkg, etc. I agree the
>>>>> difference is small and I like the idea of being consistent.
>>>>
>>>> So, it seems we want to follow the new PMU addition approach for RAPL
>>>> being consistent with Intel cstate driver, should I revive my "power_per_core"
>>>> PMU thread now?
>>>
>>> The power_per_core PMU thread link for reference,
>>>
>>> https://lore.kernel.org/all/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
>>
>> I think so. Would it be possible to follow the same naming convention
>> as cstate, where there is cstate_pkg and cstate_core? (ie no "_per" in
>> the name)
> 
> Makes sense, we should probably rename the original "power" PMU to "power_pkg" 
> as well.

It may bring some compatibility issues for the old platforms. There may be
two ways to address it.
- Add a symlink or something to link the "power" and "power_pkg".
- Only when there are two or more different scopes of counters in a
system, the "power_<scope>" are used. If there is only one scope of
power counter, "power" is still used.
The latter method is used for the Intel uncore and hybrid core drivers now.

Thanks,
Kan

> 
> Thanks,
> Dhananjay
> 
>>
>> Thanks,
>> Ian
>>
>>>>
>>>> Thanks,
>>>> Dhananjay
>>>>
>>>>  Do we
>>>>> want to add "event.cpus" support to the tool anyway for potential
>>>>> future uses? This would at least avoid problems with newer kernels and
>>>>> older perf tools were we to find a good use for it.
>>>>>
>>>>>>> My understanding had been that for core PMUs a "perf stat -C" option
>>>>>>> would choose the particular CPU to count the event on, for an uncore
>>>>>>> PMU the -C option would override the cpumask's "default" value. We
>>>>>>> have code to validate this:
>>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
>>>>>>> But it seems now that overriding an uncore PMU's default CPU is
>>>>>>> ignored.
>>>>>>
>>>>>> For the uncore driver, no matter what -C set, it writes the default CPU
>>>>>> back.
>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760
>>>>>>
>>>>>>> If you did:
>>>>>>> $ perf stat -C 1 -e data_read -a sleep 0.1
>>>>>>> Then the tool thinks data_read is on CPU1 and will set its thread
>>>>>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
>>>>>>> throw away the -C option.
> >>>>>> The perf tool can still read the counter from CPU1 and no IPIs
> >>>>>> because of the PMU_EV_CAP_READ_ACTIVE_PKG().
>>>>>>
>>>>>> Not quite sure, but it seems only the open and close may be impacted and
>>>>>> silently changed to CPU0.
>>>>>
>>>>> There's also enable/disable. Andi did the work and there were some
>>>>> notable gains but likely more on core events. Ultimately it'd be nice
>>>>> to be opening, closing.. everything in parallel given the calls are
>>>>> slow and the work is embarrassingly parallel.
>>>>> It feels like the cpumasks for uncore could still do with some cleanup
>>>>> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm
>>>>> tempted to rewrite evlist propagate maps as someone may look at it and
>>>>> think I believe in what it is doing. The parallel stuff we should grab
>>>>> Riccardo's past work.
>>>>>
>>>>> Thanks,
>>>>> Ian
>>>>>
>>>>>
>>>>>> Thanks,
>>>>>> Kan
>>>>>>>
>>>>>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>>>>>>>>> files parse correctly and have a corresponding event.
>>>>>>>>> 3) keep adding opening events on the PMU to a group to make sure that
>>>>>>>>> when counters are exhausted the perf_event_open fails (I've seen this
>>>>>>>>> bug on AMD)
>>>>>>>>> 4) are the values in the type file unique
>>>>>>>>>
>>>>>>>>
>>>>>>>> The rest sounds good to me.
>>>>>>>
>>>>>>> Cool. Let me know if you can think of more.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Ian
>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Kan
>>>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2024-07-26 14:07 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-07-18  0:30 [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Ian Rogers
2024-07-18  0:30 ` [PATCH v2 1/6] perf pmu: Merge boolean sysfs event option parsing Ian Rogers
2024-07-18  0:30 ` [PATCH v2 2/6] perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event Ian Rogers
2024-07-18  0:30 ` [PATCH v2 3/6] perf pmu: Add support for event.cpus files in sysfs Ian Rogers
2024-07-18 14:33   ` Liang, Kan
2024-07-18 15:39     ` Ian Rogers
2024-07-18 17:47       ` Liang, Kan
2024-07-18 20:50         ` Ian Rogers
2024-07-19 13:55           ` Liang, Kan
2024-07-19 14:59             ` Ian Rogers
2024-07-19 16:35               ` Liang, Kan
2024-07-19 22:02                 ` Ian Rogers
2024-07-22 13:57                   ` Liang, Kan
2024-07-22 15:43                     ` Ian Rogers
2024-07-22 16:45                       ` Liang, Kan
2024-07-26  7:06                   ` Dhananjay Ugwekar
2024-07-26  7:09                     ` Dhananjay Ugwekar
2024-07-26  7:52                       ` Ian Rogers
2024-07-26  8:17                         ` Dhananjay Ugwekar
2024-07-26 14:07                           ` Liang, Kan
2024-07-18  0:30 ` [PATCH v2 4/6] libperf cpumap: Add ability to create CPU from a single CPU number Ian Rogers
2024-07-18  0:30 ` [PATCH v2 5/6] perf parse-events: Set is_pmu_core for legacy hardware events Ian Rogers
2024-07-18  0:30 ` [PATCH v2 6/6] perf parse-events: Add "cpu" term to set the CPU an event is recorded on Ian Rogers
2024-07-18 14:09   ` James Clark
2024-07-18 15:07     ` Ian Rogers
2024-07-18 14:41   ` Liang, Kan
2024-07-18 15:12     ` Ian Rogers
2024-07-18 18:02       ` Liang, Kan
2024-07-18 21:06         ` Ian Rogers
2024-07-19 14:14           ` Liang, Kan
2024-07-19 15:01             ` Ian Rogers
2024-07-19 16:42               ` Liang, Kan
2024-07-18 13:42 ` [PATCH v2 0/6] Add support for sysfs event.cpus and cpu event term Dhananjay Ugwekar
2024-07-18 15:00   ` Ian Rogers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).