* [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers
2018-12-12 7:30 [PATCH v1 0/3] Reduce NUMA related overhead in perf record profiling on large server systems Alexey Budankov
@ 2018-12-12 7:40 ` Alexey Budankov
2018-12-12 12:14 ` Jiri Olsa
` (2 more replies)
2018-12-12 7:44 ` [PATCH v1 3/3] perf record: implement --affinity=node|cpu option Alexey Budankov
` (2 subsequent siblings)
3 siblings, 3 replies; 12+ messages in thread
From: Alexey Budankov @ 2018-12-12 7:40 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra
Cc: Jiri Olsa, Namhyung Kim, Alexander Shishkin, Andi Kleen,
linux-kernel
Build node cpu masks for mmap data buffers. Bind AIO data buffers
to nodes according to kernel data buffers location. Apply node cpu
masks to trace reading thread every time it references memory cross
node or cross cpu.
Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
tools/perf/builtin-record.c | 9 +++++++++
tools/perf/util/evlist.c | 6 +++++-
tools/perf/util/mmap.c | 38 ++++++++++++++++++++++++++++++++++++-
tools/perf/util/mmap.h | 1 +
4 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4979719e54ae..1a1438c73f96 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -532,6 +532,9 @@ static int record__mmap_evlist(struct record *rec,
struct record_opts *opts = &rec->opts;
char msg[512];
+ if (opts->affinity != PERF_AFFINITY_SYS)
+ cpu__setup_cpunode_map();
+
if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
opts->auxtrace_mmap_pages,
opts->auxtrace_snapshot_mode,
@@ -751,6 +754,12 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
struct perf_mmap *map = &maps[i];
if (map->base) {
+ if (rec->opts.affinity != PERF_AFFINITY_SYS &&
+ !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
+ CPU_ZERO(&rec->affinity_mask);
+ CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
+ sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
+ }
if (!record__aio_enabled(rec)) {
if (perf_mmap__push(map, rec, record__pushfn) != 0) {
rc = -1;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 60e825be944a..5ca5bb5ea0db 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1028,7 +1028,11 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
* Its value is decided by evsel's write_backward.
* So &mp should not be passed through const pointer.
*/
- struct mmap_params mp = { .nr_cblocks = nr_cblocks, .affinity = affinity };
+ struct mmap_params mp = {
+ .nr_cblocks = nr_cblocks,
+ .affinity = affinity,
+ .cpu_map = cpus
+ };
if (!evlist->mmap)
evlist->mmap = perf_evlist__alloc_mmap(evlist, false);
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index e68ba754a8e2..0d017ea85dcb 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -10,6 +10,9 @@
#include <sys/mman.h>
#include <inttypes.h>
#include <asm/bug.h>
+#ifdef HAVE_LIBNUMA_SUPPORT
+#include <numaif.h>
+#endif
#include "debug.h"
#include "event.h"
#include "mmap.h"
@@ -177,11 +180,27 @@ static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
}
delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
for (i = 0; i < map->aio.nr_cblocks; ++i) {
+#ifndef HAVE_LIBNUMA_SUPPORT
map->aio.data[i] = malloc(perf_mmap__mmap_len(map));
+#else
+ size_t mmap_len = perf_mmap__mmap_len(map);
+ map->aio.data[i] = mmap(NULL, mmap_len,
+ PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+#endif
if (!map->aio.data[i]) {
pr_debug2("failed to allocate data buffer area, error %m");
return -1;
}
+#ifdef HAVE_LIBNUMA_SUPPORT
+ if (mp->affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
+ unsigned long node_mask = 1UL << cpu__get_node(map->cpu);
+ if (mbind(map->aio.data[i], mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
+ pr_debug2("failed to bind [%p-%p] to node %d\n",
+ map->aio.data[i], map->aio.data[i] + mmap_len,
+ cpu__get_node(map->cpu));
+ }
+ }
+#endif
/*
* Use cblock.aio_fildes value different from -1
* to denote started aio write operation on the
@@ -209,8 +228,13 @@ static void perf_mmap__aio_munmap(struct perf_mmap *map)
{
int i;
- for (i = 0; i < map->aio.nr_cblocks; ++i)
+ for (i = 0; i < map->aio.nr_cblocks; ++i) {
+#ifndef HAVE_LIBNUMA_SUPPORT
zfree(&map->aio.data[i]);
+#else
+ munmap(map->aio.data[i], perf_mmap__mmap_len(map));
+#endif
+ }
if (map->aio.data)
zfree(&map->aio.data);
zfree(&map->aio.cblocks);
@@ -316,6 +340,7 @@ void perf_mmap__munmap(struct perf_mmap *map)
int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu)
{
+ int c, nr_cpus, node;
/*
* The last one will be done at perf_mmap__consume(), so that we
* make sure we don't prevent tools from consuming every last event in
@@ -344,6 +369,17 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int c
map->cpu = cpu;
CPU_ZERO(&map->affinity_mask);
+ if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1) {
+ nr_cpus = cpu_map__nr(mp->cpu_map);
+ node = cpu__get_node(map->cpu);
+ for (c = 0; c < nr_cpus; c++) {
+ if (cpu__get_node(c) == node) {
+ CPU_SET(c, &map->affinity_mask);
+ }
+ }
+ } else if (mp->affinity == PERF_AFFINITY_CPU) {
+ CPU_SET(map->cpu, &map->affinity_mask);
+ }
if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
&mp->auxtrace_mp, map->base, fd))
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index e566c19b242b..b3f724fad22e 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -72,6 +72,7 @@ enum bkw_mmap_state {
struct mmap_params {
int prot, mask, nr_cblocks, affinity;
struct auxtrace_mmap_params auxtrace_mp;
+ const struct cpu_map *cpu_map;
};
int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu);
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers
2018-12-12 7:40 ` [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers Alexey Budankov
@ 2018-12-12 12:14 ` Jiri Olsa
2018-12-13 6:26 ` Alexey Budankov
2018-12-12 12:15 ` Jiri Olsa
2018-12-12 12:15 ` Jiri Olsa
2 siblings, 1 reply; 12+ messages in thread
From: Jiri Olsa @ 2018-12-12 12:14 UTC (permalink / raw)
To: Alexey Budankov
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra,
Namhyung Kim, Alexander Shishkin, Andi Kleen, linux-kernel
On Wed, Dec 12, 2018 at 10:40:22AM +0300, Alexey Budankov wrote:
SNIP
> diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
> index e68ba754a8e2..0d017ea85dcb 100644
> --- a/tools/perf/util/mmap.c
> +++ b/tools/perf/util/mmap.c
> @@ -10,6 +10,9 @@
> #include <sys/mman.h>
> #include <inttypes.h>
> #include <asm/bug.h>
> +#ifdef HAVE_LIBNUMA_SUPPORT
> +#include <numaif.h>
> +#endif
> #include "debug.h"
> #include "event.h"
> #include "mmap.h"
> @@ -177,11 +180,27 @@ static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
> }
> delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
> for (i = 0; i < map->aio.nr_cblocks; ++i) {
> +#ifndef HAVE_LIBNUMA_SUPPORT
> map->aio.data[i] = malloc(perf_mmap__mmap_len(map));
> +#else
> + size_t mmap_len = perf_mmap__mmap_len(map);
> + map->aio.data[i] = mmap(NULL, mmap_len,
> + PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> +#endif
> if (!map->aio.data[i]) {
> pr_debug2("failed to allocate data buffer area, error %m");
> return -1;
> }
> +#ifdef HAVE_LIBNUMA_SUPPORT
> + if (mp->affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
> + unsigned long node_mask = 1UL << cpu__get_node(map->cpu);
> + if (mbind(map->aio.data[i], mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
> + pr_debug2("failed to bind [%p-%p] to node %d\n",
> + map->aio.data[i], map->aio.data[i] + mmap_len,
> + cpu__get_node(map->cpu));
> + }
> + }
> +#endif
could you please do the same thing as we did for aio functions
(like record__aio_mmap_read_sync) and provide functions for both
#ifdef cases?
thanks,
jirka
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers
2018-12-12 12:14 ` Jiri Olsa
@ 2018-12-13 6:26 ` Alexey Budankov
0 siblings, 0 replies; 12+ messages in thread
From: Alexey Budankov @ 2018-12-13 6:26 UTC (permalink / raw)
To: Jiri Olsa
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra,
Namhyung Kim, Alexander Shishkin, Andi Kleen, linux-kernel
Hi,
On 12.12.2018 15:14, Jiri Olsa wrote:
> On Wed, Dec 12, 2018 at 10:40:22AM +0300, Alexey Budankov wrote:
>
> SNIP
>
>> diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
>> index e68ba754a8e2..0d017ea85dcb 100644
>> --- a/tools/perf/util/mmap.c
>> +++ b/tools/perf/util/mmap.c
>> @@ -10,6 +10,9 @@
>> #include <sys/mman.h>
>> #include <inttypes.h>
>> #include <asm/bug.h>
>> +#ifdef HAVE_LIBNUMA_SUPPORT
>> +#include <numaif.h>
>> +#endif
>> #include "debug.h"
>> #include "event.h"
>> #include "mmap.h"
>> @@ -177,11 +180,27 @@ static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
>> }
>> delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
>> for (i = 0; i < map->aio.nr_cblocks; ++i) {
>> +#ifndef HAVE_LIBNUMA_SUPPORT
>> map->aio.data[i] = malloc(perf_mmap__mmap_len(map));
>> +#else
>> + size_t mmap_len = perf_mmap__mmap_len(map);
>> + map->aio.data[i] = mmap(NULL, mmap_len,
>> + PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
>> +#endif
>> if (!map->aio.data[i]) {
>> pr_debug2("failed to allocate data buffer area, error %m");
>> return -1;
>> }
>> +#ifdef HAVE_LIBNUMA_SUPPORT
>> + if (mp->affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
>> + unsigned long node_mask = 1UL << cpu__get_node(map->cpu);
>> + if (mbind(map->aio.data[i], mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
>> + pr_debug2("failed to bind [%p-%p] to node %d\n",
>> + map->aio.data[i], map->aio.data[i] + mmap_len,
>> + cpu__get_node(map->cpu));
>> + }
>> + }
>> +#endif
>
> could you please do the same thing as we did for aio functions
> (like record__aio_mmap_read_sync) and provide functions for both
> #ifdef cases?
Implemented perf_mmap__aio_alloc(),
perf_mmap__aio_free(),
perf_mmap__aio_bind() in v2.
Thanks,
Alexey
>
> thanks,
> jirka
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers
2018-12-12 7:40 ` [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers Alexey Budankov
2018-12-12 12:14 ` Jiri Olsa
@ 2018-12-12 12:15 ` Jiri Olsa
2018-12-13 7:04 ` Alexey Budankov
2018-12-12 12:15 ` Jiri Olsa
2 siblings, 1 reply; 12+ messages in thread
From: Jiri Olsa @ 2018-12-12 12:15 UTC (permalink / raw)
To: Alexey Budankov
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra,
Namhyung Kim, Alexander Shishkin, Andi Kleen, linux-kernel
On Wed, Dec 12, 2018 at 10:40:22AM +0300, Alexey Budankov wrote:
>
> Build node cpu masks for mmap data buffers. Bind AIO data buffers
> to nodes according to kernel data buffers location. Apply node cpu
> masks to trace reading thread every time it references memory cross
> node or cross cpu.
>
> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
> ---
> tools/perf/builtin-record.c | 9 +++++++++
> tools/perf/util/evlist.c | 6 +++++-
> tools/perf/util/mmap.c | 38 ++++++++++++++++++++++++++++++++++++-
> tools/perf/util/mmap.h | 1 +
> 4 files changed, 52 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 4979719e54ae..1a1438c73f96 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -532,6 +532,9 @@ static int record__mmap_evlist(struct record *rec,
> struct record_opts *opts = &rec->opts;
> char msg[512];
>
> + if (opts->affinity != PERF_AFFINITY_SYS)
> + cpu__setup_cpunode_map();
> +
> if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
> opts->auxtrace_mmap_pages,
> opts->auxtrace_snapshot_mode,
> @@ -751,6 +754,12 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
> struct perf_mmap *map = &maps[i];
>
> if (map->base) {
> + if (rec->opts.affinity != PERF_AFFINITY_SYS &&
> + !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
> + CPU_ZERO(&rec->affinity_mask);
> + CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
> + sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
> + }
hum, so you change affinity every time you read different map?
I'm surprised this is actualy faster..
anyway this patch is doing 2 things.. binding the memory allocation
to nodes and setting the process affinity, please separate those and
explain the logic behind
thanks,
jirka
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers
2018-12-12 12:15 ` Jiri Olsa
@ 2018-12-13 7:04 ` Alexey Budankov
0 siblings, 0 replies; 12+ messages in thread
From: Alexey Budankov @ 2018-12-13 7:04 UTC (permalink / raw)
To: Jiri Olsa
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra,
Namhyung Kim, Alexander Shishkin, Andi Kleen, linux-kernel
Hi,
On 12.12.2018 15:15, Jiri Olsa wrote:
> On Wed, Dec 12, 2018 at 10:40:22AM +0300, Alexey Budankov wrote:
>>
>> Build node cpu masks for mmap data buffers. Bind AIO data buffers
>> to nodes according to kernel data buffers location. Apply node cpu
>> masks to trace reading thread every time it references memory cross
>> node or cross cpu.
>>
>> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
>> ---
>> tools/perf/builtin-record.c | 9 +++++++++
>> tools/perf/util/evlist.c | 6 +++++-
>> tools/perf/util/mmap.c | 38 ++++++++++++++++++++++++++++++++++++-
>> tools/perf/util/mmap.h | 1 +
>> 4 files changed, 52 insertions(+), 2 deletions(-)
>>
>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>> index 4979719e54ae..1a1438c73f96 100644
>> --- a/tools/perf/builtin-record.c
>> +++ b/tools/perf/builtin-record.c
>> @@ -532,6 +532,9 @@ static int record__mmap_evlist(struct record *rec,
>> struct record_opts *opts = &rec->opts;
>> char msg[512];
>>
>> + if (opts->affinity != PERF_AFFINITY_SYS)
>> + cpu__setup_cpunode_map();
>> +
>> if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
>> opts->auxtrace_mmap_pages,
>> opts->auxtrace_snapshot_mode,
>> @@ -751,6 +754,12 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
>> struct perf_mmap *map = &maps[i];
>>
>> if (map->base) {
>> + if (rec->opts.affinity != PERF_AFFINITY_SYS &&
>> + !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
>> + CPU_ZERO(&rec->affinity_mask);
>> + CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
>> + sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
>> + }
>
> hum, so you change affinity every time you read different map?
That is exactly what happens when --affinity=cpu. With --affinity=node
thread affinity changes only when the thread gets mmap buffer allocated
at the remote node. For dual socket machine it is twice at max for one
loop execution.
> I'm surprised this is actualy faster..
Imagine that some app's thread running on cpu 0 of node 1 generates samples
into a kernel buffer which is also allocated at node 1. The tool thread
running on cpu 0 of node 0 takes the buffer and puts some part of it into
write syscall what can cause cross node memory move and induce collection
overhead (from the kernel buffer into fs cache buffers executing some portion
of write syscall code on cpu 0 of node 0).
>
> anyway this patch is doing 2 things.. binding the memory allocation
> to nodes and setting the process affinity, please separate those and
> explain the logic behind
Separated in v2. Binding is implemented for AIO user space buffers only
to map them to the same nodes kernel buffers are mapped to. Tool thread
affinity mask bouncing is implemented and applicable as for serial as
for AIO streaming. AIO streaming without binding can result in cross node
memory moves from kernel buffers to AIO ones.
Thanks,
Alexey
>
> thanks,
> jirka
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers
2018-12-12 7:40 ` [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers Alexey Budankov
2018-12-12 12:14 ` Jiri Olsa
2018-12-12 12:15 ` Jiri Olsa
@ 2018-12-12 12:15 ` Jiri Olsa
2018-12-13 6:26 ` Alexey Budankov
2 siblings, 1 reply; 12+ messages in thread
From: Jiri Olsa @ 2018-12-12 12:15 UTC (permalink / raw)
To: Alexey Budankov
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra,
Namhyung Kim, Alexander Shishkin, Andi Kleen, linux-kernel
On Wed, Dec 12, 2018 at 10:40:22AM +0300, Alexey Budankov wrote:
SNIP
> int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu)
> {
> + int c, nr_cpus, node;
> /*
> * The last one will be done at perf_mmap__consume(), so that we
> * make sure we don't prevent tools from consuming every last event in
> @@ -344,6 +369,17 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int c
> map->cpu = cpu;
>
> CPU_ZERO(&map->affinity_mask);
> + if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1) {
> + nr_cpus = cpu_map__nr(mp->cpu_map);
> + node = cpu__get_node(map->cpu);
> + for (c = 0; c < nr_cpus; c++) {
> + if (cpu__get_node(c) == node) {
> + CPU_SET(c, &map->affinity_mask);
> + }
> + }
> + } else if (mp->affinity == PERF_AFFINITY_CPU) {
> + CPU_SET(map->cpu, &map->affinity_mask);
> + }
won't both of this end up in same mask?
jirka
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers
2018-12-12 12:15 ` Jiri Olsa
@ 2018-12-13 6:26 ` Alexey Budankov
0 siblings, 0 replies; 12+ messages in thread
From: Alexey Budankov @ 2018-12-13 6:26 UTC (permalink / raw)
To: Jiri Olsa
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra,
Namhyung Kim, Alexander Shishkin, Andi Kleen, linux-kernel
Hi,
On 12.12.2018 15:15, Jiri Olsa wrote:
> On Wed, Dec 12, 2018 at 10:40:22AM +0300, Alexey Budankov wrote:
>
> SNIP
>
>> int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu)
>> {
>> + int c, nr_cpus, node;
>> /*
>> * The last one will be done at perf_mmap__consume(), so that we
>> * make sure we don't prevent tools from consuming every last event in
>> @@ -344,6 +369,17 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int c
>> map->cpu = cpu;
>>
>> CPU_ZERO(&map->affinity_mask);
>> + if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1) {
>> + nr_cpus = cpu_map__nr(mp->cpu_map);
>> + node = cpu__get_node(map->cpu);
>> + for (c = 0; c < nr_cpus; c++) {
>> + if (cpu__get_node(c) == node) {
>> + CPU_SET(c, &map->affinity_mask);
>> + }
>> + }
>> + } else if (mp->affinity == PERF_AFFINITY_CPU) {
>> + CPU_SET(map->cpu, &map->affinity_mask);
>> + }
>
> won't both of this end up in same mask?
For tested dual socket 44 core broadwell:
node 0 node 1
cpu mask 0-21,44-65 22-43,66-87
For affinity=node map->affinity_mask is either [0-21,44-65] or [22-43,66-87].
For affinity=cpu map->affinity_mask is [0] or [1] or [2] and so on.
Without affinity option set map->affinity_mask and record->affinity_mask are [].
Thanks,
Alexey
>
> jirka
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v1 3/3] perf record: implement --affinity=node|cpu option
2018-12-12 7:30 [PATCH v1 0/3] Reduce NUMA related overhead in perf record profiling on large server systems Alexey Budankov
2018-12-12 7:40 ` [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers Alexey Budankov
@ 2018-12-12 7:44 ` Alexey Budankov
2018-12-12 7:54 ` [PATCH v1 1/3] perf record: allocate affinity masks Alexey Budankov
[not found] ` <afb88628-7a04-3c36-2bc9-b5f5774f8e8f@linux.intel.com>
3 siblings, 0 replies; 12+ messages in thread
From: Alexey Budankov @ 2018-12-12 7:44 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra
Cc: Jiri Olsa, Namhyung Kim, Alexander Shishkin, Andi Kleen,
linux-kernel
Implement --affinity=node|cpu option for the record mode defaulting
to system affinity mask bouncing.
Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
tools/perf/Documentation/perf-record.txt | 5 +++++
tools/perf/builtin-record.c | 18 ++++++++++++++++++
2 files changed, 23 insertions(+)
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index d232b13ea713..efb839784f32 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -440,6 +440,11 @@ Use <n> control blocks in asynchronous (Posix AIO) trace writing mode (default:
Asynchronous mode is supported only when linking Perf tool with libc library
providing implementation for Posix AIO API.
+--affinity=mode::
+Set affinity mask of trace reading thread according to the policy defined by 'mode' value:
+ node - thread affinity mask is set to NUMA node cpu mask of the processed mmap buffer
+ cpu - thread affinity mask is set to cpu of the processed mmap buffer
+
--all-kernel::
Configure all used events to run in kernel space.
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1a1438c73f96..3b4b47055aa1 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1649,6 +1649,21 @@ static int parse_clockid(const struct option *opt, const char *str, int unset)
ui__warning("unknown clockid %s, check man page\n", ostr);
return -1;
}
+static int record__parse_affinity(const struct option *opt, const char *str, int unset)
+{
+ struct record_opts *opts = (struct record_opts *)opt->value;
+
+ if (!unset) {
+ if (str) {
+ if (!strcasecmp(str, "node"))
+ opts->affinity = PERF_AFFINITY_NODE;
+ else if (!strcasecmp(str, "cpu"))
+ opts->affinity = PERF_AFFINITY_CPU;
+ }
+ }
+
+ return 0;
+}
static int record__parse_mmap_pages(const struct option *opt,
const char *str,
@@ -1957,6 +1972,9 @@ static struct option __record_options[] = {
&nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
record__aio_parse),
#endif
+ OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
+ "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
+ record__parse_affinity),
OPT_END()
};
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v1 1/3] perf record: allocate affinity masks
2018-12-12 7:30 [PATCH v1 0/3] Reduce NUMA related overhead in perf record profiling on large server systems Alexey Budankov
2018-12-12 7:40 ` [PATCH v1 2/3] perf record: apply affinity masks when reading mmap buffers Alexey Budankov
2018-12-12 7:44 ` [PATCH v1 3/3] perf record: implement --affinity=node|cpu option Alexey Budankov
@ 2018-12-12 7:54 ` Alexey Budankov
[not found] ` <afb88628-7a04-3c36-2bc9-b5f5774f8e8f@linux.intel.com>
3 siblings, 0 replies; 12+ messages in thread
From: Alexey Budankov @ 2018-12-12 7:54 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra
Cc: Jiri Olsa, Namhyung Kim, Alexander Shishkin, Andi Kleen,
linux-kernel
Allocate affinity option and masks for mmap data buffers and
record thread as well as initialize allocated objects.
Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
tools/perf/builtin-record.c | 11 ++++++++++-
tools/perf/perf.h | 5 +++++
tools/perf/util/evlist.c | 6 +++---
tools/perf/util/evlist.h | 2 +-
tools/perf/util/mmap.c | 2 ++
tools/perf/util/mmap.h | 3 ++-
6 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 882285fb9f64..4979719e54ae 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -81,6 +81,7 @@ struct record {
bool timestamp_boundary;
struct switch_output switch_output;
unsigned long long samples;
+ cpu_set_t affinity_mask;
};
static volatile int auxtrace_record__snapshot_started;
@@ -533,7 +534,8 @@ static int record__mmap_evlist(struct record *rec,
if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
opts->auxtrace_mmap_pages,
- opts->auxtrace_snapshot_mode, opts->nr_cblocks) < 0) {
+ opts->auxtrace_snapshot_mode,
+ opts->nr_cblocks, opts->affinity) < 0) {
if (errno == EPERM) {
pr_err("Permission error mapping pages.\n"
"Consider increasing "
@@ -1980,6 +1982,9 @@ int cmd_record(int argc, const char **argv)
# undef REASON
#endif
+ CPU_ZERO(&rec->affinity_mask);
+ rec->opts.affinity = PERF_AFFINITY_SYS;
+
rec->evlist = perf_evlist__new();
if (rec->evlist == NULL)
return -ENOMEM;
@@ -2143,6 +2148,10 @@ int cmd_record(int argc, const char **argv)
if (verbose > 0)
pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);
+ pr_debug("affinity (UNSET:%d, NODE:%d, CPU:%d) = %d\n",
+ PERF_AFFINITY_SYS, PERF_AFFINITY_NODE,
+ PERF_AFFINITY_CPU, rec->opts.affinity);
+
err = __cmd_record(&record, argc, argv);
out:
perf_evlist__delete(rec->evlist);
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 388c6dd128b8..08e75815de2f 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -83,8 +83,13 @@ struct record_opts {
clockid_t clockid;
u64 clockid_res_ns;
int nr_cblocks;
+ int affinity;
};
+#define PERF_AFFINITY_SYS 0
+#define PERF_AFFINITY_NODE 1
+#define PERF_AFFINITY_CPU 2
+
struct option;
extern const char * const *record_usage;
extern struct option *record_options;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index e90575192209..60e825be944a 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1018,7 +1018,7 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
*/
int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
unsigned int auxtrace_pages,
- bool auxtrace_overwrite, int nr_cblocks)
+ bool auxtrace_overwrite, int nr_cblocks, int affinity)
{
struct perf_evsel *evsel;
const struct cpu_map *cpus = evlist->cpus;
@@ -1028,7 +1028,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
* Its value is decided by evsel's write_backward.
* So &mp should not be passed through const pointer.
*/
- struct mmap_params mp = { .nr_cblocks = nr_cblocks };
+ struct mmap_params mp = { .nr_cblocks = nr_cblocks, .affinity = affinity };
if (!evlist->mmap)
evlist->mmap = perf_evlist__alloc_mmap(evlist, false);
@@ -1060,7 +1060,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages)
{
- return perf_evlist__mmap_ex(evlist, pages, 0, false, 0);
+ return perf_evlist__mmap_ex(evlist, pages, 0, false, 0, PERF_AFFINITY_SYS);
}
int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 868294491194..72728d7f4432 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -162,7 +162,7 @@ unsigned long perf_event_mlock_kb_in_pages(void);
int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
unsigned int auxtrace_pages,
- bool auxtrace_overwrite, int nr_cblocks);
+ bool auxtrace_overwrite, int nr_cblocks, int affinity);
int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages);
void perf_evlist__munmap(struct perf_evlist *evlist);
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 8fc39311a30d..e68ba754a8e2 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -343,6 +343,8 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int c
map->fd = fd;
map->cpu = cpu;
+ CPU_ZERO(&map->affinity_mask);
+
if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
&mp->auxtrace_mp, map->base, fd))
return -1;
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index aeb6942fdb00..e566c19b242b 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -38,6 +38,7 @@ struct perf_mmap {
int nr_cblocks;
} aio;
#endif
+ cpu_set_t affinity_mask;
};
/*
@@ -69,7 +70,7 @@ enum bkw_mmap_state {
};
struct mmap_params {
- int prot, mask, nr_cblocks;
+ int prot, mask, nr_cblocks, affinity;
struct auxtrace_mmap_params auxtrace_mp;
};
^ permalink raw reply related [flat|nested] 12+ messages in thread[parent not found: <afb88628-7a04-3c36-2bc9-b5f5774f8e8f@linux.intel.com>]
* Re: [PATCH v1 1/3] perf record: allocate affinity masks
[not found] ` <afb88628-7a04-3c36-2bc9-b5f5774f8e8f@linux.intel.com>
@ 2018-12-12 12:15 ` Jiri Olsa
2018-12-13 6:26 ` Alexey Budankov
0 siblings, 1 reply; 12+ messages in thread
From: Jiri Olsa @ 2018-12-12 12:15 UTC (permalink / raw)
To: Alexey Budankov
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra,
Namhyung Kim, Alexander Shishkin, Andi Kleen, linux-kernel
On Wed, Dec 12, 2018 at 10:38:23AM +0300, Alexey Budankov wrote:
>
> Allocate affinity option and masks for mmap data buffers and
> record thread as well as initialize allocated objects.
>
> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
> ---
> tools/perf/builtin-record.c | 11 ++++++++++-
> tools/perf/perf.h | 5 +++++
> tools/perf/util/evlist.c | 6 +++---
> tools/perf/util/evlist.h | 2 +-
> tools/perf/util/mmap.c | 2 ++
> tools/perf/util/mmap.h | 3 ++-
> 6 files changed, 23 insertions(+), 6 deletions(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 882285fb9f64..4979719e54ae 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -81,6 +81,7 @@ struct record {
> bool timestamp_boundary;
> struct switch_output switch_output;
> unsigned long long samples;
> + cpu_set_t affinity_mask;
> };
>
> static volatile int auxtrace_record__snapshot_started;
> @@ -533,7 +534,8 @@ static int record__mmap_evlist(struct record *rec,
>
> if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
> opts->auxtrace_mmap_pages,
> - opts->auxtrace_snapshot_mode, opts->nr_cblocks) < 0) {
> + opts->auxtrace_snapshot_mode,
> + opts->nr_cblocks, opts->affinity) < 0) {
> if (errno == EPERM) {
> pr_err("Permission error mapping pages.\n"
> "Consider increasing "
> @@ -1980,6 +1982,9 @@ int cmd_record(int argc, const char **argv)
> # undef REASON
> #endif
>
> + CPU_ZERO(&rec->affinity_mask);
> + rec->opts.affinity = PERF_AFFINITY_SYS;
> +
> rec->evlist = perf_evlist__new();
> if (rec->evlist == NULL)
> return -ENOMEM;
> @@ -2143,6 +2148,10 @@ int cmd_record(int argc, const char **argv)
> if (verbose > 0)
> pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);
>
> + pr_debug("affinity (UNSET:%d, NODE:%d, CPU:%d) = %d\n",
> + PERF_AFFINITY_SYS, PERF_AFFINITY_NODE,
> + PERF_AFFINITY_CPU, rec->opts.affinity);
please make this user friendly and display the actual string like "UNSET/NODE/CPU"
> +
> err = __cmd_record(&record, argc, argv);
> out:
> perf_evlist__delete(rec->evlist);
> diff --git a/tools/perf/perf.h b/tools/perf/perf.h
> index 388c6dd128b8..08e75815de2f 100644
> --- a/tools/perf/perf.h
> +++ b/tools/perf/perf.h
> @@ -83,8 +83,13 @@ struct record_opts {
> clockid_t clockid;
> u64 clockid_res_ns;
> int nr_cblocks;
> + int affinity;
> };
>
> +#define PERF_AFFINITY_SYS 0
> +#define PERF_AFFINITY_NODE 1
> +#define PERF_AFFINITY_CPU 2
please put those to enum
thanks,
jirka
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v1 1/3] perf record: allocate affinity masks
2018-12-12 12:15 ` Jiri Olsa
@ 2018-12-13 6:26 ` Alexey Budankov
0 siblings, 0 replies; 12+ messages in thread
From: Alexey Budankov @ 2018-12-13 6:26 UTC (permalink / raw)
To: Jiri Olsa
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra,
Namhyung Kim, Alexander Shishkin, Andi Kleen, linux-kernel
Hi,
On 12.12.2018 15:15, Jiri Olsa wrote:
> On Wed, Dec 12, 2018 at 10:38:23AM +0300, Alexey Budankov wrote:
>>
>> Allocate affinity option and masks for mmap data buffers and
>> record thread as well as initialize allocated objects.
>>
>> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
>> ---
>> tools/perf/builtin-record.c | 11 ++++++++++-
>> tools/perf/perf.h | 5 +++++
>> tools/perf/util/evlist.c | 6 +++---
>> tools/perf/util/evlist.h | 2 +-
>> tools/perf/util/mmap.c | 2 ++
>> tools/perf/util/mmap.h | 3 ++-
>> 6 files changed, 23 insertions(+), 6 deletions(-)
>>
>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>> index 882285fb9f64..4979719e54ae 100644
>> --- a/tools/perf/builtin-record.c
>> +++ b/tools/perf/builtin-record.c
>> @@ -81,6 +81,7 @@ struct record {
>> bool timestamp_boundary;
>> struct switch_output switch_output;
>> unsigned long long samples;
>> + cpu_set_t affinity_mask;
>> };
>>
>> static volatile int auxtrace_record__snapshot_started;
>> @@ -533,7 +534,8 @@ static int record__mmap_evlist(struct record *rec,
>>
>> if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
>> opts->auxtrace_mmap_pages,
>> - opts->auxtrace_snapshot_mode, opts->nr_cblocks) < 0) {
>> + opts->auxtrace_snapshot_mode,
>> + opts->nr_cblocks, opts->affinity) < 0) {
>> if (errno == EPERM) {
>> pr_err("Permission error mapping pages.\n"
>> "Consider increasing "
>> @@ -1980,6 +1982,9 @@ int cmd_record(int argc, const char **argv)
>> # undef REASON
>> #endif
>>
>> + CPU_ZERO(&rec->affinity_mask);
>> + rec->opts.affinity = PERF_AFFINITY_SYS;
>> +
>> rec->evlist = perf_evlist__new();
>> if (rec->evlist == NULL)
>> return -ENOMEM;
>> @@ -2143,6 +2148,10 @@ int cmd_record(int argc, const char **argv)
>> if (verbose > 0)
>> pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);
>>
>> + pr_debug("affinity (UNSET:%d, NODE:%d, CPU:%d) = %d\n",
>> + PERF_AFFINITY_SYS, PERF_AFFINITY_NODE,
>> + PERF_AFFINITY_CPU, rec->opts.affinity);
>
> please make this user friendly and display the actual string like "UNSET/NODE/CPU"
Implemented in v2.
>
>> +
>> err = __cmd_record(&record, argc, argv);
>> out:
>> perf_evlist__delete(rec->evlist);
>> diff --git a/tools/perf/perf.h b/tools/perf/perf.h
>> index 388c6dd128b8..08e75815de2f 100644
>> --- a/tools/perf/perf.h
>> +++ b/tools/perf/perf.h
>> @@ -83,8 +83,13 @@ struct record_opts {
>> clockid_t clockid;
>> u64 clockid_res_ns;
>> int nr_cblocks;
>> + int affinity;
>> };
>>
>> +#define PERF_AFFINITY_SYS 0
>> +#define PERF_AFFINITY_NODE 1
>> +#define PERF_AFFINITY_CPU 2
>
> please put those to enum
Implemented in v2.
Thanks,
Alexey
>
> thanks,
> jirka
>
^ permalink raw reply [flat|nested] 12+ messages in thread