* [PATCH 00/12] trace-cmd: Add trace-cmd analyze command
@ 2022-03-24 2:57 Steven Rostedt
2022-03-24 2:57 ` [PATCH 01/12] trace-cmd: Add trace-cmd analyze Steven Rostedt
` (11 more replies)
0 siblings, 12 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Add a new command called "analyze" that reads a trace.dat file and
reports useful information from it, such as:
How long the CPUs were idle.
Which tasks ran the most on each CPU.
For each task, how long its wake-up latencies were, and how long it was
blocked, preempted, or just sleeping.
And many more to come.
See the man page at the last patch for more details.
Steven Rostedt (Google) (12):
trace-cmd: Add trace-cmd analyze
trace-cmd analyze: Show what tasks are running the most
trace-cmd analyze: Show how much tasks run on each CPU
trace-cmd analyze: Use sched_switch to find comm mappings
trace-cmd analyze: Use sched_switch event to update times
trace-cmd analyze: Add tracing of tasks and their states
trace-cmd analyze: Add "idleness"
trace-cmd analyze: Track migration
trace-cmd analyze: Add wake up latency timings
trace-cmd analyze: Add counting of page faults
trace-cmd analyze: Account for dropped events
trace-cmd analyze: Add documentation
.../trace-cmd/trace-cmd-analyze.1.txt | 290 +++++
tracecmd/Makefile | 1 +
tracecmd/include/trace-local.h | 2 +
tracecmd/trace-analyze.c | 1046 +++++++++++++++++
tracecmd/trace-cmd.c | 1 +
tracecmd/trace-usage.c | 7 +
6 files changed, 1347 insertions(+)
create mode 100644 Documentation/trace-cmd/trace-cmd-analyze.1.txt
create mode 100644 tracecmd/trace-analyze.c
--
2.35.1
* [PATCH 01/12] trace-cmd: Add trace-cmd analyze
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-28 16:08 ` Kalesh Singh
2022-03-24 2:57 ` [PATCH 02/12] trace-cmd analyze: Show what tasks are running the most Steven Rostedt
` (10 subsequent siblings)
11 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Add a new trace-cmd command "analyze" that will look at a trace.dat file
and report various statistics about the trace.
Start off by showing the amount of time each CPU was idle.
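At its core the command is a loop over every record in the trace.dat file,
charging the time between events on a CPU to whatever task was current on
that CPU. Below is a much-reduced sketch of that accounting, using only the
libtracecmd/libtraceevent calls the patch itself uses
(tracecmd_read_next_data(), tracecmd_free_record(), tep_data_pid()); the
fixed-size arrays and the analyze() name are purely illustrative, as the
real code below keeps this state in dynamically grown per-CPU and per-task
hashes:

#define MAX_CPUS 64    /* illustrative; the real code grows its CPU array */

static void analyze(struct tracecmd_input *handle)
{
    struct tep_handle *tep = tracecmd_get_tep(handle);
    unsigned long long last_ts[MAX_CPUS] = {0};
    unsigned long long idle[MAX_CPUS] = {0};
    int current_pid[MAX_CPUS];
    struct tep_record *record;
    int cpu;

    for (cpu = 0; cpu < MAX_CPUS; cpu++)
        current_pid[cpu] = -1;

    while ((record = tracecmd_read_next_data(handle, NULL))) {
        int pid = tep_data_pid(tep, record);

        cpu = record->cpu;
        if (cpu >= MAX_CPUS) {    /* sketch only; real code reallocates */
            tracecmd_free_record(record);
            continue;
        }
        /* charge the gap since the last event to whoever was current */
        if (current_pid[cpu] == 0 && last_ts[cpu])
            idle[cpu] += record->ts - last_ts[cpu];
        last_ts[cpu] = record->ts;
        current_pid[cpu] = pid;
        tracecmd_free_record(record);
    }

    for (cpu = 0; cpu < MAX_CPUS; cpu++)
        if (last_ts[cpu])
            printf("CPU %d idle: %llu ns\n", cpu, idle[cpu]);
}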
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/Makefile | 1 +
tracecmd/include/trace-local.h | 2 +
tracecmd/trace-analyze.c | 508 +++++++++++++++++++++++++++++++++
tracecmd/trace-cmd.c | 1 +
4 files changed, 512 insertions(+)
create mode 100644 tracecmd/trace-analyze.c
diff --git a/tracecmd/Makefile b/tracecmd/Makefile
index 355f04723ad7..895294d09092 100644
--- a/tracecmd/Makefile
+++ b/tracecmd/Makefile
@@ -37,6 +37,7 @@ TRACE_CMD_OBJS += trace-dump.o
TRACE_CMD_OBJS += trace-clear.o
TRACE_CMD_OBJS += trace-vm.o
TRACE_CMD_OBJS += trace-convert.o
+TRACE_CMD_OBJS += trace-analyze.o
ifeq ($(VSOCK_DEFINED), 1)
TRACE_CMD_OBJS += trace-agent.o
diff --git a/tracecmd/include/trace-local.h b/tracecmd/include/trace-local.h
index bb33de06bb58..f9d0745914fe 100644
--- a/tracecmd/include/trace-local.h
+++ b/tracecmd/include/trace-local.h
@@ -112,6 +112,8 @@ void trace_dump(int argc, char **argv);
void trace_convert(int argc, char **argv);
+void trace_analyze(int argc, char **argv);
+
int trace_record_agent(struct tracecmd_msg_handle *msg_handle,
int cpus, int *fds,
int argc, char **argv, bool use_fifos,
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
new file mode 100644
index 000000000000..4a17cf12eb07
--- /dev/null
+++ b/tracecmd/trace-analyze.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2022 Google Inc, Steven Rostedt <srostedt@redhat.com>
+ */
+#define _LARGEFILE64_SOURCE
+#include <getopt.h>
+#include <stdlib.h>
+
+#include "trace-cmd-local.h"
+#include "trace-local.h"
+#include "trace-hash.h"
+#include "list.h"
+
+static bool time_in_nsecs;
+
+struct analysis_data;
+
+struct cpu_data {
+ unsigned long long start_ts;
+ unsigned long long last_ts;
+ struct analysis_data *data;
+ struct trace_hash tasks;
+ int nr_tasks;
+ int cpu;
+ int current_pid;
+ int first_pid;
+};
+
+struct analysis_data {
+ unsigned long long start_ts;
+ unsigned long long last_ts;
+ struct cpu_data *cpu_data;
+ struct trace_hash tasks;
+ int nr_tasks;
+ int allocated_cpus;
+ int cpus;
+};
+
+struct task_item {
+ unsigned long long runtime;
+ unsigned long long start_ts;
+ struct trace_hash_item hash;
+ int pid;
+};
+
+struct task_cpu_item {
+ struct trace_hash_item hash;
+ struct task_item *task;
+};
+
+#define CPU_BLOCKS 32
+
+#define task_from_hash(item) container_of(item, struct task_item, hash)
+#define task_cpu_from_hash(item) container_of(item, struct task_cpu_item, hash)
+
+static struct cpu_data *get_cpu_data(struct analysis_data *data,
+ struct tep_record *record)
+{
+ struct cpu_data *cpu_data;
+ int cpu = record->cpu;
+ int cnt;
+
+ if (cpu < data->allocated_cpus) {
+ cpu_data = &data->cpu_data[cpu];
+ if (!cpu_data->data)
+ goto init_cpu_data;
+ return cpu_data;
+ }
+
+ /* Round up to CPU_BLOCKS multiplier */
+ cnt = ((cpu + CPU_BLOCKS) / CPU_BLOCKS) * CPU_BLOCKS;
+
+ cpu_data = realloc(data->cpu_data, sizeof(*cpu_data) * cnt);
+
+ if (!cpu_data)
+ die("Allocating cpu size %d for cpu %d", cnt, cpu);
+
+ memset(cpu_data + data->allocated_cpus, 0,
+ sizeof(*cpu_data) * (cnt - data->allocated_cpus));
+
+ data->allocated_cpus = cnt;
+
+ data->cpu_data = cpu_data;
+ cpu_data += cpu;
+
+ init_cpu_data:
+ cpu_data->current_pid = -1;
+ cpu_data->cpu = cpu;
+ cpu_data->data = data;
+
+ trace_hash_init(&cpu_data->tasks, 32);
+
+ return cpu_data;
+}
+
+static int check_idle(struct cpu_data *cpu_data, int pid)
+{
+ if (pid)
+ return pid;
+
+ /*
+ * Since pid 0 is the per CPU swapper task, several of
+ * these tasks have the same pid and differ only by CPU.
+ * Set the pid that is stored in the hash to -2 - CPU id.
+ */
+
+ return -2 - cpu_data->cpu;
+}
+
+static struct task_item *get_task(struct cpu_data *cpu_data, int pid)
+{
+ struct trace_hash_item *hash;
+ struct task_item *task;
+ int find_pid;
+
+ find_pid = check_idle(cpu_data, pid);
+
+ hash = trace_hash_find(&cpu_data->data->tasks, find_pid, NULL, NULL);
+ if (!hash) {
+ task = calloc(sizeof(*task), 1);
+ if (!task)
+ die("allocating task");
+ task->pid = find_pid;
+ hash = &task->hash;
+ hash->key = find_pid;
+ cpu_data->data->nr_tasks++;
+ trace_hash_add(&cpu_data->data->tasks, hash);
+ }
+
+ return task_from_hash(hash);
+}
+
+static struct task_cpu_item *get_cpu_task(struct cpu_data *cpu_data, int pid)
+{
+ struct trace_hash_item *hash;
+ struct task_cpu_item *task;
+
+ hash = trace_hash_find(&cpu_data->tasks, pid, NULL, NULL);
+ if (!hash) {
+ task = calloc(sizeof(*task), 1);
+ if (!task)
+ die("allocating cpu task");
+ task->task = get_task(cpu_data, pid);
+ hash = &task->hash;
+ hash->key = pid;
+ cpu_data->nr_tasks++;
+ trace_hash_add(&cpu_data->tasks, hash);
+ }
+
+ return task_cpu_from_hash(hash);
+};
+
+/* Update times for a task scheduling off the CPU */
+static void update_cpu_task_times(struct cpu_data *cpu_data,
+ struct task_cpu_item *cpu_task,
+ unsigned long long ts)
+{
+ unsigned long long delta;
+ struct task_item *task = cpu_task->task;
+
+ /*
+ * If the last event was a sched switch where the previous task
+ * ran on another CPU, migrated back to this CPU, and the sched
+ * switch that scheduled it off was never recorded, the task could
+ * be misconstrued as still running on this CPU, with a start_ts
+ * that is later than the last_ts of this CPU.
+ */
+ if (ts < task->start_ts)
+ return;
+
+ delta = ts - task->start_ts;
+ task->runtime += delta;
+}
+
+static void update_pid(struct cpu_data *cpu_data,
+ struct tep_record *record, int pid)
+{
+ struct task_cpu_item *cpu_task;
+ struct task_item *task;
+ int curr_pid = cpu_data->current_pid;
+ unsigned long long delta;
+
+ cpu_task = get_cpu_task(cpu_data, curr_pid);
+ task = cpu_task->task;
+
+ update_cpu_task_times(cpu_data, cpu_task, cpu_data->last_ts);
+
+ if (!record)
+ return;
+
+ cpu_task = get_cpu_task(cpu_data, pid);
+ task = cpu_task->task;
+
+ if (record->ts < cpu_data->last_ts) {
+ tracecmd_warning("task %d start time %llu less than CPU time %llu",
+ pid, record->ts, cpu_data->last_ts);
+ } else {
+ delta = record->ts - cpu_data->last_ts;
+ task->runtime += delta;
+ }
+
+ cpu_data->last_ts = record->ts;
+
+ task->start_ts = cpu_data->last_ts;
+ cpu_data->current_pid = pid;
+}
+
+static void update_cpu_times(struct cpu_data *cpu_data,
+ struct tep_handle *tep, int pid,
+ struct tep_record *record)
+{
+ struct task_cpu_item *cpu_task;
+ struct task_item *task;
+
+ if (cpu_data->current_pid < 0) {
+ /* First time called */
+ cpu_data->start_ts = record->ts;
+ cpu_data->last_ts = record->ts;
+ cpu_data->current_pid = pid;
+ cpu_data->first_pid = pid;
+ cpu_task = get_cpu_task(cpu_data, pid);
+ task = cpu_task->task;
+ task->start_ts = record->ts;
+ if (record->ts < cpu_data->data->start_ts)
+ cpu_data->data->start_ts = record->ts;
+ return;
+ }
+
+ if (pid != cpu_data->current_pid) {
+ update_pid(cpu_data, record, pid);
+ return;
+ }
+
+ cpu_data->last_ts = record->ts;
+}
+
+static void update_first_pid(struct cpu_data *cpu_data)
+{
+ struct task_cpu_item *cpu_task;
+ struct task_item *task;
+ unsigned long long start_ts = cpu_data->data->start_ts;
+ unsigned long long delta;
+
+ cpu_task = get_cpu_task(cpu_data, cpu_data->first_pid);
+ task = cpu_task->task;
+ delta = cpu_data->start_ts - start_ts;
+ task->runtime += delta;
+ cpu_data->start_ts = start_ts;
+}
+
+static void process_cpu(struct analysis_data *data,
+ struct tep_handle *tep,
+ struct tep_record *record)
+{
+ struct cpu_data *cpu_data;
+ int pid;
+
+ pid = tep_data_pid(tep, record);
+ if (pid < 0) /* Warn? */
+ return;
+
+ cpu_data = get_cpu_data(data, record);
+ update_cpu_times(cpu_data, tep, pid, record);
+}
+
+static void print_time(unsigned long long ts, char delim)
+{
+ unsigned long long secs;
+ unsigned long long msecs;
+ unsigned long long usecs;
+ unsigned long long nsecs;
+
+ secs = ts / 1000000000;
+ ts -= secs * 1000000000;
+
+ msecs = ts / 1000000;
+ ts -= msecs * 1000000;
+
+ usecs = ts / 1000;
+ ts -= usecs * 1000;
+
+ nsecs = ts;
+
+ if (delim) {
+ printf("%6llu.%03llu%c%03llu",
+ secs, msecs, delim, usecs);
+ if (time_in_nsecs)
+ printf("%c%03llu", delim, nsecs);
+ } else {
+ printf("%6llu.%03llu%03llu",
+ secs, msecs, usecs);
+ if (time_in_nsecs)
+ printf("%03llu", nsecs);
+ }
+}
+
+static void print_total(struct tep_handle *tep, struct analysis_data *data)
+{
+ unsigned long long total_time;
+ struct trace_hash_item **bucket;
+ struct trace_hash_item *item;
+ struct task_item **idle_tasks;
+ struct task_item *task;
+ bool first = true;
+ int cpu;
+ int i = 0;
+
+ total_time = data->last_ts - data->start_ts;
+ printf("\nTotal time: ");
+ print_time(total_time, '_');
+ printf("\n");
+
+ idle_tasks = calloc(sizeof(*idle_tasks), data->allocated_cpus);
+ if (!idle_tasks)
+ die("Could not allocate idle task array");
+
+ trace_hash_for_each_bucket(bucket, &data->tasks) {
+ trace_hash_for_each_item(item, bucket) {
+ task = task_from_hash(item);
+ if (task->pid < 0) {
+ cpu = -2 - task->pid;
+ idle_tasks[cpu] = task;
+ }
+ }
+ }
+
+ for (i = 0; i < data->allocated_cpus; i++) {
+ if (!idle_tasks[i])
+ continue;
+
+ if (first) {
+ printf("\n Idle CPU\t Run time\n");
+ printf(" --------\t --------\n");
+ first = false;
+ }
+ printf("CPU %d idle:\t", i);
+ print_time(idle_tasks[i]->runtime, '_');
+ printf(" (%%%lld)\n", (idle_tasks[i]->runtime * 100) / total_time);
+ }
+ free(idle_tasks);
+}
+
+static void free_tasks(struct trace_hash *hash)
+{
+ struct trace_hash_item **bucket;
+ struct trace_hash_item *item;
+ struct task_item *task;
+
+ trace_hash_for_each_bucket(bucket, hash) {
+ trace_hash_while_item(item, bucket) {
+ task = task_from_hash(item);
+ trace_hash_del(item);
+ free(task);
+ }
+ }
+ trace_hash_free(hash);
+}
+
+static void free_cpu_tasks(struct trace_hash *hash)
+{
+ struct trace_hash_item **bucket;
+ struct trace_hash_item *item;
+ struct task_cpu_item *cpu_task;
+
+ trace_hash_for_each_bucket(bucket, hash) {
+ trace_hash_while_item(item, bucket) {
+ cpu_task = task_cpu_from_hash(item);
+ trace_hash_del(item);
+ free(cpu_task);
+ }
+ }
+ trace_hash_free(hash);
+}
+
+static void free_cpus(struct analysis_data *data)
+{
+ struct cpu_data *cpu_data;
+ int i;
+
+ for (i = 0; i < data->allocated_cpus; i++) {
+ cpu_data = &data->cpu_data[i];
+ if (!cpu_data->data)
+ continue;
+ free_cpu_tasks(&cpu_data->tasks);
+ }
+ free(data->cpu_data);
+}
+
+static void do_trace_analyze(struct tracecmd_input *handle)
+{
+ struct tep_handle *tep = tracecmd_get_tep(handle);
+ struct tep_record *record;
+ struct analysis_data data;
+ struct cpu_data *cpu_data;
+ int i;
+
+ memset(&data, 0, sizeof(data));
+
+ trace_hash_init(&data.tasks, 128);
+
+ /* Set to a very large number */
+ data.start_ts = -1ULL;
+
+ do {
+ record = tracecmd_read_next_data(handle, NULL);
+ if (record)
+ process_cpu(&data, tep, record);
+ tracecmd_free_record(record);
+ } while (record);
+
+ for (i = 0; i < data.allocated_cpus; i++) {
+ cpu_data = &data.cpu_data[i];
+ if (!cpu_data->data || !cpu_data->nr_tasks)
+ continue;
+ if (cpu_data->last_ts > data.last_ts)
+ data.last_ts = cpu_data->last_ts;
+ }
+
+ for (i = 0; i < data.allocated_cpus; i++) {
+ cpu_data = &data.cpu_data[i];
+ if (!cpu_data->data || !cpu_data->nr_tasks)
+ continue;
+ cpu_data->last_ts = data.last_ts;
+ update_pid(cpu_data, NULL, -1);
+ update_first_pid(cpu_data);
+ }
+
+ print_total(tep, &data);
+
+ free_cpus(&data);
+ free_tasks(&data.tasks);
+}
+
+void trace_analyze(int argc, char **argv)
+{
+ struct tracecmd_input *handle;
+ const char *input_file = NULL;
+ int instances;
+ int ret;
+
+ for (;;) {
+ int c;
+
+ c = getopt(argc-1, argv+1, "+hti:");
+ if (c == -1)
+ break;
+ switch (c) {
+ case 'h':
+ usage(argv);
+ break;
+ case 'i':
+ if (input_file)
+ die("Only one input for historgram");
+ input_file = optarg;
+ break;
+ case 't':
+ time_in_nsecs = true;
+ break;
+ default:
+ usage(argv);
+ }
+ }
+
+ if ((argc - optind) >= 2) {
+ if (input_file)
+ usage(argv);
+ input_file = argv[optind + 1];
+ }
+
+ if (!input_file)
+ input_file = DEFAULT_INPUT_FILE;
+
+ handle = tracecmd_alloc(input_file, 0);
+ if (!handle)
+ die("can't open %s\n", input_file);
+
+ ret = tracecmd_read_headers(handle, 0);
+ if (ret)
+ return;
+
+ ret = tracecmd_init_data(handle);
+ if (ret < 0)
+ die("failed to init data");
+
+ if (ret > 0)
+ die("trace-cmd hist does not work with latency traces\n");
+
+ do_trace_analyze(handle);
+
+ instances = tracecmd_buffer_instances(handle);
+ if (instances) {
+ struct tracecmd_input *new_handle;
+ int i;
+
+ for (i = 0; i < instances; i++) {
+ new_handle = tracecmd_buffer_instance_handle(handle, i);
+ if (!new_handle) {
+ warning("could not retrieve handle %d", i);
+ continue;
+ }
+ do_trace_analyze(new_handle);
+ tracecmd_close(new_handle);
+ }
+ }
+
+ tracecmd_close(handle);
+}
diff --git a/tracecmd/trace-cmd.c b/tracecmd/trace-cmd.c
index a83a8d0bb9e4..d6e1f9dacc3c 100644
--- a/tracecmd/trace-cmd.c
+++ b/tracecmd/trace-cmd.c
@@ -134,6 +134,7 @@ struct command commands[] = {
{"help", trace_usage},
{"dump", trace_dump},
{"convert", trace_convert},
+ {"analyze", trace_analyze},
{"-h", trace_usage},
};
--
2.35.1
* [PATCH 02/12] trace-cmd analyze: Show what tasks are running the most
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
2022-03-24 2:57 ` [PATCH 01/12] trace-cmd: Add trace-cmd analyze Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-24 2:57 ` [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU Steven Rostedt
` (9 subsequent siblings)
11 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Print out the tasks that are running the most on the system.
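The report is produced by flattening the global task hash into an array,
sorting it by accumulated runtime, and printing each task's share of the
traced time. A condensed sketch of that pattern (print_top_tasks() is an
illustrative name; cmp_tasks(), the hash helpers, and the struct fields
are the ones in trace-analyze.c, and the real code below additionally
separates out the per-CPU idle entries):

static void print_top_tasks(struct analysis_data *data,
                            unsigned long long total_time)
{
    struct trace_hash_item **bucket;
    struct trace_hash_item *item;
    struct task_item **tasks;
    int n = 0, i;

    tasks = malloc(sizeof(*tasks) * data->nr_tasks);
    if (!tasks)
        die("Could not allocate task array");

    /* flatten the hash into an array */
    trace_hash_for_each_bucket(bucket, &data->tasks) {
        trace_hash_for_each_item(item, bucket)
            tasks[n++] = task_from_hash(item);
    }

    /* sort by runtime, longest runner first */
    qsort(tasks, n, sizeof(*tasks), cmp_tasks);

    for (i = 0; i < n; i++)
        printf("%8d\t%3llu%%\n", tasks[i]->pid,
               (tasks[i]->runtime * 100) / total_time);

    free(tasks);
}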
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 39 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 38 insertions(+), 1 deletion(-)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index 4a17cf12eb07..2211079668e4 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -264,6 +264,17 @@ static void process_cpu(struct analysis_data *data,
update_cpu_times(cpu_data, tep, pid, record);
}
+static int cmp_tasks(const void *A, const void *B)
+{
+ struct task_item * const *a = A;
+ struct task_item * const *b = B;
+
+ if ((*a)->runtime > (*b)->runtime)
+ return -1;
+
+ return (*a)->runtime < (*b)->runtime;
+}
+
static void print_time(unsigned long long ts, char delim)
{
unsigned long long secs;
@@ -301,8 +312,10 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
struct trace_hash_item **bucket;
struct trace_hash_item *item;
struct task_item **idle_tasks;
+ struct task_item **tasks;
struct task_item *task;
bool first = true;
+ int nr_tasks;
int cpu;
int i = 0;
@@ -311,6 +324,10 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
print_time(total_time, '_');
printf("\n");
+ tasks = malloc(sizeof(*tasks) * data->nr_tasks);
+ if (!tasks)
+ die("Could not allocate task array");
+
idle_tasks = calloc(sizeof(*idle_tasks), data->allocated_cpus);
if (!idle_tasks)
die("Could not allocate idle task array");
@@ -321,9 +338,13 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
if (task->pid < 0) {
cpu = -2 - task->pid;
idle_tasks[cpu] = task;
- }
+ } else
+ tasks[i++] = task;
}
}
+ nr_tasks = i;
+
+ qsort(tasks, nr_tasks, sizeof(*tasks), cmp_tasks);
for (i = 0; i < data->allocated_cpus; i++) {
if (!idle_tasks[i])
@@ -339,6 +360,22 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
printf(" (%%%lld)\n", (idle_tasks[i]->runtime * 100) / total_time);
}
free(idle_tasks);
+
+ printf("\n");
+ for (i = 0; i < nr_tasks; i++) {
+ if (!i) {
+ printf(" Task name PID \t Run time\n");
+ printf(" --------- --- \t --------\n");
+ }
+ printf("%16s %8d\t",
+ tep_data_comm_from_pid(tep, tasks[i]->pid),
+ tasks[i]->pid);
+ print_time(tasks[i]->runtime, '_');
+ printf(" (%%%lld)\n", (tasks[i]->runtime * 100) / total_time);
+ }
+ free(tasks);
+
+ printf("\n");
}
static void free_tasks(struct trace_hash *hash)
--
2.35.1
* [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
2022-03-24 2:57 ` [PATCH 01/12] trace-cmd: Add trace-cmd analyze Steven Rostedt
2022-03-24 2:57 ` [PATCH 02/12] trace-cmd analyze: Show what tasks are running the most Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-25 19:36 ` John Keeping
2022-03-24 2:57 ` [PATCH 04/12] trace-cmd analyze: Use sched_switch to find comm mappings Steven Rostedt
` (8 subsequent siblings)
11 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Display, for each CPU that was traced, the amount of time tasks ran on
it, listing the tasks from the longest runner to the least.
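Per-CPU accounting reuses the global task entries: each CPU keeps its own
small hash of wrapper items that point at the shared task, so one switch
can bump both the global and the per-CPU runtime. A reduced sketch of the
relationship between the two structures used below (account() is an
illustrative helper name):

struct task_item {                      /* one per task, global */
    unsigned long long runtime;         /* total across all CPUs */
    int pid;
    /* ... */
};

struct task_cpu_item {                  /* one per task, per CPU */
    unsigned long long runtime;         /* time spent on this CPU only */
    struct task_item *task;             /* shared global entry */
};

static void account(struct task_cpu_item *cpu_task, unsigned long long delta)
{
    cpu_task->runtime += delta;         /* per-CPU total */
    cpu_task->task->runtime += delta;   /* global total */
}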
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 82 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 82 insertions(+)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index 2211079668e4..43e9ffa95687 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -44,6 +44,7 @@ struct task_item {
};
struct task_cpu_item {
+ unsigned long long runtime;
struct trace_hash_item hash;
struct task_item *task;
};
@@ -171,6 +172,7 @@ static void update_cpu_task_times(struct cpu_data *cpu_data,
delta = ts - task->start_ts;
task->runtime += delta;
+ cpu_task->runtime += delta;
}
static void update_pid(struct cpu_data *cpu_data,
@@ -198,6 +200,7 @@ static void update_pid(struct cpu_data *cpu_data,
} else {
delta = record->ts - cpu_data->last_ts;
task->runtime += delta;
+ cpu_task->runtime += delta;
}
cpu_data->last_ts = record->ts;
@@ -246,6 +249,7 @@ static void update_first_pid(struct cpu_data *cpu_data)
task = cpu_task->task;
delta = cpu_data->start_ts - start_ts;
task->runtime += delta;
+ cpu_task->runtime += delta;
cpu_data->start_ts = start_ts;
}
@@ -275,6 +279,17 @@ static int cmp_tasks(const void *A, const void *B)
return (*a)->runtime < (*b)->runtime;
}
+static int cmp_cpu_tasks(const void *A, const void *B)
+{
+ struct task_cpu_item * const *a = A;
+ struct task_cpu_item * const *b = B;
+
+ if ((*a)->runtime > (*b)->runtime)
+ return -1;
+
+ return (*a)->runtime < (*b)->runtime;
+}
+
static void print_time(unsigned long long ts, char delim)
{
unsigned long long secs;
@@ -306,6 +321,67 @@ static void print_time(unsigned long long ts, char delim)
}
}
+static void print_cpu_data(struct tep_handle *tep, struct cpu_data *cpu_data)
+{
+ unsigned long long total_time;
+ struct trace_hash_item **bucket;
+ struct trace_hash_item *item;
+ struct task_cpu_item **cpu_tasks;
+ struct task_cpu_item *cpu_task;
+ struct task_item *idle_task = NULL;
+ struct task_item *task;
+ struct analysis_data *data;
+ int nr_tasks;
+ int i = 0;
+
+ data = cpu_data->data;
+ total_time = data->last_ts - data->start_ts;
+
+ printf("\nCPU %d\n", cpu_data->cpu);
+ printf("-------\n");
+
+ cpu_tasks = malloc(sizeof(*cpu_tasks) * cpu_data->nr_tasks);
+
+ if (!cpu_tasks)
+ die("Could not allocate task array");
+
+ trace_hash_for_each_bucket(bucket, &cpu_data->tasks) {
+ trace_hash_for_each_item(item, bucket) {
+ cpu_task = task_cpu_from_hash(item);
+ if (cpu_task->task->pid <= 0)
+ idle_task = cpu_task->task;
+ else
+ cpu_tasks[i++] = cpu_task;
+ }
+ }
+ nr_tasks = i;
+
+ if (idle_task) {
+ printf("idle:\t");
+ print_time(idle_task->runtime, '_');
+ printf(" (%%%lld)\n", (idle_task->runtime * 100) / total_time);
+ } else {
+ printf("Never idle!\n");
+ }
+
+ qsort(cpu_tasks, nr_tasks, sizeof(*cpu_tasks), cmp_cpu_tasks);
+
+ for (i = 0; i < nr_tasks; i++) {
+ task = cpu_tasks[i]->task;
+
+ if (!i) {
+ printf(" Task name PID \t Run time\n");
+ printf(" --------- --- \t --------\n");
+ }
+ printf("%16s %8d\t",
+ tep_data_comm_from_pid(tep, task->pid),
+ task->pid);
+ print_time(cpu_tasks[i]->runtime, '_');
+ printf(" (%%%lld)\n", (task->runtime * 100) / total_time);
+ }
+ free(cpu_tasks);
+}
+
static void print_total(struct tep_handle *tep, struct analysis_data *data)
{
unsigned long long total_time;
@@ -376,6 +452,12 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
free(tasks);
printf("\n");
+
+ for (i = 0; i < data->allocated_cpus; i++) {
+ if (!data->cpu_data[i].data)
+ continue;
+ print_cpu_data(tep, &data->cpu_data[i]);
+ }
}
static void free_tasks(struct trace_hash *hash)
--
2.35.1
* [PATCH 04/12] trace-cmd analyze: Use sched_switch to find comm mappings
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (2 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-24 2:57 ` [PATCH 05/12] trace-cmd analyze: Use sched_switch event to update times Steven Rostedt
` (7 subsequent siblings)
11 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
The saved_cmdlines file may miss a lot of comms, especially if tracing
ran for a while. To learn which comm maps to which PID, use the
sched_switch event if it is present, update the task with the comm found
there, and print that. This way we get more real names instead of just
"<...>".
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 62 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 60 insertions(+), 2 deletions(-)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index 43e9ffa95687..dec705ce8e15 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -29,6 +29,10 @@ struct cpu_data {
struct analysis_data {
unsigned long long start_ts;
unsigned long long last_ts;
+ struct tep_event *switch_event;
+ struct tep_format_field *prev_comm;
+ struct tep_format_field *next_comm;
+ struct tep_format_field *next_pid;
struct cpu_data *cpu_data;
struct trace_hash tasks;
int nr_tasks;
@@ -39,6 +43,7 @@ struct analysis_data {
struct task_item {
unsigned long long runtime;
unsigned long long start_ts;
+ char *comm;
struct trace_hash_item hash;
int pid;
};
@@ -253,11 +258,51 @@ static void update_first_pid(struct cpu_data *cpu_data)
cpu_data->start_ts = start_ts;
}
+static void process_switch(struct analysis_data *data,
+ struct tep_handle *tep, int pid,
+ struct tep_record *record)
+{
+ struct cpu_data *cpu_data = &data->cpu_data[record->cpu];
+ struct task_cpu_item *cpu_task;
+ struct task_item *task;
+ const char *comm;
+
+ cpu_task = get_cpu_task(cpu_data, pid);
+ task = cpu_task->task;
+
+ /* Fill in missing comms */
+ if (pid && data->prev_comm && !task->comm) {
+ comm = (char *)(record->data + data->prev_comm->offset);
+ task->comm = strdup(comm);
+ }
+
+ if (data->next_pid) {
+ unsigned long long val;
+
+ tep_read_number_field(data->next_pid, record->data, &val);
+ pid = val;
+ cpu_task = get_cpu_task(cpu_data, pid);
+ task = cpu_task->task;
+
+ /* Fill in missing comms */
+ if (pid && data->next_comm && !task->comm) {
+ comm = (char *)(record->data + data->next_comm->offset);
+ task->comm = strdup(comm);
+ }
+ }
+}
+
+static bool match_type(int type, struct tep_event *event)
+{
+ return event && type == event->id;
+}
+
static void process_cpu(struct analysis_data *data,
struct tep_handle *tep,
struct tep_record *record)
{
struct cpu_data *cpu_data;
+ int type;
int pid;
pid = tep_data_pid(tep, record);
@@ -266,6 +311,10 @@ static void process_cpu(struct analysis_data *data,
cpu_data = get_cpu_data(data, record);
update_cpu_times(cpu_data, tep, pid, record);
+
+ type = tep_data_type(tep, record);
+ if (match_type(type, data->switch_event))
+ process_switch(data, tep, pid, record);
}
static int cmp_tasks(const void *A, const void *B)
@@ -374,7 +423,7 @@ static void print_cpu_data(struct tep_handle *tep, struct cpu_data *cpu_data)
printf(" --------- --- \t --------\n");
}
printf("%16s %8d\t",
- tep_data_comm_from_pid(tep, task->pid),
+ task->comm ? : tep_data_comm_from_pid(tep, task->pid),
task->pid);
print_time(cpu_tasks[i]->runtime, '_');
printf(" (%%%lld)\n", (task->runtime * 100) / total_time);
@@ -444,7 +493,7 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
printf(" --------- --- \t --------\n");
}
printf("%16s %8d\t",
- tep_data_comm_from_pid(tep, tasks[i]->pid),
+ tasks[i]->comm ? : tep_data_comm_from_pid(tep, tasks[i]->pid),
tasks[i]->pid);
print_time(tasks[i]->runtime, '_');
printf(" (%%%lld)\n", (tasks[i]->runtime * 100) / total_time);
@@ -470,6 +519,7 @@ static void free_tasks(struct trace_hash *hash)
trace_hash_while_item(item, bucket) {
task = task_from_hash(item);
trace_hash_del(item);
+ free(task->comm);
free(task);
}
}
@@ -518,9 +568,17 @@ static void do_trace_analyze(struct tracecmd_input *handle)
trace_hash_init(&data.tasks, 128);
+ data.switch_event = tep_find_event_by_name(tep, "sched", "sched_switch");
+
/* Set to a very large number */
data.start_ts = -1ULL;
+ if (data.switch_event) {
+ data.next_pid = tep_find_field(data.switch_event, "next_pid");
+ data.next_comm = tep_find_field(data.switch_event, "next_comm");
+ data.prev_comm = tep_find_field(data.switch_event, "prev_comm");
+ }
+
do {
record = tracecmd_read_next_data(handle, NULL);
if (record)
--
2.35.1
* [PATCH 05/12] trace-cmd analyze: Use sched_switch event to update times
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (3 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 04/12] trace-cmd analyze: Use sched_switch to find comm mappings Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-24 2:57 ` [PATCH 06/12] trace-cmd analyze: Add tracing of tasks and their states Steven Rostedt
` (6 subsequent siblings)
11 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
If the sched_switch event is available, use it to figure out how long a
task runs on the CPU. It gives better accuracy than just inferring the
run time from the timestamps of whatever other events happen to occur.
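With sched_switch, the end of a task's time slice is known exactly: the
switch closes out the previous task's interval at the event's timestamp and
starts the next task's interval at the same point. A stripped-down sketch
of that accounting (on_switch() is an illustrative name; the real code does
this through update_cpu_task_times() and the hash lookups):

static void on_switch(struct task_item *prev, struct task_item *next,
                      unsigned long long ts)
{
    /* prev ran from its start_ts until this switch */
    if (ts >= prev->start_ts)
        prev->runtime += ts - prev->start_ts;

    /* next starts running exactly at the switch */
    next->start_ts = ts;
}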
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index dec705ce8e15..56123f9b4f86 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -270,6 +270,8 @@ static void process_switch(struct analysis_data *data,
cpu_task = get_cpu_task(cpu_data, pid);
task = cpu_task->task;
+ update_cpu_task_times(cpu_data, cpu_task, record->ts);
+
/* Fill in missing comms */
if (pid && data->prev_comm && !task->comm) {
comm = (char *)(record->data + data->prev_comm->offset);
@@ -283,6 +285,8 @@ static void process_switch(struct analysis_data *data,
pid = val;
cpu_task = get_cpu_task(cpu_data, pid);
task = cpu_task->task;
+ task->start_ts = record->ts;
+ cpu_data->current_pid = pid;
/* Fill in missing comms */
if (pid && data->next_comm && !task->comm) {
--
2.35.1
* [PATCH 06/12] trace-cmd analyze: Add tracing of tasks and their states
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (4 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 05/12] trace-cmd analyze: Use sched_switch event to update times Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-25 19:37 ` John Keeping
2022-03-24 2:57 ` [PATCH 07/12] trace-cmd analyze: Add "idleness" Steven Rostedt
` (5 subsequent siblings)
11 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
If sched_switch is found and has the prev_state field, record the state a
task was in when it was scheduled out, and keep track of the time it spent
in that state and the number of times it was scheduled out in that state.
Print the total time, the average time, and the count for each state, as
well as the time stamp of where the longest stretch in that state began.
Currently the only states that are supported are "preempted" (the task was
still in the running state), "blocked" (state uninterruptible), "sleeping"
(state interruptible), and "other" (all other states).
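Each state gets its own sched_timings bucket on the task. When the task is
switched out, the low bits of prev_state select the bucket and the event's
timestamp is stamped as the start of the stretch; when the task is switched
back in, the elapsed time is folded into that bucket's total, count, and
longest-stretch fields. A sketch of the state-to-bucket mapping
(state_bucket() is an illustrative name; the mask and buckets match the
code below):

static struct sched_timings *state_bucket(struct task_item *task,
                                          unsigned long long prev_state)
{
    switch (prev_state & 0x1f) {
    case 0x0: return &task->preempt;    /* still runnable: preempted */
    case 0x1: return &task->sleep;      /* TASK_INTERRUPTIBLE */
    case 0x2: return &task->blocked;    /* TASK_UNINTERRUPTIBLE */
    default:  return &task->other;      /* everything else */
    }
}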
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 143 +++++++++++++++++++++++++++++++++++++--
1 file changed, 139 insertions(+), 4 deletions(-)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index 56123f9b4f86..ee4f71a7da26 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -31,6 +31,7 @@ struct analysis_data {
unsigned long long last_ts;
struct tep_event *switch_event;
struct tep_format_field *prev_comm;
+ struct tep_format_field *prev_state;
struct tep_format_field *next_comm;
struct tep_format_field *next_pid;
struct cpu_data *cpu_data;
@@ -40,12 +41,25 @@ struct analysis_data {
int cpus;
};
+struct sched_timings {
+ unsigned long long last;
+ unsigned long long total;
+ unsigned long long nr;
+ unsigned long long longest;
+ unsigned long long longest_ts;
+};
+
struct task_item {
unsigned long long runtime;
unsigned long long start_ts;
+ struct sched_timings preempt;
+ struct sched_timings sleep;
+ struct sched_timings blocked;
+ struct sched_timings other;
char *comm;
struct trace_hash_item hash;
int pid;
+ int last_state;
};
struct task_cpu_item {
@@ -132,6 +146,7 @@ static struct task_item *get_task(struct cpu_data *cpu_data, int pid)
hash->key = find_pid;
cpu_data->data->nr_tasks++;
trace_hash_add(&cpu_data->data->tasks, hash);
+ task->last_state = -1;
}
return task_from_hash(hash);
@@ -157,6 +172,19 @@ static struct task_cpu_item *get_cpu_task(struct cpu_data *cpu_data, int pid)
return task_cpu_from_hash(hash);
};
+static void update_sched_timings (struct sched_timings *time, unsigned long long ts)
+{
+ unsigned long long delta;
+
+ delta = ts - time->last;
+ time->total += delta;
+ if (delta > time->longest) {
+ time->longest = delta;
+ time->longest_ts = time->last;
+ }
+ time->nr++;
+}
+
/* Update times for a task scheduling off the CPU */
static void update_cpu_task_times(struct cpu_data *cpu_data,
struct task_cpu_item *cpu_task,
@@ -265,6 +293,7 @@ static void process_switch(struct analysis_data *data,
struct cpu_data *cpu_data = &data->cpu_data[record->cpu];
struct task_cpu_item *cpu_task;
struct task_item *task;
+ unsigned long long val;
const char *comm;
cpu_task = get_cpu_task(cpu_data, pid);
@@ -273,9 +302,29 @@ static void process_switch(struct analysis_data *data,
update_cpu_task_times(cpu_data, cpu_task, record->ts);
/* Fill in missing comms */
- if (pid && data->prev_comm && !task->comm) {
- comm = (char *)(record->data + data->prev_comm->offset);
- task->comm = strdup(comm);
+ if (pid) {
+ if (data->prev_state) {
+ tep_read_number_field(data->prev_state, record->data, &val);
+ switch (val & 0x1f) {
+ case 0:
+ task->preempt.last = record->ts;
+ break;
+ case 0x1:
+ task->sleep.last = record->ts;
+ break;
+ case 0x2:
+ task->blocked.last = record->ts;
+ break;
+ default:
+ task->other.last = record->ts;
+ }
+ task->last_state = val & 0x1f;
+ }
+
+ if (data->prev_comm && !task->comm) {
+ comm = (char *)(record->data + data->prev_comm->offset);
+ task->comm = strdup(comm);
+ }
}
if (data->next_pid) {
@@ -288,6 +337,24 @@ static void process_switch(struct analysis_data *data,
task->start_ts = record->ts;
cpu_data->current_pid = pid;
+ switch (task->last_state) {
+ case -1:
+ /* First time seen */
+ break;
+ case 0:
+ update_sched_timings(&task->preempt, record->ts);
+ break;
+ case 0x1:
+ update_sched_timings(&task->sleep, record->ts);
+ break;
+ case 0x2:
+ update_sched_timings(&task->blocked, record->ts);
+ break;
+ default:
+ update_sched_timings(&task->other, record->ts);
+ }
+ task->last_state = val & 0x1f;
+
/* Fill in missing comms */
if (pid && data->next_comm && !task->comm) {
comm = (char *)(record->data + data->next_comm->offset);
@@ -343,6 +410,17 @@ static int cmp_cpu_tasks(const void *A, const void *B)
return (*a)->runtime < (*b)->runtime;
}
+static int cmp_task_pids(const void *A, const void *B)
+{
+ struct task_item * const *a = A;
+ struct task_item * const *b = B;
+
+ if ((*a)->pid < (*b)->pid)
+ return -1;
+
+ return (*a)->pid > (*b)->pid;
+}
+
static void print_time(unsigned long long ts, char delim)
{
unsigned long long secs;
@@ -374,6 +452,34 @@ static void print_time(unsigned long long ts, char delim)
}
}
+static void print_sched_timings(const char *label, struct sched_timings *time)
+{
+ unsigned long long avg;
+ int n;
+
+ if (!time->nr)
+ return;
+
+ printf("%s:\t", label);
+ print_time(time->total, 0);
+ n = printf(" (%llu)", time->nr);
+ if (n < 8)
+ printf("%*.s", 8 - n, "");
+ avg = time->total / time->nr;
+ print_time(avg, 0);
+ if (time_in_nsecs)
+ printf(" ");
+ else
+ printf(" ");
+ print_time(time->longest, 0);
+ if (time_in_nsecs)
+ printf(" ");
+ else
+ printf(" ");
+ print_time(time->longest_ts, 0);
+ printf("\n");
+}
+
static void print_cpu_data(struct tep_handle *tep, struct cpu_data *cpu_data)
{
unsigned long long total_time;
@@ -435,6 +541,27 @@ static void print_cpu_data(struct tep_handle *tep, struct cpu_data *cpu_data)
free(cpu_tasks);
}
+static void print_task(struct tep_handle *tep, struct task_item *task)
+{
+ const char *a = "";
+
+ if (time_in_nsecs)
+ a = " ";
+
+ printf("\nTask: %d : %s:\n",
+ task->pid , task->comm ? : tep_data_comm_from_pid(tep, task->pid));
+ printf("Runtime: ");
+ print_time(task->runtime, '_');
+ printf("\nType Total %s(cnt) Avg Longest Where\n",
+ a);
+ printf("---- ----- %s----- --- ------- -----\n",
+ a);
+ print_sched_timings("Preempted", &task->preempt);
+ print_sched_timings("Blocked", &task->blocked);
+ print_sched_timings("Sleeping", &task->sleep);
+ print_sched_timings("Other", &task->other);
+}
+
static void print_total(struct tep_handle *tep, struct analysis_data *data)
{
unsigned long long total_time;
@@ -502,7 +629,6 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
print_time(tasks[i]->runtime, '_');
printf(" (%%%lld)\n", (tasks[i]->runtime * 100) / total_time);
}
- free(tasks);
printf("\n");
@@ -511,6 +637,14 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
continue;
print_cpu_data(tep, &data->cpu_data[i]);
}
+
+ qsort(tasks, nr_tasks, sizeof(*tasks), cmp_task_pids);
+
+ printf("\n");
+ for (i = 0; i < nr_tasks; i++) {
+ print_task(tep, tasks[i]);
+ }
+ free(tasks);
}
static void free_tasks(struct trace_hash *hash)
@@ -581,6 +715,7 @@ static void do_trace_analyze(struct tracecmd_input *handle)
data.next_pid = tep_find_field(data.switch_event, "next_pid");
data.next_comm = tep_find_field(data.switch_event, "next_comm");
data.prev_comm = tep_find_field(data.switch_event, "prev_comm");
+ data.prev_state = tep_find_field(data.switch_event, "prev_state");
}
do {
--
2.35.1
* [PATCH 07/12] trace-cmd analyze: Add "idleness"
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (5 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 06/12] trace-cmd analyze: Add tracing of tasks and their states Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-24 2:57 ` [PATCH 08/12] trace-cmd analyze: Track migration Steven Rostedt
` (4 subsequent siblings)
11 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Report the "idleness" of a CPU. How long it was idle, the longest time it
was idle, the average time it is idle, the number of times it was idle,
and where the longest time it was idle.
Note, anytime there is an event, the CPU is considered "not idle", that
is, it was woken up.
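Idleness is tracked with the same sched_timings machinery, using the idle
(swapper) task's "other" bucket: when the swapper becomes current on a CPU
the timestamp is stamped, and the first event showing a non-idle task
closes the stretch. A simplified sketch (track_idle() is an illustrative
name and assumes cpu->idle_task was already set; the real
update_idle_task() below also handles that first sighting):

static void track_idle(struct cpu_data *cpu, struct task_item *task,
                       unsigned long long ts)
{
    bool idle_now = task && task->pid <= 0;    /* the swapper */

    if (idle_now && !cpu->last_idle)
        cpu->idle_task->other.last = ts;                  /* stretch begins */
    else if (!idle_now && cpu->last_idle)
        update_sched_timings(&cpu->idle_task->other, ts); /* stretch ends */

    cpu->last_idle = idle_now;
}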
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 87 +++++++++++++++++++++++++++++++++++-----
1 file changed, 77 insertions(+), 10 deletions(-)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index ee4f71a7da26..de776b76c343 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -14,16 +14,19 @@
static bool time_in_nsecs;
struct analysis_data;
+struct task_item;
struct cpu_data {
unsigned long long start_ts;
unsigned long long last_ts;
+ struct task_item *idle_task;
struct analysis_data *data;
struct trace_hash tasks;
int nr_tasks;
int cpu;
int current_pid;
int first_pid;
+ bool last_idle;
};
struct analysis_data {
@@ -185,6 +188,31 @@ static void update_sched_timings (struct sched_timings *time, unsigned long long
time->nr++;
}
+static void update_idle_task(struct cpu_data *cpu_data, struct task_item *task,
+ unsigned long long ts)
+{
+ if (!cpu_data->idle_task) {
+ if (task && task->pid <= 0) {
+ cpu_data->idle_task = task;
+ task->other.last = ts;
+ cpu_data->last_idle = true;
+ }
+ return;
+ }
+ if (!task || task->pid > 0) {
+ if (cpu_data->last_idle)
+ update_sched_timings(&cpu_data->idle_task->other, ts);
+
+ cpu_data->last_idle = false;
+ return;
+ }
+ if (cpu_data->last_idle)
+ update_sched_timings(&task->other, ts);
+
+ cpu_data->last_idle = true;
+ task->other.last = ts;
+}
+
/* Update times for a task scheduling off the CPU */
static void update_cpu_task_times(struct cpu_data *cpu_data,
struct task_cpu_item *cpu_task,
@@ -203,6 +231,8 @@ static void update_cpu_task_times(struct cpu_data *cpu_data,
if (ts < task->start_ts)
return;
+ update_idle_task(cpu_data, task, ts);
+
delta = ts - task->start_ts;
task->runtime += delta;
cpu_task->runtime += delta;
@@ -227,6 +257,8 @@ static void update_pid(struct cpu_data *cpu_data,
cpu_task = get_cpu_task(cpu_data, pid);
task = cpu_task->task;
+ update_idle_task(cpu_data, task, record->ts);
+
if (record->ts < cpu_data->last_ts) {
+ tracecmd_warning("task %d start time %llu less than CPU time %llu",
pid, record->ts, cpu_data->last_ts);
@@ -258,6 +290,7 @@ static void update_cpu_times(struct cpu_data *cpu_data,
cpu_task = get_cpu_task(cpu_data, pid);
task = cpu_task->task;
task->start_ts = record->ts;
+ update_idle_task(cpu_data, task, record->ts);
if (record->ts < cpu_data->data->start_ts)
cpu_data->data->start_ts = record->ts;
return;
@@ -268,6 +301,8 @@ static void update_cpu_times(struct cpu_data *cpu_data,
return;
}
+ update_idle_task(cpu_data, !pid ? cpu_data->idle_task : NULL, record->ts);
+
cpu_data->last_ts = record->ts;
}
@@ -283,6 +318,17 @@ static void update_first_pid(struct cpu_data *cpu_data)
delta = cpu_data->start_ts - start_ts;
task->runtime += delta;
cpu_task->runtime += delta;
+
+ /* Handle idle timings if it was the first task */
+ if (task->pid <= 0) {
+ task->other.total += delta;
+ if (delta > task->other.longest) {
+ task->other.longest = delta;
+ task->other.longest_ts = cpu_data->start_ts;
+ }
+ task->other.nr++;
+ }
+
cpu_data->start_ts = start_ts;
}
@@ -337,6 +383,8 @@ static void process_switch(struct analysis_data *data,
task->start_ts = record->ts;
cpu_data->current_pid = pid;
+ update_idle_task(cpu_data, task, record->ts);
+
switch (task->last_state) {
case -1:
/* First time seen */
@@ -452,6 +500,27 @@ static void print_time(unsigned long long ts, char delim)
}
}
+static void print_timings_title(const char *label)
+{
+ const char *a = "";
+ int len;
+ int i;
+
+ if (time_in_nsecs)
+ a = " ";
+
+ len = snprintf(NULL, 0, "%s", label);
+
+ printf("%s%*sTotal %s(cnt) Avg Longest Where\n",
+ label, 21 - len, "", a);
+
+ for (i = 0; i < len; i++)
+ printf("-");
+
+ printf("%*s----- %s----- --- ------- -----\n",
+ 21 - len, "", a);
+}
+
static void print_sched_timings(const char *label, struct sched_timings *time)
{
unsigned long long avg;
@@ -460,7 +529,10 @@ static void print_sched_timings(const char *label, struct sched_timings *time)
if (!time->nr)
return;
- printf("%s:\t", label);
+ if (label)
+ printf("%s:%*s", label, 15 - (int)strlen(label), "");
+ else
+ printf("\t\t");
print_time(time->total, 0);
n = printf(" (%llu)", time->nr);
if (n < 8)
@@ -519,6 +591,8 @@ static void print_cpu_data(struct tep_handle *tep, struct cpu_data *cpu_data)
printf("idle:\t");
print_time(idle_task->runtime, '_');
printf(" (%%%lld)\n", (idle_task->runtime * 100) / total_time);
+ print_timings_title("Idleness");
+ print_sched_timings(NULL, &idle_task->other);
} else {
printf("Never idle!\n");
}
@@ -543,19 +617,12 @@ static void print_cpu_data(struct tep_handle *tep, struct cpu_data *cpu_data)
static void print_task(struct tep_handle *tep, struct task_item *task)
{
- const char *a = "";
-
- if (time_in_nsecs)
- a = " ";
-
printf("\nTask: %d : %s:\n",
task->pid , task->comm ? : tep_data_comm_from_pid(tep, task->pid));
printf("Runtime: ");
print_time(task->runtime, '_');
- printf("\nType Total %s(cnt) Avg Longest Where\n",
- a);
- printf("---- ----- %s----- --- ------- -----\n",
- a);
+ printf("\n");
+ print_timings_title("Type");
print_sched_timings("Preempted", &task->preempt);
print_sched_timings("Blocked", &task->blocked);
print_sched_timings("Sleeping", &task->sleep);
--
2.35.1
* [PATCH 08/12] trace-cmd analyze: Track migration
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (6 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 07/12] trace-cmd analyze: Add "idleness" Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-24 2:57 ` [PATCH 09/12] trace-cmd analyze: Add wake up latency timings Steven Rostedt
` (3 subsequent siblings)
11 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Keep track of which CPU each task is on for every event, and if the CPU
changes, count it as a migration. Print out the number of times the task
migrated, if it migrated at all.
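The bookkeeping is a simple comparison of the CPU a record came from
against the CPU the task was last seen on. A sketch (track_migration() is
an illustrative name; the real code does this inline in update_pid() and
process_switch()):

static void track_migration(struct task_item *task, int cpu)
{
    if (task->last_cpu != cpu) {
        task->last_cpu = cpu;
        task->migrated++;
    }
}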
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index de776b76c343..4db93367727b 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -55,6 +55,7 @@ struct sched_timings {
struct task_item {
unsigned long long runtime;
unsigned long long start_ts;
+ unsigned long long migrated;
struct sched_timings preempt;
struct sched_timings sleep;
struct sched_timings blocked;
@@ -62,6 +63,7 @@ struct task_item {
char *comm;
struct trace_hash_item hash;
int pid;
+ int last_cpu;
int last_state;
};
@@ -149,6 +151,7 @@ static struct task_item *get_task(struct cpu_data *cpu_data, int pid)
hash->key = find_pid;
cpu_data->data->nr_tasks++;
trace_hash_add(&cpu_data->data->tasks, hash);
+ task->last_cpu = cpu_data->cpu;
task->last_state = -1;
}
@@ -256,6 +259,10 @@ static void update_pid(struct cpu_data *cpu_data,
cpu_task = get_cpu_task(cpu_data, pid);
task = cpu_task->task;
+ if (task->last_cpu != cpu_data->cpu) {
+ task->last_cpu = cpu_data->cpu;
+ task->migrated++;
+ }
update_idle_task(cpu_data, task, record->ts);
@@ -383,6 +390,11 @@ static void process_switch(struct analysis_data *data,
task->start_ts = record->ts;
cpu_data->current_pid = pid;
+ if (task->last_cpu != cpu_data->cpu) {
+ task->last_cpu = cpu_data->cpu;
+ task->migrated++;
+ }
+
update_idle_task(cpu_data, task, record->ts);
switch (task->last_state) {
@@ -619,9 +631,11 @@ static void print_task(struct tep_handle *tep, struct task_item *task)
{
printf("\nTask: %d : %s:\n",
task->pid , task->comm ? : tep_data_comm_from_pid(tep, task->pid));
- printf("Runtime: ");
+ printf("Runtime: ");
print_time(task->runtime, '_');
printf("\n");
+ if (task->migrated)
+ printf("Migrated:\t%llu\n", task->migrated);
print_timings_title("Type");
print_sched_timings("Preempted", &task->preempt);
print_sched_timings("Blocked", &task->blocked);
--
2.35.1
* [PATCH 09/12] trace-cmd analyze: Add wake up latency timings
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (7 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 08/12] trace-cmd analyze: Track migration Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-25 19:34 ` John Keeping
2022-03-24 2:57 ` [PATCH 10/12] trace-cmd analyze: Add counting of page faults Steven Rostedt
` (2 subsequent siblings)
11 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
If the sched_waking (or sched_wakeup) event is found, then use it to time
the wake up latency for each task.
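The latency is measured between the wake-up event and the sched_switch that
actually puts the task on a CPU: the waking timestamp is stashed in the
task's wakeup bucket, and the switch-in folds the delta into it. A short
sketch (the two helper names are illustrative; the real code does this in
process_wakeup() and process_switch()):

/* at sched_waking (or sched_wakeup) for "task" */
static void on_waking(struct task_item *task, unsigned long long ts)
{
    task->wakeup.last = ts;    /* when the wake up was issued */
    task->woken = true;
}

/* at the sched_switch that switches "task" in */
static void on_switch_in(struct task_item *task, unsigned long long ts)
{
    if (task->woken)    /* latency = ts - task->wakeup.last */
        update_sched_timings(&task->wakeup, ts);
    task->woken = false;
}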
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index 4db93367727b..7cf63923b962 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -33,10 +33,12 @@ struct analysis_data {
unsigned long long start_ts;
unsigned long long last_ts;
struct tep_event *switch_event;
+ struct tep_event *wakeup_event;
struct tep_format_field *prev_comm;
struct tep_format_field *prev_state;
struct tep_format_field *next_comm;
struct tep_format_field *next_pid;
+ struct tep_format_field *wakeup_pid;
struct cpu_data *cpu_data;
struct trace_hash tasks;
int nr_tasks;
@@ -60,11 +62,13 @@ struct task_item {
struct sched_timings sleep;
struct sched_timings blocked;
struct sched_timings other;
+ struct sched_timings wakeup;
char *comm;
struct trace_hash_item hash;
int pid;
int last_cpu;
int last_state;
+ bool woken;
};
struct task_cpu_item {
@@ -420,9 +424,31 @@ static void process_switch(struct analysis_data *data,
comm = (char *)(record->data + data->next_comm->offset);
task->comm = strdup(comm);
}
+
+ if (task->woken)
+ update_sched_timings(&task->wakeup, record->ts);
+ task->woken = false;
}
}
+static void process_wakeup(struct analysis_data *data,
+ struct tep_handle *tep,
+ struct tep_record *record)
+{
+ struct cpu_data *cpu_data = &data->cpu_data[record->cpu];
+ struct task_cpu_item *cpu_task;
+ struct task_item *task;
+ unsigned long long val;
+ int pid;
+
+ tep_read_number_field(data->wakeup_pid, record->data, &val);
+ pid = val;
+ cpu_task = get_cpu_task(cpu_data, pid);
+ task = cpu_task->task;
+ task->wakeup.last = record->ts;
+ task->woken = true;
+}
+
static bool match_type(int type, struct tep_event *event)
{
return event && type == event->id;
@@ -446,6 +472,9 @@ static void process_cpu(struct analysis_data *data,
type = tep_data_type(tep, record);
if (match_type(type, data->switch_event))
process_switch(data, tep, pid, record);
+
+ else if (match_type(type, data->wakeup_event))
+ process_wakeup(data, tep, record);
}
static int cmp_tasks(const void *A, const void *B)
@@ -637,6 +666,7 @@ static void print_task(struct tep_handle *tep, struct task_item *task)
if (task->migrated)
printf("Migrated:\t%llu\n", task->migrated);
print_timings_title("Type");
+ print_sched_timings("Wakeup", &task->wakeup);
print_sched_timings("Preempted", &task->preempt);
print_sched_timings("Blocked", &task->blocked);
print_sched_timings("Sleeping", &task->sleep);
@@ -788,6 +818,9 @@ static void do_trace_analyze(struct tracecmd_input *handle)
trace_hash_init(&data.tasks, 128);
data.switch_event = tep_find_event_by_name(tep, "sched", "sched_switch");
+ data.wakeup_event = tep_find_event_by_name(tep, "sched", "sched_waking");
+ if (!data.wakeup_event)
+ data.wakeup_event = tep_find_event_by_name(tep, "sched", "sched_wakeup");
/* Set to a very large number */
data.start_ts = -1ULL;
@@ -799,6 +832,12 @@ static void do_trace_analyze(struct tracecmd_input *handle)
data.prev_state = tep_find_field(data.switch_event, "prev_state");
}
+ if (data.wakeup_event) {
+ data.wakeup_pid = tep_find_field(data.wakeup_event, "pid");
+ if (!data.wakeup_pid)
+ data.wakeup_event = NULL;
+ }
+
do {
record = tracecmd_read_next_data(handle, NULL);
if (record)
--
2.35.1
* [PATCH 10/12] trace-cmd analyze: Add counting of page faults
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (8 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 09/12] trace-cmd analyze: Add wake up latency timings Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-24 2:57 ` [PATCH 11/12] trace-cmd analyze: Account for dropped events Steven Rostedt
2022-03-24 2:57 ` [PATCH 12/12] trace-cmd analyze: Add documentation Steven Rostedt
11 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
If the page_fault_user event is found, then add counting of page faults
for each task and display them.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index 7cf63923b962..2603c360c404 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -34,6 +34,7 @@ struct analysis_data {
unsigned long long last_ts;
struct tep_event *switch_event;
struct tep_event *wakeup_event;
+ struct tep_event *page_fault_event;
struct tep_format_field *prev_comm;
struct tep_format_field *prev_state;
struct tep_format_field *next_comm;
@@ -58,6 +59,7 @@ struct task_item {
unsigned long long runtime;
unsigned long long start_ts;
unsigned long long migrated;
+ unsigned long long faulted;
struct sched_timings preempt;
struct sched_timings sleep;
struct sched_timings blocked;
@@ -449,6 +451,19 @@ static void process_wakeup(struct analysis_data *data,
task->woken = true;
}
+static void process_page_fault(struct analysis_data *data,
+ struct tep_handle *tep, int pid,
+ struct tep_record *record)
+{
+ struct cpu_data *cpu_data = &data->cpu_data[record->cpu];
+ struct task_cpu_item *cpu_task;
+ struct task_item *task;
+
+ cpu_task = get_cpu_task(cpu_data, pid);
+ task = cpu_task->task;
+ task->faulted++;
+}
+
static bool match_type(int type, struct tep_event *event)
{
return event && type == event->id;
@@ -475,6 +490,9 @@ static void process_cpu(struct analysis_data *data,
else if (match_type(type, data->wakeup_event))
process_wakeup(data, tep, record);
+
+ else if (match_type(type, data->page_fault_event))
+ process_page_fault(data, tep, pid, record);
}
static int cmp_tasks(const void *A, const void *B)
@@ -664,7 +682,9 @@ static void print_task(struct tep_handle *tep, struct task_item *task)
print_time(task->runtime, '_');
printf("\n");
if (task->migrated)
- printf("Migrated:\t%llu\n", task->migrated);
+ printf("Migrated: %8llu\n", task->migrated);
+ if (task->faulted)
+ printf("Faulted: %8llu\n", task->faulted);
print_timings_title("Type");
print_sched_timings("Wakeup", &task->wakeup);
print_sched_timings("Preempted", &task->preempt);
@@ -821,6 +841,7 @@ static void do_trace_analyze(struct tracecmd_input *handle)
data.wakeup_event = tep_find_event_by_name(tep, "sched", "sched_waking");
if (!data.wakeup_event)
data.wakeup_event = tep_find_event_by_name(tep, "sched", "sched_wakeup");
+ data.page_fault_event = tep_find_event_by_name(tep, "exceptions", "page_fault_user");
/* Set to a very large number */
data.start_ts = -1ULL;
--
2.35.1
* [PATCH 11/12] trace-cmd analyze: Account for dropped events
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (9 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 10/12] trace-cmd analyze: Add counting of page faults Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
2022-03-24 2:57 ` [PATCH 12/12] trace-cmd analyze: Add documentation Steven Rostedt
11 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
If the trace.dat file contains dropped events because the reader was not
able to keep up with the writer, account for them. Update the CPUs that
dropped events and tag the tasks that were on those CPUs, and note in the
output that events were dropped.
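When a record carries missed_events, none of the intervals that were open
on that CPU can be trusted, so every task last seen on the CPU has its
timing baselines restarted at the record's timestamp and is flagged as
having dropped events. A sketch of the per-task reset (reset_task_timings()
is an illustrative name; the real handle_missed_events() below walks the
CPU's task hash and applies this to each affected task):

static void reset_task_timings(struct task_item *task, unsigned long long ts)
{
    /* restart every state clock at the point of the drop */
    task->preempt.last = ts;
    task->sleep.last = ts;
    task->blocked.last = ts;
    task->other.last = ts;
    task->woken = false;
    task->last_state = -1;          /* state across the gap is unknown */
    task->dropped_events = true;    /* flag it in the per-task output */
}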
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tracecmd/trace-analyze.c | 87 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 84 insertions(+), 3 deletions(-)
diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
index 2603c360c404..2f1b2720810e 100644
--- a/tracecmd/trace-analyze.c
+++ b/tracecmd/trace-analyze.c
@@ -19,6 +19,8 @@ struct task_item;
struct cpu_data {
unsigned long long start_ts;
unsigned long long last_ts;
+ unsigned long long missed_events;
+ unsigned long long unknown_missed_events;
struct task_item *idle_task;
struct analysis_data *data;
struct trace_hash tasks;
@@ -27,6 +29,7 @@ struct cpu_data {
int current_pid;
int first_pid;
bool last_idle;
+ bool keep_start;
};
struct analysis_data {
@@ -45,6 +48,7 @@ struct analysis_data {
int nr_tasks;
int allocated_cpus;
int cpus;
+ bool missed_events;
};
struct sched_timings {
@@ -71,6 +75,7 @@ struct task_item {
int last_cpu;
int last_state;
bool woken;
+ bool dropped_events;
};
struct task_cpu_item {
@@ -306,6 +311,9 @@ static void update_cpu_times(struct cpu_data *cpu_data,
update_idle_task(cpu_data, task, record->ts);
if (record->ts < cpu_data->data->start_ts)
cpu_data->data->start_ts = record->ts;
+ /* If we lost events before here, keep the current start time */
+ if (record->missed_events)
+ cpu_data->keep_start = true;
return;
}
@@ -326,6 +334,10 @@ static void update_first_pid(struct cpu_data *cpu_data)
unsigned long long start_ts = cpu_data->data->start_ts;
unsigned long long delta;
+ /* If the CPU started with dropped events, do not update */
+ if (cpu_data->keep_start)
+ return;
+
cpu_task = get_cpu_task(cpu_data, cpu_data->first_pid);
task = cpu_task->task;
delta = cpu_data->start_ts - start_ts;
@@ -464,6 +476,50 @@ static void process_page_fault(struct analysis_data *data,
task->faulted++;
}
+static void handle_missed_events(struct analysis_data *data,
+ struct tep_record *record)
+{
+ struct trace_hash_item **bucket;
+ struct trace_hash_item *item;
+ struct task_cpu_item *cpu_task;
+ struct cpu_data *cpu_data;
+ struct task_item *task;
+ long long missed_events = record->missed_events;
+ int cpu;
+
+ /* If this is the first event for the CPU do nothing here */
+ if (record->cpu >= data->allocated_cpus ||
+ !data->cpu_data[record->cpu].data)
+ return;
+
+ data->missed_events = true;
+
+ cpu_data = &data->cpu_data[record->cpu];
+ cpu = cpu_data->cpu;
+
+ if (missed_events > 0)
+ cpu_data->missed_events += missed_events;
+ else
+ cpu_data->unknown_missed_events++;
+
+ /* Need to reset all timings */
+ trace_hash_for_each_bucket(bucket, &cpu_data->tasks) {
+ trace_hash_for_each_item(item, bucket) {
+ cpu_task = task_cpu_from_hash(item);
+ task = cpu_task->task;
+ if (task->last_cpu != cpu)
+ continue;
+ task->preempt.last = record->ts;
+ task->sleep.last = record->ts;
+ task->blocked.last = record->ts;
+ task->other.last = record->ts;
+ task->woken = false;
+ task->last_state = -1;
+ task->dropped_events = true;
+ }
+ }
+}
+
static bool match_type(int type, struct tep_event *event)
{
return event && type == event->id;
@@ -477,6 +533,9 @@ static void process_cpu(struct analysis_data *data,
int type;
int pid;
+ if (record->missed_events)
+ handle_missed_events(data, record);
+
pid = tep_data_pid(tep, record);
if (pid < 0) /* Warn? */
return;
@@ -624,12 +683,29 @@ static void print_cpu_data(struct tep_handle *tep, struct cpu_data *cpu_data)
int nr_tasks;
int i = 0;
- data = cpu_data->data;
- total_time = data->last_ts - data->start_ts;
-
printf("\nCPU %d\n", cpu_data->cpu);
printf("-------\n");
+ if (cpu_data->missed_events || cpu_data->unknown_missed_events) {
+ printf("Missed: %llu events", cpu_data->missed_events);
+ if (cpu_data->unknown_missed_events)
+ printf(" + %llu drops of unknown events",
+ cpu_data->unknown_missed_events);
+ printf("\n");
+ }
+
+ data = cpu_data->data;
+
+ if (cpu_data->keep_start) {
+ total_time = data->last_ts - cpu_data->start_ts;
+ printf("[ Events dropped before start of this CPU ]\n");
+ printf("Total time: ");
+ print_time(total_time, '_');
+ printf("\n");
+ } else {
+ total_time = data->last_ts - data->start_ts;
+ }
+
cpu_tasks = malloc(sizeof(*cpu_tasks) * cpu_data->nr_tasks);
if (!cpu_tasks)
@@ -678,6 +754,8 @@ static void print_task(struct tep_handle *tep, struct task_item *task)
{
printf("\nTask: %d : %s:\n",
task->pid , task->comm ? : tep_data_comm_from_pid(tep, task->pid));
+ if (task->dropped_events)
+ printf("[ Events dropped for this task's CPU, may be missing data ]\n");
printf("Runtime: ");
print_time(task->runtime, '_');
printf("\n");
@@ -706,6 +784,9 @@ static void print_total(struct tep_handle *tep, struct analysis_data *data)
int cpu;
int i = 0;
+ if (data->missed_events)
+ printf("\n[ Missed events, some data may be missing ]\n");
+
total_time = data->last_ts - data->start_ts;
printf("\nTotal time: ");
print_time(total_time, '_');
--
2.35.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 12/12] trace-cmd analyze: Add documentation
2022-03-24 2:57 [PATCH 00/12] trace-cmd: Add trace-cmd analyze command Steven Rostedt
` (10 preceding siblings ...)
2022-03-24 2:57 ` [PATCH 11/12] trace-cmd analyze: Account for dropped events Steven Rostedt
@ 2022-03-24 2:57 ` Steven Rostedt
11 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-24 2:57 UTC (permalink / raw)
To: linux-trace-devel
Cc: williskung, kaleshsingh, linux-rt-users, Steven Rostedt (Google)
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Add the help messages and the man page for trace-cmd analyze.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
.../trace-cmd/trace-cmd-analyze.1.txt | 290 ++++++++++++++++++
tracecmd/trace-usage.c | 7 +
2 files changed, 297 insertions(+)
create mode 100644 Documentation/trace-cmd/trace-cmd-analyze.1.txt
diff --git a/Documentation/trace-cmd/trace-cmd-analyze.1.txt b/Documentation/trace-cmd/trace-cmd-analyze.1.txt
new file mode 100644
index 000000000000..17c0f7029215
--- /dev/null
+++ b/Documentation/trace-cmd/trace-cmd-analyze.1.txt
@@ -0,0 +1,290 @@
+TRACE-CMD-ANALYZE(1)
+====================
+
+NAME
+----
+trace-cmd-analyze - Report useful information from data in a trace.dat file
+
+SYNOPSIS
+--------
+*trace-cmd analyze ['OPTIONS']* [input-file]
+
+DESCRIPTION
+-----------
+The trace-cmd(1) analyze command reads a trace.dat file created by *trace-cmd-record*(1)
+and reports statistical information from it, such as how long the CPUs were
+idle, which tasks ran the most on each CPU, and task timings such as the time from
+wake up to scheduling, and how long tasks were preempted, blocked, or sleeping.
+
+The timings are reported in microseconds (assuming the trace.dat file was recorded
+using a nanosecond clock), unless the *-t* option is used.
+
+OPTIONS
+-------
+*-i* 'file'::
+ Input file to read from. If not specified, the first non-option argument is used.
+
+*-t*::
+ Show timings in nanoseconds instead of microseconds.
+
+EXAMPLES
+--------
+
+.....
+# trace-cmd record -e sched_switch -e sched_waking -e page_fault_user -e timer -e irq -e irq_vectors sleep 1
+# trace-cmd analyze
+version = 6
+
+Total time: 1.015_180
+
+ Idle CPU Run time
+ -------- --------
+CPU 0 idle: 1.011_667 (%99)
+CPU 1 idle: 1.015_180 (%100)
+
+ Task name PID Run time
+ --------- --- --------
+ trace-cmd 1116 0.002_285 (%0)
+ trace-cmd 1115 0.000_395 (%0)
+ trace-cmd 1114 0.000_382 (%0)
+ NetworkManager 580 0.000_181 (%0)
+ rtkit-daemon 618 0.000_088 (%0)
+ rcu_sched 12 0.000_045 (%0)
+ runaway-killer- 635 0.000_043 (%0)
+ kworker/u4:1 56 0.000_025 (%0)
+ rtkit-daemon 619 0.000_020 (%0)
+ kcompactd0 29 0.000_020 (%0)
+ kworker/0:4 408 0.000_015 (%0)
+ migration/0 13 0.000_009 (%0)
+ trace-cmd 1113 0.000_000 (%0)
+
+
+CPU 0
+-------
+idle: 1.011_667 (%99)
+Idleness Total (cnt) Avg Longest Where
+-------- ----- ----- --- ------- -----
+ 1.011667 (6252) 0.000161 0.001021 184.420132
+ Task name PID Run time
+ --------- --- --------
+ trace-cmd 1116 0.002_285 (%0)
+ trace-cmd 1115 0.000_395 (%0)
+ trace-cmd 1114 0.000_382 (%0)
+ NetworkManager 580 0.000_181 (%0)
+ rtkit-daemon 618 0.000_088 (%0)
+ rcu_sched 12 0.000_045 (%0)
+ runaway-killer- 635 0.000_043 (%0)
+ kworker/u4:1 56 0.000_025 (%0)
+ rtkit-daemon 619 0.000_020 (%0)
+ kcompactd0 29 0.000_020 (%0)
+ kworker/0:4 408 0.000_015 (%0)
+ migration/0 13 0.000_009 (%0)
+ trace-cmd 1113 0.000_000 (%0)
+
+CPU 1
+-------
+idle: 1.015_180 (%100)
+Idleness Total (cnt) Avg Longest Where
+-------- ----- ----- --- ------- -----
+ 1.015180 (62) 0.016373 0.443041 183.979216
+
+
+Task: 12 : rcu_sched:
+Runtime: 0.000_045
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000043 (5) 0.000008 0.000009 183.976142
+Preempted: 0.016962 (4) 0.004240 0.004973 183.984156
+
+Task: 13 : migration/0:
+Runtime: 0.000_009
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000028 (1) 0.000028 0.000028 184.047119
+
+Task: 29 : kcompactd0:
+Runtime: 0.000_020
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000022 (2) 0.000011 0.000012 184.542135
+Sleeping: 0.503988 (1) 0.503988 0.503988 184.038158
+
+Task: 56 : kworker/u4:1:
+Runtime: 0.000_025
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000018 (2) 0.000009 0.000009 184.398136
+Preempted: 0.191983 (1) 0.191983 0.191983 184.206162
+
+Task: 408 : kworker/0:4:
+Runtime: 0.000_015
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000010 (1) 0.000010 0.000010 184.654135
+
+Task: 580 : NetworkManager:
+Runtime: 0.000_181
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000024 (1) 0.000024 0.000024 184.886115
+
+Task: 618 : rtkit-daemon:
+Runtime: 0.000_088
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000025 (1) 0.000025 0.000025 184.565118
+Preempted: 0.000020 (1) 0.000020 0.000020 184.565222
+
+Task: 619 : rtkit-daemon:
+Runtime: 0.000_020
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000056 (1) 0.000056 0.000056 184.565166
+
+Task: 635 : runaway-killer-:
+Runtime: 0.000_043
+Faulted: 2
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000025 (1) 0.000025 0.000025 184.891113
+
+Task: 1113 : trace-cmd:
+Runtime: 0.000_000
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000008 (1) 0.000008 0.000008 184.989274
+
+Task: 1114 : trace-cmd:
+Runtime: 0.000_382
+Faulted: 37
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+
+Task: 1115 : trace-cmd:
+Runtime: 0.000_395
+Faulted: 37
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+
+Task: 1116 : trace-cmd:
+Runtime: 0.002_285
+Faulted: 52
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000090 (3) 0.000030 0.000056 184.989032
+Blocked: 0.012737 (2) 0.006368 0.012149 183.974451
+Sleeping: 1.000157 (1) 1.000157 1.000157 183.988932
+.....
+
+Breaking down the above:
+.....
+Total time: 1.015_180
+.....
+
+The total time of the trace in seconds.
+
+.....
+ Idle CPU Run time
+ -------- --------
+CPU 0 idle: 1.011_667 (%99)
+CPU 1 idle: 1.015_180 (%100)
+.....
+
+How long each CPU was idle for.
+
+.....
+
+ Task name PID Run time
+ --------- --- --------
+ trace-cmd 1116 0.002_285 (%0)
+ trace-cmd 1115 0.000_395 (%0)
+ trace-cmd 1114 0.000_382 (%0)
+ NetworkManager 580 0.000_181 (%0)
+ rtkit-daemon 618 0.000_088 (%0)
+ rcu_sched 12 0.000_045 (%0)
+ runaway-killer- 635 0.000_043 (%0)
+ kworker/u4:1 56 0.000_025 (%0)
+ rtkit-daemon 619 0.000_020 (%0)
+ kcompactd0 29 0.000_020 (%0)
+ kworker/0:4 408 0.000_015 (%0)
+ migration/0 13 0.000_009 (%0)
+ trace-cmd 1113 0.000_000 (%0)
+.....
+
+A list of all tasks in the trace and their run times, along with the percentage of the
+total time that each task was running.
+
+
+.....
+CPU 0
+-------
+idle: 1.011_667 (%99)
+Idleness Total (cnt) Avg Longest Where
+-------- ----- ----- --- ------- -----
+ 1.011667 (6252) 0.000161 0.001021 184.420132
+ Task name PID Run time
+ --------- --- --------
+ trace-cmd 1116 0.002_285 (%0)
+ trace-cmd 1115 0.000_395 (%0)
+ trace-cmd 1114 0.000_382 (%0)
+ NetworkManager 580 0.000_181 (%0)
+ rtkit-daemon 618 0.000_088 (%0)
+ rcu_sched 12 0.000_045 (%0)
+ runaway-killer- 635 0.000_043 (%0)
+ kworker/u4:1 56 0.000_025 (%0)
+ rtkit-daemon 619 0.000_020 (%0)
+ kcompactd0 29 0.000_020 (%0)
+ kworker/0:4 408 0.000_015 (%0)
+ migration/0 13 0.000_009 (%0)
+ trace-cmd 1113 0.000_000 (%0)
+.....
+
+Each CPU is then shown, again with its total time (which could be different from the total
+time of the trace, if events were dropped before the first event on that CPU).
+
+The "Idleness" is the time between events for the CPU's idle task. This is useful for knowing
+if the CPU can make it into deep sleep if interrupts are being traced as well. Note, currently
+it does not remove the time of an interrupt entering to when it leaves, but that may change in the
+future.
+
+.....
+Task: 1116 : trace-cmd:
+Runtime: 0.002_285
+Faulted: 52
+Type Total (cnt) Avg Longest Where
+---- ----- ----- --- ------- -----
+Wakeup: 0.000090 (3) 0.000030 0.000056 184.989032
+Blocked: 0.012737 (2) 0.006368 0.012149 183.974451
+Sleeping: 1.000157 (1) 1.000157 1.000157 183.988932
+.....
+
+Each task is then reported. The report shows the total run time of the task and the number
+of times the task migrated (not shown if the task never migrated during the trace). It also
+shows the wake up latency (the total time it spent waiting to be woken up), the number of
+times it was woken up (cnt), the average time it took to wake up, the longest time it took
+to wake up, and the time stamp in the report where that longest wake up occurred.
+
+The same timings are then shown for how long the task was preempted (scheduled out in the
+running state), blocked (scheduled out in an uninterruptible state), sleeping (scheduled out
+in an interruptible state), and "other" for all other states the task was in when
+scheduling out.
+
+SEE ALSO
+--------
+trace-cmd(1), trace-cmd-record(1), trace-cmd-report(1), trace-cmd-start(1),
+trace-cmd-stop(1), trace-cmd-reset(1), trace-cmd-split(1),
+trace-cmd-list(1), trace-cmd-listen(1)
+
+AUTHOR
+------
+Written by Steven Rostedt, <rostedt@goodmis.org>
+
+RESOURCES
+---------
+https://git.kernel.org/pub/scm/utils/trace-cmd/trace-cmd.git/
+
+COPYING
+-------
+Copyright \(C) 2022 Google, Inc. Free use of this software is granted under
+the terms of the GNU Public License (GPL).
+
diff --git a/tracecmd/trace-usage.c b/tracecmd/trace-usage.c
index 20995fabed73..53d3159f0903 100644
--- a/tracecmd/trace-usage.c
+++ b/tracecmd/trace-usage.c
@@ -274,6 +274,13 @@ static struct usage_help usage_help[] = {
" -H Allows users to hook two events together for timings\n"
" --verbose 'level' Set the desired log level\n"
},
+ {
+ "analyze",
+ "Analyze a trace file and extract various information about it",
+ " %s analyze [-i input][-t]\n"
+ " -i Specify the input file to read, otherwise read the first non option\n"
+ " -t Show time stamps in nanoseconds instead of microseconds\n"
+ },
{
"hist",
"show a histogram of the trace.dat information",
--
2.35.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 09/12] trace-cmd analyze: Add wake up latency timings
2022-03-24 2:57 ` [PATCH 09/12] trace-cmd analyze: Add wake up latency timings Steven Rostedt
@ 2022-03-25 19:34 ` John Keeping
2022-03-25 20:13 ` Steven Rostedt
0 siblings, 1 reply; 27+ messages in thread
From: John Keeping @ 2022-03-25 19:34 UTC (permalink / raw)
To: Steven Rostedt; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Wed, Mar 23, 2022 at 10:57:23PM -0400, Steven Rostedt wrote:
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> If the sched_waking (or sched_wakeup) event is found, then use it to time
> the wake up latency for each task.
>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> +static void process_wakeup(struct analysis_data *data,
> + struct tep_handle *tep,
> + struct tep_record *record)
> +{
> + struct cpu_data *cpu_data = &data->cpu_data[record->cpu];
> + struct task_cpu_item *cpu_task;
> + struct task_item *task;
> + unsigned long long val;
> + int pid;
> +
> + tep_read_number_field(data->wakeup_pid, record->data, &val);
> + pid = val;
> + cpu_task = get_cpu_task(cpu_data, pid);
Should this use get_task() instead of get_cpu_task()?
I was trying this out and it provides a great overview of a trace but I
found it strange that tasks affine to CPU N were also appearing in the
list for CPU M.
> + task = cpu_task->task;
> + task->wakeup.last = record->ts;
> + task->woken = true;
> +}
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU
2022-03-24 2:57 ` [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU Steven Rostedt
@ 2022-03-25 19:36 ` John Keeping
2022-03-25 20:18 ` Steven Rostedt
0 siblings, 1 reply; 27+ messages in thread
From: John Keeping @ 2022-03-25 19:36 UTC (permalink / raw)
To: Steven Rostedt; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Wed, Mar 23, 2022 at 10:57:17PM -0400, Steven Rostedt wrote:
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> Display for each CPU that was traced, the amount of time tasks ran on
> them. Listing the tasks from the longest runner to the least.
>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> + for (i = 0; i < nr_tasks; i++) {
> + task = cpu_tasks[i]->task;
> +
> + if (!i) {
> + printf(" Task name PID \t Run time\n");
> + printf(" --------- --- \t --------\n");
> + }
> + printf("%16s %8d\t",
> + tep_data_comm_from_pid(tep, task->pid),
> + task->pid);
> + print_time(cpu_tasks[i]->runtime, '_');
> + printf(" (%%%lld)\n", (task->runtime * 100) / total_time);
Is there a reason for using the CPU-specific runtime for the value and
the total runtime for the percentage?
I expected the percentage to be the percentage of this CPU's time spend
running the task.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 06/12] trace-cmd analyze: Add tracing of tasks and their states
2022-03-24 2:57 ` [PATCH 06/12] trace-cmd analyze: Add tracing of tasks and their states Steven Rostedt
@ 2022-03-25 19:37 ` John Keeping
2022-03-25 20:19 ` Steven Rostedt
0 siblings, 1 reply; 27+ messages in thread
From: John Keeping @ 2022-03-25 19:37 UTC (permalink / raw)
To: Steven Rostedt; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Wed, Mar 23, 2022 at 10:57:20PM -0400, Steven Rostedt wrote:
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> If sched_switch is found and has the prev_state field, then record them
> and keep track of the time the task was in the state, the number of times
> it was scheduled out of the state, and print the total time, average time,
> and number of times the task was scheduled out of that state. Also show
> the time stamp of where the longest time it went into that state.
>
> Currently the only states that are supported are "preempted" (which was in
> state running), "blocked" (state uninterruptible), "sleeping" (state
> interruptible), and "other" (all other states).
>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> @@ -288,6 +337,24 @@ static void process_switch(struct analysis_data *data,
> task->start_ts = record->ts;
> cpu_data->current_pid = pid;
>
> + switch (task->last_state) {
> + case -1:
> + /* First time seen */
> + break;
> + case 0:
> + update_sched_timings(&task->preempt, record->ts);
> + break;
> + case 0x1:
> + update_sched_timings(&task->sleep, record->ts);
> + break;
> + case 0x2:
> + update_sched_timings(&task->blocked, record->ts);
> + break;
> + default:
> + update_sched_timings(&task->other, record->ts);
> + }
> + task->last_state = val & 0x1f;
val is next_pid here, does last_state even need to be updated for the
switched-to task?
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 09/12] trace-cmd analyze: Add wake up latency timings
2022-03-25 19:34 ` John Keeping
@ 2022-03-25 20:13 ` Steven Rostedt
2022-03-25 20:31 ` Steven Rostedt
0 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2022-03-25 20:13 UTC (permalink / raw)
To: John Keeping; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Fri, 25 Mar 2022 19:34:20 +0000
John Keeping <john@metanate.com> wrote:
> > +static void process_wakeup(struct analysis_data *data,
> > + struct tep_handle *tep,
> > + struct tep_record *record)
> > +{
> > + struct cpu_data *cpu_data = &data->cpu_data[record->cpu];
> > + struct task_cpu_item *cpu_task;
> > + struct task_item *task;
> > + unsigned long long val;
> > + int pid;
> > +
> > + tep_read_number_field(data->wakeup_pid, record->data, &val);
> > + pid = val;
> > + cpu_task = get_cpu_task(cpu_data, pid);
>
> Should this use get_task() instead of get_cpu_task()?
>
> I was trying this out and it provides a great overview of a trace but I
> found it strange that tasks affine to CPU N were also appearing in the
> list for CPU M.
Yes, you are correct, because I noticed the same issue :-D
I'll fix it in v2.
Thanks,
-- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU
2022-03-25 19:36 ` John Keeping
@ 2022-03-25 20:18 ` Steven Rostedt
2022-03-26 11:07 ` John Keeping
0 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2022-03-25 20:18 UTC (permalink / raw)
To: John Keeping; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Fri, 25 Mar 2022 19:36:24 +0000
John Keeping <john@metanate.com> wrote:
> On Wed, Mar 23, 2022 at 10:57:17PM -0400, Steven Rostedt wrote:
> > From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> >
> > Display for each CPU that was traced, the amount of time tasks ran on
> > them. Listing the tasks from the longest runner to the least.
> >
> > Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> > ---
> > + for (i = 0; i < nr_tasks; i++) {
> > + task = cpu_tasks[i]->task;
> > +
> > + if (!i) {
> > + printf(" Task name PID \t Run time\n");
> > + printf(" --------- --- \t --------\n");
> > + }
> > + printf("%16s %8d\t",
> > + tep_data_comm_from_pid(tep, task->pid),
> > + task->pid);
> > + print_time(cpu_tasks[i]->runtime, '_');
> > + printf(" (%%%lld)\n", (task->runtime * 100) / total_time);
>
> Is there a reason for using the CPU-specific runtime for the value and
> the total runtime for the percentage?
>
> I expected the percentage to be the percentage of this CPU's time spend
> running the task.
We modify it so that each CPU has the same run time, unless there's missed
events at the start (later patches), and then we change total_time to be
the total time of the events on the CPU and not the entire trace.
-- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 06/12] trace-cmd analyze: Add tracing of tasks and their states
2022-03-25 19:37 ` John Keeping
@ 2022-03-25 20:19 ` Steven Rostedt
0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-25 20:19 UTC (permalink / raw)
To: John Keeping; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Fri, 25 Mar 2022 19:37:03 +0000
John Keeping <john@metanate.com> wrote:
> > @@ -288,6 +337,24 @@ static void process_switch(struct analysis_data *data,
> > task->start_ts = record->ts;
> > cpu_data->current_pid = pid;
> >
> > + switch (task->last_state) {
> > + case -1:
> > + /* First time seen */
> > + break;
> > + case 0:
> > + update_sched_timings(&task->preempt, record->ts);
> > + break;
> > + case 0x1:
> > + update_sched_timings(&task->sleep, record->ts);
> > + break;
> > + case 0x2:
> > + update_sched_timings(&task->blocked, record->ts);
> > + break;
> > + default:
> > + update_sched_timings(&task->other, record->ts);
> > + }
> > + task->last_state = val & 0x1f;
>
> val is next_pid here, does last_state even need to be updated for the
> switched-to task?
-ECUTANDPASTE :-p
Thanks,
-- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 09/12] trace-cmd analyze: Add wake up latency timings
2022-03-25 20:13 ` Steven Rostedt
@ 2022-03-25 20:31 ` Steven Rostedt
2022-03-26 11:14 ` John Keeping
0 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2022-03-25 20:31 UTC (permalink / raw)
To: John Keeping; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Fri, 25 Mar 2022 16:13:33 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> I'll fix it in v2.
BTW, I plan on rewriting a lot of this to move it into libtracecmd. That
way other tools can easily extract data from it.
For example, instead of hard coding "page faults" and "wake ups" I plan on
adding a way to register events to trigger on. Something like:
tracecmd_analyze_register_event_timing(handle, name, start_event,
start_field, start_record, end_event, end_field,
end_record);
Where "start_record" and "end_record" is if the field is a task to record.
enum {
TRACECMD_ANALYZE_FL_NONE,
TRACECMD_ANALYZE_FL_PID,
TRACECMD_ANALYZE_FL_PID_CPU,
};
Thus, we could have:
tracecmd_analyze_register_event_timing(handle,
"Wakeup", sched_waking,
pid_field, TRACECMD_ANALYZE_FL_PID,
sched_switch, next_pid,
TRACECMD_ANALYZE_FL_PID_CPU);
Or something like the above, that will end up turning into the
process_wakeup() function (and properly record the waking pid globally).
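As a rough sketch only (none of this exists yet, just to show the shape of
what a registered timing might need to carry around internally):
	struct analyze_event_timing {
		const char              *name;          /* e.g. "Wakeup" */
		struct tep_event        *start_event;
		struct tep_format_field *start_field;
		int                      start_record;  /* TRACECMD_ANALYZE_FL_* */
		struct tep_event        *end_event;
		struct tep_format_field *end_field;
		int                      end_record;    /* TRACECMD_ANALYZE_FL_* */
	};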
One tool I want to use this with, is my ktrace shell (development has
stalled, but I do plan on starting it again).
https://git.kernel.org/pub/scm/linux/kernel/git/rostedt/ktrace.git/
And then I could have commands like:
ktrace> open trace.dat
ktrace> analyze register timing sched.sched_switch pid global \
sched.sched_switch next_pid cpu
ktrace> analyze
ktrace> analyze show cpu 1
And it only shows cpu 1 data.
ktrace> analyze show pid 512
to see just pid 512
etc.
-- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU
2022-03-25 20:18 ` Steven Rostedt
@ 2022-03-26 11:07 ` John Keeping
2022-03-26 15:16 ` Steven Rostedt
0 siblings, 1 reply; 27+ messages in thread
From: John Keeping @ 2022-03-26 11:07 UTC (permalink / raw)
To: Steven Rostedt; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Fri, Mar 25, 2022 at 04:18:19PM -0400, Steven Rostedt wrote:
> On Fri, 25 Mar 2022 19:36:24 +0000
> John Keeping <john@metanate.com> wrote:
>
> > On Wed, Mar 23, 2022 at 10:57:17PM -0400, Steven Rostedt wrote:
> > > From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> > >
> > > Display for each CPU that was traced, the amount of time tasks ran on
> > > them. Listing the tasks from the longest runner to the least.
> > >
> > > Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> > > ---
> > > + for (i = 0; i < nr_tasks; i++) {
> > > + task = cpu_tasks[i]->task;
> > > +
> > > + if (!i) {
> > > + printf(" Task name PID \t Run time\n");
> > > + printf(" --------- --- \t --------\n");
> > > + }
> > > + printf("%16s %8d\t",
> > > + tep_data_comm_from_pid(tep, task->pid),
> > > + task->pid);
> > > + print_time(cpu_tasks[i]->runtime, '_');
> > > + printf(" (%%%lld)\n", (task->runtime * 100) / total_time);
> >
> > Is there a reason for using the CPU-specific runtime for the value and
> > the total runtime for the percentage?
> >
> > I expected the percentage to be the percentage of this CPU's time spend
> > running the task.
>
> We modify it so that each CPU has the same run time, unless there's missed
> events at the start (later patches), and then we change total_time to be
> the total time of the events on the CPU and not the entire trace.
I think we're talking about different things here, I probably wasn't
entirely clear about this. It's the numerator of this division that I'm
concerned about and I wonder if this should be:
(cpu_tasks[i]->runtime * 100) / total_time
If total_time is 10 seconds and there's a task A which runs on CPU0 for
2 seconds and CPU1 for 1 second then I expect to see:
CPU 0
A 2.000 (20%)
CPU 1
A 1.000 (10%)
But at the moment both of those lines will give 30% (although the actual
run times are correct).
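In code terms, the change I would expect (keeping the existing print loop) is:
	print_time(cpu_tasks[i]->runtime, '_');
	printf(" (%%%lld)\n", (cpu_tasks[i]->runtime * 100) / total_time);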
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 09/12] trace-cmd analyze: Add wake up latency timings
2022-03-25 20:31 ` Steven Rostedt
@ 2022-03-26 11:14 ` John Keeping
2022-03-26 15:32 ` Steven Rostedt
0 siblings, 1 reply; 27+ messages in thread
From: John Keeping @ 2022-03-26 11:14 UTC (permalink / raw)
To: Steven Rostedt; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Fri, Mar 25, 2022 at 04:31:48PM -0400, Steven Rostedt wrote:
> On Fri, 25 Mar 2022 16:13:33 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > I'll fix it in v2.
>
> BTW, I plan on rewriting a lot of this to move it into libtracecmd. That
> way other tools can easily extract data from it.
>
> For example, instead of hard coding "page faults" and "wake ups" I plan on
> adding a way to register events to trigger on. Something like:
>
> tracecmd_analyze_register_event_timing(handle, name, start_event,
> start_field, start_record, end_event, end_field,
> end_record);
>
> Where "start_record" and "end_record" is if the field is a task to record.
>
> enum {
> TRACECMD_ANALYZE_FL_NONE,
> TRACECMD_ANALYZE_FL_PID,
> TRACECMD_ANALYZE_FL_PID_CPU,
> };
>
> Thus, we could have:
>
> tracecmd_analyze_register_event_timing(handle,
> "Wakeup", sched_waking,
> pid_field, TRACECMD_ANALYZE_FL_PID,
> sched_switch, next_pid,
> TRACECMD_ANALYZE_FL_PID_CPU);
>
> Or something like the above, that will end up turning into the
> process_wakeup() function (and properly record the waking pid globally).
This sounds really neat.
I started having a look at whether it's possible to add an overview of
interrupts from irq:irq_handler_entry & irq:irq_handler_exit and I guess
that would fit in TRACECMD_ANALYZE_FL_NONE above, but it would be good
to have per-cpu stats for irqs as well so does that mean an additional
record type is needed?
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU
2022-03-26 11:07 ` John Keeping
@ 2022-03-26 15:16 ` Steven Rostedt
2022-03-28 16:23 ` Kalesh Singh
0 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2022-03-26 15:16 UTC (permalink / raw)
To: John Keeping; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Sat, 26 Mar 2022 11:07:03 +0000
John Keeping <john@metanate.com> wrote:
> > > > + print_time(cpu_tasks[i]->runtime, '_');
> > > > + printf(" (%%%lld)\n", (task->runtime * 100) / total_time);
> > >
> > > Is there a reason for using the CPU-specific runtime for the value and
> > > the total runtime for the percentage?
> > >
> > > I expected the percentage to be the percentage of this CPU's time spend
> > > running the task.
> >
> > We modify it so that each CPU has the same run time, unless there's missed
> > events at the start (later patches), and then we change total_time to be
> > the total time of the events on the CPU and not the entire trace.
>
> I think we're talking about different things here, I probably wasn't
> entirely clear about this. It's the numerator of this division that I'm
> concerned about and I wonder if this should be:
>
> (cpu_tasks[i]->runtime * 100) / total_time
Ah, I did misunderstand you. Yes, that's a typo. Thanks for pointing
that out. I'll go fix it.
-- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 09/12] trace-cmd analyze: Add wake up latency timings
2022-03-26 11:14 ` John Keeping
@ 2022-03-26 15:32 ` Steven Rostedt
0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-26 15:32 UTC (permalink / raw)
To: John Keeping; +Cc: linux-trace-devel, williskung, kaleshsingh, linux-rt-users
On Sat, 26 Mar 2022 11:14:06 +0000
John Keeping <john@metanate.com> wrote:
> I started having a look at whether it's possible to add an overview of
> interrupts from irq:irq_handler_entry & irq:irq_handler_exit and I guess
> that would fit in TRACECMD_ANALYZE_FL_NONE above, but it would be good
> to have per-cpu stats for irqs as well so does that mean an additional
> record type is needed?
I actually created that enum for exactly that ;-) To be able to add
different metrics in the future. Which is why it states "PID". I
actually did think about adding things like
TRACECMD_ANALYZE_FL_IRQ
To say this is to add irq interruptions. Perhaps even add:
TRACECMD_ANALYZE_FL_STOLEN
for guest stolen times.
I haven't decided on that API yet. Perhaps we could add new custom
timings too, and include them somehow. I have to think about a good API
to use.
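Roughly (nothing settled, names made up as I go), the enum could end up
looking something like:
	enum {
		TRACECMD_ANALYZE_FL_NONE,
		TRACECMD_ANALYZE_FL_PID,
		TRACECMD_ANALYZE_FL_PID_CPU,
		/* possible future additions */
		TRACECMD_ANALYZE_FL_IRQ,	/* irq entry/exit interruptions */
		TRACECMD_ANALYZE_FL_STOLEN,	/* guest stolen time */
	};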
-- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 01/12] trace-cmd: Add trace-cmd analyze
2022-03-24 2:57 ` [PATCH 01/12] trace-cmd: Add trace-cmd analyze Steven Rostedt
@ 2022-03-28 16:08 ` Kalesh Singh
2022-03-28 16:34 ` Steven Rostedt
0 siblings, 1 reply; 27+ messages in thread
From: Kalesh Singh @ 2022-03-28 16:08 UTC (permalink / raw)
To: Steven Rostedt; +Cc: linux-trace-devel, williskung, linux-rt-users
On Wed, Mar 23, 2022 at 7:57 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> Add a new trace-cmd command "analyze" that will look at a trace.dat file
> and report various statistics about the code.
>
> Start off by showing the amount each CPU is idle.
>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> tracecmd/Makefile | 1 +
> tracecmd/include/trace-local.h | 2 +
> tracecmd/trace-analyze.c | 508 +++++++++++++++++++++++++++++++++
> tracecmd/trace-cmd.c | 1 +
> 4 files changed, 512 insertions(+)
> create mode 100644 tracecmd/trace-analyze.c
>
> diff --git a/tracecmd/Makefile b/tracecmd/Makefile
> index 355f04723ad7..895294d09092 100644
> --- a/tracecmd/Makefile
> +++ b/tracecmd/Makefile
> @@ -37,6 +37,7 @@ TRACE_CMD_OBJS += trace-dump.o
> TRACE_CMD_OBJS += trace-clear.o
> TRACE_CMD_OBJS += trace-vm.o
> TRACE_CMD_OBJS += trace-convert.o
> +TRACE_CMD_OBJS += trace-analyze.o
>
> ifeq ($(VSOCK_DEFINED), 1)
> TRACE_CMD_OBJS += trace-agent.o
> diff --git a/tracecmd/include/trace-local.h b/tracecmd/include/trace-local.h
> index bb33de06bb58..f9d0745914fe 100644
> --- a/tracecmd/include/trace-local.h
> +++ b/tracecmd/include/trace-local.h
> @@ -112,6 +112,8 @@ void trace_dump(int argc, char **argv);
>
> void trace_convert(int argc, char **argv);
>
> +void trace_analyze(int argc, char **argv);
> +
> int trace_record_agent(struct tracecmd_msg_handle *msg_handle,
> int cpus, int *fds,
> int argc, char **argv, bool use_fifos,
> diff --git a/tracecmd/trace-analyze.c b/tracecmd/trace-analyze.c
> new file mode 100644
> index 000000000000..4a17cf12eb07
> --- /dev/null
> +++ b/tracecmd/trace-analyze.c
> @@ -0,0 +1,508 @@
> +// SPDX-License-Identifier: LGPL-2.1
> +/*
> + * Copyright (C) 2022 Google Inc, Steven Rostedt <srostedt@redhat.com>
> + */
> +#define _LARGEFILE64_SOURCE
> +#include <getopt.h>
> +#include <stdlib.h>
> +
> +#include "trace-cmd-local.h"
> +#include "trace-local.h"
> +#include "trace-hash.h"
> +#include "list.h"
> +
> +static bool time_in_nsecs;
> +
> +struct analysis_data;
> +
> +struct cpu_data {
> + unsigned long long start_ts;
> + unsigned long long last_ts;
> + struct analysis_data *data;
> + struct trace_hash tasks;
> + int nr_tasks;
> + int cpu;
> + int current_pid;
> + int first_pid;
> +};
> +
> +struct analysis_data {
> + unsigned long long start_ts;
> + unsigned long long last_ts;
> + struct cpu_data *cpu_data;
> + struct trace_hash tasks;
> + int nr_tasks;
> + int allocated_cpus;
> + int cpus;
Hi Steve,
I don't see analysis_data->cpus field ever used.
> +};
> +
> +struct task_item {
> + unsigned long long runtime;
> + unsigned long long start_ts;
> + struct trace_hash_item hash;
> + int pid;
> +};
> +
> +struct task_cpu_item {
> + struct trace_hash_item hash;
> + struct task_item *task;
> +};
> +
> +#define CPU_BLOCKS 32
> +
> +#define task_from_hash(item) container_of(item, struct task_item, hash)
> +#define task_cpu_from_hash(item) container_of(item, struct task_cpu_item, hash)
> +
> +static struct cpu_data *get_cpu_data(struct analysis_data *data,
> + struct tep_record *record)
> +{
> + struct cpu_data *cpu_data;
> + int cpu = record->cpu;
> + int cnt;
> +
> + if (cpu < data->allocated_cpus) {
> + cpu_data = &data->cpu_data[cpu];
> + if (!cpu_data->data)
> + goto init_cpu_data;
> + return cpu_data;
> + }
> +
> + /* Round up to CPU_BLOCKS multiplier */
> + cnt = ((cpu + CPU_BLOCKS) / CPU_BLOCKS) * CPU_BLOCKS;
> +
> + cpu_data = realloc(data->cpu_data, sizeof(*cpu_data) * cnt);
> +
> + if (!cpu_data)
> + die("Allocating cpu size %d for cpu %d", cnt, cpu);
> +
> + memset(cpu_data + data->allocated_cpus, 0,
> + sizeof(*cpu_data) * (cnt - data->allocated_cpus));
> +
> + data->allocated_cpus = cnt;
> +
> + data->cpu_data = cpu_data;
> + cpu_data += cpu;
> +
> + init_cpu_data:
> + cpu_data->current_pid = -1;
> + cpu_data->cpu = cpu;
> + cpu_data->data = data;
> +
> + trace_hash_init(&cpu_data->tasks, 32);
nit: #define CPU_TASKS_HASH_SIZE 32?
> +
> + return cpu_data;
> +}
> +
> +static int check_idle(struct cpu_data *cpu_data, int pid)
> +{
> + if (pid)
> + return pid;
> +
> + /*
> + * Since pid 0 is the per cpu swapper task that
> + * means several of these tasks have the same pid
> + * and only differentiate between CPUs. Set the pid
> + * that is stored in the hash as -2 - CPU id.
> + */
> +
> + return -2 - cpu_data->cpu;
> +}
> +
> +static struct task_item *get_task(struct cpu_data *cpu_data, int pid)
> +{
> + struct trace_hash_item *hash;
> + struct task_item *task;
> + int find_pid;
> +
> + find_pid = check_idle(cpu_data, pid);
> +
> + hash = trace_hash_find(&cpu_data->data->tasks, find_pid, NULL, NULL);
> + if (!hash) {
> + task = calloc(sizeof(*task), 1);
> + if (!task)
> + die("allocating task");
> + task->pid = find_pid;
> + hash = &task->hash;
> + hash->key = find_pid;
> + cpu_data->data->nr_tasks++;
> + trace_hash_add(&cpu_data->data->tasks, hash);
> + }
> +
> + return task_from_hash(hash);
> +}
> +
> +static struct task_cpu_item *get_cpu_task(struct cpu_data *cpu_data, int pid)
> +{
> + struct trace_hash_item *hash;
> + struct task_cpu_item *task;
> +
> + hash = trace_hash_find(&cpu_data->tasks, pid, NULL, NULL);
> + if (!hash) {
> + task = calloc(sizeof(*task), 1);
> + if (!task)
> + die("allocating cpu task");
> + task->task = get_task(cpu_data, pid);
> + hash = &task->hash;
> + hash->key = pid;
> + cpu_data->nr_tasks++;
> + trace_hash_add(&cpu_data->tasks, hash);
> + }
> +
> + return task_cpu_from_hash(hash);
> +};
> +
> +/* Update times for a task scheduling off the CPU */
> +static void update_cpu_task_times(struct cpu_data *cpu_data,
> + struct task_cpu_item *cpu_task,
> + unsigned long long ts)
> +{
> + unsigned long long delta;
> + struct task_item *task = cpu_task->task;
> +
> + /*
> + * If the last event was a sched switch where the previous task
> + * ran on another CPU, and migrated back to this CPU, and sched
> + * switch was not recorded (saying that this task scheduled off)
> + * It could be miscronstrued to still be on this CPU, and that
> + * its start_ts is later than the last_ts of this CPU.
> + */
> + if (ts < task->start_ts)
> + return;
> +
> + delta = ts - task->start_ts;
> + task->runtime += delta;
> +}
> +
> +static void update_pid(struct cpu_data *cpu_data,
> + struct tep_record *record, int pid)
> +{
> + struct task_cpu_item *cpu_task;
> + struct task_item *task;
> + int curr_pid = cpu_data->current_pid;
> + unsigned long long delta;
> +
> + cpu_task = get_cpu_task(cpu_data, curr_pid);
> + task = cpu_task->task;
> +
> + update_cpu_task_times(cpu_data, cpu_task, cpu_data->last_ts);
> +
> + if (!record)
> + return;
> +
> + cpu_task = get_cpu_task(cpu_data, pid);
> + task = cpu_task->task;
> +
> + if (record->ts < cpu_data->last_ts) {
> + tracecmd_warning("task %d start time %llu greater than CPU time %llu",
> + pid, record->ts, cpu_data->last_ts);
Shouldn’t this say, ... start time %llu "less than" CPU time ...?
> + } else {
> + delta = record->ts - cpu_data->last_ts;
> + task->runtime += delta;
> + }
> +
> + cpu_data->last_ts = record->ts;
> +
> + task->start_ts = cpu_data->last_ts;
> + cpu_data->current_pid = pid;
> +}
> +
> +static void update_cpu_times(struct cpu_data *cpu_data,
> + struct tep_handle *tep, int pid,
> + struct tep_record *record)
> +{
> + struct task_cpu_item *cpu_task;
> + struct task_item *task;
> +
> + if (cpu_data->current_pid < 0) {
> + /* First time called */
> + cpu_data->start_ts = record->ts;
> + cpu_data->last_ts = record->ts;
> + cpu_data->current_pid = pid;
> + cpu_data->first_pid = pid;
> + cpu_task = get_cpu_task(cpu_data, pid);
> + task = cpu_task->task;
> + task->start_ts = record->ts;
> + if (record->ts < cpu_data->data->start_ts)
> + cpu_data->data->start_ts = record->ts;
> + return;
> + }
> +
> + if (pid != cpu_data->current_pid) {
> + update_pid(cpu_data, record, pid);
> + return;
> + }
> +
> + cpu_data->last_ts = record->ts;
> +}
> +
> +static void update_first_pid(struct cpu_data *cpu_data)
> +{
> + struct task_cpu_item *cpu_task;
> + struct task_item *task;
> + unsigned long long start_ts = cpu_data->data->start_ts;
> + unsigned long long delta;
> +
> + cpu_task = get_cpu_task(cpu_data, cpu_data->first_pid);
> + task = cpu_task->task;
> + delta = cpu_data->start_ts - start_ts;
> + task->runtime += delta;
> + cpu_data->start_ts = start_ts;
> +}
> +
> +static void process_cpu(struct analysis_data *data,
> + struct tep_handle *tep,
> + struct tep_record *record)
> +{
> + struct cpu_data *cpu_data;
> + int pid;
> +
> + pid = tep_data_pid(tep, record);
> + if (pid < 0) /* Warn? */
> + return;
> +
> + cpu_data = get_cpu_data(data, record);
> + update_cpu_times(cpu_data, tep, pid, record);
> +}
> +
> +static void print_time(unsigned long long ts, char delim)
> +{
> + unsigned long long secs;
> + unsigned long long msecs;
> + unsigned long long usecs;
> + unsigned long long nsecs;
> +
> + secs = ts / 1000000000;
> + ts -= secs * 1000000000;
> +
> + msecs = ts / 1000000;
> + ts -= msecs * 1000000;
> +
> + usecs = ts / 1000;
> + ts -= usecs * 1000;
> +
> + nsecs = ts;
> +
> + if (delim) {
> + printf("%6llu.%03llu%c%03llu",
> + secs, msecs, delim, usecs);
> + if (time_in_nsecs)
> + printf("%c%03llu", delim, nsecs);
> + } else {
> + printf("%6llu.%03llu%03llu",
> + secs, msecs, usecs);
> + if (time_in_nsecs)
> + printf("%03llu", nsecs);
> + }
> +}
> +
> +static void print_total(struct tep_handle *tep, struct analysis_data *data)
> +{
> + unsigned long long total_time;
> + struct trace_hash_item **bucket;
> + struct trace_hash_item *item;
> + struct task_item **idle_tasks;
> + struct task_item *task;
> + bool first = true;
> + int cpu;
> + int i = 0;
> +
> + total_time = data->last_ts - data->start_ts;
> + printf("\nTotal time: ");
> + print_time(total_time, '_');
> + printf("\n");
> +
> + idle_tasks = calloc(sizeof(*idle_tasks), data->allocated_cpus);
> + if (!idle_tasks)
> + die("Could not allocate idle task array");
> +
> + trace_hash_for_each_bucket(bucket, &data->tasks) {
> + trace_hash_for_each_item(item, bucket) {
> + task = task_from_hash(item);
> + if (task->pid < 0) {
> + cpu = -2 - task->pid;
> + idle_tasks[cpu] = task;
> + }
> + }
> + }
> +
> + for (i = 0; i < data->allocated_cpus; i++) {
> + if (!idle_tasks[i])
> + continue;
> +
> + if (first) {
> + printf("\n Idle CPU\t Run time\n");
> + printf(" --------\t --------\n");
> + first = false;
> + }
> + printf("CPU %d idle:\t", i);
> + print_time(idle_tasks[i]->runtime, '_');
> + printf(" (%%%lld)\n", (idle_tasks[i]->runtime * 100) / total_time);
> + }
> + free(idle_tasks);
> +}
> +
> +static void free_tasks(struct trace_hash *hash)
> +{
> + struct trace_hash_item **bucket;
> + struct trace_hash_item *item;
> + struct task_item *task;
> +
> + trace_hash_for_each_bucket(bucket, hash) {
> + trace_hash_while_item(item, bucket) {
> + task = task_from_hash(item);
> + trace_hash_del(item);
> + free(task);
> + }
> + }
> + trace_hash_free(hash);
> +}
> +
> +static void free_cpu_tasks(struct trace_hash *hash)
> +{
> + struct trace_hash_item **bucket;
> + struct trace_hash_item *item;
> + struct task_cpu_item *cpu_task;
> +
> + trace_hash_for_each_bucket(bucket, hash) {
> + trace_hash_while_item(item, bucket) {
> + cpu_task = task_cpu_from_hash(item);
> + trace_hash_del(item);
> + free(cpu_task);
> + }
> + }
> + trace_hash_free(hash);
> +}
> +
> +static void free_cpus(struct analysis_data *data)
> +{
> + struct cpu_data *cpu_data;
> + int i;
> +
> + for (i = 0; i < data->allocated_cpus; i++) {
> + cpu_data = &data->cpu_data[i];
> + if (!cpu_data->data)
> + continue;
> + free_cpu_tasks(&cpu_data->tasks);
> + }
> + free(data->cpu_data);
> +}
> +
> +static void do_trace_analyze(struct tracecmd_input *handle)
> +{
> + struct tep_handle *tep = tracecmd_get_tep(handle);
> + struct tep_record *record;
> + struct analysis_data data;
> + struct cpu_data *cpu_data;
> + int i;
> +
> + memset(&data, 0, sizeof(data));
> +
> + trace_hash_init(&data.tasks, 128);
> +
> + /* Set to a very large number */
> + data.start_ts = -1ULL;
> +
> + do {
> + record = tracecmd_read_next_data(handle, NULL);
> + if (record)
> + process_cpu(&data, tep, record);
> + tracecmd_free_record(record);
> + } while (record);
> +
> + for (i = 0; i < data.allocated_cpus; i++) {
> + cpu_data = &data.cpu_data[i];
> + if (!cpu_data->data || !cpu_data->nr_tasks)
> + continue;
> + if (cpu_data->last_ts > data.last_ts)
> + data.last_ts = cpu_data->last_ts;
> + }
> +
> + for (i = 0; i < data.allocated_cpus; i++) {
> + cpu_data = &data.cpu_data[i];
> + if (!cpu_data->data || !cpu_data->nr_tasks)
> + continue;
> + cpu_data->last_ts = data.last_ts;
> + update_pid(cpu_data, NULL, -1);
I think having update_last_pid() makes this easier to read.
Thanks,
Kalesh
> + update_first_pid(cpu_data);
> + }
> +
> + print_total(tep, &data);
> +
> + free_cpus(&data);
> + free_tasks(&data.tasks);
> +}
> +
> +void trace_analyze(int argc, char **argv)
> +{
> + struct tracecmd_input *handle;
> + const char *input_file = NULL;
> + int instances;
> + int ret;
> +
> + for (;;) {
> + int c;
> +
> + c = getopt(argc-1, argv+1, "+hti:");
> + if (c == -1)
> + break;
> + switch (c) {
> + case 'h':
> + usage(argv);
> + break;
> + case 'i':
> + if (input_file)
> + die("Only one input for historgram");
> + input_file = optarg;
> + break;
> + case 't':
> + time_in_nsecs = true;
> + break;
> + default:
> + usage(argv);
> + }
> + }
> +
> + if ((argc - optind) >= 2) {
> + if (input_file)
> + usage(argv);
> + input_file = argv[optind + 1];
> + }
> +
> + if (!input_file)
> + input_file = DEFAULT_INPUT_FILE;
> +
> + handle = tracecmd_alloc(input_file, 0);
> + if (!handle)
> + die("can't open %s\n", input_file);
> +
> + ret = tracecmd_read_headers(handle, 0);
> + if (ret)
> + return;
> +
> + ret = tracecmd_init_data(handle);
> + if (ret < 0)
> + die("failed to init data");
> +
> + if (ret > 0)
> + die("trace-cmd hist does not work with latency traces\n");
> +
> + do_trace_analyze(handle);
> +
> + instances = tracecmd_buffer_instances(handle);
> + if (instances) {
> + struct tracecmd_input *new_handle;
> + int i;
> +
> + for (i = 0; i < instances; i++) {
> + new_handle = tracecmd_buffer_instance_handle(handle, i);
> + if (!new_handle) {
> + warning("could not retrieve handle %d", i);
> + continue;
> + }
> + do_trace_analyze(new_handle);
> + tracecmd_close(new_handle);
> + }
> + }
> +
> + tracecmd_close(handle);
> +}
> diff --git a/tracecmd/trace-cmd.c b/tracecmd/trace-cmd.c
> index a83a8d0bb9e4..d6e1f9dacc3c 100644
> --- a/tracecmd/trace-cmd.c
> +++ b/tracecmd/trace-cmd.c
> @@ -134,6 +134,7 @@ struct command commands[] = {
> {"help", trace_usage},
> {"dump", trace_dump},
> {"convert", trace_convert},
> + {"analyze", trace_analyze},
> {"-h", trace_usage},
> };
>
> --
> 2.35.1
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 03/12] trace-cmd analyze: Show how much tasks run on each CPU
2022-03-26 15:16 ` Steven Rostedt
@ 2022-03-28 16:23 ` Kalesh Singh
0 siblings, 0 replies; 27+ messages in thread
From: Kalesh Singh @ 2022-03-28 16:23 UTC (permalink / raw)
To: Steven Rostedt
Cc: John Keeping, linux-trace-devel, williskung, linux-rt-users
On Sat, Mar 26, 2022 at 8:16 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Sat, 26 Mar 2022 11:07:03 +0000
> John Keeping <john@metanate.com> wrote:
>
> > > > > + print_time(cpu_tasks[i]->runtime, '_');
> > > > > + printf(" (%%%lld)\n", (task->runtime * 100) / total_time);
> > > >
> > > > Is there a reason for using the CPU-specific runtime for the value and
> > > > the total runtime for the percentage?
> > > >
> > > > I expected the percentage to be the percentage of this CPU's time spend
> > > > running the task.
> > >
> > > We modify it so that each CPU has the same run time, unless there's missed
> > > events at the start (later patches), and then we change total_time to be
> > > the total time of the events on the CPU and not the entire trace.
> >
> > I think we're talking about different things here, I probably wasn't
> > entirely clear about this. It's the numerator of this division that I'm
> > concerned about and I wonder if this should be:
> >
> > (cpu_tasks[i]->runtime * 100) / total_time
>
> Ah, I did misunderstand you. Yes, that's a typo. Thanks for pointing
> that out. I'll go fix it.
> > > > > + if (idle_task) {
> > > > > + printf("idle:\t");
> > > > > + print_time(idle_task->runtime, '_');
> > > > > + printf(" (%%%lld)\n", (idle_task->runtime * 100) / total_time);
IIUC the idle_task->runtime works out the same as the per-cpu runtime.
But for consistency I think we should use idle_cpu_task->runtime here
as well
Thanks,
Kalesh
>
> -- Steve
>
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 01/12] trace-cmd: Add trace-cmd analyze
2022-03-28 16:08 ` Kalesh Singh
@ 2022-03-28 16:34 ` Steven Rostedt
0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2022-03-28 16:34 UTC (permalink / raw)
To: Kalesh Singh; +Cc: linux-trace-devel, williskung, linux-rt-users
On Mon, 28 Mar 2022 09:08:02 -0700
Kalesh Singh <kaleshsingh@google.com> wrote:
> > +++ b/tracecmd/trace-analyze.c
> > @@ -0,0 +1,508 @@
> > +// SPDX-License-Identifier: LGPL-2.1
> > +/*
> > + * Copyright (C) 2022 Google Inc, Steven Rostedt <srostedt@redhat.com>
> > + */
> > +#define _LARGEFILE64_SOURCE
> > +#include <getopt.h>
> > +#include <stdlib.h>
> > +
> > +#include "trace-cmd-local.h"
> > +#include "trace-local.h"
> > +#include "trace-hash.h"
> > +#include "list.h"
> > +
> > +static bool time_in_nsecs;
> > +
> > +struct analysis_data;
> > +
> > +struct cpu_data {
> > + unsigned long long start_ts;
> > + unsigned long long last_ts;
> > + struct analysis_data *data;
> > + struct trace_hash tasks;
> > + int nr_tasks;
> > + int cpu;
> > + int current_pid;
> > + int first_pid;
> > +};
> > +
> > +struct analysis_data {
> > + unsigned long long start_ts;
> > + unsigned long long last_ts;
> > + struct cpu_data *cpu_data;
> > + struct trace_hash tasks;
> > + int nr_tasks;
> > + int allocated_cpus;
> > + int cpus;
>
> Hi Steve,
>
> I don't see analysis_data->cpus field ever used.
Probably not. I was debating about adding RFC to this series, but decided
not to. Now I regret that, because it really is an RFC (as well as I rushed
a lot of it).
>
> > +};
> > +
> > +struct task_item {
> > + unsigned long long runtime;
> > + unsigned long long start_ts;
> > + struct trace_hash_item hash;
> > + int pid;
> > +};
> > +
> > +struct task_cpu_item {
> > + struct trace_hash_item hash;
> > + struct task_item *task;
> > +};
> > +
> > +#define CPU_BLOCKS 32
> > +
> > +#define task_from_hash(item) container_of(item, struct task_item, hash)
> > +#define task_cpu_from_hash(item) container_of(item, struct task_cpu_item, hash)
> > +
> > +static struct cpu_data *get_cpu_data(struct analysis_data *data,
> > + struct tep_record *record)
> > +{
> > + struct cpu_data *cpu_data;
> > + int cpu = record->cpu;
> > + int cnt;
> > +
> > + if (cpu < data->allocated_cpus) {
> > + cpu_data = &data->cpu_data[cpu];
> > + if (!cpu_data->data)
> > + goto init_cpu_data;
> > + return cpu_data;
> > + }
> > +
> > + /* Round up to CPU_BLOCKS multiplier */
> > + cnt = ((cpu + CPU_BLOCKS) / CPU_BLOCKS) * CPU_BLOCKS;
> > +
> > + cpu_data = realloc(data->cpu_data, sizeof(*cpu_data) * cnt);
> > +
> > + if (!cpu_data)
> > + die("Allocating cpu size %d for cpu %d", cnt, cpu);
> > +
> > + memset(cpu_data + data->allocated_cpus, 0,
> > + sizeof(*cpu_data) * (cnt - data->allocated_cpus));
> > +
> > + data->allocated_cpus = cnt;
> > +
> > + data->cpu_data = cpu_data;
> > + cpu_data += cpu;
> > +
> > + init_cpu_data:
> > + cpu_data->current_pid = -1;
> > + cpu_data->cpu = cpu;
> > + cpu_data->data = data;
> > +
> > + trace_hash_init(&cpu_data->tasks, 32);
>
> nit: #define CPU_TASKS_HASH_SIZE 32?
Agreed.
>
> > +
> > + return cpu_data;
> > +}
> > +
> > +static int check_idle(struct cpu_data *cpu_data, int pid)
> > +{
> > + if (pid)
> > + return pid;
> > +
> > + /*
> > + * Since pid 0 is the per cpu swapper task that
> > + * means several of these tasks have the same pid
> > + * and only differentiate between CPUs. Set the pid
> > + * that is stored in the hash as -2 - CPU id.
> > + */
> > +
> > + return -2 - cpu_data->cpu;
> > +}
> > +
> > +static struct task_item *get_task(struct cpu_data *cpu_data, int pid)
> > +{
> > + struct trace_hash_item *hash;
> > + struct task_item *task;
> > + int find_pid;
> > +
> > + find_pid = check_idle(cpu_data, pid);
> > +
> > + hash = trace_hash_find(&cpu_data->data->tasks, find_pid, NULL, NULL);
> > + if (!hash) {
> > + task = calloc(sizeof(*task), 1);
> > + if (!task)
> > + die("allocating task");
> > + task->pid = find_pid;
> > + hash = &task->hash;
> > + hash->key = find_pid;
> > + cpu_data->data->nr_tasks++;
> > + trace_hash_add(&cpu_data->data->tasks, hash);
> > + }
> > +
> > + return task_from_hash(hash);
> > +}
> > +
> > +static struct task_cpu_item *get_cpu_task(struct cpu_data *cpu_data, int pid)
> > +{
> > + struct trace_hash_item *hash;
> > + struct task_cpu_item *task;
> > +
> > + hash = trace_hash_find(&cpu_data->tasks, pid, NULL, NULL);
> > + if (!hash) {
> > + task = calloc(sizeof(*task), 1);
> > + if (!task)
> > + die("allocating cpu task");
> > + task->task = get_task(cpu_data, pid);
> > + hash = &task->hash;
> > + hash->key = pid;
> > + cpu_data->nr_tasks++;
> > + trace_hash_add(&cpu_data->tasks, hash);
> > + }
> > +
> > + return task_cpu_from_hash(hash);
> > +};
> > +
> > +/* Update times for a task scheduling off the CPU */
> > +static void update_cpu_task_times(struct cpu_data *cpu_data,
> > + struct task_cpu_item *cpu_task,
> > + unsigned long long ts)
> > +{
> > + unsigned long long delta;
> > + struct task_item *task = cpu_task->task;
> > +
> > + /*
> > + * If the last event was a sched switch where the previous task
> > + * ran on another CPU, and migrated back to this CPU, and sched
> > + * switch was not recorded (saying that this task scheduled off)
> > + * It could be miscronstrued to still be on this CPU, and that
> > + * its start_ts is later than the last_ts of this CPU.
> > + */
> > + if (ts < task->start_ts)
> > + return;
> > +
> > + delta = ts - task->start_ts;
> > + task->runtime += delta;
> > +}
> > +
> > +static void update_pid(struct cpu_data *cpu_data,
> > + struct tep_record *record, int pid)
> > +{
> > + struct task_cpu_item *cpu_task;
> > + struct task_item *task;
> > + int curr_pid = cpu_data->current_pid;
> > + unsigned long long delta;
> > +
> > + cpu_task = get_cpu_task(cpu_data, curr_pid);
> > + task = cpu_task->task;
> > +
> > + update_cpu_task_times(cpu_data, cpu_task, cpu_data->last_ts);
> > +
> > + if (!record)
> > + return;
> > +
> > + cpu_task = get_cpu_task(cpu_data, pid);
> > + task = cpu_task->task;
> > +
> > + if (record->ts < cpu_data->last_ts) {
> > + tracecmd_warning("task %d start time %llu greater than CPU time %llu",
> > + pid, record->ts, cpu_data->last_ts);
>
> Shouldn’t this say, ... start time %llu "less than" CPU time ...?
Yeah, that wording is confusing.
"task %d CPU start time %llu is before last event %llu on CPU for a different task"
?
>
> > + } else {
> > + delta = record->ts - cpu_data->last_ts;
> > + task->runtime += delta;
> > + }
> > +
> > + cpu_data->last_ts = record->ts;
> > +
> > + task->start_ts = cpu_data->last_ts;
> > + cpu_data->current_pid = pid;
> > +}
> > +
> > +static void update_cpu_times(struct cpu_data *cpu_data,
> > + struct tep_handle *tep, int pid,
> > + struct tep_record *record)
> > +{
> > + struct task_cpu_item *cpu_task;
> > + struct task_item *task;
> > +
> > + if (cpu_data->current_pid < 0) {
> > + /* First time called */
> > + cpu_data->start_ts = record->ts;
> > + cpu_data->last_ts = record->ts;
> > + cpu_data->current_pid = pid;
> > + cpu_data->first_pid = pid;
> > + cpu_task = get_cpu_task(cpu_data, pid);
> > + task = cpu_task->task;
> > + task->start_ts = record->ts;
> > + if (record->ts < cpu_data->data->start_ts)
> > + cpu_data->data->start_ts = record->ts;
> > + return;
> > + }
> > +
> > + if (pid != cpu_data->current_pid) {
> > + update_pid(cpu_data, record, pid);
> > + return;
> > + }
> > +
> > + cpu_data->last_ts = record->ts;
> > +}
> > +
> > +static void update_first_pid(struct cpu_data *cpu_data)
> > +{
> > + struct task_cpu_item *cpu_task;
> > + struct task_item *task;
> > + unsigned long long start_ts = cpu_data->data->start_ts;
> > + unsigned long long delta;
> > +
> > + cpu_task = get_cpu_task(cpu_data, cpu_data->first_pid);
> > + task = cpu_task->task;
> > + delta = cpu_data->start_ts - start_ts;
> > + task->runtime += delta;
> > + cpu_data->start_ts = start_ts;
> > +}
> > +
> > +static void process_cpu(struct analysis_data *data,
> > + struct tep_handle *tep,
> > + struct tep_record *record)
> > +{
> > + struct cpu_data *cpu_data;
> > + int pid;
> > +
> > + pid = tep_data_pid(tep, record);
> > + if (pid < 0) /* Warn? */
> > + return;
> > +
> > + cpu_data = get_cpu_data(data, record);
> > + update_cpu_times(cpu_data, tep, pid, record);
> > +}
> > +
> > +static void print_time(unsigned long long ts, char delim)
> > +{
> > + unsigned long long secs;
> > + unsigned long long msecs;
> > + unsigned long long usecs;
> > + unsigned long long nsecs;
> > +
> > + secs = ts / 1000000000;
> > + ts -= secs * 1000000000;
> > +
> > + msecs = ts / 1000000;
> > + ts -= msecs * 1000000;
> > +
> > + usecs = ts / 1000;
> > + ts -= usecs * 1000;
> > +
> > + nsecs = ts;
> > +
> > + if (delim) {
> > + printf("%6llu.%03llu%c%03llu",
> > + secs, msecs, delim, usecs);
> > + if (time_in_nsecs)
> > + printf("%c%03llu", delim, nsecs);
> > + } else {
> > + printf("%6llu.%03llu%03llu",
> > + secs, msecs, usecs);
> > + if (time_in_nsecs)
> > + printf("%03llu", nsecs);
> > + }
> > +}
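
Just to illustrate the formatting, a couple of sample calls (values made up):

	print_time(5123456789ULL, '_');	/* prints "     5.123_456" ("     5.123_456_789" with time_in_nsecs) */
	print_time(5123456789ULL, 0);	/* prints "     5.123456"  ("     5.123456789" with time_in_nsecs) */
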
> > +
> > +static void print_total(struct tep_handle *tep, struct analysis_data *data)
> > +{
> > + unsigned long long total_time;
> > + struct trace_hash_item **bucket;
> > + struct trace_hash_item *item;
> > + struct task_item **idle_tasks;
> > + struct task_item *task;
> > + bool first = true;
> > + int cpu;
> > + int i = 0;
> > +
> > + total_time = data->last_ts - data->start_ts;
> > + printf("\nTotal time: ");
> > + print_time(total_time, '_');
> > + printf("\n");
> > +
> > +	idle_tasks = calloc(data->allocated_cpus, sizeof(*idle_tasks));
> > + if (!idle_tasks)
> > + die("Could not allocate idle task array");
> > +
> > + trace_hash_for_each_bucket(bucket, &data->tasks) {
> > + trace_hash_for_each_item(item, bucket) {
> > + task = task_from_hash(item);
> > + if (task->pid < 0) {
> > + cpu = -2 - task->pid;
> > + idle_tasks[cpu] = task;
> > + }
> > + }
> > + }
> > +
> > + for (i = 0; i < data->allocated_cpus; i++) {
> > + if (!idle_tasks[i])
> > + continue;
> > +
> > + if (first) {
> > + printf("\n Idle CPU\t Run time\n");
> > + printf(" --------\t --------\n");
> > + first = false;
> > + }
> > + printf("CPU %d idle:\t", i);
> > + print_time(idle_tasks[i]->runtime, '_');
> > + printf(" (%%%lld)\n", (idle_tasks[i]->runtime * 100) / total_time);
> > + }
> > + free(idle_tasks);
> > +}
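
(Side note on the pid < 0 decode above: the idle placeholders are presumably
stored with pid == -2 - cpu earlier in the patch, since that is the inverse
of "cpu = -2 - task->pid": -2 maps back to CPU 0, -3 to CPU 1, and so on,
and the negative range can never collide with real pids, which are >= 0.)
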
> > +
> > +static void free_tasks(struct trace_hash *hash)
> > +{
> > + struct trace_hash_item **bucket;
> > + struct trace_hash_item *item;
> > + struct task_item *task;
> > +
> > + trace_hash_for_each_bucket(bucket, hash) {
> > + trace_hash_while_item(item, bucket) {
> > + task = task_from_hash(item);
> > + trace_hash_del(item);
> > + free(task);
> > + }
> > + }
> > + trace_hash_free(hash);
> > +}
> > +
> > +static void free_cpu_tasks(struct trace_hash *hash)
> > +{
> > + struct trace_hash_item **bucket;
> > + struct trace_hash_item *item;
> > + struct task_cpu_item *cpu_task;
> > +
> > + trace_hash_for_each_bucket(bucket, hash) {
> > + trace_hash_while_item(item, bucket) {
> > + cpu_task = task_cpu_from_hash(item);
> > + trace_hash_del(item);
> > + free(cpu_task);
> > + }
> > + }
> > + trace_hash_free(hash);
> > +}
> > +
> > +static void free_cpus(struct analysis_data *data)
> > +{
> > + struct cpu_data *cpu_data;
> > + int i;
> > +
> > + for (i = 0; i < data->allocated_cpus; i++) {
> > + cpu_data = &data->cpu_data[i];
> > + if (!cpu_data->data)
> > + continue;
> > + free_cpu_tasks(&cpu_data->tasks);
> > + }
> > + free(data->cpu_data);
> > +}
> > +
> > +static void do_trace_analyze(struct tracecmd_input *handle)
> > +{
> > + struct tep_handle *tep = tracecmd_get_tep(handle);
> > + struct tep_record *record;
> > + struct analysis_data data;
> > + struct cpu_data *cpu_data;
> > + int i;
> > +
> > + memset(&data, 0, sizeof(data));
> > +
> > + trace_hash_init(&data.tasks, 128);
> > +
> > + /* Set to a very large number */
> > + data.start_ts = -1ULL;
> > +
> > + do {
> > + record = tracecmd_read_next_data(handle, NULL);
> > + if (record)
> > + process_cpu(&data, tep, record);
> > + tracecmd_free_record(record);
> > + } while (record);
> > +
> > + for (i = 0; i < data.allocated_cpus; i++) {
> > + cpu_data = &data.cpu_data[i];
> > + if (!cpu_data->data || !cpu_data->nr_tasks)
> > + continue;
> > + if (cpu_data->last_ts > data.last_ts)
> > + data.last_ts = cpu_data->last_ts;
> > + }
> > +
> > + for (i = 0; i < data.allocated_cpus; i++) {
> > + cpu_data = &data.cpu_data[i];
> > + if (!cpu_data->data || !cpu_data->nr_tasks)
> > + continue;
> > + cpu_data->last_ts = data.last_ts;
> > + update_pid(cpu_data, NULL, -1);
>
> I think having update_last_pid() makes this easier to read.
You mean to add a helper function that does the last pid and have
update_pid() start with that?
I could do that.
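
Roughly something like this, I take it (untested sketch, using your
suggested name):

static void update_last_pid(struct cpu_data *cpu_data)
{
	struct task_cpu_item *cpu_task;

	cpu_task = get_cpu_task(cpu_data, cpu_data->current_pid);
	update_cpu_task_times(cpu_data, cpu_task, cpu_data->last_ts);
}

update_pid() would then start by calling update_last_pid(), and the final
flush loop in do_trace_analyze() could call update_last_pid(cpu_data)
directly instead of update_pid(cpu_data, NULL, -1).
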
Thanks for looking at this.
Note, I'm going to rewrite this so that it is actually part of the library,
giving other applications access to this functionality.
-- Steve