From: "Török Edwin" <edwintorok@gmail.com>
To: mingo@elte.hu, srostedt@redhat.com
Cc: a.p.zijlstra@chello.nl, sandmann@daimi.au.dk,
linux-kernel@vger.kernel.org,
"Török Edwin" <edwintorok@gmail.com>
Subject: [PATCH 4/4] Implement semaphore latency tracer
Date: Sun, 12 Oct 2008 16:12:04 +0300 [thread overview]
Message-ID: <1223817124-27239-6-git-send-email-edwintorok@gmail.com> (raw)
In-Reply-To: <1223817124-27239-1-git-send-email-edwintorok@gmail.com>
Each time a down_read or down_write fails, a unique latency id is generated.
Later when someone releases the semaphore, it is blamed for the latency of all
tasks on the wait_list of the semaphore.
If you would group the output from latency_trace by the latency_id you get all those
who were contending on a lock, and the tasks that were holding the lock.
An entry in latency_trace has the format:
(latency_id) [semaphore_id] read|write
stacktrace <= stacktrace
Signed-off-by: Török Edwin <edwintorok@gmail.com>
---
kernel/trace/Kconfig | 12 +++
kernel/trace/Makefile | 1 +
kernel/trace/trace.c | 105 +++++++++++++++++++++----
kernel/trace/trace.h | 32 ++++++++
kernel/trace/trace_semlat.c | 186 +++++++++++++++++++++++++++++++++++++++++++
5 files changed, 321 insertions(+), 15 deletions(-)
create mode 100644 kernel/trace/trace_semlat.c
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 396aea1..8faac14 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -144,6 +144,18 @@ config STACK_TRACER
This tracer records the max stack of the kernel, and displays
it in debugfs/tracing/stack_trace
+config SEMLAT_TRACER
+ bool "Semaphore latency tracer"
+ depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
+ select TRACING
+ select TRACEPOINTS
+ select TRACER_MAX_TRACE
+ help
+ This tracer records the reason for semaphore contention, showing
+ who is holding a lock for too long and causing other tasks to contend
+ on the semaphore. Currently it instruments rwsem and rwsem-spinlock with tracepoints.
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index a85dfba..d037df3 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -24,5 +24,6 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+obj-$(CONFIG_SEMLAT_TRACER) += trace_semlat.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 722ab74..bd9145a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -752,26 +752,42 @@ void __trace_stack(struct trace_array *tr,
ftrace_trace_stack(tr, data, flags, skip, preempt_count());
}
-static void __trace_userstack(struct trace_array *tr,
+static void ftrace_trace_userstack(struct trace_array *tr,
struct trace_array_cpu *data,
- unsigned long flags)
+ unsigned long flags, int pc)
{
- struct trace_entry *entry;
+ struct userstack_entry *entry;
struct stack_trace trace;
+ struct ring_buffer_event *event;
+ unsigned long irq_flags;
if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, flags);
- entry->type = TRACE_USER_STACK;
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_USER_STACK;
+
+ memset(&entry->caller, 0, sizeof(entry->caller));
- memset(&entry->field.stack, 0, sizeof(entry->field.stack));
trace.nr_entries = 0;
trace.max_entries = FTRACE_STACK_ENTRIES;
trace.skip = 0;
- trace.entries = entry->field.stack.caller;
+ trace.entries = entry->caller;
+
save_stack_trace_user(&trace);
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+}
+
+void __trace_userstack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags)
+{
+ ftrace_trace_userstack(tr, data, flags, preempt_count());
}
static void
@@ -810,6 +826,36 @@ __trace_special(void *__tr, void *__data,
}
void
+trace_semlat(struct trace_array *tr, struct trace_array_cpu *data,
+ const struct semlat_data *sdata, int samestack)
+{
+ struct ring_buffer_event *event;
+ struct semlat_entry *entry;
+ unsigned long irq_flags;
+ int pc = preempt_count();
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, pc);
+ entry->ent.type = TRACE_SEM;
+ entry->data = *sdata;
+ entry->data.pid = current->pid;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ if (!samestack) {
+ /* show a stack trace only if different from previous */
+ __trace_stack(tr, data, irq_flags, 3);
+ __trace_userstack(tr, data, irq_flags);
+ }
+ ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+ __trace_userstack(tr, data, irq_flags);
+
+ trace_wake_up();
+}
+
+void
tracing_sched_switch_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *prev,
@@ -1192,7 +1238,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
}
static int
-seq_print_userip_objs(const struct trace_entry *entry, struct trace_seq *s,
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
unsigned long sym_flags)
{
struct mm_struct *mm = NULL;
@@ -1204,7 +1250,7 @@ seq_print_userip_objs(const struct trace_entry *entry, struct trace_seq *s,
/* we do the lookup on the thread group leader,
* since individual threads might have already quit! */
rcu_read_lock();
- task = find_task_by_vpid(entry->field.tgid);
+ task = find_task_by_vpid(entry->ent.tgid);
rcu_read_unlock();
if (task)
@@ -1212,7 +1258,7 @@ seq_print_userip_objs(const struct trace_entry *entry, struct trace_seq *s,
}
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- unsigned long ip = entry->field.stack.caller[i];
+ unsigned long ip = entry->caller[i];
unsigned long vmstart = 0;
struct file *file = NULL;
@@ -1532,10 +1578,34 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
trace_seq_print_cont(s, iter);
break;
}
- case TRACE_USER_STACK:
- seq_print_userip_objs(entry, s, sym_flags);
+ case TRACE_USER_STACK: {
+ struct userstack_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_userip_objs(field, s, sym_flags);
trace_seq_putc(s, '\n');
break;
+ }
+ case TRACE_SEM: {
+ struct semlat_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (field->data.pid == field->ent.pid) {
+ /* task that suffered the latency */
+ comm = trace_find_cmdline(field->data.pid);
+ trace_seq_printf(s, " (%u) %s["IP_FMT"] %s %s\n",
+ field->data.latency_id,
+ field->data.name ? field->data.name : "",
+ field->data.semaphore,
+ field->data.kind_read ? "read" : "write",
+ comm);
+ } else
+ trace_seq_printf(s, " (%u)\n",
+ field->data.latency_id);
+ break;
+ }
default:
trace_seq_printf(s, "Unknown type %d\n", entry->type);
}
@@ -1671,8 +1741,13 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (entry->flags & TRACE_FLAG_CONT)
trace_seq_print_cont(s, iter);
break;
- case TRACE_USER_STACK:
- ret = seq_print_userip_objs(entry, s, sym_flags);
+ }
+ case TRACE_USER_STACK: {
+ struct userstack_entry *field;
+
+ trace_assign_type(field, entry);
+
+ ret = seq_print_userip_objs(field, s, sym_flags);
if (!ret)
return 0;
ret = trace_seq_putc(s, '\n');
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 59d3b17..d734561 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -23,6 +23,7 @@ enum trace_type {
TRACE_MMIO_MAP,
TRACE_BOOT,
TRACE_USER_STACK,
+ TRACE_SEM,
__TRACE_LAST_TYPE
};
@@ -39,6 +40,7 @@ struct trace_entry {
unsigned char flags;
unsigned char preempt_count;
int pid;
+ int tgid;
};
/*
@@ -86,6 +88,11 @@ struct stack_entry {
unsigned long caller[FTRACE_STACK_ENTRIES];
};
+struct userstack_entry {
+ struct trace_entry ent;
+ unsigned long caller[FTRACE_STACK_ENTRIES];
+};
+
/*
* ftrace_printk entry:
*/
@@ -119,6 +126,25 @@ struct trace_boot {
};
/*
+ * semaphore-latency entry
+ */
+
+struct semlat_data {
+ const void *semaphore;
+ const char *name;
+ unsigned int latency_id; /* a unique id for this latency instance */
+ unsigned int kind_read;
+ unsigned int pid;/* pid of task that suffered the latency */
+ cycle_t t;/* start of latency - lock contention */
+ int cpu;
+};
+
+struct semlat_entry {
+ struct trace_entry ent;
+ struct semlat_data data;
+};
+
+/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
* IRQS_OFF - interrupts were disabled
@@ -211,6 +237,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
+ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct special_entry, 0); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
@@ -218,6 +245,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
TRACE_MMIO_MAP); \
IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
+ IF_ASSIGN(var, ent, struct semlat_entry, TRACE_SEM); \
__ftrace_bad_type(); \
} while (0)
@@ -314,6 +342,10 @@ void trace_special(struct trace_array *tr,
unsigned long arg1,
unsigned long arg2,
unsigned long arg3, int pc);
+
+void trace_semlat(struct trace_array *tr, struct trace_array_cpu *data,
+ const struct semlat_data *sdata, int samestack);
+
void trace_function(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long ip,
diff --git a/kernel/trace/trace_semlat.c b/kernel/trace/trace_semlat.c
new file mode 100644
index 0000000..d72e3e3
--- /dev/null
+++ b/kernel/trace/trace_semlat.c
@@ -0,0 +1,186 @@
+/*
+ * Semaphore latency tracer
+ *
+ * Copyright (C) 2008 Török Edwin <edwintorok@gmail.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <trace/rwsem.h>
+#include "trace.h"
+
+static struct trace_array *ctx_trace;
+static int __read_mostly tracer_enabled;
+
+static struct semlat_data map_pid_to_latency[PID_MAX_DEFAULT+1];
+
+static inline int report_latency(unsigned long delta)
+{
+ if (tracing_thresh) {
+ if (delta < tracing_thresh)
+ return 0;
+ } else {
+ if (delta <= tracing_max_latency)
+ return 0;
+ }
+ return 1;
+}
+
+static inline void probe_rwsem_wake(const struct rw_semaphore *sem,
+ const struct rwsem_waiter *waiter)
+{
+ /* first task in waitlist is oldest, so calculate delta against it */
+ const struct semlat_data *data;
+ struct trace_array_cpu *cpudata;
+ unsigned long T0, T1, delta;
+ int cpu, first;
+
+ if (!tracer_enabled)
+ return;
+ data = &map_pid_to_latency[waiter->task->pid];
+
+ if (data->cpu == -1)
+ return;
+ T0 = data->t;
+ T1 = ftrace_now(data->cpu);
+ delta = T1-T0;
+
+ if (!report_latency(delta))
+ return;
+ cpu = raw_smp_processor_id();
+ cpudata = ctx_trace->data[cpu];
+ /* blame the current task for latency of all tasks on wait list
+ * note that some of those are being woken now, but that doesn't matter
+ * we want to blame this task for their latency too */
+ if (list_empty(&waiter->list)) {
+ /* waiter is not part of wait_list, so process it here */
+ trace_semlat(ctx_trace, cpudata, data, 1);
+ first = 0;
+ } else {
+ first = 1;
+ }
+ list_for_each_entry(waiter, &sem->wait_list, list) {
+ data = &map_pid_to_latency[waiter->task->pid];
+ if (data->cpu != -1)
+ trace_semlat(ctx_trace, cpudata, data, first);
+ first = 0;
+ }
+}
+
+static atomic_t unique_latency_id = ATOMIC_INIT(0);
+
+static void probe_rwsem_lock_failed(const struct rw_semaphore *sem)
+{
+ const struct rwsem_waiter *waiter;
+ struct semlat_data *data;
+ int cpu = raw_smp_processor_id();
+
+ if (!tracer_enabled)
+ return;
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+ data = &map_pid_to_latency[waiter->task->pid];
+ data->semaphore = sem;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ data->name = sem->dep_map.name;
+#else
+ data->name = NULL;
+#endif
+
+ data->latency_id = atomic_inc_return(&unique_latency_id);
+ data->kind_read = waiter->flags == RWSEM_WAITING_FOR_READ;
+ data->t = ftrace_now(cpu);
+ data->cpu = cpu;
+}
+
+static void probe_rwsem_lock_ok(const struct task_struct *tsk)
+{
+ struct semlat_data *data = &map_pid_to_latency[tsk->pid];
+ struct trace_array_cpu *cpudata;
+ int cpu;
+ cpu = raw_smp_processor_id();
+ cpudata = ctx_trace->data[cpu];
+
+ if (!tracer_enabled)
+ return;
+ trace_semlat(ctx_trace, cpudata, data, 1);
+ /* invalidate entry */
+ data->cpu = -1;
+}
+
+static void start_semlat_trace(struct trace_array *tr)
+{
+ int ret;
+ ret = register_trace_rwsem_wake(probe_rwsem_wake);
+ WARN_ON(ret);
+
+ ret = register_trace_rwsem_lock_failed(probe_rwsem_lock_failed);
+ WARN_ON(ret);
+
+ ret = register_trace_rwsem_lock_ok(probe_rwsem_lock_ok);
+ WARN_ON(ret);
+ printk(KERN_INFO "registered semlat\n");
+ tracer_enabled = 1;
+}
+
+static void stop_semlat_trace(struct trace_array *tr)
+{
+ tracer_enabled = 0;
+ unregister_trace_rwsem_wake(probe_rwsem_wake);
+ unregister_trace_rwsem_lock_failed(probe_rwsem_lock_failed);
+ printk(KERN_INFO "unregistered semlat\n");
+}
+
+static void semlat_trace_init(struct trace_array *tr)
+{
+ unsigned int i;
+ int cpu;
+
+ ctx_trace = tr;
+
+ atomic_set(&unique_latency_id, 0);
+ for (i = 0; i <= PID_MAX_DEFAULT; i++)
+ map_pid_to_latency[i].cpu = -1;
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+
+ if (tr->ctrl)
+ start_semlat_trace(tr);
+}
+
+static void semlat_trace_reset(struct trace_array *tr)
+{
+
+ if (tr->ctrl)
+ stop_semlat_trace(tr);
+}
+
+static void semlat_trace_ctrl_update(struct trace_array *tr)
+{
+ /* When starting a new trace, reset the buffers */
+ if (tr->ctrl)
+ start_semlat_trace(tr);
+ else
+ stop_semlat_trace(tr);
+}
+
+static struct tracer semlat_trace __read_mostly =
+{
+ .name = "semlat",
+ .init = semlat_trace_init,
+ .reset = semlat_trace_reset,
+ .ctrl_update = semlat_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+/* .selftest = trace_selftest_startup_semlat,*/
+#endif
+};
+
+__init static int init_semlat_trace(void)
+{
+ return register_tracer(&semlat_trace);
+}
+device_initcall(init_semlat_trace);
--
1.5.6.5
next prev parent reply other threads:[~2008-10-12 13:13 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-10-12 13:11 [PATCH 0/4] ftrace: add userspace stacktrace support and semaphore-latency tracer Török Edwin
2008-10-12 13:12 ` Török Edwin
2008-10-12 13:12 ` [PATCH 1/4] Add support for userspace stacktraces in tracing/iter_ctrl Török Edwin
2008-10-12 13:31 ` Frédéric Weisbecker
2008-10-12 13:53 ` Török Edwin
2008-10-13 8:02 ` Frédéric Weisbecker
2008-10-26 4:05 ` Frédéric Weisbecker
2008-10-26 7:03 ` Török Edwin
2008-10-26 15:06 ` Frédéric Weisbecker
2008-10-26 13:15 ` Frank Ch. Eigler
2008-10-26 13:29 ` Peter Zijlstra
2008-10-26 13:38 ` Török Edwin
2008-10-26 13:49 ` Frank Ch. Eigler
2008-10-27 16:03 ` Ingo Molnar
2008-10-27 16:16 ` Török Edwin
2008-10-12 13:12 ` [PATCH 2/4] Identify which executable object the userspace address belongs to Török Edwin
2008-10-12 13:12 ` [PATCH 3/4] add tracepoints in rwsem Török Edwin
2008-10-12 13:12 ` Török Edwin [this message]
2008-10-12 19:13 ` [PATCH 4/4] Implement semaphore latency tracer Peter Zijlstra
2008-10-12 20:10 ` Török Edwin
2008-10-22 15:28 ` Ingo Molnar
2008-10-22 15:41 ` Török Edwin
2008-10-22 15:48 ` Ingo Molnar
2008-10-22 17:22 ` Peter Zijlstra
2008-10-22 17:25 ` Török Edwin
2008-10-12 18:25 ` [PATCH 0/4] ftrace: add userspace stacktrace support and semaphore-latency tracer Steven Rostedt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1223817124-27239-6-git-send-email-edwintorok@gmail.com \
--to=edwintorok@gmail.com \
--cc=a.p.zijlstra@chello.nl \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=sandmann@daimi.au.dk \
--cc=srostedt@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.