public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice
@ 2013-03-02  2:48 Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 1/7] tracing: Get trace_events kernel command line working again Steven Rostedt
                   ` (7 more replies)
  0 siblings, 8 replies; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  2:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran

This is some more updates coming for v3.10.

One, I had to rebase from my last patch set because it broke
kgdb tracing as well as the new snapshot feature. The end of this
email contains the difference between my last patch set and what I
pushed to next in my rebase.

The first patch contains a fix to the kernel command line setting
of trace_events, as the multi buffers change broke it.

Watching Ezequiel Garcia presentation at ELC, he pointed out the waste
in the kernel from subsystems abusing kmalloc when kmem_cache_alloc()
would be better. The trace system was one of the problem areas.
By converting two common structures to slab caches, I was able to
save 36K of memory!

I also noticed that the field and event names in the format files
and filtering logic was being strdup'd from strings that happen to
be constant. I originally did this in case of modules, but then,
if a module adds an event, it must also remove it before unloading,
which would destroy the reference to the string.

By not doing the strdup() and just point to the original string
I was able to save over a 100K of memory!!! This also makes each
TRACE_EVENT() less expensive.

I finally got around to fixing a long standing bug in the splice
logic. That is, it never blocked when there was no data and required
the caller to poll. Now with irq_work(), splice and reads from
trace_pipe_raw() can block and wait for data in the buffer before
returning.

Also, since we have multiple buffers, and instead of waking up
all waiters on all buffers for data in a single buffer, I moved the
wake up logic into the ring buffer code itself. Now all users of the
ring buffer can block until data is present.

Enjoy,

-- Steve



Steven Rostedt (5):
      tracing: Get trace_events kernel command line working again
      tracing: Use kmem_cache_alloc instead of kmalloc in trace_events.c
      tracing: Use direct field, type and system names
      tracing: Fix polling on trace_pipe_raw
      tracing: Fix read blocking on trace_pipe_raw

Steven Rostedt (Red Hat) (2):
      tracing: Do not block on splice if either file or splice NONBLOCK flag is set
      tracing/ring-buffer: Move poll wake ups into ring buffer code

----
 include/linux/ring_buffer.h |    6 ++
 kernel/trace/ring_buffer.c  |  146 +++++++++++++++++++++++++++++++++
 kernel/trace/trace.c        |  171 +++++++++++++++++----------------------
 kernel/trace/trace.h        |    4 +-
 kernel/trace/trace_events.c |  188 ++++++++++++++++++++++++++++++++++++-------
 5 files changed, 386 insertions(+), 129 deletions(-)

[ rebase changes from last for-next patch set ]

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index af7be82..b36befa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4133,14 +4133,30 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
 #ifdef CONFIG_TRACER_SNAPSHOT
 static int tracing_snapshot_open(struct inode *inode, struct file *file)
 {
+	struct trace_cpu *tc = inode->i_private;
 	struct trace_iterator *iter;
+	struct seq_file *m;
 	int ret = 0;
 
 	if (file->f_mode & FMODE_READ) {
 		iter = __tracing_open(inode, file, true);
 		if (IS_ERR(iter))
 			ret = PTR_ERR(iter);
+	} else {
+		/* Writes still need the seq_file to hold the private data */
+		m = kzalloc(sizeof(*m), GFP_KERNEL);
+		if (!m)
+			return -ENOMEM;
+		iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+		if (!iter) {
+			kfree(m);
+			return -ENOMEM;
+		}
+		iter->tr = tc->tr;
+		m->private = iter;
+		file->private_data = m;
 	}
+
 	return ret;
 }
 
@@ -4148,7 +4164,9 @@ static ssize_t
 tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 {
-	struct trace_array *tr = filp->private_data;
+	struct seq_file *m = filp->private_data;
+	struct trace_iterator *iter = m->private;
+	struct trace_array *tr = iter->tr;
 	unsigned long val;
 	int ret;
 
@@ -4209,6 +4227,22 @@ out:
 	mutex_unlock(&trace_types_lock);
 	return ret;
 }
+
+static int tracing_snapshot_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	if (file->f_mode & FMODE_READ)
+		return tracing_release(inode, file);
+
+	/* If write only, the seq_file is just a stub */
+	if (m)
+		kfree(m->private);
+	kfree(m);
+
+	return 0;
+}
+
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
 
@@ -4273,7 +4307,7 @@ static const struct file_operations snapshot_fops = {
 	.read		= seq_read,
 	.write		= tracing_snapshot_write,
 	.llseek		= tracing_seek,
-	.release	= tracing_release,
+	.release	= tracing_snapshot_release,
 };
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
@@ -5284,7 +5318,7 @@ static __init int tracer_init_debugfs(void)
 
 #ifdef CONFIG_TRACER_SNAPSHOT
 	trace_create_file("snapshot", 0644, d_tracer,
-			  (void *) RING_BUFFER_ALL_CPUS, &snapshot_fops);
+			  (void *)&global_trace.trace_cpu, &snapshot_fops);
 #endif
 
 	create_trace_instances(d_tracer);
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index cc1dbdc..349f694 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
 	trace_init_global_iter(&iter);
 
 	for_each_tracing_cpu(cpu) {
-		atomic_inc(&iter.tr->data[cpu]->disabled);
+		atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
 	}
 
 	old_userobj = trace_flags;
@@ -83,7 +83,7 @@ out:
 	trace_flags = old_userobj;
 
 	for_each_tracing_cpu(cpu) {
-		atomic_dec(&iter.tr->data[cpu]->disabled);
+		atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
 	}
 
 	for_each_tracing_cpu(cpu)

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [for-next][PATCH 1/7] tracing: Get trace_events kernel command line working again
  2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
@ 2013-03-02  2:48 ` Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 2/7] tracing: Use kmem_cache_alloc instead of kmalloc in trace_events.c Steven Rostedt
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  2:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran

[-- Attachment #1: 0001-tracing-Get-trace_events-kernel-command-line-working.patch --]
[-- Type: text/plain, Size: 6924 bytes --]

From: Steven Rostedt <srostedt@redhat.com>

With the new descriptors used to allow multiple buffers in the
tracing directory added, the kernel command line parameter
trace_events=... no longer works. This is because the top level
(global) trace array now has a list of descriptors associated
with the events and the files in the debugfs directory. But in
early bootup, when the command line is processed and the events
enabled, the trace array list of events has not been set up yet.

Without the list of events in the trace array, the setting of
events to record will fail because it would not match any events.

The solution is to set up the top level array in two stages.
The first is to just add the ftrace file descriptors that just point
to the events. This will allow events to be enabled and start tracing.
The second stage is called after the filesystem is set up, and this
stage will create the debugfs event files and directories associated
with the trace array events.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c |  143 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 136 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 06d6bc2..21fe83b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1473,6 +1473,28 @@ __trace_add_new_event(struct ftrace_event_call *call,
 	return event_create_dir(tr->event_dir, file, id, enable, filter, format);
 }
 
+/*
+ * Just create a decriptor for early init. A descriptor is required
+ * for enabling events at boot. We want to enable events before
+ * the filesystem is initialized.
+ */
+static __init int
+__trace_early_add_new_event(struct ftrace_event_call *call,
+			    struct trace_array *tr)
+{
+	struct ftrace_event_file *file;
+
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	file->event_call = call;
+	file->tr = tr;
+	list_add(&file->list, &tr->events);
+
+	return 0;
+}
+
 struct ftrace_module_file_ops;
 static void __add_event_to_tracers(struct ftrace_event_call *call,
 				   struct ftrace_module_file_ops *file_ops);
@@ -1709,6 +1731,56 @@ __trace_add_event_dirs(struct trace_array *tr)
 	}
 }
 
+/*
+ * The top level array has already had its ftrace_event_file
+ * descriptors created in order to allow for early events to
+ * be recorded. This function is called after the debugfs has been
+ * initialized, and we now have to create the files associated
+ * to the events.
+ */
+static __init void
+__trace_early_add_event_dirs(struct trace_array *tr)
+{
+	struct ftrace_event_file *file;
+	int ret;
+
+
+	list_for_each_entry(file, &tr->events, list) {
+		ret = event_create_dir(tr->event_dir, file,
+				       &ftrace_event_id_fops,
+				       &ftrace_enable_fops,
+				       &ftrace_event_filter_fops,
+				       &ftrace_event_format_fops);
+		if (ret < 0)
+			pr_warning("Could not create directory for event %s\n",
+				   file->event_call->name);
+	}
+}
+
+/*
+ * For early boot up, the top trace array requires to have
+ * a list of events that can be enabled. This must be done before
+ * the filesystem is set up in order to allow events to be traced
+ * early.
+ */
+static __init void
+__trace_early_add_events(struct trace_array *tr)
+{
+	struct ftrace_event_call *call;
+	int ret;
+
+	list_for_each_entry(call, &ftrace_events, list) {
+		/* Early boot up should not have any modules loaded */
+		if (WARN_ON_ONCE(call->mod))
+			continue;
+
+		ret = __trace_early_add_new_event(call, tr);
+		if (ret < 0)
+			pr_warning("Could not create early event %s\n",
+				   call->name);
+	}
+}
+
 /* Remove the event directory structure for a trace directory. */
 static void
 __trace_remove_event_dirs(struct trace_array *tr)
@@ -1763,25 +1835,23 @@ static __init int setup_trace_event(char *str)
 }
 __setup("trace_event=", setup_trace_event);
 
-int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
+/* Expects to have event_mutex held when called */
+static int
+create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 {
 	struct dentry *d_events;
 	struct dentry *entry;
 
-	mutex_lock(&event_mutex);
-
 	entry = debugfs_create_file("set_event", 0644, parent,
 				    tr, &ftrace_set_event_fops);
 	if (!entry) {
 		pr_warning("Could not create debugfs 'set_event' entry\n");
-		mutex_unlock(&event_mutex);
 		return -ENOMEM;
 	}
 
 	d_events = debugfs_create_dir("events", parent);
 	if (!d_events) {
 		pr_warning("Could not create debugfs 'events' directory\n");
-		mutex_unlock(&event_mutex);
 		return -ENOMEM;
 	}
 
@@ -1798,13 +1868,64 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 			  tr, &ftrace_tr_enable_fops);
 
 	tr->event_dir = d_events;
+
+	return 0;
+}
+
+/**
+ * event_trace_add_tracer - add a instance of a trace_array to events
+ * @parent: The parent dentry to place the files/directories for events in
+ * @tr: The trace array associated with these events
+ *
+ * When a new instance is created, it needs to set up its events
+ * directory, as well as other files associated with events. It also
+ * creates the event hierachry in the @parent/events directory.
+ *
+ * Returns 0 on success.
+ */
+int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+	int ret;
+
+	mutex_lock(&event_mutex);
+
+	ret = create_event_toplevel_files(parent, tr);
+	if (ret)
+		goto out_unlock;
+
 	down_write(&trace_event_mutex);
 	__trace_add_event_dirs(tr);
 	up_write(&trace_event_mutex);
 
+ out_unlock:
 	mutex_unlock(&event_mutex);
 
-	return 0;
+	return ret;
+}
+
+/*
+ * The top trace array already had its file descriptors created.
+ * Now the files themselves need to be created.
+ */
+static __init int
+early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+	int ret;
+
+	mutex_lock(&event_mutex);
+
+	ret = create_event_toplevel_files(parent, tr);
+	if (ret)
+		goto out_unlock;
+
+	down_write(&trace_event_mutex);
+	__trace_early_add_event_dirs(tr);
+	up_write(&trace_event_mutex);
+
+ out_unlock:
+	mutex_unlock(&event_mutex);
+
+	return ret;
 }
 
 int event_trace_del_tracer(struct trace_array *tr)
@@ -1842,6 +1963,14 @@ static __init int event_trace_enable(void)
 			list_add(&call->list, &ftrace_events);
 	}
 
+	/*
+	 * We need the top trace array to have a working set of trace
+	 * points at early init, before the debug files and directories
+	 * are created. Create the file entries now, and attach them
+	 * to the actual file dentries later.
+	 */
+	__trace_early_add_events(tr);
+
 	while (true) {
 		token = strsep(&buf, ",");
 
@@ -1882,7 +2011,7 @@ static __init int event_trace_init(void)
 	if (trace_define_common_fields())
 		pr_warning("tracing: Failed to allocate common fields");
 
-	ret = event_trace_add_tracer(d_tracer, tr);
+	ret = early_event_add_tracer(d_tracer, tr);
 	if (ret)
 		return ret;
 
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [for-next][PATCH 2/7] tracing: Use kmem_cache_alloc instead of kmalloc in trace_events.c
  2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 1/7] tracing: Get trace_events kernel command line working again Steven Rostedt
@ 2013-03-02  2:48 ` Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 3/7] tracing: Use direct field, type and system names Steven Rostedt
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  2:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran,
	Ezequiel Garcia

[-- Attachment #1: 0002-tracing-Use-kmem_cache_alloc-instead-of-kmalloc-in-t.patch --]
[-- Type: text/plain, Size: 4667 bytes --]

From: Steven Rostedt <srostedt@redhat.com>

The event structures used by the trace events are mostly persistent,
but they are also allocated by kmalloc, which is not the best at
allocating space for what is used. By converting these kmallocs
into kmem_cache_allocs, we can save over 50K of space that is
permanently allocated.

After boot we have:

 slab name          active allocated size
 ---------          ------ --------- ----
ftrace_event_file    979   1005     56   67    1
ftrace_event_field   2301   2310     48   77    1

The ftrace_event_file has at boot up 979 active objects out of
1005 allocated in the slabs. Each object is 56 bytes. In a normal
kmalloc, that would allocate 64 bytes for each object.

 1005 - 979  = 26 objects not used
 26 * 56 = 1456 bytes wasted

But if we used kmalloc:

 64 - 56 = 8 bytes unused per allocation
 8 * 979 = 7832 bytes wasted

 7832 - 1456 = 6376 bytes in savings

Doing the same for ftrace_event_field where there's 2301 objects
allocated in a slab that can hold 2310 with 48 bytes each we have:

 2310 - 2301 = 9 objects not used
 9 * 48 = 432 bytes wasted

A kmalloc would also use 64 bytes per object:

 64 - 48 = 16 bytes unused per allocation
 16 * 2301 = 36816 bytes wasted!

 36816 - 432 = 36384 bytes in savings

This change gives us a total of 42760 bytes in savings. At least
on my machine, but as there's a lot of these persistent objects
for all configurations that use trace points, this is a net win.

Thanks to Ezequiel Garcia for his trace_analyze presentation which
pointed out the wasted space in my code.

Cc: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c |   27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 21fe83b..5d8845d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -36,6 +36,11 @@ EXPORT_SYMBOL_GPL(event_storage);
 LIST_HEAD(ftrace_events);
 LIST_HEAD(ftrace_common_fields);
 
+#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
+
+static struct kmem_cache *field_cachep;
+static struct kmem_cache *file_cachep;
+
 /* Double loops, do not use break, only goto's work */
 #define do_for_each_event_file(tr, file)			\
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\
@@ -63,7 +68,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
 {
 	struct ftrace_event_field *field;
 
-	field = kzalloc(sizeof(*field), GFP_KERNEL);
+	field = kmem_cache_alloc(field_cachep, GFP_TRACE);
 	if (!field)
 		goto err;
 
@@ -91,7 +96,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
 err:
 	if (field)
 		kfree(field->name);
-	kfree(field);
+	kmem_cache_free(field_cachep, field);
 
 	return -ENOMEM;
 }
@@ -143,7 +148,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
 		list_del(&field->link);
 		kfree(field->type);
 		kfree(field->name);
-		kfree(field);
+		kmem_cache_free(field_cachep, field);
 	}
 }
 
@@ -1383,7 +1388,7 @@ static void remove_event_from_tracers(struct ftrace_event_call *call)
 		list_del(&file->list);
 		debugfs_remove_recursive(file->dir);
 		remove_subsystem(file->system);
-		kfree(file);
+		kmem_cache_free(file_cachep, file);
 
 		/*
 		 * The do_for_each_event_file_safe() is
@@ -1462,7 +1467,7 @@ __trace_add_new_event(struct ftrace_event_call *call,
 {
 	struct ftrace_event_file *file;
 
-	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
 	if (!file)
 		return -ENOMEM;
 
@@ -1484,7 +1489,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
 {
 	struct ftrace_event_file *file;
 
-	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
 	if (!file)
 		return -ENOMEM;
 
@@ -1791,7 +1796,7 @@ __trace_remove_event_dirs(struct trace_array *tr)
 		list_del(&file->list);
 		debugfs_remove_recursive(file->dir);
 		remove_subsystem(file->system);
-		kfree(file);
+		kmem_cache_free(file_cachep, file);
 	}
 }
 
@@ -1947,6 +1952,13 @@ int event_trace_del_tracer(struct trace_array *tr)
 	return 0;
 }
 
+static __init int event_trace_memsetup(void)
+{
+	field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
+	file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+	return 0;
+}
+
 static __init int event_trace_enable(void)
 {
 	struct trace_array *tr = top_trace_array();
@@ -2021,6 +2033,7 @@ static __init int event_trace_init(void)
 
 	return 0;
 }
+early_initcall(event_trace_memsetup);
 core_initcall(event_trace_enable);
 fs_initcall(event_trace_init);
 
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [for-next][PATCH 3/7] tracing: Use direct field, type and system names
  2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 1/7] tracing: Get trace_events kernel command line working again Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 2/7] tracing: Use kmem_cache_alloc instead of kmalloc in trace_events.c Steven Rostedt
@ 2013-03-02  2:48 ` Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 4/7] tracing: Do not block on splice if either file or splice NONBLOCK flag is set Steven Rostedt
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  2:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran

[-- Attachment #1: 0003-tracing-Use-direct-field-type-and-system-names.patch --]
[-- Type: text/plain, Size: 2629 bytes --]

From: Steven Rostedt <srostedt@redhat.com>

The names used to display the field and type in the event format
files are copied, as well as the system name that is displayed.

All these names are created by constant values passed in.
If one of theses values were to be removed by a module, the module
would also be required to remove any event it created.

By using the strings directly, we can save over 100K of memory.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        |    4 ++--
 kernel/trace/trace_events.c |   20 +++-----------------
 2 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 592e8f2..c597523 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -883,8 +883,8 @@ enum {
 
 struct ftrace_event_field {
 	struct list_head	link;
-	char			*name;
-	char			*type;
+	const char		*name;
+	const char		*type;
 	int			filter_type;
 	int			offset;
 	int			size;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5d8845d..63b4bdf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -72,13 +72,8 @@ static int __trace_define_field(struct list_head *head, const char *type,
 	if (!field)
 		goto err;
 
-	field->name = kstrdup(name, GFP_KERNEL);
-	if (!field->name)
-		goto err;
-
-	field->type = kstrdup(type, GFP_KERNEL);
-	if (!field->type)
-		goto err;
+	field->name = name;
+	field->type = type;
 
 	if (filter_type == FILTER_OTHER)
 		field->filter_type = filter_assign_type(type);
@@ -94,8 +89,6 @@ static int __trace_define_field(struct list_head *head, const char *type,
 	return 0;
 
 err:
-	if (field)
-		kfree(field->name);
 	kmem_cache_free(field_cachep, field);
 
 	return -ENOMEM;
@@ -146,8 +139,6 @@ void trace_destroy_fields(struct ftrace_event_call *call)
 	head = trace_get_fields(call);
 	list_for_each_entry_safe(field, next, head, link) {
 		list_del(&field->link);
-		kfree(field->type);
-		kfree(field->name);
 		kmem_cache_free(field_cachep, field);
 	}
 }
@@ -286,7 +277,6 @@ static void __put_system(struct event_subsystem *system)
 		kfree(filter->filter_string);
 		kfree(filter);
 	}
-	kfree(system->name);
 	kfree(system);
 }
 
@@ -1202,10 +1192,7 @@ create_new_subsystem(const char *name)
 		return NULL;
 
 	system->ref_count = 1;
-	system->name = kstrdup(name, GFP_KERNEL);
-
-	if (!system->name)
-		goto out_free;
+	system->name = name;
 
 	system->filter = NULL;
 
@@ -1218,7 +1205,6 @@ create_new_subsystem(const char *name)
 	return system;
 
  out_free:
-	kfree(system->name);
 	kfree(system);
 	return NULL;
 }
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [for-next][PATCH 4/7] tracing: Do not block on splice if either file or splice NONBLOCK flag is set
  2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
                   ` (2 preceding siblings ...)
  2013-03-02  2:48 ` [for-next][PATCH 3/7] tracing: Use direct field, type and system names Steven Rostedt
@ 2013-03-02  2:48 ` Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 5/7] tracing: Fix polling on trace_pipe_raw Steven Rostedt
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  2:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran

[-- Attachment #1: 0004-tracing-Do-not-block-on-splice-if-either-file-or-spl.patch --]
[-- Type: text/plain, Size: 797 bytes --]

From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>

Currently only the splice NONBLOCK flag is checked to determine if
the splice read should block or not. But the file descriptor NONBLOCK
flag also needs to be checked.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b36befa..00e5e6e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4543,7 +4543,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 
 	/* did we read anything? */
 	if (!spd.nr_pages) {
-		if (flags & SPLICE_F_NONBLOCK)
+		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
 			ret = -EAGAIN;
 		else
 			ret = 0;
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [for-next][PATCH 5/7] tracing: Fix polling on trace_pipe_raw
  2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
                   ` (3 preceding siblings ...)
  2013-03-02  2:48 ` [for-next][PATCH 4/7] tracing: Do not block on splice if either file or splice NONBLOCK flag is set Steven Rostedt
@ 2013-03-02  2:48 ` Steven Rostedt
  2013-03-12 15:52   ` Mauro Carvalho Chehab
  2013-03-02  2:48 ` [for-next][PATCH 6/7] tracing: Fix read blocking " Steven Rostedt
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  2:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran,
	Mauro Carvalho Chehab

[-- Attachment #1: 0005-tracing-Fix-polling-on-trace_pipe_raw.patch --]
[-- Type: text/plain, Size: 6557 bytes --]

From: Steven Rostedt <srostedt@redhat.com>

The trace_pipe_raw never implemented polling and this was casing
issues for several utilities. This is now implemented.

Blocked reads still are on the TODO list.

Reported-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c |   78 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 27 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 00e5e6e..b958e9d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3503,10 +3503,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 }
 
 static unsigned int
-tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
 {
-	struct trace_iterator *iter = filp->private_data;
-
 	if (trace_flags & TRACE_ITER_BLOCK) {
 		/*
 		 * Always select as readable when in blocking mode
@@ -3515,6 +3513,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 	} else {
 		if (!trace_empty(iter))
 			return POLLIN | POLLRDNORM;
+		trace_wakeup_needed = true;
 		poll_wait(filp, &trace_wait, poll_table);
 		if (!trace_empty(iter))
 			return POLLIN | POLLRDNORM;
@@ -3523,6 +3522,14 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 	}
 }
 
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+	struct trace_iterator *iter = filp->private_data;
+
+	return trace_poll(iter, filp, poll_table);
+}
+
 /*
  * This is a make-shift waitqueue.
  * A tracer might use this callback on some rare cases:
@@ -4312,9 +4319,8 @@ static const struct file_operations snapshot_fops = {
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
 struct ftrace_buffer_info {
-	struct trace_array	*tr;
+	struct trace_iterator	iter;
 	void			*spare;
-	int			cpu;
 	unsigned int		read;
 };
 
@@ -4331,22 +4337,32 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 	if (!info)
 		return -ENOMEM;
 
-	info->tr	= tr;
-	info->cpu	= tc->cpu;
-	info->spare	= NULL;
+	info->iter.tr		= tr;
+	info->iter.cpu_file	= tc->cpu;
+	info->spare		= NULL;
 	/* Force reading ring buffer for first read */
-	info->read	= (unsigned int)-1;
+	info->read		= (unsigned int)-1;
 
 	filp->private_data = info;
 
 	return nonseekable_open(inode, filp);
 }
 
+static unsigned int
+tracing_buffers_poll(struct file *filp, poll_table *poll_table)
+{
+	struct ftrace_buffer_info *info = filp->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	return trace_poll(iter, filp, poll_table);
+}
+
 static ssize_t
 tracing_buffers_read(struct file *filp, char __user *ubuf,
 		     size_t count, loff_t *ppos)
 {
 	struct ftrace_buffer_info *info = filp->private_data;
+	struct trace_iterator *iter = &info->iter;
 	ssize_t ret;
 	size_t size;
 
@@ -4354,7 +4370,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return 0;
 
 	if (!info->spare)
-		info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
+		info->spare = ring_buffer_alloc_read_page(iter->tr->buffer, iter->cpu_file);
 	if (!info->spare)
 		return -ENOMEM;
 
@@ -4362,12 +4378,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (info->read < PAGE_SIZE)
 		goto read;
 
-	trace_access_lock(info->cpu);
-	ret = ring_buffer_read_page(info->tr->buffer,
+	trace_access_lock(iter->cpu_file);
+	ret = ring_buffer_read_page(iter->tr->buffer,
 				    &info->spare,
 				    count,
-				    info->cpu, 0);
-	trace_access_unlock(info->cpu);
+				    iter->cpu_file, 0);
+	trace_access_unlock(iter->cpu_file);
 	if (ret < 0)
 		return 0;
 
@@ -4392,9 +4408,10 @@ read:
 static int tracing_buffers_release(struct inode *inode, struct file *file)
 {
 	struct ftrace_buffer_info *info = file->private_data;
+	struct trace_iterator *iter = &info->iter;
 
 	if (info->spare)
-		ring_buffer_free_read_page(info->tr->buffer, info->spare);
+		ring_buffer_free_read_page(iter->tr->buffer, info->spare);
 	kfree(info);
 
 	return 0;
@@ -4461,6 +4478,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			    unsigned int flags)
 {
 	struct ftrace_buffer_info *info = file->private_data;
+	struct trace_iterator *iter = &info->iter;
 	struct partial_page partial_def[PIPE_DEF_BUFFERS];
 	struct page *pages_def[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
@@ -4491,8 +4509,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 	}
 
-	trace_access_lock(info->cpu);
-	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+ again:
+	trace_access_lock(iter->cpu_file);
+	entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file);
 
 	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
@@ -4503,15 +4522,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			break;
 
 		ref->ref = 1;
-		ref->buffer = info->tr->buffer;
-		ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
+		ref->buffer = iter->tr->buffer;
+		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
 		if (!ref->page) {
 			kfree(ref);
 			break;
 		}
 
 		r = ring_buffer_read_page(ref->buffer, &ref->page,
-					  len, info->cpu, 1);
+					  len, iter->cpu_file, 1);
 		if (r < 0) {
 			ring_buffer_free_read_page(ref->buffer, ref->page);
 			kfree(ref);
@@ -4535,20 +4554,24 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.nr_pages++;
 		*ppos += PAGE_SIZE;
 
-		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+		entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file);
 	}
 
-	trace_access_unlock(info->cpu);
+	trace_access_unlock(iter->cpu_file);
 	spd.nr_pages = i;
 
 	/* did we read anything? */
 	if (!spd.nr_pages) {
-		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
+		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
 			ret = -EAGAIN;
-		else
-			ret = 0;
-		/* TODO: block */
-		goto out;
+			goto out;
+		}
+		default_wait_pipe(iter);
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			goto out;
+		}
+		goto again;
 	}
 
 	ret = splice_to_pipe(pipe, &spd);
@@ -4560,6 +4583,7 @@ out:
 static const struct file_operations tracing_buffers_fops = {
 	.open		= tracing_buffers_open,
 	.read		= tracing_buffers_read,
+	.poll		= tracing_buffers_poll,
 	.release	= tracing_buffers_release,
 	.splice_read	= tracing_buffers_splice_read,
 	.llseek		= no_llseek,
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [for-next][PATCH 6/7] tracing: Fix read blocking on trace_pipe_raw
  2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
                   ` (4 preceding siblings ...)
  2013-03-02  2:48 ` [for-next][PATCH 5/7] tracing: Fix polling on trace_pipe_raw Steven Rostedt
@ 2013-03-02  2:48 ` Steven Rostedt
  2013-03-02  2:48 ` [for-next][PATCH 7/7] tracing/ring-buffer: Move poll wake ups into ring buffer code Steven Rostedt
  2013-03-02  3:06 ` [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
  7 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  2:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran

[-- Attachment #1: 0006-tracing-Fix-read-blocking-on-trace_pipe_raw.patch --]
[-- Type: text/plain, Size: 1910 bytes --]

From: Steven Rostedt <srostedt@redhat.com>

If the ring buffer is empty, a read to trace_pipe_raw wont block.
The tracing code has the infrastructure to wake up waiting readers,
but the trace_pipe_raw doesn't take advantage of that.

When a read is done to trace_pipe_raw without the O_NONBLOCK flag
set, have the read block until there's data in the requested buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c |   18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b958e9d..32b003c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4339,6 +4339,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 
 	info->iter.tr		= tr;
 	info->iter.cpu_file	= tc->cpu;
+	info->iter.trace	= tr->current_trace;
 	info->spare		= NULL;
 	/* Force reading ring buffer for first read */
 	info->read		= (unsigned int)-1;
@@ -4378,18 +4379,29 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (info->read < PAGE_SIZE)
 		goto read;
 
+ again:
 	trace_access_lock(iter->cpu_file);
 	ret = ring_buffer_read_page(iter->tr->buffer,
 				    &info->spare,
 				    count,
 				    iter->cpu_file, 0);
 	trace_access_unlock(iter->cpu_file);
-	if (ret < 0)
+
+	if (ret < 0) {
+		if (trace_empty(iter)) {
+			if ((filp->f_flags & O_NONBLOCK))
+				return -EAGAIN;
+			iter->trace->wait_pipe(iter);
+			if (signal_pending(current))
+				return -EINTR;
+			goto again;
+		}
 		return 0;
+	}
 
 	info->read = 0;
 
-read:
+ read:
 	size = PAGE_SIZE - info->read;
 	if (size > count)
 		size = count;
@@ -4566,7 +4578,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			ret = -EAGAIN;
 			goto out;
 		}
-		default_wait_pipe(iter);
+		iter->trace->wait_pipe(iter);
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [for-next][PATCH 7/7] tracing/ring-buffer: Move poll wake ups into ring buffer code
  2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
                   ` (5 preceding siblings ...)
  2013-03-02  2:48 ` [for-next][PATCH 6/7] tracing: Fix read blocking " Steven Rostedt
@ 2013-03-02  2:48 ` Steven Rostedt
  2013-03-02  3:06 ` [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
  7 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  2:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran

[-- Attachment #1: 0007-tracing-ring-buffer-Move-poll-wake-ups-into-ring-buf.patch --]
[-- Type: text/plain, Size: 12799 bytes --]

From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>

Move the logic to wake up on ring buffer data into the ring buffer
code itself. This simplifies the tracing code a lot and also has the
added benefit that waiters on one of the instance buffers can be woken
only when data is added to that instance instead of data added to
any instance.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |    6 ++
 kernel/trace/ring_buffer.c  |  146 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c        |   83 ++++--------------------
 3 files changed, 164 insertions(+), 71 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 1342e69..d69cf63 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -4,6 +4,7 @@
 #include <linux/kmemcheck.h>
 #include <linux/mm.h>
 #include <linux/seq_file.h>
+#include <linux/poll.h>
 
 struct ring_buffer;
 struct ring_buffer_iter;
@@ -96,6 +97,11 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
 	__ring_buffer_alloc((size), (flags), &__key);	\
 })
 
+void ring_buffer_wait(struct ring_buffer *buffer, int cpu);
+int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+			  struct file *filp, poll_table *poll_table);
+
+
 #define RING_BUFFER_ALL_CPUS -1
 
 void ring_buffer_free(struct ring_buffer *buffer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7244acd..56b6ea3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,6 +8,7 @@
 #include <linux/trace_clock.h>
 #include <linux/trace_seq.h>
 #include <linux/spinlock.h>
+#include <linux/irq_work.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
@@ -442,6 +443,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 	return ret;
 }
 
+struct rb_irq_work {
+	struct irq_work			work;
+	wait_queue_head_t		waiters;
+	bool				waiters_pending;
+};
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
@@ -476,6 +483,8 @@ struct ring_buffer_per_cpu {
 	struct list_head		new_pages; /* new pages to add */
 	struct work_struct		update_pages_work;
 	struct completion		update_done;
+
+	struct rb_irq_work		irq_work;
 };
 
 struct ring_buffer {
@@ -495,6 +504,8 @@ struct ring_buffer {
 	struct notifier_block		cpu_notify;
 #endif
 	u64				(*clock)(void);
+
+	struct rb_irq_work		irq_work;
 };
 
 struct ring_buffer_iter {
@@ -506,6 +517,118 @@ struct ring_buffer_iter {
 	u64				read_stamp;
 };
 
+/*
+ * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
+ *
+ * Schedules a delayed work to wake up any task that is blocked on the
+ * ring buffer waiters queue.
+ */
+static void rb_wake_up_waiters(struct irq_work *work)
+{
+	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+
+	wake_up_all(&rbwork->waiters);
+}
+
+/**
+ * ring_buffer_wait - wait for input to the ring buffer
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ */
+void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	DEFINE_WAIT(wait);
+	struct rb_irq_work *work;
+
+	/*
+	 * Depending on what the caller is waiting for, either any
+	 * data in any cpu buffer, or a specific buffer, put the
+	 * caller on the appropriate wait queue.
+	 */
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		work = &buffer->irq_work;
+	else {
+		cpu_buffer = buffer->buffers[cpu];
+		work = &cpu_buffer->irq_work;
+	}
+
+
+	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+
+	/*
+	 * The events can happen in critical sections where
+	 * checking a work queue can cause deadlocks.
+	 * After adding a task to the queue, this flag is set
+	 * only to notify events to try to wake up the queue
+	 * using irq_work.
+	 *
+	 * We don't clear it even if the buffer is no longer
+	 * empty. The flag only causes the next event to run
+	 * irq_work to do the work queue wake up. The worse
+	 * that can happen if we race with !trace_empty() is that
+	 * an event will cause an irq_work to try to wake up
+	 * an empty queue.
+	 *
+	 * There's no reason to protect this flag either, as
+	 * the work queue and irq_work logic will do the necessary
+	 * synchronization for the wake ups. The only thing
+	 * that is necessary is that the wake up happens after
+	 * a task has been queued. It's OK for spurious wake ups.
+	 */
+	work->waiters_pending = true;
+
+	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
+	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
+		schedule();
+
+	finish_wait(&work->waiters, &wait);
+}
+
+/**
+ * ring_buffer_poll_wait - poll on buffer input
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @filp: the file descriptor
+ * @poll_table: The poll descriptor
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ *
+ * Returns POLLIN | POLLRDNORM if data exists in the buffers,
+ * zero otherwise.
+ */
+int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+			  struct file *filp, poll_table *poll_table)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct rb_irq_work *work;
+
+	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+		return POLLIN | POLLRDNORM;
+
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		work = &buffer->irq_work;
+	else {
+		cpu_buffer = buffer->buffers[cpu];
+		work = &cpu_buffer->irq_work;
+	}
+
+	work->waiters_pending = true;
+	poll_wait(filp, &work->waiters, poll_table);
+
+	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
 /* buffer may be either ring_buffer or ring_buffer_per_cpu */
 #define RB_WARN_ON(b, cond)						\
 	({								\
@@ -1061,6 +1184,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
 	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 	INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
 	init_completion(&cpu_buffer->update_done);
+	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
 
 	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
@@ -1156,6 +1280,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 	buffer->clock = trace_clock_local;
 	buffer->reader_lock_key = key;
 
+	init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
+
 	/* need at least two pages */
 	if (nr_pages < 2)
 		nr_pages = 2;
@@ -2610,6 +2736,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 	rb_end_commit(cpu_buffer);
 }
 
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+	if (buffer->irq_work.waiters_pending) {
+		buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies it's own memory barriers */
+		irq_work_queue(&buffer->irq_work.work);
+	}
+
+	if (cpu_buffer->irq_work.waiters_pending) {
+		cpu_buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies it's own memory barriers */
+		irq_work_queue(&cpu_buffer->irq_work.work);
+	}
+}
+
 /**
  * ring_buffer_unlock_commit - commit a reserved
  * @buffer: The buffer to commit to
@@ -2629,6 +2771,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	rb_wakeups(buffer, cpu_buffer);
+
 	trace_recursive_unlock();
 
 	preempt_enable_notrace();
@@ -2801,6 +2945,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	rb_wakeups(buffer, cpu_buffer);
+
 	ret = 0;
  out:
 	preempt_enable_notrace();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 32b003c..e0cf94c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -19,7 +19,6 @@
 #include <linux/seq_file.h>
 #include <linux/notifier.h>
 #include <linux/irqflags.h>
-#include <linux/irq_work.h>
 #include <linux/debugfs.h>
 #include <linux/pagemap.h>
 #include <linux/hardirq.h>
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
 static DEFINE_PER_CPU(bool, trace_cmdline_save);
 
 /*
- * When a reader is waiting for data, then this variable is
- * set to true.
- */
-static bool trace_wakeup_needed;
-
-static struct irq_work trace_work_wakeup;
-
-/*
  * Kill all tracing for good (never come back).
  * It is initialized to 1 but will turn to zero if the initialization
  * of the tracer is successful. But that is the only place that sets
@@ -334,9 +325,6 @@ static inline void trace_access_lock_init(void)
 
 #endif
 
-/* trace_wait is a waitqueue for tasks blocked on trace_poll */
-static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
-
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
@@ -344,19 +332,6 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
 
 /**
- * trace_wake_up - wake up tasks waiting for trace input
- *
- * Schedules a delayed work to wake up any task that is blocked on the
- * trace_wait queue. These is used with trace_poll for tasks polling the
- * trace.
- */
-static void trace_wake_up(struct irq_work *work)
-{
-	wake_up_all(&trace_wait);
-
-}
-
-/**
  * tracing_on - enable tracing buffers
  *
  * This function enables tracing buffers that may have been
@@ -762,36 +737,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 static void default_wait_pipe(struct trace_iterator *iter)
 {
-	DEFINE_WAIT(wait);
-
-	prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
-
-	/*
-	 * The events can happen in critical sections where
-	 * checking a work queue can cause deadlocks.
-	 * After adding a task to the queue, this flag is set
-	 * only to notify events to try to wake up the queue
-	 * using irq_work.
-	 *
-	 * We don't clear it even if the buffer is no longer
-	 * empty. The flag only causes the next event to run
-	 * irq_work to do the work queue wake up. The worse
-	 * that can happen if we race with !trace_empty() is that
-	 * an event will cause an irq_work to try to wake up
-	 * an empty queue.
-	 *
-	 * There's no reason to protect this flag either, as
-	 * the work queue and irq_work logic will do the necessary
-	 * synchronization for the wake ups. The only thing
-	 * that is necessary is that the wake up happens after
-	 * a task has been queued. It's OK for spurious wake ups.
-	 */
-	trace_wakeup_needed = true;
-
-	if (trace_empty(iter))
-		schedule();
+	/* Iterators are static, they should be filled or empty */
+	if (trace_buffer_iter(iter, iter->cpu_file))
+		return;
 
-	finish_wait(&trace_wait, &wait);
+	ring_buffer_wait(iter->tr->buffer, iter->cpu_file);
 }
 
 /**
@@ -1261,11 +1211,6 @@ void
 __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
 {
 	__this_cpu_write(trace_cmdline_save, true);
-	if (trace_wakeup_needed) {
-		trace_wakeup_needed = false;
-		/* irq_work_queue() supplies it's own memory barriers */
-		irq_work_queue(&trace_work_wakeup);
-	}
 	ring_buffer_unlock_commit(buffer, event);
 }
 
@@ -3505,21 +3450,18 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 static unsigned int
 trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
 {
-	if (trace_flags & TRACE_ITER_BLOCK) {
+	/* Iterators are static, they should be filled or empty */
+	if (trace_buffer_iter(iter, iter->cpu_file))
+		return POLLIN | POLLRDNORM;
+
+	if (trace_flags & TRACE_ITER_BLOCK)
 		/*
 		 * Always select as readable when in blocking mode
 		 */
 		return POLLIN | POLLRDNORM;
-	} else {
-		if (!trace_empty(iter))
-			return POLLIN | POLLRDNORM;
-		trace_wakeup_needed = true;
-		poll_wait(filp, &trace_wait, poll_table);
-		if (!trace_empty(iter))
-			return POLLIN | POLLRDNORM;
-
-		return 0;
-	}
+	else
+		return ring_buffer_poll_wait(iter->tr->buffer, iter->cpu_file,
+					     filp, poll_table);
 }
 
 static unsigned int
@@ -5644,7 +5586,6 @@ __init static int tracer_alloc_buffers(void)
 #endif
 
 	trace_init_cmdlines();
-	init_irq_work(&trace_work_wakeup, trace_wake_up);
 
 	register_tracer(&nop_trace);
 
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice
  2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
                   ` (6 preceding siblings ...)
  2013-03-02  2:48 ` [for-next][PATCH 7/7] tracing/ring-buffer: Move poll wake ups into ring buffer code Steven Rostedt
@ 2013-03-02  3:06 ` Steven Rostedt
  7 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2013-03-02  3:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Frederic Weisbecker,
	Masami Hiramatsu, David Sharp, Vaibhav Nagarnaik, hcochran

On Fri, 2013-03-01 at 21:48 -0500, Steven Rostedt wrote:

> By converting two common structures to slab caches, I was able to
> save 36K of memory!

Correction... 36K was for only one slab conversion. The total was 42K in
savings ;-)

-- Steve



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [for-next][PATCH 5/7] tracing: Fix polling on trace_pipe_raw
  2013-03-02  2:48 ` [for-next][PATCH 5/7] tracing: Fix polling on trace_pipe_raw Steven Rostedt
@ 2013-03-12 15:52   ` Mauro Carvalho Chehab
  0 siblings, 0 replies; 10+ messages in thread
From: Mauro Carvalho Chehab @ 2013-03-12 15:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Frederic Weisbecker, Masami Hiramatsu, David Sharp,
	Vaibhav Nagarnaik, hcochran

Hi Steven,

Em Fri, 01 Mar 2013 21:48:38 -0500
Steven Rostedt <rostedt@goodmis.org> escreveu:

> From: Steven Rostedt <srostedt@redhat.com>
> 
> The trace_pipe_raw never implemented polling and this was casing
> issues for several utilities. This is now implemented.
> 
> Blocked reads still are on the TODO list.

I tested it and it works fine.

> Reported-by: Mauro Carvalho Chehab <mchehab@redhat.com>

Tested-by: Mauro Carvalho Chehab <mchehab@redhat.com>

> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
> ---
>  kernel/trace/trace.c |   78 +++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 51 insertions(+), 27 deletions(-)
> 
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 00e5e6e..b958e9d 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -3503,10 +3503,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
>  }
>  
>  static unsigned int
> -tracing_poll_pipe(struct file *filp, poll_table *poll_table)
> +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
>  {
> -	struct trace_iterator *iter = filp->private_data;
> -
>  	if (trace_flags & TRACE_ITER_BLOCK) {
>  		/*
>  		 * Always select as readable when in blocking mode
> @@ -3515,6 +3513,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
>  	} else {
>  		if (!trace_empty(iter))
>  			return POLLIN | POLLRDNORM;
> +		trace_wakeup_needed = true;
>  		poll_wait(filp, &trace_wait, poll_table);
>  		if (!trace_empty(iter))
>  			return POLLIN | POLLRDNORM;
> @@ -3523,6 +3522,14 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
>  	}
>  }
>  
> +static unsigned int
> +tracing_poll_pipe(struct file *filp, poll_table *poll_table)
> +{
> +	struct trace_iterator *iter = filp->private_data;
> +
> +	return trace_poll(iter, filp, poll_table);
> +}
> +
>  /*
>   * This is a make-shift waitqueue.
>   * A tracer might use this callback on some rare cases:
> @@ -4312,9 +4319,8 @@ static const struct file_operations snapshot_fops = {
>  #endif /* CONFIG_TRACER_SNAPSHOT */
>  
>  struct ftrace_buffer_info {
> -	struct trace_array	*tr;
> +	struct trace_iterator	iter;
>  	void			*spare;
> -	int			cpu;
>  	unsigned int		read;
>  };
>  
> @@ -4331,22 +4337,32 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
>  	if (!info)
>  		return -ENOMEM;
>  
> -	info->tr	= tr;
> -	info->cpu	= tc->cpu;
> -	info->spare	= NULL;
> +	info->iter.tr		= tr;
> +	info->iter.cpu_file	= tc->cpu;
> +	info->spare		= NULL;
>  	/* Force reading ring buffer for first read */
> -	info->read	= (unsigned int)-1;
> +	info->read		= (unsigned int)-1;
>  
>  	filp->private_data = info;
>  
>  	return nonseekable_open(inode, filp);
>  }
>  
> +static unsigned int
> +tracing_buffers_poll(struct file *filp, poll_table *poll_table)
> +{
> +	struct ftrace_buffer_info *info = filp->private_data;
> +	struct trace_iterator *iter = &info->iter;
> +
> +	return trace_poll(iter, filp, poll_table);
> +}
> +
>  static ssize_t
>  tracing_buffers_read(struct file *filp, char __user *ubuf,
>  		     size_t count, loff_t *ppos)
>  {
>  	struct ftrace_buffer_info *info = filp->private_data;
> +	struct trace_iterator *iter = &info->iter;
>  	ssize_t ret;
>  	size_t size;
>  
> @@ -4354,7 +4370,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
>  		return 0;
>  
>  	if (!info->spare)
> -		info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
> +		info->spare = ring_buffer_alloc_read_page(iter->tr->buffer, iter->cpu_file);
>  	if (!info->spare)
>  		return -ENOMEM;
>  
> @@ -4362,12 +4378,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
>  	if (info->read < PAGE_SIZE)
>  		goto read;
>  
> -	trace_access_lock(info->cpu);
> -	ret = ring_buffer_read_page(info->tr->buffer,
> +	trace_access_lock(iter->cpu_file);
> +	ret = ring_buffer_read_page(iter->tr->buffer,
>  				    &info->spare,
>  				    count,
> -				    info->cpu, 0);
> -	trace_access_unlock(info->cpu);
> +				    iter->cpu_file, 0);
> +	trace_access_unlock(iter->cpu_file);
>  	if (ret < 0)
>  		return 0;
>  
> @@ -4392,9 +4408,10 @@ read:
>  static int tracing_buffers_release(struct inode *inode, struct file *file)
>  {
>  	struct ftrace_buffer_info *info = file->private_data;
> +	struct trace_iterator *iter = &info->iter;
>  
>  	if (info->spare)
> -		ring_buffer_free_read_page(info->tr->buffer, info->spare);
> +		ring_buffer_free_read_page(iter->tr->buffer, info->spare);
>  	kfree(info);
>  
>  	return 0;
> @@ -4461,6 +4478,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
>  			    unsigned int flags)
>  {
>  	struct ftrace_buffer_info *info = file->private_data;
> +	struct trace_iterator *iter = &info->iter;
>  	struct partial_page partial_def[PIPE_DEF_BUFFERS];
>  	struct page *pages_def[PIPE_DEF_BUFFERS];
>  	struct splice_pipe_desc spd = {
> @@ -4491,8 +4509,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
>  		len &= PAGE_MASK;
>  	}
>  
> -	trace_access_lock(info->cpu);
> -	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
> + again:
> +	trace_access_lock(iter->cpu_file);
> +	entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file);
>  
>  	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
>  		struct page *page;
> @@ -4503,15 +4522,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
>  			break;
>  
>  		ref->ref = 1;
> -		ref->buffer = info->tr->buffer;
> -		ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
> +		ref->buffer = iter->tr->buffer;
> +		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
>  		if (!ref->page) {
>  			kfree(ref);
>  			break;
>  		}
>  
>  		r = ring_buffer_read_page(ref->buffer, &ref->page,
> -					  len, info->cpu, 1);
> +					  len, iter->cpu_file, 1);
>  		if (r < 0) {
>  			ring_buffer_free_read_page(ref->buffer, ref->page);
>  			kfree(ref);
> @@ -4535,20 +4554,24 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
>  		spd.nr_pages++;
>  		*ppos += PAGE_SIZE;
>  
> -		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
> +		entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file);
>  	}
>  
> -	trace_access_unlock(info->cpu);
> +	trace_access_unlock(iter->cpu_file);
>  	spd.nr_pages = i;
>  
>  	/* did we read anything? */
>  	if (!spd.nr_pages) {
> -		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
> +		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
>  			ret = -EAGAIN;
> -		else
> -			ret = 0;
> -		/* TODO: block */
> -		goto out;
> +			goto out;
> +		}
> +		default_wait_pipe(iter);
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			goto out;
> +		}
> +		goto again;
>  	}
>  
>  	ret = splice_to_pipe(pipe, &spd);
> @@ -4560,6 +4583,7 @@ out:
>  static const struct file_operations tracing_buffers_fops = {
>  	.open		= tracing_buffers_open,
>  	.read		= tracing_buffers_read,
> +	.poll		= tracing_buffers_poll,
>  	.release	= tracing_buffers_release,
>  	.splice_read	= tracing_buffers_splice_read,
>  	.llseek		= no_llseek,


-- 

Cheers,
Mauro

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2013-03-12 15:52 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-03-02  2:48 [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt
2013-03-02  2:48 ` [for-next][PATCH 1/7] tracing: Get trace_events kernel command line working again Steven Rostedt
2013-03-02  2:48 ` [for-next][PATCH 2/7] tracing: Use kmem_cache_alloc instead of kmalloc in trace_events.c Steven Rostedt
2013-03-02  2:48 ` [for-next][PATCH 3/7] tracing: Use direct field, type and system names Steven Rostedt
2013-03-02  2:48 ` [for-next][PATCH 4/7] tracing: Do not block on splice if either file or splice NONBLOCK flag is set Steven Rostedt
2013-03-02  2:48 ` [for-next][PATCH 5/7] tracing: Fix polling on trace_pipe_raw Steven Rostedt
2013-03-12 15:52   ` Mauro Carvalho Chehab
2013-03-02  2:48 ` [for-next][PATCH 6/7] tracing: Fix read blocking " Steven Rostedt
2013-03-02  2:48 ` [for-next][PATCH 7/7] tracing/ring-buffer: Move poll wake ups into ring buffer code Steven Rostedt
2013-03-02  3:06 ` [for-next][PATCH 0/7] tracing: fixups, memory savings, and block on splice Steven Rostedt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox