public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
To: Linus Torvalds <torvalds@linux-foundation.org>,
	Ingo Molnar <mingo@elte.hu>,
	linux-kernel@vger.kernel.org,
	Andrew Morton <akpm@linux-foundation.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	ltt-dev@lists.casi.polymtl.ca,
	Peter Zijlstra <peterz@infradead.org>,
	Frederic Weisbecker <fweisbec@gmail.com>,
	Arjan van de Ven <arjan@infradead.org>,
	Pekka Paalanen <pq@iki.fi>,
	Arnaldo Carvalho de Melo <acme@redhat.com>,
	"H. Peter Anvin" <hpa@zytor.com>,
	Martin Bligh <mbligh@google.com>,
	"Frank Ch. Eigler" <fche@redhat.com>,
	Tom Zanussi <tzanussi@gmail.com>,
	Masami Hiramatsu <mhiramat@redhat.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Jason Baron <jbaron@redhat.com>,
	Christoph Hellwig <hch@infradead.org>,
	Jiaying Zhang <jiayingz@google.com>,
	Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>,
	mrubin@google.com, md@google.com
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>,
	William Lee Irwin III <wli@holomorphy.com>
Subject: [RFC patch 15/41] Poll : add poll_wait_set_exclusive
Date: Thu, 05 Mar 2009 17:47:43 -0500	[thread overview]
Message-ID: <20090305225515.055491563@polymtl.ca> (raw)
In-Reply-To: 20090305224728.947235917@polymtl.ca

[-- Attachment #1: poll-wait-exclusive.patch --]
[-- Type: text/plain, Size: 4267 bytes --]


Problem description :

In LTTng, all lttd readers are polling all the available debugfs files
for data. This is principally because the number of reader threads is
user-defined and there are typical workloads where a single CPU is
producing most of the tracing data and all other CPUs are idle,
available to consume data. It therefore makes sense not to tie those
threads to specific buffers. However, when the number of threads grows,
we face a "thundering herd" problem where many threads can be woken up
and put back to sleep, leaving only a single thread doing useful work.

Solution :

I just created a patch which adds a poll_wait_set_exclusive() primitive
to poll(), so the code which implements the pollfd operation can specify
that only a single waiter must be woken up.

poll_wait_set_exclusive : set poll wait queue to exclusive
Sets up a poll wait queue to use exclusive wakeups. This is useful to
wake up only one waiter at each wakeup. Used to work-around "thundering herd"
problem.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: William Lee Irwin III <wli@holomorphy.com>
CC: Ingo Molnar <mingo@elte.hu>
---
 fs/select.c          |   41 ++++++++++++++++++++++++++++++++++++++---
 include/linux/poll.h |    2 ++
 2 files changed, 40 insertions(+), 3 deletions(-)

Index: linux-2.6-lttng/fs/select.c
===================================================================
--- linux-2.6-lttng.orig/fs/select.c	2009-03-05 15:41:29.000000000 -0500
+++ linux-2.6-lttng/fs/select.c	2009-03-05 15:41:42.000000000 -0500
@@ -105,6 +105,9 @@ struct poll_table_page {
  */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 		       poll_table *p);
+static void __pollwait_exclusive(struct file *filp,
+				 wait_queue_head_t *wait_address,
+				 poll_table *p);
 
 void poll_initwait(struct poll_wqueues *pwq)
 {
@@ -144,6 +147,20 @@ void poll_freewait(struct poll_wqueues *
 }
 EXPORT_SYMBOL(poll_freewait);
 
+/**
+ * poll_wait_set_exclusive - set poll wait queue to exclusive
+ *
+ * Sets up a poll wait queue to use exclusive wakeups. This is useful to
+ * wake up only one waiter at each wakeup. Used to work-around "thundering herd"
+ * problem.
+ */
+void poll_wait_set_exclusive(poll_table *p)
+{
+	if (p)
+		init_poll_funcptr(p, __pollwait_exclusive);
+}
+EXPORT_SYMBOL(poll_wait_set_exclusive);
+
 static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 {
 	struct poll_table_page *table = p->table;
@@ -195,8 +212,10 @@ static int pollwake(wait_queue_t *wait, 
 }
 
 /* Add a new entry */
-static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
-				poll_table *p)
+static void __pollwait_common(struct file *filp,
+			      wait_queue_head_t *wait_address,
+			      poll_table *p,
+			      int exclusive)
 {
 	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
 	struct poll_table_entry *entry = poll_get_entry(pwq);
@@ -207,7 +226,23 @@ static void __pollwait(struct file *filp
 	entry->wait_address = wait_address;
 	init_waitqueue_func_entry(&entry->wait, pollwake);
 	entry->wait.private = pwq;
-	add_wait_queue(wait_address, &entry->wait);
+	if (!exclusive)
+		add_wait_queue(wait_address, &entry->wait);
+	else
+		add_wait_queue_exclusive(wait_address, &entry->wait);
+}
+
+static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
+				poll_table *p)
+{
+	__pollwait_common(filp, wait_address, p, 0);
+}
+
+static void __pollwait_exclusive(struct file *filp,
+				 wait_queue_head_t *wait_address,
+				 poll_table *p)
+{
+	__pollwait_common(filp, wait_address, p, 1);
 }
 
 int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
Index: linux-2.6-lttng/include/linux/poll.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/poll.h	2009-03-05 15:41:29.000000000 -0500
+++ linux-2.6-lttng/include/linux/poll.h	2009-03-05 15:41:32.000000000 -0500
@@ -74,6 +74,8 @@ static inline int poll_schedule(struct p
 	return poll_schedule_timeout(pwq, state, NULL, 0);
 }
 
+extern void poll_wait_set_exclusive(poll_table *p);
+
 /*
  * Scaleable version of the fd_set.
  */

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

  parent reply	other threads:[~2009-03-05 23:33 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-03-05 22:47 [RFC patch 00/41] LTTng 0.105 core for Linux 2.6.27-rc9 Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 01/41] LTTng - core header Mathieu Desnoyers
2009-03-06 18:37   ` Steven Rostedt
2009-03-05 22:47 ` [RFC patch 02/41] LTTng - core data structures Mathieu Desnoyers
2009-03-06 18:41   ` Steven Rostedt
2009-03-05 22:47 ` [RFC patch 03/41] LTTng core x86 Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 04/41] LTTng core powerpc Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 05/41] LTTng relay buffer allocation, read, write Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 06/41] LTTng optimize write to page function Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 07/41] LTTng dynamic channels Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 08/41] LTTng - tracer header Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 09/41] LTTng optimize write to page function deal with unaligned access Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 10/41] lttng-optimize-write-to-page-function-remove-some-memcpy-calls Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 11/41] ltt-relay: cache pages address Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 12/41] x86 : export vmalloc_sync_all() Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 13/41] LTTng - tracer code Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 14/41] Splice and pipe : export pipe buf operations for GPL modules Mathieu Desnoyers
2009-03-05 22:47 ` Mathieu Desnoyers [this message]
2009-03-05 22:47 ` [RFC patch 16/41] LTTng Transport Locked Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 17/41] LTTng - serialization Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 18/41] Seq_file add support for sorted list Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 19/41] Sort module list by pointer address to get coherent sleepable seq_file iterators Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 20/41] Linux Kernel Markers - Iterator Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 21/41] LTTng probes specialized tracepoints Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 22/41] LTTng marker control Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 23/41] Immediate Values Stub header Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 24/41] Linux Kernel Markers - Use Immediate Values Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 25/41] Markers Support for Proprierary Modules Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 26/41] Marers remove old comment Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 27/41] Markers use dynamic channels Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 28/41] LTT trace control Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 29/41] LTTng menus Mathieu Desnoyers
2009-03-05 23:35   ` Randy Dunlap
2009-03-05 23:47     ` Mathieu Desnoyers
2009-03-05 23:51       ` Randy Dunlap
2009-03-06  0:01         ` [ltt-dev] " Mathieu Desnoyers
2009-03-06  0:12           ` Randy Dunlap
2009-03-05 22:47 ` [RFC patch 30/41] LTTng build Mathieu Desnoyers
2009-03-05 22:47 ` [RFC patch 31/41] LTTng userspace event v2 Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 32/41] LTTng filter Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 33/41] LTTng dynamic tracing support with kprobes Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 34/41] Marker header API update Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 35/41] Marker " Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 36/41] kvm markers " Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 37/41] Markers : multi-probes test Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 38/41] Markers examples API update Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 39/41] SPUFS markers " Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 40/41] EXT4: instrumentation with tracepoints Mathieu Desnoyers
2009-03-05 22:48 ` [RFC patch 41/41] JBD2: use tracepoints for instrumentation Mathieu Desnoyers
2009-03-06 10:11 ` [RFC patch 00/41] LTTng 0.105 core for Linux 2.6.27-rc9 Ingo Molnar
2009-03-06 19:02   ` Mathieu Desnoyers
2009-03-11 18:32     ` Ingo Molnar
2009-03-13 16:18       ` Mathieu Desnoyers
2009-03-14 16:43         ` Ingo Molnar
2009-03-14 16:59           ` [ltt-dev] " Mathieu Desnoyers
2009-03-06 18:34 ` Steven Rostedt
2009-03-06 19:01   ` Frederic Weisbecker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090305225515.055491563@polymtl.ca \
    --to=mathieu.desnoyers@polymtl.ca \
    --cc=acme@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=arjan@infradead.org \
    --cc=eduard.munteanu@linux360.ro \
    --cc=fche@redhat.com \
    --cc=fweisbec@gmail.com \
    --cc=hch@infradead.org \
    --cc=hpa@zytor.com \
    --cc=jbaron@redhat.com \
    --cc=jiayingz@google.com \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ltt-dev@lists.casi.polymtl.ca \
    --cc=mbligh@google.com \
    --cc=md@google.com \
    --cc=mhiramat@redhat.com \
    --cc=mingo@elte.hu \
    --cc=mrubin@google.com \
    --cc=peterz@infradead.org \
    --cc=pq@iki.fi \
    --cc=rostedt@goodmis.org \
    --cc=torvalds@linux-foundation.org \
    --cc=tzanussi@gmail.com \
    --cc=wli@holomorphy.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox