* [RFC/PATCH 1/3] rt: rename rt_mutex_setprio to task_setprio
From: Peter Zijlstra @ 2007-10-22 9:50 UTC
To: linux-kernel
Cc: Daniel Walker, Steven Rostedt, Ingo Molnar, Thomas Gleixner,
Gregory Haskins, Oleg Nesterov, Peter Zijlstra
[-- Attachment #1: rt_mutex_setprio.patch --]
[-- Type: text/plain, Size: 2688 bytes --]
With multiple non-mutex users of this function, it's past time it got renamed.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 7 ++++++-
kernel/rcupreempt-boost.c | 4 ++--
kernel/sched.c | 8 ++------
3 files changed, 10 insertions(+), 9 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1655,9 +1655,14 @@ extern unsigned int sysctl_sched_compat_
extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
+extern void task_setprio(struct task_struct *p, int prio);
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
+static inline void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+ task_setprio(p, prio);
+}
extern void rt_mutex_adjust_pi(struct task_struct *p);
#else
static inline int rt_mutex_getprio(struct task_struct *p)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4445,10 +4445,8 @@ long __sched sleep_on_timeout(wait_queue
}
EXPORT_SYMBOL(sleep_on_timeout);
-#ifdef CONFIG_RT_MUTEXES
-
/*
- * rt_mutex_setprio - set the current priority of a task
+ * task_setprio - set the current priority of a task
* @p: task
* @prio: prio value (kernel-internal form)
*
@@ -4457,7 +4455,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
*
* Used by the rt_mutex code to implement priority inheritance logic.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void task_setprio(struct task_struct *p, int prio)
{
unsigned long flags;
int oldprio, prev_resched, on_rq;
@@ -4522,8 +4520,6 @@ out_unlock:
task_rq_unlock(rq, &flags);
}
-#endif
-
void set_user_nice(struct task_struct *p, long nice)
{
int old_prio, delta, on_rq;
Index: linux-2.6/kernel/rcupreempt-boost.c
===================================================================
--- linux-2.6.orig/kernel/rcupreempt-boost.c
+++ linux-2.6/kernel/rcupreempt-boost.c
@@ -233,7 +233,7 @@ static void rcu_boost_task(struct task_s
if (task->rcu_prio < task->prio) {
rcu_trace_boost_task_boosted(RCU_BOOST_ME);
- rt_mutex_setprio(task, task->rcu_prio);
+ task_setprio(task, task->rcu_prio);
}
}
@@ -325,7 +325,7 @@ void __rcu_preempt_unboost(void)
spin_lock(&curr->pi_lock);
prio = rt_mutex_getprio(curr);
- rt_mutex_setprio(curr, prio);
+ task_setprio(curr, prio);
curr->rcub_rbdp = NULL;
--
* [RFC/PATCH 2/3] rt: PI-workqueue support
From: Peter Zijlstra @ 2007-10-22 9:50 UTC
To: linux-kernel
Cc: Daniel Walker, Steven Rostedt, Ingo Molnar, Thomas Gleixner,
Gregory Haskins, Oleg Nesterov, Peter Zijlstra
[-- Attachment #1: rt-workqeue-prio.patch --]
[-- Type: text/plain, Size: 6334 bytes --]
Add support for priority queueing and priority inheritance to the workqueue
infrastructure. This is done by replacing the linear linked worklist with a
priority sorted plist.
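[ For illustration, a toy user-space model of the resulting queueing
  semantics -- not the kernel's <linux/plist.h> implementation, all names
  here are made up. Lower prio value means higher priority, and equal-prio
  items stay FIFO: ]

#include <stdio.h>

struct toy_work {
    int prio;                  /* kernel-internal: lower == higher prio */
    const char *name;
    struct toy_work *next;
};

/* keep the list sorted by ascending prio, tail-adding within a prio */
static void toy_plist_add(struct toy_work **list, struct toy_work *w)
{
    while (*list && (*list)->prio <= w->prio)
        list = &(*list)->next;
    w->next = *list;
    *list = w;
}

int main(void)
{
    struct toy_work a = { 120, "nice-0 work" };
    struct toy_work b = {  40, "RT work" };
    struct toy_work c = { 120, "more nice-0 work" };
    struct toy_work *list = NULL, *w;

    toy_plist_add(&list, &a);
    toy_plist_add(&list, &b);
    toy_plist_add(&list, &c);

    /* prints 40, 120, 120: the RT work is picked up first */
    for (w = list; w; w = w->next)
        printf("%3d %s\n", w->prio, w->name);
    return 0;
}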
The drawback is that this breaks the workqueue barrier, needed to support
flush_workqueue() and wait_on_work().
Signed-off-by: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/workqueue.h | 7 ++++---
kernel/power/poweroff.c | 1 +
kernel/sched.c | 4 ----
kernel/workqueue.c | 40 +++++++++++++++++++++++++---------------
4 files changed, 30 insertions(+), 22 deletions(-)
Index: linux-2.6/include/linux/workqueue.h
===================================================================
--- linux-2.6.orig/include/linux/workqueue.h
+++ linux-2.6/include/linux/workqueue.h
@@ -8,6 +8,7 @@
#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
+#include <linux/plist.h>
#include <asm/atomic.h>
struct workqueue_struct;
@@ -26,7 +27,7 @@ struct work_struct {
#define WORK_STRUCT_PENDING 0 /* T if work item pending execution */
#define WORK_STRUCT_FLAG_MASK (3UL)
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
- struct list_head entry;
+ struct plist_node entry;
work_func_t func;
};
@@ -43,7 +44,7 @@ struct execute_work {
#define __WORK_INITIALIZER(n, f) { \
.data = WORK_DATA_INIT(), \
- .entry = { &(n).entry, &(n).entry }, \
+ .entry = PLIST_NODE_INIT(n.entry, MAX_PRIO), \
.func = (f), \
}
@@ -79,7 +80,7 @@ struct execute_work {
#define INIT_WORK(_work, _func) \
do { \
(_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
- INIT_LIST_HEAD(&(_work)->entry); \
+ plist_node_init(&(_work)->entry, -1); \
PREPARE_WORK((_work), (_func)); \
} while (0)
Index: linux-2.6/kernel/power/poweroff.c
===================================================================
--- linux-2.6.orig/kernel/power/poweroff.c
+++ linux-2.6/kernel/power/poweroff.c
@@ -8,6 +8,7 @@
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/pm.h>
+#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>
Index: linux-2.6/kernel/workqueue.c
===================================================================
--- linux-2.6.orig/kernel/workqueue.c
+++ linux-2.6/kernel/workqueue.c
@@ -44,7 +44,7 @@ struct cpu_workqueue_struct {
spinlock_t lock;
- struct list_head worklist;
+ struct plist_head worklist;
wait_queue_head_t more_work;
struct work_struct *current_work;
@@ -127,16 +127,19 @@ struct cpu_workqueue_struct *get_wq_data
static void insert_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work, int tail)
{
+ int prio = current->normal_prio;
+
set_wq_data(work, cwq);
/*
* Ensure that we get the right work->data if we see the
* result of list_add() below, see try_to_grab_pending().
*/
smp_wmb();
- if (tail)
- list_add_tail(&work->entry, &cwq->worklist);
- else
- list_add(&work->entry, &cwq->worklist);
+ plist_node_init(&work->entry, prio);
+ plist_add(&work->entry, &cwq->worklist);
+
+ if (prio < cwq->thread->prio)
+ task_setprio(cwq->thread, prio);
wake_up(&cwq->more_work);
}
@@ -168,7 +171,7 @@ int fastcall queue_work(struct workqueue
int ret = 0, cpu = raw_smp_processor_id();
if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- BUG_ON(!list_empty(&work->entry));
+ BUG_ON(!plist_node_empty(&work->entry));
__queue_work(wq_per_cpu(wq, cpu), work);
ret = 1;
}
@@ -222,7 +225,7 @@ int queue_delayed_work_on(int cpu, struc
if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
+ BUG_ON(!plist_node_empty(&work->entry));
/* This stores cwq for the moment, for the timer_fn */
set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
@@ -264,13 +267,17 @@ static void run_workqueue(struct cpu_wor
__FUNCTION__, cwq->run_depth);
dump_stack();
}
- while (!list_empty(&cwq->worklist)) {
- struct work_struct *work = list_entry(cwq->worklist.next,
+ while (!plist_head_empty(&cwq->worklist)) {
+ struct work_struct *work = plist_first_entry(&cwq->worklist,
struct work_struct, entry);
work_func_t f = work->func;
+ if (likely(cwq->thread->prio != work->entry.prio))
+ task_setprio(cwq->thread, work->entry.prio);
+
cwq->current_work = work;
- list_del_init(cwq->worklist.next);
+ plist_del(&work->entry, &cwq->worklist);
+ plist_node_init(&work->entry, MAX_PRIO);
spin_unlock_irq(&cwq->lock);
BUG_ON(get_wq_data(work) != cwq);
@@ -283,6 +290,7 @@ static void run_workqueue(struct cpu_wor
spin_lock_irq(&cwq->lock);
cwq->current_work = NULL;
}
+ task_setprio(cwq->thread, current->normal_prio);
cwq->run_depth--;
spin_unlock_irq(&cwq->lock);
}
@@ -301,7 +309,7 @@ static int worker_thread(void *__cwq)
prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
if (!freezing(current) &&
!kthread_should_stop() &&
- list_empty(&cwq->worklist))
+ plist_head_empty(&cwq->worklist))
schedule();
finish_wait(&cwq->more_work, &wait);
@@ -354,7 +362,8 @@ static int flush_cpu_workqueue(struct cp
active = 0;
spin_lock_irq(&cwq->lock);
- if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ if (!plist_head_empty(&cwq->worklist) ||
+ cwq->current_work != NULL) {
insert_wq_barrier(cwq, &barr, 1);
active = 1;
}
@@ -413,7 +422,7 @@ static int try_to_grab_pending(struct wo
return ret;
spin_lock_irq(&cwq->lock);
- if (!list_empty(&work->entry)) {
+ if (!plist_node_empty(&work->entry)) {
/*
* This work is queued, but perhaps we locked the wrong cwq.
* In that case we must see the new value after rmb(), see
@@ -421,7 +430,8 @@ static int try_to_grab_pending(struct wo
*/
smp_rmb();
if (cwq == get_wq_data(work)) {
- list_del_init(&work->entry);
+ plist_del(&work->entry, &cwq->worklist);
+ plist_node_init(&work->entry, MAX_PRIO);
ret = 1;
}
}
@@ -747,7 +757,7 @@ init_cpu_workqueue(struct workqueue_stru
cwq->wq = wq;
spin_lock_init(&cwq->lock);
- INIT_LIST_HEAD(&cwq->worklist);
+ plist_head_init(&cwq->worklist, NULL);
init_waitqueue_head(&cwq->more_work);
return cwq;
--
* [RFC/PATCH 3/3] rt: PI-workqueue: fix barriers
From: Peter Zijlstra @ 2007-10-22 9:50 UTC
To: linux-kernel
Cc: Daniel Walker, Steven Rostedt, Ingo Molnar, Thomas Gleixner,
Gregory Haskins, Oleg Nesterov, Peter Zijlstra
[-- Attachment #1: rt-workqueue-barrier.patch --]
[-- Type: text/plain, Size: 9468 bytes --]
The plist change to the workqueues left the barrier functionality broken.
The barrier is used for two things:
- wait_on_work(), and
- flush_cpu_workqueue().
wait_on_work() - uses the barrier to wait for the completion of the current
worklet. This was done by inserting a completion barrier at the very head of
the worklist. With plist this would be the head of the highest prio.
In order to do that, we extend the plist_add functionality to allow adding to
the head of a prio. Another noteworthy point is that this high prio worklet
must not boost the prio further than the waiting task's prio, even though we
enqueue it at prio 99.
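[ A toy model of that extension, reusing struct toy_work from the sketch
  in patch 2/3 -- illustrative only, the real change is in the plist.c
  hunk below. Tail-add keeps FIFO order within a prio; head-add jumps the
  queue within that prio, which is what the barrier worklet needs: ]

static void toy_plist_add_pos(struct toy_work **list, struct toy_work *w,
                              int tail)
{
    /* tail: skip past equal-prio entries; head: stop before them */
    while (*list && (tail ? (*list)->prio <= w->prio
                          : (*list)->prio <  w->prio))
        list = &(*list)->next;
    w->next = *list;
    *list = w;
}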
flush_cpu_workqueue() - is a full ordering barrier, although, as the name
suggests, it is usually used to wait for the worklist to drain. We'll support the
full ordering semantics currently present. This means that:
W10, W22, W65, B, W80, B, W99
[ where Wn is a worklet at prio n, and B the barrier ]
would most likely execute in the following order:
W10@99, W65@99, W22@99, W80@99, W99
[ Wn@m is Wn executed at prio m ]
[ W10 would be first because it can start executing while the others
are being added ]
Whereas without the barriers it would be:
W10@99, W99, W80, W65, W22
The prio ordering of the plist makes it hard to impose an extra order on top.
The solution used is to nest plist structures. The example will look like:
W10, B(B(W65, W22), W80), W99
That is, the barrier will splice the worklist into itself, and enqueue itself
as the next item to run (very first item, highest prio). The barrier will then
run its own plist to completion before 'popping' back to the regular worklist.
To avoid callstack nesting, run_workqueue is taught about this barrier stack.
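[ A runnable user-space simulation of this nesting -- the ordering only,
  no prio boosting; all types and names are made up, the real code is in
  the workqueue.c hunks below: ]

#include <stdio.h>

struct item {
    const char *name;
    int prio;                   /* lower value runs first */
    int is_barrier;
    struct item *next;
    struct item *list;          /* barrier: spliced-off private list */
    struct item *prev_barrier;
};

static void add_sorted(struct item **list, struct item *w)
{
    while (*list && (*list)->prio <= w->prio)
        list = &(*list)->next;
    w->next = *list;
    *list = w;
}

static void queue_barrier(struct item **list, struct item *b)
{
    b->list = *list;            /* splice pending work into the barrier */
    *list = NULL;
    add_sorted(list, b);        /* the barrier itself runs first */
}

int main(void)
{
    /* the W10, W22, W65, B, W80, B, W99 example; stored prio is
     * 100 - RT prio so higher RT priority sorts first, barriers get
     * prio 0; W10 is already executing, so it never sits queued */
    struct item w22 = { "W22", 78 }, w65 = { "W65", 35 };
    struct item w80 = { "W80", 20 }, w99 = { "W99", 1 };
    struct item b1 = { "B1", 0, 1 }, b2 = { "B2", 0, 1 };
    struct item *list = NULL, *barrier = NULL;

    add_sorted(&list, &w22);
    add_sorted(&list, &w65);
    queue_barrier(&list, &b1);
    add_sorted(&list, &w80);
    queue_barrier(&list, &b2);
    add_sorted(&list, &w99);

    for (;;) {
        while (list) {
            struct item *w = list;
            struct item *outer;

            list = w->next;
            if (!w->is_barrier) {
                printf("run %s\n", w->name);
                continue;
            }
            /* push a nesting level: run the barrier's private
             * list, stash the outer remainder for later */
            w->prev_barrier = barrier;
            barrier = w;
            outer = list;
            list = barrier->list;
            barrier->list = outer;
        }
        if (!barrier)
            break;
        /* private list drained: complete the barrier, pop a level */
        printf("%s complete\n", barrier->name);
        list = barrier->list;
        barrier = barrier->prev_barrier;
    }
    /* prints: run W65, run W22, B1 complete, run W80,
     *         B2 complete, run W99 */
    return 0;
}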
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/plist.h | 14 ++++++
kernel/workqueue.c | 104 ++++++++++++++++++++++++++++++++++++++++++--------
lib/plist.c | 10 +++-
3 files changed, 110 insertions(+), 18 deletions(-)
Index: linux-2.6/include/linux/plist.h
===================================================================
--- linux-2.6.orig/include/linux/plist.h
+++ linux-2.6/include/linux/plist.h
@@ -145,9 +145,21 @@ static inline void plist_node_init(struc
plist_head_init(&node->plist, NULL);
}
-extern void plist_add(struct plist_node *node, struct plist_head *head);
+void __plist_add(struct plist_node *node, struct plist_head *head, int tail);
+
+static inline void plist_add(struct plist_node *node, struct plist_head *head)
+{
+ __plist_add(node, head, 1);
+}
+
extern void plist_del(struct plist_node *node, struct plist_head *head);
+static inline void plist_head_splice(struct plist_head *src, struct plist_head *dst)
+{
+ list_splice_init(&src->prio_list, &dst->prio_list);
+ list_splice_init(&src->node_list, &dst->node_list);
+}
+
/**
* plist_for_each - iterate over the plist
* @pos: the type * to use as a loop counter
Index: linux-2.6/lib/plist.c
===================================================================
--- linux-2.6.orig/lib/plist.c
+++ linux-2.6/lib/plist.c
@@ -72,7 +72,7 @@ static void plist_check_head(struct plis
* @node: &struct plist_node pointer
* @head: &struct plist_head pointer
*/
-void plist_add(struct plist_node *node, struct plist_head *head)
+void __plist_add(struct plist_node *node, struct plist_head *head, int tail)
{
struct plist_node *iter;
@@ -92,7 +92,13 @@ void plist_add(struct plist_node *node,
lt_prio:
list_add_tail(&node->plist.prio_list, &iter->plist.prio_list);
eq_prio:
- list_add_tail(&node->plist.node_list, &iter->plist.node_list);
+ if (likely(tail))
+ list_add_tail(&node->plist.node_list, &iter->plist.node_list);
+ else {
+ iter = list_entry(iter->plist.prio_list.prev,
+ struct plist_node, plist.prio_list);
+ list_add(&node->plist.node_list, &iter->plist.node_list);
+ }
plist_check_head(head);
}
Index: linux-2.6/kernel/workqueue.c
===================================================================
--- linux-2.6.orig/kernel/workqueue.c
+++ linux-2.6/kernel/workqueue.c
@@ -36,6 +36,8 @@
#include <asm/uaccess.h>
+struct wq_full_barrier;
+
/*
* The per-CPU workqueue (if single thread, we always use the first
* possible cpu).
@@ -52,6 +54,8 @@ struct cpu_workqueue_struct {
struct task_struct *thread;
int run_depth; /* Detect run_workqueue() recursion depth */
+
+ struct wq_full_barrier *barrier;
} ____cacheline_aligned;
/*
@@ -125,10 +129,8 @@ struct cpu_workqueue_struct *get_wq_data
}
static void insert_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work, int tail)
+ struct work_struct *work, int prio, int boost_prio, int tail)
{
- int prio = current->normal_prio;
-
set_wq_data(work, cwq);
/*
* Ensure that we get the right work->data if we see the
@@ -136,10 +138,10 @@ static void insert_work(struct cpu_workq
*/
smp_wmb();
plist_node_init(&work->entry, prio);
- plist_add(&work->entry, &cwq->worklist);
+ __plist_add(&work->entry, &cwq->worklist, tail);
- if (prio < cwq->thread->prio)
- task_setprio(cwq->thread, prio);
+ if (boost_prio < cwq->thread->prio)
+ task_setprio(cwq->thread, boost_prio);
wake_up(&cwq->more_work);
}
@@ -150,7 +152,7 @@ static void __queue_work(struct cpu_work
unsigned long flags;
spin_lock_irqsave(&cwq->lock, flags);
- insert_work(cwq, work, 1);
+ insert_work(cwq, work, current->normal_prio, current->normal_prio, 1);
spin_unlock_irqrestore(&cwq->lock, flags);
}
@@ -257,8 +259,19 @@ static void leak_check(void *func)
dump_stack();
}
+struct wq_full_barrier {
+ struct work_struct work;
+ struct plist_head worklist;
+ struct wq_full_barrier *prev_barrier;
+ int prev_prio;
+ struct cpu_workqueue_struct *cwq;
+ struct completion done;
+};
+
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
+ struct plist_head *worklist = &cwq->worklist;
+
spin_lock_irq(&cwq->lock);
cwq->run_depth++;
if (cwq->run_depth > 3) {
@@ -267,16 +280,25 @@ static void run_workqueue(struct cpu_wor
__FUNCTION__, cwq->run_depth);
dump_stack();
}
- while (!plist_head_empty(&cwq->worklist)) {
- struct work_struct *work = plist_first_entry(&cwq->worklist,
+
+again:
+ while (!plist_head_empty(worklist)) {
+ int prio;
+ struct work_struct *work = plist_first_entry(worklist,
struct work_struct, entry);
work_func_t f = work->func;
- if (likely(cwq->thread->prio != work->entry.prio))
- task_setprio(cwq->thread, work->entry.prio);
+ prio = work->entry.prio;
+ if (unlikely(worklist != &cwq->worklist)) {
+ prio = min(prio, cwq->barrier->prev_prio);
+ prio = min(prio, plist_first(&cwq->worklist)->prio);
+ }
+
+ if (likely(cwq->thread->prio != prio))
+ task_setprio(cwq->thread, prio);
cwq->current_work = work;
- plist_del(&work->entry, &cwq->worklist);
+ plist_del(&work->entry, worklist);
plist_node_init(&work->entry, MAX_PRIO);
spin_unlock_irq(&cwq->lock);
@@ -289,7 +311,27 @@ static void run_workqueue(struct cpu_wor
spin_lock_irq(&cwq->lock);
cwq->current_work = NULL;
+
+ if (unlikely(cwq->barrier))
+ worklist = &cwq->barrier->worklist;
+ }
+
+ if (unlikely(worklist != &cwq->worklist)) {
+ struct wq_full_barrier *barrier = cwq->barrier;
+
+ BUG_ON(!barrier);
+ cwq->barrier = barrier->prev_barrier;
+ complete(&barrier->done);
+
+ if (unlikely(cwq->barrier))
+ worklist = &cwq->barrier->worklist;
+ else
+ worklist = &cwq->worklist;
+
+ if (!plist_head_empty(worklist))
+ goto again;
}
+
task_setprio(cwq->thread, current->normal_prio);
cwq->run_depth--;
spin_unlock_irq(&cwq->lock);
@@ -343,7 +385,38 @@ static void insert_wq_barrier(struct cpu
init_completion(&barr->done);
- insert_work(cwq, &barr->work, tail);
+ insert_work(cwq, &barr->work, 0, current->normal_prio, tail);
+}
+
+static void wq_full_barrier_func(struct work_struct *work)
+{
+ struct wq_full_barrier *barrier =
+ container_of(work, struct wq_full_barrier, work);
+ struct cpu_workqueue_struct *cwq = barrier->cwq;
+
+ spin_lock_irq(&cwq->lock);
+ barrier->prev_barrier = cwq->barrier;
+ if (cwq->barrier) {
+ barrier->prev_prio = min(cwq->barrier->prev_prio,
+ plist_first(&cwq->barrier->worklist)->prio);
+ } else
+ barrier->prev_prio = MAX_PRIO;
+ cwq->barrier = barrier;
+ spin_unlock_irq(&cwq->lock);
+}
+
+static void insert_wq_full_barrier(struct cpu_workqueue_struct *cwq,
+ struct wq_full_barrier *barr)
+{
+ INIT_WORK(&barr->work, wq_full_barrier_func);
+ __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
+
+ plist_head_init(&barr->worklist, NULL);
+ plist_head_splice(&cwq->worklist, &barr->worklist);
+ barr->cwq = cwq;
+ init_completion(&barr->done);
+
+ insert_work(cwq, &barr->work, 0, current->normal_prio, 1);
}
static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -358,13 +431,13 @@ static int flush_cpu_workqueue(struct cp
run_workqueue(cwq);
active = 1;
} else {
- struct wq_barrier barr;
+ struct wq_full_barrier barr;
active = 0;
spin_lock_irq(&cwq->lock);
if (!plist_head_empty(&cwq->worklist) ||
cwq->current_work != NULL) {
- insert_wq_barrier(cwq, &barr, 1);
+ insert_wq_full_barrier(cwq, &barr);
active = 1;
}
spin_unlock_irq(&cwq->lock);
@@ -759,6 +832,7 @@ init_cpu_workqueue(struct workqueue_stru
spin_lock_init(&cwq->lock);
plist_head_init(&cwq->worklist, NULL);
init_waitqueue_head(&cwq->more_work);
+ cwq->barrier = NULL;
return cwq;
}
--