* [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
@ 2007-08-01 0:26 Gregory Haskins
2007-08-01 3:52 ` Daniel Walker
0 siblings, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-01 0:26 UTC (permalink / raw)
To: linux-rt-users; +Cc: linux-kernel, ghaskins
The following workqueue related patch is a port of the original VFCIPI PI
patch I submitted earlier. There is still more work to be done to add the
"schedule_on_cpu()" type behavior, and even more if we want to use this as
part of KVM. But for now, this patch can stand alone so I thought I would
get it out there.
------------------------------------------------------------------
Mainline workqueues treat all requests the same. This patch prioritizes
requests in the queue based on the caller's current rt_priority. It also
adds a priority-inheritance feature to avoid inversion.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
include/linux/workqueue.h | 2
kernel/workqueue.c | 198 +++++++++++++++++++++++++++++++++------------
2 files changed, 149 insertions(+), 51 deletions(-)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 925d898..ae740b6 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -28,6 +28,7 @@ struct work_struct {
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
struct list_head entry;
work_func_t func;
+ int prio;
};
#define WORK_DATA_INIT() ATOMIC_LONG_INIT(0)
@@ -81,6 +82,7 @@ struct execute_work {
(_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
INIT_LIST_HEAD(&(_work)->entry); \
PREPARE_WORK((_work), (_func)); \
+ (_work)->prio = -1; \
} while (0)
#define INIT_DELAYED_WORK(_work, _func) \
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e339c80..ecd5e01 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -14,6 +14,8 @@
* Theodore Ts'o <tytso@mit.edu>
*
* Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
+ *
+ * RT-queues and PI by Gregory Haskins <ghaskins@novell.com>
*/
#include <linux/module.h>
@@ -36,6 +38,13 @@
#include <asm/uaccess.h>
+/* Note: prio_array inspiration credit goes to the RT scheduler...*/
+struct prio_array {
+ DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* +1 bit for delimiter */
+ unsigned long count;
+ struct list_head queue[MAX_RT_PRIO];
+};
+
/*
* The per-CPU workqueue (if single thread, we always use the first
* possible cpu).
@@ -45,6 +54,7 @@ struct cpu_workqueue_struct {
spinlock_t lock;
struct list_head worklist;
+ struct prio_array rt_worklist;
wait_queue_head_t more_work;
struct work_struct *current_work;
@@ -52,6 +62,7 @@ struct cpu_workqueue_struct {
struct task_struct *thread;
int run_depth; /* Detect run_workqueue() recursion depth */
+ int prio; /* -1 = normal priority, 0-99 = RT priority */
} ____cacheline_aligned;
/*
@@ -82,6 +93,94 @@ static cpumask_t cpu_singlethread_map __read_mostly;
*/
static cpumask_t cpu_populated_map __read_mostly;
+/*
+ * ----------------------------------------
+ * prio_array
+ * ----------------------------------------
+ */
+static void prio_array_init(struct prio_array *array)
+{
+ int i;
+
+ memset(array->bitmap, 0, sizeof(array->bitmap));
+ array->count = 0;
+
+ for (i=0; i<MAX_RT_PRIO; i++)
+ INIT_LIST_HEAD(&array->queue[i]);
+}
+
+static struct work_struct* prio_array_dequeue(struct prio_array *array)
+{
+ struct list_head *head;
+ struct work_struct *work;
+ int idx;
+
+ if (!array->count)
+ return NULL;
+
+ idx = sched_find_first_bit(array->bitmap);
+
+ head = array->queue + idx;
+
+ /* If we got here, there better be something in the list */
+ BUG_ON(!head);
+ BUG_ON(list_empty(head));
+
+ work = list_first_entry(head, struct work_struct, entry);
+ BUG_ON(!work);
+
+ list_del_init(&work->entry);
+ array->count--;
+
+ if (list_empty(head))
+ __clear_bit(idx, &array->bitmap);
+
+ return work;
+}
+
+static void prio_array_enqueue(struct prio_array *array,
+ struct work_struct *work,
+ int prio, int tail)
+{
+ struct list_head *head;
+
+ BUG_ON(prio < 0);
+ BUG_ON(prio > MAX_RT_PRIO);
+
+ head = array->queue + prio;
+
+ if (tail)
+ list_add_tail(&work->entry, head);
+ else
+ list_add(&work->entry, head);
+
+ __set_bit(prio, &array->bitmap);
+ array->count++;
+}
+
+#define CWQ_DEFAULT_PRIO 1
+
+/* Assumes lock is held */
+static void wq_task_setprio(struct cpu_workqueue_struct *cwq, int prio)
+{
+ struct sched_param param = { 0, };
+ pid_t pid = cwq->thread->pid;
+
+ if (prio < CWQ_DEFAULT_PRIO)
+ prio = CWQ_DEFAULT_PRIO;
+
+ if (cwq->prio != prio) {
+ if (prio != -1) {
+ param.sched_priority = prio;
+ sys_sched_setscheduler(pid, SCHED_FIFO, &param);
+ } else {
+ sys_sched_setscheduler(pid, SCHED_NORMAL, &param);
+ }
+
+ cwq->prio = prio;
+ }
+}
+
/* If it's single threaded, it isn't in the list of workqueues. */
static inline int is_single_threaded(struct workqueue_struct *wq)
{
@@ -133,10 +232,22 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
* result of list_add() below, see try_to_grab_pending().
*/
smp_wmb();
- if (tail)
- list_add_tail(&work->entry, &cwq->worklist);
- else
- list_add(&work->entry, &cwq->worklist);
+ if (rt_task(current)) {
+ work->prio = current->rt_priority;
+ prio_array_enqueue(&cwq->rt_worklist, work, work->prio, tail);
+
+ /* Priority inheritance on the kthread */
+ if (cwq->prio < work->prio)
+ wq_task_setprio(cwq, work->prio);
+ } else {
+ work->prio = -1;
+ if (tail)
+ list_add_tail(&work->entry, &cwq->worklist);
+ else
+ list_add(&work->entry, &cwq->worklist);
+
+ }
+
wake_up(&cwq->more_work);
}
@@ -254,8 +365,30 @@ static void leak_check(void *func)
dump_stack();
}
+static struct work_struct* cwq_dequeue(struct cpu_workqueue_struct *cwq)
+{
+ struct work_struct *work;
+
+ /* First check the RT items */
+ work = prio_array_dequeue(&cwq->rt_worklist);
+ if (!work) {
+ /* If nothing is found there, check the normal queue */
+ if (!list_empty(&cwq->worklist)) {
+ work = list_first_entry(&cwq->worklist,
+ struct work_struct,
+ entry);
+ BUG_ON(!work);
+ list_del_init(&work->entry);
+ }
+ }
+
+ return work;
+}
+
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
+ struct work_struct *work;
+
spin_lock_irq(&cwq->lock);
cwq->run_depth++;
if (cwq->run_depth > 3) {
@@ -264,13 +397,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
__FUNCTION__, cwq->run_depth);
dump_stack();
}
- while (!list_empty(&cwq->worklist)) {
- struct work_struct *work = list_entry(cwq->worklist.next,
- struct work_struct, entry);
+ while ((work = cwq_dequeue(cwq))) {
work_func_t f = work->func;
+ wq_task_setprio(cwq, work->prio);
cwq->current_work = work;
- list_del_init(cwq->worklist.next);
+
spin_unlock_irq(&cwq->lock);
BUG_ON(get_wq_data(work) != cwq);
@@ -283,6 +415,10 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
spin_lock_irq(&cwq->lock);
cwq->current_work = NULL;
}
+
+ /* Make sure we are back to normal priority before sleeping */
+ wq_task_setprio(cwq, -1);
+
cwq->run_depth--;
spin_unlock_irq(&cwq->lock);
}
@@ -295,7 +431,7 @@ static int worker_thread(void *__cwq)
if (!cwq->wq->freezeable)
current->flags |= PF_NOFREEZE;
- set_user_nice(current, -5);
+ wq_task_setprio(cwq, -1);
for (;;) {
prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
@@ -691,7 +827,9 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
cwq->wq = wq;
spin_lock_init(&cwq->lock);
INIT_LIST_HEAD(&cwq->worklist);
+ prio_array_init(&cwq->rt_worklist);
init_waitqueue_head(&cwq->more_work);
+ cwq->prio = -1;
return cwq;
}
@@ -803,47 +941,6 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
cwq->thread = NULL;
}
-void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu,
- int policy, int rt_priority, int nice)
-{
- struct sched_param param = { .sched_priority = rt_priority };
- struct cpu_workqueue_struct *cwq;
- mm_segment_t oldfs = get_fs();
- struct task_struct *p;
- unsigned long flags;
- int ret;
-
- cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- spin_lock_irqsave(&cwq->lock, flags);
- p = cwq->thread;
- spin_unlock_irqrestore(&cwq->lock, flags);
-
- set_user_nice(p, nice);
-
- set_fs(KERNEL_DS);
- ret = sys_sched_setscheduler(p->pid, policy, &param);
- set_fs(oldfs);
-
- WARN_ON(ret);
-}
-
- void set_workqueue_prio(struct workqueue_struct *wq, int policy,
- int rt_priority, int nice)
-{
- int cpu;
-
- /* We don't need the distraction of CPUs appearing and vanishing. */
- mutex_lock(&workqueue_mutex);
- if (is_single_threaded(wq))
- set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice);
- else {
- for_each_online_cpu(cpu)
- set_workqueue_thread_prio(wq, cpu, policy,
- rt_priority, nice);
- }
- mutex_unlock(&workqueue_mutex);
-}
-
/**
* destroy_workqueue - safely terminate a workqueue
* @wq: target workqueue
@@ -926,5 +1023,4 @@ void __init init_workqueues(void)
hotcpu_notifier(workqueue_cpu_callback, 0);
keventd_wq = create_workqueue("events");
BUG_ON(!keventd_wq);
- set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20);
}
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 0:26 [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure Gregory Haskins
@ 2007-08-01 3:52 ` Daniel Walker
2007-08-01 11:59 ` Gregory Haskins
2007-08-01 17:01 ` Peter Zijlstra
0 siblings, 2 replies; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 3:52 UTC (permalink / raw)
To: Gregory Haskins; +Cc: linux-rt-users, linux-kernel
On Tue, 2007-07-31 at 20:26 -0400, Gregory Haskins wrote:
> The following workqueue related patch is a port of the original VFCIPI PI
> patch I submitted earlier. There is still more work to be done to add the
> "schedule_on_cpu()" type behavior, and even more if we want to use this as
> part of KVM. But for now, this patch can stand alone so I thought I would
> get it out there.
>
> ------------------------------------------------------------------
>
> Mainline workqueues treat all requests the same. This patch prioritizes
> requests in the queue based on the callers current rt_priority. It also
> adds a priority-inheritance feature to avoid inversion.
>
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
>
> include/linux/workqueue.h | 2
> kernel/workqueue.c | 198 +++++++++++++++++++++++++++++++++------------
> 2 files changed, 149 insertions(+), 51 deletions(-)
Here's a simpler version .. uses the plist data structure instead of the
100 queues, which makes for a cleaner patch ..
Signed-off-by: Daniel Walker <dwalker@mvista.com>
---
include/linux/workqueue.h | 7 ++++---
kernel/power/poweroff.c | 1 +
kernel/sched.c | 4 ----
kernel/workqueue.c | 40 +++++++++++++++++++++++++---------------
4 files changed, 30 insertions(+), 22 deletions(-)
Index: linux-2.6.22/include/linux/workqueue.h
===================================================================
--- linux-2.6.22.orig/include/linux/workqueue.h
+++ linux-2.6.22/include/linux/workqueue.h
@@ -8,6 +8,7 @@
#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
+#include <linux/plist.h>
#include <asm/atomic.h>
struct workqueue_struct;
@@ -26,7 +27,7 @@ struct work_struct {
#define WORK_STRUCT_PENDING 0 /* T if work item pending execution */
#define WORK_STRUCT_FLAG_MASK (3UL)
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
- struct list_head entry;
+ struct plist_node entry;
work_func_t func;
};
@@ -43,7 +44,7 @@ struct execute_work {
#define __WORK_INITIALIZER(n, f) { \
.data = WORK_DATA_INIT(), \
- .entry = { &(n).entry, &(n).entry }, \
+ .entry = PLIST_NODE_INIT(n.entry, MAX_PRIO), \
.func = (f), \
}
@@ -79,7 +80,7 @@ struct execute_work {
#define INIT_WORK(_work, _func) \
do { \
(_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
- INIT_LIST_HEAD(&(_work)->entry); \
+ plist_node_init(&(_work)->entry, -1); \
PREPARE_WORK((_work), (_func)); \
} while (0)
Index: linux-2.6.22/kernel/power/poweroff.c
===================================================================
--- linux-2.6.22.orig/kernel/power/poweroff.c
+++ linux-2.6.22/kernel/power/poweroff.c
@@ -8,6 +8,7 @@
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/pm.h>
+#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>
Index: linux-2.6.22/kernel/sched.c
===================================================================
--- linux-2.6.22.orig/kernel/sched.c
+++ linux-2.6.22/kernel/sched.c
@@ -4379,8 +4379,6 @@ long __sched sleep_on_timeout(wait_queue
}
EXPORT_SYMBOL(sleep_on_timeout);
-#ifdef CONFIG_RT_MUTEXES
-
/*
* rt_mutex_setprio - set the current priority of a task
* @p: task
@@ -4457,8 +4455,6 @@ out_unlock:
task_rq_unlock(rq, &flags);
}
-#endif
-
void set_user_nice(struct task_struct *p, long nice)
{
int old_prio, delta, on_rq;
Index: linux-2.6.22/kernel/workqueue.c
===================================================================
--- linux-2.6.22.orig/kernel/workqueue.c
+++ linux-2.6.22/kernel/workqueue.c
@@ -44,7 +44,7 @@ struct cpu_workqueue_struct {
spinlock_t lock;
- struct list_head worklist;
+ struct plist_head worklist;
wait_queue_head_t more_work;
struct work_struct *current_work;
@@ -127,16 +127,19 @@ struct cpu_workqueue_struct *get_wq_data
static void insert_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work, int tail)
{
+ int prio = current->normal_prio;
+
set_wq_data(work, cwq);
/*
* Ensure that we get the right work->data if we see the
* result of list_add() below, see try_to_grab_pending().
*/
smp_wmb();
- if (tail)
- list_add_tail(&work->entry, &cwq->worklist);
- else
- list_add(&work->entry, &cwq->worklist);
+ plist_node_init(&work->entry, prio);
+ plist_add(&work->entry, &cwq->worklist);
+
+ if (prio < cwq->thread->prio)
+ rt_mutex_setprio(cwq->thread, prio);
wake_up(&cwq->more_work);
}
@@ -168,7 +171,7 @@ int fastcall queue_work(struct workqueue
int ret = 0, cpu = raw_smp_processor_id();
if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- BUG_ON(!list_empty(&work->entry));
+ BUG_ON(!plist_node_empty(&work->entry));
__queue_work(wq_per_cpu(wq, cpu), work);
ret = 1;
}
@@ -222,7 +225,7 @@ int queue_delayed_work_on(int cpu, struc
if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
+ BUG_ON(!plist_node_empty(&work->entry));
/* This stores cwq for the moment, for the timer_fn */
set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
@@ -264,13 +267,17 @@ static void run_workqueue(struct cpu_wor
__FUNCTION__, cwq->run_depth);
dump_stack();
}
- while (!list_empty(&cwq->worklist)) {
- struct work_struct *work = list_entry(cwq->worklist.next,
+ while (!plist_head_empty(&cwq->worklist)) {
+ struct work_struct *work = plist_first_entry(&cwq->worklist,
struct work_struct, entry);
work_func_t f = work->func;
+ if (likely(cwq->thread->prio != work->entry.prio))
+ rt_mutex_setprio(cwq->thread, work->entry.prio);
+
cwq->current_work = work;
- list_del_init(cwq->worklist.next);
+ plist_del(&work->entry, &cwq->worklist);
+ plist_node_init(&work->entry, MAX_PRIO);
spin_unlock_irq(&cwq->lock);
BUG_ON(get_wq_data(work) != cwq);
@@ -283,6 +290,7 @@ static void run_workqueue(struct cpu_wor
spin_lock_irq(&cwq->lock);
cwq->current_work = NULL;
}
+ rt_mutex_setprio(cwq->thread, current->normal_prio);
cwq->run_depth--;
spin_unlock_irq(&cwq->lock);
}
@@ -301,7 +309,7 @@ static int worker_thread(void *__cwq)
prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
if (!freezing(current) &&
!kthread_should_stop() &&
- list_empty(&cwq->worklist))
+ plist_head_empty(&cwq->worklist))
schedule();
finish_wait(&cwq->more_work, &wait);
@@ -354,7 +362,8 @@ static int flush_cpu_workqueue(struct cp
active = 0;
spin_lock_irq(&cwq->lock);
- if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ if (!plist_head_empty(&cwq->worklist) ||
+ cwq->current_work != NULL) {
insert_wq_barrier(cwq, &barr, 1);
active = 1;
}
@@ -413,7 +422,7 @@ static int try_to_grab_pending(struct wo
return ret;
spin_lock_irq(&cwq->lock);
- if (!list_empty(&work->entry)) {
+ if (!plist_node_empty(&work->entry)) {
/*
* This work is queued, but perhaps we locked the wrong cwq.
* In that case we must see the new value after rmb(), see
@@ -421,7 +430,8 @@ static int try_to_grab_pending(struct wo
*/
smp_rmb();
if (cwq == get_wq_data(work)) {
- list_del_init(&work->entry);
+ plist_del(&work->entry, &cwq->worklist);
+ plist_node_init(&work->entry, MAX_PRIO);
ret = 1;
}
}
@@ -747,7 +757,7 @@ init_cpu_workqueue(struct workqueue_stru
cwq->wq = wq;
spin_lock_init(&cwq->lock);
- INIT_LIST_HEAD(&cwq->worklist);
+ plist_head_init(&cwq->worklist, NULL);
init_waitqueue_head(&cwq->more_work);
return cwq;
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 3:52 ` Daniel Walker
@ 2007-08-01 11:59 ` Gregory Haskins
2007-08-01 15:10 ` Daniel Walker
2007-08-01 17:01 ` Peter Zijlstra
1 sibling, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-01 11:59 UTC (permalink / raw)
To: Daniel Walker; +Cc: linux-rt-users, linux-kernel
On Tue, 2007-07-31 at 20:52 -0700, Daniel Walker wrote:
>
> Here's a simpler version .. uses the plist data structure instead of the
> 100 queues, which makes for a cleaner patch ..
Hi Daniel,
I like your idea on the plist simplification a lot. I will definitely
roll that into my series.
I am not too psyched about using the rt_mutex_setprio() API directly,
however. It seems broken to be calling that directly from non rt_mutex
code, IMHO. Perhaps the PI subsystem should be broken out from the
rt_mutex code so it can be used generally? There are other areas where
PI could potentially be used besides rt_mutex (this patch as a prime
example), so this might make sense.
Anyway, thanks for finding that simplification! I definitely learned
something.
-Greg
>
> Signed-off-by: Daniel Walker <dwalker@mvista.com>
>
> ---
> include/linux/workqueue.h | 7 ++++---
> kernel/power/poweroff.c | 1 +
> kernel/sched.c | 4 ----
> kernel/workqueue.c | 40 +++++++++++++++++++++++++---------------
> 4 files changed, 30 insertions(+), 22 deletions(-)
>
> Index: linux-2.6.22/include/linux/workqueue.h
> ===================================================================
> --- linux-2.6.22.orig/include/linux/workqueue.h
> +++ linux-2.6.22/include/linux/workqueue.h
> @@ -8,6 +8,7 @@
> #include <linux/timer.h>
> #include <linux/linkage.h>
> #include <linux/bitops.h>
> +#include <linux/plist.h>
> #include <asm/atomic.h>
>
> struct workqueue_struct;
> @@ -26,7 +27,7 @@ struct work_struct {
> #define WORK_STRUCT_PENDING 0 /* T if work item pending execution */
> #define WORK_STRUCT_FLAG_MASK (3UL)
> #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
> - struct list_head entry;
> + struct plist_node entry;
> work_func_t func;
> };
>
> @@ -43,7 +44,7 @@ struct execute_work {
>
> #define __WORK_INITIALIZER(n, f) { \
> .data = WORK_DATA_INIT(), \
> - .entry = { &(n).entry, &(n).entry }, \
> + .entry = PLIST_NODE_INIT(n.entry, MAX_PRIO), \
> .func = (f), \
> }
>
> @@ -79,7 +80,7 @@ struct execute_work {
> #define INIT_WORK(_work, _func) \
> do { \
> (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
> - INIT_LIST_HEAD(&(_work)->entry); \
> + plist_node_init(&(_work)->entry, -1); \
> PREPARE_WORK((_work), (_func)); \
> } while (0)
>
> Index: linux-2.6.22/kernel/power/poweroff.c
> ===================================================================
> --- linux-2.6.22.orig/kernel/power/poweroff.c
> +++ linux-2.6.22/kernel/power/poweroff.c
> @@ -8,6 +8,7 @@
> #include <linux/sysrq.h>
> #include <linux/init.h>
> #include <linux/pm.h>
> +#include <linux/sched.h>
> #include <linux/workqueue.h>
> #include <linux/reboot.h>
>
> Index: linux-2.6.22/kernel/sched.c
> ===================================================================
> --- linux-2.6.22.orig/kernel/sched.c
> +++ linux-2.6.22/kernel/sched.c
> @@ -4379,8 +4379,6 @@ long __sched sleep_on_timeout(wait_queue
> }
> EXPORT_SYMBOL(sleep_on_timeout);
>
> -#ifdef CONFIG_RT_MUTEXES
> -
> /*
> * rt_mutex_setprio - set the current priority of a task
> * @p: task
> @@ -4457,8 +4455,6 @@ out_unlock:
> task_rq_unlock(rq, &flags);
> }
>
> -#endif
> -
> void set_user_nice(struct task_struct *p, long nice)
> {
> int old_prio, delta, on_rq;
> Index: linux-2.6.22/kernel/workqueue.c
> ===================================================================
> --- linux-2.6.22.orig/kernel/workqueue.c
> +++ linux-2.6.22/kernel/workqueue.c
> @@ -44,7 +44,7 @@ struct cpu_workqueue_struct {
>
> spinlock_t lock;
>
> - struct list_head worklist;
> + struct plist_head worklist;
> wait_queue_head_t more_work;
> struct work_struct *current_work;
>
> @@ -127,16 +127,19 @@ struct cpu_workqueue_struct *get_wq_data
> static void insert_work(struct cpu_workqueue_struct *cwq,
> struct work_struct *work, int tail)
> {
> + int prio = current->normal_prio;
> +
> set_wq_data(work, cwq);
> /*
> * Ensure that we get the right work->data if we see the
> * result of list_add() below, see try_to_grab_pending().
> */
> smp_wmb();
> - if (tail)
> - list_add_tail(&work->entry, &cwq->worklist);
> - else
> - list_add(&work->entry, &cwq->worklist);
> + plist_node_init(&work->entry, prio);
> + plist_add(&work->entry, &cwq->worklist);
> +
> + if (prio < cwq->thread->prio)
> + rt_mutex_setprio(cwq->thread, prio);
> wake_up(&cwq->more_work);
> }
>
> @@ -168,7 +171,7 @@ int fastcall queue_work(struct workqueue
> int ret = 0, cpu = raw_smp_processor_id();
>
> if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> - BUG_ON(!list_empty(&work->entry));
> + BUG_ON(!plist_node_empty(&work->entry));
> __queue_work(wq_per_cpu(wq, cpu), work);
> ret = 1;
> }
> @@ -222,7 +225,7 @@ int queue_delayed_work_on(int cpu, struc
>
> if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> BUG_ON(timer_pending(timer));
> - BUG_ON(!list_empty(&work->entry));
> + BUG_ON(!plist_node_empty(&work->entry));
>
> /* This stores cwq for the moment, for the timer_fn */
> set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
> @@ -264,13 +267,17 @@ static void run_workqueue(struct cpu_wor
> __FUNCTION__, cwq->run_depth);
> dump_stack();
> }
> - while (!list_empty(&cwq->worklist)) {
> - struct work_struct *work = list_entry(cwq->worklist.next,
> + while (!plist_head_empty(&cwq->worklist)) {
> + struct work_struct *work = plist_first_entry(&cwq->worklist,
> struct work_struct, entry);
> work_func_t f = work->func;
>
> + if (likely(cwq->thread->prio != work->entry.prio))
> + rt_mutex_setprio(cwq->thread, work->entry.prio);
> +
> cwq->current_work = work;
> - list_del_init(cwq->worklist.next);
> + plist_del(&work->entry, &cwq->worklist);
> + plist_node_init(&work->entry, MAX_PRIO);
> spin_unlock_irq(&cwq->lock);
>
> BUG_ON(get_wq_data(work) != cwq);
> @@ -283,6 +290,7 @@ static void run_workqueue(struct cpu_wor
> spin_lock_irq(&cwq->lock);
> cwq->current_work = NULL;
> }
> + rt_mutex_setprio(cwq->thread, current->normal_prio);
> cwq->run_depth--;
> spin_unlock_irq(&cwq->lock);
> }
> @@ -301,7 +309,7 @@ static int worker_thread(void *__cwq)
> prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
> if (!freezing(current) &&
> !kthread_should_stop() &&
> - list_empty(&cwq->worklist))
> + plist_head_empty(&cwq->worklist))
> schedule();
> finish_wait(&cwq->more_work, &wait);
>
> @@ -354,7 +362,8 @@ static int flush_cpu_workqueue(struct cp
>
> active = 0;
> spin_lock_irq(&cwq->lock);
> - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
> + if (!plist_head_empty(&cwq->worklist) ||
> + cwq->current_work != NULL) {
> insert_wq_barrier(cwq, &barr, 1);
> active = 1;
> }
> @@ -413,7 +422,7 @@ static int try_to_grab_pending(struct wo
> return ret;
>
> spin_lock_irq(&cwq->lock);
> - if (!list_empty(&work->entry)) {
> + if (!plist_node_empty(&work->entry)) {
> /*
> * This work is queued, but perhaps we locked the wrong cwq.
> * In that case we must see the new value after rmb(), see
> @@ -421,7 +430,8 @@ static int try_to_grab_pending(struct wo
> */
> smp_rmb();
> if (cwq == get_wq_data(work)) {
> - list_del_init(&work->entry);
> + plist_del(&work->entry, &cwq->worklist);
> + plist_node_init(&work->entry, MAX_PRIO);
> ret = 1;
> }
> }
> @@ -747,7 +757,7 @@ init_cpu_workqueue(struct workqueue_stru
>
> cwq->wq = wq;
> spin_lock_init(&cwq->lock);
> - INIT_LIST_HEAD(&cwq->worklist);
> + plist_head_init(&cwq->worklist, NULL);
> init_waitqueue_head(&cwq->more_work);
>
> return cwq;
>
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 11:59 ` Gregory Haskins
@ 2007-08-01 15:10 ` Daniel Walker
2007-08-01 15:19 ` Gregory Haskins
2007-08-01 21:48 ` Esben Nielsen
0 siblings, 2 replies; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 15:10 UTC (permalink / raw)
To: Gregory Haskins; +Cc: linux-rt-users, linux-kernel
On Wed, 2007-08-01 at 07:59 -0400, Gregory Haskins wrote:
> On Tue, 2007-07-31 at 20:52 -0700, Daniel Walker wrote:
>
> >
> > Here's a simpler version .. uses the plist data structure instead of the
> > 100 queues, which makes for a cleaner patch ..
>
> Hi Daniel,
>
> I like your idea on the plist simplification a lot. I will definitely
> roll that into my series.
>
> I am not too psyched about using the rt_mutex_setprio() API directly,
> however. It seems broken to be calling that directly from non rt_mutex
> code, IMHO. Perhaps the PI subsystem should be broken out from the
> rt_mutex code so it can be used generally? There are other areas where
> PI could potentially be used besides rt_mutex (this patch as a prime
> example), so this might make sense.
rt_mutex_setprio() is just a function. It was also designed specifically
for PI , so it seems fairly sane to use it in other PI type
situations ..
Daniel
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 15:10 ` Daniel Walker
@ 2007-08-01 15:19 ` Gregory Haskins
2007-08-01 15:55 ` Daniel Walker
2007-08-01 21:48 ` Esben Nielsen
1 sibling, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-01 15:19 UTC (permalink / raw)
To: Daniel Walker; +Cc: linux-rt-users, linux-kernel
On Wed, 2007-08-01 at 08:10 -0700, Daniel Walker wrote:
>
> rt_mutex_setprio() is just a function. It was also designed specifically
> for PI , so it seems fairly sane to use it in other PI type
> situations ..
>
Yes. It is designed for PI and I wasn't suggesting you shouldn't use
the logic itself. What I was suggesting is that dealing with an API
that has "rt_mutex" in it for something that has nothing to do with
rt_mutexes is, well...
All I was suggesting is that we break out the PI subsystem from rt_mutex
code so its an independent PI API and have the rt_mutex subsystem become
a user. That's a far cleaner way to do it, IMHO.
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 15:19 ` Gregory Haskins
@ 2007-08-01 15:55 ` Daniel Walker
2007-08-01 17:32 ` Gregory Haskins
0 siblings, 1 reply; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 15:55 UTC (permalink / raw)
To: Gregory Haskins; +Cc: linux-rt-users, linux-kernel
On Wed, 2007-08-01 at 11:19 -0400, Gregory Haskins wrote:
> On Wed, 2007-08-01 at 08:10 -0700, Daniel Walker wrote:
>
> >
> > rt_mutex_setprio() is just a function. It was also designed specifically
> > for PI , so it seems fairly sane to use it in other PI type
> > situations ..
> >
>
> Yes. It is designed for PI and I wasn't suggesting you shouldn't use
> the logic itself. What I was suggesting is that dealing with an API
> that has "rt_mutex" in it for something that has nothing to do with
> rt_mutexes is, well...
It's fine for now .. One step at a time..
> All I was suggesting is that we break out the PI subsystem from rt_mutex
> code so its an independent PI API and have the rt_mutex subsystem become
> a user. That's a far cleaner way to do it, IMHO.
The workqueues don't really need full blown transitive PI. So without
that you're back to the rt_mutex_setprio function .. Which could be
renamed ..
Here was my attempt years ago ,
http://lkml.org/lkml/2005/5/31/288
Looking back on it, I'm not sure what users I was planning to implement
along with it .. I'm sure I was thinking "There must be other blocking
primitives that could use this.." , but now I don't think there are ..
Everything pretty much runs through the rt mutex.. workqueues are just
"dancing" , or changing priorities up/down which is really only the
lowest level of what the rt-mutex does.
Daniel
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 3:52 ` Daniel Walker
2007-08-01 11:59 ` Gregory Haskins
@ 2007-08-01 17:01 ` Peter Zijlstra
2007-08-01 17:10 ` Daniel Walker
2007-08-01 18:12 ` Oleg Nesterov
1 sibling, 2 replies; 45+ messages in thread
From: Peter Zijlstra @ 2007-08-01 17:01 UTC (permalink / raw)
To: Daniel Walker, Ingo Molnar, Oleg Nesterov
Cc: Gregory Haskins, linux-rt-users, linux-kernel
(you guys forgot to CC Ingo, Oleg and me)
On Tue, 2007-07-31 at 20:52 -0700, Daniel Walker wrote:
> Here's a simpler version .. uses the plist data structure instead of the
> 100 queues, which makes for a cleaner patch ..
>
> Signed-off-by: Daniel Walker <dwalker@mvista.com>
looks good, assuming you actually ran the code:
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
small nit below though..
> ---
> include/linux/workqueue.h | 7 ++++---
> kernel/power/poweroff.c | 1 +
> kernel/sched.c | 4 ----
> kernel/workqueue.c | 40 +++++++++++++++++++++++++---------------
> 4 files changed, 30 insertions(+), 22 deletions(-)
>
> Index: linux-2.6.22/include/linux/workqueue.h
> ===================================================================
> --- linux-2.6.22.orig/include/linux/workqueue.h
> +++ linux-2.6.22/include/linux/workqueue.h
> @@ -8,6 +8,7 @@
> #include <linux/timer.h>
> #include <linux/linkage.h>
> #include <linux/bitops.h>
> +#include <linux/plist.h>
> #include <asm/atomic.h>
>
> struct workqueue_struct;
> @@ -26,7 +27,7 @@ struct work_struct {
> #define WORK_STRUCT_PENDING 0 /* T if work item pending execution */
> #define WORK_STRUCT_FLAG_MASK (3UL)
> #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
> - struct list_head entry;
> + struct plist_node entry;
> work_func_t func;
> };
>
> @@ -43,7 +44,7 @@ struct execute_work {
>
> #define __WORK_INITIALIZER(n, f) { \
> .data = WORK_DATA_INIT(), \
> - .entry = { &(n).entry, &(n).entry }, \
> + .entry = PLIST_NODE_INIT(n.entry, MAX_PRIO), \
> .func = (f), \
> }
>
> @@ -79,7 +80,7 @@ struct execute_work {
> #define INIT_WORK(_work, _func) \
> do { \
> (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
> - INIT_LIST_HEAD(&(_work)->entry); \
> + plist_node_init(&(_work)->entry, -1); \
> PREPARE_WORK((_work), (_func)); \
> } while (0)
>
> Index: linux-2.6.22/kernel/power/poweroff.c
> ===================================================================
> --- linux-2.6.22.orig/kernel/power/poweroff.c
> +++ linux-2.6.22/kernel/power/poweroff.c
> @@ -8,6 +8,7 @@
> #include <linux/sysrq.h>
> #include <linux/init.h>
> #include <linux/pm.h>
> +#include <linux/sched.h>
> #include <linux/workqueue.h>
> #include <linux/reboot.h>
>
> Index: linux-2.6.22/kernel/sched.c
> ===================================================================
> --- linux-2.6.22.orig/kernel/sched.c
> +++ linux-2.6.22/kernel/sched.c
> @@ -4379,8 +4379,6 @@ long __sched sleep_on_timeout(wait_queue
> }
> EXPORT_SYMBOL(sleep_on_timeout);
>
> -#ifdef CONFIG_RT_MUTEXES
> -
> /*
> * rt_mutex_setprio - set the current priority of a task
> * @p: task
> @@ -4457,8 +4455,6 @@ out_unlock:
> task_rq_unlock(rq, &flags);
> }
>
> -#endif
> -
> void set_user_nice(struct task_struct *p, long nice)
> {
> int old_prio, delta, on_rq;
> Index: linux-2.6.22/kernel/workqueue.c
> ===================================================================
> --- linux-2.6.22.orig/kernel/workqueue.c
> +++ linux-2.6.22/kernel/workqueue.c
> @@ -44,7 +44,7 @@ struct cpu_workqueue_struct {
>
> spinlock_t lock;
>
> - struct list_head worklist;
> + struct plist_head worklist;
> wait_queue_head_t more_work;
> struct work_struct *current_work;
>
> @@ -127,16 +127,19 @@ struct cpu_workqueue_struct *get_wq_data
> static void insert_work(struct cpu_workqueue_struct *cwq,
> struct work_struct *work, int tail)
> {
> + int prio = current->normal_prio;
> +
> set_wq_data(work, cwq);
> /*
> * Ensure that we get the right work->data if we see the
> * result of list_add() below, see try_to_grab_pending().
> */
> smp_wmb();
> - if (tail)
> - list_add_tail(&work->entry, &cwq->worklist);
> - else
> - list_add(&work->entry, &cwq->worklist);
> + plist_node_init(&work->entry, prio);
> + plist_add(&work->entry, &cwq->worklist);
perhaps we ought to handle tail, perhaps not, not sure what the
consequences are.
- Peter
> +
> + if (prio < cwq->thread->prio)
> + rt_mutex_setprio(cwq->thread, prio);
> wake_up(&cwq->more_work);
> }
>
> @@ -168,7 +171,7 @@ int fastcall queue_work(struct workqueue
> int ret = 0, cpu = raw_smp_processor_id();
>
> if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> - BUG_ON(!list_empty(&work->entry));
> + BUG_ON(!plist_node_empty(&work->entry));
> __queue_work(wq_per_cpu(wq, cpu), work);
> ret = 1;
> }
> @@ -222,7 +225,7 @@ int queue_delayed_work_on(int cpu, struc
>
> if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> BUG_ON(timer_pending(timer));
> - BUG_ON(!list_empty(&work->entry));
> + BUG_ON(!plist_node_empty(&work->entry));
>
> /* This stores cwq for the moment, for the timer_fn */
> set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
> @@ -264,13 +267,17 @@ static void run_workqueue(struct cpu_wor
> __FUNCTION__, cwq->run_depth);
> dump_stack();
> }
> - while (!list_empty(&cwq->worklist)) {
> - struct work_struct *work = list_entry(cwq->worklist.next,
> + while (!plist_head_empty(&cwq->worklist)) {
> + struct work_struct *work = plist_first_entry(&cwq->worklist,
> struct work_struct, entry);
> work_func_t f = work->func;
>
> + if (likely(cwq->thread->prio != work->entry.prio))
> + rt_mutex_setprio(cwq->thread, work->entry.prio);
> +
> cwq->current_work = work;
> - list_del_init(cwq->worklist.next);
> + plist_del(&work->entry, &cwq->worklist);
> + plist_node_init(&work->entry, MAX_PRIO);
> spin_unlock_irq(&cwq->lock);
>
> BUG_ON(get_wq_data(work) != cwq);
> @@ -283,6 +290,7 @@ static void run_workqueue(struct cpu_wor
> spin_lock_irq(&cwq->lock);
> cwq->current_work = NULL;
> }
> + rt_mutex_setprio(cwq->thread, current->normal_prio);
> cwq->run_depth--;
> spin_unlock_irq(&cwq->lock);
> }
> @@ -301,7 +309,7 @@ static int worker_thread(void *__cwq)
> prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
> if (!freezing(current) &&
> !kthread_should_stop() &&
> - list_empty(&cwq->worklist))
> + plist_head_empty(&cwq->worklist))
> schedule();
> finish_wait(&cwq->more_work, &wait);
>
> @@ -354,7 +362,8 @@ static int flush_cpu_workqueue(struct cp
>
> active = 0;
> spin_lock_irq(&cwq->lock);
> - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
> + if (!plist_head_empty(&cwq->worklist) ||
> + cwq->current_work != NULL) {
> insert_wq_barrier(cwq, &barr, 1);
> active = 1;
> }
> @@ -413,7 +422,7 @@ static int try_to_grab_pending(struct wo
> return ret;
>
> spin_lock_irq(&cwq->lock);
> - if (!list_empty(&work->entry)) {
> + if (!plist_node_empty(&work->entry)) {
> /*
> * This work is queued, but perhaps we locked the wrong cwq.
> * In that case we must see the new value after rmb(), see
> @@ -421,7 +430,8 @@ static int try_to_grab_pending(struct wo
> */
> smp_rmb();
> if (cwq == get_wq_data(work)) {
> - list_del_init(&work->entry);
> + plist_del(&work->entry, &cwq->worklist);
> + plist_node_init(&work->entry, MAX_PRIO);
> ret = 1;
> }
> }
> @@ -747,7 +757,7 @@ init_cpu_workqueue(struct workqueue_stru
>
> cwq->wq = wq;
> spin_lock_init(&cwq->lock);
> - INIT_LIST_HEAD(&cwq->worklist);
> + plist_head_init(&cwq->worklist, NULL);
> init_waitqueue_head(&cwq->more_work);
>
> return cwq;
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 17:01 ` Peter Zijlstra
@ 2007-08-01 17:10 ` Daniel Walker
2007-08-01 18:26 ` Oleg Nesterov
2007-08-01 18:12 ` Oleg Nesterov
1 sibling, 1 reply; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 17:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Oleg Nesterov, Gregory Haskins, linux-rt-users,
linux-kernel
On Wed, 2007-08-01 at 19:01 +0200, Peter Zijlstra wrote:
> (you guys forgot to CC Ingo, Oleg and me)
>
> On Tue, 2007-07-31 at 20:52 -0700, Daniel Walker wrote:
>
> > Here's a simpler version .. uses the plist data structure instead of the
> > 100 queues, which makes for a cleaner patch ..
> >
> > Signed-off-by: Daniel Walker <dwalker@mvista.com>
>
> looks good, assuming you actually ran the code:
No faith huh? I boot tested a few times, but it could be tested more.
> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>
> small nit below though..
>
> > ---
> > include/linux/workqueue.h | 7 ++++---
> > kernel/power/poweroff.c | 1 +
> > kernel/sched.c | 4 ----
> > kernel/workqueue.c | 40 +++++++++++++++++++++++++---------------
> > 4 files changed, 30 insertions(+), 22 deletions(-)
> >
> > Index: linux-2.6.22/include/linux/workqueue.h
> > ===================================================================
> > --- linux-2.6.22.orig/include/linux/workqueue.h
> > +++ linux-2.6.22/include/linux/workqueue.h
> > @@ -8,6 +8,7 @@
> > #include <linux/timer.h>
> > #include <linux/linkage.h>
> > #include <linux/bitops.h>
> > +#include <linux/plist.h>
> > #include <asm/atomic.h>
> >
> > struct workqueue_struct;
> > @@ -26,7 +27,7 @@ struct work_struct {
> > #define WORK_STRUCT_PENDING 0 /* T if work item pending execution */
> > #define WORK_STRUCT_FLAG_MASK (3UL)
> > #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
> > - struct list_head entry;
> > + struct plist_node entry;
> > work_func_t func;
> > };
> >
> > @@ -43,7 +44,7 @@ struct execute_work {
> >
> > #define __WORK_INITIALIZER(n, f) { \
> > .data = WORK_DATA_INIT(), \
> > - .entry = { &(n).entry, &(n).entry }, \
> > + .entry = PLIST_NODE_INIT(n.entry, MAX_PRIO), \
> > .func = (f), \
> > }
> >
> > @@ -79,7 +80,7 @@ struct execute_work {
> > #define INIT_WORK(_work, _func) \
> > do { \
> > (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
> > - INIT_LIST_HEAD(&(_work)->entry); \
> > + plist_node_init(&(_work)->entry, -1); \
> > PREPARE_WORK((_work), (_func)); \
> > } while (0)
> >
> > Index: linux-2.6.22/kernel/power/poweroff.c
> > ===================================================================
> > --- linux-2.6.22.orig/kernel/power/poweroff.c
> > +++ linux-2.6.22/kernel/power/poweroff.c
> > @@ -8,6 +8,7 @@
> > #include <linux/sysrq.h>
> > #include <linux/init.h>
> > #include <linux/pm.h>
> > +#include <linux/sched.h>
> > #include <linux/workqueue.h>
> > #include <linux/reboot.h>
> >
> > Index: linux-2.6.22/kernel/sched.c
> > ===================================================================
> > --- linux-2.6.22.orig/kernel/sched.c
> > +++ linux-2.6.22/kernel/sched.c
> > @@ -4379,8 +4379,6 @@ long __sched sleep_on_timeout(wait_queue
> > }
> > EXPORT_SYMBOL(sleep_on_timeout);
> >
> > -#ifdef CONFIG_RT_MUTEXES
> > -
> > /*
> > * rt_mutex_setprio - set the current priority of a task
> > * @p: task
> > @@ -4457,8 +4455,6 @@ out_unlock:
> > task_rq_unlock(rq, &flags);
> > }
> >
> > -#endif
> > -
> > void set_user_nice(struct task_struct *p, long nice)
> > {
> > int old_prio, delta, on_rq;
> > Index: linux-2.6.22/kernel/workqueue.c
> > ===================================================================
> > --- linux-2.6.22.orig/kernel/workqueue.c
> > +++ linux-2.6.22/kernel/workqueue.c
> > @@ -44,7 +44,7 @@ struct cpu_workqueue_struct {
> >
> > spinlock_t lock;
> >
> > - struct list_head worklist;
> > + struct plist_head worklist;
> > wait_queue_head_t more_work;
> > struct work_struct *current_work;
> >
> > @@ -127,16 +127,19 @@ struct cpu_workqueue_struct *get_wq_data
> > static void insert_work(struct cpu_workqueue_struct *cwq,
> > struct work_struct *work, int tail)
> > {
> > + int prio = current->normal_prio;
> > +
> > set_wq_data(work, cwq);
> > /*
> > * Ensure that we get the right work->data if we see the
> > * result of list_add() below, see try_to_grab_pending().
> > */
> > smp_wmb();
> > - if (tail)
> > - list_add_tail(&work->entry, &cwq->worklist);
> > - else
> > - list_add(&work->entry, &cwq->worklist);
> > + plist_node_init(&work->entry, prio);
> > + plist_add(&work->entry, &cwq->worklist);
>
> perhaps we ought to handle tail, perhaps not, not sure what the
> consequences are.
The plist doesn't distinguish since it's a sorted list. There is no way
to force something to the back of the list ..
It seems like the "tail" option was the old way to prioritize.. I think
it could be removed since we're always priority ordering now anyway..
Daniel
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 15:55 ` Daniel Walker
@ 2007-08-01 17:32 ` Gregory Haskins
0 siblings, 0 replies; 45+ messages in thread
From: Gregory Haskins @ 2007-08-01 17:32 UTC (permalink / raw)
To: Daniel Walker; +Cc: linux-rt-users, linux-kernel
On Wed, 2007-08-01 at 08:55 -0700, Daniel Walker wrote:
> On Wed, 2007-08-01 at 11:19 -0400, Gregory Haskins wrote:
> > On Wed, 2007-08-01 at 08:10 -0700, Daniel Walker wrote:
> >
> > >
> > > rt_mutex_setprio() is just a function. It was also designed specifically
> > > for PI , so it seems fairly sane to use it in other PI type
> > > situations ..
> > >
> >
> > Yes. It is designed for PI and I wasn't suggesting you shouldn't use
> > the logic itself. What I was suggesting is that dealing with an API
> > that has "rt_mutex" in it for something that has nothing to do with
> > rt_mutexes is, well...
>
> It's fine for now .. One step at a time..
I can live with that.
>
> > All I was suggesting is that we break out the PI subsystem from rt_mutex
> > code so its an independent PI API and have the rt_mutex subsystem become
> > a user. That's a far cleaner way to do it, IMHO.
>
> The workqueues don't really need full blown transitive PI. So without
> that you're back to the rt_mutex_setprio function .. Which could be
> renamed ..
Right ;) That's all I am really saying, but its not a big deal.
>
> Here was my attempt years ago ,
>
> http://lkml.org/lkml/2005/5/31/288
Cool! I haven't looked at the patches in depth yet, but perhaps that
concept should be revisited.
> Looking back on it, I'm not sure what users I was planning to implement
> along with it .. I'm sure I was thinking "There must be other blocking
> primitives that could use this.." , but now I don't think there are ..
> Everything pretty much runs through the rt mutex..
Well, not necessarily. I think your first instinct was right. Really
any form of waiting can potentially be subject to inversion, IMHO.
Waiting for a mutex is an obvious form, and also easy to solve since you
have clear delineation between the waiters (and their priorities) and
the resource holder.
However, inversion can happen any time you are waiting on a resource
regardless of the blocking mechanism...for instance, waiting on a
wait_queue. Consider the situation of three RT tasks A, B, C. A is the
highest, C the lowest. Assume C is a threaded IRQ handler for, say,
diskio, and C will signal a wait_queue when IO completes. In this
scenario, A can get inverted behind B if it does diskio. (In fact, I
think someone was reporting a similar problem on the list last week, and
the response was that we don't support that yet...I would like to change
that :).
The problem with something like a wait_queue, as previously mentioned,
is you don't have a clear way to delineate who should get PI boosted. I
am wondering if we can put our heads together and come up with a
comprehensive solution to this problem (even if it only works in certain
situations..e.g. threads that explicitly register with a waitqueue,
etc).
The solution here might be as simple as the previously mentioned
registration API, it might be something that can be done automatically
in the infrastructure, or it might be something along the lines of
converting more things over to use rt_mutex? I don't know what the
answer will be yet. All have their challenges. But I do think its a
problem worth solving.
I think the workqueue stuff we are doing is really the first step in
this direction...but really some parts of that work are just special
casing one form of the "wait_queue" scenario. I would like to
generalize this going forward.
> workqueues are just
> "dancing" , or changing priorities up/down which is really only the
> lowest level of what the rt-mutex does.
I totally agree that what the workqueues are currently doing in our
patches are just a subset of PI. But I absolutely think there are
potential users of at least some of PI like facilities other than just
mutexes.
I appreciate your review, discussion, and contribution on this matter,
Daniel. Thank you! I will take a look at your PI patches to see if I
think it aligns with my current thoughts.
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 17:01 ` Peter Zijlstra
2007-08-01 17:10 ` Daniel Walker
@ 2007-08-01 18:12 ` Oleg Nesterov
2007-08-01 18:29 ` Daniel Walker
1 sibling, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-01 18:12 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Daniel Walker, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On 08/01, Peter Zijlstra wrote:
>
> On Tue, 2007-07-31 at 20:52 -0700, Daniel Walker wrote:
>
> > static void insert_work(struct cpu_workqueue_struct *cwq,
> > struct work_struct *work, int tail)
> > {
> > + int prio = current->normal_prio;
> > +
> > set_wq_data(work, cwq);
> > /*
> > * Ensure that we get the right work->data if we see the
> > * result of list_add() below, see try_to_grab_pending().
> > */
> > smp_wmb();
> > - if (tail)
> > - list_add_tail(&work->entry, &cwq->worklist);
> > - else
> > - list_add(&work->entry, &cwq->worklist);
> > + plist_node_init(&work->entry, prio);
> > + plist_add(&work->entry, &cwq->worklist);
Sorry, this patch is completely wrong. It immediately breaks
flush_workqueue/cancel_workqueue functions.
And I personally think it is not very useful, even if it was correct.
You can create your own workqueue and change the priority of cwq->thread.
> > @@ -168,7 +171,7 @@ int fastcall queue_work(struct workqueue
> > int ret = 0, cpu = raw_smp_processor_id();
> >
> > if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> > - BUG_ON(!list_empty(&work->entry));
> > + BUG_ON(!plist_node_empty(&work->entry));
> > __queue_work(wq_per_cpu(wq, cpu), work);
> > ret = 1;
Side note, looks like you use some strange kernel. This raw_smp_processor_id()
above is wrong.
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 17:10 ` Daniel Walker
@ 2007-08-01 18:26 ` Oleg Nesterov
2007-08-01 18:39 ` Daniel Walker
0 siblings, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-01 18:26 UTC (permalink / raw)
To: Daniel Walker
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On 08/01, Daniel Walker wrote:
>
> On Wed, 2007-08-01 at 19:01 +0200, Peter Zijlstra wrote:
> > > static void insert_work(struct cpu_workqueue_struct *cwq,
> > > struct work_struct *work, int tail)
> > > {
> > > + int prio = current->normal_prio;
> > > +
> > > set_wq_data(work, cwq);
> > > /*
> > > * Ensure that we get the right work->data if we see the
> > > * result of list_add() below, see try_to_grab_pending().
> > > */
> > > smp_wmb();
> > > - if (tail)
> > > - list_add_tail(&work->entry, &cwq->worklist);
> > > - else
> > > - list_add(&work->entry, &cwq->worklist);
> > > + plist_node_init(&work->entry, prio);
> > > + plist_add(&work->entry, &cwq->worklist);
> >
> > perhaps we ought to handle tail, perhaps not, not sure what the
> > consequences are.
>
> The plist doesn't distinguish since it's a sorted list. There is no way
> to force something to the back of the list ..
>
> It seems like the "tail" option was the old way to prioritize.. I think
> It could be removed since we're always priority ordering now anyway..
No, the "tail" option has nothing to do with prioritize, we can't remove
it. Please look at the code.
Also, flush_workqueue() must not be delayed by the new incoming work_struct's,
the change like this breaks this assumption.
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 18:12 ` Oleg Nesterov
@ 2007-08-01 18:29 ` Daniel Walker
2007-08-01 20:18 ` Oleg Nesterov
0 siblings, 1 reply; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 18:29 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On Wed, 2007-08-01 at 22:12 +0400, Oleg Nesterov wrote:
> And I personally think it is not very useful, even if it was correct.
> You can create your own workqueue and change the priority of cwq->thread.
This change is more dynamic than just setting a single priority ..
There was some other work going on around this, so it's not totally
clear what the benefits are ..
> > > @@ -168,7 +171,7 @@ int fastcall queue_work(struct workqueue
> > > int ret = 0, cpu = raw_smp_processor_id();
> > >
> > > if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> > > - BUG_ON(!list_empty(&work->entry));
> > > + BUG_ON(!plist_node_empty(&work->entry));
> > > __queue_work(wq_per_cpu(wq, cpu), work);
> > > ret = 1;
>
> Side note, looks like you use some strange kernel. This raw_smp_processor_id()
> above is wrong.
As the topic suggests , it's a Real Time kernel .. I can give you a link
where to download it if you want.
Daniel
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 18:26 ` Oleg Nesterov
@ 2007-08-01 18:39 ` Daniel Walker
2007-08-01 20:25 ` Oleg Nesterov
0 siblings, 1 reply; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 18:39 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On Wed, 2007-08-01 at 22:26 +0400, Oleg Nesterov wrote:
> No, the "tail" option has nothing to do with prioritize, we can't remove
> it. Please look at the code.
So you insert a work struct that executes last which wakes the flushing
thread?
> Also, flush_workqueue() must not be delayed by the new incoming work_struct's,
> the change like this breaks this assumption.
True ..
Daniel
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 18:29 ` Daniel Walker
@ 2007-08-01 20:18 ` Oleg Nesterov
2007-08-01 20:32 ` Oleg Nesterov
2007-08-01 20:34 ` Daniel Walker
0 siblings, 2 replies; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-01 20:18 UTC (permalink / raw)
To: Daniel Walker
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On 08/01, Daniel Walker wrote:
>
> On Wed, 2007-08-01 at 22:12 +0400, Oleg Nesterov wrote:
>
> > And I personally think it is not very useful, even if it was correct.
> > You can create your own workqueue and change the priority of cwq->thread.
>
> This change is more dynamic than just setting a single priority ..
> There was some other work going on around this, so it's not totally
> clear what the benefits are ..
Yes, I see. But still I think the whole idea is broken, not just the
implementation.
What about delayed_work? insert_work() will use ->normal_prio of
the random interrupted process, while queue_work() uses current.
What if a niced thread queues the work? This work may have no chance
to run if workqueue is actively used.
And I don't understand why rt_mutex_setprio() is called just before
calling work->func(). This means that a high-priority work could
be delayed by the low-priority ->current_work.
> > > > @@ -168,7 +171,7 @@ int fastcall queue_work(struct workqueue
> > > > int ret = 0, cpu = raw_smp_processor_id();
> > > >
> > > > if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> > > > - BUG_ON(!list_empty(&work->entry));
> > > > + BUG_ON(!plist_node_empty(&work->entry));
> > > > __queue_work(wq_per_cpu(wq, cpu), work);
> > > > ret = 1;
> >
> > Side note, looks like you use some strange kernel. This raw_smp_processor_id()
> > above is wrong.
>
> As the topic suggests , it's a Real Time kernel .. I can give you a link
> where to download it if you want.
Ok, thanks, I'll take a look. Still, we can't use raw_smp_processor_id()
unless we disabled cpu-hotplug.
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 18:39 ` Daniel Walker
@ 2007-08-01 20:25 ` Oleg Nesterov
0 siblings, 0 replies; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-01 20:25 UTC (permalink / raw)
To: Daniel Walker
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On 08/01, Daniel Walker wrote:
>
> On Wed, 2007-08-01 at 22:26 +0400, Oleg Nesterov wrote:
>
> > No, the "tail" option has nothing to do with prioritize, we can't remove
> > it. Please look at the code.
>
> So you insert a work struct that executes last which wakes the flushing
> thread?
No, tail == 1 means a "normal" work, tail == 0 means "just after the
->current_work". Yes, the latter is to wake up the flusher at the "right time".
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 20:18 ` Oleg Nesterov
@ 2007-08-01 20:32 ` Oleg Nesterov
2007-08-01 20:43 ` Daniel Walker
2007-08-01 20:34 ` Daniel Walker
1 sibling, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-01 20:32 UTC (permalink / raw)
To: Daniel Walker
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On 08/02, Oleg Nesterov wrote:
>
> And I don't understand why rt_mutex_setprio() is called just before
> calling work->func(). This means that a high-priority work could
> be delayed by the low-priority ->current_work.
Aha, I missed the rt_mutex_setprio() in insert_work().
This is not good either. Suppose we have
void work_handler(struct work_struct *self)
{
if (!try_lock()) {
// try again later...
queue_work(wq, self);
return;
}
do_the_work();
}
What if that work was queued by the low-priority thread, and
then a high-priority thread inserts a new work when work_handler()
is running?
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 20:18 ` Oleg Nesterov
2007-08-01 20:32 ` Oleg Nesterov
@ 2007-08-01 20:34 ` Daniel Walker
2007-08-01 20:50 ` Oleg Nesterov
1 sibling, 1 reply; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 20:34 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On Thu, 2007-08-02 at 00:18 +0400, Oleg Nesterov wrote:
> On 08/01, Daniel Walker wrote:
> >
> > On Wed, 2007-08-01 at 22:12 +0400, Oleg Nesterov wrote:
> >
> > > And I personally think it is not very useful, even if it was correct.
> > > You can create your own workqueue and change the priority of cwq->thread.
> >
> > This change is more dynamic than just setting a single priority ..
> > There was some other work going on around this, so it's not totally
> > clear what the benefits are ..
>
> Yes, I see. But still I think the whole idea is broken, not just the
> implementation.
It's translating priorities through the work queues, which doesn't seem
to happen with the current implementation. A high priority, say
SCHED_FIFO priority 99, task may have to wait for a nice -5 work queue
to finish.. You can set the priority of your work queue, but you never
know what the priority of the tasks are that use the work queue ..
> What about delayed_work? insert_work() will use ->normal_prio of
> the random interrupted process, while queue_work() uses current.
Actually it would be the priority of the timer softirq .. I think what
is desired here would be saving the priority of the task calling
delayed_work then using that..
> What if a niced thread queues the work? This work may have no chance
> to run if workqueue is actively used.
Yes, if it is actively used by higher priority threads .. We could
restrict it to SCHED_FIFO/SCHED_RR tho ..
> > As the topic suggests , it's a Real Time kernel .. I can give you a link
> > where to download it if you want.
>
> Ok, thanks, I'll take a look. Still, we can't use raw_smp_processor_id()
> unless we disabled cpu-hotplug.
I didn't touch this particular piece of code, but I'm assuming the
raw_smp_processor_id() is safe given the rest of the code in -rt ..
Daniel
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 20:32 ` Oleg Nesterov
@ 2007-08-01 20:43 ` Daniel Walker
0 siblings, 0 replies; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 20:43 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On Thu, 2007-08-02 at 00:32 +0400, Oleg Nesterov wrote:
> On 08/02, Oleg Nesterov wrote:
> >
> > And I don't understand why rt_mutex_setprio() is called just before
> > calling work->func(). This means that a high-priority work could
> > be delayed by the low-priority ->current_work.
>
> Aha, I missed the rt_mutex_setprio() in insert_work().
>
> This is not good either. Suppose we have
>
> void work_handler(struct work_struct *self)
> {
> if (!try_lock()) {
> // try again later...
> queue_work(wq, self);
> return;
> }
>
> do_the_work();
> }
>
> What if that work was queued by the low-priority thread, and
> then a high-priority thread inserts a new work when work_handler()
> is running?
You mean the above queue_work(wq, self) would get an arbitrarily higher
priority vs. what it would normally?
Yeah, I suppose we would want to follow the priority inside the workqueue
thread only ..
Daniel
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 20:34 ` Daniel Walker
@ 2007-08-01 20:50 ` Oleg Nesterov
2007-08-01 21:02 ` Daniel Walker
2007-08-01 21:13 ` Gregory Haskins
0 siblings, 2 replies; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-01 20:50 UTC (permalink / raw)
To: Daniel Walker
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On 08/01, Daniel Walker wrote:
>
> On Thu, 2007-08-02 at 00:18 +0400, Oleg Nesterov wrote:
> > On 08/01, Daniel Walker wrote:
> > >
> > > On Wed, 2007-08-01 at 22:12 +0400, Oleg Nesterov wrote:
> > >
> > > > And I personally think it is not very useful, even if it was correct.
> > > > You can create your own workqueue and change the priority of cwq->thread.
> > >
> > > This change is more dynamic than just setting a single priority ..
> > > There was some other work going on around this, so it's not totally
> > > clear what the benefits are ..
> >
> > Yes, I see. But still I think the whole idea is broken, not just the
> > implementation.
>
> It's translating priorities through the work queues, which doesn't seem
> to happen with the current implementation. A high priority, say
> SCHED_FIFO priority 99, task may have to wait for a nice -5 work queue
> to finish..
Why should that task wait?
> > What about delayed_work? insert_work() will use ->normal_prio of
> > the random interrupted process, while queue_work() uses current.
>
> Actually it would be the priority of the timer softirq .. I think what
> is desired here would be saving the priority of the task calling
> delayed_work then using that..
But mainline calls __do_softirq() from interrupt (irq_exit).
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 20:50 ` Oleg Nesterov
@ 2007-08-01 21:02 ` Daniel Walker
2007-08-01 21:13 ` Gregory Haskins
1 sibling, 0 replies; 45+ messages in thread
From: Daniel Walker @ 2007-08-01 21:02 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Peter Zijlstra, Ingo Molnar, Gregory Haskins, linux-rt-users,
linux-kernel
On Thu, 2007-08-02 at 00:50 +0400, Oleg Nesterov wrote:
> On 08/01, Daniel Walker wrote:
> >
> > On Thu, 2007-08-02 at 00:18 +0400, Oleg Nesterov wrote:
> > > On 08/01, Daniel Walker wrote:
> > > >
> > > > On Wed, 2007-08-01 at 22:12 +0400, Oleg Nesterov wrote:
> > > >
> > > > > And I personally think it is not very useful, even if it was correct.
> > > > > You can create your own workqueue and change the priority of cwq->thread.
> > > >
> > > > This change is more dynamic than just setting a single priority ..
> > > > There was some other work going on around this, so it's not totally
> > > > clear what the benefits are ..
> > >
> > > Yes, I see. But still I think the whole idea is broken, not just the
> > > implementation.
> >
> > It's translating priorities through the work queues, which doesn't seem
> > to happen with the current implementation. A high priority, say
> > SCHED_FIFO priority 99, task may have to wait for a nice -5 work queue
> > to finish..
>
> Why should that task wait?
If the high priority tasks is waiting for the work to complete..
Assuming the scenario happens which you're more likely to know than me..
I suppose in the flush_workqueue situation a thread could be waiting on
the lower priority work queue ..
> > > What about delayed_work? insert_work() will use ->normal_prio of
> > > the random interrupted process, while queue_work() uses current.
> >
> > Actually it would be the priority of the timer softirq .. I think what
> > is desired here would be saving the priority of the task calling
> > delayed_work then using that..
>
> But mainline calls __do_softirq() from interrupt (irq_exit).
Yeah, I suppose you're right in that case .. In -rt softirq's are all in
threads so it would be the timer softirq thread..
Daniel
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 20:50 ` Oleg Nesterov
2007-08-01 21:02 ` Daniel Walker
@ 2007-08-01 21:13 ` Gregory Haskins
2007-08-01 21:34 ` Oleg Nesterov
1 sibling, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-01 21:13 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On Thu, 2007-08-02 at 00:50 +0400, Oleg Nesterov wrote:
> On 08/01, Daniel Walker wrote:
> >
> > On Thu, 2007-08-02 at 00:18 +0400, Oleg Nesterov wrote:
> > > On 08/01, Daniel Walker wrote:
> > > >
> > > > On Wed, 2007-08-01 at 22:12 +0400, Oleg Nesterov wrote:
> > > >
> > > > > And I personally think it is not very useful, even if it was correct.
> > > > > You can create your own workqueue and change the priority of cwq->thread.
> > > >
> > > > This change is more dynamic than just setting a single priority ..
> > > > There was some other work going on around this, so it's not totally
> > > > clear what the benefits are ..
> > >
> > > Yes, I see. But still I think the whole idea is broken, not just the
> > > implementation.
> >
> > It's translating priorities through the work queues, which doesn't seem
> > to happen with the current implementation. A high priority, say
> > SCHED_FIFO priority 99, task may have to wait for a nice -5 work queue
> > to finish..
>
> Why should that task wait?
I assume "that task" = the RT99 task? If so, that is precisely the
question. It shouldn't wait. ;) With mainline, it is simply queued
with every other request. There could be an RT40, and a SCHED_NORMAL in
front of it in the queue that will get processed first. In addition,
the system could suffer from a priority inversion if some unrelated but
lower priority task (say RT98) was blocking the workqueue thread from
making forward progress on the nice -5 job.
To clarify: when a design utilizes a singlethread per workqueue (such as
in both mainline and this patch), the RT99 will always have to wait
behind any already dispatched jobs. That is a given. However, with
Daniels patch, two things happen in addition to normal processing.
1) The RT99 task would move ahead in the queue of anything else that was
also scheduled on the workqueue that is < RT99.
2) The priority of the workqueue task would be temporarily elevated to
RT99 so that the currently dispatched task will complete at the same
priority as the waiter. This prevents inversion.
HTH
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 21:13 ` Gregory Haskins
@ 2007-08-01 21:34 ` Oleg Nesterov
2007-08-01 21:59 ` Gregory Haskins
0 siblings, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-01 21:34 UTC (permalink / raw)
To: Gregory Haskins
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On 08/01, Gregory Haskins wrote:
>
> On Thu, 2007-08-02 at 00:50 +0400, Oleg Nesterov wrote:
> > On 08/01, Daniel Walker wrote:
> > >
> > > It's translating priorities through the work queues, which doesn't seem
> > > to happen with the current implementation. A high priority, say
> > > SCHED_FIFO priority 99, task may have to wait for a nice -5 work queue
> > > to finish..
> >
> > Why should that task wait?
>
> I assume "that task" = the RT99 task? If so, that is precisely the
> question. It shouldn't wait. ;) With mainline, it is simply queued
> with every other request. There could be an RT40, and a SCHED_NORMAL in
> front of it in the queue that will get processed first. In addition,
> the system could suffer from a priority inversion if some unrelated but
> lower priority task (say RT98) was blocking the workqueue thread from
> making forward progress on the nice -5 job.
>
> To clarify: when a design utilizes a singlethread per workqueue (such as
> in both mainline and this patch), the RT99 will always have to wait
> behind any already dispatched jobs.
It is not that "RT99 will always have to wait". But yes, the work_struct
queued by RT99 has to wait.
> That is a given. However, with
> Daniels patch, two things happen in addition to normal processing.
Yes, I see what the patch does,
> 1) The RT99 task would move ahead in the queue of anything else that was
> also scheduled on the workqueue that is < RT99.
this itself is wrong, breaks flush_workqueue() semantics
> 2) The priority of the workqueue task would be temporarily elevated to
> RT99 so that the currently dispatched task will complete at the same
> priority as the waiter.
_Which_ waiter? I can't understand at all why work_struct should "inherit"
the priority of the task which queued it. This looks just wrong to me,
even if could implement this safely.
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 15:10 ` Daniel Walker
2007-08-01 15:19 ` Gregory Haskins
@ 2007-08-01 21:48 ` Esben Nielsen
1 sibling, 0 replies; 45+ messages in thread
From: Esben Nielsen @ 2007-08-01 21:48 UTC (permalink / raw)
To: Daniel Walker; +Cc: Gregory Haskins, linux-rt-users, linux-kernel
On Wed, 1 Aug 2007, Daniel Walker wrote:
> On Wed, 2007-08-01 at 07:59 -0400, Gregory Haskins wrote:
>> On Tue, 2007-07-31 at 20:52 -0700, Daniel Walker wrote:
>>
>>>
>>> Here's a simpler version .. uses the plist data structure instead of the
>>> 100 queues, which makes for a cleaner patch ..
>>
>> Hi Daniel,
>>
>> I like your idea on the plist simplification a lot. I will definitely
>> roll that into my series.
>>
>> I am not too psyched about using the rt_mutex_setprio() API directly,
>> however. It seems broken to be calling that directly from non rt_mutex
>> code, IMHO. Perhaps the PI subsystem should be broken out from the
>> rt_mutex code so it can be used generally? There are other areas where
>> PI could potentially be used besides rt_mutex (this patch as a prime
>> example), so this might make sense.
>
> rt_mutex_setprio() is just a function. It was also designed specifically
> for PI , so it seems fairly sane to use it in other PI type
> situations ..
>
> Daniel
>
There seems to be a general need for boosting threads temporarily in a few
places. HR-timers also have it, last time I checked. And preemptive RCU as
well for boosting RCU readers. They all seem to deal with the same issues
of correctly dealing with setting the priority and PI boosting from
mutexes.
When boosting of RCU readers was discussed I came to the conclusion that
the boosting property should be taken out of the of the rt_mutex module
and instead be made into a sub-property of the scheduler:
task->pi_waiters should be replaced with task->prio_boosters being a
pi_list of struct prio_booster representing something, which temporarily
wants to boost a task.
A rt_mutex_waiter should of course contain a prio_booster which is added
to owner->prio_boosters. A work queue element should contain a prio_booster.
When boosting a RCU reader a prio_booster is added to the reader's
prio_boosters.
Such a system will correctly deal with boosters going away in arbitrary
order. Something which is not straightforward when each user of boosting is
trying to do it on their own.
Esben
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 21:34 ` Oleg Nesterov
@ 2007-08-01 21:59 ` Gregory Haskins
2007-08-01 22:22 ` Oleg Nesterov
0 siblings, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-01 21:59 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On Thu, 2007-08-02 at 01:34 +0400, Oleg Nesterov wrote:
> On 08/01, Gregory Haskins wrote:
> >
> > On Thu, 2007-08-02 at 00:50 +0400, Oleg Nesterov wrote:
> > > On 08/01, Daniel Walker wrote:
> > > >
> > > > It's translating priorities through the work queues, which doesn't seem
> > > > to happen with the current implementation. A high priority, say
> > > > SCHED_FIFO priority 99, task may have to wait for a nice -5 work queue
> > > > to finish..
> > >
> > > Why should that task wait?
> >
> > I assume "that task" = the RT99 task? If so, that is precisely the
> > question. It shouldn't wait. ;) With mainline, it is simply queued
> > with every other request. There could be an RT40, and a SCHED_NORMAL in
> > front of it in the queue that will get processed first. In addition,
> > the system could suffer from a priority inversion if some unrelated but
> > lower priority task (say RT98) was blocking the workqueue thread from
> > making forward progress on the nice -5 job.
> >
> > To clarify: when a design utilizes a singlethread per workqueue (such as
> > in both mainline and this patch), the RT99 will always have to wait
> > behind any already dispatched jobs.
>
> It is not that "RT99 will always have to wait". But yes, the work_struct
> queued by RT99 has to wait.
Agreed. We are talking only within the scope of workqueues here.
> > 1) The RT99 task would move ahead in the queue of anything else that was
> > also scheduled on the workqueue that is < RT99.
>
> this itself is wrong, breaks flush_workqueue() semantics
Perhaps in Daniels patch. I am not familiar enough with plist to be
able to tell you if it has a problem there or not. However, I think
this works in the original patch I submitted, which had a different
queuing mechanism. Daniels patch was derived from that so he may have
inadvertently picked up something from mine which was no longer true in
the new design. I'll defer to Daniel for clarification there.
However, IIUC the point of flush_workqueue() is a barrier only relative
to your own submissions, correct? E.g. to make sure *your* requests
are finished, not necessarily the entire queue.
If thats true, the flush would be injected at the same priority(*) as
your other pending tasks at the tail of that priority level. Then you
would still block until all of your tasks complete.
If flush_workqueue is supposed to behave differently than I describe,
then I agree its broken even in my original patch.
(*) We would probably have to protect against somebody else PI-boosting
*us* ;)
>
> > 2) The priority of the workqueue task would be temporarily elevated to
> > RT99 so that the currently dispatched task will complete at the same
> > priority as the waiter.
>
> _Which_ waiter?
The RT99 task that submitted the request.
> I can't understand at all why work_struct should "inherit"
> the priority of the task which queued it.
Think about it: If you are an RT99 task and you have work to do,
shouldn't *all* of the work you need be done at RT99 if possible. Why
should something like a measly RT98 task block you from completing your
work. ;) The fact that you need to do some work via a workqueue (perhaps
because you need specific cpu routing) is inconsequential, IMHO.
Regards,
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 21:59 ` Gregory Haskins
@ 2007-08-01 22:22 ` Oleg Nesterov
2007-08-01 23:53 ` Gregory Haskins
0 siblings, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-01 22:22 UTC (permalink / raw)
To: Gregory Haskins
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On 08/01, Gregory Haskins wrote:
>
> On Thu, 2007-08-02 at 01:34 +0400, Oleg Nesterov wrote:
> > On 08/01, Gregory Haskins wrote:
> > >
> > > On Thu, 2007-08-02 at 00:50 +0400, Oleg Nesterov wrote:
> > > > On 08/01, Daniel Walker wrote:
> > > > >
> > > > > It's translating priorities through the work queues, which doesn't seem
> > > > > to happen with the current implementation. A high priority, say
> > > > > SCHED_FIFO priority 99, task may have to wait for a nice -5 work queue
> > > > > to finish..
> > > >
> > > > Why should that task wait?
> > >
> > > I assume "that task" = the RT99 task? If so, that is precisely the
> > > question. It shouldn't wait. ;) With mainline, it is simply queued
> > > with every other request. There could be an RT40, and a SCHED_NORMAL in
> > > front of it in the queue that will get processed first. In addition,
> > > the system could suffer from a priority inversion if some unrelated but
> > > lower priority task (say RT98) was blocking the workqueue thread from
> > > making forward progress on the nice -5 job.
> > >
> > > To clarify: when a design utilizes a singlethread per workqueue (such as
> > > in both mainline and this patch), the RT99 will always have to wait
> > > behind any already dispatched jobs.
> >
> > It is not that "RT99 will always have to wait". But yes, the work_struct
> > queued by RT99 has to wait.
>
> Agreed. We are talking only within the scope of workqueues here.
>
>
> > > 1) The RT99 task would move ahead in the queue of anything else that was
> > > also scheduled on the workqueue that is < RT99.
> >
> > this itself is wrong, breaks flush_workqueue() semantics
>
> Perhaps in Daniels patch.
No.
> However, IIUC the point of flush_workqueue() is a barrier only relative
> to your own submissions, correct?. E.g. to make sure *your* requests
> are finished, not necessarily the entire queue.
No,
> If flush_workqueue is supposed to behave differently than I describe,
> then I agree its broken even in my original patch.
The comment near flush_workqueue() says:
* We sleep until all works which were queued on entry have been handled,
* but we are not livelocked by new incoming ones.
> > > 2) The priority of the workqueue task would be temporarily elevated to
> > > RT99 so that the currently dispatched task will complete at the same
> > > priority as the waiter.
> >
> > _Which_ waiter?
>
> The RT99 task that submitted the request.
Now, again, why do you think this task should wait?
> > I can't understand at all why work_struct should "inherit"
> > the priority of the task which queued it.
>
> Think about it: If you are an RT99 task and you have work to do,
> shouldn't *all* of the work you need be done at RT99 if possible.
No, I don't think so. Quite opposite, I think sometimes it makes
sense to "offload" some low-priority work from RT99 to workqueue
thread exactly because it has no rt policy.
And what about queue_work() from irq? Why should that work take a
"random" priority?
> Why
> should something like a measly RT98 task block you from completing your
> work. ;) The fact that you need to do some work via a workqueue (perhaps
> because you need specific cpu routing) is inconsequential, IMHO.
In that case I think it is better to create a special workqueue
and raise its priority.
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 22:22 ` Oleg Nesterov
@ 2007-08-01 23:53 ` Gregory Haskins
2007-08-02 19:50 ` Oleg Nesterov
0 siblings, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-01 23:53 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On Thu, 2007-08-02 at 02:22 +0400, Oleg Nesterov wrote:
>
> No.
>
> > However, IIUC the point of flush_workqueue() is a barrier only relative
> > to your own submissions, correct?. E.g. to make sure *your* requests
> > are finished, not necessarily the entire queue.
>
> No,
You sure are a confident one ;)
>
> > If flush_workqueue is supposed to behave differently than I describe,
> > then I agree its broken even in my original patch.
>
> The comment near flush_workqueue() says:
>
> * We sleep until all works which were queued on entry have been handled,
> * but we are not livelocked by new incoming ones.
Dude, of *course* says that. It would be completely illogical for it to
say otherwise with the linear priority queue that mainline has. Since
we are changing things here you have to read between the lines and ask
yourself "what is the intention of this barrier logic?". Generally
speaking, the point of a barrier is to flush relevant work from your own
context, sometimes at the granularity of flushing everyone elses work
inadvertently if the flush mechanism isn't fine grained enough. But
that is a side-effect, not a requirement.
So now my turn:
No. :P
But in all seriousness, let me ask you this: Why do you need a barrier
that flushes *all* work instead of just *your* work. Do you really
care? If you do, could we adapt the API to support the notion of
"flush() and "flush_all". Could we stay with one API call and make it
flush all work again and you are happy?
To be honest, I think you have made me realize there is actually a
legitimate problem w.r.t. what I mentioned earlier (unguarded local
PI-boost messing things up), and its my bad. I originally wrote this
patch for a different RT subsystem which used an entirely different
barrier mechanism and therefore didn't have this problem(*). I think it
just didn't translate in the port to workqueues directly, and now we
need to address it.
Even though I disagree with you on the semantics of flush_workqueue, the
fact that we don't protect against a local PI-boost means the current
mechanism isn't safe (and thank you for banging that home). However,
you seem to have objections to the overall change in general aside from
this bug, and there we can agree to disagree.
(*)Basically, the signaling mechanisms in the original design were
tightly coupled to the work-units and therefore there was no
relationship between disparate items in the queue such as there is in
the workqueue subsystem.
>
> > > > 2) The priority of the workqueue task would be temporarily elevated to
> > > > RT99 so that the currently dispatched task will complete at the same
> > > > priority as the waiter.
> > >
> > > _Which_ waiter?
> >
> > The RT99 task that submitted the request.
>
> Now, again, why do you think this task should wait?
I don't think it *should* wait. It *will* wait and we don't want that.
And without PI-boost, it could wait indefinitely. I think the detail
you are missing is that the RT kernel introduces some new workqueue APIs
that allow for "RPC" like behavior. E.g. they are like
"smp_call_function()", but instead of using an IPI, it uses workqueues
to dispatch work to other CPUs. I could go on and on about why this is,
but hopefully you just immediately understand that this is a *good*
thing to do, especially in RT.
So now, we are enhancing that RPC mechanism to be RT aware with this
proposed changeset so it A) priority-queues, and B) prevents inversion.
I hope that helps to clear it up.
Originally I had proposed this RPC mechanism to be a separate subsystem
from workqueues. But it involved new kthreads, rt-queuing, and PI. It
was sensibly suggested in review that the kthread work was redundant
with workqueues, but that the rt/pi stuff had general merit. So it was
suggested that we port the rt/pi concepts to workqueues and base the
work on that. So here we are.... ;)
>
> > > I can't understand at all why work_struct should "inherit"
> > > the priority of the task which queued it.
> >
> > Think about it: If you are an RT99 task and you have work to do,
> > shouldn't *all* of the work you need be done at RT99 if possible.
>
> No, I don't think so. Quite opposite,
Ouch, there's that confidence again...
> I think sometimes it makes
> sense to "offload" some low-priority work from RT99 to workqueue
> thread exactly because it has no rt policy.
The operative word in your statement being "sometimes"? I could flip
your argument right around on you and say "sometimes we want to use
workqueues to remotely represent our high priority butts somewhere
else" ;) Perhaps that is the key to compromise. Perhaps the API needs
to be adjusted to deal with the fact that sometimes you want to inherit
priority, sometimes you don't.
>
> And what about queue_work() from irq? Why should that work take a
> "random" priority?
Yes, you raise a legitimate point here. In RT, the irq is a kthread
with an RT priority so it would inherit that as the patch stands right
now. However, whether this is correct/desired and how this translates
to operation under non-RT is unclear. Perhaps the next pass with
proposed API changes will solve this issue.
>
> > Why
> > should something like a measly RT98 task block you from completing your
> > work. ;) The fact that you need to do some work via a workqueue (perhaps
> > because you need specific cpu routing) is inconsequential, IMHO.
>
> In that case I think it is better to create a special workqueue
> and raise its priority.
I disagree with you here. Why should we have a bunch of new workqueues
kicking around for all the different priorities and scenarios. Like I
said, we are trying to build a RT-aware general RPC mechanism here.
Those "event/%d"s are just laying around idle. Lets use em! ;)
Again, to be honest I somewhat see your point. I originally used my own
kthreads (one per CPU) precisely because I didn't *want* to mess with
the workqueue infrastructure. So you got one camp saying "don't create
your own homegrown workqueues!", and you got another saying "don't
change my workqueue infrastructure"! I dont know what is the best
answer, but I can tell you this RT/PI stuff is a legitimate problem to
solve, at least from the RT perspective. So lets put our heads together
and figure it out.
Regards
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-01 23:53 ` Gregory Haskins
@ 2007-08-02 19:50 ` Oleg Nesterov
2007-08-06 11:35 ` Gregory Haskins
` (2 more replies)
0 siblings, 3 replies; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-02 19:50 UTC (permalink / raw)
To: Gregory Haskins
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On 08/01, Gregory Haskins wrote:
>
> On Thu, 2007-08-02 at 02:22 +0400, Oleg Nesterov wrote:
>
> > No,
>
> You sure are a confident one ;)
Yeah, this is a rare case when I am very sure I am right ;)
I strongly believe you guys take a _completely_ wrong approach.
queue_work() should _not_ take the priority of the caller into
account, this is bogus.
Once again. Why do you think that queue_work() from RT99 should
be considered as more important? This is just not true _unless_
this task has to _wait_ for this work.
So, perhaps, perhaps, it makes sense to modify insert_wq_barrier()
so that it temporary raises the priority of cwq->thread to match
that of the caller of flush_workqueue() or cancel_work_sync().
However, I think even this is not needed. Please show us the real
life example why the current implementation sucks.
> > The comment near flush_workqueue() says:
> >
> > * We sleep until all works which were queued on entry have been handled,
> > * but we are not livelocked by new incoming ones.
>
> Dude, of *course* says that. It would be completely illogical for it to
> say otherwise with the linear priority queue that mainline has. Since
> we are changing things here you have to read between the lines and ask
> yourself "what is the intention of this barrier logic?". Generally
> speaking, the point of a barrier is to flush relevant work from your own
> context, sometimes at the granularity of flushing everyone elses work
> inadvertently if the flush mechanism isn't fine grained enough. But
> that is a side-effect, not a requirement.
>
> So now my turn:
>
> No. :P
OK. Suppose that we re-order work_struct's according to the caller's
priority.
Now, a niced thread doing flush_workqueue() can livelock if workqueue
is actively used.
Actually a niced (so that its priority is lower than cwq->thread's one)
can deadlock. Suppose it does
lock(LOCK);
flush_workueue(wq);
and we have a pending work_struct which does:
void work_handler(struct work_struct *self)
{
if (!try_lock(LOCK)) {
// try again later...
queue_work(wq, self);
return;
}
do_something();
}
Deadlock. Because now the caller of queue_work() is cwq->thread itself,
so we insert this work ahead of the barrier which should complete
flush_workueue(wq) above.
There are other scenarios for more subtle deadlocks. Say,
the pending task doesn't touch LOCK itself, but schedules another
work which takes the LOCK.
[... snip a good portion of text which I wasn't able to translate
and understand ... ;) ]
> > Now, again, why do you think this task should wait?
>
> I don't think it *should* wait. It *will* wait and we don't want that.
> And without PI-boost, it could wait indefinitely. I think the detail
> you are missing is that the RT kernel introduces some new workqueue APIs
> that allow for "RPC" like behavior. E.g. they are like
> "smp_call_function()", but instead of using an IPI, it uses workqueues
> to dispatch work to other CPUs.
Aha. And this is exactly what I meant above. And this means that flush
should govern the priority boosting, not queueing!
But again, I think we can just create a special workqueue for that and
make it RT99. No re-ordering, no playing games with re-niceing.
Because that workqueue should be "idle" most of the time (no pending
works), otherwise we are doing something wrong. And I don't think this
wq can have many users and work->func() shouldn't be heavy, so perhaps
it is OK if it always runs with the high priority. The latter may be
wrong though.
> However, you seem to have objections to the overall change in general
Well, yes... You propose the complex changes to solve the problem which
doesn't look very common, this makes me unhappy :)
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-02 19:50 ` Oleg Nesterov
@ 2007-08-06 11:35 ` Gregory Haskins
2007-08-06 14:26 ` Oleg Nesterov
2007-08-06 11:49 ` Ingo Molnar
2007-08-06 19:33 ` Oleg Nesterov
2 siblings, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-06 11:35 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On Thu, 2007-08-02 at 23:50 +0400, Oleg Nesterov wrote:
> I strongly believe you guys take a _completely_ wrong approach.
> queue_work() should _not_ take the priority of the caller into
> account, this is bogus.
I think you have argued very effectively that there are situations in
which the priority should not be taken into account. I also think a
case could easily be made to say that this RT business may be too
complex to justifiably pollute the workqueue infrastructure.
However, I assure you this: The concept of a priority ordered queue
coupled with PI-boost is *very* desirable (IMHO) in the RT kernel for
use as an RPC mechanism. Couple that with the fact that there are many
many parallels to what we need to do for an RT/RPC as there is for
workqueues. To say outright that priority *can* never or *should* never
be taken into account is equally as bogus.
Ideally we can modify workqueues so that they both retain their
usefulness as they are used today, as well as provide new features as an
RT/RPC transport mechanism. It would be a bonus if it could be done in
such a way as to advance the usefulness of the workqueues in general.
Perhaps it is not possible, but I doubt some compromise could not be
found.
The motivation to go in this direction is twofold:
1) To avoid having redundant "schedule something arbitrary on a kthread"
type subsystems
2) To possibly allow all users of the workqueue subsystem to benefit
from any advances we can come up with.
As I mentioned in the last mail, to make our proposal work I agree that
the API proposed in the original patch needs to change. The priority
behavior simply cannot be the default. I believe this right there will
make most of your examples of where it's broken fall away.
>
> Once again. Why do you think that queue_work() from RT99 should
> be considered as more important? This is just not true _unless_
> this task has to _wait_ for this work.
But that's just it. The thing we are building *does* have to wait, thus
the concern.
Like I said before, the original non-workqueue implementation *only*
provided for a REQUEST/RESPONSE pattern for RPCs. The translation to
workqueues was not as straightforward as we originally hoped for because
they support more than REQUEST/RESPONSE. I realize now that we need to
consider patterns beyond RPCs if we are to get this thing to work so
that all camps are happy.
>
> So, perhaps, perhaps, it makes sense to modify insert_wq_barrier()
> so that it temporary raises the priority of cwq->thread to match
> that of the caller of flush_workqueue() or cancel_work_sync().
That is not a bad idea, actually.
>
> However, I think even this is not needed. Please show us the real
> life example why the current implementation sucks.
Assume the following:
1) Three tasks: A, B, and C where A has the highest priority and C the
lowest.
2) C is your workqueue-daemon, and is bound to processor 0.
3) B is cpu bound and is currently scheduled on processor 0, and A is on
processor 1.
4) A inserts work to C's workqueue.
When will the job complete? What's more, what if making any more forward
progress on A was predicated on the completion of that job (as it would
be in your RPC type model).
The answer, of course, is that the work will not complete until B gives
up the processor because C is lower priority than B. Both A and C will
block indefinitely behind B, thus effectively inverting B's priority to
become logically higher than A's. This is an example of
priority-inversion and preventing it is one of the things that the patch
tries to address.
>
> OK. Suppose that we re-order work_struct's according to the caller's
> priority.
>
> Now, a niced thread doing flush_workqueue() can livelock if workqueue
> is actively used.
No, that's not livelock. That's preemption and it's by design. Work
will continue when the higher-priority items are finished. Livelock
would be logic like "while (!queue_empty());". Here you are still
retaining your position in the queue relative to others at your priority
and below.
>
> Actually a niced (so that its priority is lower than cwq->thread's one)
> can deadlock. Suppose it does
>
> lock(LOCK);
> flush_workueue(wq);
>
> and we have a pending work_struct which does:
>
> void work_handler(struct work_struct *self)
> {
> if (!try_lock(LOCK)) {
> // try again later...
> queue_work(wq, self);
> return;
> }
>
> do_something();
> }
>
> Deadlock.
That code is completely broken, so I don't think it matters much. But
regardless, the new API changes will address that.
>
> Aha. And this is exactly what I meant above. And this means that flush
> should govern the priority boosting, not queueing!
Ok. I am not against this idea (I think). Like I said, the original
system had the work and barrier logic as one unit so boosting at
queue-insert made sense. We just brought the concept straight over in
the port to workqueues without thinking about it the way you are
proposing. Upon cursory thought, your argument for flush-time boosting
makes sense to me.
>
> But again, I think we can just create a special workqueue for that and
> make it RT99. No re-ordering, no playing games with re-niceing.
No, that's just broken in the other direction. We might as well use
smp_call() at that point.
>
> Because that workqueue should be "idle" most of the time (no pending
> works), otherwise we are doing something wrong. And I don't think this
> wq can have many users and work->func() shouldn't be heavy, so perhaps
> it is OK if it always runs with the high priority. The latter may be
> wrong though.
>
> > However, you seem to have objections to the overall change in general
>
> Well, yes... You propose the complex changes to solve the problem which
> doesn't look very common, this makes me unhappy :)
That's fair. It's very possible that when its all said and done it may
turn out to be illogical to make the requirements harmonize into one
subsystem. They may simply be too different from one another to justify
the complexity needed to mediate between the two worlds. But I am not
convinced that is the case just yet. I think we can make it work for
everyone with a little more design work.
I will submit a new patch later which addresses these concerns.
Regards,
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-02 19:50 ` Oleg Nesterov
2007-08-06 11:35 ` Gregory Haskins
@ 2007-08-06 11:49 ` Ingo Molnar
2007-08-06 13:18 ` Oleg Nesterov
2007-08-06 19:33 ` Oleg Nesterov
2 siblings, 1 reply; 45+ messages in thread
From: Ingo Molnar @ 2007-08-06 11:49 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Gregory Haskins, Daniel Walker, Peter Zijlstra, linux-rt-users,
linux-kernel
* Oleg Nesterov <oleg@tv-sign.ru> wrote:
> On 08/01, Gregory Haskins wrote:
> >
> > On Thu, 2007-08-02 at 02:22 +0400, Oleg Nesterov wrote:
> >
> > > No,
> >
> > You sure are a confident one ;)
>
> Yeah, this is a rare case when I am very sure I am right ;)
>
> I strongly believe you guys take a _completely_ wrong approach.
> queue_work() should _not_ take the priority of the caller into
> account, this is bogus.
Oleg, i'd like to make sure that the role of Gregory Haskins is clear
here: he submitted some new infrastructure into the -rt tree, and i
reviewed that but found it quite complex and duplicative and suggested
him to think about enhancing workqueues with priority properties - which
might or might not make sense.
It is not the intention of the -rt project to pester any upstream
maintainer with -rt issues if that upstream maintainer is not interested
in it ... so please just forget about all this if you are not interested
in it, we'll sort it out within -rt. Thanks,
Ingo
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 11:49 ` Ingo Molnar
@ 2007-08-06 13:18 ` Oleg Nesterov
2007-08-06 13:29 ` Peter Zijlstra
0 siblings, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-06 13:18 UTC (permalink / raw)
To: Ingo Molnar
Cc: Gregory Haskins, Daniel Walker, Peter Zijlstra, linux-rt-users,
linux-kernel
Gregory, Ingo,
On 08/06, Ingo Molnar wrote:
>
> * Oleg Nesterov <oleg@tv-sign.ru> wrote:
>
> > On 08/01, Gregory Haskins wrote:
> > >
> > > On Thu, 2007-08-02 at 02:22 +0400, Oleg Nesterov wrote:
> > >
> > > > No,
> > >
> > > You sure are a confident one ;)
> >
> > Yeah, this is a rare case when I am very sure I am right ;)
> >
> > I strongly believe you guys take a _completely_ wrong approach.
> > queue_work() should _not_ take the priority of the caller into
> > account, this is bogus.
>
> Oleg, i'd like to make sure that the role of Gregory Haskins is clear
> here: he submitted some new infrastructure into the -rt tree, and i
> reviewed that but found it quite complex and duplicative and suggested
> him to think about enhancing workqueues with priority properties - which
> might or might not make sense.
>
> It is not the intention of the -rt project to pester any upstream
> maintainer with -rt issues if that upstream maintainer is not interested
> in it ... so please just forget about all this if you are not interested
> in it, we'll sort it out within -rt. Thanks,
I am not trying to sabotage these changes, and I am sorry if it looked
that way.
I jumped into this discussion because both patches I saw (Daniel's and
Gregory's) were very wrong technically.
Yes, I still disagree with the whole idea because I hope we can make
something simpler to solve the problem, but I must admit I don't
quite understand what the problem is.
So, please consider a noise from my side as my attempt to help. And
in fact, I am very curious about -rt tree, just I never had a time
to study it :)
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 13:18 ` Oleg Nesterov
@ 2007-08-06 13:29 ` Peter Zijlstra
2007-08-06 13:32 ` Peter Zijlstra
0 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2007-08-06 13:29 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Ingo Molnar, Gregory Haskins, Daniel Walker, linux-rt-users,
linux-kernel
On Mon, 2007-08-06 at 17:18 +0400, Oleg Nesterov wrote:
> Yes, I still disagree with the whole idea because I hope we can make
> something more simpler to solve the problem, but I must admit I don't
> quite understand what the problem is.
>
> So, please consider a noise from my side as my attempt to help. And
> in fact, I am very curious about -rt tree, just I never had a time
> to study it :)
Well, the thing is, suppose we have 2 drivers both using keventd say a
NIC and some USB thingy.
Now the NIC is deemed important and gets irq thread prio 90, and the
USB could not be cared less about and gets 10 (note that on -rt irq
handlers are threaded and run SCHED_FIFO).
So now you can get priority inversion in keventd. Say the USB thingy
schedules a work item which will be executed. Then during the execution
of this work the NIC will also schedule a work item. Now the NIC (fifo
90) will have to wait for the USB work (fifo 10) to complete.
The typical solution is priority inheritance, where the highest prio of
any waiter is propagated to the currently running work, so that it can
finish and get on with the more important work.
So these patches aimed to provide proper PI in the workqueue structure
to avoid this problem.
However as you rightly noted, this horribly breaks the barrier/flush
semantics.
I suspect most of the barrier/flush semantics could be replaced with
completions from specific work items. Doing this will be a lot of work
though.
I hope this rambling is not confusing you any further :-)
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 13:29 ` Peter Zijlstra
@ 2007-08-06 13:32 ` Peter Zijlstra
2007-08-06 14:45 ` Oleg Nesterov
0 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2007-08-06 13:32 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Ingo Molnar, Gregory Haskins, Daniel Walker, linux-rt-users,
linux-kernel
On Mon, 2007-08-06 at 15:29 +0200, Peter Zijlstra wrote:
> On Mon, 2007-08-06 at 17:18 +0400, Oleg Nesterov wrote:
>
> > Yes, I still disagree with the whole idea because I hope we can make
> > something more simpler to solve the problem, but I must admit I don't
> > quite understand what the problem is.
> >
> > So, please consider a noise from my side as my attempt to help. And
> > in fact, I am very curious about -rt tree, just I never had a time
> > to study it :)
>
>
> Well, the thing is, suppose we have 2 drivers both using keventd say a
> NIC and some USB thingy.
>
> Now the NIC is deemed important and gets irq thread prio 90, and the
> USB could not be cared less about and gets 10 (note that on -rt irq
> handlers are threaded and run SCHED_FIFO).
>
> So now you can get priority inversion in keventd. Say the USB thingy
> schedules a work item which will be executed. Then during the execution
> of this work the NIC will also schedule a work item. Now the NIC (fifo
> 90) will have to wait for the USB work (fifo 10) to complete.
/me hits himself.
of course today everything will run on whatever prio keventd ends up,
regardless of the prio of the submitter.
still this does not change the fundamental issue of a high prio piece of
work waiting on a lower prio task.
> The typical solution is priority inheritance, where the highest prio of
> any waiter is propagated to the currently running work, so that it can
> finish and get on with the more important work.
>
>
> So these patches aimed to provide proper PI in the workqueue structure
> to avoid this problem.
>
> However as you rightly noted, this horribly breaks the barrier/flush
> semantics.
>
> I suspect most of the barrier/flush semantics could be replaced with
> completions from specific work items. Doing this will be a lot of work
> though.
>
> I hope this rambling is not confusing you any further :-)
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 11:35 ` Gregory Haskins
@ 2007-08-06 14:26 ` Oleg Nesterov
2007-08-06 14:57 ` Gregory Haskins
0 siblings, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-06 14:26 UTC (permalink / raw)
To: Gregory Haskins
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On 08/06, Gregory Haskins wrote:
>
> On Thu, 2007-08-02 at 23:50 +0400, Oleg Nesterov wrote:
>
> > I strongly believe you guys take a _completely_ wrong approach.
> > queue_work() should _not_ take the priority of the caller into
> > account, this is bogus.
>
> I think you have argued very effectively that there are situations in
> which the priority should not be taken into account. I also think a
> case could easily be made to say that this RT business may be too
> complex to justifiably pollute the workqueue infrastructure.
Yes, perhaps we can make a separate interface...
> However, I assure you this: The concept of a priority ordered queue
> coupled with PI-boost is *very* desirable (IMHO) in the RT kernel for
> use as an RPC mechanism. Couple that with the fact that there are many
> many parallels to what we need to do for an RT/RPC as there is for
> workqueues.
As I said, I do not understand what the problem exactly is, but surely
I understand you didn't start this thread without any reason :)
> To say outright that priority *can* never or *should* never
> be taken into account is equally as bogus.
This is true of course, and I didn't claim this.
> > Once again. Why do you think that queue_work() from RT99 should
> > be considered as more important? This is just not true _unless_
> > this task has to _wait_ for this work.
>
> But that's just it. The thing we are building *does* have to wait, thus
> the concern.
Yes, as I said before when we have to wait, it makes sense to do
some tricks.
> > So, perhaps, perhaps, it makes sense to modify insert_wq_barrier()
> > so that it temporary raises the priority of cwq->thread to match
> > that of the caller of flush_workqueue() or cancel_work_sync().
>
> That is not a bad idea, actually.
Great ;)
> > However, I think even this is not needed. Please show us the real
> > life example why the current implementation sucks.
>
> Assume the following:
>
> 1) Three tasks: A, B, and C where A has the highest priority and C the
> lowest.
>
> 2) C is your workqueue-daemon, and is bound to processor 0.
...workqueue has a thread for each CPU...
> 3) B is cpu bound and is currently scheduled on processor 0, and A is on
> processor 1.
>
> 4) A inserts work to C's workqueue.
>
> When will the job complete?
Immediately, A inserts the work on CPU 1.
But yes, I see what you mean, but again, again, unless A has to wait
for result etc, etc.
> > OK. Suppose that we re-order work_struct's according to the caller's
> > priority.
> >
> > Now, a niced thread doing flush_workqueue() can livelock if workqueue
> > is actively used.
>
> No, that's not livelock. That's preemption and it's by design. Work
> will continue when the higher-priority items are finished. Livelock
> would be logic like "while (!queue_empty());". Here you are still
> retaining your position in the queue relative to others at your priority
> and below.
It can livelock if other higher-priority threads add works to this wq
repeatedly.
> > Actually a niced (so that its priority is lower than cwq->thread's one)
> > can deadlock. Suppose it does
> >
> > lock(LOCK);
> flush_workqueue(wq);
> >
> > and we have a pending work_struct which does:
> >
> > void work_handler(struct work_struct *self)
> > {
> > if (!try_lock(LOCK)) {
> > // try again later...
> > queue_work(wq, self);
> > return;
> > }
> >
> > do_something();
> > }
> >
> > Deadlock.
>
> That code is completely broken, so I don't think it matters much. But
> regardless, the new API changes will address that.
Sorry. This code is not very nice, but it is correct currently.
And I'll give you another example. Let's suppose the task does
rt_mutex_lock(some_unrelated_lock);
queue_work(WORK1);
// WINDOW
queue_work(WORK2);
I think it would be very natural to expect that these works will be
executed in order, do you agree?
Now suppose that the current's priority was boosted because another
higher-priority task tries to take rt_mutex in the WINDOW above?
Still, still I believe we should not re-order work_structs. If we do
this, we should add another, simplified and specialized interface imho.
> > But again, I think we can just create a special workqueue for that and
> > make it RT99. No re-ordering, no playing games with re-niceing.
>
> No, that's just broken in the other direction. We might as well use
> smp_call() at that point.
Well, if we can use smp_call_() we don't need these complications?
> I will submit a new patch later which addresses these concerns.
Please cc me ;)
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 13:32 ` Peter Zijlstra
@ 2007-08-06 14:45 ` Oleg Nesterov
2007-08-06 14:52 ` Peter Zijlstra
2007-08-06 15:04 ` Gregory Haskins
0 siblings, 2 replies; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-06 14:45 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Gregory Haskins, Daniel Walker, linux-rt-users,
linux-kernel
On 08/06, Peter Zijlstra wrote:
>
> On Mon, 2007-08-06 at 15:29 +0200, Peter Zijlstra wrote:
> > On Mon, 2007-08-06 at 17:18 +0400, Oleg Nesterov wrote:
> >
> > > Yes, I still disagree with the whole idea because I hope we can make
> > > something more simpler to solve the problem, but I must admit I don't
> > > quite understand what the problem is.
> > >
> > > So, please consider a noise from my side as my attempt to help. And
> > > in fact, I am very curious about -rt tree, just I never had a time
> > > to study it :)
> >
> >
> > Well, the thing is, suppose we have 2 drivers both using keventd say a
> > NIC and some USB thingy.
> >
> > Now the NIC is deemed important and gets irq thread prio 90, and the
> > USB could not be cared less about and gets 10 (note that on -rt irq
> > handlers are threaded and run SCHED_FIFO).
> >
> > So now you can get priority inversion in keventd. Say the USB thingy
> > schedules a work item which will be executed. Then during the execution
> > of this work the NIC will also schedule a work item. Now the NIC (fifo
> > 90) will have to wait for the USB work (fifo 10) to complete.
>
> /me hits himself.
>
> of course today everything will run on whatever prio keventd ends up,
> regardless of the prio of the submitter.
>
> still this does not change the fundamental issue of a high prio piece of
> work waiting on a lower prio task.
^^^^^^^
waiting. This is a "key" word, and this was my (perhaps wrong) point.
> > I suspect most of the barrier/flush semantics could be replaced with
> > completions from specific work items.
Hm. But this is exactly how it works?
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 14:45 ` Oleg Nesterov
@ 2007-08-06 14:52 ` Peter Zijlstra
2007-08-06 16:40 ` Oleg Nesterov
2007-08-06 15:04 ` Gregory Haskins
1 sibling, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2007-08-06 14:52 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Ingo Molnar, Gregory Haskins, Daniel Walker, linux-rt-users,
linux-kernel
On Mon, 2007-08-06 at 18:45 +0400, Oleg Nesterov wrote:
> On 08/06, Peter Zijlstra wrote:
> > still this does not change the fundamental issue of a high prio piece of
> > work waiting on a lower prio task.
> ^^^^^^^
> waiting. This is a "key" word, and this was my (perhaps wrong) point.
Yeah, its having a higher prio item processed at a lower prio that is
the problem. It might be delayed by less important issues.
But I'm feeling a question wanting to jump out of your statement, I just
fail to find it.
> > > I suspect most of the barrier/flush semantics could be replaced with
> > > completions from specific work items.
>
> Hm. But this is exactly how it works?
Ah, I fail to be clear :-/
Yes, barriers work by enqueueing work and waiting for that one work item
to fall out, thereby knowing that all previous work has been completed.
My point was that most flushes are there to wait for a previously
enqueued work item, and might as well wait for that one.
Let me try to illustrate: a regular pattern is, we enqueue work A and
then flush the whole queue to ensure A is processed. So instead of
enqueueing A, then B in the barrier code, and wait for B to pop out, we
might as well wait for A to begin with.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 14:26 ` Oleg Nesterov
@ 2007-08-06 14:57 ` Gregory Haskins
2007-08-06 15:36 ` Oleg Nesterov
0 siblings, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-06 14:57 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On Mon, 2007-08-06 at 18:26 +0400, Oleg Nesterov wrote:
>
> This is true of course, and I didn't claim this.
My apologies. I misunderstood you.
> > When will the job complete?
>
> Immediately, A inserts the work on CPU 1.
Well, if you didn't care about which CPU, that's true. But suppose we
want to direct this specifically at CWQ for cpu 0. This is the type of
environment we need to build a "schedule_on_cpu()" type function.
Executing the function locally would be pointless for a RPC/smp_call()
like interface.
>
> It can livelock if other higher-priority threads add works to this wq
> repeatedly.
That's really starvation, though. I agree with you that it is
technically possible to happen if the bandwidth of the higher priority
producer task is greater than the bandwidth of the consumer. But note
that the RT priorities already forgo starvation avoidance, so IMO the
queue can here as well. E.g. if the system designer runs a greedy app
at a high RT priority, it can starve as many lower priority items as it
wants anyway. This is no different.
(There might potentially be a problem here with Daniel's version of the
patch because I didn't initially notice that he used the entire priority
spectrum instead of only RT. I think we need to go back to RT + 1
"normal" priority)
>
> > > Actually a niced (so that its priority is lower than cwq->thread's one)
> > > can deadlock. Suppose it does
> > >
> > > lock(LOCK);
> > > flush_workqueue(wq);
> > >
> > > and we have a pending work_struct which does:
> > >
> > > void work_handler(struct work_struct *self)
> > > {
> > > if (!try_lock(LOCK)) {
> > > // try again later...
> > > queue_work(wq, self);
> > > return;
> > > }
> > >
> > > do_something();
> > > }
> > >
> > > Deadlock.
> >
> > That code is completely broken, so I don't think it matters much. But
> > regardless, the new API changes will address that.
>
> Sorry. This code is not very nice, but it is correct currently.
Well, the "trylock+requeue" avoids the obvious recursive deadlock, but
it introduces a more subtle error: the reschedule effectively bypasses
the flush. E.g. whatever work was being flushed was allowed to escape
out from behind the barrier. If you don't care about the flush working,
why do it at all?
But like I said, its moot...we will fix that condition at the API level
(or move away from overloading the workqueues)
>
> And I'll give you another example. Let's suppose the task does
>
> rt_mutex_lock(some_unrelated_lock);
>
> queue_work(WORK1);
> // WINDOW
> queue_work(WORK2);
>
> I think it would be very natural to expect that these works will be
> executed in order, do you agree?
I agree. That's a valid point and will be tough to solve. If you look
at dynamic priority, you have the problem you are highlighting, and if
you look at static priority you can reintroduce inversion. :( This
might be the issue that kills the idea.
>
> Now suppose that the current's priority was boosted because another
> higher-priority task tries to take rt_mutex in the WINDOW above?
>
> Still, still I believe we should not re-order work_structs. If we do
> this, we should add another, simplified and specialized interface imho.
>
> > > But again, I think we can just create a special workqueue for that and
> > > make it RT99. No re-ordering, no playing games with re-niceing.
> >
> > No, that's just broken in the other direction. We might as well use
> > smp_call() at that point.
>
> Well, if we can use smp_call_() we don't need these complications?
All I meant is that running the queue at RT99 would have as many
deficiencies as smp_call() does, and therefore we might as well not
bother with new infrastructure.
With an RT99 or smp_call() solution, all invocations effectively preempt
whatever is running on the target CPU. That is why I said it was the
opposite problem. A low priority client can preempt a high-priority
task, which is just as bad as the current situation.
>
> > I will submit a new patch later which addresses these concerns.
>
> Please cc me ;)
Will do. Thank you for the review!
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 14:45 ` Oleg Nesterov
2007-08-06 14:52 ` Peter Zijlstra
@ 2007-08-06 15:04 ` Gregory Haskins
2007-08-06 15:38 ` Oleg Nesterov
1 sibling, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-06 15:04 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Peter Zijlstra, Ingo Molnar, Daniel Walker, linux-rt-users,
linux-kernel
On Mon, 2007-08-06 at 18:45 +0400, Oleg Nesterov wrote:
> On 08/06, Peter Zijlstra wrote:
> >
> > still this does not change the fundamental issue of a high prio piece of
> > work waiting on a lower prio task.
> ^^^^^^^
> waiting. This is a "key" word, and this was my (perhaps wrong) point.
Actually, I think Peter is making a really important point here.
"Waiting" can be defined in more ways than the REQUEST/RESPONSE pattern
that I have been rambling about.
Using Peters NIC vs USB example: What if a NIC driver is using a
workqueue as a bottom-half mechanism for its RX packet queuing. In a
nice RT environment it would be highly ideal if we allow the deferred
work to complete with respect to the priority that was assigned to the
subsystem.
So while the submitter isn't technically blocking on the work, the
application that is receiving packets is now subject to the arbitrary
priority of the keventd as opposed to the NIC irq. Thus there is still
"waiting" being subject to inversion, its just not in a REQUEST/RESPONSE
pattern.
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 14:57 ` Gregory Haskins
@ 2007-08-06 15:36 ` Oleg Nesterov
2007-08-06 15:50 ` Gregory Haskins
0 siblings, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-06 15:36 UTC (permalink / raw)
To: Gregory Haskins
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On 08/06, Gregory Haskins wrote:
>
> On Mon, 2007-08-06 at 18:26 +0400, Oleg Nesterov wrote:
>
> > Immediately, A inserts the work on CPU 1.
>
> Well, if you didn't care about which CPU, that's true. But suppose we
> want to direct this specifically at CWQ for cpu 0.
please see below...
> > It can livelock if other higher-priority threads add works to this wq
> > repeteadly.
>
> That's really starvation, though. I agree with you that it is
> technically possible to happen if the bandwidth of the higher priority
> producer task is greater than the bandwidth of the consumer. But note
> that the RT priorities already forgo starvation avoidance, so IMO the
> queue can here as well. E.g. if the system designer runs a greedy app
> at a high RT priority, it can starve as many lower priority items as it
> wants anyway. This is no different.
OK.
> > > > Actually a niced (so that its priority is lower than cwq->thread's one)
> > > > can deadlock. Suppose it does
> > > >
> > > > lock(LOCK);
> > > > flush_workqueue(wq);
> > > >
> > > > and we have a pending work_struct which does:
> > > >
> > > > void work_handler(struct work_struct *self)
> > > > {
> > > > if (!try_lock(LOCK)) {
> > > > // try again later...
> > > > queue_work(wq, self);
> > > > return;
> > > > }
> > > >
> > > > do_something();
> > > > }
> > > >
> > > > Deadlock.
> > >
> > > That code is completely broken, so I don't think it matters much. But
> > > regardless, the new API changes will address that.
> >
> > Sorry. This code is not very nice, but it is correct currently.
>
> Well, the "trylock+requeue" avoids the obvious recursive deadlock, but
> it introduces a more subtle error: the reschedule effectively bypasses
> the flush.
this is OK, flush_workqueue() should only care about work_struct's that are
already queued.
> E.g. whatever work was being flushed was allowed to escape
> out from behind the barrier. If you don't care about the flush working,
> why do it at all?
The caller of flush_workqueue() doesn't necessarily know we have such a work
on list. It just wants to flush its own works.
> But like I said, its moot...we will fix that condition at the API level
> (or move away from overloading the workqueues)
Oh, good :)
> > Well, if we can use smp_call_() we don't need these complications?
>
> With an RT99 or smp_call() solution, all invocations effectively preempt
> whatever is running on the target CPU. That is why I said it was the
> opposite problem. A low priority client can preempt a high-priority
> task, which is just as bad as the current situation.
Aha, now I see what another problem you are trying to solve. I had a false
impression that might_sleep() is the issue.
After reading the couple of Peter's emails, I guess I am starting to
understand another issue. RT has irq threads, and they have different
priorities. So, in that case I agree, it is natural to consider the work
which was queued from the higher-priority irq thread as "more important".
Yes?
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 15:04 ` Gregory Haskins
@ 2007-08-06 15:38 ` Oleg Nesterov
0 siblings, 0 replies; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-06 15:38 UTC (permalink / raw)
To: Gregory Haskins
Cc: Peter Zijlstra, Ingo Molnar, Daniel Walker, linux-rt-users,
linux-kernel
On 08/06, Gregory Haskins wrote:
>
> On Mon, 2007-08-06 at 18:45 +0400, Oleg Nesterov wrote:
> > On 08/06, Peter Zijlstra wrote:
>
> > >
> > > still this does not change the fundamental issue of a high prio piece of
> > > work waiting on a lower prio task.
> > ^^^^^^^
> > waiting. This is a "key" word, and this was my (perhaps wrong) point.
>
> Actually, I think Peter is making a really important point here.
Yes. Please see another email I just sent.
> "Waiting" can be defined in more ways than the REQUEST/RESPONSE pattern
> that I have been rambling about.
>
> Using Peters NIC vs USB example: What if a NIC driver is using a
> workqueue as a bottom-half mechanism for its RX packet queuing. In a
> nice RT environment it would be highly ideal if we allow the deferred
> work to complete with respect to the priority that was assigned to the
> subsystem.
>
> So while the submitter isn't technically blocking on the work, the
> application that is receiving packets is now subject to the arbitrary
> priority of the keventd as opposed to the NIC irq. Thus there is still
> "waiting" being subject to inversion, its just not in a REQUEST/RESPONSE
> pattern.
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 15:36 ` Oleg Nesterov
@ 2007-08-06 15:50 ` Gregory Haskins
2007-08-06 16:50 ` Oleg Nesterov
0 siblings, 1 reply; 45+ messages in thread
From: Gregory Haskins @ 2007-08-06 15:50 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On Mon, 2007-08-06 at 19:36 +0400, Oleg Nesterov wrote:
> > Well, the "trylock+requeue" avoids the obvious recursive deadlock, but
> > it introduces a more subtle error: the reschedule effectively bypasses
> > the flush.
>
> this is OK, flush_workqueue() should only care about work_struct's that are
> already queued.
Agreed.
>
> > E.g. whatever work was being flushed was allowed to escape
> > out from behind the barrier. If you don't care about the flush working,
> > why do it at all?
>
> The caller of flush_workqueue() doesn't necessarily know we have such a work
> on the list. It just wants to flush its own works.
I was assuming that the work being flushed was submitted by the same
context, but I think I see what you are saying now. Essentially if that
queued work was unrelated to the thread that was doing the flush, it
doesn't care if it gets rescheduled.
Do you agree that if the context was the same there is a bug? Or did I
miss something else?
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 14:52 ` Peter Zijlstra
@ 2007-08-06 16:40 ` Oleg Nesterov
0 siblings, 0 replies; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-06 16:40 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Gregory Haskins, Daniel Walker, linux-rt-users,
linux-kernel
On 08/06, Peter Zijlstra wrote:
>
> On Mon, 2007-08-06 at 18:45 +0400, Oleg Nesterov wrote:
> > On 08/06, Peter Zijlstra wrote:
>
> > > > I suspect most of the barrier/flush semantics could be replaced with
> > > > completions from specific work items.
> >
> > Hm. But this is exactly how it works?
>
> Yes, barriers work by enqueueing work and waiting for that one work item
> to fall out, thereby knowing that all previous work has been completed.
>
> My point was that most flushes are there to wait for a previously
> enqueued work item, and might as well wait for that one.
>
> Let me try to illustrate: a regular pattern is, we enqueue work A and
> then flush the whole queue to ensure A is processed. So instead of
> enqueueing A, then B in the barrier code, and wait for B to pop out, we
> might as well wait for A to begin with.
This is a bit off-topic, but in that particular case we can do
if (cancel_work_sync(&A))
A->func(&A);
unless we want to execute ->func() on another CPU of course. It is easy to
implement flush_work() which waits for a single work_struct, but it is not
so useful when it comes to schedule_on_each_cpu().
But I agree, flush_workqueue() should be avoided if possible. Not sure
it makes sense, but we can do something like below.
Oleg.
--- kernel/workqueue.c~ 2007-07-28 16:58:17.000000000 +0400
+++ kernel/workqueue.c 2007-08-06 20:33:25.000000000 +0400
@@ -572,25 +572,54 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
*
* schedule_on_each_cpu() is very slow.
*/
+
+struct xxx
+{
+ atomic_t count;
+ struct completion done;
+ work_func_t func;
+};
+
+struct yyy
+{
+ struct work_struct work;
+ struct xxx *xxx;
+};
+
+static void yyy_func(struct work_struct *work)
+{
+ struct xxx *xxx = container_of(work, struct yyy, work)->xxx;
+ xxx->func(work);
+
+ if (atomic_dec_and_test(&xxx->count))
+ complete(&xxx->done);
+}
+
int schedule_on_each_cpu(work_func_t func)
{
int cpu;
- struct work_struct *works;
+ struct xxx xxx;
+ struct yyy *works;
- works = alloc_percpu(struct work_struct);
+ init_completion(&xxx.done);
+ xxx.func = func;
+
+ works = alloc_percpu(struct yyy);
if (!works)
return -ENOMEM;
preempt_disable(); /* CPU hotplug */
+ atomic_set(&xxx.count, num_online_cpus());
for_each_online_cpu(cpu) {
- struct work_struct *work = per_cpu_ptr(works, cpu);
+ struct yyy *yyy = per_cpu_ptr(works, cpu);
- INIT_WORK(work, func);
- set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
- __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
+ yyy->xxx = &xxx;
+ INIT_WORK(&yyy->work, yyy_func);
+ set_bit(WORK_STRUCT_PENDING, work_data_bits(&yyy->work));
+ __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), &yyy->work);
}
preempt_enable();
- flush_workqueue(keventd_wq);
+ wait_for_completion(&xxx.done);
free_percpu(works);
return 0;
}
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 15:50 ` Gregory Haskins
@ 2007-08-06 16:50 ` Oleg Nesterov
2007-08-06 16:57 ` Gregory Haskins
0 siblings, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-06 16:50 UTC (permalink / raw)
To: Gregory Haskins
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On 08/06, Gregory Haskins wrote:
>
> On Mon, 2007-08-06 at 19:36 +0400, Oleg Nesterov wrote:
>
> > > E.g. whatever work was being flushed was allowed to escape
> > > out from behind the barrier. If you don't care about the flush working,
> > > why do it at all?
> >
> > The caller of flush_workqueue() doesn't necessarily know we have such a work
> > on the list. It just wants to flush its own works.
>
> I was assuming that the work being flushed was submitted by the same
> context, but I think I see what you are saying now. Essentially if that
> queued work was unrelated to the thread that was doing the flush, it
> doesn't care if it gets rescheduled.
Yes.
> Do you agree that if the context was the same there is a bug? Or did I
> miss something else?
Yes sure. We can't expect we can "flush" work_struct with flush_workqueue()
unless we know it doesn't re-schedule itself.
OTOH, it is not the bug to call flush_workqueue() even if that work was
queued by us, it should complete.
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 16:50 ` Oleg Nesterov
@ 2007-08-06 16:57 ` Gregory Haskins
0 siblings, 0 replies; 45+ messages in thread
From: Gregory Haskins @ 2007-08-06 16:57 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On Mon, 2007-08-06 at 20:50 +0400, Oleg Nesterov wrote:
> Yes.
>
> > Do you agree that if the context was the same there is a bug? Or did I
> > miss something else?
>
> Yes sure. We can't expect we can "flush" work_struct with flush_workqueue()
> unless we know it doesn't re-schedule itself.
Agreed.
>
> OTOH, it is not the bug to call flush_workqueue() even if that work was
> queued by us, it should complete.
Well, I guess it depends on the application but that would be highly
unusual unless the flush was already superfluous to begin with.
Typically you only call flush to ensure a strongly ordered operation.
The reschedule defeats the strong ordering and thus would break anything
that was dependent on it.
But we are splitting hairs at this point since we both agree that the
API as put forth in the PI patch was deficient ;)
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-02 19:50 ` Oleg Nesterov
2007-08-06 11:35 ` Gregory Haskins
2007-08-06 11:49 ` Ingo Molnar
@ 2007-08-06 19:33 ` Oleg Nesterov
2007-08-06 19:37 ` Gregory Haskins
2 siblings, 1 reply; 45+ messages in thread
From: Oleg Nesterov @ 2007-08-06 19:33 UTC (permalink / raw)
To: Gregory Haskins
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
Gregory, we seem to more or less agree with each other, but still...
On 08/02, Oleg Nesterov wrote:
>
> On 08/01, Gregory Haskins wrote:
> >
> > On Thu, 2007-08-02 at 02:22 +0400, Oleg Nesterov wrote:
> >
> > > No,
> >
> > You sure are a confident one ;)
>
> Yeah, this is a rare case when I am very sure I am right ;)
>
> I strongly believe you guys take a _completely_ wrong approach.
> queue_work() should _not_ take the priority of the caller into
> account, this is bogus.
OK. I have to take my words back. I completely misunderstood why you
are doing this and which problems you are trying to solve, my bad.
Perhaps, I am also wrong on the "work_struct's could be re-ordered"
issue. Yes, we can break the code which is currently correct, that
was my point. But I must admit, I can't imagine the "good" code which
may suffer. Perhaps we can just document the change in behaviour, and
"deprecate" such a code.
The only objection (and you seem to agree) is that the "regular"
queue_work() should not always take the caller's priority as the
priority of work_struct.
Oleg.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure
2007-08-06 19:33 ` Oleg Nesterov
@ 2007-08-06 19:37 ` Gregory Haskins
0 siblings, 0 replies; 45+ messages in thread
From: Gregory Haskins @ 2007-08-06 19:37 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Daniel Walker, Peter Zijlstra, Ingo Molnar, linux-rt-users,
linux-kernel
On Mon, 2007-08-06 at 23:33 +0400, Oleg Nesterov wrote:
> OK. I have to take my words back. I completely misunderstood why you
> are doing this and which problems you are trying to solve, my bad.
No problem man. You found some legitimate problems too so your input is
very much appreciated.
>
> Perhaps, I am also wrong on the "work_struct's could be re-ordered"
> issue. Yes, we can break the code which is currently correct, that
> was my point. But I must admit, I can't imagine the "good" code mich
> may suffer. Perhaps we can just document the change in behaviour, and
> "deprecate" such a code.
>
> The only objection (and you seem to agree) is that the "regular"
> queue_work() should not always take the caller's priority as the
> priority of work_struct.
Agreed. I think that is the right direction (assuming we can resolve
the other problems, like the double queue+queue problem you brought up).
Regards,
-Greg
^ permalink raw reply [flat|nested] 45+ messages in thread
end of thread, other threads:[~2007-08-06 21:28 UTC | newest]
Thread overview: 45+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-08-01 0:26 [PATCH] RT: Add priority-queuing and priority-inheritance to workqueue infrastructure Gregory Haskins
2007-08-01 3:52 ` Daniel Walker
2007-08-01 11:59 ` Gregory Haskins
2007-08-01 15:10 ` Daniel Walker
2007-08-01 15:19 ` Gregory Haskins
2007-08-01 15:55 ` Daniel Walker
2007-08-01 17:32 ` Gregory Haskins
2007-08-01 21:48 ` Esben Nielsen
2007-08-01 17:01 ` Peter Zijlstra
2007-08-01 17:10 ` Daniel Walker
2007-08-01 18:26 ` Oleg Nesterov
2007-08-01 18:39 ` Daniel Walker
2007-08-01 20:25 ` Oleg Nesterov
2007-08-01 18:12 ` Oleg Nesterov
2007-08-01 18:29 ` Daniel Walker
2007-08-01 20:18 ` Oleg Nesterov
2007-08-01 20:32 ` Oleg Nesterov
2007-08-01 20:43 ` Daniel Walker
2007-08-01 20:34 ` Daniel Walker
2007-08-01 20:50 ` Oleg Nesterov
2007-08-01 21:02 ` Daniel Walker
2007-08-01 21:13 ` Gregory Haskins
2007-08-01 21:34 ` Oleg Nesterov
2007-08-01 21:59 ` Gregory Haskins
2007-08-01 22:22 ` Oleg Nesterov
2007-08-01 23:53 ` Gregory Haskins
2007-08-02 19:50 ` Oleg Nesterov
2007-08-06 11:35 ` Gregory Haskins
2007-08-06 14:26 ` Oleg Nesterov
2007-08-06 14:57 ` Gregory Haskins
2007-08-06 15:36 ` Oleg Nesterov
2007-08-06 15:50 ` Gregory Haskins
2007-08-06 16:50 ` Oleg Nesterov
2007-08-06 16:57 ` Gregory Haskins
2007-08-06 11:49 ` Ingo Molnar
2007-08-06 13:18 ` Oleg Nesterov
2007-08-06 13:29 ` Peter Zijlstra
2007-08-06 13:32 ` Peter Zijlstra
2007-08-06 14:45 ` Oleg Nesterov
2007-08-06 14:52 ` Peter Zijlstra
2007-08-06 16:40 ` Oleg Nesterov
2007-08-06 15:04 ` Gregory Haskins
2007-08-06 15:38 ` Oleg Nesterov
2007-08-06 19:33 ` Oleg Nesterov
2007-08-06 19:37 ` Gregory Haskins
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox