[PATCH 0/2] VFCIPI support v2

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/2] VFCIPI support v2
@ 2007-07-30 21:15 Gregory Haskins
  2007-07-30 21:15 ` [PATCH 1/2] RT: Preemptible Function-Call-IPI Support Gregory Haskins
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Gregory Haskins @ 2007-07-30 21:15 UTC (permalink / raw)
  To: linux-rt-users; +Cc: ghaskins

The following series adds Virtual Function-Call IPI (VFCIPI) support.  Changes from v1:

1) Support for platforms other than x86_64
2) Support for priority-inheritance.

Built/tested on x86_64 four-way SMP machine.  Other platforms untested.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-30 21:15 [PATCH 0/2] VFCIPI support v2 Gregory Haskins
@ 2007-07-30 21:15 ` Gregory Haskins
  2007-07-31  9:19   ` Ingo Molnar
  2007-07-30 21:15 ` [PATCH 2/2] RT: Add priority inheritance to the VFCIPI facility Gregory Haskins
  2007-07-30 21:34 ` [PATCH 0/2] VFCIPI support v2 Daniel Walker
  2 siblings, 1 reply; 14+ messages in thread
From: Gregory Haskins @ 2007-07-30 21:15 UTC (permalink / raw)
  To: linux-rt-users; +Cc: ghaskins

This code allows FUNCTION_CALL IPIs to become preemptible by executing
them in kthread context instead of interrupt context.  They are referred
to as "Virtual Function Call IPIs" (VFCIPI) because we no longer rely
on the actual FCIPI facility.  Instead we schedule a thread to run.  This
essentially replaces the synchronous FCIPI with an async RESCHEDULE IPI.

Since the function will be executed in kthread context, it is fully
sleepable and preemptible, thus providing more determinism.  It also allows
code that was written to expect spin_locks to work properly, even though
they may have been converted to rt_mutex under the hood.  In summary, this
subsystem does for FCIPI interrupts what PREEMPT_HARDIRQs does for normal
interrupts.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 arch/i386/kernel/smpcommon.c |   16 +
 arch/ia64/kernel/smp.c       |    8 -
 arch/powerpc/kernel/smp.c    |   12 +
 arch/x86_64/kernel/smp.c     |   18 +-
 include/linux/smp.h          |   25 ++
 include/linux/vfcipi.h       |   10 +
 init/main.c                  |    3 
 kernel/Kconfig.preempt       |   12 +
 kernel/Makefile              |    1 
 kernel/vfcipi/Makefile       |    4 
 kernel/vfcipi/heap.c         |  136 +++++++++++++
 kernel/vfcipi/heap.h         |   20 ++
 kernel/vfcipi/thread.c       |  454 ++++++++++++++++++++++++++++++++++++++++++
 13 files changed, 689 insertions(+), 30 deletions(-)

diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c
index 1868ae1..c400287 100644
--- a/arch/i386/kernel/smpcommon.c
+++ b/arch/i386/kernel/smpcommon.c
@@ -25,7 +25,7 @@ __cpuinit void init_gdt(int cpu)
 
 
 /**
- * smp_call_function(): Run a function on all other CPUs.
+ * smp_call_function__nodelay(): Run a function on all other CPUs.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @nonatomic: Unused.
@@ -39,15 +39,15 @@ __cpuinit void init_gdt(int cpu)
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
-int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
-		      int wait)
+int smp_call_function__nodelay(void (*func) (void *info), void *info,
+			       int nonatomic, int wait)
 {
 	return smp_call_function_mask(cpu_online_map, func, info, wait);
 }
-EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(smp_call_function__nodelay);
 
 /**
- * smp_call_function_single - Run a function on another CPU
+ * smp_call_function_single__nodelay - Run a function on another CPU
  * @cpu: The target CPU.  Cannot be the calling CPU.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
@@ -59,8 +59,8 @@ EXPORT_SYMBOL(smp_call_function);
  * If @wait is true, then returns once @func has returned; otherwise
  * it returns just before the target cpu calls @func.
  */
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-			     int nonatomic, int wait)
+int smp_call_function_single__nodelay(int cpu, void (*func) (void *info),
+				      void *info, int nonatomic, int wait)
 {
 	/* prevent preemption and reschedule on another processor */
 	int ret;
@@ -76,4 +76,4 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 	put_cpu();
 	return ret;
 }
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL(smp_call_function_single__nodelay);
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index 2256b08..a194477 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -374,7 +374,7 @@ smp_flush_tlb_mm (struct mm_struct *mm)
  */
 
 int
-smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic,
+smp_call_function_single__nodelay (int cpuid, void (*func) (void *info), void *info, int nonatomic,
 			  int wait)
 {
 	struct call_data_struct data;
@@ -413,7 +413,7 @@ smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int
 	put_cpu();
 	return 0;
 }
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL(smp_call_function_single__nodelay);
 
 /*
  * this function sends a 'generic call function' IPI to all other CPUs
@@ -435,7 +435,7 @@ EXPORT_SYMBOL(smp_call_function_single);
  * hardware interrupt handler or from a bottom half handler.
  */
 int
-smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait)
+smp_call_function__nodelay (void (*func) (void *info), void *info, int nonatomic, int wait)
 {
 	struct call_data_struct data;
 	int cpus;
@@ -473,7 +473,7 @@ smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wai
 	spin_unlock(&call_lock);
 	return 0;
 }
-EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(smp_call_function__nodelay);
 
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c4987d9..a1e839a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -283,15 +283,15 @@ int smp_call_function_map(void (*func) (void *info), void *info, int nonatomic,
 	return ret;
 }
 
-int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
-			int wait)
+int smp_call_function__nodelay(void (*func) (void *info), void *info,
+			       int nonatomic, int wait)
 {
 	return smp_call_function_map(func,info,nonatomic,wait,cpu_online_map);
 }
-EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(smp_call_function__nodelay);
 
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int nonatomic,
-			int wait)
+int smp_call_function_single__nodelay(int cpu, void (*func) (void *info),
+				      void *info, int nonatomic, int wait)
 {
 	cpumask_t map = CPU_MASK_NONE;
 	int ret = -EBUSY;
@@ -305,7 +305,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int
 	put_cpu();
 	return ret;
 }
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL(smp_call_function_single__nodelay);
 
 void smp_call_function_interrupt(void)
 {
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 8cf7a0d..71fbe2f 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -367,7 +367,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 
 /*
- * smp_call_function_single - Run a function on another CPU
+ * smp_call_function_single__nodelay - Run a function on another CPU
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @nonatomic: Currently unused.
@@ -378,9 +378,9 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
  * Does not return until the remote CPU is nearly ready to execute <func>
  * or is or has executed.
  */
-
-int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
-	int nonatomic, int wait)
+int
+smp_call_function_single__nodelay (int cpu, void (*func) (void *info),
+				  void *info, int nonatomic, int wait)
 {
 	/* prevent preemption and reschedule on another processor */
 	int me = get_cpu();
@@ -398,7 +398,7 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
 	put_cpu();
 	return 0;
 }
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL(smp_call_function_single__nodelay);
 
 /*
  * this function sends a 'generic call function' IPI to all other CPUs
@@ -437,7 +437,7 @@ static void __smp_call_function (void (*func) (void *info), void *info,
 }
 
 /*
- * smp_call_function - run a function on all other CPUs.
+ * smp_call_function__nodelay - run a function on all other CPUs.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @nonatomic: currently unused.
@@ -451,15 +451,15 @@ static void __smp_call_function (void (*func) (void *info), void *info,
  * hardware interrupt handler or from a bottom half handler.
  * Actually there are a few legal cases, like panic.
  */
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-			int wait)
+int smp_call_function__nodelay (void (*func) (void *info), void *info,
+				int nonatomic, int wait)
 {
 	spin_lock(&call_lock);
 	__smp_call_function(func,info,nonatomic,wait);
 	spin_unlock(&call_lock);
 	return 0;
 }
-EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(smp_call_function__nodelay);
 
 static void stop_this_cpu(void *dummy)
 {
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 442f87b..5017a97 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -62,10 +62,29 @@ extern void smp_cpus_done(unsigned int max_cpus);
 /*
  * Call a function on all other processors
  */
-int smp_call_function(void(*func)(void *info), void *info, int retry, int wait);
 
-int smp_call_function_single(int cpuid, void (*func) (void *info), void *info,
-				int retry, int wait);
+int smp_call_function__nodelay(void(*func)(void *info), void *info,
+			       int retry, int wait);
+
+int smp_call_function_single__nodelay(int cpuid, void (*func) (void *info),
+				      void *info, int retry, int wait);
+
+#ifdef CONFIG_PREEMPT_FCIPI
+
+int smp_call_function(void(*func)(void *info), void *info,
+		      int retry, int wait);
+
+int smp_call_function_single(int cpuid, void (*func) (void *info),
+			     void *info, int retry, int wait);
+
+#else
+
+#define smp_call_function(func, info, retry, wait)  \
+       smp_call_function__nodelay(func, info, retry, wait)
+#define smp_call_function_single(cpuid, func, info, retry, wait) \
+       smp_call_function_single__nodelay(cpuid, func, info, retry, wait)
+
+#endif /* CONFIG_PREEMPT_FCIPI */
 
 /*
  * Call a function on all processors
diff --git a/include/linux/vfcipi.h b/include/linux/vfcipi.h
new file mode 100644
index 0000000..8cedf21
--- /dev/null
+++ b/include/linux/vfcipi.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_VFCIPI_H
+#define _LINUX_VFCIPI_H
+
+/*
+ * vfcipi_init() switches smp_call_function*() over to the threaded
+ * implementation.  When CONFIG_PREEMPT_FCIPI is off it is a no-op that
+ * reports success, so callers can use it as an ordinary statement or
+ * expression in either configuration.
+ */
+#ifdef CONFIG_PREEMPT_FCIPI
+extern int vfcipi_init(void);
+#else
+static inline int vfcipi_init(void) { return 0; }
+#endif
+
+#endif /* _LINUX_VFCIPI_H */
diff --git a/init/main.c b/init/main.c
index 9829b27..ff28740 100644
--- a/init/main.c
+++ b/init/main.c
@@ -57,6 +57,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
+#include <linux/vfcipi.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -842,6 +843,8 @@ static int __init kernel_init(void * unused)
 
 	do_basic_setup();
 
+	vfcipi_init();
+
 	/*
 	 * check if there is an early userspace init.  If yes, let it do all
 	 * the work
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 8355494..f509ccf 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -120,6 +120,18 @@ config PREEMPT_HARDIRQS
 
 	  Say N if you are unsure.
 
+config PREEMPT_FCIPI
+	bool "Thread Function-Call Interprocessor Interrupts"
+	default n
+	depends on SMP
+	help
+	  This option reduces the latency of the kernel by 'threading'
+	  FUNCTION_CALL IPIs. This means that all (or selected) FCIPIs will
+	  run in their own kernel thread context. While this helps latency,
+	  this feature can also reduce performance.
+
+	  Say N if you are unsure.
+
 config SPINLOCK_BKL
 	bool "Old-Style Big Kernel Lock"
 	depends on (PREEMPT || SMP) && !PREEMPT_RT
diff --git a/kernel/Makefile b/kernel/Makefile
index e592de8..ab1a8ae 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_PREEMPT_RT) += rt.o
+obj-$(CONFIG_PREEMPT_FCIPI) += vfcipi/
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/vfcipi/Makefile b/kernel/vfcipi/Makefile
new file mode 100644
index 0000000..55100fa
--- /dev/null
+++ b/kernel/vfcipi/Makefile
@@ -0,0 +1,4 @@
+
+obj-y := thread.o
+obj-$(CONFIG_PREEMPT_RT) += heap.o
+
diff --git a/kernel/vfcipi/heap.c b/kernel/vfcipi/heap.c
new file mode 100644
index 0000000..5fc4c5e
--- /dev/null
+++ b/kernel/vfcipi/heap.c
@@ -0,0 +1,136 @@
+/*
+ * kernel/vfcipi/heap
+ *
+ * kmalloc(GFP_ATOMIC) is currently broken on RT.  This file implements a
+ * simple heap manager that supports true GFP_ATOMIC like guarantees in the
+ * interim.
+ *
+ * Copyright (C) 2007 Novell, Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code is licensed under the GPLv2
+ */
+
+#include <linux/sched.h>
+
+struct vfcipi_heap { /* pool of fixed-size elements; populated by _vfcipi_heap_init() */
+	raw_spinlock_t   lock; /* held around every update of the free/inuse lists */
+	char            *data; /* single backing allocation that holds all elements */
+	int              element_size; /* payload size per element; vfcipi_heap_alloc() rejects larger requests */
+	struct list_head free; /* elements available to _vfcipi_heap_alloc() */
+	struct list_head inuse; /* elements currently handed out */
+};
+
+#define VFCIPI_HEAP_MAGIC 0xf347ab23
+
+/*
+ * Header stored in front of every element's payload inside the heap's
+ * backing store.  vfcipi_heap_free() recovers the header from the payload
+ * pointer with container_of(), so @data must remain the last member.
+ *
+ * @data is explicitly aligned to sizeof(long): the payload is used to hold
+ * structures containing pointers and atomic_t, and a char array placed
+ * directly after a u8 would otherwise start at an odd offset.
+ */
+struct vfcipi_heap_item {
+	u32                 magic;  /* VFCIPI_HEAP_MAGIC, verified on free */
+	struct vfcipi_heap *heap;   /* owning heap; gives free() its lock */
+	struct list_head    list;   /* link on heap->free or heap->inuse */
+	u8                  inuse;  /* non-zero while handed out */
+	char                data[1] __attribute__((aligned(sizeof(long))));
+};
+
+/*
+ * _vfcipi_heap_init - build a pool of @nr_elements elements able to hold
+ * @element_size payload bytes each, all initially on the free list.
+ *
+ * Returns 0 on success, -EINVAL for non-positive sizes/counts and -ENOMEM
+ * if the backing store cannot be allocated.  On failure the heap is left
+ * valid but empty, so later allocations fail cleanly instead of walking
+ * uninitialised list heads.
+ */
+static __init int _vfcipi_heap_init(struct vfcipi_heap *heap,
+				    int element_size, int nr_elements)
+{
+	size_t stride;
+	int i;
+
+	/* Establish a consistent (empty) heap before anything can fail */
+	spin_lock_init(&heap->lock);
+	heap->data = NULL;
+	heap->element_size = element_size;
+	INIT_LIST_HEAD(&heap->free);
+	INIT_LIST_HEAD(&heap->inuse);
+
+	if (element_size <= 0 || nr_elements <= 0)
+		return -EINVAL;
+
+	/*
+	 * Distance between consecutive headers.  Rounding up to the header's
+	 * alignment keeps every header, and therefore every payload,
+	 * naturally aligned.
+	 */
+	stride = ALIGN(offsetof(struct vfcipi_heap_item, data) + element_size,
+		       __alignof__(struct vfcipi_heap_item));
+
+	/* kcalloc() zeroes the memory and refuses a count * size overflow */
+	heap->data = kcalloc(nr_elements, stride, GFP_KERNEL);
+	if (!heap->data)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_elements; ++i) {
+		struct vfcipi_heap_item *hi;
+
+		hi = (struct vfcipi_heap_item *)&heap->data[i * stride];
+
+		hi->magic = VFCIPI_HEAP_MAGIC;
+		hi->heap  = heap;
+		hi->inuse = 0;
+		INIT_LIST_HEAD(&hi->list);
+		list_add_tail(&hi->list, &heap->free);
+	}
+
+	return 0;
+}
+
+/*
+ * Hand out the first element on @heap's free list, moving it to the inuse
+ * list and marking it as such.  Returns a pointer to the element's payload,
+ * or NULL when the free list is empty.
+ */
+static void *_vfcipi_heap_alloc(struct vfcipi_heap *heap)
+{
+	struct vfcipi_heap_item *item;
+	void *payload = NULL;
+
+	spin_lock(&heap->lock);
+
+	if (list_empty(&heap->free))
+		goto unlock;
+
+	item = list_first_entry(&heap->free, struct vfcipi_heap_item, list);
+	BUG_ON(!item);
+
+	/* Unlink from the free list and append to the inuse list */
+	list_move_tail(&item->list, &heap->inuse);
+	item->inuse = 1;
+	payload = &item->data[0];
+
+ unlock:
+	spin_unlock(&heap->lock);
+
+	return payload;
+}
+
+/*
+ * vfcipi_heap_free - return a payload pointer obtained from
+ * vfcipi_heap_alloc()/vfcipi_heap_zalloc() to its heap.
+ *
+ * A NULL @ptr is ignored, matching kfree(), which backs this interface when
+ * CONFIG_PREEMPT_RT is off.  A pointer that does not carry the heap magic,
+ * or an element that is not currently marked in use (double free), is
+ * treated as a fatal bug.
+ */
+void vfcipi_heap_free(void *ptr)
+{
+	struct vfcipi_heap_item *hi;
+	struct vfcipi_heap *heap;
+
+	if (!ptr)
+		return;
+
+	hi = container_of(ptr, struct vfcipi_heap_item, data);
+
+	/* The magic must be validated before hi->heap can be trusted */
+	BUG_ON(hi->magic != VFCIPI_HEAP_MAGIC);
+
+	heap = hi->heap;
+
+	spin_lock(&heap->lock);
+
+	/* Checked under the lock so two racing frees cannot both pass */
+	BUG_ON(!hi->inuse);
+
+	list_move_tail(&hi->list, &heap->free);
+	hi->inuse = 0;
+
+	spin_unlock(&heap->lock);
+}
+
+/* The single heap instance behind the vfcipi_heap_*() entry points */
+static struct vfcipi_heap vfcipi_heap;
+
+/*
+ * vfcipi_heap_init - set up the global heap with @nr_elements elements of
+ * @element_size payload bytes each.
+ *
+ * The interface returns void, so a failure cannot be propagated to the
+ * caller; it is logged instead of being dropped silently.
+ */
+__init void vfcipi_heap_init(int element_size, int nr_elements)
+{
+	int ret = _vfcipi_heap_init(&vfcipi_heap, element_size, nr_elements);
+
+	if (ret < 0)
+		printk(KERN_ERR "vfcipi: heap initialisation failed (%d)\n",
+		       ret);
+}
+
+/*
+ * Allocate one element from the global heap.  @size is only checked against
+ * the heap's element size (a larger request is a fatal bug); the element
+ * returned always has the full element size.  Returns NULL when the heap
+ * has no free element.
+ */
+void *vfcipi_heap_alloc(size_t size)
+{
+	struct vfcipi_heap *heap = &vfcipi_heap;
+
+	BUG_ON(size > heap->element_size);
+
+	return _vfcipi_heap_alloc(heap);
+}
+
+/*
+ * Same as vfcipi_heap_alloc(), but the first @size bytes of the returned
+ * payload are cleared.  Returns NULL when no element is available.
+ */
+void *vfcipi_heap_zalloc(size_t size)
+{
+	void *mem = vfcipi_heap_alloc(size);
+
+	if (!mem)
+		return NULL;
+
+	memset(mem, 0, size);
+	return mem;
+}
+
+
+
+
+
diff --git a/kernel/vfcipi/heap.h b/kernel/vfcipi/heap.h
new file mode 100644
index 0000000..3cd264e
--- /dev/null
+++ b/kernel/vfcipi/heap.h
@@ -0,0 +1,20 @@
+#ifndef _VFCIPI_HEAP_H
+#define _VFCIPI_HEAP_H
+
+#ifdef CONFIG_PREEMPT_RT
+
+void vfcipi_heap_init(int element_size, int nr_elements);
+void* vfcipi_heap_alloc(size_t);
+void* vfcipi_heap_zalloc(size_t);
+void  vfcipi_heap_free(void *);
+
+#else
+
+#define vfcipi_heap_init(element_size, nr_elements) {}
+#define vfcipi_heap_alloc(size_t size) kmalloc(size, GFP_ATOMIC);
+#define vfcipi_heap_alloc(size_t size) kzalloc(size, GFP_ATOMIC);
+#define vfcipi_heap_free(void *ptr)    kfree(ptr);
+
+#endif
+
+#endif /* _VFCIPI_HEAP_H */
diff --git a/kernel/vfcipi/thread.c b/kernel/vfcipi/thread.c
new file mode 100644
index 0000000..0f1ef90
--- /dev/null
+++ b/kernel/vfcipi/thread.c
@@ -0,0 +1,454 @@
+/*
+ * kernel/vfcipi/thread
+ *
+ * Preemptible Function-Call-IPI Support
+ * -------------------------------------
+ *  This code allows FUNCTION_CALL IPIs to become preemptible by executing
+ *  them in kthread context instead of interrupt context.  They are referred
+ *  to as "Virtual Function Call IPIs" (VFCIPI) because we no longer rely
+ *  on the actual FCIPI facility.  Instead we schedule a thread to run.  This
+ *  essentially replaces the synchronous FCIPI with an async RESCHEDULE IPI.
+ *
+ *  Since the function will be executed in kthread context, it is fully
+ *  sleepable and preemptible, thus providing more determinism.  It also allows
+ *  code that was written to expect spin_locks to work properly, even though
+ *  they may have been converted to rt_mutex under the hood.  In summary, this
+ *  subsystem does for FCIPI interrupts what PREEMPT_HARDIRQs does for normal
+ *  interrupts.
+ *
+ * Copyright (C) 2007 Novell, Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code is licensed under the GPLv2
+ */
+
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/hardirq.h>
+#include <linux/irqflags.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+
+#include <asm/atomic.h>
+#include <asm/cmpxchg.h>
+
+#include "heap.h"
+
+struct vfcipi_status { /* counting event: signalled once per target CPU, optionally waking a waiter */
+	atomic_t            curr; /* number of signals so far; bumped by vfcipi_status_signal() */
+	int                 threshold; /* count at which vfcipi_status_wait() stops waiting */
+	struct task_struct *task; /* task to wake at the threshold; only set by vfcipi_status_init() when the caller waits and may sleep */
+};
+
+struct vfcipi_workitem { /* one function-call request, shared by every CPU it is queued on */
+	atomic_t              ref; /* starts at 1 for the requester, +1 per enqueue; freed when it drops to zero */
+	void                 (*func)(void *data); /* function each target thread runs */
+	void                 *data; /* argument passed to func */
+	int                   prio; /* initialised to -1; only assigned inside the NOT_YET section of vfcipi_enqueue() */
+	struct vfcipi_status  started; /* signalled by a thread just before it calls func */
+	struct vfcipi_status  finished; /* signalled by a thread right after func returns */
+};
+
+struct vfcipi_queueitem { /* per-CPU queue node pointing at a shared workitem; freed by the thread after dequeue */
+	struct list_head        list; /* link on a vfcipi_task queue */
+	struct vfcipi_workitem *item; /* request to execute */
+};
+
+struct prio_array { /* priority-indexed set of FIFO queues with a bitmap of non-empty slots */
+	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+	unsigned long    count; /* total entries across all queues */
+	struct list_head queue[MAX_RT_PRIO]; /* one list per priority; valid indices are 0..MAX_RT_PRIO-1 */
+};
+
+struct vfcipi_task { /* per-CPU service state: the worker thread and its pending requests */
+	raw_spinlock_t      lock; /* protects rt_rq and rq */
+	struct task_struct *task; /* the "vfcipi/N" kthread created in vfcipi_init() */
+	struct prio_array   rt_rq; /* Real-time request queue */
+	struct list_head    rq;    /* Normal request queue */
+};
+
+static DEFINE_PER_CPU(struct vfcipi_task*, vfcipi_tasks); /* filled in by vfcipi_init() for each present CPU */
+
+/*
+ * ----------------------------------------
+ * prio_array
+ * ----------------------------------------
+ */
+/* Reset @array to the empty state: no bits set, zero count, empty queues. */
+static void prio_array_init(struct prio_array *array)
+{
+	struct list_head *slot = array->queue;
+	struct list_head *end = array->queue + MAX_RT_PRIO;
+
+	array->count = 0;
+	memset(array->bitmap, 0, sizeof(array->bitmap));
+
+	for (; slot != end; slot++)
+		INIT_LIST_HEAD(slot);
+}
+
+/* Note: prio_array code credit goes to the RT scheduler...*/
+/*
+ * prio_array_dequeue - remove and return the oldest entry of the
+ * lowest-numbered non-empty priority queue, or NULL if @array is empty.
+ *
+ * The search uses find_first_bit() bounded by MAX_RT_PRIO rather than
+ * sched_find_first_bit(): the latter is tailored to the scheduler's own
+ * bitmap size, which is larger than the bitmap declared in struct
+ * prio_array.
+ */
+static struct vfcipi_queueitem* prio_array_dequeue(struct prio_array *array)
+{
+	struct list_head        *head;
+	struct vfcipi_queueitem *qi;
+	int			 idx;
+
+	if (!array->count)
+		return NULL;
+
+	idx = find_first_bit(array->bitmap, MAX_RT_PRIO);
+
+	/* A non-zero count with no bit set means the array is corrupt */
+	BUG_ON(idx >= MAX_RT_PRIO);
+
+	head = array->queue + idx;
+
+	/* If we got here, there better be something in the list */
+	BUG_ON(list_empty(head));
+
+	qi = list_first_entry(head, struct vfcipi_queueitem, list);
+
+	list_del(&qi->list);
+	array->count--;
+
+	/* Keep the bitmap in step: clear the bit once the slot drains */
+	if (list_empty(head))
+		__clear_bit(idx, array->bitmap);
+
+	return qi;
+}
+
+/*
+ * prio_array_enqueue - append @qi to the queue for priority @prio and mark
+ * that priority as non-empty.
+ *
+ * @prio indexes array->queue[], which has MAX_RT_PRIO slots, so the valid
+ * range is 0 .. MAX_RT_PRIO-1; anything else is a fatal bug.
+ */
+static void prio_array_enqueue(struct prio_array *array,
+			       struct vfcipi_queueitem *qi,
+			       int prio)
+{
+	struct list_head *head;
+
+	BUG_ON(prio < 0 || prio >= MAX_RT_PRIO);
+
+	head = array->queue + prio;
+	list_add_tail(&qi->list, head);
+	__set_bit(prio, array->bitmap);
+	array->count++;
+}
+
+/*
+ * ----------------------------------------
+ * vfcipi_status
+ * ----------------------------------------
+ */
+/*
+ * Prepare @s to count up to @threshold signals.  The current task is
+ * recorded as the one to wake only if @wait is set and the caller is
+ * neither in atomic context nor running with interrupts disabled;
+ * otherwise s->task is left untouched.
+ */
+static void vfcipi_status_init(struct vfcipi_status *s, int threshold,
+			       int wait)
+{
+	s->threshold = threshold;
+	atomic_set(&s->curr, 0);
+
+	if (!wait)
+		return;
+
+	if (in_atomic() || irqs_disabled())
+		return;
+
+	s->task = current;
+}
+
+/*
+ * Record one more signal on @s and, when a waiter task is registered and
+ * the count has reached the threshold, wake that task.
+ */
+static void vfcipi_status_signal(struct vfcipi_status *s)
+{
+	int count = atomic_inc_return(&s->curr);
+
+	if (!s->task)
+		return;
+
+	if (count < s->threshold)
+		return;
+
+	wake_up_process(s->task);
+}
+
+static void vfcipi_status_wait(struct vfcipi_status *s) /* block (or spin, when no task is registered) until curr equals threshold */
+{
+	while (1) {
+		if (s->task) /* sleeping waiter: mark ourselves asleep before testing the count */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+
+		if (atomic_read(&s->curr) != s->threshold) { /* not every expected signal has arrived yet */
+			if (s->task) {
+				schedule(); /* sleep until vfcipi_status_signal() calls wake_up_process() */
+			} else
+				cpu_relax(); /* no task registered at init time: busy-wait */
+		} else
+			break;
+	}
+
+	set_current_state(TASK_RUNNING); /* undo a pending TASK_UNINTERRUPTIBLE when we left without sleeping */
+}
+
+/*
+ * ----------------------------------------
+ * vfcipi_workitem
+ * ----------------------------------------
+ */
+/*
+ * Allocate and fill in a work item that will run @func(@data) on @nr_cpus
+ * CPUs.  The returned item carries one reference owned by the caller.
+ * Returns NULL if no memory is available.
+ */
+static struct vfcipi_workitem*
+vfcipi_workitem_init(void (*func)(void *data), void *data, int nr_cpus,
+		     int wait)
+{
+	struct vfcipi_workitem *wi;
+
+	wi = vfcipi_heap_zalloc(sizeof(*wi));
+	if (wi) {
+		atomic_set(&wi->ref, 1);
+		wi->func = func;
+		wi->data = data;
+		wi->prio = -1;
+
+		/*
+		 * There is no need to wait for both the start and the finish
+		 * event; one is enough.  Exactly one of the two is therefore
+		 * armed for waiting, selected by *wait*.
+		 */
+		vfcipi_status_init(&wi->started, nr_cpus, !wait);
+		vfcipi_status_init(&wi->finished, nr_cpus, wait);
+	}
+
+	return wi;
+}
+
+/* Drop one reference on @item; the last reference frees it. */
+static void vfcipi_workitem_dropref(struct vfcipi_workitem *item)
+{
+	if (!atomic_dec_and_test(&item->ref))
+		return;
+
+	vfcipi_heap_free(item);
+}
+
+/*
+ * Wait for @item according to the caller's @wait request.
+ *
+ * With @wait set we wait until the function has completed entirely.
+ * Without it we still wait for execution to at least start, which is how
+ * the standard IPI based FUNCTION_CALL behaves.
+ */
+static void vfcipi_workitem_wait(struct vfcipi_workitem *item, int wait)
+{
+	struct vfcipi_status *event;
+
+	event = wait ? &item->finished : &item->started;
+
+	vfcipi_status_wait(event);
+}
+
+/*
+ * ----------------------------------------
+ * vfcipi_thread - daemon process for vfcipi per CPU
+ * ----------------------------------------
+ */
+static int vfcipi_thread(void *data) /* per-CPU worker loop; @data is not used */
+{
+	struct vfcipi_task *ftask = per_cpu(vfcipi_tasks,
+					    raw_smp_processor_id()); /* state of the CPU this thread is running on at start-up */
+
+	while (1) {
+		struct vfcipi_workitem *item;
+		struct vfcipi_queueitem *qi;
+
+		spin_lock(&ftask->lock);
+
+		/* First check the RT items */
+		qi  = prio_array_dequeue(&ftask->rt_rq);
+		if (!qi) {
+			/* If nothing is found there, check the normal queue */
+			if (!list_empty(&ftask->rq)) {
+				qi = list_first_entry(&ftask->rq,
+						      struct vfcipi_queueitem,
+						      list);
+				BUG_ON(!qi);
+				list_del(&qi->list);
+			}
+		}
+
+		if (!qi) {
+			/* Nothing to process for now.. */
+			set_current_state(TASK_INTERRUPTIBLE); /* set while still holding the lock; vfcipi_enqueue() wakes us under the same lock */
+			spin_unlock(&ftask->lock);
+			schedule();
+			continue;
+		}
+
+		spin_unlock(&ftask->lock); /* the user function below runs without the queue lock held */
+
+		/*
+		 * Extract the real pointer and discard the queueitem shell.
+		 * We no longer need it.
+		 */
+		item = qi->item;
+		vfcipi_heap_free(qi);
+
+		/*
+		 * Execute the actual user-provided function
+		 */
+		vfcipi_status_signal(&item->started); /* releases requesters that did not ask to wait for completion */
+		item->func(item->data);
+		vfcipi_status_signal(&item->finished); /* releases requesters that asked to wait for completion */
+
+		vfcipi_workitem_dropref(item); /* drops the reference taken in vfcipi_enqueue() */
+	}
+}
+
+/*
+ * ----------------------------------------
+ * client side code
+ * ----------------------------------------
+ */
+static int vfcipi_enqueue(struct vfcipi_workitem *item, int cpu) /* queue @item on @cpu's worker and wake it; returns 0 or -ENOMEM */
+{
+	struct vfcipi_task      *ftask = per_cpu(vfcipi_tasks, cpu);
+	struct vfcipi_queueitem *qi    = vfcipi_heap_alloc(sizeof(*qi)); /* queue node; freed by vfcipi_thread() after dequeue */
+
+	BUG_ON(!ftask);
+
+	if (!qi)
+		return -ENOMEM; /* no reference has been taken on item at this point */
+
+	INIT_LIST_HEAD(&qi->list);
+	qi->item = item;
+
+	/*
+	 * We increment the ref count here right before the list insertion.
+	 * It will get decremented when the kthread finishes processing it
+	 */
+	atomic_inc(&item->ref);
+
+	spin_lock(&ftask->lock);
+
+#ifdef NOT_YET /* priority-queue path; compiled out unless NOT_YET is defined */
+	if (rt_task(current)) {
+		item->prio = task_prio(current);
+		prio_array_enqueue(&ftask->rt_rq, qi, item->prio);
+
+		/* Priority inheritance on the kthread */
+		if (task_prio(ftask->task) < item->prio)
+			set_prio_somehow(ftask->task, item->prio);
+	} else
+#endif
+		list_add_tail(&qi->list, &ftask->rq); /* without NOT_YET every request goes to the normal FIFO queue */
+
+	wake_up_process(ftask->task); /* done under the lock, pairing with the thread's state change in its idle path */
+
+	spin_unlock(&ftask->lock);
+
+	return 0;
+}
+
+/*
+ * vfcipi_call_function_single - run @func(@data) on @cpu via its vfcipi
+ * thread.
+ *
+ * @nonatomic is accepted for signature compatibility and is not used.
+ * If @wait is set the call returns after @func has completed; otherwise it
+ * returns once the remote thread has started processing the request.
+ *
+ * Returns 0 on success or -ENOMEM if the work item or its queue entry
+ * could not be allocated.
+ */
+static int vfcipi_call_function_single(int cpu, void (*func)(void *data),
+				      void *data, int nonatomic, int wait)
+{
+	struct vfcipi_workitem *item;
+	int ret;
+
+	item = vfcipi_workitem_init(func, data, 1, wait);
+	if (!item)
+		return -ENOMEM;
+
+	ret = vfcipi_enqueue(item, cpu);
+	if (ret < 0)
+		goto out;
+
+	vfcipi_workitem_wait(item, wait);
+
+ out:
+	/* We are finished with the reference in this context */
+	vfcipi_workitem_dropref(item);
+
+	return ret;
+}
+
+/*
+ * vfcipi_call_function - run @func(@data) on every online CPU except the
+ * calling one, via the per-CPU vfcipi threads.
+ *
+ * @nonatomic is accepted for signature compatibility and is not used.
+ * If @wait is set the call returns after @func has completed on all target
+ * CPUs; otherwise it returns once every target has started.
+ *
+ * Returns 0 on success (including when there is no other online CPU) or
+ * -ENOMEM if an allocation failed.
+ */
+static int vfcipi_call_function(void (*func)(void *data), void *data,
+			       int nonatomic, int wait)
+{
+	struct vfcipi_workitem *item;
+	int ret = 0;
+	int cpu;
+	int mycpu = raw_smp_processor_id();
+	int nr_cpus = num_online_cpus()-1;
+
+	/* Nobody else to call: nothing to allocate, queue or wait for */
+	if (nr_cpus <= 0)
+		return 0;
+
+	item = vfcipi_workitem_init(func, data, nr_cpus, wait);
+	if (!item)
+		return -ENOMEM;
+
+	for_each_online_cpu(cpu) {
+		if (cpu != mycpu) {
+			ret = vfcipi_enqueue(item, cpu);
+			/*
+			 * NOTE(review): on a failure part-way through, CPUs
+			 * that were already queued will still run @func after
+			 * we return without waiting.
+			 */
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+	vfcipi_workitem_wait(item, wait);
+
+ out:
+	/* We are finished with the reference in this context */
+	vfcipi_workitem_dropref(item);
+
+	return ret;
+}
+
+struct vfcipi_vtable { /* pair of implementations behind smp_call_function*() */
+	int (*call_single)(int cpu, void (*func)(void *data),
+			   void *data, int nonatomic, int wait); /* backs smp_call_function_single() */
+	int (*call_allbutself)(void (*func)(void *data), void *data,
+			       int nonatomic, int wait); /* backs smp_call_function() */
+};
+
+static struct vfcipi_vtable vfcipi_vtable__nodelay = { /* forwards to the architecture's __nodelay functions */
+	.call_single     = smp_call_function_single__nodelay,
+	.call_allbutself = smp_call_function__nodelay
+};
+
+static struct vfcipi_vtable vfcipi_vtable__threaded = { /* forwards to the kthread-based implementation in this file */
+	.call_single     = vfcipi_call_function_single,
+	.call_allbutself = vfcipi_call_function
+};
+
+/*
+ * By default the system will fall back on the __nodelay implementation
+ * since the __threaded version will not be online until the vfcipi_init()
+ * function has a chance to run
+ */
+static struct vfcipi_vtable *vfcipi_vtable = &vfcipi_vtable__nodelay; /* replaced with &vfcipi_vtable__threaded by vfcipi_init() */
+
+/*
+ * Run @func(@data) on @cpu through whichever implementation is currently
+ * installed in vfcipi_vtable.  All arguments and the return value are
+ * passed through unchanged.
+ */
+int smp_call_function_single(int cpu, void (*func)(void *data),
+				       void *data, int nonatomic, int wait)
+{
+	struct vfcipi_vtable *ops = vfcipi_vtable;
+
+	return ops->call_single(cpu, func, data, nonatomic, wait);
+}
+EXPORT_SYMBOL(smp_call_function_single);
+
+/*
+ * Run @func(@data) on the other CPUs through whichever implementation is
+ * currently installed in vfcipi_vtable.  All arguments and the return value
+ * are passed through unchanged.
+ */
+int smp_call_function(void (*func)(void *data), void *data,
+				int nonatomic, int wait)
+{
+	struct vfcipi_vtable *ops = vfcipi_vtable;
+
+	return ops->call_allbutself(func, data, nonatomic, wait);
+}
+EXPORT_SYMBOL(smp_call_function);
+
+/*
+ * vfcipi_init - bring the threaded function-call implementation online.
+ *
+ * Sets up the request heap, then creates one "vfcipi/N" thread per present
+ * CPU.  Threads are only started once all of them have been created, so a
+ * failure can be unwound with kthread_stop() before any thread has entered
+ * its (never-returning) service loop.  On success the dispatch table is
+ * switched to the threaded implementation.
+ *
+ * Returns 0 on success, -ENOMEM if per-CPU state cannot be allocated, or
+ * the error reported by kthread_create().
+ */
+int __init vfcipi_init(void)
+{
+	int cpu;
+	int ret = -ENOMEM;
+
+	vfcipi_heap_init(sizeof(struct vfcipi_workitem), 4096);
+
+	/* Phase 1: allocate state and create, but do not start, each thread */
+	for_each_present_cpu(cpu) {
+		struct vfcipi_task *ftask = kzalloc(sizeof(*ftask),
+						    GFP_KERNEL);
+		struct task_struct *task;
+
+		if (!ftask)
+			goto out_free;
+
+		spin_lock_init(&ftask->lock);
+		prio_array_init(&ftask->rt_rq);
+		INIT_LIST_HEAD(&ftask->rq);
+
+		task = kthread_create(vfcipi_thread, NULL,
+				      "vfcipi/%d", cpu);
+		if (IS_ERR(task)) {
+			ret = PTR_ERR(task);
+			kfree(ftask);
+			goto out_free;
+		}
+
+		kthread_bind(task, cpu);
+		ftask->task = task;
+
+		/* Published before the thread runs; it looks this up first */
+		per_cpu(vfcipi_tasks, cpu) = ftask;
+	}
+
+	/* Phase 2: everything exists, let the threads run */
+	for_each_present_cpu(cpu)
+		wake_up_process(per_cpu(vfcipi_tasks, cpu)->task);
+
+	/* Now atomically switch to threaded mode */
+	(void)xchg(&vfcipi_vtable, &vfcipi_vtable__threaded);
+
+	return 0;
+
+ out_free:
+	for_each_present_cpu(cpu) {
+		struct vfcipi_task *ftask = per_cpu(vfcipi_tasks, cpu);
+
+		/* CPUs the first loop never reached have no state */
+		if (!ftask)
+			continue;
+
+		/* Never woken, so it exits without entering vfcipi_thread() */
+		kthread_stop(ftask->task);
+		kfree(ftask);
+		per_cpu(vfcipi_tasks, cpu) = NULL;
+	}
+
+	return ret;
+}

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-30 21:15 ` [PATCH 1/2] RT: Preemptible Function-Call-IPI Support Gregory Haskins
@ 2007-07-31  9:19   ` Ingo Molnar
  2007-07-31  9:21     ` Ingo Molnar
  0 siblings, 1 reply; 14+ messages in thread
From: Ingo Molnar @ 2007-07-31  9:19 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: linux-rt-users


* Gregory Haskins <ghaskins@novell.com> wrote:

> This code allows FUNCTION_CALL IPIs to become preemptible by executing 
> them in kthread context instead of interrupt context.  They are 
> referred to as "Virtual Function Call IPIs" (VFCIPI) because we no 
> longer rely on the actual FCIPI facility.  Instead we schedule a 
> thread to run.  This essentially replaces the synchronous FCIPI with 
> an async RESCHEDULE IPI.

why do we need this? It's quite complex and brings little extra AFAICS. 
See the "schedule_on_each_cpu-enhance.patch" from Peter Zijlstra that 
lets a function be executed on all CPUs. That should be extended 
(trivially) to execute a function on another CPU. That's all we need.

	Ingo

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-31  9:19   ` Ingo Molnar
@ 2007-07-31  9:21     ` Ingo Molnar
  2007-07-31  9:25       ` Ingo Molnar
  0 siblings, 1 reply; 14+ messages in thread
From: Ingo Molnar @ 2007-07-31  9:21 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: linux-rt-users, linux-kernel

[ mail re-sent with lkml Cc:-ed. _Please_ Cc: all patches to lkml too! 
  Unless you want -rt to suffer the fate of -ck, keep upstream involved 
  all the time. The recent /proc/interrupts-all discussion with upstream 
  folks showed the clear benefits of that approach. ]

* Gregory Haskins <ghaskins@novell.com> wrote:

> This code allows FUNCTION_CALL IPIs to become preemptible by executing 
> them in kthread context instead of interrupt context.  They are 
> referred to as "Virtual Function Call IPIs" (VFCIPI) because we no 
> longer rely on the actual FCIPI facility.  Instead we schedule a 
> thread to run.  This essentially replaces the synchronous FCIPI with 
> an async RESCHEDULE IPI.

why do we need this? It's quite complex and brings little extra AFAICS. 
See the "schedule_on_each_cpu-enhance.patch" from Peter Zijlstra that 
lets a function be executed on all CPUs. That should be extended 
(trivially) to execute a function on another CPU. That's all we need. 

	Ingo

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-31  9:21     ` Ingo Molnar
@ 2007-07-31  9:25       ` Ingo Molnar
  2007-07-31 14:26         ` Gregory Haskins
  2007-07-31 14:26         ` Gregory Haskins
  0 siblings, 2 replies; 14+ messages in thread
From: Ingo Molnar @ 2007-07-31  9:25 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: linux-rt-users, linux-kernel


* Ingo Molnar <mingo@elte.hu> wrote:

> * Gregory Haskins <ghaskins@novell.com> wrote:
>  
> > This code allows FUNCTION_CALL IPIs to become preemptible by 
> > executing them in kthread context instead of interrupt context.  
> > They are referred to as "Virtual Function Call IPIs" (VFCIPI) 
> > because we no longer rely on the actual FCIPI facility.  Instead we 
> > schedule a thread to run.  This essentially replaces the synchronous 
> > FCIPI with an async RESCHEDULE IPI.
> 
> why do we need this? It's quite complex and brings little extra 
> AFAICS. See the "schedule_on_each_cpu-enhance.patch" from Peter 
> Zijlstra that lets a function be executed on all CPUs. That should 
> be extended (trivially) to execute a function on another CPU. That's 
> all we need.

as far as the prioritization of function calls goes, _that_ makes sense, 
but it should not be a separate API but should be done to our normal 
workqueue APIs. That not only extends the effects of priorities to all 
current workqueue using kernel subsystems, but also keeps the API more 
unified. We really dont want to have too many -rt specific APIs.

	Ingo

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-31  9:25       ` Ingo Molnar
@ 2007-07-31 14:26         ` Gregory Haskins
  2007-07-31 14:26         ` Gregory Haskins
  1 sibling, 0 replies; 14+ messages in thread
From: Gregory Haskins @ 2007-07-31 14:26 UTC (permalink / raw)
  Cc: linux-kernel, linux-rt-users

>>> On Tue, Jul 31, 2007 at  5:25 AM, in message <20070731092521.GA16177@elte.hu>,
Ingo Molnar <mingo@elte.hu> wrote: 


> as far as the prioritization of function calls goes, _that_ makes sense, 
> but it should not be a separate API but should be done to our normal 
> workqueue APIs. That not only extends the effects of priorities to all 
> current workqueue using kernel subsystems, but also keeps the API more 
> unified. We really dont want to have too many -rt specific APIs.

I just took a look at the workqueue code.  There are two immediate problems that I see:

1) cpu_workqueue_struct->lock is a spinlock_t and will need to become a raw_spinlock_t

2) The lock is held for the duration of the execution of workqueue items.  We will need to revamp this such that new workqueue items can still be queued even while executing others.

Are these acceptable changes?  If so, I will put together a prototype based around this concept.

Regards,
-Greg

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-31  9:25       ` Ingo Molnar
  2007-07-31 14:26         ` Gregory Haskins
@ 2007-07-31 14:26         ` Gregory Haskins
  2007-07-31 20:14           ` Gregory Haskins
  1 sibling, 1 reply; 14+ messages in thread
From: Gregory Haskins @ 2007-07-31 14:26 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, linux-rt-users

>>> On Tue, Jul 31, 2007 at  5:25 AM, in message <20070731092521.GA16177@elte.hu>,
Ingo Molnar <mingo@elte.hu> wrote: 


> as far as the prioritization of function calls goes, _that_ makes sense, 
> but it should not be a separate API but should be done to our normal 
> workqueue APIs. That not only extends the effects of priorities to all 
> current workqueue using kernel subsystems, but also keeps the API more 
> unified. We really dont want to have too many -rt specific APIs.

I just took a look at the workqueue code.  There are two immediate problems that I see:

1) cpu_workqueue_struct->lock is a spinlock_t and will need to become a raw_spinlock_t

2) The lock is held for the duration of the execution of workqueue items.  We will need to revamp this such that new workqueue items can still be queued even while executing others.

Are these acceptable changes?  If so, I will put together a prototype based around this concept.

Regards,
-Greg

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-31 14:26         ` Gregory Haskins
@ 2007-07-31 20:14           ` Gregory Haskins
  0 siblings, 0 replies; 14+ messages in thread
From: Gregory Haskins @ 2007-07-31 20:14 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, linux-rt-users

On Tue, 2007-07-31 at 10:26 -0400, Gregory Haskins wrote:
> >>> On Tue, Jul 31, 2007 at  5:25 AM, in message <20070731092521.GA16177@elte.hu>,
> Ingo Molnar <mingo@elte.hu> wrote: 
> 
> 
> > as far as the prioritization of function calls goes, _that_ makes sense, 
> > but it should not be a separate API but should be done to our normal 
> > workqueue APIs. That not only extends the effects of priorities to all 
> > current workqueue using kernel subsystems, but also keeps the API more 
> > unified. We really dont want to have too many -rt specific APIs.
> 
> I just took a look at the workqueue code .  There are two immediate problems that I see:
> 
> 1) cpu_workqueue_struct->lock is a spinlock_t and will need to become a raw_spinlock_t
> 
> 2) The lock is held for the duration of the execution of workqueue items.  We will need to revamp this such that new workqueue items can still be queued even while executing others.
> 

Duh...scratch #2.  I missed the unlock/lock sequence. :P

#1 is still a potential problem, as is the use of completion variables.

-Greg

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/2] RT: Add priority inheritance to the VFCIPI facility
  2007-07-30 21:15 [PATCH 0/2] VFCIPI support v2 Gregory Haskins
  2007-07-30 21:15 ` [PATCH 1/2] RT: Preemptible Function-Call-IPI Support Gregory Haskins
@ 2007-07-30 21:15 ` Gregory Haskins
  2007-07-30 21:34 ` [PATCH 0/2] VFCIPI support v2 Daniel Walker
  2 siblings, 0 replies; 14+ messages in thread
From: Gregory Haskins @ 2007-07-30 21:15 UTC (permalink / raw)
  To: linux-rt-users; +Cc: ghaskins

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/vfcipi/thread.c |   52 +++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/kernel/vfcipi/thread.c b/kernel/vfcipi/thread.c
index 0f1ef90..c8a3950 100644
--- a/kernel/vfcipi/thread.c
+++ b/kernel/vfcipi/thread.c
@@ -27,6 +27,7 @@
 #include <linux/irqflags.h>
 #include <linux/module.h>
 #include <linux/cpumask.h>
+#include <linux/syscalls.h>
 
 #include <asm/atomic.h>
 #include <asm/cmpxchg.h>
@@ -62,6 +63,7 @@ struct prio_array {
 struct vfcipi_task {
 	raw_spinlock_t      lock;
 	struct task_struct *task;
+	int                 prio;
 	struct prio_array   rt_rq; /* Real-time request queue */
 	struct list_head    rq;    /* Normal request queue */
 };
@@ -120,6 +122,7 @@ static void prio_array_enqueue(struct prio_array *array,
 {
 	struct list_head *head;
 
+	BUG_ON(prio < 0);
 	BUG_ON(prio > MAX_RT_PRIO);
 
 	head = array->queue + prio;
@@ -220,9 +223,33 @@ static void vfcipi_workitem_wait(struct vfcipi_workitem *item, int wait)
 		 * function completes entirely.
 		 */
 		vfcipi_status_wait(&item->finished);
+}
+
+/*
+ * ----------------------------------------
+ * priority-inheritance helpers
+ * ----------------------------------------
+ */
 
+/* Assumes ftask->lock is held.  prio == -1 means "back to SCHED_NORMAL". */
+static void vfcipi_task_setprio(struct vfcipi_task *ftask, int prio)
+{
+	struct sched_param  param = { 0, };
+	struct task_struct *task  = ftask->task;
+
+	if (ftask->prio != prio) {
+		/* In-kernel API: the sys_*() variant expects a __user pointer */
+		if (prio != -1) {
+			param.sched_priority = prio;
+			sched_setscheduler(task, SCHED_FIFO, &param);
+		} else {
+			sched_setscheduler(task, SCHED_NORMAL, &param);
+		}
+		ftask->prio = prio;
+	}
 }
 
+
 /*
  * ----------------------------------------
  * vfcipi_thread - daemon process for vfcipi per CPU
@@ -254,19 +281,29 @@ static int vfcipi_thread(void *data)
 
 		if (!qi) {
 			/* Nothing to process for now.. */
+
+			/* Set us back to normal priority */
+			vfcipi_task_setprio(ftask, -1);
+
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock(&ftask->lock);
 			schedule();
 			continue;
 		}
 
+		item = qi->item;
+
+		/*
+		 * Adjust the priority of our task based on what was pulled
+		 * from the queue.  In theory, its our highest priority item
+		 */
+		vfcipi_task_setprio(ftask, item->prio);
+
 		spin_unlock(&ftask->lock);
 
 		/*
-		 * Extract the real pointer and discard the queueitem shell.
-		 * We no longer need it.
+		 * Discard the shell since the item is already extracted.
 		 */
-		item = qi->item;
 		vfcipi_heap_free(qi);
 
 		/*
@@ -306,16 +343,14 @@ static int vfcipi_enqueue(struct vfcipi_workitem *item, int cpu)
 
 	spin_lock(&ftask->lock);
 
-#ifdef NOT_YET
 	if (rt_task(current)) {
-		item->prio = task_prio(current);
+		item->prio = current->rt_priority;
 		prio_array_enqueue(&ftask->rt_rq, qi, item->prio);
 
 		/* Priority inheritance on the kthread */
-		if (task_prio(ftask->task) < item->prio)
-			set_prio_somehow(ftask->task, item->prio);
+		if (ftask->prio < item->prio)
+			vfcipi_task_setprio(ftask, item->prio);
 	} else
-#endif
 		list_add_tail(&qi->list, &ftask->rq);
 
 	wake_up_process(ftask->task);
@@ -427,6 +462,7 @@ int __init vfcipi_init(void)
 			goto out_free;
 
 		spin_lock_init(&ftask->lock);
+		ftask->prio = -1;
 		prio_array_init(&ftask->rt_rq);
 		INIT_LIST_HEAD(&ftask->rq);
 		per_cpu(vfcipi_tasks, cpu) = ftask;

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/2] VFCIPI support v2
  2007-07-30 21:15 [PATCH 0/2] VFCIPI support v2 Gregory Haskins
  2007-07-30 21:15 ` [PATCH 1/2] RT: Preemptible Function-Call-IPI Support Gregory Haskins
  2007-07-30 21:15 ` [PATCH 2/2] RT: Add priority inheritance to the VFCIPI facility Gregory Haskins
@ 2007-07-30 21:34 ` Daniel Walker
  2 siblings, 0 replies; 14+ messages in thread
From: Daniel Walker @ 2007-07-30 21:34 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: linux-rt-users

On Mon, 2007-07-30 at 17:15 -0400, Gregory Haskins wrote:
> The following series Virtual Function-Call IPI support.  Changes from v1:
> 
> 1) Support for platforms other than x86_64
> 2) Support for priority-inheritance.
> 
> Built/tested on x86_64 four-way SMP machine.  Other platforms untested.

Do you have any information regarding the amount that IPIs contribute to
overall system latency? My experience is that most IPIs are relatively
short. I wonder if the effect of a threaded IPI might be worse for
latency than non-threaded ..

Daniel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
@ 2007-07-31 11:44 Gregory Haskins
  0 siblings, 0 replies; 14+ messages in thread
From: Gregory Haskins @ 2007-07-31 11:44 UTC (permalink / raw)
  To: mingo; +Cc: linux-kernel, linux-rt-users

On Tue, 2007-07-31 at 11:21 +0200, Ingo Molnar wrote:
> [ mail re-sent with lkml Cc:-ed. _Please_ Cc: all patches to lkml too! 
>   Unless you want -rt to suffer the fate of -ck, keep upstream involved 
>   all the time. The recent /proc/interrupts-all discussion with upstream 
>   folks showed the clear benefits of that approach. ]

My apologies.  I wasn't getting really any responses to my proposal, so
I shortened the distribution to avoid becoming a pest to people that
didn't care.  I will CC both from now on.

> 
> why do we need this? 

I wrote this when I discovered that KVM was having problems with
smp_call_function() on -rt.  It was utilizing spinlock_t which of course
was transparently converted to rt_mutex.  This blew up in the interrupt
context of the FUNCTION_CALL whenever the lock was acquired.  I was
thinking to myself "why is the FCIPI vector being treated any differently
than other IRQs?".  That question drove the design/implementation of
this series.

> It's quite complex 

I think if you look closely at the code you will see it's actually pretty
straightforward.  However, for whatever complexity you may perceive,
note that I made the choices I did (as opposed to something like
modifying the work-queue infrastructure) because I felt it had the
minimum impact on other subsystems unrelated to FCIPI.  There are, of
course, many ways to skin a rabbit. ;)

> and brings little extra AFAICS.

Brings little extra to what?  Do you think the whole concept of "FCIPIs
in a thread" is a waste of time, or do you just think my implementation
choices are bad?

> See the "schedule_on_each_cpu-enhance.patch" from Peter Ziljstra that 
> lets a function to be executed on all CPUs. That should be extended 
> (trivially) to execute a function on another CPU. That's all we need.

I haven't seen that.  I will take a look.  

The key part of my design is as follows:

1) No new API: smp_call_function[_single]() must just transparently
switch over to threaded mode (just like the IRQ handler in
PREEMPT_HARDIRQs does)
2) Support priority inheritance:  Unlike normal HARDIRQs which can use a
relatively static priority assignment, FCIPIs are driven by another
software entity which may or may not have RT priority.  Therefore, being
able to execute the call in the same priority as the caller is critical,
IMO.  Calls are sorted and scheduled by priority.
3) More robust parallelism:  mainline smp_call_function has a system
wide serialization point when a call is made.  We should be able to
support a high degree of parallel access to prevent priority inversion.
This means more than one call can be in-flight at a time.  
4) Preemptible/sleepable code on both the caller and callee sides.
Today, both the caller and callee sides of the link are critical
sections with preemption disabled.
5) The API must work from both in_atomic()==1 and in_atomic()==0 modes.
In addition, it will opportunistically sleep while waiting for replies
if in_atomic()==0.

If we can make Peter's patch work within this criteria and people like
it better than what I put forth, that is fine by me.

Regards,
-Greg

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
@ 2007-07-31 12:11 Gregory Haskins
  2007-07-31 14:22 ` Ingo Molnar
  0 siblings, 1 reply; 14+ messages in thread
From: Gregory Haskins @ 2007-07-31 12:11 UTC (permalink / raw)
  To: mingo; +Cc: linux-kernel, linux-rt-users

On Tue, 2007-07-31 at 11:25 +0200, Ingo Molnar wrote:
> * Ingo Molnar <mingo@elte.hu> wrote:
> 
> > * Gregory Haskins <ghaskins@novell.com> wrote:
> >  
> > > This code allows FUNCTION_CALL IPIs to become preemptible by 
> > > executing them in kthread context instead of interrupt context.  
> > > They are referred to as "Virtual Function Call IPIs" (VFCIPI) 
> > > because we no longer rely on the actual FCIPI facility.  Instead we 
> > > schedule a thread to run.  This essentially replaces the synchronous 
> > > FCIPI with an async RESCHEDULE IPI.
> > 
> > why do we need this? It's quite complex and brings little extra 
> > AFAICS. See the "schedule_on_each_cpu-enhance.patch" from Peter 
> > Ziljstra that lets a function to be executed on all CPUs. That should 
> > be extended (trivially) to execute a function on another CPU. That's 
> > all we need.
> 
> as far as the prioritization of function calls goes, _that_ makes sense, 
> but it should not be a separate API but should be done to our normal 
> workqueue APIs. That not only extends the effects of priorities to all 
> current workqueue using kernel subsystems, but also keeps the API more 
> unified. We really dont want to have too many -rt specific APIs.

I agree with you that having some kind of priority and cpu-binding
specifiers for work-queues would be very cool indeed.  However, note
that I didn't actually introduce a new API(*), per se.  I simply worked
under the existing smp_call_function[_single]() API.

Using the smp_call_functions is a critical design factor, however.  I
really want clients of this function to seamlessly transition to
threaded mode.  So even if we ultimately can modify work-queues to have
those features mentioned, IMO we still need to modify the underlying
smp_call_function API to use the new workqueue stuff.  And more
importantly, the new workqueue APIs will have to be as flexible as the
current implementation to work in various modes (e.g. in_atomic=1 or 0,
etc).

(*)Ok, ok...I admit...There is one new API:  That is the legacy access
to the real FCIPI calls (which BTW, I've changed from
"smp_call_function__nodelay" to "raw_smp_call_function" in my latest
(and unreleased) series.  This makes them more in sync with some of the
other naming conventions in -rt).  So technically you are right ;) but
that is more of an implementation detail.  There are no users of the new
API other than the VFCIPI code.

Regards,
-Greg  

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-31 12:11 Gregory Haskins
@ 2007-07-31 14:22 ` Ingo Molnar
  0 siblings, 0 replies; 14+ messages in thread
From: Ingo Molnar @ 2007-07-31 14:22 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: linux-kernel, linux-rt-users


* Gregory Haskins <ghaskins@novell.com> wrote:

> > as far as the prioritization of function calls goes, _that_ makes 
> > sense, but it should not be a separate API but should be done to our 
> > normal workqueue APIs. That not only extends the effects of 
> > priorities to all current workqueue using kernel subsystems, but 
> > also keeps the API more unified. We really dont want to have too 
> > many -rt specific APIs.
> 
> I agree with you that having some kind of priority and cpu-binding 
> specifiers for work-queues would be very cool indeed.  However, note 
> that I didn't actually introduce a new API(*), per se.  I simply 
> worked under the existing smp_call_function[_single]() API.
> 
> Using the smp_call_functions is critical design factor, however.  I 
> really want clients of this function to seamlessly transition to 
> threaded mode. [...]

well, 'clients' of this function are low-level architectural bits like 
the scheduler and the TLB flush code which stays atomic nevertheless. 
smp_call_function() is _not_ a true generic framework and to 'thread' it 
is wrong and misplaced and leads to the kind of over-complication that 
your patch shows. Please work based on the workqueue APIs.

	Ingo

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 0/2][RFC] VFCIPI v3
@ 2007-07-31 13:24 Gregory Haskins
  2007-07-31 13:24 ` [PATCH 1/2] RT: Preemptible Function-Call-IPI Support Gregory Haskins
  0 siblings, 1 reply; 14+ messages in thread
From: Gregory Haskins @ 2007-07-31 13:24 UTC (permalink / raw)
  To: linux-rt-users; +Cc: linux-kernel, ghaskins

Changelog from v2:

1) Converted "smp_call_function[_single]__nodelay" to
   "raw_smp_call_function[_single]" to match existing nomenclature in the -rt
   series.

2) Removed all PI related code from Patch #1 and moved it to #2 where it
   belonged.

Signed-off-by: Gregory Haskins <ghaskins@novell.com> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/2] RT: Preemptible Function-Call-IPI Support
  2007-07-31 13:24 [PATCH 0/2][RFC] VFCIPI v3 Gregory Haskins
@ 2007-07-31 13:24 ` Gregory Haskins
  0 siblings, 0 replies; 14+ messages in thread
From: Gregory Haskins @ 2007-07-31 13:24 UTC (permalink / raw)
  To: linux-rt-users; +Cc: linux-kernel, ghaskins

This code allows FUNCTION_CALL IPIs to become preemptible by executing
them in kthread context instead of interrupt context.  They are referred
to as "Virtual Function Call IPIs" (VFCIPI) because we no longer rely
on the actual FCIPI facility.  Instead we schedule a thread to run.  This
essentially replaces the synchronous FCIPI with an async RESCHEDULE IPI.

Since the function will be executed in kthread context, it is fully
sleepable and preemptible, thus providing more determinism.  It also allows
code that was written to expect spin_locks to work properly, even though
they may have converted to rt_mutex under the hood.  In summary, this
subsystem does for FCIPI interrupts what PREEMPT_HARDIRQs does for normal
interrupts.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 arch/i386/kernel/smpcommon.c |   16 +-
 arch/ia64/kernel/smp.c       |    8 -
 arch/powerpc/kernel/smp.c    |   12 +
 arch/x86_64/kernel/smp.c     |   18 +-
 include/linux/smp.h          |   25 ++-
 include/linux/vfcipi.h       |   10 +
 init/main.c                  |    3 
 kernel/Kconfig.preempt       |   12 +
 kernel/Makefile              |    1 
 kernel/vfcipi/Makefile       |    4 
 kernel/vfcipi/heap.c         |  136 +++++++++++++++
 kernel/vfcipi/heap.h         |   20 ++
 kernel/vfcipi/thread.c       |  372 ++++++++++++++++++++++++++++++++++++++++++
 13 files changed, 607 insertions(+), 30 deletions(-)

diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c
index 1868ae1..e352773 100644
--- a/arch/i386/kernel/smpcommon.c
+++ b/arch/i386/kernel/smpcommon.c
@@ -25,7 +25,7 @@ __cpuinit void init_gdt(int cpu)
 
 
 /**
- * smp_call_function(): Run a function on all other CPUs.
+ * raw_smp_call_function(): Run a function on all other CPUs.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @nonatomic: Unused.
@@ -39,15 +39,15 @@ __cpuinit void init_gdt(int cpu)
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
-int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
-		      int wait)
+int raw_smp_call_function(void (*func) (void *info), void *info,
+			       int nonatomic, int wait)
 {
 	return smp_call_function_mask(cpu_online_map, func, info, wait);
 }
-EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(raw_smp_call_function);
 
 /**
- * smp_call_function_single - Run a function on another CPU
+ * raw_smp_call_function_single - Run a function on another CPU
  * @cpu: The target CPU.  Cannot be the calling CPU.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
@@ -59,8 +59,8 @@ EXPORT_SYMBOL(smp_call_function);
  * If @wait is true, then returns once @func has returned; otherwise
  * it returns just before the target cpu calls @func.
  */
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-			     int nonatomic, int wait)
+int raw_smp_call_function_single(int cpu, void (*func) (void *info),
+				      void *info, int nonatomic, int wait)
 {
 	/* prevent preemption and reschedule on another processor */
 	int ret;
@@ -76,4 +76,4 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 	put_cpu();
 	return ret;
 }
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL(raw_smp_call_function_single);
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index 2256b08..3c65867 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -374,7 +374,7 @@ smp_flush_tlb_mm (struct mm_struct *mm)
  */
 
 int
-smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic,
+raw_smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic,
 			  int wait)
 {
 	struct call_data_struct data;
@@ -413,7 +413,7 @@ smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int
 	put_cpu();
 	return 0;
 }
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL(raw_smp_call_function_single);
 
 /*
  * this function sends a 'generic call function' IPI to all other CPUs
@@ -435,7 +435,7 @@ EXPORT_SYMBOL(smp_call_function_single);
  * hardware interrupt handler or from a bottom half handler.
  */
 int
-smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait)
+raw_smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait)
 {
 	struct call_data_struct data;
 	int cpus;
@@ -473,7 +473,7 @@ smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wai
 	spin_unlock(&call_lock);
 	return 0;
 }
-EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(raw_smp_call_function);
 
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c4987d9..a6cfab8 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -283,15 +283,15 @@ int smp_call_function_map(void (*func) (void *info), void *info, int nonatomic,
 	return ret;
 }
 
-int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
-			int wait)
+int raw_smp_call_function(void (*func) (void *info), void *info,
+			       int nonatomic, int wait)
 {
 	return smp_call_function_map(func,info,nonatomic,wait,cpu_online_map);
 }
-EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(raw_smp_call_function);
 
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int nonatomic,
-			int wait)
+int raw_smp_call_function_single(int cpu, void (*func) (void *info),
+				      void *info, int nonatomic, int wait)
 {
 	cpumask_t map = CPU_MASK_NONE;
 	int ret = -EBUSY;
@@ -305,7 +305,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int
 	put_cpu();
 	return ret;
 }
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL(raw_smp_call_function_single);
 
 void smp_call_function_interrupt(void)
 {
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 8cf7a0d..aa77510 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -367,7 +367,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 
 /*
- * smp_call_function_single - Run a function on another CPU
+ * raw_smp_call_function_single - Run a function on another CPU
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @nonatomic: Currently unused.
@@ -378,9 +378,9 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
  * Does not return until the remote CPU is nearly ready to execute <func>
  * or is or has executed.
  */
-
-int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
-	int nonatomic, int wait)
+int
+raw_smp_call_function_single (int cpu, void (*func) (void *info),
+				  void *info, int nonatomic, int wait)
 {
 	/* prevent preemption and reschedule on another processor */
 	int me = get_cpu();
@@ -398,7 +398,7 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
 	put_cpu();
 	return 0;
 }
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL(raw_smp_call_function_single);
 
 /*
  * this function sends a 'generic call function' IPI to all other CPUs
@@ -437,7 +437,7 @@ static void __smp_call_function (void (*func) (void *info), void *info,
 }
 
 /*
- * smp_call_function - run a function on all other CPUs.
+ * raw_smp_call_function - run a function on all other CPUs.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @nonatomic: currently unused.
@@ -451,15 +451,15 @@ static void __smp_call_function (void (*func) (void *info), void *info,
  * hardware interrupt handler or from a bottom half handler.
  * Actually there are a few legal cases, like panic.
  */
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-			int wait)
+int raw_smp_call_function (void (*func) (void *info), void *info,
+				int nonatomic, int wait)
 {
 	spin_lock(&call_lock);
 	__smp_call_function(func,info,nonatomic,wait);
 	spin_unlock(&call_lock);
 	return 0;
 }
-EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(raw_smp_call_function);
 
 static void stop_this_cpu(void *dummy)
 {
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 442f87b..d0b6d61 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -62,10 +62,29 @@ extern void smp_cpus_done(unsigned int max_cpus);
 /*
  * Call a function on all other processors
  */
-int smp_call_function(void(*func)(void *info), void *info, int retry, int wait);
 
-int smp_call_function_single(int cpuid, void (*func) (void *info), void *info,
-				int retry, int wait);
+int raw_smp_call_function(void(*func)(void *info), void *info,
+			  int retry, int wait);
+
+int raw_smp_call_function_single(int cpuid, void (*func) (void *info),
+				 void *info, int retry, int wait);
+
+#ifdef CONFIG_PREEMPT_FCIPI
+
+int smp_call_function(void(*func)(void *info), void *info,
+		      int retry, int wait);
+
+int smp_call_function_single(int cpuid, void (*func) (void *info),
+			     void *info, int retry, int wait);
+
+#else
+
+#define smp_call_function(func, info, retry, wait)  \
+       raw_smp_call_function(func, info, retry, wait)
+#define smp_call_function_single(cpuid, func, info, retry, wait) \
+       raw_smp_call_function_single(cpuid, func, info, retry, wait)
+
+#endif /* CONFIG_PREEMPT_FCIPI */
 
 /*
  * Call a function on all processors
diff --git a/include/linux/vfcipi.h b/include/linux/vfcipi.h
new file mode 100644
index 0000000..8cedf21
--- /dev/null
+++ b/include/linux/vfcipi.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_VFCIPI_H
+#define _LINUX_VFCIPI_H
+
+#ifdef CONFIG_PREEMPT_FCIPI
+extern int vfcipi_init(void);
+#else /* !CONFIG_PREEMPT_FCIPI: typed no-op so callers compile unchanged */
+static inline int vfcipi_init(void) { return 0; }
+#endif
+
+#endif /* _LINUX_VFCIPI_H */
diff --git a/init/main.c b/init/main.c
index 9829b27..ff28740 100644
--- a/init/main.c
+++ b/init/main.c
@@ -57,6 +57,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
+#include <linux/vfcipi.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -842,6 +843,8 @@ static int __init kernel_init(void * unused)
 
 	do_basic_setup();
 
+	vfcipi_init();
+
 	/*
 	 * check if there is an early userspace init.  If yes, let it do all
 	 * the work
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 8355494..f509ccf 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -120,6 +120,18 @@ config PREEMPT_HARDIRQS
 
 	  Say N if you are unsure.
 
+config PREEMPT_FCIPI
+	bool "Thread Function-Call Interprocessor Interrupts"
+	default n
+	depends on SMP
+	help
+	  This option reduces the latency of the kernel by 'threading'
+          FUNCTION_CALL IPIs. This means that all (or selected) FCIPIs will
+	  run in their own kernel thread context. While this helps latency,
+          this feature can also reduce performance.
+
+	  Say N if you are unsure.
+
 config SPINLOCK_BKL
 	bool "Old-Style Big Kernel Lock"
 	depends on (PREEMPT || SMP) && !PREEMPT_RT
diff --git a/kernel/Makefile b/kernel/Makefile
index e592de8..ab1a8ae 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_PREEMPT_RT) += rt.o
+obj-$(CONFIG_PREEMPT_FCIPI) += vfcipi/
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/vfcipi/Makefile b/kernel/vfcipi/Makefile
new file mode 100644
index 0000000..55100fa
--- /dev/null
+++ b/kernel/vfcipi/Makefile
@@ -0,0 +1,4 @@
+
+obj-y := thread.o
+obj-$(CONFIG_PREEMPT_RT) += heap.o
+
diff --git a/kernel/vfcipi/heap.c b/kernel/vfcipi/heap.c
new file mode 100644
index 0000000..5fc4c5e
--- /dev/null
+++ b/kernel/vfcipi/heap.c
@@ -0,0 +1,136 @@
+/*
+ * kernel/vfcipi/heap
+ *
+ * kmalloc(GFP_ATOMIC) is currently broken on RT.  This file implements a
+ * simple heap manager that supports true GFP_ATOMIC like guarantees in the
+ * interim.
+ *
+ * Copyright (C) 2007 Novell, Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code is licensed under the GPLv2
+ */
+
+#include <linux/sched.h>
+
+struct vfcipi_heap {
+	raw_spinlock_t   lock;
+	char            *data;
+	int              element_size;
+	struct list_head free;
+	struct list_head inuse;
+};
+
+#define VFCIPI_HEAP_MAGIC 0xf347ab23
+
+struct vfcipi_heap_item {
+	u32                 magic;
+	struct vfcipi_heap *heap;
+	struct list_head    list;
+	u8                  inuse;
+	char                data[1];
+};
+
+static __init int _vfcipi_heap_init(struct vfcipi_heap *heap,
+				    int element_size, int nr_elements)
+{
+	size_t actual_size = (element_size + sizeof(struct vfcipi_heap_item) - 1);
+	int i;
+
+	heap->data = kzalloc(actual_size * nr_elements, GFP_KERNEL);
+	if (!heap->data)
+		return -ENOMEM;
+
+	spin_lock_init(&heap->lock);
+	heap->element_size = element_size;
+	INIT_LIST_HEAD(&heap->free);
+	INIT_LIST_HEAD(&heap->inuse);
+
+	for (i = 0; i<nr_elements; ++i) {
+		struct vfcipi_heap_item *hi;
+		size_t offset = i*actual_size;
+
+		hi = (struct vfcipi_heap_item*)&heap->data[offset];
+
+		hi->magic = VFCIPI_HEAP_MAGIC;
+		hi->heap  = heap;
+		hi->inuse = 0;
+		INIT_LIST_HEAD(&hi->list);
+		list_add_tail(&hi->list, &heap->free);
+	}
+
+	return 0;
+}
+
+static void* _vfcipi_heap_alloc(struct vfcipi_heap *heap)
+{
+	void *ptr = NULL;
+	struct vfcipi_heap_item *hi;
+
+	spin_lock(&heap->lock);
+
+	if (!list_empty(&heap->free)) {
+		hi = list_first_entry(&heap->free,
+				      struct vfcipi_heap_item, list);
+		BUG_ON(!hi);
+		list_del_init(&hi->list);
+
+		ptr = &hi->data[0];
+
+		list_add_tail(&hi->list, &heap->inuse);
+		hi->inuse = 1;
+
+	}
+
+	spin_unlock(&heap->lock);
+
+	return ptr;
+}
+
+void vfcipi_heap_free(void *ptr)
+{
+	struct vfcipi_heap_item *hi;
+	struct vfcipi_heap *heap;
+
+	hi = container_of(ptr, struct vfcipi_heap_item, data);
+
+	BUG_ON(hi->magic != VFCIPI_HEAP_MAGIC);
+	BUG_ON(!hi->inuse);
+
+	heap = hi->heap;
+
+	spin_lock(&heap->lock);
+
+	list_del_init(&hi->list);
+	list_add_tail(&hi->list, &heap->free);
+	hi->inuse = 0;
+
+	spin_unlock(&heap->lock);
+}
+
+static struct vfcipi_heap vfcipi_heap;
+
+__init void vfcipi_heap_init(int element_size, int nr_elements)
+{
+	_vfcipi_heap_init(&vfcipi_heap, element_size, nr_elements);
+}
+
+void* vfcipi_heap_alloc(size_t size)
+{
+	BUG_ON(size > vfcipi_heap.element_size);
+
+	return _vfcipi_heap_alloc(&vfcipi_heap);
+}
+
+void* vfcipi_heap_zalloc(size_t size)
+{
+	void *ptr = vfcipi_heap_alloc(size);
+	if (ptr)
+		memset(ptr, 0, size);
+
+	return ptr;
+}
+
+
+
+
+
diff --git a/kernel/vfcipi/heap.h b/kernel/vfcipi/heap.h
new file mode 100644
index 0000000..3cd264e
--- /dev/null
+++ b/kernel/vfcipi/heap.h
@@ -0,0 +1,20 @@
+#ifndef _VFCIPI_HEAP_H
+#define _VFCIPI_HEAP_H
+
+#ifdef CONFIG_PREEMPT_RT
+
+void vfcipi_heap_init(int element_size, int nr_elements);
+void* vfcipi_heap_alloc(size_t);
+void* vfcipi_heap_zalloc(size_t);
+void  vfcipi_heap_free(void *);
+
+#else
+
+#define vfcipi_heap_init(element_size, nr_elements) {}
+#define vfcipi_heap_alloc(size_t size) kmalloc(size, GFP_ATOMIC);
+#define vfcipi_heap_alloc(size_t size) kzalloc(size, GFP_ATOMIC);
+#define vfcipi_heap_free(void *ptr)    kfree(ptr);
+
+#endif
+
+#endif /* _VFCIPI_HEAP_H */
diff --git a/kernel/vfcipi/thread.c b/kernel/vfcipi/thread.c
new file mode 100644
index 0000000..45bb4e2
--- /dev/null
+++ b/kernel/vfcipi/thread.c
@@ -0,0 +1,372 @@
+/*
+ * kernel/vfcipi/thread.c
+ *
+ * Preemptible Function-Call-IPI Support
+ * -------------------------------------
+ *  This code allows FUNCTION_CALL IPIs to become preemptible by executing
+ *  them in kthread context instead of interrupt context.  They are referred
+ *  to as "Virtual Function Call IPIs" (VFCIPI) because we no longer rely
+ *  on the actual FCIPI facility.  Instead we schedule a thread to run.	 This
+ *  essentially replaces the synchronous FCIPI with an async RESCHEDULE IPI.
+ *
+ *  Since the function will be executed in kthread context, it is fully
+ *  sleepable and preemptible, thus providing more determinism.	 It also allows
+ *  code that was written to expect spin_locks to work properly, even though
+ *  they may have converted to rt_mutex under the hood.	 In summary, this
+ *  subsystem does for FCIPI interrupts what PREEMPT_HARDIRQs does for normal
+ *  interrupts.
+ *
+ * Copyright (C) 2007 Novell, Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code is licensed under the GPLv2
+ */
+
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/hardirq.h>
+#include <linux/irqflags.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/err.h>
+
+#include <asm/atomic.h>
+#include <asm/cmpxchg.h>
+
+#include "heap.h"
+
+/*
+ * Counting event that a caller can wait on.
+ *
+ *  curr:      number of times the event has been signaled so far
+ *  threshold: value of curr at which a waiter is released
+ *  task:      task to wake once curr reaches threshold.  When NULL, the
+ *             waiter polls curr with cpu_relax() instead of sleeping.
+ */
+struct vfcipi_status {
+	atomic_t	    curr;
+	int		    threshold;
+	struct task_struct *task;
+};
+
+/*
+ * One function-call request, shared by every CPU it is queued to.
+ *
+ *  ref:      reference count; the item goes back to the heap when it
+ *            drops to zero
+ *  func:     function the per-CPU thread invokes
+ *  data:     argument handed to func
+ *  prio:     initialized to -1; not consumed anywhere else in this file
+ *  started:  signaled by each thread right before it calls func
+ *  finished: signaled by each thread right after func returns
+ */
+struct vfcipi_workitem {
+	atomic_t	      ref;
+	void		     (*func)(void *data);
+	void		     *data;
+	int		      prio;
+	struct vfcipi_status  started;
+	struct vfcipi_status  finished;
+};
+
+/*
+ * Per-CPU queue entry.  One is allocated for every CPU a workitem is queued
+ * to; the per-CPU thread frees it as soon as it has extracted the item.
+ *
+ *  list: link in the target CPU's request queue (vfcipi_task.rq)
+ *  item: the shared workitem to execute
+ */
+struct vfcipi_queueitem {
+	struct list_head	list;
+	struct vfcipi_workitem *item;
+};
+
+/*
+ * Per-CPU state of the function-call daemon.
+ *
+ *  lock:  held around every manipulation of rq
+ *  task:  the kthread that services this CPU's queue
+ *  rt_rq: not referenced by the code in this file
+ *  rq:    queue of pending vfcipi_queueitem entries; entries are added
+ *         at the tail and removed from the head
+ */
+struct vfcipi_task {
+	raw_spinlock_t	    lock;
+	struct task_struct *task;
+	struct prio_array   rt_rq; /* Real-time request queue */
+	struct list_head    rq;	   /* Normal request queue */
+};
+
+/* Per-CPU pointer to the daemon state; populated by vfcipi_init() */
+static DEFINE_PER_CPU(struct vfcipi_task*, vfcipi_tasks);
+
+/*
+ * ----------------------------------------
+ * vfcipi_status
+ * ----------------------------------------
+ */
+/*
+ * Reset the counter of @s and arm it to release its waiter after
+ * @threshold signals.
+ *
+ * The current task is recorded as the sleeper only if @wait is set and the
+ * caller is in a context that may sleep (not atomic, interrupts enabled).
+ * Otherwise s->task is left untouched.
+ */
+static void vfcipi_status_init(struct vfcipi_status *s, int threshold,
+			       int wait)
+{
+	s->threshold = threshold;
+	atomic_set(&s->curr, 0);
+
+	if (!wait)
+		return;
+
+	if (in_atomic() || irqs_disabled())
+		return;
+
+	s->task = current;
+}
+
+/*
+ * Record one more occurrence of the event and, if a sleeper is registered
+ * and the threshold has been reached, wake it.
+ */
+static void vfcipi_status_signal(struct vfcipi_status *s)
+{
+	int count = atomic_inc_return(&s->curr);
+
+	if (!s->task)
+		return;
+
+	if (count >= s->threshold)
+		wake_up_process(s->task);
+}
+
+/*
+ * Block until the counter of @s equals its threshold.
+ *
+ * With a registered sleeper the task state is set to TASK_UNINTERRUPTIBLE
+ * before the counter is checked and the task then calls schedule().
+ * Without one, the loop polls the counter using cpu_relax().
+ */
+static void vfcipi_status_wait(struct vfcipi_status *s)
+{
+	for (;;) {
+		if (s->task)
+			set_current_state(TASK_UNINTERRUPTIBLE);
+
+		if (atomic_read(&s->curr) == s->threshold)
+			break;
+
+		if (s->task)
+			schedule();
+		else
+			cpu_relax();
+	}
+
+	set_current_state(TASK_RUNNING);
+}
+
+/*
+ * ----------------------------------------
+ * vfcipi_workitem
+ * ----------------------------------------
+ */
+/*
+ * Allocate and initialize a workitem that will run @func(@data) on
+ * @nr_cpus CPUs.  The returned item carries one reference, owned by the
+ * caller.  Returns NULL if the heap has no free element.
+ */
+static struct vfcipi_workitem*
+vfcipi_workitem_init(void (*func)(void *data), void *data, int nr_cpus,
+		     int wait)
+{
+	struct vfcipi_workitem *w;
+
+	w = vfcipi_heap_zalloc(sizeof(*w));
+	if (!w)
+		return NULL;
+
+	w->func = func;
+	w->data = data;
+	w->prio = -1;
+	atomic_set(&w->ref, 1);
+
+	/*
+	 * There is no need to wait for both a start and a finish event; one
+	 * is enough.  Exactly one of the two is therefore armed for
+	 * waiting, selected by *wait*.
+	 */
+	vfcipi_status_init(&w->started, nr_cpus, !wait);
+	vfcipi_status_init(&w->finished, nr_cpus, wait);
+
+	return w;
+}
+
+/* Drop one reference on @item; the last one returns it to the heap */
+static void vfcipi_workitem_dropref(struct vfcipi_workitem *item)
+{
+	if (!atomic_dec_and_test(&item->ref))
+		return;
+
+	vfcipi_heap_free(item);
+}
+
+/*
+ * Wait for @item to make progress on all target CPUs.
+ *
+ * When the caller asked to wait, block until the function has completed
+ * everywhere.  When it did not, still block until execution has at least
+ * started everywhere: this is how the standard IPI based FUNCTION_CALL
+ * works, so that behavior is replicated.
+ */
+static void vfcipi_workitem_wait(struct vfcipi_workitem *item, int wait)
+{
+	struct vfcipi_status *event;
+
+	event = wait ? &item->finished : &item->started;
+
+	vfcipi_status_wait(event);
+}
+
+/*
+ * ----------------------------------------
+ * vfcipi_thread - daemon process for vfcipi per CPU
+ * ----------------------------------------
+ */
+/*
+ * Main loop of the per-CPU daemon.  @data is unused.
+ *
+ * The thread looks up the vfcipi_task of the CPU it is running on once, at
+ * startup, and then loops forever: pop the oldest queue entry, run the
+ * requested function, signal the started/finished events around it and
+ * drop the queue's reference on the workitem.  The loop has no exit path
+ * and does not check kthread_should_stop().
+ */
+static int vfcipi_thread(void *data)
+{
+	struct vfcipi_task *ftask = per_cpu(vfcipi_tasks,
+					    raw_smp_processor_id());
+
+	while (1) {
+		struct vfcipi_workitem *item;
+		struct vfcipi_queueitem *qi = NULL;
+
+		spin_lock(&ftask->lock);
+		
+		if (!list_empty(&ftask->rq)) {
+			qi = list_first_entry(&ftask->rq,
+					      struct vfcipi_queueitem,
+					      list);
+			BUG_ON(!qi);
+			list_del(&qi->list);
+		}
+
+		if (!qi) {
+			/* Nothing to process for now.. */
+			/*
+			 * The task state is changed while the lock is still
+			 * held.  vfcipi_enqueue() adds its entry and issues
+			 * the wakeup under the same lock, so an enqueue
+			 * cannot slip in between the empty check above and
+			 * the state change.
+			 */
+			set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock(&ftask->lock);
+			schedule();
+			continue;
+		}
+
+		spin_unlock(&ftask->lock);
+
+		/*
+		 * Extract the real pointer and discard the queueitem shell.
+		 * We no longer need it.
+		 */
+		item = qi->item;
+		vfcipi_heap_free(qi);
+
+		/*
+		 * Execute the actual user-provided function
+		 */
+		vfcipi_status_signal(&item->started);
+		item->func(item->data);
+		vfcipi_status_signal(&item->finished);
+
+		/* Release the reference taken by vfcipi_enqueue() */
+		vfcipi_workitem_dropref(item);
+	}
+}
+
+/*
+ * ----------------------------------------
+ * client side code
+ * ----------------------------------------
+ */
+/*
+ * Queue @item on the daemon of @cpu and wake that daemon.
+ *
+ * Returns 0 on success or -ENOMEM if no queue entry could be allocated.
+ * On success the item carries one additional reference, which the daemon
+ * drops when it has finished processing the entry.
+ */
+static int vfcipi_enqueue(struct vfcipi_workitem *item, int cpu)
+{
+	struct vfcipi_task *target = per_cpu(vfcipi_tasks, cpu);
+	struct vfcipi_queueitem *entry;
+
+	entry = vfcipi_heap_alloc(sizeof(*entry));
+
+	BUG_ON(!target);
+
+	if (!entry)
+		return -ENOMEM;
+
+	entry->item = item;
+	INIT_LIST_HEAD(&entry->list);
+
+	/* The queue's reference must exist before the entry is visible */
+	atomic_inc(&item->ref);
+
+	spin_lock(&target->lock);
+	list_add_tail(&entry->list, &target->rq);
+	wake_up_process(target->task);
+	spin_unlock(&target->lock);
+
+	return 0;
+}
+
+/*
+ * Threaded implementation of smp_call_function_single(): run @func(@data)
+ * on @cpu from that CPU's daemon thread.
+ *
+ * With @wait set, this returns after @func has completed; otherwise it
+ * returns once @func has at least been started.  @nonatomic is accepted
+ * for interface compatibility and is not used.
+ *
+ * Returns 0 on success or -ENOMEM if the workitem or its queue entry could
+ * not be allocated, in which case @func is not called.
+ */
+static int vfcipi_call_function_single(int cpu, void (*func)(void *data),
+				      void *data, int nonatomic, int wait)
+{
+	struct vfcipi_workitem *item;
+	int ret;
+
+	item = vfcipi_workitem_init(func, data, 1, wait);
+	if (!item)
+		/* Nothing was allocated, so there is nothing to drop */
+		return -ENOMEM;
+
+	ret = vfcipi_enqueue(item, cpu);
+	if (ret < 0)
+		goto out;
+
+	vfcipi_workitem_wait(item, wait);
+
+ out:
+	/* We are finished with the reference in this context */
+	vfcipi_workitem_dropref(item);
+
+	return ret;
+}
+
+/*
+ * Threaded implementation of smp_call_function(): run @func(@data) on
+ * every online CPU except the calling one, from the per-CPU daemon threads.
+ *
+ * With @wait set, this returns after @func has completed on all CPUs it
+ * was queued to; otherwise it returns once @func has at least been started
+ * on all of them.  @nonatomic is accepted for interface compatibility and
+ * is not used.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation failed.  If the failure
+ * happens part way through, @func still runs on the CPUs that were already
+ * queued, and this function still waits for them before returning, so
+ * @data is not released by the caller while a remote CPU may be using it.
+ *
+ * NOTE(review): the online CPU count and the iteration below are not
+ * protected against CPU hotplug or migration of the caller here -- confirm
+ * that callers guarantee this.
+ */
+static int vfcipi_call_function(void (*func)(void *data), void *data,
+			       int nonatomic, int wait)
+{
+	struct vfcipi_workitem *item;
+	int ret = 0;
+	int cpu;
+	int mycpu = raw_smp_processor_id();
+	int nr_cpus = num_online_cpus()-1;
+	int queued = 0;
+
+	item = vfcipi_workitem_init(func, data, nr_cpus, wait);
+	if (!item)
+		/* Nothing was allocated, so there is nothing to drop */
+		return -ENOMEM;
+
+	for_each_online_cpu(cpu) {
+		if (cpu == mycpu)
+			continue;
+
+		ret = vfcipi_enqueue(item, cpu);
+		if (ret < 0)
+			break;
+
+		queued++;
+	}
+
+	/*
+	 * The events were armed for nr_cpus signals.  Stand in for every
+	 * CPU that was not queued so that the wait below terminates once
+	 * all the CPUs that *were* queued have reported in.
+	 */
+	for (; queued < nr_cpus; queued++) {
+		vfcipi_status_signal(&item->started);
+		vfcipi_status_signal(&item->finished);
+	}
+
+	vfcipi_workitem_wait(item, wait);
+
+	/* We are finished with the reference in this context */
+	vfcipi_workitem_dropref(item);
+
+	return ret;
+}
+
+/*
+ * Dispatch table behind smp_call_function_single() and
+ * smp_call_function().
+ *
+ *  call_single:     run func(data) on one specific CPU
+ *  call_allbutself: run func(data) on the other CPUs
+ */
+struct vfcipi_vtable {
+	int (*call_single)(int cpu, void (*func)(void *data),
+			   void *data, int nonatomic, int wait);
+	int (*call_allbutself)(void (*func)(void *data), void *data,
+			       int nonatomic, int wait);
+};
+
+/* Dispatches to the raw_smp_call_function*() implementations */
+static struct vfcipi_vtable raw_vtable = {
+	.call_single	 = raw_smp_call_function_single,
+	.call_allbutself = raw_smp_call_function
+};
+
+/* Dispatches to the kthread based implementations in this file */
+static struct vfcipi_vtable threaded_vtable = {
+	.call_single	 = vfcipi_call_function_single,
+	.call_allbutself = vfcipi_call_function
+};
+
+/*
+ * By default the system will fall back on the __raw implementation
+ * since the __threaded version will not be online until the vfcipi_init()
+ * function has a chance to run
+ */
+static struct vfcipi_vtable *vfcipi_vtable = &raw_vtable;
+
+/*
+ * Run @func(@data) on @cpu through whichever implementation is currently
+ * active, and return that implementation's result.
+ */
+int smp_call_function_single(int cpu, void (*func)(void *data),
+			     void *data, int nonatomic, int wait)
+{
+	struct vfcipi_vtable *ops = vfcipi_vtable;
+
+	return ops->call_single(cpu, func, data, nonatomic, wait);
+}
+EXPORT_SYMBOL(smp_call_function_single);
+
+/*
+ * Run @func(@data) on the other CPUs through whichever implementation is
+ * currently active, and return that implementation's result.
+ */
+int smp_call_function(void (*func)(void *data), void *data,
+		      int nonatomic, int wait)
+{
+	struct vfcipi_vtable *ops = vfcipi_vtable;
+
+	return ops->call_allbutself(func, data, nonatomic, wait);
+}
+EXPORT_SYMBOL(smp_call_function);
+
+/*
+ * Bring up the threaded function-call facility.
+ *
+ * Sets up the element heap (sized for struct vfcipi_workitem; the smaller
+ * queue entries are allocated from it as well), creates one bound daemon
+ * thread per present CPU and finally switches smp_call_function*() over to
+ * the threaded implementation.
+ *
+ * Setup is done in phases so that a failure can always be unwound: no
+ * thread is woken until every allocation and every thread creation has
+ * succeeded.  A thread that was created but never woken has not entered
+ * vfcipi_thread() yet, so it can still be reaped with kthread_stop().
+ *
+ * Returns 0 on success, -ENOMEM if per-CPU state could not be allocated, or
+ * the error reported by kthread_create().  On failure the raw
+ * implementation stays active.
+ */
+int __init vfcipi_init(void)
+{
+	struct vfcipi_task *ftask;
+	int ret = -ENOMEM;
+	int cpu;
+
+	vfcipi_heap_init(sizeof(struct vfcipi_workitem), 4096);
+
+	/* Phase 1: per-CPU state.  No threads exist yet. */
+	for_each_present_cpu(cpu) {
+		ftask = kzalloc(sizeof(*ftask), GFP_KERNEL);
+		if (!ftask)
+			goto out_free;
+
+		spin_lock_init(&ftask->lock);
+		INIT_LIST_HEAD(&ftask->rq);
+		per_cpu(vfcipi_tasks, cpu) = ftask;
+	}
+
+	/* Phase 2: create and bind the threads, but do not start them */
+	for_each_present_cpu(cpu) {
+		struct task_struct *task;
+
+		task = kthread_create(vfcipi_thread, NULL,
+				      "vfcipi/%d", cpu);
+		if (IS_ERR(task)) {
+			ret = PTR_ERR(task);
+			goto out_stop;
+		}
+
+		kthread_bind(task, cpu);
+		per_cpu(vfcipi_tasks, cpu)->task = task;
+	}
+
+	/* Phase 3: nothing can fail anymore, let the threads run */
+	for_each_present_cpu(cpu)
+		wake_up_process(per_cpu(vfcipi_tasks, cpu)->task);
+
+	/* Now atomically switch to threaded mode */
+	xchg(&vfcipi_vtable, &threaded_vtable);
+
+	return 0;
+
+ out_stop:
+	/* ->task is still NULL for CPUs whose thread was never created */
+	for_each_present_cpu(cpu) {
+		ftask = per_cpu(vfcipi_tasks, cpu);
+		if (ftask->task)
+			kthread_stop(ftask->task);
+	}
+
+ out_free:
+	for_each_present_cpu(cpu) {
+		kfree(per_cpu(vfcipi_tasks, cpu));
+		per_cpu(vfcipi_tasks, cpu) = NULL;
+	}
+
+	return ret;
+}

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2007-07-31 20:36 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-07-30 21:15 [PATCH 0/2] VFCIPI support v2 Gregory Haskins
2007-07-30 21:15 ` [PATCH 1/2] RT: Preemptible Function-Call-IPI Support Gregory Haskins
2007-07-31  9:19   ` Ingo Molnar
2007-07-31  9:21     ` Ingo Molnar
2007-07-31  9:25       ` Ingo Molnar
2007-07-31 14:26         ` Gregory Haskins
2007-07-31 14:26         ` Gregory Haskins
2007-07-31 20:14           ` Gregory Haskins
2007-07-30 21:15 ` [PATCH 2/2] RT: Add priority inheritance to the VFCIPI facility Gregory Haskins
2007-07-30 21:34 ` [PATCH 0/2] VFCIPI support v2 Daniel Walker
  -- strict thread matches above, loose matches on Subject: below --
2007-07-31 11:44 [PATCH 1/2] RT: Preemptible Function-Call-IPI Support Gregory Haskins
2007-07-31 12:11 Gregory Haskins
2007-07-31 14:22 ` Ingo Molnar
2007-07-31 13:24 [PATCH 0/2][RFC] VFCIPI v3 Gregory Haskins
2007-07-31 13:24 ` [PATCH 1/2] RT: Preemptible Function-Call-IPI Support Gregory Haskins

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.