[PATCH] 2.4.0-prerelease: preemptive kernel.

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] 2.4.0-prerelease: preemptive kernel.
@ 2001-01-04  1:56 ludovic fernandez
  2001-01-04  7:35 ` Daniel Phillips
                   ` (3 more replies)
  0 siblings, 4 replies; 25+ messages in thread
From: ludovic fernandez @ 2001-01-04  1:56 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 724 bytes --]

Hello,

For hackers,
The following patch makes the kernel preemptable.
It is against 2.4.0-prerelease on for i386 only.
It should work for UP and SMP even though I
didn't validate it on SMP.
Comments are welcome.

NOTES: since the lock implementation is modified,
you need obviously to re-compile all your modules.
I introduced a dependency between spinlock.h and sched.h
and this has some bad side effects: Some files will
generate warnings during the compilation complaining
that disable_preempt()/enable_preempt() are not defined.
The warnings should be harmless but I was too lazy to
fix all of them. If the compilation fails because of
that, there is big chance to fix things by including
sched.h in the cfile.

Ludo.


[-- Attachment #2: preempt.patch --]
[-- Type: text/plain, Size: 25240 bytes --]

diff -u --recursive linux-2.4-prerelease.org/arch/i386/kernel/apic.c linux-2.4-prerelease/arch/i386/kernel/apic.c
--- linux-2.4-prerelease.org/arch/i386/kernel/apic.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/arch/i386/kernel/apic.c	Wed Jan  3 12:58:57 2001
@@ -726,6 +726,7 @@
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
 	irq_enter(cpu, 0);
+	disable_preempt();
 	smp_local_timer_interrupt(regs);
 	irq_exit(cpu, 0);
 }
@@ -746,6 +747,8 @@
 	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
 		ack_APIC_irq();
 
+	disable_preempt();
+
 	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
 	printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n",
 			smp_processor_id());
@@ -776,6 +779,9 @@
 	   6: Received illegal vector
 	   7: Illegal register address
 	*/
+
+	disable_preempt();
+
 	printk (KERN_ERR "APIC error on CPU%d: %02lx(%02lx)\n",
 	        smp_processor_id(), v , v1);
 }
diff -u --recursive linux-2.4-prerelease.org/arch/i386/kernel/entry.S linux-2.4-prerelease/arch/i386/kernel/entry.S
--- linux-2.4-prerelease.org/arch/i386/kernel/entry.S	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/arch/i386/kernel/entry.S	Wed Jan  3 12:58:57 2001
@@ -79,6 +79,8 @@
 need_resched	= 20
 tsk_ptrace	= 24
 processor	= 52
+preemptable = 56
+
 
 ENOSYS = 38
 
@@ -203,6 +205,9 @@
 	call *SYMBOL_NAME(sys_call_table)(,%eax,4)
 	movl %eax,EAX(%esp)		# save the return value
 ENTRY(ret_from_sys_call)
+	movl $1, %edx
+	lock
+	xaddl %edx, preemptable(%ebx)
 #ifdef CONFIG_SMP
 	movl processor(%ebx),%eax
 	shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
@@ -213,13 +218,22 @@
 	testl SYMBOL_NAME(irq_stat)+4,%ecx	# softirq_mask
 #endif
 	jne   handle_softirq
-	
+	cmpl $0, %edx			# task is preemptable ?
+	jne check_signal
 ret_with_reschedule:
 	cmpl $0,need_resched(%ebx)
 	jne reschedule
+check_signal:
+#if 0	
+	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
+	movb CS(%esp), %al
+	testl $(VM_MASK | 3), %eax	# return to user mode ?
+	je restore_all			# no bypass signal check
+#endif	
 	cmpl $0,sigpending(%ebx)
 	jne signal_return
 restore_all:
+	decl preemptable(%ebx)	
 	RESTORE_ALL
 
 	ALIGN
@@ -270,14 +284,22 @@
 #endif
 	jne   handle_softirq
 
+/*
+ * ret_from_intr is the common path used to return
+ * from interruptions (either hard of soft) and exceptions.
+ * At that point the preemption is disabled 
+ * (see do_IRQ and handle_softirq)
+ * Reenable the preemption, verify that the current thread 
+ * is preemptable and check for a pending scheduling request.
+ */ 		
 ENTRY(ret_from_intr)
 	GET_CURRENT(%ebx)
-	movl EFLAGS(%esp),%eax		# mix EFLAGS and CS
-	movb CS(%esp),%al
-	testl $(VM_MASK | 3),%eax	# return to VM86 mode or non-supervisor?
-	jne ret_with_reschedule
-	jmp restore_all
-
+	cmpl $1, preemptable(%ebx)
+	jne  restore_all
+	cmpl $0, state(%ebx)		# current task is running ?
+	jne restore_all
+	jmp ret_with_reschedule
+	
 	ALIGN
 handle_softirq:
 	call SYMBOL_NAME(do_softirq)
@@ -286,6 +308,7 @@
 	ALIGN
 reschedule:
 	call SYMBOL_NAME(schedule)    # test
+	decl preemptable(%ebx)
 	jmp ret_from_sys_call
 
 ENTRY(divide_error)
@@ -316,6 +339,13 @@
 	movl %edx,%ds
 	movl %edx,%es
 	GET_CURRENT(%ebx)
+/*
+ * All exceptions are called with the preemption disabled.
+ * In addition, some of them (page_fault) are not reentrant 
+ * and need to be atomic until the preemption can be disabled.
+ */
+	incl preemptable(%ebx)
+	sti	
 	call *%edi
 	addl $8,%esp
 	jmp ret_from_exception
@@ -334,6 +364,7 @@
 	pushl $-1		# mark this as an int
 	SAVE_ALL
 	GET_CURRENT(%ebx)
+	incl preemptable(%ebx)
 	pushl $ret_from_exception
 	movl %cr0,%eax
 	testl $0x4,%eax			# EM (math emulation bit)
diff -u --recursive linux-2.4-prerelease.org/arch/i386/kernel/irq.c linux-2.4-prerelease/arch/i386/kernel/irq.c
--- linux-2.4-prerelease.org/arch/i386/kernel/irq.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/arch/i386/kernel/irq.c	Wed Jan  3 12:58:57 2001
@@ -564,6 +564,12 @@
 	unsigned int status;
 
 	kstat.irqs[cpu][irq]++;
+	/*
+	 * Disable preemption for the current task.
+	 * ret_from_intr will reenable the preemption and
+	 * check for a scheduling request.
+	 */
+	disable_preempt();
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
 	/*
diff -u --recursive linux-2.4-prerelease.org/arch/i386/kernel/smp.c linux-2.4-prerelease/arch/i386/kernel/smp.c
--- linux-2.4-prerelease.org/arch/i386/kernel/smp.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/arch/i386/kernel/smp.c	Wed Jan  3 12:58:57 2001
@@ -277,6 +277,8 @@
 {
 	unsigned long cpu = smp_processor_id();
 
+	disable_preempt();
+
 	if (!test_bit(cpu, &flush_cpumask))
 		return;
 		/* 
@@ -518,6 +520,7 @@
 asmlinkage void smp_reschedule_interrupt(void)
 {
 	ack_APIC_irq();
+	disable_preempt();
 }
 
 asmlinkage void smp_call_function_interrupt(void)
@@ -532,6 +535,7 @@
 	 * about to execute the function
 	 */
 	atomic_inc(&call_data->started);
+	disable_preempt();
 	/*
 	 * At this point the info structure may be out of scope unless wait==1
 	 */
diff -u --recursive linux-2.4-prerelease.org/arch/i386/kernel/traps.c linux-2.4-prerelease/arch/i386/kernel/traps.c
--- linux-2.4-prerelease.org/arch/i386/kernel/traps.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/arch/i386/kernel/traps.c	Wed Jan  3 12:58:57 2001
@@ -958,7 +958,7 @@
 	set_trap_gate(11,&segment_not_present);
 	set_trap_gate(12,&stack_segment);
 	set_trap_gate(13,&general_protection);
-	set_trap_gate(14,&page_fault);
+	set_intr_gate(14,&page_fault);
 	set_trap_gate(15,&spurious_interrupt_bug);
 	set_trap_gate(16,&coprocessor_error);
 	set_trap_gate(17,&alignment_check);
diff -u --recursive linux-2.4-prerelease.org/arch/i386/lib/dec_and_lock.c linux-2.4-prerelease/arch/i386/lib/dec_and_lock.c
--- linux-2.4-prerelease.org/arch/i386/lib/dec_and_lock.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/arch/i386/lib/dec_and_lock.c	Wed Jan  3 12:58:57 2001
@@ -9,6 +9,7 @@
 
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
+#include <linux/sched.h>
 
 int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 {
diff -u --recursive linux-2.4-prerelease.org/arch/i386/mm/fault.c linux-2.4-prerelease/arch/i386/mm/fault.c
--- linux-2.4-prerelease.org/arch/i386/mm/fault.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/arch/i386/mm/fault.c	Wed Jan  3 12:58:57 2001
@@ -112,6 +112,8 @@
 	unsigned long page;
 	unsigned long fixup;
 	int write;
+	int ret;
+
 	siginfo_t info;
 
 	/* get the address */
@@ -193,7 +195,17 @@
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
+
+	/*
+	 * Re-enable the preemption before calling the generic handler.
+	 * This is rather for fun and to validate things a bit since
+	 * the mm semaphore is hold at that point and that can cause
+	 * a lot of contentions.
+	 */
+	enable_preempt();
+	ret = handle_mm_fault(mm, vma, address, write);
+	disable_preempt();
+	switch (ret) {
 	case 1:
 		tsk->min_flt++;
 		break;
diff -u --recursive linux-2.4-prerelease.org/drivers/pcmcia/ds.c linux-2.4-prerelease/drivers/pcmcia/ds.c
--- linux-2.4-prerelease.org/drivers/pcmcia/ds.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/drivers/pcmcia/ds.c	Wed Jan  3 12:58:57 2001
@@ -880,7 +880,16 @@
     int i, ret;
     
     DEBUG(0, "%s\n", version);
- 
+#if 1
+    /*
+     * I got some problems with PCMCIA initialization and a
+     * preemptive kernel;
+     * init_pcmcia_ds() beeing called before the completion
+     * of pending scheduled tasks. I don't know if this is the
+     * right fix though.
+     */
+    flush_scheduled_tasks();
+#endif
     /*
      * Ugly. But we want to wait for the socket threads to have started up.
      * We really should let the drivers themselves drive some of this..
diff -u --recursive linux-2.4-prerelease.org/include/asm-i386/smplock.h linux-2.4-prerelease/include/asm-i386/smplock.h
--- linux-2.4-prerelease.org/include/asm-i386/smplock.h	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/include/asm-i386/smplock.h	Wed Jan  3 17:29:36 2001
@@ -17,6 +17,7 @@
  */
 #define release_kernel_lock(task, cpu) \
 do { \
+	disable_preempt(); \
 	if (task->lock_depth >= 0) \
 		spin_unlock(&kernel_flag); \
 	release_irqlock(cpu); \
@@ -30,6 +31,7 @@
 do { \
 	if (task->lock_depth >= 0) \
 		spin_lock(&kernel_flag); \
+	enable_preempt(); \
 } while (0)
 
 
@@ -43,6 +45,7 @@
 extern __inline__ void lock_kernel(void)
 {
 #if 1
+	disable_preempt();
 	if (!++current->lock_depth)
 		spin_lock(&kernel_flag);
 #else
@@ -63,6 +66,7 @@
 #if 1
 	if (--current->lock_depth < 0)
 		spin_unlock(&kernel_flag);
+	enable_preempt();
 #else
 	__asm__ __volatile__(
 		"decl %1\n\t"
diff -u --recursive linux-2.4-prerelease.org/include/asm-i386/softirq.h linux-2.4-prerelease/include/asm-i386/softirq.h
--- linux-2.4-prerelease.org/include/asm-i386/softirq.h	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/include/asm-i386/softirq.h	Wed Jan  3 14:21:58 2001
@@ -7,8 +7,10 @@
 #define cpu_bh_disable(cpu)	do { local_bh_count(cpu)++; barrier(); } while (0)
 #define cpu_bh_enable(cpu)	do { barrier(); local_bh_count(cpu)--; } while (0)
 
-#define local_bh_disable()	cpu_bh_disable(smp_processor_id())
-#define local_bh_enable()	cpu_bh_enable(smp_processor_id())
+#define local_bh_disable()	\
+do { disable_preempt(); cpu_bh_disable(smp_processor_id()); } while (0)
+#define local_bh_enable()	\
+do { cpu_bh_enable(smp_processor_id()); enable_preempt(); } while (0)
 
 #define in_softirq() (local_bh_count(smp_processor_id()) != 0)
 
diff -u --recursive linux-2.4-prerelease.org/include/asm-i386/spinlock.h linux-2.4-prerelease/include/asm-i386/spinlock.h
--- linux-2.4-prerelease.org/include/asm-i386/spinlock.h	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/include/asm-i386/spinlock.h	Wed Jan  3 14:21:58 2001
@@ -65,7 +65,7 @@
 #define spin_unlock_string \
 	"movb $1,%0"
 
-static inline int spin_trylock(spinlock_t *lock)
+static inline int _spin_trylock(spinlock_t *lock)
 {
 	char oldval;
 	__asm__ __volatile__(
@@ -75,7 +75,7 @@
 	return oldval > 0;
 }
 
-static inline void spin_lock(spinlock_t *lock)
+static inline void _spin_lock(spinlock_t *lock)
 {
 #if SPINLOCK_DEBUG
 	__label__ here;
@@ -90,7 +90,7 @@
 		:"=m" (lock->lock) : : "memory");
 }
 
-static inline void spin_unlock(spinlock_t *lock)
+static inline void _spin_unlock(spinlock_t *lock)
 {
 #if SPINLOCK_DEBUG
 	if (lock->magic != SPINLOCK_MAGIC)
@@ -143,7 +143,7 @@
  */
 /* the spinlock helpers are in arch/i386/kernel/semaphore.c */
 
-static inline void read_lock(rwlock_t *rw)
+static inline void _read_lock(rwlock_t *rw)
 {
 #if SPINLOCK_DEBUG
 	if (rw->magic != RWLOCK_MAGIC)
@@ -152,7 +152,7 @@
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-static inline void write_lock(rwlock_t *rw)
+static inline void _write_lock(rwlock_t *rw)
 {
 #if SPINLOCK_DEBUG
 	if (rw->magic != RWLOCK_MAGIC)
@@ -161,8 +161,8 @@
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-#define read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
-#define write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
+#define _read_unlock(rw)	asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
+#define _write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
 
 static inline int write_trylock(rwlock_t *lock)
 {
diff -u --recursive linux-2.4-prerelease.org/include/asm-i386/system.h linux-2.4-prerelease/include/asm-i386/system.h
--- linux-2.4-prerelease.org/include/asm-i386/system.h	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/include/asm-i386/system.h	Wed Jan  3 14:21:58 2001
@@ -306,6 +306,13 @@
 #define local_irq_disable()	__cli()
 #define local_irq_enable()	__sti()
 
+static inline int local_irq_are_enabled(void)
+{
+	unsigned long flags;
+	__save_flags(flags);
+	return (flags & 0x00000200);
+}
+
 #ifdef CONFIG_SMP
 
 extern void __global_cli(void);
diff -u --recursive linux-2.4-prerelease.org/include/linux/sched.h linux-2.4-prerelease/include/linux/sched.h
--- linux-2.4-prerelease.org/include/linux/sched.h	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/include/linux/sched.h	Wed Jan  3 15:31:41 2001
@@ -296,6 +296,7 @@
 	unsigned long policy;
 	struct mm_struct *mm;
 	int has_cpu, processor;
+	atomic_t preemptable;
 	unsigned long cpus_allowed;
 	/*
 	 * (only the 'next' pointer fits into the cacheline, but
@@ -443,6 +444,7 @@
     policy:		SCHED_OTHER,					\
     mm:			NULL,						\
     active_mm:		&init_mm,					\
+    preemptable:	ATOMIC_INIT(0),					\
     cpus_allowed:	-1,						\
     run_list:		LIST_HEAD_INIT(tsk.run_list),			\
     next_task:		&tsk,						\
@@ -524,6 +526,7 @@
 extern void free_uid(struct user_struct *);
 
 #include <asm/current.h>
+#include <asm/hardirq.h>
 
 extern unsigned long volatile jiffies;
 extern unsigned long itimer_ticks;
@@ -634,6 +637,41 @@
 {
 	return (current->sas_ss_size == 0 ? SS_DISABLE
 		: on_sig_stack(sp) ? SS_ONSTACK : 0);
+}
+
+static inline void disable_preempt(void)
+{
+	atomic_inc(&current->preemptable);
+}
+
+static inline void enable_preempt(void)
+{
+	if (atomic_read(&current->preemptable) <= 0) {
+		BUG();
+	}
+	if (atomic_read(&current->preemptable) == 1) {
+		/*
+		 * At that point a scheduling is healthy iff:
+		 * - a scheduling request is pending.
+		 * - the task is in running state.
+		 * - this is not an interrupt context.
+		 * - local interrupts are enabled.
+		 */
+		if (current->need_resched == 1     &&
+		    current->state == TASK_RUNNING &&
+		    !in_interrupt()                &&
+		    local_irq_are_enabled())
+		{
+			schedule();
+		}
+	}
+	atomic_dec(&current->preemptable);
+}	
+
+static inline int preemptable(void)
+{
+	return (!in_interrupt() && 
+		!atomic_read(&current->preemptable));
 }
 
 extern int request_irq(unsigned int,
diff -u --recursive linux-2.4-prerelease.org/include/linux/smp_lock.h linux-2.4-prerelease/include/linux/smp_lock.h
--- linux-2.4-prerelease.org/include/linux/smp_lock.h	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/include/linux/smp_lock.h	Wed Jan  3 17:30:04 2001
@@ -5,11 +5,37 @@
 
 #ifndef CONFIG_SMP
 
-#define lock_kernel()				do { } while(0)
-#define unlock_kernel()				do { } while(0)
-#define release_kernel_lock(task, cpu)		do { } while(0)
-#define reacquire_kernel_lock(task)		do { } while(0)
-#define kernel_locked() 1
+/*
+ * Release global kernel lock.
+ * Regarding preemption, this actually does the reverse -
+ */
+#define release_kernel_lock(task, cpu) \
+do { \
+	disable_preempt();  \
+} while (0)
+
+/*
+ * Re-acquire the kernel lock
+ * Re-enable the preemption - see comments above.
+ * Note: enable_preempt() cannot not be called at
+ * that point (otherwise schedule() becomes reentrant). 
+ */
+#define reacquire_kernel_lock(task) \
+do { \
+	atomic_dec(&current->preemptable); \
+} while (0)
+
+#define lock_kernel() \
+do { \
+	disable_preempt(); \
+} while(0); 
+
+#define unlock_kernel() \
+do { \
+	enable_preempt(); \
+} while (0)
+
+#define kernel_locked()	(!preemptable())
 
 #else
 
diff -u --recursive linux-2.4-prerelease.org/include/linux/spinlock.h linux-2.4-prerelease/include/linux/spinlock.h
--- linux-2.4-prerelease.org/include/linux/spinlock.h	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/include/linux/spinlock.h	Wed Jan  3 14:21:58 2001
@@ -3,33 +3,72 @@
 
 #include <linux/config.h>
 
+static inline void disable_preempt(void);
+static inline void enable_preempt(void);
+
 /*
  * These are the generic versions of the spinlocks and read-write
  * locks..
  */
-#define spin_lock_irqsave(lock, flags)		do { local_irq_save(flags);       spin_lock(lock); } while (0)
-#define spin_lock_irq(lock)			do { local_irq_disable();         spin_lock(lock); } while (0)
-#define spin_lock_bh(lock)			do { local_bh_disable();          spin_lock(lock); } while (0)
-
-#define read_lock_irqsave(lock, flags)		do { local_irq_save(flags);       read_lock(lock); } while (0)
-#define read_lock_irq(lock)			do { local_irq_disable();         read_lock(lock); } while (0)
-#define read_lock_bh(lock)			do { local_bh_disable();          read_lock(lock); } while (0)
-
-#define write_lock_irqsave(lock, flags)		do { local_irq_save(flags);      write_lock(lock); } while (0)
-#define write_lock_irq(lock)			do { local_irq_disable();        write_lock(lock); } while (0)
-#define write_lock_bh(lock)			do { local_bh_disable();         write_lock(lock); } while (0)
-
-#define spin_unlock_irqrestore(lock, flags)	do { spin_unlock(lock);  local_irq_restore(flags); } while (0)
-#define spin_unlock_irq(lock)			do { spin_unlock(lock);  local_irq_enable();       } while (0)
-#define spin_unlock_bh(lock)			do { spin_unlock(lock);  local_bh_enable();        } while (0)
-
-#define read_unlock_irqrestore(lock, flags)	do { read_unlock(lock);  local_irq_restore(flags); } while (0)
-#define read_unlock_irq(lock)			do { read_unlock(lock);  local_irq_enable();       } while (0)
-#define read_unlock_bh(lock)			do { read_unlock(lock);  local_bh_enable();        } while (0)
-
-#define write_unlock_irqrestore(lock, flags)	do { write_unlock(lock); local_irq_restore(flags); } while (0)
-#define write_unlock_irq(lock)			do { write_unlock(lock); local_irq_enable();       } while (0)
-#define write_unlock_bh(lock)			do { write_unlock(lock); local_bh_enable();        } while (0)
+#define spin_lock_irqsave(lock, flags)	\
+	do { disable_preempt(); local_irq_save(flags); _spin_lock(lock); } while (0)
+#define spin_lock_irq(lock) \
+	do { disable_preempt(); local_irq_disable(); _spin_lock(lock); } while (0)
+#define spin_lock_bh(lock) \
+	do { disable_preempt(); local_bh_disable(); _spin_lock(lock); } while (0)
+
+#define read_lock_irqsave(lock, flags) \
+	do { disable_preempt(); local_irq_save(flags); _read_lock(lock); } while (0)
+#define read_lock_irq(lock) \
+	do { disable_preempt(); local_irq_disable(); _read_lock(lock); } while (0)
+#define read_lock_bh(lock) \
+	do { disable_preempt(); local_bh_disable(); _read_lock(lock); } while (0)
+
+#define write_lock_irqsave(lock, flags)	\
+	do { disable_preempt(); local_irq_save(flags); _write_lock(lock); } while (0)
+#define write_lock_irq(lock) \
+	do { disable_preempt(); local_irq_disable(); _write_lock(lock); } while (0)
+#define write_lock_bh(lock) \
+	do { disable_preempt(); local_bh_disable(); _write_lock(lock); } while (0)
+
+#define spin_unlock_irqrestore(lock, flags) \
+	do { _spin_unlock(lock); local_irq_restore(flags); enable_preempt(); } while (0)
+#define spin_unlock_irq(lock) \
+	do { _spin_unlock(lock); local_irq_enable(); enable_preempt(); } while (0)
+#define spin_unlock_bh(lock) \
+	do { _spin_unlock(lock); local_bh_enable(); enable_preempt(); } while (0)
+
+#define read_unlock_irqrestore(lock, flags) \
+	do { _read_unlock(lock); local_irq_restore(flags); enable_preempt(); } while (0)
+#define read_unlock_irq(lock) \
+	do { _read_unlock(lock); local_irq_enable(); enable_preempt(); } while (0)
+#define read_unlock_bh(lock) \
+	do { _read_unlock(lock); local_bh_enable(); enable_preempt(); } while (0)
+
+#define write_unlock_irqrestore(lock, flags) \
+	do { _write_unlock(lock); local_irq_restore(flags); enable_preempt(); } while (0)
+#define write_unlock_irq(lock) \
+	do { _write_unlock(lock); local_irq_enable(); enable_preempt(); } while (0)
+#define write_unlock_bh(lock) \
+	do { _write_unlock(lock); local_bh_enable(); enable_preempt(); } while (0)
+
+#define spin_lock(lock) \
+	do { disable_preempt(); _spin_lock(lock); } while (0)
+#define spin_unlock(lock) \
+	do { _spin_unlock(lock); enable_preempt(); } while (0)
+#define spin_trylock(lock) \
+	({disable_preempt(); _spin_trylock(lock)? 1: (enable_preempt(), 0);})
+
+#define read_lock(lock) \
+	do { disable_preempt(); _read_lock(lock); } while (0)
+#define read_unlock(lock) \
+	do { _read_unlock(lock); enable_preempt(); } while (0)
+
+#define write_lock(lock) \
+	do { disable_preempt(); _write_lock(lock); } while (0)
+#define write_unlock(lock) \
+	do { _write_unlock(lock); enable_preempt(); } while (0)
+
 
 #ifdef CONFIG_SMP
 #include <asm/spinlock.h>
@@ -40,8 +79,6 @@
 
 #if (DEBUG_SPINLOCKS < 1)
 
-#define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic)
-
 /*
  * Your basic spinlocks, allowing only a single CPU anywhere
  *
@@ -56,11 +93,11 @@
 #endif
 
 #define spin_lock_init(lock)	do { } while(0)
-#define spin_lock(lock)		(void)(lock) /* Not "unused variable". */
+#define _spin_lock(lock)	(void)(lock) /* Not "unused variable". */
 #define spin_is_locked(lock)	(0)
-#define spin_trylock(lock)	({1; })
+#define _spin_trylock(lock)	({1; })
 #define spin_unlock_wait(lock)	do { } while(0)
-#define spin_unlock(lock)	do { } while(0)
+#define _spin_unlock(lock)	do { } while(0)
 
 #elif (DEBUG_SPINLOCKS < 2)
 
@@ -71,11 +108,11 @@
 
 #define spin_lock_init(x)	do { (x)->lock = 0; } while (0)
 #define spin_is_locked(lock)	(test_bit(0,(lock)))
-#define spin_trylock(lock)	(!test_and_set_bit(0,(lock)))
+#define _spin_trylock(lock)	(!test_and_set_bit(0,(lock)))
 
-#define spin_lock(x)		do { (x)->lock = 1; } while (0)
+#define _spin_lock(x)		do { (x)->lock = 1; } while (0)
 #define spin_unlock_wait(x)	do { } while (0)
-#define spin_unlock(x)		do { (x)->lock = 0; } while (0)
+#define _spin_unlock(x)		do { (x)->lock = 0; } while (0)
 
 #else /* (DEBUG_SPINLOCKS >= 2) */
 
@@ -90,11 +127,11 @@
 
 #define spin_lock_init(x)	do { (x)->lock = 0; } while (0)
 #define spin_is_locked(lock)	(test_bit(0,(lock)))
-#define spin_trylock(lock)	(!test_and_set_bit(0,(lock)))
+#define _spin_trylock(lock)	(!test_and_set_bit(0,(lock)))
 
-#define spin_lock(x)		do {unsigned long __spinflags; save_flags(__spinflags); cli(); if ((x)->lock&&(x)->babble) {printk("%s:%d: spin_lock(%s:%p) already locked\n", __BASE_FILE__,__LINE__, (x)->module, (x));(x)->babble--;} (x)->lock = 1; restore_flags(__spinflags);} while (0)
+#define _spin_lock(x)		do {unsigned long __spinflags; save_flags(__spinflags); cli(); if ((x)->lock&&(x)->babble) {printk("%s:%d: spin_lock(%s:%p) already locked\n", __BASE_FILE__,__LINE__, (x)->module, (x));(x)->babble--;} (x)->lock = 1; restore_flags(__spinflags);} while (0)
 #define spin_unlock_wait(x)	do {unsigned long __spinflags; save_flags(__spinflags); cli(); if ((x)->lock&&(x)->babble) {printk("%s:%d: spin_unlock_wait(%s:%p) deadlock\n", __BASE_FILE__,__LINE__, (x)->module, (x));(x)->babble--;} restore_flags(__spinflags);} while (0)
-#define spin_unlock(x)		do {unsigned long __spinflags; save_flags(__spinflags); cli(); if (!(x)->lock&&(x)->babble) {printk("%s:%d: spin_unlock(%s:%p) not locked\n", __BASE_FILE__,__LINE__, (x)->module, (x));(x)->babble--;} (x)->lock = 0; restore_flags(__spinflags);} while (0)
+#define _spin_unlock(x)		do {unsigned long __spinflags; save_flags(__spinflags); cli(); if (!(x)->lock&&(x)->babble) {printk("%s:%d: spin_unlock(%s:%p) not locked\n", __BASE_FILE__,__LINE__, (x)->module, (x));(x)->babble--;} (x)->lock = 0; restore_flags(__spinflags);} while (0)
 
 #endif	/* DEBUG_SPINLOCKS */
 
@@ -119,10 +156,10 @@
 #endif
 
 #define rwlock_init(lock)	do { } while(0)
-#define read_lock(lock)		(void)(lock) /* Not "unused variable". */
-#define read_unlock(lock)	do { } while(0)
-#define write_lock(lock)	(void)(lock) /* Not "unused variable". */
-#define write_unlock(lock)	do { } while(0)
+#define _read_lock(lock)		(void)(lock) /* Not "unused variable". */
+#define _read_unlock(lock)	do { } while(0)
+#define _write_lock(lock)	(void)(lock) /* Not "unused variable". */
+#define _write_unlock(lock)	do { } while(0)
 
 #endif /* !SMP */
 
diff -u --recursive linux-2.4-prerelease.org/kernel/fork.c linux-2.4-prerelease/kernel/fork.c
--- linux-2.4-prerelease.org/kernel/fork.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/kernel/fork.c	Wed Jan  3 17:27:38 2001
@@ -622,6 +622,7 @@
 	}
 #endif
 	p->lock_depth = -1;		/* -1 = no lock */
+	atomic_set(&p->preemptable, 0);
 	p->start_time = jiffies;
 
 	retval = -ENOMEM;
diff -u --recursive linux-2.4-prerelease.org/kernel/sched.c linux-2.4-prerelease/kernel/sched.c
--- linux-2.4-prerelease.org/kernel/sched.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/kernel/sched.c	Wed Jan  3 13:53:17 2001
@@ -550,6 +550,16 @@
 			del_from_runqueue(prev);
 		case TASK_RUNNING:
 	}
+	/*
+	 * Check if the context switch is still necessary.
+	 * This catches up things like if (need_resched) schedule()
+	 * that is not atomic and open a window with a preemptive
+	 * kernel where a task can be scheduled twice.
+	 */
+	if (prev->need_resched == 0 && prev->state == TASK_RUNNING) {
+		spin_unlock_irq(&runqueue_lock);
+		goto same_process;
+	}
 	prev->need_resched = 0;
 
 	/*
@@ -1150,7 +1160,7 @@
 		printk(" %5d\n", p->p_osptr->pid);
 	else
 		printk("\n");
-
+	printk("    preemptable : %d\n", atomic_read(&p->preemptable));
 	{
 		struct sigqueue *q;
 		char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1]; 
diff -u --recursive linux-2.4-prerelease.org/lib/dec_and_lock.c linux-2.4-prerelease/lib/dec_and_lock.c
--- linux-2.4-prerelease.org/lib/dec_and_lock.c	Wed Jan  3 17:19:44 2001
+++ linux-2.4-prerelease/lib/dec_and_lock.c	Wed Jan  3 12:58:58 2001
@@ -1,5 +1,6 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
+#include <linux/sched.h>
 
 /*
  * This is an architecture-neutral, but slow,

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  1:56 [PATCH] 2.4.0-prerelease: preemptive kernel ludovic fernandez
@ 2001-01-04  7:35 ` Daniel Phillips
  2001-01-04  8:11   ` Andi Kleen
                     ` (2 more replies)
  2001-01-04  9:00 ` David Woodhouse
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 25+ messages in thread
From: Daniel Phillips @ 2001-01-04  7:35 UTC (permalink / raw)
  To: ludovic fernandez, linux-kernel

ludovic fernandez wrote:
> The following patch makes the kernel preemptable.
> It is against 2.4.0-prerelease on for i386 only.
> It should work for UP and SMP even though I
> didn't validate it on SMP.
> Comments are welcome.

I was expecting to see this sometime in 2.5, not quite so soon...

The key idea here is to disable preemption on spin lock and reenable on
spin unlock.  That's a practical idea, highly compatible with the
current way of doing things.  Its a fairly heavy hit on spinlock
performance, but maybe the overall performance hit is small.  Benchmarks
are needed.

A more ambitious way to proceed is to change spinlocks so they can sleep
(not in interrupts of course).  There would not be any extra overhead
for this on spin_lock (because the sleep test is handled off the fast
path) but spin_unlock gets a little slower - it has to test and jump on
a flag if there are sleepers.

--
Daniel
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  7:35 ` Daniel Phillips
@ 2001-01-04  8:11   ` Andi Kleen
  2001-01-04 12:32     ` Anton Blanchard
  2001-01-04 21:39     ` Nigel Gamble
  2001-01-04  8:43   ` ludovic fernandez
  2001-01-04 21:28   ` Nigel Gamble
  2 siblings, 2 replies; 25+ messages in thread
From: Andi Kleen @ 2001-01-04  8:11 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: ludovic fernandez, linux-kernel

On Thu, Jan 04, 2001 at 08:35:02AM +0100, Daniel Phillips wrote:
> A more ambitious way to proceed is to change spinlocks so they can sleep
> (not in interrupts of course).  There would not be any extra overhead

Imagine what happens when a non sleeping spinlock in a interrupt waits 
for a "sleeping spinlock" somewhere else...
I'm not sure if this is a good idea. Sleeping locks everywhere would
imply scheduled interrupts, which are nasty. 

I think a better way to proceed would be to make semaphores a bit more 
intelligent and turn them into something like adaptive spinlocks and use
them more where appropiate (currently using semaphores usually causes
lots of context switches where some could probably be avoided). Problem
is that for some cases like your producer-consumer pattern (which has been
used previously in unreleased kernel code BTW) it would be a pessimization
to spin, so such adaptive locks would probably need a different name.

-Andi

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  7:35 ` Daniel Phillips
  2001-01-04  8:11   ` Andi Kleen
@ 2001-01-04  8:43   ` ludovic fernandez
  2001-01-04 22:10     ` Roger Larsson
  2001-01-04 21:28   ` Nigel Gamble
  2 siblings, 1 reply; 25+ messages in thread
From: ludovic fernandez @ 2001-01-04  8:43 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: linux-kernel

Daniel Phillips wrote:

>
> The key idea here is to disable preemption on spin lock and reenable on
> spin unlock.  That's a practical idea, highly compatible with the
> current way of doing things.  Its a fairly heavy hit on spinlock
> performance, but maybe the overall performance hit is small.  Benchmarks
> are needed.
>

I'm not sure the hit on spinlock is this heavy (one increment for lock
and one dec + test on unlock), but I completely agree (and volonteer)
for benchmarking. I'm not convinced a full preemptive kernel is something
interesting mainly due to the context switch cost (actually mmu contex switch).
Benchmarking is a good way to get a global overview on this.
What about only preemptable kernel threads ?

>
> A more ambitious way to proceed is to change spinlocks so they can sleep
> (not in interrupts of course).  There would not be any extra overhead
> for this on spin_lock (because the sleep test is handled off the fast
> path) but spin_unlock gets a little slower - it has to test and jump on
> a flag if there are sleepers.
>

I may be tired but I believe you're focusing on SMP architecture ?
This code simply defer the preemption at the end of the spinlock/lock
section. I don't see how you can easily mix sleeping lock and this
mechanism.

Ludo.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  1:56 [PATCH] 2.4.0-prerelease: preemptive kernel ludovic fernandez
  2001-01-04  7:35 ` Daniel Phillips
@ 2001-01-04  9:00 ` David Woodhouse
  2001-01-04 16:17 ` Rik van Riel
  2001-01-04 20:06 ` Nigel Gamble
  3 siblings, 0 replies; 25+ messages in thread
From: David Woodhouse @ 2001-01-04  9:00 UTC (permalink / raw)
  To: ludovic fernandez; +Cc: linux-kernel

On Wed, 3 Jan 2001, ludovic fernandez wrote:

+#if 1
+    /*
+     * I got some problems with PCMCIA initialization and a
+     * preemptive kernel;
+     * init_pcmcia_ds() beeing called before the completion
+     * of pending scheduled tasks. I don't know if this is the
+     * right fix though.
+     */
+    flush_scheduled_tasks();
+#endif


Not really the right fix, but it'll do. The right fix is probably to
register the socket immediately in yenta_open() rather than doing it from
the queued task. We just weren't brave enough to make that change though,
when it was working anyway.

-- 
dwmw2


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  8:11   ` Andi Kleen
@ 2001-01-04 12:32     ` Anton Blanchard
  2001-01-04 12:44       ` Andi Kleen
  2001-01-04 21:39     ` Nigel Gamble
  1 sibling, 1 reply; 25+ messages in thread
From: Anton Blanchard @ 2001-01-04 12:32 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Daniel Phillips, ludovic fernandez, linux-kernel

 
> I think a better way to proceed would be to make semaphores a bit more 
> intelligent and turn them into something like adaptive spinlocks and use
> them more where appropiate (currently using semaphores usually causes
> lots of context switches where some could probably be avoided). Problem
> is that for some cases like your producer-consumer pattern (which has been
> used previously in unreleased kernel code BTW) it would be a pessimization
> to spin, so such adaptive locks would probably need a different name.

Like solaris adaptive mutexes? It would be interesting to test,
however considering read/write semaphores are hardly ever used these
days we want to be sure they are worth it before adding yet another
synchronisation primitive.

Anton
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 12:32     ` Anton Blanchard
@ 2001-01-04 12:44       ` Andi Kleen
  2001-01-04 21:54         ` Nigel Gamble
  0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2001-01-04 12:44 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Andi Kleen, Daniel Phillips, ludovic fernandez, linux-kernel

On Thu, Jan 04, 2001 at 11:32:11PM +1100, Anton Blanchard wrote:
>  
> > I think a better way to proceed would be to make semaphores a bit more 
> > intelligent and turn them into something like adaptive spinlocks and use
> > them more where appropiate (currently using semaphores usually causes
> > lots of context switches where some could probably be avoided). Problem
> > is that for some cases like your producer-consumer pattern (which has been
> > used previously in unreleased kernel code BTW) it would be a pessimization
> > to spin, so such adaptive locks would probably need a different name.
> 
> Like solaris adaptive mutexes? It would be interesting to test,
> however considering read/write semaphores are hardly ever used these
> days we want to be sure they are worth it before adding yet another
> synchronisation primitive.

A bit similar, yes, but much simpler @-)

The problem is that current Linux semaphores are very costly locks -- they
always cause a context switch.


-Andi
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  1:56 [PATCH] 2.4.0-prerelease: preemptive kernel ludovic fernandez
  2001-01-04  7:35 ` Daniel Phillips
  2001-01-04  9:00 ` David Woodhouse
@ 2001-01-04 16:17 ` Rik van Riel
  2001-01-04 20:06 ` Nigel Gamble
  3 siblings, 0 replies; 25+ messages in thread
From: Rik van Riel @ 2001-01-04 16:17 UTC (permalink / raw)
  To: ludovic fernandez; +Cc: linux-kernel

On Wed, 3 Jan 2001, ludovic fernandez wrote:

> The following patch makes the kernel preemptable.
> It is against 2.4.0-prerelease on for i386 only.

> Comments are welcome.

I think this would be a nice thing to start testing
once 2.5 is forked off.

regards,

Rik
--
Hollywood goes for world dumbination,
	Trailer at 11.

		http://www.surriel.com/
http://www.conectiva.com/	http://distro.conectiva.com.br/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  1:56 [PATCH] 2.4.0-prerelease: preemptive kernel ludovic fernandez
                   ` (2 preceding siblings ...)
  2001-01-04 16:17 ` Rik van Riel
@ 2001-01-04 20:06 ` Nigel Gamble
  2001-01-04 20:36   ` ludovic fernandez
  3 siblings, 1 reply; 25+ messages in thread
From: Nigel Gamble @ 2001-01-04 20:06 UTC (permalink / raw)
  To: ludovic fernandez; +Cc: linux-kernel

On Wed, 3 Jan 2001, ludovic fernandez wrote:
> For hackers,
> The following patch makes the kernel preemptable.
> It is against 2.4.0-prerelease on for i386 only.
> It should work for UP and SMP even though I
> didn't validate it on SMP.
> Comments are welcome.

Hi Ludo,

I didn't realise you were still working on this.  Did you know that
I am also?  Our most recent version is at:

ftp://ftp.mvista.com/pub/Area51/preemptible_kernel/

although I have yet to put up a 2.4.0-prerelease patch (coming soon).
We should probably pool our efforts on this for 2.5.

Cheers,
Nigel

Nigel Gamble                                    nigel@nrg.org
Mountain View, CA, USA.                         http://www.nrg.org/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 20:06 ` Nigel Gamble
@ 2001-01-04 20:36   ` ludovic fernandez
  2001-01-05  0:56     ` Daniel Phillips
  0 siblings, 1 reply; 25+ messages in thread
From: ludovic fernandez @ 2001-01-04 20:36 UTC (permalink / raw)
  To: nigel; +Cc: linux-kernel

Hello Nigel,

Nigel Gamble wrote:

>
> Hi Ludo,
>
> I didn't realise you were still working on this.  Did you know that
> I am also?  Our most recent version is at:
>
> ftp://ftp.mvista.com/pub/Area51/preemptible_kernel/
>

I was on vacation and had a little time to kill...
Going through your README, you seem much more
advanced than this simple patch.

>
> although I have yet to put up a 2.4.0-prerelease patch (coming soon).
> We should probably pool our efforts on this for 2.5.
>

Agreed.
Right now I will be interested to run some benchmarks (latency but
also performance) to see how the system is disturbed by beeing
preemptable. I'm little bit lost on this and I don't know where to start.
Do you have any pointers on benchmark suites I could run ?
Also, maybe it's a off topic subject now....

Ludo.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  7:35 ` Daniel Phillips
  2001-01-04  8:11   ` Andi Kleen
  2001-01-04  8:43   ` ludovic fernandez
@ 2001-01-04 21:28   ` Nigel Gamble
  2 siblings, 0 replies; 25+ messages in thread
From: Nigel Gamble @ 2001-01-04 21:28 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: ludovic fernandez, linux-kernel

On Thu, 4 Jan 2001, Daniel Phillips wrote:
> A more ambitious way to proceed is to change spinlocks so they can sleep
> (not in interrupts of course).  There would not be any extra overhead
> for this on spin_lock (because the sleep test is handled off the fast
> path) but spin_unlock gets a little slower - it has to test and jump on
> a flag if there are sleepers.

I already have a preemption patch that also changes the longest
held spinlocks into sleep locks, i.e. the locks that are routinely
held for > 1ms.  This gives a kernel with very good interactive
response, good enough for most audio apps.

Nigel Gamble                                    nigel@nrg.org
Mountain View, CA, USA.                         http://www.nrg.org/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  8:11   ` Andi Kleen
  2001-01-04 12:32     ` Anton Blanchard
@ 2001-01-04 21:39     ` Nigel Gamble
  2001-01-04 22:09       ` Andi Kleen
  1 sibling, 1 reply; 25+ messages in thread
From: Nigel Gamble @ 2001-01-04 21:39 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Daniel Phillips, ludovic fernandez, linux-kernel

On Thu, 4 Jan 2001, Andi Kleen wrote:
> On Thu, Jan 04, 2001 at 08:35:02AM +0100, Daniel Phillips wrote:
> > A more ambitious way to proceed is to change spinlocks so they can sleep
> > (not in interrupts of course).  There would not be any extra overhead
> 
> Imagine what happens when a non sleeping spinlock in a interrupt waits 
> for a "sleeping spinlock" somewhere else...
> I'm not sure if this is a good idea. Sleeping locks everywhere would
> imply scheduled interrupts, which are nasty. 

Yes, you have to make sure that you never call a sleeping lock
while holding a spinlock.  And you can't call a sleeping lock from
interrupt handlers in the current model.  But this is easy to avoid.

> I think a better way to proceed would be to make semaphores a bit more 
> intelligent and turn them into something like adaptive spinlocks and use
> them more where appropiate (currently using semaphores usually causes
> lots of context switches where some could probably be avoided). Problem
> is that for some cases like your producer-consumer pattern (which has been
> used previously in unreleased kernel code BTW) it would be a pessimization
> to spin, so such adaptive locks would probably need a different name.

Experience has shown that adaptive spinlocks are not worth the extra
overhead (if you mean the type that spin for a short time
and then decide to sleep).  It is better to use spin_lock_irqsave()
(which, by definition, disables kernel preemption without the need
to set a no-preempt flag) to protect regions where the lock is held
for a maximum of around 100us, and to use a sleeping mutex lock for
longer regions.  This is what I'm working towards.

Nigel Gamble                                    nigel@nrg.org
Mountain View, CA, USA.                         http://www.nrg.org/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 12:44       ` Andi Kleen
@ 2001-01-04 21:54         ` Nigel Gamble
  0 siblings, 0 replies; 25+ messages in thread
From: Nigel Gamble @ 2001-01-04 21:54 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Anton Blanchard, Daniel Phillips, ludovic fernandez, linux-kernel

On Thu, 4 Jan 2001, Andi Kleen wrote:
> The problem is that current Linux semaphores are very costly locks -- they
> always cause a context switch.

My preemptible kernel patch currently just uses Linux semaphores to
implement sleeping kernel mutexes, but we (at MontaVista Software) are
working on a new implementation that also does priority inheritance,
to avoid the priority inversion problem, and that does the minimum
necessary context switches.

Nigel Gamble                                    nigel@nrg.org
Mountain View, CA, USA.                         http://www.nrg.org/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 21:39     ` Nigel Gamble
@ 2001-01-04 22:09       ` Andi Kleen
  2001-01-04 22:28         ` Nigel Gamble
  0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2001-01-04 22:09 UTC (permalink / raw)
  To: Nigel Gamble; +Cc: Andi Kleen, Daniel Phillips, ludovic fernandez, linux-kernel

On Thu, Jan 04, 2001 at 01:39:57PM -0800, Nigel Gamble wrote:
> Experience has shown that adaptive spinlocks are not worth the extra
> overhead (if you mean the type that spin for a short time
> and then decide to sleep).  It is better to use spin_lock_irqsave()
> (which, by definition, disables kernel preemption without the need
> to set a no-preempt flag) to protect regions where the lock is held
> for a maximum of around 100us, and to use a sleeping mutex lock for
> longer regions.  This is what I'm working towards.

What experience ?  Only real-time latency testing or SMP scalability 
testing? 

The case I was thinking about is a heavily contended lock like the
inode semaphore of a file that is used by several threads on several
CPUs in parallel or the mm semaphore of a often faulted shared mm. 

It's not an option to convert them to a spinlock, but often the delays
are short enough that a short spin could make sense. 

-Andi

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04  8:43   ` ludovic fernandez
@ 2001-01-04 22:10     ` Roger Larsson
  2001-01-04 23:16       ` ludovic fernandez
  2001-01-05  5:29       ` george anzinger
  0 siblings, 2 replies; 25+ messages in thread
From: Roger Larsson @ 2001-01-04 22:10 UTC (permalink / raw)
  To: ludovic fernandez, Daniel Phillips, george anzinger; +Cc: linux-kernel

On Thursday 04 January 2001 09:43, ludovic fernandez wrote:
> Daniel Phillips wrote:
> > The key idea here is to disable preemption on spin lock and reenable on
> > spin unlock.  That's a practical idea, highly compatible with the
> > current way of doing things.  Its a fairly heavy hit on spinlock
> > performance, but maybe the overall performance hit is small.  Benchmarks
> > are needed.
>
> I'm not sure the hit on spinlock is this heavy (one increment for lock
> and one dec + test on unlock), but I completely agree (and volonteer)
> for benchmarking.

And the conditional jump is usually predicted correctly :-)
+static inline void enable_preempt(void)

+{
+        if (atomic_read(&current->preemptable) <= 0) {
+                BUG();
+        }
+        if (atomic_read(&current->preemptable) == 1) {


This part can probably be put in a proper non inline function.
Cache issues...
+                /*
+                * At that point a scheduling is healthy iff:
+                * - a scheduling request is pending.
+                * - the task is in running state.
+                * - this is not an interrupt context.
+                * - local interrupts are enabled.
+                */
+                if (current->need_resched == 1     &&
+                   current->state == TASK_RUNNING &&
+                   !in_interrupt()                &&
+                   local_irq_are_enabled())
+                {
+                        schedule();
+                }


+        }
+        atomic_dec(&current->preemptable);

What if something happens during the schedule() that would require
another thread...?

+}       

I have been discussing different layout with George on Montavista
also doing this kind of work... (different var and value range)

static incline void enable_preempt(void) {
    if (--current->preempt_count) {
        smp_mb(); /* not shure if needed... */
        preempt_schedule();
    }
}

in sched.c (some smp_mb might be needed here too...)
void preempt_schedule() {
    while (current->need_resched) {
        current->preempt->count++; /* prevent competition with IRQ code */
        if (current->need_resched)
            schedule();
        current->preempt_count--;
    }
}

> I'm not convinced a full preemptive kernel is something
> interesting mainly due to the context switch cost (actually mmu contex
> switch). 

It will NOT be fully, it will be mostly.
You will only context switch when a higher prio thread gets runnable, two 
ways:
1) external intterupt waking higher prio process, same context swithes as 
when running in user code. We won't get more interrupts.
2) wake up due to something we do. Not that many places, mostly due to
releasing syncronization objects (spinlocks does not count).

If this still is a problem, we can select to only preemt to processes running
RT stuff. SCHED_FIFO and SCHED_RR by letting them set need_resched to 2...


> Benchmarking is a good way to get a global overview on this.

Remember to benchmark with stuff that will make the positive aspects visible 
too. Playing audio (with smaller buffers), more reliably burning CD ROMs,
less hichups while playing video [if run with higher prio...]
Plain throuput tests won't tell the whole story!

see
 http://www.gardena.net/benno/linux/audio
 http://www.linuxdj.com/latency-graph/

> What about only preemptable kernel threads ?

No, it won't help enough.


-- 
--
Home page:
  http://www.norran.net/nra02596/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 22:09       ` Andi Kleen
@ 2001-01-04 22:28         ` Nigel Gamble
  0 siblings, 0 replies; 25+ messages in thread
From: Nigel Gamble @ 2001-01-04 22:28 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Daniel Phillips, ludovic fernandez, linux-kernel

On Thu, 4 Jan 2001, Andi Kleen wrote:
> On Thu, Jan 04, 2001 at 01:39:57PM -0800, Nigel Gamble wrote:
> > Experience has shown that adaptive spinlocks are not worth the extra
> > overhead (if you mean the type that spin for a short time
> > and then decide to sleep).  It is better to use spin_lock_irqsave()
> > (which, by definition, disables kernel preemption without the need
> > to set a no-preempt flag) to protect regions where the lock is held
> > for a maximum of around 100us, and to use a sleeping mutex lock for
> > longer regions.  This is what I'm working towards.
> 
> What experience ?  Only real-time latency testing or SMP scalability 
> testing? 

Both.  We spent a lot of time on this when I was at SGI working on IRIX.
I think we ended up with excellent SMP scalability and good real-time
latency.  There is also some academic research that suggests that
the extra overhead of a dynamic adaptive spinlock usually outweighs
any possible gains.

> The case I was thinking about is a heavily contended lock like the
> inode semaphore of a file that is used by several threads on several
> CPUs in parallel or the mm semaphore of a often faulted shared mm. 
> 
> It's not an option to convert them to a spinlock, but often the delays
> are short enough that a short spin could make sense. 

I think the first order performance problem of a heavily contended lock
is not how it is implemented, but the fact that it is heavily contended.
In IRIX we spent a lot of time looking for these bottlenecks and
re-architecting to avoid them.  (This would mean minimizing the shared
accesses in your examples.)

Nigel Gamble                                    nigel@nrg.org
Mountain View, CA, USA.                         http://www.nrg.org/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 22:10     ` Roger Larsson
@ 2001-01-04 23:16       ` ludovic fernandez
  2001-01-05  0:10         ` Nigel Gamble
  2001-01-05  5:29       ` george anzinger
  1 sibling, 1 reply; 25+ messages in thread
From: ludovic fernandez @ 2001-01-04 23:16 UTC (permalink / raw)
  To: Roger Larsson; +Cc: Daniel Phillips, george anzinger, linux-kernel

Roger Larsson wrote:

> On Thursday 04 January 2001 09:43, ludovic fernandez wrote:
>
> > I'm not convinced a full preemptive kernel is something
> > interesting mainly due to the context switch cost (actually mmu contex
> > switch).
>
> It will NOT be fully, it will be mostly.
> You will only context switch when a higher prio thread gets runnable, two
> ways:
> 1) external intterupt waking higher prio process, same context swithes as
> when running in user code. We won't get more interrupts.
> 2) wake up due to something we do. Not that many places, mostly due to
> releasing syncronization objects (spinlocks does not count).
>
> If this still is a problem, we can select to only preemt to processes running
> RT stuff. SCHED_FIFO and SCHED_RR by letting them set need_resched to 2...
> > What about only preemptable kernel threads ?
>
> No, it won't help enough.
>

This is not the point I was trying to make .....
So far we are talking about real time behaviour. This is a very interesting/exciting
thing and we all agree it's a huge task which goes much more behind
just having a preemptive kernel.
I'm not convinced that a preemptive kernel is interesting for apps using
the time sharing scheduling, mainly because it is not deterministic and the
price of a mmu conntext switch is still way to heavy (that's my 2 cents belief
anyway).
But, having a preemptive kernel could be interesting for an another issue.
More and more the linux kernel is using the concept of kernel threads to
defer part of the processing. Right now they are not preemptable and one
has to put explicit preemption points. I believe this solution doesn't fly in
the long term (the code changes, the locking design changes and the
preemption points become irrelevant). They could be preemptable because
they have a way of being deterministic (preemption disable) and they
are lightweight to schedule since they don't use a mmu context.

Ludo.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 23:16       ` ludovic fernandez
@ 2001-01-05  0:10         ` Nigel Gamble
  2001-01-05  0:36           ` ludovic fernandez
  0 siblings, 1 reply; 25+ messages in thread
From: Nigel Gamble @ 2001-01-05  0:10 UTC (permalink / raw)
  To: ludovic fernandez
  Cc: Roger Larsson, Daniel Phillips, george anzinger, linux-kernel

On Thu, 4 Jan 2001, ludovic fernandez wrote:
> This is not the point I was trying to make .....
> So far we are talking about real time behaviour. This is a very interesting/exciting
> thing and we all agree it's a huge task which goes much more behind
> just having a preemptive kernel.

You're right that it is more than just a preemptible kernel, but I don't
agree that it's all that huge.  But this is the third time I have worked
on enabling real-time behavior in unix-like OSes, so I may be biased ;-)

> I'm not convinced that a preemptive kernel is interesting for apps using
> the time sharing scheduling, mainly because it is not deterministic and the
> price of a mmu conntext switch is still way to heavy (that's my 2 cents belief
> anyway).

But as Roger pointed out, the number of extra context switches
introduced by having a preemptible kernel is actually very low.  If an
interrupt occurs while running in user mode, the context switch it may
cause will happen even in a non-preemptible kernel.  I think that
running a kernel compile for example, the number of context switches per
second caused by kernel preemption is probably between 1% and 10% of the
total context switches per second.  And it's certainly interesting to me
that I can listen to MP3s without interruption now, while doing a kernel
build!

Nigel Gamble                                    nigel@nrg.org
Mountain View, CA, USA.                         http://www.nrg.org/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-05  0:10         ` Nigel Gamble
@ 2001-01-05  0:36           ` ludovic fernandez
  2001-01-05  0:45             ` Andi Kleen
  0 siblings, 1 reply; 25+ messages in thread
From: ludovic fernandez @ 2001-01-05  0:36 UTC (permalink / raw)
  To: nigel; +Cc: Roger Larsson, Daniel Phillips, george anzinger, linux-kernel

Nigel Gamble wrote:

> On Thu, 4 Jan 2001, ludovic fernandez wrote:
> > This is not the point I was trying to make .....
> > So far we are talking about real time behaviour. This is a very interesting/exciting
> > thing and we all agree it's a huge task which goes much more behind
> > just having a preemptive kernel.
>
> You're right that it is more than just a preemptible kernel, but I don't
> agree that it's all that huge.  But this is the third time I have worked
> on enabling real-time behavior in unix-like OSes, so I may be biased ;-)
>
> > I'm not convinced that a preemptive kernel is interesting for apps using
> > the time sharing scheduling, mainly because it is not deterministic and the
> > price of a mmu conntext switch is still way to heavy (that's my 2 cents belief
> > anyway).
>
> But as Roger pointed out, the number of extra context switches
> introduced by having a preemptible kernel is actually very low.  If an
> interrupt occurs while running in user mode, the context switch it may
> cause will happen even in a non-preemptible kernel.  I think that
> running a kernel compile for example, the number of context switches per
> second caused by kernel preemption is probably between 1% and 10% of the
> total context switches per second.  And it's certainly interesting to me
> that I can listen to MP3s without interruption now, while doing a kernel
> build!
>

I agree Nigel, but as you pointed out you will have to deal with
scheduling behaviour, interrupt latency and priority inversion to
achieve that.
I was just trying to point out that just having kernel preemptable threads
will enable, for example, kswapd to adjust itself more efficiently with the
current load of the system and what it needs to achieve (running from
low priority to non preemptable thread). Providing a better (smoother)
swap behaviour.
Saying that, I definitely agree that I want/need to one day listen to
my MP3s while building  my kernel.

Ludo.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-05  0:36           ` ludovic fernandez
@ 2001-01-05  0:45             ` Andi Kleen
  2001-01-05  1:13               ` Alan Olsen
  0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2001-01-05  0:45 UTC (permalink / raw)
  To: ludovic fernandez
  Cc: nigel, Roger Larsson, Daniel Phillips, george anzinger,
	linux-kernel

On Thu, Jan 04, 2001 at 04:36:32PM -0800, ludovic fernandez wrote:
> Saying that, I definitely agree that I want/need to one day listen to
> my MP3s while building  my kernel.

??? I can listen to MP3s just fine while building kernels, on a not very
powerful K6. 

-Andi
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 20:36   ` ludovic fernandez
@ 2001-01-05  0:56     ` Daniel Phillips
  0 siblings, 0 replies; 25+ messages in thread
From: Daniel Phillips @ 2001-01-05  0:56 UTC (permalink / raw)
  To: ludovic fernandez, linux-kernel

ludovic fernandez wrote:
> Right now I will be interested to run some benchmarks (latency but
> also performance) to see how the system is disturbed by beeing
> preemptable. I'm little bit lost on this and I don't know where to start.
> Do you have any pointers on benchmark suites I could run ?
> Also, maybe it's a off topic subject now....

No!  Not off topic.  And I hope you don't throw away that simple patch,
it will always be useful for doing reality checks on the performance of
the fancy system, and who knows, maybe it's useful in its own right.

The current fashion is to use dbench:

  ftp://samba.org/pub/tridge/dbench

I think this is good for your patch because it's inherently parallel. 
Interesting numbers of tasks are, e.g., 1, 2, 10, 50.   Of course dbench
is not the last word in benchmarks but it's been pretty useful so far. 
You probably want something entirely cpu-bound too.  How about dbench
with ramfs?

--
Daniel
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-05  0:45             ` Andi Kleen
@ 2001-01-05  1:13               ` Alan Olsen
  0 siblings, 0 replies; 25+ messages in thread
From: Alan Olsen @ 2001-01-05  1:13 UTC (permalink / raw)
  To: Andi Kleen
  Cc: ludovic fernandez, nigel, Roger Larsson, Daniel Phillips,
	george anzinger, linux-kernel

On Fri, 5 Jan 2001, Andi Kleen wrote:

> On Thu, Jan 04, 2001 at 04:36:32PM -0800, ludovic fernandez wrote:
> > Saying that, I definitely agree that I want/need to one day listen to
> > my MP3s while building  my kernel.
> 
> ??? I can listen to MP3s just fine while building kernels, on a not very
> powerful K6. 

I have found that sound card "hick-ups" while doing heavy work under Linux
are an indication of an IRQ problem.  I had that problem on my P-III 650
until I went in and rearranged cards and sorted out what was on what IRQ.
(The BIOS just picked numbers, it gave you no way to determine order or
slots.  What IRQ you got depended on the slot and there was no real way
to chenge it.) 

Some boards have a very poor way of chosing the assignment
of IRQs. You just have to shuffle things until you get what you want. 

Since I did the manual reorg of the hardware, things have run MUCH
smoother.

alan@ctrl-alt-del.com | Note to AOL users: for a quick shortcut to reply
Alan Olsen            | to my mail, just hit the ctrl, alt and del keys.
    "In the future, everything will have its 15 minutes of blame."

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-04 22:10     ` Roger Larsson
  2001-01-04 23:16       ` ludovic fernandez
@ 2001-01-05  5:29       ` george anzinger
  2001-01-05  6:45         ` ludovic fernandez
  1 sibling, 1 reply; 25+ messages in thread
From: george anzinger @ 2001-01-05  5:29 UTC (permalink / raw)
  To: Roger Larsson; +Cc: ludovic fernandez, Daniel Phillips, linux-kernel

Roger Larsson wrote:
> 
> On Thursday 04 January 2001 09:43, ludovic fernandez wrote:
> > Daniel Phillips wrote:
> > > The key idea here is to disable preemption on spin lock and reenable on
> > > spin unlock.  That's a practical idea, highly compatible with the
> > > current way of doing things.  Its a fairly heavy hit on spinlock
> > > performance, but maybe the overall performance hit is small.  Benchmarks
> > > are needed.
> >
> > I'm not sure the hit on spinlock is this heavy (one increment for lock
> > and one dec + test on unlock), but I completely agree (and volonteer)
> > for benchmarking.
> 
> And the conditional jump is usually predicted correctly :-)
> +static inline void enable_preempt(void)
> 
> +{
> +        if (atomic_read(&current->preemptable) <= 0) {
> +                BUG();
> +        }
> +        if (atomic_read(&current->preemptable) == 1) {
> 
> This part can probably be put in a proper non inline function.
> Cache issues...
> +                /*
> +                * At that point a scheduling is healthy iff:
> +                * - a scheduling request is pending.
> +                * - the task is in running state.
> +                * - this is not an interrupt context.
> +                * - local interrupts are enabled.
> +                */
> +                if (current->need_resched == 1     &&
> +                   current->state == TASK_RUNNING &&
> +                   !in_interrupt()                &&
> +                   local_irq_are_enabled())
> +                {
> +                        schedule();
> +                }
>
Actually the MontaVista Patch cleverly removes the tests for
in_interrupt() and local_irq_are_enabled() AND the state ==
TASK_RUNNING.  In actual fact these states can be considered way points
on the system status vector.  For example the interrupts off state
implies all the rest, the in_interrupt() implies not preemptable and
finally, not preemptable is one station away from fully preemptable.  

TASK_RUNNING is easily solved by makeing schedule() aware that it is
being called for preemption.  See the MontaVista patch for details.


ftp://ftp.mvista.com/pub/Area51/preemptible_kernel/


 
> +        }
> +        atomic_dec(&current->preemptable);
> 
> What if something happens during the schedule() that would require
> another thread...?
> 
> +}
> 
> I have been discussing different layout with George on Montavista
> also doing this kind of work... (different var and value range)
> 
> static incline void enable_preempt(void) {
>     if (--current->preempt_count) {
>         smp_mb(); /* not shure if needed... */
>         preempt_schedule();
>     }
> }
> 
> in sched.c (some smp_mb might be needed here too...)
> void preempt_schedule() {
>     while (current->need_resched) {
>         current->preempt->count++; /* prevent competition with IRQ code */
>         if (current->need_resched)
>             schedule();
>         current->preempt_count--;
>     }
> }
> 
> > I'm not convinced a full preemptive kernel is something
> > interesting mainly due to the context switch cost (actually mmu contex
> > switch).
> 
> It will NOT be fully, it will be mostly.
> You will only context switch when a higher prio thread gets runnable, two
> ways:
> 1) external intterupt waking higher prio process, same context swithes as
> when running in user code. We won't get more interrupts.
> 2) wake up due to something we do. Not that many places, mostly due to
> releasing syncronization objects (spinlocks does not count).
> 
> If this still is a problem, we can select to only preemt to processes running
> RT stuff. SCHED_FIFO and SCHED_RR by letting them set need_resched to 2...

The preemption ususally just switches earlier.  The switch would happen
soon anyway.  That is what need_resched =1; means.
> 
> > Benchmarking is a good way to get a global overview on this.
> 
> Remember to benchmark with stuff that will make the positive aspects visible
> too. Playing audio (with smaller buffers), more reliably burning CD ROMs,
> less hichups while playing video [if run with higher prio...]
> Plain throuput tests won't tell the whole story!
> 
> see
>  http://www.gardena.net/benno/linux/audio
>  http://www.linuxdj.com/latency-graph/
> 
> > What about only preemptable kernel threads ?
> 
> No, it won't help enough.
> 
> --
> --
> Home page:
>   http://www.norran.net/nra02596/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-05  5:29       ` george anzinger
@ 2001-01-05  6:45         ` ludovic fernandez
  2001-01-05  8:10           ` george anzinger
  0 siblings, 1 reply; 25+ messages in thread
From: ludovic fernandez @ 2001-01-05  6:45 UTC (permalink / raw)
  To: george anzinger; +Cc: Roger Larsson, Daniel Phillips, linux-kernel

george anzinger wrote:

> Roger Larsson wrote:
> >
>
> > This part can probably be put in a proper non inline function.
> > Cache issues...
> > +                /*
> > +                * At that point a scheduling is healthy iff:
> > +                * - a scheduling request is pending.
> > +                * - the task is in running state.
> > +                * - this is not an interrupt context.
> > +                * - local interrupts are enabled.
> > +                */
> > +                if (current->need_resched == 1     &&
> > +                   current->state == TASK_RUNNING &&
> > +                   !in_interrupt()                &&
> > +                   local_irq_are_enabled())
> > +                {
> > +                        schedule();
> > +                }
> >
> Actually the MontaVista Patch cleverly removes the tests for
> in_interrupt() and local_irq_are_enabled() AND the state ==
> TASK_RUNNING.  In actual fact these states can be considered way points
> on the system status vector.  For example the interrupts off state
> implies all the rest, the in_interrupt() implies not preemptable and
> finally, not preemptable is one station away from fully preemptable.
>
> TASK_RUNNING is easily solved by makeing schedule() aware that it is
> being called for preemption.  See the MontaVista patch for details.
>

Humm, I'm just curious,
Regarding in_interrupt(). How do you deal with soft interrupts?
Guys calling cpu_bh_disable() or even incrementing the count on
their own. I don't know if this acceptable but definitely can be done,
I prefer to rely on fact than on API.
Regarding local_irq_enabled(). How do you handle the code that
call local_irq_disable(), then spin_lock(), spin_unlock() and only
re-enable the interruptions ? In this case, you preempt code that
is supposed to run interruptions disabled.
Finally, regarding the test on the task state, there may be a cache issue
but calling schedule() has also some overhead.

Ludo.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] 2.4.0-prerelease: preemptive kernel.
  2001-01-05  6:45         ` ludovic fernandez
@ 2001-01-05  8:10           ` george anzinger
  0 siblings, 0 replies; 25+ messages in thread
From: george anzinger @ 2001-01-05  8:10 UTC (permalink / raw)
  To: ludovic fernandez; +Cc: Roger Larsson, Daniel Phillips, linux-kernel

ludovic fernandez wrote:
> 
> george anzinger wrote:
> 
> > Roger Larsson wrote:
> > >
> >
> > > This part can probably be put in a proper non inline function.
> > > Cache issues...
> > > +                /*
> > > +                * At that point a scheduling is healthy iff:
> > > +                * - a scheduling request is pending.
> > > +                * - the task is in running state.
> > > +                * - this is not an interrupt context.
> > > +                * - local interrupts are enabled.
> > > +                */
> > > +                if (current->need_resched == 1     &&
> > > +                   current->state == TASK_RUNNING &&
> > > +                   !in_interrupt()                &&
> > > +                   local_irq_are_enabled())
> > > +                {
> > > +                        schedule();
> > > +                }
> > >
> > Actually the MontaVista Patch cleverly removes the tests for
> > in_interrupt() and local_irq_are_enabled() AND the state ==
> > TASK_RUNNING.  In actual fact these states can be considered way points
> > on the system status vector.  For example the interrupts off state
> > implies all the rest, the in_interrupt() implies not preemptable and
> > finally, not preemptable is one station away from fully preemptable.
> >
> > TASK_RUNNING is easily solved by makeing schedule() aware that it is
> > being called for preemption.  See the MontaVista patch for details.
> >
> 
> Humm, I'm just curious,
> Regarding in_interrupt(). How do you deal with soft interrupts?
> Guys calling cpu_bh_disable() or even incrementing the count on
> their own.
 
#define cpu_bh_disable(cpu)	do { ctx_sw_off(); local_bh_count(cpu)++;
barrier(); } while (0)
#define cpu_bh_enable(cpu)	do { barrier();
local_bh_count(cpu)--;ctx_sw_on(); } while (0)

I don't know if this acceptable but definitely can be done,
> I prefer to rely on fact than on API.

Yes, of course anything CAN be done, but then they would be SOL with the
movement of the flag location (as was done on the way from 2.3 to
2.4.0).  If we encounter such problems, we just fix them.

> Regarding local_irq_enabled(). How do you handle the code that
> call local_irq_disable(), then spin_lock(), spin_unlock() and only
> re-enable the interruptions ? 

Good question, as this is exactly what spin_lock_irq()/spin_unlock_irq()
do.  In this case it is not a problem as the intent was the same anyway,
but we can fix the code to handle this.  If you read the patch, you will
find that we call preempt_schedule() which calls schedule().  We could
easily put a test of the interrupt off state here and reject the
preemption.  The real issue here is how to catch the preemption when
local_irq_enable() is called.  If the system has an interrupt dedicated
to scheduling we could use this, however, while this is available in SMP
systems it is usually not available in UP systems.

On the other hand I have not seen any code do this.  I have, however,
seen code that:
    spin_lock_irq()
      :
    local_irq_enable()
      :
    spin_unlock()

We would rather not mess with the preemption count while irq is disabled
but this sort of code messes up the pairing we need to make this work.

> In this case, you preempt code that
> is supposed to run interruptions disabled.
> Finally, regarding the test on the task state, there may be a cache issue
> but calling schedule() has also some overhead.
> 
I am not sure what you are getting at here.  The task state will be
looked at by schedule() in short order in any case so a cache miss is
not the issue.  We don't look at the state but on the way to schedule()
(in preempt_schedule()) we add a flag to the state to indicate that it
is a preemption call.  schedule() is then changed to treat this task as
running, regardless of the state.

George

> Ludo.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2001-01-05  8:11 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2001-01-04  1:56 [PATCH] 2.4.0-prerelease: preemptive kernel ludovic fernandez
2001-01-04  7:35 ` Daniel Phillips
2001-01-04  8:11   ` Andi Kleen
2001-01-04 12:32     ` Anton Blanchard
2001-01-04 12:44       ` Andi Kleen
2001-01-04 21:54         ` Nigel Gamble
2001-01-04 21:39     ` Nigel Gamble
2001-01-04 22:09       ` Andi Kleen
2001-01-04 22:28         ` Nigel Gamble
2001-01-04  8:43   ` ludovic fernandez
2001-01-04 22:10     ` Roger Larsson
2001-01-04 23:16       ` ludovic fernandez
2001-01-05  0:10         ` Nigel Gamble
2001-01-05  0:36           ` ludovic fernandez
2001-01-05  0:45             ` Andi Kleen
2001-01-05  1:13               ` Alan Olsen
2001-01-05  5:29       ` george anzinger
2001-01-05  6:45         ` ludovic fernandez
2001-01-05  8:10           ` george anzinger
2001-01-04 21:28   ` Nigel Gamble
2001-01-04  9:00 ` David Woodhouse
2001-01-04 16:17 ` Rik van Riel
2001-01-04 20:06 ` Nigel Gamble
2001-01-04 20:36   ` ludovic fernandez
2001-01-05  0:56     ` Daniel Phillips

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox