public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
@ 2002-06-13 19:21 Robert Love
  2002-06-14  4:25 ` David S. Miller
  0 siblings, 1 reply; 36+ messages in thread
From: Robert Love @ 2002-06-13 19:21 UTC (permalink / raw)
  To: alan; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 405 bytes --]

Alan,

Attached patch provides SPARC64 support for the O(1) scheduler in
2.4-ac.  This is based off a 2.5 backport for my O(1) scheduler patches
by Thomas Duffy (i.e. give him the credit).

I do not know if any other architectures in 2.4-ac support the new
scheduler yet, but I will work on sending you the diffs as I get them or
do them...

Patch is against 2.4.19-pre10-ac2, please apply.

	Robert Love

[-- Attachment #2: sched-O1-sparc64-rml-2.4.19-pre10-ac2-1.patch --]
[-- Type: text/x-patch, Size: 18342 bytes --]

diff -urN linux-2.4.19-pre10-ac2/arch/sparc64/kernel/entry.S linux/arch/sparc64/kernel/entry.S
--- linux-2.4.19-pre10-ac2/arch/sparc64/kernel/entry.S	Thu Jun  6 08:55:13 2002
+++ linux/arch/sparc64/kernel/entry.S	Thu Jun 13 12:09:51 2002
@@ -1436,8 +1436,10 @@
 		 * %o7 for us.  Check performance counter stuff too.
 		 */
 		andn		%o7, SPARC_FLAG_NEWCHILD, %l0
+#if CONFIG_SMP
 		mov		%g5, %o0	/* 'prev' */
 		call		schedule_tail
+#endif
 		 stb		%l0, [%g6 + AOFF_task_thread + AOFF_thread_flags]
 		andcc		%l0, SPARC_FLAG_PERFCTR, %g0
 		be,pt		%icc, 1f
diff -urN linux-2.4.19-pre10-ac2/arch/sparc64/kernel/irq.c linux/arch/sparc64/kernel/irq.c
--- linux-2.4.19-pre10-ac2/arch/sparc64/kernel/irq.c	Thu Jun  6 08:55:13 2002
+++ linux/arch/sparc64/kernel/irq.c	Thu Jun 13 12:09:51 2002
@@ -162,7 +162,7 @@
 		tid = ((tid & UPA_CONFIG_MID) << 9);
 		tid &= IMAP_TID_UPA;
 	} else {
-		tid = (starfire_translate(imap, current->processor) << 26);
+		tid = (starfire_translate(imap, smp_processor_id()) << 26);
 		tid &= IMAP_TID_UPA;
 	}
 
diff -urN linux-2.4.19-pre10-ac2/arch/sparc64/kernel/process.c linux/arch/sparc64/kernel/process.c
--- linux-2.4.19-pre10-ac2/arch/sparc64/kernel/process.c	Thu Jun  6 08:55:13 2002
+++ linux/arch/sparc64/kernel/process.c	Thu Jun 13 12:09:51 2002
@@ -53,12 +53,8 @@
 		return -EPERM;
 
 	/* endless idle loop with no priority at all */
-	current->nice = 20;
-	current->counter = -100;
-	init_idle();
-
 	for (;;) {
-		/* If current->need_resched is zero we should really
+		/* If current->word.need_resched is zero we should really
 		 * setup for a system wakup event and execute a shutdown
 		 * instruction.
 		 *
@@ -79,16 +75,12 @@
 /*
  * the idle loop on a UltraMultiPenguin...
  */
-#define idle_me_harder()	(cpu_data[current->processor].idle_volume += 1)
-#define unidle_me()		(cpu_data[current->processor].idle_volume = 0)
+#define idle_me_harder()	(cpu_data[smp_processor_id()].idle_volume += 1)
+#define unidle_me()		(cpu_data[smp_processor_id()].idle_volume = 0)
 int cpu_idle(void)
 {
-	current->nice = 20;
-	current->counter = -100;
-	init_idle();
-
 	while(1) {
-		if (current->need_resched != 0) {
+		if (current->need_resched) {
 			unidle_me();
 			schedule();
 			check_pgt_cache();
diff -urN linux-2.4.19-pre10-ac2/arch/sparc64/kernel/rtrap.S linux/arch/sparc64/kernel/rtrap.S
--- linux-2.4.19-pre10-ac2/arch/sparc64/kernel/rtrap.S	Thu Jun  6 08:55:13 2002
+++ linux/arch/sparc64/kernel/rtrap.S	Thu Jun 13 12:09:51 2002
@@ -140,7 +140,7 @@
 		.align			64
 		.globl			rtrap_clr_l6, rtrap, irqsz_patchme, rtrap_xcall
 rtrap_clr_l6:	clr			%l6
-rtrap:		lduw			[%g6 + AOFF_task_processor], %l0
+rtrap:		lduw			[%g6 + AOFF_task_cpu], %l0
 		sethi			%hi(irq_stat), %l2	! &softirq_active
 		or			%l2, %lo(irq_stat), %l2	! &softirq_active
 irqsz_patchme:	sllx			%l0, 0, %l0
diff -urN linux-2.4.19-pre10-ac2/arch/sparc64/kernel/smp.c linux/arch/sparc64/kernel/smp.c
--- linux-2.4.19-pre10-ac2/arch/sparc64/kernel/smp.c	Thu Jun  6 08:55:13 2002
+++ linux/arch/sparc64/kernel/smp.c	Thu Jun 13 12:09:51 2002
@@ -252,6 +252,8 @@
  */
 static struct task_struct *cpu_new_task = NULL;
 
+static void smp_tune_scheduling(void);
+
 void __init smp_boot_cpus(void)
 {
 	int cpucount = 0, i;
@@ -259,10 +261,11 @@
 	printk("Entering UltraSMPenguin Mode...\n");
 	__sti();
 	smp_store_cpu_info(boot_cpu_id);
-	init_idle();
 
-	if (linux_num_cpus == 1)
+	if (linux_num_cpus == 1) {
+		smp_tune_scheduling();
 		return;
+	}
 
 	for (i = 0; i < NR_CPUS; i++) {
 		if (i == boot_cpu_id)
@@ -278,16 +281,13 @@
 			int no;
 
 			prom_printf("Starting CPU %d... ", i);
-			kernel_thread(start_secondary, NULL, CLONE_PID);
+			kernel_thread(NULL, NULL, CLONE_PID);
 			cpucount++;
 
 			p = init_task.prev_task;
-			init_tasks[cpucount] = p;
 
-			p->processor = i;
-			p->cpus_runnable = 1UL << i; /* we schedule the first task manually */
+			init_idle(p, i);
 
-			del_from_runqueue(p);
 			unhash_process(p);
 
 			callin_flag = 0;
@@ -338,6 +338,12 @@
 		smp_activated = 1;
 		smp_num_cpus = cpucount + 1;
 	}
+
+	/* We want to run this with all the other cpus spinning
+	* in the kernel.
+	*/
+	smp_tune_scheduling();
+
 	smp_processors_ready = 1;
 	membar("#StoreStore | #StoreLoad");
 }
@@ -1148,7 +1154,6 @@
 	__cpu_number_map[boot_cpu_id] = 0;
 	prom_cpu_nodes[boot_cpu_id] = linux_cpus[0].prom_node;
 	__cpu_logical_map[0] = boot_cpu_id;
-	current->processor = boot_cpu_id;
 	prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
 }
 
@@ -1175,6 +1180,96 @@
 	return base;
 }
 
+cycles_t cacheflush_time;
+unsigned long cache_decay_ticks;
+
+extern unsigned long cheetah_tune_scheduling(void);
+extern unsigned long timer_ticks_per_usec_quotient;
+
+static void __init smp_tune_scheduling(void)
+{
+	unsigned long orig_flush_base, flush_base, flags, *p;
+	unsigned int ecache_size, order;
+	cycles_t tick1, tick2, raw;
+
+	/* Approximate heuristic for SMP scheduling.  It is an
+	 * estimation of the time it takes to flush the L2 cache
+	 * on the local processor.
+	 *
+	 * The ia32 chooses to use the L1 cache flush time instead,
+	 * and I consider this complete nonsense.  The Ultra can service
+	 * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
+	 * L2 misses are what create extra bus traffic (ie. the "cost"
+	 * of moving a process from one cpu to another).
+	 */
+	printk("SMP: Calibrating ecache flush... ");
+	if (tlb_type == cheetah) {
+		cacheflush_time = cheetah_tune_scheduling();
+		goto report;
+	}
+
+	ecache_size = prom_getintdefault(linux_cpus[0].prom_node,
+					 "ecache-size", (512 * 1024));
+	if (ecache_size > (4 * 1024 * 1024))
+		ecache_size = (4 * 1024 * 1024);
+	orig_flush_base = flush_base =
+		__get_free_pages(GFP_KERNEL, order = get_order(ecache_size));
+
+	if (flush_base != 0UL) {
+		__save_and_cli(flags);
+
+		/* Scan twice the size once just to get the TLB entries
+		 * loaded and make sure the second scan measures pure misses.
+		 */
+		for (p = (unsigned long *)flush_base;
+		     ((unsigned long)p) < (flush_base + (ecache_size<<1));
+		     p += (64 / sizeof(unsigned long)))
+			*((volatile unsigned long *)p);
+
+		/* Now the real measurement. */
+		__asm__ __volatile__("
+		b,pt	%%xcc, 1f
+		 rd	%%tick, %0
+
+		.align	64
+1:		ldx	[%2 + 0x000], %%g1
+		ldx	[%2 + 0x040], %%g2
+		ldx	[%2 + 0x080], %%g3
+		ldx	[%2 + 0x0c0], %%g5
+		add	%2, 0x100, %2
+		cmp	%2, %4
+		bne,pt	%%xcc, 1b
+		 nop
+	
+		rd	%%tick, %1"
+		: "=&r" (tick1), "=&r" (tick2), "=&r" (flush_base)
+		: "2" (flush_base), "r" (flush_base + ecache_size)
+		: "g1", "g2", "g3", "g5");
+
+		__restore_flags(flags);
+
+		raw = (tick2 - tick1);
+
+		/* Dampen it a little, considering two processes
+		 * sharing the cache and fitting.
+		 */
+		cacheflush_time = (raw - (raw >> 2));
+
+		free_pages(orig_flush_base, order);
+	} else {
+		cacheflush_time = ((ecache_size << 2) +
+				   (ecache_size << 1));
+	}
+report:
+	/* Convert cpu ticks to jiffie ticks. */
+	cache_decay_ticks = ((long)cacheflush_time * timer_ticks_per_usec_quotient);
+	cache_decay_ticks >>= 32UL;
+	cache_decay_ticks = (cache_decay_ticks * HZ) / 1000;
+
+	printk("Using heuristic of %ld cycles, %ld ticks.\n",
+	       cacheflush_time, cache_decay_ticks);
+}
+
 /* /proc/profile writes can call this, don't __init it please. */
 int setup_profiling_timer(unsigned int multiplier)
 {
diff -urN linux-2.4.19-pre10-ac2/arch/sparc64/kernel/trampoline.S linux/arch/sparc64/kernel/trampoline.S
--- linux-2.4.19-pre10-ac2/arch/sparc64/kernel/trampoline.S	Thu Jun  6 08:55:13 2002
+++ linux/arch/sparc64/kernel/trampoline.S	Thu Jun 13 12:09:51 2002
@@ -253,7 +253,7 @@
 	wrpr		%o1, PSTATE_IG, %pstate
 
 	/* Get our UPA MID. */
-	lduw		[%o2 + AOFF_task_processor], %g1
+	lduw		[%o2 + AOFF_task_cpu], %g1
 	sethi		%hi(cpu_data), %g5
 	or		%g5, %lo(cpu_data), %g5
 
diff -urN linux-2.4.19-pre10-ac2/arch/sparc64/kernel/traps.c linux/arch/sparc64/kernel/traps.c
--- linux-2.4.19-pre10-ac2/arch/sparc64/kernel/traps.c	Thu Jun  6 08:55:13 2002
+++ linux/arch/sparc64/kernel/traps.c	Thu Jun 13 12:09:51 2002
@@ -1,4 +1,4 @@
-/* $Id: traps.c,v 1.82 2001/11/18 00:12:56 davem Exp $
+/* $Id: traps.c,v 1.84 2002/01/30 01:39:56 davem Exp $
  * arch/sparc64/kernel/traps.c
  *
  * Copyright (C) 1995,1997 David S. Miller (davem@caip.rutgers.edu)
@@ -527,6 +527,21 @@
 			       "i" (ASI_PHYS_USE_EC));
 }
 
+#ifdef CONFIG_SMP
+unsigned long cheetah_tune_scheduling(void)
+{
+	unsigned long tick1, tick2, raw;
+
+	__asm__ __volatile__("rd %%tick, %0" : "=r" (tick1));
+	cheetah_flush_ecache();
+	__asm__ __volatile__("rd %%tick, %0" : "=r" (tick2));
+
+	raw = (tick2 - tick1);
+
+	return (raw - (raw >> 2));
+}
+#endif
+
 /* Unfortunately, the diagnostic access to the I-cache tags we need to
  * use to clear the thing interferes with I-cache coherency transactions.
  *
@@ -1660,13 +1675,16 @@
 	}
 }
 
+/* Only invoked on boot processor. */
 void trap_init(void)
 {
-	/* Attach to the address space of init_task. */
+	/* Attach to the address space of init_task.  On SMP we
+	 * do this in smp.c:smp_callin for other cpus.
+	 */
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
 
-	/* NOTE: Other cpus have this done as they are started
-	 *       up on SMP.
-	 */
+#ifdef CONFIG_SMP
+	current->cpu = hard_smp_processor_id();
+#endif
 }
diff -urN linux-2.4.19-pre10-ac2/arch/sparc64/solaris/misc.c linux/arch/sparc64/solaris/misc.c
--- linux-2.4.19-pre10-ac2/arch/sparc64/solaris/misc.c	Thu Jun  6 08:55:15 2002
+++ linux/arch/sparc64/solaris/misc.c	Thu Jun 13 12:09:51 2002
@@ -15,6 +15,7 @@
 #include <linux/mman.h>
 #include <linux/file.h>
 #include <linux/timex.h>
+#include <linux/major.h>
 
 #include <asm/uaccess.h>
 #include <asm/string.h>
diff -urN linux-2.4.19-pre10-ac2/include/asm-sparc64/bitops.h linux/include/asm-sparc64/bitops.h
--- linux-2.4.19-pre10-ac2/include/asm-sparc64/bitops.h	Thu Jun  6 08:54:13 2002
+++ linux/include/asm-sparc64/bitops.h	Thu Jun 13 12:09:51 2002
@@ -1,4 +1,4 @@
-/* $Id: bitops.h,v 1.38 2001/11/19 18:36:34 davem Exp $
+/* $Id: bitops.h,v 1.39 2002/01/30 01:40:00 davem Exp $
  * bitops.h: Bit string operations on the V9.
  *
  * Copyright 1996, 1997 David S. Miller (davem@caip.rutgers.edu)
@@ -7,6 +7,7 @@
 #ifndef _SPARC64_BITOPS_H
 #define _SPARC64_BITOPS_H
 
+#include <linux/compiler.h>
 #include <asm/byteorder.h>
 
 extern long ___test_and_set_bit(unsigned long nr, volatile void *addr);
@@ -64,66 +65,71 @@
 #define smp_mb__before_clear_bit()	do { } while(0)
 #define smp_mb__after_clear_bit()	do { } while(0)
 
-extern __inline__ int test_bit(int nr, __const__ void *addr)
+static __inline__ int test_bit(int nr, __const__ void *addr)
 {
 	return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63))) != 0UL;
 }
 
 /* The easy/cheese version for now. */
-extern __inline__ unsigned long ffz(unsigned long word)
+static __inline__ unsigned long ffz(unsigned long word)
 {
 	unsigned long result;
 
-#ifdef ULTRA_HAS_POPULATION_COUNT	/* Thanks for nothing Sun... */
-	__asm__ __volatile__(
-"	brz,pn	%0, 1f\n"
-"	 neg	%0, %%g1\n"
-"	xnor	%0, %%g1, %%g2\n"
-"	popc	%%g2, %0\n"
-"1:	" : "=&r" (result)
-	  : "0" (word)
-	  : "g1", "g2");
-#else
-#if 1 /* def EASY_CHEESE_VERSION */
 	result = 0;
 	while(word & 1) {
 		result++;
 		word >>= 1;
 	}
-#else
-	unsigned long tmp;
+	return result;
+}
 
-	result = 0;	
-	tmp = ~word & -~word;
-	if (!(unsigned)tmp) {
-		tmp >>= 32;
-		result = 32;
-	}
-	if (!(unsigned short)tmp) {
-		tmp >>= 16;
-		result += 16;
-	}
-	if (!(unsigned char)tmp) {
-		tmp >>= 8;
-		result += 8;
-	}
-	if (tmp & 0xf0) result += 4;
-	if (tmp & 0xcc) result += 2;
-	if (tmp & 0xaa) result ++;
-#endif
-#endif
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static __inline__ unsigned long __ffs(unsigned long word)
+{
+	unsigned long result = 0;
+
+	while (!(word & 1UL)) {
+		result++;
+		word >>= 1;
+	}
 	return result;
 }
 
 #ifdef __KERNEL__
 
 /*
+ * Every architecture must define this function. It's the fastest
+ * way of searching a 140-bit bitmap where the first 100 bits are
+ * unlikely to be set. It's guaranteed that at least one of the 140
+ * bits is cleared.
+ */
+static inline int _sched_find_first_bit(unsigned long *b)
+{
+	if (unlikely(b[0]))
+		return __ffs(b[0]);
+	if (unlikely(((unsigned int)b[1])))
+		return __ffs(b[1]) + 64;
+	if (b[1] >> 32)
+		return __ffs(b[1] >> 32) + 96;
+	return __ffs(b[2]) + 128;
+}
+
+/*
  * ffs: find first bit set. This is defined the same way as
  * the libc and compiler builtin ffs routines, therefore
  * differs in spirit from the above ffz (man ffs).
  */
-
-#define ffs(x) generic_ffs(x)
+static __inline__ int ffs(int x)
+{
+	if (!x)
+		return 0;
+	return __ffs((unsigned long)x);
+}
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
@@ -132,7 +138,7 @@
 
 #ifdef ULTRA_HAS_POPULATION_COUNT
 
-extern __inline__ unsigned int hweight32(unsigned int w)
+static __inline__ unsigned int hweight32(unsigned int w)
 {
 	unsigned int res;
 
@@ -140,7 +146,7 @@
 	return res;
 }
 
-extern __inline__ unsigned int hweight16(unsigned int w)
+static __inline__ unsigned int hweight16(unsigned int w)
 {
 	unsigned int res;
 
@@ -148,7 +154,7 @@
 	return res;
 }
 
-extern __inline__ unsigned int hweight8(unsigned int w)
+static __inline__ unsigned int hweight8(unsigned int w)
 {
 	unsigned int res;
 
@@ -165,12 +171,67 @@
 #endif
 #endif /* __KERNEL__ */
 
+/**
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+static __inline__ unsigned long find_next_bit(void *addr, unsigned long size, unsigned long offset)
+{
+	unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
+	unsigned long result = offset & ~63UL;
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset &= 63UL;
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);
+		if (size < 64)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= 64;
+		result += 64;
+	}
+	while (size & ~63UL) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += 64;
+		size -= 64;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (64 - size));
+	if (tmp == 0UL)        /* Are any bits set? */
+		return result + size; /* Nope. */
+found_middle:
+	return result + __ffs(tmp);
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ */
+#define find_first_bit(addr, size) \
+	find_next_bit((addr), (size), 0)
+
 /* find_next_zero_bit() finds the first zero bit in a bit string of length
  * 'size' bits, starting the search at bit 'offset'. This is largely based
  * on Linus's ALPHA routines, which are pretty portable BTW.
  */
 
-extern __inline__ unsigned long find_next_zero_bit(void *addr, unsigned long size, unsigned long offset)
+static __inline__ unsigned long find_next_zero_bit(void *addr, unsigned long size, unsigned long offset)
 {
 	unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
 	unsigned long result = offset & ~63UL;
@@ -219,7 +280,7 @@
 #define set_le_bit(nr,addr)		((void)___test_and_set_le_bit(nr,addr))
 #define clear_le_bit(nr,addr)		((void)___test_and_clear_le_bit(nr,addr))
 
-extern __inline__ int test_le_bit(int nr, __const__ void * addr)
+static __inline__ int test_le_bit(int nr, __const__ void * addr)
 {
 	int			mask;
 	__const__ unsigned char	*ADDR = (__const__ unsigned char *) addr;
@@ -232,7 +293,7 @@
 #define find_first_zero_le_bit(addr, size) \
         find_next_zero_le_bit((addr), (size), 0)
 
-extern __inline__ unsigned long find_next_zero_le_bit(void *addr, unsigned long size, unsigned long offset)
+static __inline__ unsigned long find_next_zero_le_bit(void *addr, unsigned long size, unsigned long offset)
 {
 	unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
 	unsigned long result = offset & ~63UL;
diff -urN linux-2.4.19-pre10-ac2/include/asm-sparc64/smp.h linux/include/asm-sparc64/smp.h
--- linux-2.4.19-pre10-ac2/include/asm-sparc64/smp.h	Thu Jun  6 08:54:13 2002
+++ linux/include/asm-sparc64/smp.h	Thu Jun 13 12:09:51 2002
@@ -103,7 +103,7 @@
 	}
 }
 
-#define smp_processor_id() (current->processor)
+#define smp_processor_id() (current->cpu)
 
 /* This needn't do anything as we do not sleep the cpu
  * inside of the idler task, so an interrupt is not needed
@@ -127,8 +127,6 @@
 
 #endif /* !(__ASSEMBLY__) */
 
-#define PROC_CHANGE_PENALTY	20
-
 #endif /* !(CONFIG_SMP) */
 
 #define NO_PROC_ID		0xFF
diff -urN linux-2.4.19-pre10-ac2/include/linux/kbd_kern.h linux/include/linux/kbd_kern.h
--- linux-2.4.19-pre10-ac2/include/linux/kbd_kern.h	Thu Jun  6 08:54:05 2002
+++ linux/include/linux/kbd_kern.h	Thu Jun 13 12:09:51 2002
@@ -1,6 +1,7 @@
 #ifndef _KBD_KERN_H
 #define _KBD_KERN_H
 
+#include <linux/tty.h>
 #include <linux/interrupt.h>
 #include <linux/keyboard.h>
 
diff -urN linux-2.4.19-pre10-ac2/kernel/sched.c linux/kernel/sched.c
--- linux-2.4.19-pre10-ac2/kernel/sched.c	Thu Jun  6 11:16:04 2002
+++ linux/kernel/sched.c	Thu Jun 13 12:09:51 2002
@@ -392,8 +392,15 @@
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next, smp_processor_id());
-	} else
+	} else {
 		switch_mm(oldmm, mm, next, smp_processor_id());
+#ifdef CONFIG_SPARC64
+		if (oldmm == mm) {
+			load_secondary_context(mm);
+			reload_tlbmiss_state(next, mm);
+		}
+#endif
+	}
 
 	if (unlikely(!prev->mm)) {
 		prev->active_mm = NULL;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-13 19:21 [PATCH] 2.4-ac: sparc64 support for O(1) scheduler Robert Love
@ 2002-06-14  4:25 ` David S. Miller
  2002-06-14 17:32   ` Robert Love
  2002-06-14 22:00   ` Thomas Duffy
  0 siblings, 2 replies; 36+ messages in thread
From: David S. Miller @ 2002-06-14  4:25 UTC (permalink / raw)
  To: rml; +Cc: alan, linux-kernel

   From: Robert Love <rml@mvista.com>
   Date: 13 Jun 2002 12:21:58 -0700
   
   Patch is against 2.4.19-pre10-ac2, please apply.
   
Ummm what is with all of those switch_mm() hacks?  Is this an attempt
to work around the locking problems?  Please don't do that as it is
going to kill performance and having ifdef sparc64 sched.c changes is
ugly to say the least.

Ingo posted the correct fix to the locking problem with the patch
he posted the other day, that is what should go into the -ac patches.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-14  4:25 ` David S. Miller
@ 2002-06-14 17:32   ` Robert Love
  2002-06-15 13:22     ` David S. Miller
  2002-06-16 15:19     ` Ingo Molnar
  2002-06-14 22:00   ` Thomas Duffy
  1 sibling, 2 replies; 36+ messages in thread
From: Robert Love @ 2002-06-14 17:32 UTC (permalink / raw)
  To: David S. Miller; +Cc: alan, linux-kernel

On Thu, 2002-06-13 at 21:25, David S. Miller wrote:

> Ummm what is with all of those switch_mm() hacks?  Is this an attempt
> to work around the locking problems?  Please don't do that as it is
> going to kill performance and having ifdef sparc64 sched.c changes is
> ugly to say the least.
>
> Ingo posted the correct fix to the locking problem with the patch
> he posted the other day, that is what should go into the -ac patches.

I am explicitly refraining from sending Alan any code that is not
well-tested in 2.5 and my machines first.  As Ingo's new switch_mm()
bits are not even in 2.5 yet, I plan to wait a bit before sending
them... (I am currently putting together all the scheduler bits we have
been working on for a 2.4-ac patch...)

If you like, Alan can hold off on this and take it when the appropriate
patches are in.

	Robert Love



^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-14  4:25 ` David S. Miller
  2002-06-14 17:32   ` Robert Love
@ 2002-06-14 22:00   ` Thomas Duffy
  2002-06-15 13:35     ` David S. Miller
  1 sibling, 1 reply; 36+ messages in thread
From: Thomas Duffy @ 2002-06-14 22:00 UTC (permalink / raw)
  To: linux-kernel

begin  David S. Miller quotation on Thu, 13 Jun 2002 21:32:11 -0700:

>    From: Robert Love <rml@mvista.com>
>    Date: 13 Jun 2002 12:21:58 -0700
>    
>    Patch is against 2.4.19-pre10-ac2, please apply.
>    
> Ummm what is with all of those switch_mm() hacks?  Is this an attempt to
> work around the locking problems?  Please don't do that as it is going
> to kill performance and having ifdef sparc64 sched.c changes is ugly to
> say the least.
> 
> Ingo posted the correct fix to the locking problem with the patch he
> posted the other day, that is what should go into the -ac patches.

This part of the patch (the change to kernel/sched.c) can be safely	
removed without making o1 stop working.

This hack (conservatively) fixes an issue where on bootup, the machine
would get into a page fault loop and hang.  This only happens a very
small percentage of the time.  I will investigate whether the patch
Ingo put out fixes this issue.

-tduffy

-- 
He who receives an idea from me, receives instruction himself without
lessening mine; as he who lights his taper at mine, receives light
without darkening me.                      -- Thomas Jefferson

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-14 17:32   ` Robert Love
@ 2002-06-15 13:22     ` David S. Miller
  2002-06-20 19:42       ` Alan Cox
  2002-06-16 15:19     ` Ingo Molnar
  1 sibling, 1 reply; 36+ messages in thread
From: David S. Miller @ 2002-06-15 13:22 UTC (permalink / raw)
  To: rml; +Cc: alan, linux-kernel

   From: Robert Love <rml@mvista.com>
   Date: 14 Jun 2002 10:32:32 -0700
   
   I am explicitly refraining from sending Alan any code that is not
   well-tested in 2.5 and my machines first.  As Ingo's new switch_mm()
   bits are not even in 2.5 yet, I plan to wait a bit before sending
   them... (I am currently putting together all the scheduler bits we have
   been working on for a 2.4-ac patch...)

Your sparc64 kernel/sched.c bits have zero testing in any kernel.
What point are you trying to make?  It disables a very important
optimization on SMP sparc64.  It's simply unacceptable.

Ingo's change which deletes the frozen locking bits has to be
installed with the patches which allow sparc64 to continue working
without the deadlock bug, they cannot be added seperately.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-14 22:00   ` Thomas Duffy
@ 2002-06-15 13:35     ` David S. Miller
  0 siblings, 0 replies; 36+ messages in thread
From: David S. Miller @ 2002-06-15 13:35 UTC (permalink / raw)
  To: linux-kernel, Thomas.Duffy.99

   From: "Thomas Duffy" <Thomas.Duffy.99@alumni.brown.edu>
   Date: Fri, 14 Jun 2002 15:00:03 -0700

   This part of the patch (the change to kernel/sched.c) can be safely	
   removed without making o1 stop working.
   
If Ingo's changes to remove the "frozen" stuff is installed, the
kernel is going to hang when you hit the deadlock condition on
sparc64.  Unless I'm mistaken, the "frozen" stuff had been removed.

You are going to hit this deadlock which I mentioned in another thread
on this list when Ingo mentioned that he had removed the "frozen"
locking I added because it causes other problems.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-14 17:32   ` Robert Love
  2002-06-15 13:22     ` David S. Miller
@ 2002-06-16 15:19     ` Ingo Molnar
  2002-06-16 17:00       ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
  2002-06-16 23:45       ` [PATCH] 2.4-ac: sparc64 support for O(1) scheduler Robert Love
  1 sibling, 2 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-16 15:19 UTC (permalink / raw)
  To: Robert Love; +Cc: David S. Miller, alan, linux-kernel


On 14 Jun 2002, Robert Love wrote:

> > Ummm what is with all of those switch_mm() hacks?  Is this an attempt
> > to work around the locking problems?  Please don't do that as it is
> > going to kill performance and having ifdef sparc64 sched.c changes is
> > ugly to say the least.
> >
> > Ingo posted the correct fix to the locking problem with the patch
> > he posted the other day, that is what should go into the -ac patches.
> 
> I am explicitly refraining from sending Alan any code that is not
> well-tested in 2.5 and my machines first.  As Ingo's new switch_mm()
> bits are not even in 2.5 yet, [...]

Linus applied them already, they will be in 2.5.22. They fix real bugs and
i've seen no problems on my testboxes. Those bits are a must for SMP x86
and Sparc64 as well, there is absolutely no reason to selectively delay
their backmerge. Besides the last task_rq_lock() optimization which got
undone in 2.5 already, all the recent scheduler bits i posted are needed.

	Ingo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-16 15:19     ` Ingo Molnar
@ 2002-06-16 17:00       ` Ingo Molnar
  2002-06-16 23:57         ` Robert Love
  2002-06-16 23:45       ` [PATCH] 2.4-ac: sparc64 support for O(1) scheduler Robert Love
  1 sibling, 1 reply; 36+ messages in thread
From: Ingo Molnar @ 2002-06-16 17:00 UTC (permalink / raw)
  To: Alan Cox; +Cc: David S. Miller, rml, linux-kernel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1228 bytes --]


the attached patch, sched-2.4.19-pre10-ac2-A3, is a backport of the
current 2.5 O(1) scheduler, against 2.4.19-pre10-ac2. The patch includes
all the recent fixes. (It should not break any architecture that was
working before on -ac. The most affected architecture is Sparc64, i added
the bits without testing them. David?)

The patch can also be downloaded from:

    http://redhat.com/~mingo/O(1)-scheduler/sched-2.4.19-pre10-ac2-A3

Changes relative to 2.4.19-pre10-ac2:

Bugfixes:

 - rq-frozen fixes, which closes SMP races on x86 and Sparc64 as well.

 - O(1) scheduling sched_yield() fixes: do not starve CPU-intensive 
   processes.

 - migration bugfix, do not fast-migrate the task incorrectly if the task
   is in the middle of load_balance().

 - sync wakeup reintroduction - this should fix the pipe latency problems
   observed.

Feature backports:

 - nr_uninterruptible optimization. (This is a fairly straightforward
   and risk-less feature, and since it also made the backport easier, i
   included it.)

 - sched_setaffinity() & sched_getaffinity() syscalls on x86.

plus identity changes, comment updates, to bring sched.c in line with the
2.5 version.

the patch was tested on x86 UP and SMP boxes.

	Ingo

[-- Attachment #2: Type: TEXT/PLAIN, Size: 22784 bytes --]

--- linux/fs/pipe.c.orig	Sun Jun 16 18:46:45 2002
+++ linux/fs/pipe.c	Sun Jun 16 18:47:14 2002
@@ -115,7 +115,7 @@
 		 * writers synchronously that there is more
 		 * room.
 		 */
-		wake_up_interruptible(PIPE_WAIT(*inode));
+		wake_up_interruptible_sync(PIPE_WAIT(*inode));
 		if (!PIPE_EMPTY(*inode))
 			BUG();
 		goto do_more_read;
@@ -215,7 +215,7 @@
 			 * is going to give up this CPU, so it doesnt have
 			 * to do idle reschedules.
 			 */
-			wake_up_interruptible(PIPE_WAIT(*inode));
+			wake_up_interruptible_sync(PIPE_WAIT(*inode));
 			PIPE_WAITING_WRITERS(*inode)++;
 			pipe_wait(inode);
 			PIPE_WAITING_WRITERS(*inode)--;
--- linux/kernel/sched.c.orig	Sun Jun 16 17:34:06 2002
+++ linux/kernel/sched.c	Sun Jun 16 18:23:44 2002
@@ -1,5 +1,5 @@
 /*
- *  linux/kernel/sched.c
+ *  kernel/sched.c
  *
  *  Kernel scheduler and related syscalls
  *
@@ -13,16 +13,18 @@
  *  		hybrid priority-list and round-robin design with
  *  		an array-switch method of distributing timeslices
  *  		and per-CPU runqueues.  Additional code by Davide
- *  		Libenzi, Robert Love, and Rusty Russel.
+ *  		Libenzi, Robert Love, and Rusty Russell.
  */
 
 #include <linux/mm.h>
 #include <linux/nmi.h>
 #include <linux/init.h>
 #include <asm/uaccess.h>
+#include <linux/highmem.h>
 #include <linux/smp_lock.h>
-#include <linux/interrupt.h>
 #include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
 #include <linux/kernel_stat.h>
 
 /*
@@ -133,8 +135,8 @@
  */
 struct runqueue {
 	spinlock_t lock;
-	spinlock_t frozen;
 	unsigned long nr_running, nr_switches, expired_timestamp;
+	signed long nr_uninterruptible;
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_nr_running[NR_CPUS];
@@ -150,13 +152,29 @@
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define rt_task(p)		((p)->prio < MAX_RT_PRIO)
 
+/*
+ * Default context-switch locking:
+ */
+#ifndef prepare_arch_schedule
+# define prepare_arch_schedule(prev)	do { } while(0)
+# define finish_arch_schedule(prev)	do { } while(0)
+# define prepare_arch_switch(rq)	do { } while(0)
+# define finish_arch_switch(rq)		spin_unlock_irq(&(rq)->lock)
+#endif
+
+/*
+ * task_rq_lock - lock the runqueue a given task resides on and disable
+ * interrupts.  Note the ordering: we can safely lookup the task_rq without
+ * explicitly disabling preemption.
+ */
 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 {
 	struct runqueue *rq;
 
 repeat_lock_task:
+	local_irq_save(*flags);
 	rq = task_rq(p);
-	spin_lock_irqsave(&rq->lock, *flags);
+	spin_lock(&rq->lock);
 	if (unlikely(rq != task_rq(p))) {
 		spin_unlock_irqrestore(&rq->lock, *flags);
 		goto repeat_lock_task;
@@ -170,6 +188,23 @@
 }
 
 /*
+ * rq_lock - lock a given runqueue and disable interrupts.
+ */
+static inline runqueue_t *rq_lock(runqueue_t *rq)
+{
+	local_irq_disable();
+	rq = this_rq();
+	spin_lock(&rq->lock);
+	return rq;
+}
+
+static inline void rq_unlock(runqueue_t *rq)
+{
+	spin_unlock(&rq->lock);
+	local_irq_enable();
+}
+
+/*
  * Adding/removing a task to/from a priority array:
  */
 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
@@ -239,12 +274,15 @@
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
 	rq->nr_running--;
+	if (p->state == TASK_UNINTERRUPTIBLE)
+		rq->nr_uninterruptible++;
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
 
 static inline void resched_task(task_t *p)
 {
+#ifdef CONFIG_SMP
 	int need_resched;
 
 	need_resched = p->need_resched;
@@ -252,6 +290,9 @@
 	set_tsk_need_resched(p);
 	if (!need_resched && (p->cpu != smp_processor_id()))
 		smp_send_reschedule(p->cpu);
+#else
+	set_tsk_need_resched(p);
+#endif
 }
 
 #ifdef CONFIG_SMP
@@ -267,9 +308,9 @@
 
 repeat:
 	rq = task_rq(p);
-	while (unlikely(rq->curr == p)) {
+	if (unlikely(rq->curr == p)) {
 		cpu_relax();
-		barrier();
+		goto repeat;
 	}
 	rq = task_rq_lock(p, &flags);
 	if (unlikely(rq->curr == p)) {
@@ -303,35 +344,50 @@
  * "current->state = TASK_RUNNING" to mark yourself runnable
  * without the overhead of this.
  */
-static int try_to_wake_up(task_t * p)
+static int try_to_wake_up(task_t * p, int sync)
 {
 	unsigned long flags;
 	int success = 0;
+	long old_state;
 	runqueue_t *rq;
 
+repeat_lock_task:
 	rq = task_rq_lock(p, &flags);
-	p->state = TASK_RUNNING;
+	old_state = p->state;
 	if (!p->array) {
+		if (unlikely(sync && (rq->curr != p))) {
+			if (p->cpu != smp_processor_id()) {
+				p->cpu = smp_processor_id();
+				task_rq_unlock(rq, &flags);
+				goto repeat_lock_task;
+			}
+		}
+		if (old_state == TASK_UNINTERRUPTIBLE)
+			rq->nr_uninterruptible--;
 		activate_task(p, rq);
+		/*
+		 * If sync is set, a resched_task() is a NOOP
+		 */
 		if (p->prio < rq->curr->prio)
 			resched_task(rq->curr);
 		success = 1;
 	}
+	p->state = TASK_RUNNING;
 	task_rq_unlock(rq, &flags);
+
 	return success;
 }
 
 int wake_up_process(task_t * p)
 {
-	return try_to_wake_up(p);
+	return try_to_wake_up(p, 0);
 }
 
 void wake_up_forked_process(task_t * p)
 {
 	runqueue_t *rq;
 
-	rq = this_rq();
-	spin_lock_irq(&rq->lock);
+	rq = rq_lock(rq);
 
 	p->state = TASK_RUNNING;
 	if (!rt_task(p)) {
@@ -346,7 +402,8 @@
 	}
 	p->cpu = smp_processor_id();
 	activate_task(p, rq);
-	spin_unlock_irq(&rq->lock);
+
+	rq_unlock(rq);
 }
 
 /*
@@ -377,17 +434,16 @@
 #if CONFIG_SMP
 asmlinkage void schedule_tail(task_t *prev)
 {
-	spin_unlock_irq(&this_rq()->frozen);
+	finish_arch_switch(this_rq());
+	finish_arch_schedule(prev);
 }
 #endif
 
-static inline void context_switch(task_t *prev, task_t *next)
+static inline task_t * context_switch(task_t *prev, task_t *next)
 {
 	struct mm_struct *mm = next->mm;
 	struct mm_struct *oldmm = prev->active_mm;
 
-	prepare_to_switch();
-
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
@@ -402,6 +458,8 @@
 
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
+
+	return prev;
 }
 
 unsigned long nr_running(void)
@@ -414,6 +472,16 @@
 	return sum;
 }
 
+unsigned long nr_uninterruptible(void)
+{
+	unsigned long i, sum = 0;
+
+	for (i = 0; i < smp_num_cpus; i++)
+		sum += cpu_rq(cpu_logical_map(i))->nr_uninterruptible;
+
+	return sum;
+}
+
 unsigned long nr_context_switches(void)
 {
 	unsigned long i, sum = 0;
@@ -569,7 +637,7 @@
 #define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
 	((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&	\
 		((p) != (rq)->curr) &&					\
-			((p)->cpus_allowed & (1 << (this_cpu))))
+			((p)->cpus_allowed & (1UL << (this_cpu))))
 
 	if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
 		curr = curr->next;
@@ -726,13 +794,14 @@
 	list_t *queue;
 	int idx;
 
-	BUG_ON(in_interrupt());
-
+	if (unlikely(in_interrupt()))
+		BUG();
 need_resched:
 	prev = current;
 	rq = this_rq();
 
 	release_kernel_lock(prev, smp_processor_id());
+	prepare_arch_schedule(prev);
 	prev->sleep_timestamp = jiffies;
 	spin_lock_irq(&rq->lock);
 
@@ -783,26 +852,19 @@
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		rq->curr = next;
-		spin_lock(&rq->frozen);
-		spin_unlock(&rq->lock);
-		
-		context_switch(prev, next);
-		/*
-		 * The runqueue pointer might be from another CPU
-		 * if the new task was last running on a different
-		 * CPU - thus re-load it.
-		 */
-		mb();
+	
+		prepare_arch_switch(rq);
+		prev = context_switch(prev, next);
+		barrier();
 		rq = this_rq();
-		spin_unlock_irq(&rq->frozen);
-	} else {
+		finish_arch_switch(rq);
+	} else
 		spin_unlock_irq(&rq->lock);
-	}
+	finish_arch_schedule(prev);
 
 	reacquire_kernel_lock(current);
 	if (need_resched())
 		goto need_resched;
-	return;
 }
 
 /*
@@ -814,8 +876,7 @@
  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-				    int nr_exclusive)
+static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync)
 {
 	struct list_head *tmp;
 	unsigned int state;
@@ -826,7 +887,7 @@
 		curr = list_entry(tmp, wait_queue_t, task_list);
 		p = curr->task;
 		state = p->state;
-		if ((state & mode) && try_to_wake_up(p) &&
+		if ((state & mode) && try_to_wake_up(p, sync) &&
 			((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive))
 				break;
 	}
@@ -839,24 +900,43 @@
 	if (unlikely(!q))
 		return;
 
-	wq_read_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive);
-	wq_read_unlock_irqrestore(&q->lock, flags);
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, 0);
+	spin_unlock_irqrestore(&q->lock, flags);
 }
 
+#if CONFIG_SMP
+
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	unsigned long flags;
+
+	if (unlikely(!q))
+		return;
+
+	spin_lock_irqsave(&q->lock, flags);
+	if (likely(nr_exclusive))
+		__wake_up_common(q, mode, nr_exclusive, 1);
+	else
+		__wake_up_common(q, mode, nr_exclusive, 0);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+
+#endif
+ 
 void complete(struct completion *x)
 {
 	unsigned long flags;
 
-	wq_write_lock_irqsave(&x->wait.lock, flags);
+	spin_lock_irqsave(&x->wait.lock, flags);
 	x->done++;
-	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1);
-	wq_write_unlock_irqrestore(&x->wait.lock, flags);
+	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
 
 void wait_for_completion(struct completion *x)
 {
-	wq_write_lock_irq(&x->wait.lock);
+	spin_lock_irq(&x->wait.lock);
 	if (!x->done) {
 		DECLARE_WAITQUEUE(wait, current);
 
@@ -864,14 +944,14 @@
 		__add_wait_queue_tail(&x->wait, &wait);
 		do {
 			__set_current_state(TASK_UNINTERRUPTIBLE);
-			wq_write_unlock_irq(&x->wait.lock);
+			spin_unlock_irq(&x->wait.lock);
 			schedule();
-			wq_write_lock_irq(&x->wait.lock);
+			spin_lock_irq(&x->wait.lock);
 		} while (!x->done);
 		__remove_wait_queue(&x->wait, &wait);
 	}
 	x->done--;
-	wq_write_unlock_irq(&x->wait.lock);
+	spin_unlock_irq(&x->wait.lock);
 }
 
 #define	SLEEP_ON_VAR				\
@@ -880,14 +960,14 @@
 	init_waitqueue_entry(&wait, current);
 
 #define	SLEEP_ON_HEAD					\
-	wq_write_lock_irqsave(&q->lock,flags);		\
+	spin_lock_irqsave(&q->lock,flags);		\
 	__add_wait_queue(q, &wait);			\
-	wq_write_unlock(&q->lock);
+	spin_unlock(&q->lock);
 
 #define	SLEEP_ON_TAIL						\
-	wq_write_lock_irq(&q->lock);				\
+	spin_lock_irq(&q->lock);				\
 	__remove_wait_queue(q, &wait);				\
-	wq_write_unlock_irqrestore(&q->lock,flags);
+	spin_unlock_irqrestore(&q->lock, flags);
 
 void interruptible_sleep_on(wait_queue_head_t *q)
 {
@@ -1027,6 +1107,11 @@
 	return TASK_NICE(p);
 }
 
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
 static inline task_t *find_process_by_pid(pid_t pid)
 {
 	return pid ? find_task_by_pid(pid) : current;
@@ -1077,7 +1162,7 @@
 
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
-	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
+	 * 1..MAX_USER_RT_PRIO, valid priority for SCHED_OTHER is 0.
 	 */
 	retval = -EINVAL;
 	if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
@@ -1177,28 +1262,127 @@
 	return retval;
 }
 
-asmlinkage long sys_sched_yield(void)
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+asmlinkage int sys_sched_setaffinity(pid_t pid, unsigned int len,
+				      unsigned long *user_mask_ptr)
 {
-	runqueue_t *rq;
-	prio_array_t *array;
+	unsigned long new_mask;
+	task_t *p;
+	int retval;
 
-	rq = this_rq();
+	if (len < sizeof(new_mask))
+		return -EINVAL;
+
+	if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
+		return -EFAULT;
+
+	new_mask &= cpu_online_map;
+	if (!new_mask)
+		return -EINVAL;
+
+	read_lock(&tasklist_lock);
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
 
 	/*
-	 * Decrease the yielding task's priority by one, to avoid
-	 * livelocks. This priority loss is temporary, it's recovered
-	 * once the current timeslice expires.
-	 *
-	 * If priority is already MAX_PRIO-1 then we still
-	 * roundrobin the task within the runlist.
+	 * It is not safe to call set_cpus_allowed with the
+	 * tasklist_lock held.  We will bump the task_struct's
+	 * usage count and then drop tasklist_lock.
 	 */
-	spin_lock_irq(&rq->lock);
-	array = current->array;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	retval = -EPERM;
+	if ((current->euid != p->euid) && (current->euid != p->uid) &&
+			!capable(CAP_SYS_NICE))
+		goto out_unlock;
+
+	retval = 0;
+	set_cpus_allowed(p, new_mask);
+
+out_unlock:
+	free_task_struct(p);
+	return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ */
+asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len,
+				      unsigned long *user_mask_ptr)
+{
+	unsigned long mask;
+	unsigned int real_len;
+	task_t *p;
+	int retval;
+
+	real_len = sizeof(mask);
+
+	if (len < real_len)
+		return -EINVAL;
+
+	read_lock(&tasklist_lock);
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = 0;
+	mask = p->cpus_allowed & cpu_online_map;
+
+out_unlock:
+	read_unlock(&tasklist_lock);
+	if (retval)
+		return retval;
+	if (copy_to_user(user_mask_ptr, &mask, real_len))
+		return -EFAULT;
+	return real_len;
+}
+
+asmlinkage long sys_sched_yield(void)
+{
+	runqueue_t *rq = rq_lock(rq);
+	prio_array_t *array = current->array;
+
 	/*
-	 * If the task has reached maximum priority (or is a RT task)
-	 * then just requeue the task to the end of the runqueue:
+	 * There are three levels of how a yielding task will give up
+	 * the current CPU:
+	 *
+	 *  #1 - it decreases its priority by one. This priority loss is
+	 *       temporary, it's recovered once the current timeslice
+	 *       expires.
+	 *
+	 *  #2 - once it has reached the lowest priority level,
+	 *       it will give up timeslices one by one. (We do not
+	 *       want to give them up all at once, it's gradual,
+	 *       to protect the casual yield()er.)
+	 *
+	 *  #3 - once all timeslices are gone we put the process into
+	 *       the expired array.
+	 *
+	 *  (special rule: RT tasks do not lose any priority, they just
+	 *  roundrobin on their current priority level.)
 	 */
-	if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
+	if (likely(current->prio == MAX_PRIO-1)) {
+		if (current->time_slice <= 1) {
+			dequeue_task(current, rq->active);
+			enqueue_task(current, rq->expired);
+		} else
+			current->time_slice--;
+	} else if (unlikely(rt_task(current))) {
 		list_del(&current->run_list);
 		list_add_tail(&current->run_list, array->queue + current->prio);
 	} else {
@@ -1396,7 +1580,7 @@
 		spin_unlock(&rq2->lock);
 }
 
-void init_idle(task_t *idle, int cpu)
+void __init init_idle(task_t *idle, int cpu)
 {
 	runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(idle->cpu);
 	unsigned long flags;
@@ -1427,13 +1611,12 @@
 	int i, j, k;
 
 	for (i = 0; i < NR_CPUS; i++) {
-		runqueue_t *rq = cpu_rq(i);
 		prio_array_t *array;
 
+		rq = cpu_rq(i);
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		spin_lock_init(&rq->lock);
-		spin_lock_init(&rq->frozen);
 		INIT_LIST_HEAD(&rq->migration_queue);
 
 		for (j = 0; j < 2; j++) {
@@ -1497,8 +1680,8 @@
  * is removed from the allowed bitmask.
  *
  * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely.  No
- * spinlocks can be held.
+ * task must not exit() & deallocate itself prematurely.  The
+ * call is not atomic; no spinlocks may be held.
  */
 void set_cpus_allowed(task_t *p, unsigned long new_mask)
 {
@@ -1518,19 +1701,17 @@
 	 */
 	if (new_mask & (1UL << p->cpu)) {
 		task_rq_unlock(rq, &flags);
-		return;
+		goto out;
 	}
-
 	/*
-	 * If the task is not on a runqueue, then it is safe to
-	 * simply update the task's cpu field.
+	 * If the task is not on a runqueue (and not running), then
+	 * it is sufficient to simply update the task's cpu field.
 	 */
-	if (!p->array) {
+	if (!p->array && (p != rq->curr)) {
 		p->cpu = __ffs(p->cpus_allowed);
 		task_rq_unlock(rq, &flags);
-		return;
+		goto out;
 	}
-
 	init_MUTEX_LOCKED(&req.sem);
 	req.task = p;
 	list_add(&req.list, &rq->migration_queue);
@@ -1538,6 +1719,7 @@
 	wake_up_process(rq->migration_thread);
 
 	down(&req.sem);
+out:
 }
 
 static int migration_thread(void * bind_cpu)
@@ -1550,17 +1732,16 @@
 	daemonize();
 	sigfillset(&current->blocked);
 	set_fs(KERNEL_DS);
-
 	/*
-	 * The first migration thread is started on CPU #0. This one can
-	 * migrate the other migration threads to their destination CPUs.
+	 * The first migration thread is started on CPU #0. This one can migrate
+	 * the other migration threads to their destination CPUs.
 	 */
 	if (cpu != 0) {
 		while (!cpu_rq(cpu_logical_map(0))->migration_thread)
 			yield();
 		set_cpus_allowed(current, 1UL << cpu);
 	}
-	printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id());
+	printk("migration_task %d on cpu=%d\n",cpu,smp_processor_id());
 	ret = setscheduler(0, SCHED_FIFO, &param);
 
 	rq = this_rq();
@@ -1632,5 +1813,4 @@
 		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
 			schedule_timeout(2);
 }
-
-#endif /* CONFIG_SMP */
+#endif
--- linux/kernel/timer.c.orig	Sun Jun 16 17:43:38 2002
+++ linux/kernel/timer.c	Sun Jun 16 17:43:50 2002
@@ -608,17 +608,7 @@
  */
 static unsigned long count_active_tasks(void)
 {
-	struct task_struct *p;
-	unsigned long nr = 0;
-
-	read_lock(&tasklist_lock);
-	for_each_task(p) {
-		if ((p->state == TASK_RUNNING ||
-		     (p->state & TASK_UNINTERRUPTIBLE)))
-			nr += FIXED_1;
-	}
-	read_unlock(&tasklist_lock);
-	return nr;
+        return (nr_running() + nr_uninterruptible()) * FIXED_1;
 }
 
 /*
--- linux/include/linux/sched.h.orig	Sun Jun 16 17:44:13 2002
+++ linux/include/linux/sched.h	Sun Jun 16 18:54:01 2002
@@ -76,6 +76,7 @@
 extern int nr_threads;
 extern int last_pid;
 extern unsigned long nr_running(void);
+extern unsigned long nr_uninterruptible(void);
 
 #include <linux/fs.h>
 #include <linux/time.h>
@@ -610,6 +611,7 @@
 #define CURRENT_TIME (xtime.tv_sec)
 
 extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
+extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
 extern void FASTCALL(sleep_on(wait_queue_head_t *q));
 extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
 				      signed long timeout));
@@ -626,6 +628,12 @@
 #define wake_up_interruptible(x)	__wake_up((x),TASK_INTERRUPTIBLE, 1)
 #define wake_up_interruptible_nr(x, nr)	__wake_up((x),TASK_INTERRUPTIBLE, nr)
 #define wake_up_interruptible_all(x)	__wake_up((x),TASK_INTERRUPTIBLE, 0)
+#ifdef CONFIG_SMP
+#define wake_up_interruptible_sync(x)   __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
+#else
+#define wake_up_interruptible_sync(x)   __wake_up((x),TASK_INTERRUPTIBLE, 1)
+#endif
+
 asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
 
 extern int in_group_p(gid_t);
--- linux/include/asm-i386/system.h.orig	Sun Jun 16 17:48:41 2002
+++ linux/include/asm-i386/system.h	Sun Jun 16 18:53:47 2002
@@ -12,25 +12,22 @@
 struct task_struct;	/* one of the stranger aspects of C forward declarations.. */
 extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
 
-#define prepare_to_switch()	do { } while(0)
 #define switch_to(prev,next,last) do {					\
 	asm volatile("pushl %%esi\n\t"					\
 		     "pushl %%edi\n\t"					\
 		     "pushl %%ebp\n\t"					\
 		     "movl %%esp,%0\n\t"	/* save ESP */		\
-		     "movl %3,%%esp\n\t"	/* restore ESP */	\
+		     "movl %2,%%esp\n\t"	/* restore ESP */	\
 		     "movl $1f,%1\n\t"		/* save EIP */		\
-		     "pushl %4\n\t"		/* restore EIP */	\
+		     "pushl %3\n\t"		/* restore EIP */	\
 		     "jmp __switch_to\n"				\
 		     "1:\t"						\
 		     "popl %%ebp\n\t"					\
 		     "popl %%edi\n\t"					\
 		     "popl %%esi\n\t"					\
-		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
-		      "=b" (last)					\
+		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip)	\
 		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
-		      "a" (prev), "d" (next),				\
-		      "b" (prev));					\
+		      "a" (prev), "d" (next));				\
 } while (0)
 
 #define _set_base(addr,base) do { unsigned long __pr; \
--- linux/include/asm-sparc64/system.h.orig	Sun Jun 16 18:19:03 2002
+++ linux/include/asm-sparc64/system.h	Sun Jun 16 18:19:44 2002
@@ -149,7 +149,11 @@
 
 #define flush_user_windows flushw_user
 #define flush_register_windows flushw_all
-#define prepare_to_switch flushw_all
+
+#define prepare_arch_schedule(prev)		task_lock(prev)
+#define finish_arch_schedule(prev)		task_unlock(prev)
+#define prepare_arch_switch(rq)			do { spin_unlock(&(rq)->lock); flushw_all(); }
+#define finish_arch_switch(rq)			__sti()
 
 #ifndef CONFIG_DEBUG_SPINLOCK
 #define CHECK_LOCKS(PREV)	do { } while(0)
--- linux/arch/i386/kernel/entry.S.orig	Sun Jun 16 18:14:33 2002
+++ linux/arch/i386/kernel/entry.S	Sun Jun 16 18:15:05 2002
@@ -639,8 +639,8 @@
  	.long SYMBOL_NAME(sys_tkill)
 	.long SYMBOL_NAME(sys_ni_syscall)	/* reserved for sendfile64 */
 	.long SYMBOL_NAME(sys_ni_syscall)	/* 240 reserved for futex */
-	.long SYMBOL_NAME(sys_ni_syscall)	/* reserved for sched_setaffinity */
-	.long SYMBOL_NAME(sys_ni_syscall)	/* reserved for sched_getaffinity */
+	.long SYMBOL_NAME(sys_sched_setaffinity)
+	.long SYMBOL_NAME(sys_sched_getaffinity)
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long SYMBOL_NAME(sys_ni_syscall)

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-16 15:19     ` Ingo Molnar
  2002-06-16 17:00       ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
@ 2002-06-16 23:45       ` Robert Love
  2002-06-17  5:28         ` David S. Miller
  1 sibling, 1 reply; 36+ messages in thread
From: Robert Love @ 2002-06-16 23:45 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David S. Miller, alan, linux-kernel

On Sun, 2002-06-16 at 08:19, Ingo Molnar wrote:

> Linus applied them already, they will be in 2.5.22. They fix real bugs and
> i've seen no problems on my testboxes. Those bits are a must for SMP x86
> and Sparc64 as well, there is absolutely no reason to selectively delay
> their backmerge. Besides the last task_rq_lock() optimization which got
> undone in 2.5 already, all the recent scheduler bits i posted are needed.

I know they are fine (I looked over them) and I saw Linus took them, but
2.5.22 is not yet out and I did not see any reason to rush new bits
to Alan for 2.4 when we could wait a bit and make sure 2.5 proves them
fine...

My approach thus far with 2.5 -> 2.4 O(1) backports has been one of
caution and it has worked fine thus far.  I figure, what is the rush?

	Robert Love


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-16 17:00       ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
@ 2002-06-16 23:57         ` Robert Love
  2002-06-17  0:13           ` J.A. Magallon
                             ` (3 more replies)
  0 siblings, 4 replies; 36+ messages in thread
From: Robert Love @ 2002-06-16 23:57 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Alan Cox, David S. Miller, linux-kernel

On Sun, 2002-06-16 at 10:00, Ingo Molnar wrote:

> Feature backports:
> 
>  - nr_uninterruptible optimization. (This is a fairly straightforward
>    and risk-less feature, and since it also made the backport easier, i
>    included it.)

Yah, I agree - this is safe and good.

>  - sched_setaffinity() & sched_getaffinity() syscalls on x86.

Do we want to introduce this into 2.4 now?  I realize 2.4-ac is not 2.4
proper, but if there is a chance this interface could change...

> -	BUG_ON(in_interrupt());
> -
> +	if (unlikely(in_interrupt()))
> +		BUG();

Eh, why do this?  BUG_ON has the same effect and it is more readable to
me... seems better that 2.5 gets 2.4-ac's behavior instead of the other
way around.

> +int idle_cpu(int cpu)
> +{
> +	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
> +}
> +

I did not include this in my original O(1) backport update because
nothing in 2.4-ac seems to use it... so why include it?

>  	/*
>  	 * Valid priorities for SCHED_FIFO and SCHED_RR are
> -	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
> +	 * 1..MAX_USER_RT_PRIO, valid priority for SCHED_OTHER is 0.
>  	 */

Another case of 2.4-ac being right: the priority range is
1..MAX_USER_RT_PRIO-1 (i.e. 1 to 99, inclusive).

>  	/*
> -	 * The first migration thread is started on CPU #0. This one can
> -	 * migrate the other migration threads to their destination CPUs.
> +	 * The first migration thread is started on CPU #0. This one can migrate
> +	 * the other migration threads to their destination CPUs.
>  	 */
>  	if (cpu != 0) {
>  		while (!cpu_rq(cpu_logical_map(0))->migration_thread)
>  			yield();
>  		set_cpus_allowed(current, 1UL << cpu);
>  	}
> -	printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id());
> +	printk("migration_task %d on cpu=%d\n",cpu,smp_processor_id());
>  	ret = setscheduler(0, SCHED_FIFO, &param);
>  	rq = this_rq();
> @@ -1632,5 +1813,4 @@
>  		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
>  			schedule_timeout(2);
>  }
> -
> -#endif /* CONFIG_SMP */
> +#endif

I think all three of these hunks look better in 2.4-ac... in all three
cases, the formatting seems better than in 2.5 IMO.

	Robert Love


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-16 23:57         ` Robert Love
@ 2002-06-17  0:13           ` J.A. Magallon
  2002-06-17  4:28             ` Ingo Molnar
  2002-06-17  0:15           ` Robert Love
                             ` (2 subsequent siblings)
  3 siblings, 1 reply; 36+ messages in thread
From: J.A. Magallon @ 2002-06-17  0:13 UTC (permalink / raw)
  To: Robert Love; +Cc: Ingo Molnar, Alan Cox, David S. Miller, linux-kernel


On 2002.06.17 Robert Love wrote:
>On Sun, 2002-06-16 at 10:00, Ingo Molnar wrote:
>
>> +int idle_cpu(int cpu)
>> +{
>> +	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
>> +}
>> +
>
>I did not include this in my original O(1) backport update because
>nothing in 2.4-ac seems to use it... so why include it?
>

Well, you asked...

- the irqbalance patch for p4 needs idle_cpu (and I'm not sure about idle_task).
  BTW, they were macros before...
- the bproc patch needs task_nice (you can be less interested in this, but
  it does not hurt...)

So could I ask you, please
- to make public idle_[cpu,task], as macros or exported functions, here it
  does not matter, irqbalance is not a module. Perhaps some other piece of code
  could need them.
- to export all the set/get prio/nice interfaces

???

Thanks.

-- 
J.A. Magallon             \   Software is like sex: It's better when it's free
mailto:jamagallon@able.es  \                    -- Linus Torvalds, FSF T-shirt
Linux werewolf 2.4.19-pre10-jam3, Mandrake Linux 8.3 (Cooker) for i586
gcc (GCC) 3.1.1 (Mandrake Linux 8.3 3.1.1-0.4mdk)

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-16 23:57         ` Robert Love
  2002-06-17  0:13           ` J.A. Magallon
@ 2002-06-17  0:15           ` Robert Love
  2002-06-17  3:49             ` Ingo Molnar
  2002-06-17  3:24           ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
  2002-06-17  4:51           ` Toshiba PCToPIC97 PC Card freeze in 2.4.18 Stephen Satchell
  3 siblings, 1 reply; 36+ messages in thread
From: Robert Love @ 2002-06-17  0:15 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Alan Cox, David S. Miller, linux-kernel, torvalds

On Sun, 2002-06-16 at 16:57, Robert Love wrote:

> Another case of 2.4-ac being right

Attached patch brings over the sane bits from 2.4-ac: i.e. if Linus
merges this and Alan merges your patch minus my complaints, the two
trees will be in sync...

	Robert Love

diff -urN linux-2.5.21/kernel/sched.c linux/kernel/sched.c
--- linux-2.5.21/kernel/sched.c	Sat Jun  8 22:28:13 2002
+++ linux/kernel/sched.c	Sun Jun 16 17:14:31 2002
@@ -762,8 +762,8 @@
 	list_t *queue;
 	int idx;
 
-	if (unlikely(in_interrupt()))
-		BUG();
+	BUG_ON(in_interrupt());
+
 #if CONFIG_DEBUG_HIGHMEM
 	check_highmem_ptes();
 #endif
@@ -1147,7 +1147,7 @@
 
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
-	 * 1..MAX_USER_RT_PRIO, valid priority for SCHED_OTHER is 0.
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
 	 */
 	retval = -EINVAL;
 	if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
@@ -1710,15 +1710,15 @@
 	sigfillset(&current->blocked);
 	set_fs(KERNEL_DS);
 	/*
-	 * The first migration thread is started on CPU #0. This one can migrate
-	 * the other migration threads to their destination CPUs.
+	 * The first migration thread is started on CPU #0. This one can
+	 * migrate the other migration threads to their destination CPUs.
 	 */
 	if (cpu != 0) {
 		while (!cpu_rq(cpu_logical_map(0))->migration_thread)
 			yield();
 		set_cpus_allowed(current, 1UL << cpu);
 	}
-	printk("migration_task %d on cpu=%d\n",cpu,smp_processor_id());
+	printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id());
 	ret = setscheduler(0, SCHED_FIFO, &param);
 
 	rq = this_rq();
@@ -1790,4 +1790,4 @@
 		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
 			schedule_timeout(2);
 }
-#endif
+#endif /* CONFIG_SMP */


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-16 23:57         ` Robert Love
  2002-06-17  0:13           ` J.A. Magallon
  2002-06-17  0:15           ` Robert Love
@ 2002-06-17  3:24           ` Ingo Molnar
  2002-06-17  3:35             ` Robert Love
                               ` (2 more replies)
  2002-06-17  4:51           ` Toshiba PCToPIC97 PC Card freeze in 2.4.18 Stephen Satchell
  3 siblings, 3 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  3:24 UTC (permalink / raw)
  To: Robert Love; +Cc: Alan Cox, David S. Miller, linux-kernel


On 16 Jun 2002, Robert Love wrote:

> > +int idle_cpu(int cpu)
> > +{
> > +	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
> > +}
> > +
> 
> I did not include this in my original O(1) backport update because
> nothing in 2.4-ac seems to use it... so why include it?

i have planned to submit the irqbalance patch for 2.4-ac real soon, which
needs this function - current IRQ distribution on P4 SMP boxes is a
showstopper.

> >  - sched_setaffinity() & sched_getaffinity() syscalls on x86.
>
> Do we want to introduce this into 2.4 now?  I realize 2.4-ac is not 2.4
> proper, but if there is a chance this interface could change...

the setaffinity()/getaffinity() interface looks pretty robust, i dont
expect any changes - there's just so many ways to set an affinity mask for
an opaque set of CPUs. And being able to set affinities is something that
was frequently asked for by application developers.

> > -	BUG_ON(in_interrupt());
> > -
> > +	if (unlikely(in_interrupt()))
> > +		BUG();
> 
> Eh, why do this?  BUG_ON is the same effect and it is more readable to
> me... seems better that 2.5 gets 2.4-ac's behavior instead of the other
> way around.

IMO BUG_ON() is just an ugly way of doing an assert(), i dont like code
with magic conditionals embedded within. But, the main reason was that
2.5-mainline has the code so that's being used.

> >  	/*
> >  	 * Valid priorities for SCHED_FIFO and SCHED_RR are
> > -	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
> > +	 * 1..MAX_USER_RT_PRIO, valid priority for SCHED_OTHER is 0.
> >  	 */
> 
> Another case of 2.4-ac being right: the priority range is
> 1..MAX_USER_RT_PRIO-1 (i.e. 1 to 99, inclusive).

like above, 2.5 is the reference base. Especially for 100% nonfunctional
things like this it makes no sense to apply them to 2.4-ac only. But i
agree that existing comment fixes should be forward ported into 2.5, i've
applied them to my tree.

	Ingo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  3:24           ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
@ 2002-06-17  3:35             ` Robert Love
  2002-06-17  4:01               ` Ingo Molnar
  2002-06-17  7:50             ` Zwane Mwaikambo
  2002-06-17 16:26             ` Rusty Russell
  2 siblings, 1 reply; 36+ messages in thread
From: Robert Love @ 2002-06-17  3:35 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Alan Cox, David S. Miller, linux-kernel

On Sun, 2002-06-16 at 20:24, Ingo Molnar wrote:

> On 16 Jun 2002, Robert Love wrote:
> 
> > > +int idle_cpu(int cpu)
> > > +{
> > > +	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
> > > +}
> > > +
> > 
> > I did not include this in my original O(1) backport update because
> > nothing in 2.4-ac seems to use it... so why include it?
> 
> i have planned to submit the irqbalance patch for 2.4-ac real soon, which
> needs this function - current IRQ distribution on P4 SMP boxes is a
> showstopper.

Fair enough.

> > >  - sched_setaffinity() & sched_getaffinity() syscalls on x86.
> >
> > Do we want to introduce this into 2.4 now?  I realize 2.4-ac is not 2.4
> > proper, but if there is a chance this interface could change...
> 
> the setaffinity()/getaffinity() interface looks pretty robust, i dont
> expect any changes - there's just so many ways to set an affinity mask for
> an opaque set of CPUs. And being able to set affinities is something that
> was frequently asked for by application developers.

I agree it seems robust and there have been no complaints, although
there could always be changes to the interface.  Personally I'd like the
interfaces in 2.4/2.4-ac sooner rather than later too - I just want to
make sure we do not "etch it in stone" prematurely.

> IMO BUG_ON() is just an ugly way of doing an assert(), i dont like code
> with magic conditionals embedded within. But, the main reason was that
> 2.5-mainline has the code so that's being used.

Heh I like BUG_ON :-)

> like above, 2.5 is the reference base. Especially for 100% nonfunctional
> things like this it makes no sense to apply them to 2.4-ac only. But i
> agree that existing comment fixes should be forward ported into 2.5, i've
> applied them to my tree.

I agree the changes are nonfunctional and thus not a big deal...but I
didn't see a point in pushing erroneous changes onto 2.4-ac, whether
they are in 2.5 or not.

Although now it is all a moot point - Linus merged the patch I posted
earlier with the 2.4-ac bits against 2.5... so now a diff of 2.4-ac and
2.5 will be proper. ;-)

	Robert Love


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  0:15           ` Robert Love
@ 2002-06-17  3:49             ` Ingo Molnar
  2002-06-17  3:57               ` Robert Love
  2002-06-17  4:02               ` Robert Love
  0 siblings, 2 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  3:49 UTC (permalink / raw)
  To: Robert Love; +Cc: Alan Cox, David S. Miller, linux-kernel, Linus Torvalds


i agree with the comment fixes, except these items:

> -	if (unlikely(in_interrupt()))
> -		BUG();
> +	BUG_ON(in_interrupt());
> +

see the previous mail.

> @@ -1790,4 +1790,4 @@
>  		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
>  			schedule_timeout(2);
>  }
> -#endif
> +#endif /* CONFIG_SMP */

and this is just silly... I can see the point in doing #if comments in
include files, but the nesting here is just so obvious.

the rest looks fine. (patch of my current 2.5 scheduler tree attached,
against 2.5.22, with some more other nonfunctional bits added as well.)

	Ingo

--- linux/kernel/sched.c.orig	Mon Jun 17 05:43:53 2002
+++ linux/kernel/sched.c	Mon Jun 17 05:42:03 2002
@@ -6,14 +6,14 @@
  *  Copyright (C) 1991-2002  Linus Torvalds
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
- *              make semaphores SMP safe
+ *		make semaphores SMP safe
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
- *  		hybrid priority-list and round-robin design with
- *  		an array-switch method of distributing timeslices
- *  		and per-CPU runqueues.  Additional code by Davide
- *  		Libenzi, Robert Love, and Rusty Russell.
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Additional code by Davide
+ *		Libenzi, Robert Love, and Rusty Russell.
  */
 
 #include <linux/mm.h>
@@ -797,7 +797,8 @@
 	list_t *queue;
 	int idx;
 
-	BUG_ON(in_interrupt());
+	if (in_interrupt())
+		BUG();
 
 #if CONFIG_DEBUG_HIGHMEM
 	check_highmem_ptes();
@@ -1392,25 +1393,35 @@
 
 asmlinkage long sys_sched_yield(void)
 {
-	runqueue_t *rq;
-	prio_array_t *array;
-
-	rq = rq_lock(rq);
+	runqueue_t *rq = rq_lock(rq);
+	prio_array_t *array = current->array;
 
 	/*
-	 * Decrease the yielding task's priority by one, to avoid
-	 * livelocks. This priority loss is temporary, it's recovered
-	 * once the current timeslice expires.
+	 * There are three levels of how a yielding task will give up
+	 * the current CPU:
 	 *
-	 * If priority is already MAX_PRIO-1 then we still
-	 * roundrobin the task within the runlist.
-	 */
-	array = current->array;
-	/*
-	 * If the task has reached maximum priority (or is a RT task)
-	 * then just requeue the task to the end of the runqueue:
+	 *  #1 - it decreases its priority by one. This priority loss is
+	 *       temporary, it's recovered once the current timeslice
+	 *       expires.
+	 *
+	 *  #2 - once it has reached the lowest priority level,
+	 *       it will give up timeslices one by one. (We do not
+	 *       want to give them up all at once, it's gradual,
+	 *       to protect the casual yield()er.)
+	 *
+	 *  #3 - once all timeslices are gone we put the process into
+	 *       the expired array.
+	 *
+	 *  (special rule: RT tasks do not lose any priority, they just
+	 *  roundrobin on their current priority level.)
 	 */
-	if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
+	if (likely(current->prio == MAX_PRIO-1)) {
+		if (current->time_slice <= 1) {
+			dequeue_task(current, rq->active);
+			enqueue_task(current, rq->expired);
+		} else
+			current->time_slice--;
+	} else if (unlikely(rt_task(current))) {
 		list_del(&current->run_list);
 		list_add_tail(&current->run_list, array->queue + current->prio);
 	} else {
@@ -1836,15 +1847,14 @@
 	int cpu;
 
 	current->cpus_allowed = 1UL << cpu_logical_map(0);
-	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+	for (cpu = 0; cpu < smp_num_cpus; cpu++)
 		if (kernel_thread(migration_thread, (void *) (long) cpu,
 				CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
 			BUG();
-	}
 	current->cpus_allowed = -1L;
 
 	for (cpu = 0; cpu < smp_num_cpus; cpu++)
 		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
 			schedule_timeout(2);
 }
-#endif /* CONFIG_SMP */
+#endif


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  3:49             ` Ingo Molnar
@ 2002-06-17  3:57               ` Robert Love
  2002-06-17  4:07                 ` Ingo Molnar
  2002-06-17  4:02               ` Robert Love
  1 sibling, 1 reply; 36+ messages in thread
From: Robert Love @ 2002-06-17  3:57 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Alan Cox, David S. Miller, linux-kernel, Linus Torvalds

On Sun, 2002-06-16 at 20:49, Ingo Molnar wrote:

> i agree with the comment fixes, except these items:
> 
> > -	if (unlikely(in_interrupt()))
> > -		BUG();
> > +	BUG_ON(in_interrupt());
> > +
> 
> see the previous mail.

Shrug.  Preference I guess... though this is _the_ case for BUG_ON.

> > @@ -1790,4 +1790,4 @@
> >  		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
> >  			schedule_timeout(2);
> >  }
> > -#endif
> > +#endif /* CONFIG_SMP */
> 
> and this is just silly... I can see the point in doing #if comments in
> include files, but the nesting here is just so obvious.

I disagree, but OK.  I like having the #if marked by the #endif if they
are not close... and elsewhere through the kernel mirrors this.  While I
can scroll up and look - assuming the nesting is sane - a simple comment
makes that clear so what is the pain?

> the rest looks fine. (patch of my current 2.5 scheduler tree attached,
> against 2.5.22, with some more other nonfunctional bits added as well.)

Rest looks fine.

Then again, this is all invariants and comments so it's really not a big
deal at all.  I guess better this than we are fighting over real code,
eh? ;-)

	Robert Love


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  3:35             ` Robert Love
@ 2002-06-17  4:01               ` Ingo Molnar
  0 siblings, 0 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  4:01 UTC (permalink / raw)
  To: Robert Love; +Cc: Alan Cox, David S. Miller, linux-kernel


On 16 Jun 2002, Robert Love wrote:

> > like above, 2.5 is the reference base. Especially for 100% nonfunctional
> > things like this it makes no sense to apply them to 2.4-ac only. But i
> > agree that existing comment fixes should be forward ported into 2.5, i've
> > applied them to my tree.
> 
> I agree the changes are nonfunctional and thus not a big deal...but I
> didn't see a point in pushing erroneous changes onto 2.4-ac, whether
> they are in 2.5 or not.

My method is that the less differences in a merge, the better. I dont mind
if a few comment fixes are lost temporarily, they'll be noticed and
forward ported the minute they get zapped by the backport. (and i have
reviewed -ac for ac-only functional fixes, none existed.) This way the
actual code creation part of the backport was a few minutes work only -
the real work mostly involved reviewing the functional parts of the
changes.

	Ingo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  3:49             ` Ingo Molnar
  2002-06-17  3:57               ` Robert Love
@ 2002-06-17  4:02               ` Robert Love
  2002-06-17  4:26                 ` Ingo Molnar
  2002-06-17  4:49                 ` [patch] 2.5.22 current scheduler bits #1 Ingo Molnar
  1 sibling, 2 replies; 36+ messages in thread
From: Robert Love @ 2002-06-17  4:02 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Alan Cox, David S. Miller, linux-kernel, Linus Torvalds

On Sun, 2002-06-16 at 20:49, Ingo Molnar wrote:
smlinkage long sys_sched_yield(void)
>  {
> -	runqueue_t *rq;
> -	prio_array_t *array;
> -
> -	rq = rq_lock(rq);
> +	runqueue_t *rq = rq_lock(rq);
> +	prio_array_t *array = current->array;

Question.  I have always wondered what the C rules are here... is
rq_lock guaranteed to be evaluated before current->array?  I.e., is the
above synonymous with:

	runqueue_t *rq;
	prio_array_t *array;
	rq = rq_lock(rq);
	array = current->array;

...guaranteed?

	Robert Love


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  3:57               ` Robert Love
@ 2002-06-17  4:07                 ` Ingo Molnar
  0 siblings, 0 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  4:07 UTC (permalink / raw)
  To: Robert Love; +Cc: Alan Cox, David S. Miller, linux-kernel, Linus Torvalds


On 16 Jun 2002, Robert Love wrote:

> > > @@ -1790,4 +1790,4 @@
> > >  		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
> > >  			schedule_timeout(2);
> > >  }
> > > -#endif
> > > +#endif /* CONFIG_SMP */
> > 
> > and this is just silly... I can see the point in doing #if comments in
> > include files, but the nesting here is just so obvious.
> 
> I disagree, but OK.  I like having the #if marked by the #endif if they
> are not close... and elsewhere through the kernel mirrors this.  While I
> can scroll up and look - assuming the nesting is sane - a simple comment
> makes that clear so what is the pain?

and in this specific sched.c case, are we going to put in magic comments
every 25 lines inbetween:

/* this is CONFIG_SMP conditional code */

just to save us some scrolling up? I dont think #endif is special wrt.  
such comments.

in header files the #ifdef jungle often makes proper nesting hard. In
those cases putting comments to #else and #endif makes a real difference
in readability. But in sched.c there is not a single nested #ifdef. (and
that's very much intentional.)

	Ingo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  4:02               ` Robert Love
@ 2002-06-17  4:26                 ` Ingo Molnar
  2002-06-17  4:49                 ` [patch] 2.5.22 current scheduler bits #1 Ingo Molnar
  1 sibling, 0 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  4:26 UTC (permalink / raw)
  To: Robert Love; +Cc: Alan Cox, David S. Miller, linux-kernel, Linus Torvalds


On 16 Jun 2002, Robert Love wrote:

> Question.  I have always wondered what the C rules are here... is
> rq_lock guaranteed to be evaluated before current->array?  I.e., is the
> above synonymous with:
> 
> 	runqueue_t *rq;
> 	prio_array_t *array;
> 	rq = rq_lock(rq);
> 	array = current->array;
> 
> ...guaranteed?

yes. We rely on this in the kernel quite frequently. Btw., i did a few
more coding style cleanups to the scheduler, you can see more such
examples in my patch.

	Ingo

--- linux/kernel/sched.c.orig	Mon Jun 17 05:43:53 2002
+++ linux/kernel/sched.c	Mon Jun 17 06:21:32 2002
@@ -6,14 +6,14 @@
  *  Copyright (C) 1991-2002  Linus Torvalds
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
- *              make semaphores SMP safe
+ *		make semaphores SMP safe
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
- *  		hybrid priority-list and round-robin design with
- *  		an array-switch method of distributing timeslices
- *  		and per-CPU runqueues.  Additional code by Davide
- *  		Libenzi, Robert Love, and Rusty Russell.
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Additional code by Davide
+ *		Libenzi, Robert Love, and Rusty Russell.
  */
 
 #include <linux/mm.h>
@@ -180,11 +180,14 @@
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
-static inline runqueue_t *rq_lock(runqueue_t *rq)
+static inline runqueue_t *this_rq_lock(void)
 {
+	runqueue_t *rq;
+
 	local_irq_disable();
 	rq = this_rq();
 	spin_lock(&rq->lock);
+
 	return rq;
 }
 
@@ -388,9 +391,7 @@
 
 void wake_up_forked_process(task_t * p)
 {
-	runqueue_t *rq;
-
-	rq = rq_lock(rq);
+	runqueue_t *rq = this_rq_lock();
 
 	p->state = TASK_RUNNING;
 	if (!rt_task(p)) {
@@ -797,7 +798,8 @@
 	list_t *queue;
 	int idx;
 
-	BUG_ON(in_interrupt());
+	if (in_interrupt())
+		BUG();
 
 #if CONFIG_DEBUG_HIGHMEM
 	check_highmem_ptes();
@@ -1158,13 +1160,12 @@
 static int setscheduler(pid_t pid, int policy, struct sched_param *param)
 {
 	struct sched_param lp;
+	int retval = -EINVAL;
 	prio_array_t *array;
 	unsigned long flags;
 	runqueue_t *rq;
-	int retval;
 	task_t *p;
 
-	retval = -EINVAL;
 	if (!param || pid < 0)
 		goto out_nounlock;
 
@@ -1251,10 +1252,9 @@
 
 asmlinkage long sys_sched_getscheduler(pid_t pid)
 {
+	int retval = -EINVAL;
 	task_t *p;
-	int retval;
 
-	retval = -EINVAL;
 	if (pid < 0)
 		goto out_nounlock;
 
@@ -1271,11 +1271,10 @@
 
 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
 {
-	task_t *p;
 	struct sched_param lp;
-	int retval;
+	int retval = -EINVAL;
+	task_t *p;
 
-	retval = -EINVAL;
 	if (!param || pid < 0)
 		goto out_nounlock;
 
@@ -1310,8 +1309,8 @@
 				      unsigned long *user_mask_ptr)
 {
 	unsigned long new_mask;
-	task_t *p;
 	int retval;
+	task_t *p;
 
 	if (len < sizeof(new_mask))
 		return -EINVAL;
@@ -1361,13 +1360,12 @@
 asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len,
 				      unsigned long *user_mask_ptr)
 {
-	unsigned long mask;
 	unsigned int real_len;
-	task_t *p;
+	unsigned long mask;
 	int retval;
+	task_t *p;
 
 	real_len = sizeof(mask);
-
 	if (len < real_len)
 		return -EINVAL;
 
@@ -1392,25 +1390,35 @@
 
 asmlinkage long sys_sched_yield(void)
 {
-	runqueue_t *rq;
-	prio_array_t *array;
-
-	rq = rq_lock(rq);
+	runqueue_t *rq = this_rq_lock();
+	prio_array_t *array = current->array;
 
 	/*
-	 * Decrease the yielding task's priority by one, to avoid
-	 * livelocks. This priority loss is temporary, it's recovered
-	 * once the current timeslice expires.
+	 * There are three levels of how a yielding task will give up
+	 * the current CPU:
 	 *
-	 * If priority is already MAX_PRIO-1 then we still
-	 * roundrobin the task within the runlist.
-	 */
-	array = current->array;
-	/*
-	 * If the task has reached maximum priority (or is a RT task)
-	 * then just requeue the task to the end of the runqueue:
+	 *  #1 - it decreases its priority by one. This priority loss is
+	 *       temporary, it's recovered once the current timeslice
+	 *       expires.
+	 *
+	 *  #2 - once it has reached the lowest priority level,
+	 *       it will give up timeslices one by one. (We do not
+	 *       want to give them up all at once, it's gradual,
+	 *       to protect the casual yield()er.)
+	 *
+	 *  #3 - once all timeslices are gone we put the process into
+	 *       the expired array.
+	 *
+	 *  (special rule: RT tasks do not lose any priority, they just
+	 *  roundrobin on their current priority level.)
 	 */
-	if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
+	if (likely(current->prio == MAX_PRIO-1)) {
+		if (current->time_slice <= 1) {
+			dequeue_task(current, rq->active);
+			enqueue_task(current, rq->expired);
+		} else
+			current->time_slice--;
+	} else if (unlikely(rt_task(current))) {
 		list_del(&current->run_list);
 		list_add_tail(&current->run_list, array->queue + current->prio);
 	} else {
@@ -1461,9 +1469,9 @@
 
 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
 {
+	int retval = -EINVAL;
 	struct timespec t;
 	task_t *p;
-	int retval = -EINVAL;
 
 	if (pid < 0)
 		goto out_nounlock;
@@ -1758,8 +1766,8 @@
 
 static int migration_thread(void * bind_cpu)
 {
-	int cpu = cpu_logical_map((int) (long) bind_cpu);
 	struct sched_param param = { sched_priority: MAX_RT_PRIO-1 };
+	int cpu = cpu_logical_map((int) (long) bind_cpu);
 	runqueue_t *rq;
 	int ret;
 
@@ -1836,15 +1844,14 @@
 	int cpu;
 
 	current->cpus_allowed = 1UL << cpu_logical_map(0);
-	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+	for (cpu = 0; cpu < smp_num_cpus; cpu++)
 		if (kernel_thread(migration_thread, (void *) (long) cpu,
 				CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
 			BUG();
-	}
 	current->cpus_allowed = -1L;
 
 	for (cpu = 0; cpu < smp_num_cpus; cpu++)
 		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
 			schedule_timeout(2);
 }
-#endif /* CONFIG_SMP */
+#endif


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  0:13           ` J.A. Magallon
@ 2002-06-17  4:28             ` Ingo Molnar
  0 siblings, 0 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  4:28 UTC (permalink / raw)
  To: J.A. Magallon; +Cc: Robert Love, Alan Cox, David S. Miller, linux-kernel


On Mon, 17 Jun 2002, J.A. Magallon wrote:

> - the irqbalance patch for p4 needs idle_cpu (and not sure about idle_task).
>   BTW, they were macros before...
> - the bproc patch needs task_nice (you can be less interested in this, but
>   it does not hurt...)
> 
> So could I ask you, please
> - to make public idle_[cpu,task], as macros or exported functions, here it
>   does not matter, irqbalance is not a module. Perhaps some other piece of code
>   could need them.
> - to export all the set/get prio/nice interfaces
> 
> ???

sure.

	Ingo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [patch] 2.5.22 current scheduler bits #1.
  2002-06-17  4:02               ` Robert Love
  2002-06-17  4:26                 ` Ingo Molnar
@ 2002-06-17  4:49                 ` Ingo Molnar
  1 sibling, 0 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  4:49 UTC (permalink / raw)
  To: linux-kernel
  Cc: Alan Cox, David S. Miller, Robert Love, linux-kernel,
	Linus Torvalds


one more fix: do not forced-migrate tasks in wakeup if it violates the
affinity mask. My current scheduler tree is attached, against 2.5.22.

	Ingo

--- linux/kernel/sched.c.orig	Mon Jun 17 05:43:53 2002
+++ linux/kernel/sched.c	Mon Jun 17 06:46:01 2002
@@ -6,14 +6,14 @@
  *  Copyright (C) 1991-2002  Linus Torvalds
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
- *              make semaphores SMP safe
+ *		make semaphores SMP safe
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
- *  		hybrid priority-list and round-robin design with
- *  		an array-switch method of distributing timeslices
- *  		and per-CPU runqueues.  Additional code by Davide
- *  		Libenzi, Robert Love, and Rusty Russell.
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Additional code by Davide
+ *		Libenzi, Robert Love, and Rusty Russell.
  */
 
 #include <linux/mm.h>
@@ -180,11 +180,14 @@
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
-static inline runqueue_t *rq_lock(runqueue_t *rq)
+static inline runqueue_t *this_rq_lock(void)
 {
+	runqueue_t *rq;
+
 	local_irq_disable();
 	rq = this_rq();
 	spin_lock(&rq->lock);
+
 	return rq;
 }
 
@@ -358,12 +361,17 @@
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
 	if (!p->array) {
-		if (unlikely(sync && (rq->curr != p))) {
-			if (p->thread_info->cpu != smp_processor_id()) {
-				p->thread_info->cpu = smp_processor_id();
-				task_rq_unlock(rq, &flags);
-				goto repeat_lock_task;
-			}
+		/*
+		 * Fast-migrate the task if it's not running or runnable
+		 * currently. Do not violate hard affinity.
+		 */
+		if (unlikely(sync && (rq->curr != p) &&
+			(p->thread_info->cpu != smp_processor_id()) &&
+			(p->cpus_allowed & (1UL << smp_processor_id())))) {
+
+			p->thread_info->cpu = smp_processor_id();
+			task_rq_unlock(rq, &flags);
+			goto repeat_lock_task;
 		}
 		if (old_state == TASK_UNINTERRUPTIBLE)
 			rq->nr_uninterruptible--;
@@ -388,9 +396,7 @@
 
 void wake_up_forked_process(task_t * p)
 {
-	runqueue_t *rq;
-
-	rq = rq_lock(rq);
+	runqueue_t *rq = this_rq_lock();
 
 	p->state = TASK_RUNNING;
 	if (!rt_task(p)) {
@@ -797,7 +803,8 @@
 	list_t *queue;
 	int idx;
 
-	BUG_ON(in_interrupt());
+	if (unlikely(in_interrupt()))
+		BUG();
 
 #if CONFIG_DEBUG_HIGHMEM
 	check_highmem_ptes();
@@ -1158,13 +1165,12 @@
 static int setscheduler(pid_t pid, int policy, struct sched_param *param)
 {
 	struct sched_param lp;
+	int retval = -EINVAL;
 	prio_array_t *array;
 	unsigned long flags;
 	runqueue_t *rq;
-	int retval;
 	task_t *p;
 
-	retval = -EINVAL;
 	if (!param || pid < 0)
 		goto out_nounlock;
 
@@ -1251,10 +1257,9 @@
 
 asmlinkage long sys_sched_getscheduler(pid_t pid)
 {
+	int retval = -EINVAL;
 	task_t *p;
-	int retval;
 
-	retval = -EINVAL;
 	if (pid < 0)
 		goto out_nounlock;
 
@@ -1271,11 +1276,10 @@
 
 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
 {
-	task_t *p;
 	struct sched_param lp;
-	int retval;
+	int retval = -EINVAL;
+	task_t *p;
 
-	retval = -EINVAL;
 	if (!param || pid < 0)
 		goto out_nounlock;
 
@@ -1310,8 +1314,8 @@
 				      unsigned long *user_mask_ptr)
 {
 	unsigned long new_mask;
-	task_t *p;
 	int retval;
+	task_t *p;
 
 	if (len < sizeof(new_mask))
 		return -EINVAL;
@@ -1361,13 +1365,12 @@
 asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len,
 				      unsigned long *user_mask_ptr)
 {
-	unsigned long mask;
 	unsigned int real_len;
-	task_t *p;
+	unsigned long mask;
 	int retval;
+	task_t *p;
 
 	real_len = sizeof(mask);
-
 	if (len < real_len)
 		return -EINVAL;
 
@@ -1392,25 +1395,35 @@
 
 asmlinkage long sys_sched_yield(void)
 {
-	runqueue_t *rq;
-	prio_array_t *array;
-
-	rq = rq_lock(rq);
+	runqueue_t *rq = this_rq_lock();
+	prio_array_t *array = current->array;
 
 	/*
-	 * Decrease the yielding task's priority by one, to avoid
-	 * livelocks. This priority loss is temporary, it's recovered
-	 * once the current timeslice expires.
+	 * There are three levels of how a yielding task will give up
+	 * the current CPU:
 	 *
-	 * If priority is already MAX_PRIO-1 then we still
-	 * roundrobin the task within the runlist.
-	 */
-	array = current->array;
-	/*
-	 * If the task has reached maximum priority (or is a RT task)
-	 * then just requeue the task to the end of the runqueue:
+	 *  #1 - it decreases its priority by one. This priority loss is
+	 *       temporary, it's recovered once the current timeslice
+	 *       expires.
+	 *
+	 *  #2 - once it has reached the lowest priority level,
+	 *       it will give up timeslices one by one. (We do not
+	 *       want to give them up all at once, it's gradual,
+	 *       to protect the casual yield()er.)
+	 *
+	 *  #3 - once all timeslices are gone we put the process into
+	 *       the expired array.
+	 *
+	 *  (special rule: RT tasks do not lose any priority, they just
+	 *  roundrobin on their current priority level.)
 	 */
-	if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
+	if (likely(current->prio == MAX_PRIO-1)) {
+		if (current->time_slice <= 1) {
+			dequeue_task(current, rq->active);
+			enqueue_task(current, rq->expired);
+		} else
+			current->time_slice--;
+	} else if (unlikely(rt_task(current))) {
 		list_del(&current->run_list);
 		list_add_tail(&current->run_list, array->queue + current->prio);
 	} else {
@@ -1461,9 +1474,9 @@
 
 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
 {
+	int retval = -EINVAL;
 	struct timespec t;
 	task_t *p;
-	int retval = -EINVAL;
 
 	if (pid < 0)
 		goto out_nounlock;
@@ -1758,8 +1771,8 @@
 
 static int migration_thread(void * bind_cpu)
 {
-	int cpu = cpu_logical_map((int) (long) bind_cpu);
 	struct sched_param param = { sched_priority: MAX_RT_PRIO-1 };
+	int cpu = cpu_logical_map((int) (long) bind_cpu);
 	runqueue_t *rq;
 	int ret;
 
@@ -1836,15 +1849,14 @@
 	int cpu;
 
 	current->cpus_allowed = 1UL << cpu_logical_map(0);
-	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+	for (cpu = 0; cpu < smp_num_cpus; cpu++)
 		if (kernel_thread(migration_thread, (void *) (long) cpu,
 				CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
 			BUG();
-	}
 	current->cpus_allowed = -1L;
 
 	for (cpu = 0; cpu < smp_num_cpus; cpu++)
 		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
 			schedule_timeout(2);
 }
-#endif /* CONFIG_SMP */
+#endif


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Toshiba PCToPIC97 PC Card freeze in 2.4.18
  2002-06-16 23:57         ` Robert Love
                             ` (2 preceding siblings ...)
  2002-06-17  3:24           ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
@ 2002-06-17  4:51           ` Stephen Satchell
  3 siblings, 0 replies; 36+ messages in thread
From: Stephen Satchell @ 2002-06-17  4:51 UTC (permalink / raw)
  To: linux-kernel

All:

I'm at my wit's end.  I have a Toshiba Satellite 2545XCDT which has a PC 
Card adapter.  I have been happily running this laptop with a 2.2.16 kernel 
without problem.  Today, when trying to upgrade to a 20GB hard disk and a 
2.4.18 kernel, the box would freeze when trying to start the PCMCIA 
service.  Here is the message that I get on the screen:

PCI:  No IRQ known for interrupt pin A of device 00:13.0.  Please try using 
pci=biosirq
PCI:  No IRQ known for interrupt pin B of device 00:13.0.  Please try using 
pci=biosirq
Yenta IRQ list 06b8 PCI irq 0
Socket status: 30000007

and the system is completely frozen at that point -- even CTRL-ALT-DEL 
doesn't work.  (The soft power switch does, which tells me that NMI 
interrupts get through, but nothing else.)  As you might guess, SysRq 
didn't work, either.  Only powering off would allow me to restart the system.

When I recompile the kernel to not make PCMCIA a module, there is NO 
message, just the system freeze.

Nothing interesting shows up in syslog.

Probing the /proc filesystem, I find that under 2.2.16 there is a character 
device 254 labeled PCMCIA; in the 2.4.18 kernel I see no device 254 or any 
device with the label PCMCIA.  Granted, in the case of 2.2.16 the various 
modules successfully loaded, so they may have advertised device 254, 
whereas on the 2.4.18 kernel the failure kept the device from being advertised.

Dumping /proc/pci, I see device 19 (0x13) listed but completely different 
capabilities advertised.  Under 2.2.16, I see "Slow devsel.  Fast 
back-to-back capable.  Master Capable.  No bursts.  Min Gnt=128.Max 
lat=4."  The same device under 2.4.18 reports "Non-prefetchable 32 bit 
memory at 0x100000000 [0x100000fff]."  Other PCI devices have reports that 
differ in format but not significantly in the amount of and values in content.

I even went so far as to download the latest version 
(pcmcia-cs-3.1.34.tar.gz) of the PCMCIA stuff from SourceForge, compiled it 
all, and ended up with exactly the same results.  So I'm beginning to 
believe that it's not the PCMCIA/PCCard software.

I checked the kernel archives for any mention of this problem, and the 
closest I could find was a complaint regarding an IBM ThinkPad.  Ditto 
checking the bug list for the project on SourceForge.  Nothing on Toshiba.

I put the old hard drive back into the laptop so I can get some work done, 
but I still have all the stuff on the new drive.

The distributions involved are Red Hat 7.0 and Red Hat 7.3.

Where to "try using pci=biosirq"?  I tried adding it to the boot sequence, 
with no result.

I'm stumped.  Any suggestions where to start looking?


Stephen Satchell


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-16 23:45       ` [PATCH] 2.4-ac: sparc64 support for O(1) scheduler Robert Love
@ 2002-06-17  5:28         ` David S. Miller
  2002-06-17 21:18           ` Robert Love
  0 siblings, 1 reply; 36+ messages in thread
From: David S. Miller @ 2002-06-17  5:28 UTC (permalink / raw)
  To: rml; +Cc: mingo, alan, linux-kernel

   From: Robert Love <rml@mvista.com>
   Date: 16 Jun 2002 16:45:45 -0700
   
   My approach thus far with 2.5 -> 2.4 O(1) backports has been one of
   caution and it has worked fine thus far.  I figure, what is the rush?

Your changes were pretty, that's part of the problem.  Fixing things
correctly is 10 times more preferable to a 1 time hack "just for now".


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  3:24           ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
  2002-06-17  3:35             ` Robert Love
@ 2002-06-17  7:50             ` Zwane Mwaikambo
  2002-06-17  8:32               ` Ingo Molnar
  2002-06-17 16:26             ` Rusty Russell
  2 siblings, 1 reply; 36+ messages in thread
From: Zwane Mwaikambo @ 2002-06-17  7:50 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Robert Love, Alan Cox, David S. Miller, linux-kernel

On Mon, 17 Jun 2002, Ingo Molnar wrote:

> i have planned to submit the irqbalance patch for 2.4-ac real soon, which
> needs this function - current IRQ distribution on P4 SMP boxes is a
> showstopper.

Can we add a config time option for irqbalance? I consider it extra 
overhead for setups which can do the interrupt distribution via hardware 
properly, also irqbalance breaks NUMAQ horribly seeing as it assumes a 
number of things like addressing modes.

Regards,
	Zwane Mwaikambo

-- 
http://function.linuxpower.ca
		



^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  8:32               ` Ingo Molnar
@ 2002-06-17  8:23                 ` Zwane Mwaikambo
  2002-06-17  9:00                   ` Ingo Molnar
  0 siblings, 1 reply; 36+ messages in thread
From: Zwane Mwaikambo @ 2002-06-17  8:23 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Robert Love, Alan Cox, David S. Miller, linux-kernel

On Mon, 17 Jun 2002, Ingo Molnar wrote:

> 
> On Mon, 17 Jun 2002, Zwane Mwaikambo wrote:
> 
> > > i have planned to submit the irqbalance patch for 2.4-ac real soon, which
> > > needs this function - current IRQ distribution on P4 SMP boxes is a
> > > showstopper.
> > 
> > Can we add a config time option for irqbalance? I consider it extra
> > overhead for setups which can do the interrupt distribution via hardware
> > properly, [...]
> 
> What x86 hardware do you have in mind?

ye olde generic x86 SMP box, the interrupt handling imbalance came about 
with the P4 and their newer APIC setup did it not? Although i am aware 
that some x86 SMP boxes don't do the distribution properly too, thats why 
i reckon config option would be best.

> My main issue with irqbalance is the lack of testing it has - eg. a
> showstopper SMP-on-UP bug was found just two days ago.

Understandable, i agree not many people run 2.5 and it would help if it 
got into 2.4-ac for testing purposes.

> > [...] also irqbalance breaks NUMAQ horribly seeing as it assumes a
> > number of things like addressing modes.
> 
> exactly what does it assume that breaks NUMAQ?

<Disclaimer>
I am not a NUMAQ expert and do not even have access to one for testing
</Disclaimer>

The addressing mode, irq_balance assumes that the addressing mode is 
logical mode (when programming the IOREDTBL entries), whilst NUMAQ uses a 
completely different addressing architecture. Also another thing is 
consider this situation;

irqbalance programs IOAPIC#0 on node0 to deliver to CPU#6 on node1

Will that interrupt get delivered?

Regards,
	Zwane Mwaikambo

-- 
http://function.linuxpower.ca
		




^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  7:50             ` Zwane Mwaikambo
@ 2002-06-17  8:32               ` Ingo Molnar
  2002-06-17  8:23                 ` Zwane Mwaikambo
  0 siblings, 1 reply; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  8:32 UTC (permalink / raw)
  To: Zwane Mwaikambo; +Cc: Robert Love, Alan Cox, David S. Miller, linux-kernel


On Mon, 17 Jun 2002, Zwane Mwaikambo wrote:

> > i have planned to submit the irqbalance patch for 2.4-ac real soon, which
> > needs this function - current IRQ distribution on P4 SMP boxes is a
> > showstopper.
> 
> Can we add a config time option for irqbalance? I consider it extra
> overhead for setups which can do the interrupt distribution via hardware
> properly, [...]

What x86 hardware do you have in mind?

My main issue with irqbalance is the lack of testing it has - eg. a
showstopper SMP-on-UP bug was found just two days ago.

> [...] also irqbalance breaks NUMAQ horribly seeing as it assumes a
> number of things like addressing modes.

exactly what does it assume that breaks NUMAQ?

	Ingo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  8:23                 ` Zwane Mwaikambo
@ 2002-06-17  9:00                   ` Ingo Molnar
  2002-06-17  9:34                     ` Zwane Mwaikambo
  2002-06-18  7:16                     ` William Lee Irwin III
  0 siblings, 2 replies; 36+ messages in thread
From: Ingo Molnar @ 2002-06-17  9:00 UTC (permalink / raw)
  To: Zwane Mwaikambo; +Cc: Robert Love, Alan Cox, David S. Miller, linux-kernel


On Mon, 17 Jun 2002, Zwane Mwaikambo wrote:

> > > Can we add a config time option for irqbalance? I consider it extra
> > > overhead for setups which can do the interrupt distribution via hardware
> > > properly, [...]
> > 
> > What x86 hardware do you have in mind?
> 
> ye olde generic x86 SMP box, the interrupt handling imbalance came about
> with the P4 and their newer APIC setup did it not? Although i am aware
> that some x86 SMP boxes don't do the distribution properly too, thats
> why i reckon config option would be best.

even generic x86 SMP boxes benefit from irqbalance due to better irq
affinity. Actually one could hardly find a worse way to distribute
interrupts than the IO-APIC + lowest priority delivery mode does... [in
fact there is one, the P4's do it ;-) ]

> > > [...] also irqbalance breaks NUMAQ horribly seeing as it assumes a
> > > number of things like addressing modes.
> > 
> > exactly what does it assume that breaks NUMAQ?
> 
> <Disclaimer>
> I am not a NUMAQ expert and do not even have access to one for testing
> </Disclaimer>
> 
> The addressing mode, irq_balance assumes that the addressing mode is
> logical mode (when programming the IOREDTBL entries), whilst NUMAQ uses
> a completely different addressing architecture. [...]

irqbalance uses the set_ioapic_affinity() method to set affinity. The
clustered APIC code is broken if it doesnt handle this properly. (i dont
have such hardware so i cant tell, but it indeed doesnt appear to handle
this case properly.) By wrapping around at node boundary the irqbalance
code will work just fine.

> [...] Also another thing is consider this situation;
> 
> irqbalance programs IOAPIC#0 on node0 to deliver to CPU#6 on node1
> 
> Will that interrupt get delivered?

i agree that this could be a problem, but set_ioapic_affinity() can be
made dependent on the actual NUMA setup that is used. This is absolutely
needed anyway for a proper /proc/irq/*/smp_affinity feature.

	Ingo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  9:00                   ` Ingo Molnar
@ 2002-06-17  9:34                     ` Zwane Mwaikambo
  2002-06-18  7:16                     ` William Lee Irwin III
  1 sibling, 0 replies; 36+ messages in thread
From: Zwane Mwaikambo @ 2002-06-17  9:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Robert Love, Alan Cox, David S. Miller, Linux Kernel,
	Martin Bligh

<Martin Bligh added to CC>

On Mon, 17 Jun 2002, Ingo Molnar wrote:

> irqbalance uses the set_ioapic_affinity() method to set affinity. The
> clustered APIC code is broken if it doesnt handle this properly. (i dont
> have such hardware so i cant tell, but it indeed doesnt appear to handle
> this case properly.) By wrapping around at node boundary the irqbalance
> code will work just fine.

I agree. Also, we have to be careful about the usage of cpu_online_map in 
balance_irq, there might need to be a bit of reworking of some of the 
other parts to get this working e.g. being able to determine which node a 
specific IOAPIC register is on (perhaps there might be 1 or 2 IOAPICs / 
node) etc etc. Martin?

> i agree that this could be a problem, but set_ioapic_affinity() can be
> made dependent on the actual NUMA setup that is used. This is absolutely
> needed anyway for a proper /proc/irq/*/smp_affinity feature.

Agreed.

Thanks,
	Zwane
-- 
http://function.linuxpower.ca
		



^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  3:24           ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
  2002-06-17  3:35             ` Robert Love
  2002-06-17  7:50             ` Zwane Mwaikambo
@ 2002-06-17 16:26             ` Rusty Russell
  2 siblings, 0 replies; 36+ messages in thread
From: Rusty Russell @ 2002-06-17 16:26 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: rml, alan, davem, linux-kernel

On Mon, 17 Jun 2002 05:24:30 +0200 (CEST)
Ingo Molnar <mingo@elte.hu> wrote:

> > >  - sched_setaffinity() & sched_getaffinity() syscalls on x86.
> >
> > Do we want to introduce this into 2.4 now?  I realize 2.4-ac is not 2.4
> > proper, but if there is a chance this interface could change...
> 
> the setaffinity()/getaffinity() interface looks pretty robust, i dont
> expect any changes

There's one coming.  In 2.5.soon, you'll need to handle the "CPU going away"
signal, otherwise your process will abort as someone downs a CPU.

The problem with backporting one and not the other, is that apps can't be
written correctly for 2.4 and 2.5 8(

Rusty.
-- 
   there are those who do and those who hang on and you don't see too
   many doers quoting their contemporaries.  -- Larry McVoy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-17  5:28         ` David S. Miller
@ 2002-06-17 21:18           ` Robert Love
  0 siblings, 0 replies; 36+ messages in thread
From: Robert Love @ 2002-06-17 21:18 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel

On Sun, 2002-06-16 at 22:28, David S. Miller wrote:

> Your changes were pretty, that's part of the problem.  Fixing things
> correctly is 10 times more preferable to a 1 time hack "just for now".

*shrug* I think you are missing my point but that is OK - we really do
not need to fight over it.

The switch_mm patch touched _core_ bits - code that affects i386 which
works fine now in 2.4-ac.  As 2.4-ac is stable and i386 is working fine,
I want to move changes into it slowly and with testing.

If you object to merging the "broken" sparc64 patch now but concede we
can wait for Ingo's patch, then I agree.  In fact, in light of Ingo's
patch Alan should not merge what I sent.  But my opinion would be to
hold off until the new bits saw some testing in 2.5 ... however trivial
they may be.

	Robert Love


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-17  9:00                   ` Ingo Molnar
  2002-06-17  9:34                     ` Zwane Mwaikambo
@ 2002-06-18  7:16                     ` William Lee Irwin III
  2002-06-19  1:05                       ` Matthew Dobson
  2002-06-24  0:16                       ` Martin J. Bligh
  1 sibling, 2 replies; 36+ messages in thread
From: William Lee Irwin III @ 2002-06-18  7:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Robert Love, Alan Cox, David S. Miller,
	linux-kernel, Martin.Bligh, colpatch, hbaum, cleverdj

On Mon, Jun 17, 2002 at 11:00:26AM +0200, Ingo Molnar wrote:
> irqbalance uses the set_ioapic_affinity() method to set affinity. The
> clustered APIC code is broken if it doesnt handle this properly. (i dont
> have such hardware so i cant tell, but it indeed doesnt appear to handle
> this case properly.) By wrapping around at node boundary the irqbalance
> code will work just fine.

Perhaps a brief look at the code will help. Please forgive my
non-preservation of whitespace as I cut and pasted it.


static inline void balance_irq(int irq)
{
#if CONFIG_SMP
    irq_balance_t *entry = irq_balance + irq;
    unsigned long now = jiffies;

    if (unlikely(entry->timestamp != now)) {
        unsigned long allowed_mask;
        int random_number;

        rdtscl(random_number);
        random_number &= 1;

        allowed_mask = cpu_online_map & irq_affinity[irq];
        entry->timestamp = now;
        entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
        set_ioapic_affinity(irq, 1 << entry->cpu);
    }
#endif
}

        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
	1 << entry->cpu



This could be problematic ...


static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
{
    unsigned long flags;

    /*
     * Only the first 8 bits are valid.
     */
    mask = mask << 24;
    spin_lock_irqsave(&ioapic_lock, flags);
    __DO_ACTION(1, = mask, )
    spin_unlock_irqrestore(&ioapic_lock, flags);
}


According to this, nothing over 8 cpu's can work as the cpu id is used
as a shift into an 8-bit bitfield. Also,


#define __DO_ACTION(R, ACTION, FINAL)                                   \
                                                                        \
{                                                                       \
        int pin;                                                        \
        struct irq_pin_list *entry = irq_2_pin + irq;                   \
                                                                        \
        for (;;) {                                                      \
                unsigned int reg;                                       \
                pin = entry->pin;                                       \
                if (pin == -1)                                          \
                        break;                                          \
                reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
                reg ACTION;                                             \
                io_apic_modify(entry->apic, reg);                       \
                if (!entry->next)                                       \
                        break;                                          \
                entry = irq_2_pin + entry->next;                        \
        }                                                               \
        FINAL;                                                          \
}

ACTION is supposed to be an assignment to reg; in clustered hierarchical
destination format this is not a bitmask as assumed by 1 << entry->cpu.


Matt, Mike, please comment.


Cheers,
Bill

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-18  7:16                     ` William Lee Irwin III
@ 2002-06-19  1:05                       ` Matthew Dobson
  2002-06-20 20:22                         ` Andrew Theurer
  2002-06-24  0:16                       ` Martin J. Bligh
  1 sibling, 1 reply; 36+ messages in thread
From: Matthew Dobson @ 2002-06-19  1:05 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Ingo Molnar, Zwane Mwaikambo, Robert Love, Alan Cox,
	David S. Miller, linux-kernel, Martin.Bligh, hbaum, cleverdj

I'm looking at this right now, as it is definitely broken on our NUMA-Q 
hardware when running in multiquad mode.  It needs to respect clustered APIC 
mode, so I'm working on it.

Cheers!

-Matt

William Lee Irwin III wrote:
> On Mon, Jun 17, 2002 at 11:00:26AM +0200, Ingo Molnar wrote:
> 
>>irqbalance uses the set_ioapic_affinity() method to set affinity. The
>>clustered APIC code is broken if it doesnt handle this properly. (i dont
>>have such hardware so i cant tell, but it indeed doesnt appear to handle
>>this case properly.) By wrapping around at node boundary the irqbalance
>>code will work just fine.
> 
> 
> Perhaps a brief look at the code will help. Please forgive my
> non-preservation of whitespace as I cut and pasted it.
> 
> 
> static inline void balance_irq(int irq)
> {
> #if CONFIG_SMP
>     irq_balance_t *entry = irq_balance + irq;
>     unsigned long now = jiffies;
> 
>     if (unlikely(entry->timestamp != now)) {
>         unsigned long allowed_mask;
>         int random_number;
> 
>         rdtscl(random_number);
>         random_number &= 1;
> 
>         allowed_mask = cpu_online_map & irq_affinity[irq];
>         entry->timestamp = now;
>         entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
>         set_ioapic_affinity(irq, 1 << entry->cpu);
>     }
> #endif
> }
> 
>         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 	1 << entry->cpu
> 
> 
> 
> This could be problematic ...
> 
> 
> static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
> {
>     unsigned long flags;
> 
>     /*
>      * Only the first 8 bits are valid.
>      */
>     mask = mask << 24;
>     spin_lock_irqsave(&ioapic_lock, flags);
>     __DO_ACTION(1, = mask, )
>     spin_unlock_irqrestore(&ioapic_lock, flags);
> }
> 
> 
> According to this, nothing over 8 cpu's can work as the cpu id is used
> as a shift into an 8-bit bitfield. Also,
> 
> 
> #define __DO_ACTION(R, ACTION, FINAL)                                   \
>                                                                         \
> {                                                                       \
>         int pin;                                                        \
>         struct irq_pin_list *entry = irq_2_pin + irq;                   \
>                                                                         \
>         for (;;) {                                                      \
>                 unsigned int reg;                                       \
>                 pin = entry->pin;                                       \
>                 if (pin == -1)                                          \
>                         break;                                          \
>                 reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
>                 reg ACTION;                                             \
>                 io_apic_modify(entry->apic, reg);                       \
>                 if (!entry->next)                                       \
>                         break;                                          \
>                 entry = irq_2_pin + entry->next;                        \
>         }                                                               \
>         FINAL;                                                          \
> }
> 
> ACTION is supposed to be an assignment to reg; in clustered hierarchical
> destination format this is not a bitmask as assumed by 1 << entry->cpu.
> 
> 
> Matt, Mike, please comment.
> 
> 
> Cheers,
> Bill
> 



^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] 2.4-ac: sparc64 support for O(1) scheduler
  2002-06-15 13:22     ` David S. Miller
@ 2002-06-20 19:42       ` Alan Cox
  0 siblings, 0 replies; 36+ messages in thread
From: Alan Cox @ 2002-06-20 19:42 UTC (permalink / raw)
  To: David S. Miller; +Cc: rml, alan, linux-kernel

>    them... (I am currently putting together all the scheduler bits we have
>    been working on for a 2.4-ac patch...)
> 
> Your sparc64 kernel/sched.c bits have zero testing in any kernel.
> What point are you trying to make?  It disables a very important
> optimization on SMP sparc64.  It's simply unacceptable.

I don't care about Sparc64, especially as a short term item. Long term
yes you are right but for the -ac work, it can fall back for a while

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-19  1:05                       ` Matthew Dobson
@ 2002-06-20 20:22                         ` Andrew Theurer
  0 siblings, 0 replies; 36+ messages in thread
From: Andrew Theurer @ 2002-06-20 20:22 UTC (permalink / raw)
  To: colpatch, William Lee Irwin III
  Cc: Ingo Molnar, Zwane Mwaikambo, Robert Love, Alan Cox,
	David S. Miller, linux-kernel, Martin.Bligh, hbaum, cleverdj,
	anton

Ingo,

Could we also change "now" to a longer interval?  In netbench, 2.4.18, O1, 
irqbalance, I get the following results:

[4-way P4, 4 acenics]
now = jiffies	743 Mbps
now = jiffies*10	784 Mbps
now = jiffies*20	803 Mbps
now = jiffies*30	800 Mbps
now = jiffies*100	770 Mbps

[no irqbalance patch]
all IRQs on one CPU	809 Mbps
1 acenic per CPU	800 Mbps

Either the IRQs don't get to stick around long enough, or there is a high cost 
for the IOAPIC programming?  Anton may have some info on this as well....

-Andrew Theurer

On Tuesday 18 June 2002 20:05, Matthew Dobson wrote:
> I'm looking at this right now, as it is definitely broken on our NUMA-Q
> hardware when running in multiquad mode.  It needs to respect clustered
> APIC mode, so I'm working on it.
>
> Cheers!
>
> -Matt
>
> William Lee Irwin III wrote:
> > On Mon, Jun 17, 2002 at 11:00:26AM +0200, Ingo Molnar wrote:
> >>irqbalance uses the set_ioapic_affinity() method to set affinity. The
> >>clustered APIC code is broken if it doesnt handle this properly. (i dont
> >>have such hardware so i cant tell, but it indeed doesnt appear to handle
> >>this case properly.) By wrapping around at node boundary the irqbalance
> >>code will work just fine.
> >
> > Perhaps a brief look at the code will help. Please forgive my
> > non-preservation of whitespace as I cut and pasted it.
> >
> >
> > static inline void balance_irq(int irq)
> > {
> > #if CONFIG_SMP
> >     irq_balance_t *entry = irq_balance + irq;
> >     unsigned long now = jiffies;
> >
> >     if (unlikely(entry->timestamp != now)) {
> >         unsigned long allowed_mask;
> >         int random_number;
> >
> >         rdtscl(random_number);
> >         random_number &= 1;
> >
> >         allowed_mask = cpu_online_map & irq_affinity[irq];
> >         entry->timestamp = now;
> >         entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
> >         set_ioapic_affinity(irq, 1 << entry->cpu);
> >     }
> > #endif
> > }
> >
> >         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > 	1 << entry->cpu
> >
> >
> >
> > This could be problematic ...
> >
> >
> > static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
> > {
> >     unsigned long flags;
> >
> >     /*
> >      * Only the first 8 bits are valid.
> >      */
> >     mask = mask << 24;
> >     spin_lock_irqsave(&ioapic_lock, flags);
> >     __DO_ACTION(1, = mask, )
> >     spin_unlock_irqrestore(&ioapic_lock, flags);
> > }
> >
> >
> > According to this, nothing over 8 cpu's can work as the cpu id is used
> > as a shift into an 8-bit bitfield. Also,
> >
> >
> > #define __DO_ACTION(R, ACTION, FINAL)                                   \
> >                                                                         \
> > {                                                                       \
> >         int pin;                                                        \
> >         struct irq_pin_list *entry = irq_2_pin + irq;                   \
> >                                                                         \
> >         for (;;) {                                                      \
> >                 unsigned int reg;                                       \
> >                 pin = entry->pin;                                       \
> >                 if (pin == -1)                                          \
> >                         break;                                          \
> >                 reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
> >                 reg ACTION;                                             \
> >                 io_apic_modify(entry->apic, reg);                       \
> >                 if (!entry->next)                                       \
> >                         break;                                          \
> >                 entry = irq_2_pin + entry->next;                        \
> >         }                                                               \
> >         FINAL;                                                          \
> > }
> >
> > ACTION is supposed to be an assignment to reg; in clustered hierarchical
> > destination format this is not a bitmask as assumed by 1 << entry->cpu.
> >
> >
> > Matt, Mike, please comment.
> >
> >
> > Cheers,
> > Bill
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3.
  2002-06-18  7:16                     ` William Lee Irwin III
  2002-06-19  1:05                       ` Matthew Dobson
@ 2002-06-24  0:16                       ` Martin J. Bligh
  1 sibling, 0 replies; 36+ messages in thread
From: Martin J. Bligh @ 2002-06-24  0:16 UTC (permalink / raw)
  To: William Lee Irwin III, Ingo Molnar
  Cc: Zwane Mwaikambo, Robert Love, Alan Cox, David S. Miller,
	linux-kernel, colpatch, hbaum, cleverdj

> On Mon, Jun 17, 2002 at 11:00:26AM +0200, Ingo Molnar wrote:
>> irqbalance uses the set_ioapic_affinity() method to set affinity. The
>> clustered APIC code is broken if it doesnt handle this properly. (i dont
>> have such hardware so i cant tell, but it indeed doesnt appear to handle
>> this case properly.) By wrapping around at node boundary the irqbalance
>> code will work just fine.
> 
> Perhaps a brief look at the code will help. Please forgive my
> non-preservation of whitespace as I cut and pasted it.

IIRC, I set up the IOAPICs to use physical mode broadcast
on all quads - physical broadcasts are quad-local, and thus
the interrupt is always processed by a cpu on the quad
where it originated. Much simpler than trying to correctly
program clustered logical mode broadcasts differently for
every quad.

You also don't want to end up reprogramming the IO-APICs
cross-quad, you want a per-node thread to do this. We have
2 IO-APICs per node.

Whilst balancing of some form is definitely valuable for a P4,
I'm less convinced it's worthwhile for a P3 system. I presume
what you're trying to achieve is cache warmth for the interrupt
handling code at the expense of the cost of constantly reprogramming
the IO-APICs. 

At the very least, we need to have a simple disable config option 
in order to benchmark whether this change is worthwhile for each subarchitecture.

M.



^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2002-06-24  0:18 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-06-13 19:21 [PATCH] 2.4-ac: sparc64 support for O(1) scheduler Robert Love
2002-06-14  4:25 ` David S. Miller
2002-06-14 17:32   ` Robert Love
2002-06-15 13:22     ` David S. Miller
2002-06-20 19:42       ` Alan Cox
2002-06-16 15:19     ` Ingo Molnar
2002-06-16 17:00       ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
2002-06-16 23:57         ` Robert Love
2002-06-17  0:13           ` J.A. Magallon
2002-06-17  4:28             ` Ingo Molnar
2002-06-17  0:15           ` Robert Love
2002-06-17  3:49             ` Ingo Molnar
2002-06-17  3:57               ` Robert Love
2002-06-17  4:07                 ` Ingo Molnar
2002-06-17  4:02               ` Robert Love
2002-06-17  4:26                 ` Ingo Molnar
2002-06-17  4:49                 ` [patch] 2.5.22 current scheduler bits #1 Ingo Molnar
2002-06-17  3:24           ` [patch] 2.4.19-pre10-ac2: O(1) scheduler merge, -A3 Ingo Molnar
2002-06-17  3:35             ` Robert Love
2002-06-17  4:01               ` Ingo Molnar
2002-06-17  7:50             ` Zwane Mwaikambo
2002-06-17  8:32               ` Ingo Molnar
2002-06-17  8:23                 ` Zwane Mwaikambo
2002-06-17  9:00                   ` Ingo Molnar
2002-06-17  9:34                     ` Zwane Mwaikambo
2002-06-18  7:16                     ` William Lee Irwin III
2002-06-19  1:05                       ` Matthew Dobson
2002-06-20 20:22                         ` Andrew Theurer
2002-06-24  0:16                       ` Martin J. Bligh
2002-06-17 16:26             ` Rusty Russell
2002-06-17  4:51           ` Toshiba PCToPIC97 PC Card freeze in 2.4.18 Stephen Satchell
2002-06-16 23:45       ` [PATCH] 2.4-ac: sparc64 support for O(1) scheduler Robert Love
2002-06-17  5:28         ` David S. Miller
2002-06-17 21:18           ` Robert Love
2002-06-14 22:00   ` Thomas Duffy
2002-06-15 13:35     ` David S. Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox