* [PATCH][2.5] IRQ distribution patch for 2.5.58
@ 2003-01-16 21:08 Kamble, Nitin A
2003-02-17 18:16 ` William Lee Irwin III
0 siblings, 1 reply; 6+ messages in thread
From: Kamble, Nitin A @ 2003-01-16 21:08 UTC (permalink / raw)
To: linux-kernel; +Cc: Nakajima, Jun, Mallick, Asit K, Saxena, Sunil
[-- Attachment #1: Type: text/plain, Size: 14186 bytes --]
Hi Linus,
This patch improves Linux kernel performance under heavy interrupt load on IA32 systems, and also incorporates Hyper-Threading awareness into the IRQ distribution.
I have rebased the patch for 2.5.58 kernel, with minor changes for the comments I received from Greg KH on LKML.
The performance change results collected and received with validation so far:
4xP4 Xeon with HT : netperf : +12%
4xP4 Xeon with HT on IBM Summit : NetBench: +5%
4xp6 system : netperf : +2%
Please add it in the 2.5.* kernel tree.
Thanks & Regards,
Nitin
diff -Naru 2.5.58/arch/i386/kernel/io_apic.c kirq/arch/i386/kernel/io_apic.c
--- 2.5.58/arch/i386/kernel/io_apic.c 2003-01-13 21:58:29.000000000 -0800
+++ kirq/arch/i386/kernel/io_apic.c 2003-01-15 18:32:44.000000000 -0800
@@ -207,19 +207,34 @@
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#if CONFIG_SMP
-
-typedef struct {
- unsigned int cpu;
- unsigned long timestamp;
-} ____cacheline_aligned irq_balance_t;
-
-static irq_balance_t irq_balance[NR_IRQS] __cacheline_aligned
- = { [ 0 ... NR_IRQS-1 ] = { 0, 0 } };
+#if defined(CONFIG_SMP)
+# include <asm/processor.h> /* kernel_thread() */
+# include <linux/kernel_stat.h> /* kstat */
+# include <linux/slab.h> /* kmalloc() */
+# include <linux/timer.h> /* time_after() */
+
+# if CONFIG_BALANCED_IRQ_DEBUG
+# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
+# define Dprintk(x...) do { TDprintk(x); } while (0)
+# else
+# define TDprintk(x...)
+# define Dprintk(x...)
+# endif
extern unsigned long irq_affinity [NR_IRQS];
-
-#endif
+unsigned long __cacheline_aligned irq_balance_mask [NR_IRQS];
+static int irqbalance_disabled __initdata = 0;
+static int physical_balance = 0;
+
+struct irq_cpu_info {
+ unsigned long * last_irq;
+ unsigned long * irq_delta;
+ unsigned long irq;
+} irq_cpu_data[NR_CPUS];
+
+#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
+#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))
@@ -227,10 +242,224 @@
#define IRQ_ALLOWED(cpu,allowed_mask) \
((1 << cpu) & (allowed_mask))
-#if CONFIG_SMP
+#define CPU_TO_PACKAGEINDEX(i) \
+ ((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i)
+
+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
+#define BALANCED_IRQ_MORE_DELTA (HZ/10)
+#define BALANCED_IRQ_LESS_DELTA (HZ)
+
+long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
+
+static inline void balance_irq(int cpu, int irq);
+
+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
+{
+ int i, j;
+ Dprintk("Rotating IRQs among CPUs.\n");
+ for (i = 0; i < NR_CPUS; i++) {
+ for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
+ if (!irq_desc[j].action)
+ continue;
+ /* Is it a significant load ? */
+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < useful_load_threshold)
+ continue;
+ balance_irq(i, j);
+ }
+ }
+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ return;
+}
+
+static void do_irq_balance(void)
+{
+ int i, j;
+ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
+ unsigned long move_this_load = 0;
+ int max_loaded = 0, min_loaded = 0;
+ unsigned long useful_load_threshold = balanced_irq_interval + 10;
+ int selected_irq;
+ int tmp_loaded, first_attempt = 1;
+ unsigned long tmp_cpu_irq;
+ unsigned long imbalance = 0;
+ unsigned long allowed_mask;
+ unsigned long target_cpu_mask;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ int package_index;
+ CPU_IRQ(i) = 0;
+ if (!cpu_online(i))
+ continue;
+ package_index = CPU_TO_PACKAGEINDEX(i);
+ for (j = 0; j < NR_IRQS; j++) {
+ unsigned long value_now, delta;
+ /* Is this an active IRQ? */
+ if (!irq_desc[j].action)
+ continue;
+ if ( package_index == i )
+ IRQ_DELTA(package_index,j) = 0;
+ /* Determine the total count per processor per IRQ */
+ value_now = (unsigned long) kstat_cpu(i).irqs[j];
+
+ /* Determine the activity per processor per IRQ */
+ delta = value_now - LAST_CPU_IRQ(i,j);
+
+ /* Update last_cpu_irq[][] for the next time */
+ LAST_CPU_IRQ(i,j) = value_now;
+
+ /* Ignore IRQs whose rate is less than the clock */
+ if (delta < useful_load_threshold)
+ continue;
+ /* update the load for the processor or package total */
+ IRQ_DELTA(package_index,j) += delta;
+
+ /* Keep track of the higher numbered sibling as well */
+ if (i != package_index)
+ CPU_IRQ(i) += delta;
+ /*
+ * We have sibling A and sibling B in the package
+ *
+ * cpu_irq[A] = load for cpu A + load for cpu B
+ * cpu_irq[B] = load for cpu B
+ */
+ CPU_IRQ(package_index) += delta;
+ }
+ }
+ /* Find the least loaded processor package */
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i))
+ continue;
+ if (physical_balance && i > cpu_sibling_map[i])
+ continue;
+ if (min_cpu_irq > CPU_IRQ(i)) {
+ min_cpu_irq = CPU_IRQ(i);
+ min_loaded = i;
+ }
+ }
+ max_cpu_irq = ULONG_MAX;
+
+tryanothercpu:
+ /* Look for heaviest loaded processor.
+ * We may come back to get the next heaviest loaded processor.
+ * Skip processors with trivial loads.
+ */
+ tmp_cpu_irq = 0;
+ tmp_loaded = -1;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i))
+ continue;
+ if (physical_balance && i > cpu_sibling_map[i])
+ continue;
+ if (max_cpu_irq <= CPU_IRQ(i))
+ continue;
+ if (tmp_cpu_irq < CPU_IRQ(i)) {
+ tmp_cpu_irq = CPU_IRQ(i);
+ tmp_loaded = i;
+ }
+ }
+
+ if (tmp_loaded == -1) {
+ /* In the case of small number of heavy interrupt sources,
+ * loading some of the cpus too much. We use Ingo's original
+ * approach to rotate them around.
+ */
+ if (!first_attempt && imbalance >= useful_load_threshold) {
+ rotate_irqs_among_cpus(useful_load_threshold);
+ return;
+ }
+ goto not_worth_the_effort;
+ }
+
+ first_attempt = 0; /* heaviest search */
+ max_cpu_irq = tmp_cpu_irq; /* load */
+ max_loaded = tmp_loaded; /* processor */
+ imbalance = (max_cpu_irq - min_cpu_irq) / 2;
+
+ Dprintk("max_loaded cpu = %d\n", max_loaded);
+ Dprintk("min_loaded cpu = %d\n", min_loaded);
+ Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
+ Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
+ Dprintk("load imbalance = %lu\n", imbalance);
+
+ /* if imbalance is less than approx 10% of max load, then
+ * observe diminishing returns action. - quit
+ */
+ if (imbalance < (max_cpu_irq >> 3)) {
+ Dprintk("Imbalance too trivial\n");
+ goto not_worth_the_effort;
+ }
+
+tryanotherirq:
+ /* if we select an IRQ to move that can't go where we want, then
+ * see if there is another one to try.
+ */
+ move_this_load = 0;
+ selected_irq = -1;
+ for (j = 0; j < NR_IRQS; j++) {
+ /* Is this an active IRQ? */
+ if (!irq_desc[j].action)
+ continue;
+ if (imbalance <= IRQ_DELTA(max_loaded,j))
+ continue;
+ /* Try to find the IRQ that is closest to the imbalance
+ * without going over.
+ */
+ if (move_this_load < IRQ_DELTA(max_loaded,j)) {
+ move_this_load = IRQ_DELTA(max_loaded,j);
+ selected_irq = j;
+ }
+ }
+ if (selected_irq == -1) {
+ goto tryanothercpu;
+ }
-#define IRQ_BALANCE_INTERVAL (HZ/50)
+ imbalance = move_this_load;
+ /* For physical_balance case, we accumulated both load
+ * values in the one of the siblings cpu_irq[],
+ * to use the same code for physical and logical processors
+ * as much as possible.
+ *
+ * NOTE: the cpu_irq[] array holds the sum of the load for
+ * sibling A and sibling B in the slot for the lowest numbered
+ * sibling (A), _AND_ the load for sibling B in the slot for
+ * the higher numbered sibling.
+ *
+ * We seek the least loaded sibling by making the comparison
+ * (A+B)/2 vs B
+ */
+ if (physical_balance && (CPU_IRQ(min_loaded) >> 1) > CPU_IRQ(cpu_sibling_map[min_loaded]))
+ min_loaded = cpu_sibling_map[min_loaded];
+
+ allowed_mask = cpu_online_map & irq_affinity[selected_irq];
+ target_cpu_mask = 1 << min_loaded;
+
+ if (target_cpu_mask & allowed_mask) {
+ irq_desc_t *desc = irq_desc + selected_irq;
+ Dprintk("irq = %d moved to cpu = %d\n", selected_irq, min_loaded);
+ /* mark for change destination */
+ spin_lock(&desc->lock);
+ irq_balance_mask[selected_irq] = target_cpu_mask;
+ spin_unlock(&desc->lock);
+ /* Since we made a change, come back sooner to
+ * check for more variation.
+ */
+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ return;
+ }
+ goto tryanotherirq;
+
+not_worth_the_effort:
+ /* if we did not find an IRQ to move, then adjust the time interval upward */
+ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
+ Dprintk("IRQ worth rotating not found\n");
+ return;
+}
+
static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
{
int search_idle = 1;
@@ -257,34 +486,111 @@
return cpu;
}
-static inline void balance_irq(int irq)
+static inline void balance_irq (int cpu, int irq)
{
- irq_balance_t *entry = irq_balance + irq;
unsigned long now = jiffies;
-
+ unsigned long allowed_mask;
+ unsigned int new_cpu;
+
if (no_balance_irq)
return;
- if (unlikely(time_after(now, entry->timestamp + IRQ_BALANCE_INTERVAL))) {
- unsigned long allowed_mask;
- unsigned int new_cpu;
- int random_number;
+ allowed_mask = cpu_online_map & irq_affinity[irq];
+ new_cpu = move(cpu, allowed_mask, now, 1);
+ if (cpu != new_cpu) {
+ irq_desc_t *desc = irq_desc + irq;
+ spin_lock(&desc->lock);
+ irq_balance_mask[irq] = 1 << new_cpu;
+ spin_unlock(&desc->lock);
+ }
+}
- rdtscl(random_number);
- random_number &= 1;
+int balanced_irq(void *unused)
+{
+ int i;
+ unsigned long prev_balance_time = jiffies;
+ long time_remaining = balanced_irq_interval;
+ daemonize();
+ sigfillset(&current->blocked);
+ sprintf(current->comm, "balanced_irq");
+
+ /* push everything to CPU 0 to give us a starting point. */
+ for (i = 0 ; i < NR_IRQS ; i++)
+ irq_balance_mask[i] = 1 << 0;
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ time_remaining = schedule_timeout(time_remaining);
+ if (time_after(jiffies, prev_balance_time+balanced_irq_interval)) {
+ Dprintk("balanced_irq: calling do_irq_balance() %lu\n", jiffies);
+ do_irq_balance();
+ prev_balance_time = jiffies;
+ time_remaining = balanced_irq_interval;
+ }
+ }
+}
+
+static int __init balanced_irq_init(void)
+{
+ int i;
+ struct cpuinfo_x86 *c;
+ c = &boot_cpu_data;
+ if (irqbalance_disabled)
+ return 0;
+ /* Enable physical balance only if more than 1 physical processor is present */
+ if (smp_num_siblings > 1 && cpu_online_map >> 2)
+ physical_balance = 1;
- allowed_mask = cpu_online_map & irq_affinity[irq];
- entry->timestamp = now;
- new_cpu = move(entry->cpu, allowed_mask, now, random_number);
- if (entry->cpu != new_cpu) {
- entry->cpu = new_cpu;
- set_ioapic_affinity(irq, 1 << new_cpu);
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i))
+ continue;
+ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
+ printk(KERN_ERR "balanced_irq_init: out of memory");
+ goto failed;
}
+ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
+ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
}
+
+ printk(KERN_INFO "Starting balanced_irq\n");
+ if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
+ return 0;
+ else
+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
+failed:
+ for (i = 0; i < NR_CPUS; i++) {
+ if(irq_cpu_data[i].irq_delta)
+ kfree(irq_cpu_data[i].irq_delta);
+ if(irq_cpu_data[i].last_irq)
+ kfree(irq_cpu_data[i].last_irq);
+ }
+ return 0;
}
-#else /* !SMP */
-static inline void balance_irq(int irq) { }
-#endif
+
+static int __init irqbalance_disable(char *str)
+{
+ irqbalance_disabled = 1;
+ return 0;
+}
+
+__setup("noirqbalance", irqbalance_disable);
+
+static void set_ioapic_affinity (unsigned int irq, unsigned long mask);
+
+static inline void move_irq(int irq)
+{
+ /* note - we hold the desc->lock */
+ if (unlikely(irq_balance_mask[irq])) {
+ set_ioapic_affinity(irq, irq_balance_mask[irq]);
+ irq_balance_mask[irq] = 0;
+ }
+}
+
+__initcall(balanced_irq_init);
+
+#endif /* defined(CONFIG_SMP) */
+
/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -1307,7 +1613,7 @@
*/
static void ack_edge_ioapic_irq(unsigned int irq)
{
- balance_irq(irq);
+ move_irq(irq);
if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
== (IRQ_PENDING | IRQ_DISABLED))
mask_IO_APIC_irq(irq);
@@ -1347,7 +1653,7 @@
unsigned long v;
int i;
- balance_irq(irq);
+ move_irq(irq);
/*
* It appears there is an erratum which affects at least version 0x11
* of I/O APIC (that's the 82093AA and cores integrated into various
diff -Naru 2.5.58/Documentation/kernel-parameters.txt kirq/Documentation/kernel-parameters.txt
--- 2.5.58/Documentation/kernel-parameters.txt 2003-01-13 21:59:35.000000000 -0800
+++ kirq/Documentation/kernel-parameters.txt 2003-01-14 16:18:41.000000000 -0800
@@ -352,6 +352,8 @@
hugepages= [HW,IA-32] Maximal number of HugeTLB pages
+ noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing
+
i8042_direct [HW] Non-translated mode
i8042_dumbkbd
i8042_noaux
[-- Attachment #2: kirq_2.5.58.patch --]
[-- Type: application/octet-stream, Size: 13155 bytes --]
diff -Naru 2.5.58/arch/i386/kernel/io_apic.c kirq/arch/i386/kernel/io_apic.c
--- 2.5.58/arch/i386/kernel/io_apic.c 2003-01-13 21:58:29.000000000 -0800
+++ kirq/arch/i386/kernel/io_apic.c 2003-01-15 18:32:44.000000000 -0800
@@ -207,19 +207,34 @@
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#if CONFIG_SMP
-
-typedef struct {
- unsigned int cpu;
- unsigned long timestamp;
-} ____cacheline_aligned irq_balance_t;
-
-static irq_balance_t irq_balance[NR_IRQS] __cacheline_aligned
- = { [ 0 ... NR_IRQS-1 ] = { 0, 0 } };
+#if defined(CONFIG_SMP)
+# include <asm/processor.h> /* kernel_thread() */
+# include <linux/kernel_stat.h> /* kstat */
+# include <linux/slab.h> /* kmalloc() */
+# include <linux/timer.h> /* time_after() */
+
+# if CONFIG_BALANCED_IRQ_DEBUG
+# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
+# define Dprintk(x...) do { TDprintk(x); } while (0)
+# else
+# define TDprintk(x...)
+# define Dprintk(x...)
+# endif
extern unsigned long irq_affinity [NR_IRQS];
-
-#endif
+unsigned long __cacheline_aligned irq_balance_mask [NR_IRQS];
+static int irqbalance_disabled __initdata = 0;
+static int physical_balance = 0;
+
+struct irq_cpu_info {
+ unsigned long * last_irq;
+ unsigned long * irq_delta;
+ unsigned long irq;
+} irq_cpu_data[NR_CPUS];
+
+#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
+#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))
@@ -227,10 +242,224 @@
#define IRQ_ALLOWED(cpu,allowed_mask) \
((1 << cpu) & (allowed_mask))
-#if CONFIG_SMP
+#define CPU_TO_PACKAGEINDEX(i) \
+ ((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i)
+
+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
+#define BALANCED_IRQ_MORE_DELTA (HZ/10)
+#define BALANCED_IRQ_LESS_DELTA (HZ)
+
+long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
+
+static inline void balance_irq(int cpu, int irq);
+
+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
+{
+ int i, j;
+ Dprintk("Rotating IRQs among CPUs.\n");
+ for (i = 0; i < NR_CPUS; i++) {
+ for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
+ if (!irq_desc[j].action)
+ continue;
+ /* Is it a significant load ? */
+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < useful_load_threshold)
+ continue;
+ balance_irq(i, j);
+ }
+ }
+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ return;
+}
+
+static void do_irq_balance(void)
+{
+ int i, j;
+ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
+ unsigned long move_this_load = 0;
+ int max_loaded = 0, min_loaded = 0;
+ unsigned long useful_load_threshold = balanced_irq_interval + 10;
+ int selected_irq;
+ int tmp_loaded, first_attempt = 1;
+ unsigned long tmp_cpu_irq;
+ unsigned long imbalance = 0;
+ unsigned long allowed_mask;
+ unsigned long target_cpu_mask;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ int package_index;
+ CPU_IRQ(i) = 0;
+ if (!cpu_online(i))
+ continue;
+ package_index = CPU_TO_PACKAGEINDEX(i);
+ for (j = 0; j < NR_IRQS; j++) {
+ unsigned long value_now, delta;
+ /* Is this an active IRQ? */
+ if (!irq_desc[j].action)
+ continue;
+ if ( package_index == i )
+ IRQ_DELTA(package_index,j) = 0;
+ /* Determine the total count per processor per IRQ */
+ value_now = (unsigned long) kstat_cpu(i).irqs[j];
+
+ /* Determine the activity per processor per IRQ */
+ delta = value_now - LAST_CPU_IRQ(i,j);
+
+ /* Update last_cpu_irq[][] for the next time */
+ LAST_CPU_IRQ(i,j) = value_now;
+
+ /* Ignore IRQs whose rate is less than the clock */
+ if (delta < useful_load_threshold)
+ continue;
+ /* update the load for the processor or package total */
+ IRQ_DELTA(package_index,j) += delta;
+
+ /* Keep track of the higher numbered sibling as well */
+ if (i != package_index)
+ CPU_IRQ(i) += delta;
+ /*
+ * We have sibling A and sibling B in the package
+ *
+ * cpu_irq[A] = load for cpu A + load for cpu B
+ * cpu_irq[B] = load for cpu B
+ */
+ CPU_IRQ(package_index) += delta;
+ }
+ }
+ /* Find the least loaded processor package */
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i))
+ continue;
+ if (physical_balance && i > cpu_sibling_map[i])
+ continue;
+ if (min_cpu_irq > CPU_IRQ(i)) {
+ min_cpu_irq = CPU_IRQ(i);
+ min_loaded = i;
+ }
+ }
+ max_cpu_irq = ULONG_MAX;
+
+tryanothercpu:
+ /* Look for heaviest loaded processor.
+ * We may come back to get the next heaviest loaded processor.
+ * Skip processors with trivial loads.
+ */
+ tmp_cpu_irq = 0;
+ tmp_loaded = -1;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i))
+ continue;
+ if (physical_balance && i > cpu_sibling_map[i])
+ continue;
+ if (max_cpu_irq <= CPU_IRQ(i))
+ continue;
+ if (tmp_cpu_irq < CPU_IRQ(i)) {
+ tmp_cpu_irq = CPU_IRQ(i);
+ tmp_loaded = i;
+ }
+ }
+
+ if (tmp_loaded == -1) {
+ /* In the case of small number of heavy interrupt sources,
+ * loading some of the cpus too much. We use Ingo's original
+ * approach to rotate them around.
+ */
+ if (!first_attempt && imbalance >= useful_load_threshold) {
+ rotate_irqs_among_cpus(useful_load_threshold);
+ return;
+ }
+ goto not_worth_the_effort;
+ }
+
+ first_attempt = 0; /* heaviest search */
+ max_cpu_irq = tmp_cpu_irq; /* load */
+ max_loaded = tmp_loaded; /* processor */
+ imbalance = (max_cpu_irq - min_cpu_irq) / 2;
+
+ Dprintk("max_loaded cpu = %d\n", max_loaded);
+ Dprintk("min_loaded cpu = %d\n", min_loaded);
+ Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
+ Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
+ Dprintk("load imbalance = %lu\n", imbalance);
+
+ /* if imbalance is less than approx 10% of max load, then
+ * observe diminishing returns action. - quit
+ */
+ if (imbalance < (max_cpu_irq >> 3)) {
+ Dprintk("Imbalance too trivial\n");
+ goto not_worth_the_effort;
+ }
+
+tryanotherirq:
+ /* if we select an IRQ to move that can't go where we want, then
+ * see if there is another one to try.
+ */
+ move_this_load = 0;
+ selected_irq = -1;
+ for (j = 0; j < NR_IRQS; j++) {
+ /* Is this an active IRQ? */
+ if (!irq_desc[j].action)
+ continue;
+ if (imbalance <= IRQ_DELTA(max_loaded,j))
+ continue;
+ /* Try to find the IRQ that is closest to the imbalance
+ * without going over.
+ */
+ if (move_this_load < IRQ_DELTA(max_loaded,j)) {
+ move_this_load = IRQ_DELTA(max_loaded,j);
+ selected_irq = j;
+ }
+ }
+ if (selected_irq == -1) {
+ goto tryanothercpu;
+ }
-#define IRQ_BALANCE_INTERVAL (HZ/50)
+ imbalance = move_this_load;
+ /* For physical_balance case, we accumulated both load
+ * values in the one of the siblings cpu_irq[],
+ * to use the same code for physical and logical processors
+ * as much as possible.
+ *
+ * NOTE: the cpu_irq[] array holds the sum of the load for
+ * sibling A and sibling B in the slot for the lowest numbered
+ * sibling (A), _AND_ the load for sibling B in the slot for
+ * the higher numbered sibling.
+ *
+ * We seek the least loaded sibling by making the comparison
+ * (A+B)/2 vs B
+ */
+ if (physical_balance && (CPU_IRQ(min_loaded) >> 1) > CPU_IRQ(cpu_sibling_map[min_loaded]))
+ min_loaded = cpu_sibling_map[min_loaded];
+
+ allowed_mask = cpu_online_map & irq_affinity[selected_irq];
+ target_cpu_mask = 1 << min_loaded;
+
+ if (target_cpu_mask & allowed_mask) {
+ irq_desc_t *desc = irq_desc + selected_irq;
+ Dprintk("irq = %d moved to cpu = %d\n", selected_irq, min_loaded);
+ /* mark for change destination */
+ spin_lock(&desc->lock);
+ irq_balance_mask[selected_irq] = target_cpu_mask;
+ spin_unlock(&desc->lock);
+ /* Since we made a change, come back sooner to
+ * check for more variation.
+ */
+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ return;
+ }
+ goto tryanotherirq;
+
+not_worth_the_effort:
+ /* if we did not find an IRQ to move, then adjust the time interval upward */
+ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
+ Dprintk("IRQ worth rotating not found\n");
+ return;
+}
+
static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
{
int search_idle = 1;
@@ -257,34 +486,111 @@
return cpu;
}
-static inline void balance_irq(int irq)
+static inline void balance_irq (int cpu, int irq)
{
- irq_balance_t *entry = irq_balance + irq;
unsigned long now = jiffies;
-
+ unsigned long allowed_mask;
+ unsigned int new_cpu;
+
if (no_balance_irq)
return;
- if (unlikely(time_after(now, entry->timestamp + IRQ_BALANCE_INTERVAL))) {
- unsigned long allowed_mask;
- unsigned int new_cpu;
- int random_number;
+ allowed_mask = cpu_online_map & irq_affinity[irq];
+ new_cpu = move(cpu, allowed_mask, now, 1);
+ if (cpu != new_cpu) {
+ irq_desc_t *desc = irq_desc + irq;
+ spin_lock(&desc->lock);
+ irq_balance_mask[irq] = 1 << new_cpu;
+ spin_unlock(&desc->lock);
+ }
+}
- rdtscl(random_number);
- random_number &= 1;
+int balanced_irq(void *unused)
+{
+ int i;
+ unsigned long prev_balance_time = jiffies;
+ long time_remaining = balanced_irq_interval;
+ daemonize();
+ sigfillset(&current->blocked);
+ sprintf(current->comm, "balanced_irq");
+
+ /* push everything to CPU 0 to give us a starting point. */
+ for (i = 0 ; i < NR_IRQS ; i++)
+ irq_balance_mask[i] = 1 << 0;
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ time_remaining = schedule_timeout(time_remaining);
+ if (time_after(jiffies, prev_balance_time+balanced_irq_interval)) {
+ Dprintk("balanced_irq: calling do_irq_balance() %lu\n", jiffies);
+ do_irq_balance();
+ prev_balance_time = jiffies;
+ time_remaining = balanced_irq_interval;
+ }
+ }
+}
+
+static int __init balanced_irq_init(void)
+{
+ int i;
+ struct cpuinfo_x86 *c;
+ c = &boot_cpu_data;
+ if (irqbalance_disabled)
+ return 0;
+ /* Enable physical balance only if more than 1 physical processor is present */
+ if (smp_num_siblings > 1 && cpu_online_map >> 2)
+ physical_balance = 1;
- allowed_mask = cpu_online_map & irq_affinity[irq];
- entry->timestamp = now;
- new_cpu = move(entry->cpu, allowed_mask, now, random_number);
- if (entry->cpu != new_cpu) {
- entry->cpu = new_cpu;
- set_ioapic_affinity(irq, 1 << new_cpu);
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i))
+ continue;
+ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
+ printk(KERN_ERR "balanced_irq_init: out of memory");
+ goto failed;
}
+ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
+ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
}
+
+ printk(KERN_INFO "Starting balanced_irq\n");
+ if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
+ return 0;
+ else
+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
+failed:
+ for (i = 0; i < NR_CPUS; i++) {
+ if(irq_cpu_data[i].irq_delta)
+ kfree(irq_cpu_data[i].irq_delta);
+ if(irq_cpu_data[i].last_irq)
+ kfree(irq_cpu_data[i].last_irq);
+ }
+ return 0;
}
-#else /* !SMP */
-static inline void balance_irq(int irq) { }
-#endif
+
+static int __init irqbalance_disable(char *str)
+{
+ irqbalance_disabled = 1;
+ return 0;
+}
+
+__setup("noirqbalance", irqbalance_disable);
+
+static void set_ioapic_affinity (unsigned int irq, unsigned long mask);
+
+static inline void move_irq(int irq)
+{
+ /* note - we hold the desc->lock */
+ if (unlikely(irq_balance_mask[irq])) {
+ set_ioapic_affinity(irq, irq_balance_mask[irq]);
+ irq_balance_mask[irq] = 0;
+ }
+}
+
+__initcall(balanced_irq_init);
+
+#endif /* defined(CONFIG_SMP) */
+
/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -1307,7 +1613,7 @@
*/
static void ack_edge_ioapic_irq(unsigned int irq)
{
- balance_irq(irq);
+ move_irq(irq);
if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
== (IRQ_PENDING | IRQ_DISABLED))
mask_IO_APIC_irq(irq);
@@ -1347,7 +1653,7 @@
unsigned long v;
int i;
- balance_irq(irq);
+ move_irq(irq);
/*
* It appears there is an erratum which affects at least version 0x11
* of I/O APIC (that's the 82093AA and cores integrated into various
diff -Naru 2.5.58/Documentation/kernel-parameters.txt kirq/Documentation/kernel-parameters.txt
--- 2.5.58/Documentation/kernel-parameters.txt 2003-01-13 21:59:35.000000000 -0800
+++ kirq/Documentation/kernel-parameters.txt 2003-01-14 16:18:41.000000000 -0800
@@ -352,6 +352,8 @@
hugepages= [HW,IA-32] Maximal number of HugeTLB pages
+ noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing
+
i8042_direct [HW] Non-translated mode
i8042_dumbkbd
i8042_noaux
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH][2.5] IRQ distribution patch for 2.5.58
2003-01-16 21:08 [PATCH][2.5] IRQ distribution patch for 2.5.58 Kamble, Nitin A
@ 2003-02-17 18:16 ` William Lee Irwin III
2003-02-17 18:49 ` Martin J. Bligh
2003-02-18 3:52 ` Zwane Mwaikambo
0 siblings, 2 replies; 6+ messages in thread
From: William Lee Irwin III @ 2003-02-17 18:16 UTC (permalink / raw)
To: Kamble, Nitin A
Cc: linux-kernel, Nakajima, Jun, Mallick, Asit K, Saxena, Sunil
On Thu, Jan 16, 2003 at 01:08:55PM -0800, Kamble, Nitin A wrote:
> + spin_lock(&desc->lock);
> + irq_balance_mask[selected_irq] = target_cpu_mask;
> + spin_unlock(&desc->lock);
Wrong.
irq_balance_mask[selected_irq] = cpu_to_logical_apicid(min_loaded);
... except this needs auditing for the assumption that the RTE's are
using logical DESTMOD.
Guess whose box won't boot with your code in?
-- wli
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH][2.5] IRQ distribution patch for 2.5.58
2003-02-17 18:16 ` William Lee Irwin III
@ 2003-02-17 18:49 ` Martin J. Bligh
2003-02-17 18:59 ` Dave Hansen
2003-02-18 3:52 ` Zwane Mwaikambo
1 sibling, 1 reply; 6+ messages in thread
From: Martin J. Bligh @ 2003-02-17 18:49 UTC (permalink / raw)
To: William Lee Irwin III, Kamble, Nitin A
Cc: linux-kernel, Nakajima, Jun, Mallick, Asit K, Saxena, Sunil
I think Dave already sent out a fix for that at the weekend.
M.
--On Monday, February 17, 2003 10:16:14 -0800 William Lee Irwin III <wli@holomorphy.com> wrote:
> On Thu, Jan 16, 2003 at 01:08:55PM -0800, Kamble, Nitin A wrote:
>> + spin_lock(&desc->lock);
>> + irq_balance_mask[selected_irq] = target_cpu_mask;
>> + spin_unlock(&desc->lock);
>
> Wrong.
>
> irq_balance_mask[selected_irq] = cpu_to_logical_apicid(min_loaded);
>
> ... except this needs auditing for the assumption that the RTE's are
> using logical DESTMOD.
>
> Guess whose box won't boot with your code in?
>
> -- wli
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH][2.5] IRQ distribution patch for 2.5.58
2003-02-17 18:49 ` Martin J. Bligh
@ 2003-02-17 18:59 ` Dave Hansen
0 siblings, 0 replies; 6+ messages in thread
From: Dave Hansen @ 2003-02-17 18:59 UTC (permalink / raw)
To: Martin J. Bligh
Cc: William Lee Irwin III, Kamble, Nitin A, linux-kernel,
Nakajima, Jun, Mallick, Asit K, Saxena, Sunil
[-- Attachment #1: Type: text/plain, Size: 137 bytes --]
Martin J. Bligh wrote:
> I think Dave already sent out a fix for that at the weekend.
Here you go.
--
Dave Hansen
haveblue@us.ibm.com
[-- Attachment #2: kirq-apicid-fix-2.5.61-1.patch --]
[-- Type: text/plain, Size: 1946 bytes --]
diff -ru linux-2.5.61-clean/arch/i386/kernel/io_apic.c linux-2.5.61-irqdebug/arch/i386/kernel/io_apic.c
--- linux-2.5.61-clean/arch/i386/kernel/io_apic.c 2003-02-14 17:51:26.000000000 -0600
+++ linux-2.5.61-irqdebug/arch/i386/kernel/io_apic.c 2003-02-15 17:42:51.000000000 -0600
@@ -222,7 +222,7 @@
# endif
extern unsigned long irq_affinity [NR_IRQS];
-unsigned long __cacheline_aligned irq_balance_mask [NR_IRQS];
+int __cacheline_aligned pending_irq_balance_apicid [NR_IRQS];
static int irqbalance_disabled __initdata = 0;
static int physical_balance = 0;
@@ -441,7 +441,7 @@
Dprintk("irq = %d moved to cpu = %d\n", selected_irq, min_loaded);
/* mark for change destination */
spin_lock(&desc->lock);
- irq_balance_mask[selected_irq] = target_cpu_mask;
+ pending_irq_balance_apicid[selected_irq] = cpu_to_logical_apicid(min_loaded);
spin_unlock(&desc->lock);
/* Since we made a change, come back sooner to
* check for more variation.
@@ -500,7 +500,7 @@
if (cpu != new_cpu) {
irq_desc_t *desc = irq_desc + irq;
spin_lock(&desc->lock);
- irq_balance_mask[irq] = cpu_to_logical_apicid(new_cpu);
+ pending_irq_balance_apicid[irq] = cpu_to_logical_apicid(new_cpu);
spin_unlock(&desc->lock);
}
}
@@ -515,7 +515,7 @@
/* push everything to CPU 0 to give us a starting point. */
for (i = 0 ; i < NR_IRQS ; i++)
- irq_balance_mask[i] = 1 << 0;
+ pending_irq_balance_apicid[i] = cpu_to_logical_apicid(0);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
time_remaining = schedule_timeout(time_remaining);
@@ -580,9 +580,9 @@
static inline void move_irq(int irq)
{
/* note - we hold the desc->lock */
- if (unlikely(irq_balance_mask[irq])) {
- set_ioapic_affinity(irq, irq_balance_mask[irq]);
- irq_balance_mask[irq] = 0;
+ if (unlikely(pending_irq_balance_apicid[irq])) {
+ set_ioapic_affinity(irq, pending_irq_balance_apicid[irq]);
+ pending_irq_balance_apicid[irq] = 0;
}
}
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH][2.5] IRQ distribution patch for 2.5.58
2003-02-17 18:16 ` William Lee Irwin III
2003-02-17 18:49 ` Martin J. Bligh
@ 2003-02-18 3:52 ` Zwane Mwaikambo
2003-02-18 4:15 ` William Lee Irwin III
1 sibling, 1 reply; 6+ messages in thread
From: Zwane Mwaikambo @ 2003-02-18 3:52 UTC (permalink / raw)
To: Kamble, Nitin A
Cc: Linux Kernel, Nakajima, Jun, Mallick, Asit K, Saxena, Sunil,
William Lee Irwin III
On Mon, 17 Feb 2003, William Lee Irwin III wrote:
> On Thu, Jan 16, 2003 at 01:08:55PM -0800, Kamble, Nitin A wrote:
> > + spin_lock(&desc->lock);
> > + irq_balance_mask[selected_irq] = target_cpu_mask;
> > + spin_unlock(&desc->lock);
>
> Wrong.
The desc locking for irq_balance_mask looks very strange, what made you
put it in?
Zwane
--
function.linuxpower.ca
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH][2.5] IRQ distribution patch for 2.5.58
2003-02-18 3:52 ` Zwane Mwaikambo
@ 2003-02-18 4:15 ` William Lee Irwin III
0 siblings, 0 replies; 6+ messages in thread
From: William Lee Irwin III @ 2003-02-18 4:15 UTC (permalink / raw)
To: Zwane Mwaikambo
Cc: Kamble, Nitin A, Linux Kernel, Nakajima, Jun, Mallick, Asit K,
Saxena, Sunil
On Thu, Jan 16, 2003 at 01:08:55PM -0800, Kamble, Nitin A wrote:
+ spin_lock(&desc->lock);
+ irq_balance_mask[selected_irq] = target_cpu_mask;
+ spin_unlock(&desc->lock);
On Mon, 17 Feb 2003, William Lee Irwin III wrote:
>> Wrong.
On Mon, Feb 17, 2003 at 10:52:35PM -0500, Zwane Mwaikambo wrote:
> The desc locking for irq_balance_mask looks very strange, what made you
> put it in?
I only quoted the patch. I've got enough going wrong with hackers
running wild and stuffing bitmasks, ASCII art, and possibly even JPEG's
into my RTE's without changing the locking.
-- wli
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2003-02-18 4:06 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2003-01-16 21:08 [PATCH][2.5] IRQ distribution patch for 2.5.58 Kamble, Nitin A
2003-02-17 18:16 ` William Lee Irwin III
2003-02-17 18:49 ` Martin J. Bligh
2003-02-17 18:59 ` Dave Hansen
2003-02-18 3:52 ` Zwane Mwaikambo
2003-02-18 4:15 ` William Lee Irwin III
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.