public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC/PATCHv2] kernel/irq: allow more precise irq affinity policies
@ 2010-09-22 23:52 Arthur Kepner
  2010-09-23 10:56 ` Thomas Gleixner
  0 siblings, 1 reply; 4+ messages in thread
From: Arthur Kepner @ 2010-09-22 23:52 UTC (permalink / raw)
  To: linux-kernel; +Cc: Thomas Gleixner, Ben Hutchings


SGI has encountered situations where particular CPUs run out of 
interrupt vectors on systems with many (several hundred or more) 
CPUs. This happens because some drivers (particularly the mlx4_core 
driver) select the number of interrupts they allocate based on the 
number of CPUs, and because of how the default irq affinity is used.

The following patch allows for a more precise policy about how irq 
affinities are assigned by the kernel.

Changes from version 1:

	- IRQ_POLICY_NUMA is implemented

	- The 'irq_policy' can be changed at runtime, and interrupts 
	  redistributed according to the new policy. Notifications are 
	  sent when this happens.

Signed-off-by: Arthur Kepner <akepner@sgi.com>

---

 arch/x86/Kconfig           |   11 +
 include/linux/irq_policy.h |   21 +++
 kernel/irq/Makefile        |    2 
 kernel/irq/handle.c        |    5 
 kernel/irq/manage.c        |    3 
 kernel/irq/policy.c        |  291 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/proc.c          |   61 +++++++++
 7 files changed, 392 insertions(+), 2 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cea0cd9..8fa7f52 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -313,6 +313,17 @@ config NUMA_IRQ_DESC
 	def_bool y
 	depends on SPARSE_IRQ && NUMA
 
+config IRQ_POLICY_NUMA
+	bool "Assign default interrupt affinities in a NUMA-friendly way"
+	def_bool y
+	depends on SPARSE_IRQ && NUMA
+	---help---
+	   When a device requests an interrupt, the default CPU used to
+	   service the interrupt will be selected from a node 'nearby'
+	   the device. Also, interrupt affinities will be spread around
+	   the node so as to prevent any single CPU from running out of
+	   interrupt vectors.
+
 config X86_MPPARSE
 	bool "Enable MPS table" if ACPI
 	default y
diff --git a/include/linux/irq_policy.h b/include/linux/irq_policy.h
new file mode 100644
index 0000000..f009757
--- /dev/null
+++ b/include/linux/irq_policy.h
@@ -0,0 +1,21 @@
+#ifndef _LINUX_IRQ_POLICY_H
+#define _LINUX_IRQ_POLICY_H
+
+#include <linux/notifier.h>
+#include <linux/seq_file.h>
+#include <linux/irq.h>
+
+/* seq_file show handlers backing /proc/irq/available_irq_policies and
+ * /proc/irq/irq_policy respectively. */
+int available_irq_policy_show(struct seq_file *m, void *v);
+int irq_policy_show(struct seq_file *m, void *v);
+
+/* Install the default policy at early-irq-init time. */
+void __init init_irq_policy(void);
+/* Switch to the policy named @str and redistribute all irqs; 0 on success. */
+int irq_policy_change(char *str);
+/* Apply the current policy to @desc (caller holds desc->lock). */
+void irq_policy_apply(struct irq_desc *desc);
+
+/* Events delivered on the irq-policy notifier chain. */
+enum irq_policy_notifiers {
+	IRQ_POLICY_REDISTRIBUTED,
+};
+
+/* Register for irq-policy notifications. */
+int irq_policy_notify(struct notifier_block *nb);
+
+#endif /* _LINUX_IRQ_POLICY_H */
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d04780..0532082 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
 
-obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
+obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o policy.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c69..a4f1087 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -21,6 +21,7 @@
 #include <linux/hash.h>
 #include <linux/radix-tree.h>
 #include <trace/events/irq.h>
+#include <linux/irq_policy.h>
 
 #include "internals.h"
 
@@ -171,6 +172,8 @@ int __init early_irq_init(void)
 
 	init_irq_default_affinity();
 
+	init_irq_policy();
+
 	 /* initialize nr_irqs based on nr_cpu_ids */
 	arch_probe_nr_irqs();
 	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
@@ -258,6 +261,8 @@ int __init early_irq_init(void)
 
 	init_irq_default_affinity();
 
+	init_irq_policy();
+
 	printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
 
 	desc = irq_desc;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9..9141adc 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -14,6 +14,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/irq_policy.h>
 
 #include "internals.h"
 
@@ -175,7 +176,7 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
 
-	cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
+	irq_policy_apply(desc);
 set_affinity:
 	desc->chip->set_affinity(irq, desc->affinity);
 
diff --git a/kernel/irq/policy.c b/kernel/irq/policy.c
new file mode 100644
index 0000000..bc3f719
--- /dev/null
+++ b/kernel/irq/policy.c
@@ -0,0 +1,291 @@
+
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/string.h>
+#include <linux/interrupt.h>
+#include <linux/irq_policy.h>
+
+#include "internals.h"
+
+/* Policy currently in effect; set at boot by init_irq_policy() or the
+ * "irq_policy=" parameter, changed at runtime via /proc/irq/irq_policy. */
+struct irq_policy *current_irq_policy;
+DEFINE_MUTEX(irq_policy_mutex); /* protect current_irq_policy */
+
+/* Chain notified (IRQ_POLICY_REDISTRIBUTED) after a runtime policy change. */
+ATOMIC_NOTIFIER_HEAD(irq_policy_notify_list);
+
+/*
+ * irq_policy_notify - register @nb on the irq-policy notifier chain.
+ * Returns the result of atomic_notifier_chain_register().
+ */
+int irq_policy_notify(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&irq_policy_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(irq_policy_notify);
+
+#ifdef CONFIG_IRQ_POLICY_NUMA
+
+/* Scratch tally of how many irqs are currently routed to each CPU.
+ * Filled under sparse_irq_lock; callers are serialized by
+ * irq_policy_mutex (see irq_policy_apply()). */
+static int irqs_per_cpu[NR_CPUS];
+
+/*
+ * apply_numa - set @newdesc's affinity to the least-loaded CPU on the
+ * irq's home node. Legacy irqs and irqs with no node fall back to the
+ * default-affinity mask. Called with newdesc->lock held.
+ */
+void apply_numa(struct irq_desc *newdesc)
+{
+	struct irq_desc *desc;
+	int newnode = newdesc->node;
+	int cpu;
+	int irq;
+	int best;
+	unsigned int min = -1;	/* UINT_MAX sentinel for the min search */
+	unsigned long flags;
+
+	if (newdesc->irq < NR_IRQS_LEGACY || newnode == -1) {
+		cpumask_and(newdesc->affinity, cpu_online_mask,
+			    irq_default_affinity);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	memset(irqs_per_cpu, 0, sizeof(irqs_per_cpu));
+
+	/* Count, per CPU, the irqs already targeted at this node. */
+	for_each_irq_desc(irq, desc) {
+
+		int node = desc->node;
+
+		if (node != newnode)
+			continue;
+
+		/* A full mask means "no specific target"; don't count it. */
+		if (cpumask_full(desc->affinity))
+			continue;
+
+		if (!cpumask_intersects(desc->affinity, cpumask_of_node(node)))
+			continue; /* is that possible? */
+
+		for_each_cpu(cpu, desc->affinity)
+			irqs_per_cpu[cpu]++;
+
+	}
+	raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	/* NOTE(review): irqs_per_cpu is read after dropping sparse_irq_lock;
+	 * relies on irq_policy_mutex for serialization — confirm. */
+	best = cpumask_first(cpumask_of_node(newnode));
+	for_each_cpu(cpu, cpumask_of_node(newnode))
+		if (irqs_per_cpu[cpu] < min) {
+			min = irqs_per_cpu[cpu];
+			best = cpu;
+		}
+
+	cpumask_clear(newdesc->affinity);
+	cpumask_set_cpu(best, newdesc->affinity);
+}
+
+/*
+ * redistribute_numa - re-spread every non-legacy irq across its home
+ * node, targeting the least-loaded CPU as seen by the irqs already
+ * visited (irq2 < irq1). Irqs with no node get the default affinity.
+ *
+ * NOTE(review): desc1->chip->set_affinity() is invoked while holding
+ * sparse_irq_lock (raw, irqs off) plus desc1->lock — verify all
+ * set_affinity implementations are safe in that context.
+ */
+void redistribute_numa(void)
+{
+	struct irq_desc *desc1, *desc2;
+	int irq1, irq2;
+	unsigned long flags;
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+		printk(KERN_NOTICE "%s cannot allocate cpumask\n", __func__);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&sparse_irq_lock, flags);
+	for_each_irq_desc(irq1, desc1) {
+
+		int node1 = desc1->node;
+		int best;
+		int cpu;
+		unsigned int min = -1;	/* UINT_MAX sentinel */
+
+		if (irq1 < NR_IRQS_LEGACY)
+			continue;
+
+		if (desc1->chip == NULL || desc1->chip->set_affinity == NULL)
+			continue;
+
+		if (node1 == -1) {
+			/* NOTE(review): affinity written without desc1->lock
+			 * and without calling set_affinity — intentional? */
+			cpumask_and(desc1->affinity, cpu_online_mask,
+				    irq_default_affinity);
+			continue;
+		}
+
+		memset(irqs_per_cpu, 0, sizeof(irqs_per_cpu));
+		raw_spin_lock(&desc1->lock);
+
+		/* Tally only irqs already placed in this pass (irq2 < irq1). */
+		for_each_irq_desc(irq2, desc2) {
+
+			int node2 = desc2->node;
+
+			if (irq2 >= irq1)
+				break;
+
+			if (node2 != node1)
+				continue;
+
+			if (cpumask_full(desc2->affinity))
+				continue;
+
+			if (!cpumask_intersects(desc2->affinity,
+						cpumask_of_node(node2)))
+				continue; /* is that possible? */
+
+			for_each_cpu(cpu, desc2->affinity)
+				irqs_per_cpu[cpu]++;
+
+		}
+
+		/* Pick the least-loaded CPU on the irq's node. */
+		best = cpumask_first(cpumask_of_node(node1));
+		for_each_cpu(cpu, cpumask_of_node(node1))
+			if (irqs_per_cpu[cpu] < min) {
+				min = irqs_per_cpu[cpu];
+				best = cpu;
+			}
+
+		cpumask_clear(mask);
+		cpumask_set_cpu(best, mask);
+		desc1->chip->set_affinity(irq1, mask);
+		raw_spin_unlock(&desc1->lock);
+	}
+	raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
+	free_cpumask_var(mask);
+}
+#endif /* CONFIG_IRQ_POLICY_NUMA */
+
+/* apply_default - historic behaviour: online CPUs masked with the
+ * user-tunable irq_default_affinity. Called with desc->lock held. */
+void apply_default(struct irq_desc *desc)
+{
+	cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
+}
+
+/*
+ * redistribute_default - reset every non-legacy irq with a set_affinity
+ * hook back to the default affinity mask and push it to the chip.
+ *
+ * NOTE(review): 'mask' is allocated and freed but never used in this
+ * function — the allocation (and its failure bail-out) looks like dead
+ * code copied from redistribute_numa().
+ */
+void redistribute_default(void)
+{
+	struct irq_desc *desc;
+	int irq;
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+		printk(KERN_NOTICE "%s cannot allocate cpumask\n", __func__);
+		return;
+	}
+
+	for_each_irq_desc(irq, desc) {
+		unsigned long flags;
+		if (irq < NR_IRQS_LEGACY)
+			continue;
+
+		if (desc->chip == NULL || desc->chip->set_affinity == NULL)
+			continue;
+
+		raw_spin_lock_irqsave(&desc->lock, flags);
+		cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
+		desc->chip->set_affinity(irq, desc->affinity);
+		raw_spin_unlock_irqrestore(&desc->lock, flags);
+	}
+
+	free_cpumask_var(mask);
+}
+
+/* Index into irq_policies[] of the boot-time default policy. */
+#define IRQ_POLICY_DEFAULT 0
+
+/* Table of selectable affinity policies; names are what user space
+ * reads/writes through /proc/irq/{available_irq_policies,irq_policy}. */
+struct irq_policy {
+	char *name;
+	void (*apply) (struct irq_desc *desc); /* apply the policy */
+	void (*redistribute) (void); /* redistribute all irqs */
+} irq_policies[] = {
+	{
+		.name = "default",
+		.apply = apply_default,
+		.redistribute = redistribute_default,
+	},
+#ifdef CONFIG_IRQ_POLICY_NUMA
+	{
+		.name = "numa",
+		.apply = apply_numa,
+		.redistribute = redistribute_numa,
+	},
+#endif /* CONFIG_IRQ_POLICY_NUMA */
+};
+
+/* seq_file show: space-separated policy names, newline after the last.
+ * (The open-coded count could use ARRAY_SIZE().) */
+int available_irq_policy_show(struct seq_file *m, void *v)
+{
+	int i, imax = sizeof(irq_policies) / sizeof(irq_policies[0]);
+
+	for (i = 0; i < imax; i++)
+		seq_printf(m, "%s%s", irq_policies[i].name,
+			   i == (imax - 1) ? "\n" : " ");
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(available_irq_policy_show);
+
+/* seq_file show: name of the policy currently in effect; the mutex
+ * guards the read of current_irq_policy against a concurrent change. */
+int irq_policy_show(struct seq_file *m, void *v)
+{
+	mutex_lock(&irq_policy_mutex);
+	seq_printf(m, "%s\n", current_irq_policy->name);
+	mutex_unlock(&irq_policy_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_policy_show);
+
+/*
+ * irq_policy_select - look up @str in irq_policies[] and make it current.
+ * Returns 1 if the policy changed, 0 if @str was already current,
+ * -EINVAL if no policy of that name exists.
+ */
+static int irq_policy_select(char *str)
+{
+	int changed = 0;
+	int i, imax = sizeof(irq_policies) / sizeof(irq_policies[0]);
+
+	for (i = 0; i < imax; i++)
+		if (!strcmp(irq_policies[i].name, str))
+			break;
+
+	if (i < imax) {
+		mutex_lock(&irq_policy_mutex);
+		if (current_irq_policy != &irq_policies[i]) {
+			current_irq_policy = &irq_policies[i];
+			changed = 1;
+		}
+		mutex_unlock(&irq_policy_mutex);
+		return changed;
+	} else {
+		printk(KERN_INFO "irq_policy %s is invalid\n", str);
+		return -EINVAL;
+	}
+}
+
+/*
+ * irq_policy_change - switch to the policy named @str; if that actually
+ * changes the policy, redistribute all irqs under the new one and fire
+ * the IRQ_POLICY_REDISTRIBUTED notifier chain.
+ * Returns 0 on success (changed or already current), -EINVAL otherwise.
+ */
+int irq_policy_change(char *str)
+{
+	int ret = irq_policy_select(str);
+	int changed = ret > 0;
+
+	if (changed) {
+		/* NOTE(review): redistribute() runs outside irq_policy_mutex,
+		 * so the policy could change again underneath it. */
+		current_irq_policy->redistribute();
+		atomic_notifier_call_chain(&irq_policy_notify_list,
+					   IRQ_POLICY_REDISTRIBUTED,
+					   NULL);
+	}
+
+	return changed ? 0 : ret;
+}
+EXPORT_SYMBOL_GPL(irq_policy_change);
+
+/*
+ * irq_policy_apply - apply the current policy to @desc; called from
+ * setup_affinity() with desc->lock held.
+ *
+ * NOTE(review): the assert documents that the caller holds a raw
+ * spinlock, yet mutex_lock() below may sleep — sleeping in atomic
+ * context. The policy pointer needs a non-sleeping protection scheme.
+ */
+void irq_policy_apply(struct irq_desc *desc)
+{
+	assert_raw_spin_locked(&desc->lock);
+	mutex_lock(&irq_policy_mutex);
+	current_irq_policy->apply(desc);
+	mutex_unlock(&irq_policy_mutex);
+}
+EXPORT_SYMBOL_GPL(irq_policy_apply);
+
+/* Install the default policy unless one was already chosen (e.g. by the
+ * "irq_policy=" boot parameter). Called from early_irq_init(), so it may
+ * run in both SPARSE_IRQ and non-SPARSE_IRQ builds. */
+void __init init_irq_policy(void)
+{
+	mutex_lock(&irq_policy_mutex);
+	if (current_irq_policy == NULL)
+		current_irq_policy = &irq_policies[IRQ_POLICY_DEFAULT];
+	mutex_unlock(&irq_policy_mutex);
+}
+
+/* Boot-parameter handler for "irq_policy=<name>".
+ * NOTE(review): the return logic looks inverted for the __setup()
+ * convention (nonzero = option consumed): a successful change
+ * (irq_policy_select() == 1) returns 0, while only the "already
+ * current" case (== 0) returns 1; an invalid name (-EINVAL) also
+ * returns 0. Verify intent. */
+static int __init irq_policy_setup(char* str)
+{
+	if (irq_policy_select(str))
+		return 0;
+	return 1;
+}
+
+__setup("irq_policy=", irq_policy_setup);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee5..64db2b8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
+#include <linux/irq_policy.h>
 
 #include "internals.h"
 
@@ -181,6 +182,55 @@ static const struct file_operations default_affinity_proc_fops = {
 	.write		= default_affinity_write,
 };
 
+/* Longest policy name (plus optional '\n') accepted via /proc. */
+#define MAX_IRQ_POLICY_WRITE	31
+
+/*
+ * Write handler for /proc/irq/irq_policy: copy the user string, strip a
+ * trailing newline, and switch policies via irq_policy_change().
+ *
+ * NOTE(review): 'ret' is size_t but irq_policy_change() can return
+ * -EINVAL; the negative value stored unsigned is returned as a huge
+ * positive byte count instead of an error — should be ssize_t.
+ * NOTE(review): when count < MAX_IRQ_POLICY_WRITE the bytes between
+ * lbuf[count] and lbuf[MAX_IRQ_POLICY_WRITE-1] are uninitialized stack;
+ * terminating at lbuf[count] would be safer.
+ */
+static ssize_t irq_policy_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char lbuf[MAX_IRQ_POLICY_WRITE + 1], *tmp;
+	size_t ret;
+
+	if (count > MAX_IRQ_POLICY_WRITE)
+		return -EINVAL;
+	if (copy_from_user(lbuf, buf, count))
+		return -EFAULT;
+
+	 lbuf[MAX_IRQ_POLICY_WRITE] = '\0';
+
+	tmp = strchr(lbuf, '\n');
+	if (tmp)
+		*tmp = '\0';
+
+	ret = irq_policy_change(lbuf);
+
+	return ret ? ret : count;
+}
+
+/* proc open: bind the single-record show handler for the current policy. */
+static int irq_policy_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irq_policy_show, NULL);
+}
+
+/* File operations for the read/write /proc/irq/irq_policy entry. */
+static const struct file_operations irq_policy_proc_fops = {
+	.open		= irq_policy_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= irq_policy_write,
+};
+
+/* proc open: bind the show handler listing all selectable policies. */
+static int available_irq_policy_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, available_irq_policy_show, NULL);
+}
+
+/* File operations for the read-only /proc/irq/available_irq_policies. */
+static const struct file_operations available_irq_policy_proc_fops = {
+	.open		= available_irq_policy_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int irq_node_proc_show(struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long) m->private);
@@ -316,6 +366,15 @@ static void register_default_affinity_proc(void)
 #endif
 }
 
+/* Create the irq-policy proc entries; SMP only, since affinity policy
+ * is meaningless on a single CPU. */
+static void register_policy_proc(void)
+{
+#ifdef CONFIG_SMP
+	proc_create("irq/irq_policy", 0644, NULL, &irq_policy_proc_fops);
+	proc_create("irq/available_irq_policies", 0444, NULL,
+		    &available_irq_policy_proc_fops);
+#endif
+}
+
 void init_irq_proc(void)
 {
 	unsigned int irq;
@@ -328,6 +387,8 @@ void init_irq_proc(void)
 
 	register_default_affinity_proc();
 
+	register_policy_proc();
+
 	/*
 	 * Create entries for all existing IRQs.
 	 */

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-09-27  3:57 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-22 23:52 [RFC/PATCHv2] kernel/irq: allow more precise irq affinity policies Arthur Kepner
2010-09-23 10:56 ` Thomas Gleixner
2010-09-23 18:36   ` Thomas Gleixner
2010-09-27  3:57     ` Arthur Kepner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox