public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] sparse_irq aka dyn_irq
       [not found]                   ` <20081107124957.GA21709@elte.hu>
@ 2008-11-09  7:05                     ` Yinghai Lu
  2008-11-09  7:38                       ` Ingo Molnar
  2008-11-09  7:50                       ` Cyrill Gorcunov
  0 siblings, 2 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-09  7:05 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Andrew Morton
  Cc: linux-kernel@vger.kernel.org


impact: new feature, sparse irq support

For sparse_irq, irq_desc and irq_cfg no longer use list_head for chaining;
per_cpu_dyn_array is not added either, as it has no users now.

Add a hash table for irq lookup, as Ingo suggested.
Remove dyn_array, enable sparse_irq by default, and use kzalloc_node for the allocations.
use desc->chip_data for x86 to store irq_cfg
Make irq_desc follow affinity changes, i.e. irq_desc moving between nodes, etc.
Only call move_irq_desc() from irq_complete_move() --- though it does not seem to trigger that move yet.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig                   |    4 
 arch/x86/include/asm/io_apic.h     |    2 
 arch/x86/include/asm/irq_vectors.h |    2 
 arch/x86/kernel/io_apic.c          |  384 ++++++++++++++++++++++++-------------
 arch/x86/kernel/irq.c              |   19 -
 arch/x86/kernel/irq_32.c           |    2 
 arch/x86/kernel/irq_64.c           |    2 
 arch/x86/kernel/irqinit_32.c       |    3 
 arch/x86/kernel/irqinit_64.c       |    3 
 arch/x86/mm/init_32.c              |    3 
 drivers/char/random.c              |   31 ++
 drivers/pci/htirq.c                |   18 +
 drivers/pci/intr_remapping.c       |   65 ++++++
 drivers/xen/events.c               |    9 
 fs/proc/interrupts.c               |   13 +
 fs/proc/stat.c                     |   17 +
 include/linux/interrupt.h          |    2 
 include/linux/irq.h                |   60 +++++
 include/linux/irqnr.h              |   15 -
 include/linux/kernel_stat.h        |   14 +
 init/main.c                        |    2 
 kernel/irq/autoprobe.c             |   10 
 kernel/irq/chip.c                  |    4 
 kernel/irq/handle.c                |  338 +++++++++++++++++++++++++++++++-
 kernel/irq/proc.c                  |    3 
 kernel/irq/spurious.c              |    4 
 26 files changed, 839 insertions(+), 190 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -236,6 +236,10 @@ config X86_HAS_BOOT_CPU_ID
 	def_bool y
 	depends on X86_VOYAGER
 
+config HAVE_SPARSE_IRQ
+	bool
+	default y
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -108,8 +108,8 @@ static int __init parse_noapic(char *str
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
 struct irq_cfg {
-	unsigned int irq;
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
@@ -119,44 +119,117 @@ struct irq_cfg {
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+static struct irq_cfg irq_cfg_legacy[] = {
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_sparse_irq_init_work(void)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	cfg = irq_cfg_legacy;
+	legacy_count = ARRAY_SIZE(irq_cfg_legacy);
+
+	BUG_ON(legacy_count > NR_IRQS_LEGACY);
+
+	for (i = 0; i < legacy_count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
 
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
+
+	return cfg;
 }
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 {
-	return irq_cfg(irq);
+	struct irq_cfg *cfg;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+	return cfg;
 }
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+static void free_irq_cfg(struct irq_cfg *cfg)
+{
+	kfree(cfg);
+}
+
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (!cfg)
+		desc->chip_data = get_one_free_irq_cfg(cpu);
+}
+
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu);
+
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
+
+	cfg = get_one_free_irq_cfg(cpu);
+	desc->chip_data = cfg;
+
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_2_pin(struct irq_cfg *cfg);
+
+void arch_free_chip_data(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (cfg) {
+		free_irq_2_pin(cfg);
+		if (desc->irq >= NR_IRQS_LEGACY)
+			free_irq_cfg(cfg);
+		desc->chip_data = NULL;
+	}
+}
 
 /*
  * This is performance-critical, we want to do it O(1)
@@ -170,30 +243,48 @@ struct irq_pin_list {
 	struct irq_pin_list *next;
 };
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+	struct irq_pin_list *pin;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	pin = kzalloc_node(sizeof(*pin), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+	return pin;
+}
+
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin);
 
-static void __init irq_2_pin_init(void)
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_pin_list *old_entry;
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	old_entry = old_cfg->irq_2_pin;
 
-	irq_2_pin_ptr = &pin[0];
+	while (old_entry) {
+		add_pin_to_irq_cpu(cfg, cpu, old_entry->apic, old_entry->pin);
+		old_entry = old_entry->next;
+	}
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+static void free_irq_2_pin(struct irq_cfg *cfg)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_pin_list *entry, *next;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	entry = cfg->irq_2_pin;
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	cfg->irq_2_pin = NULL;
 }
 
 struct io_apic {
@@ -359,7 +450,12 @@ static void __target_IO_APIC_irq(unsigne
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
+
+static void __set_desc_affinity(struct irq_desc *desc, cpumask_t mask)
+{
+	desc->affinity = mask;
+}
 
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
@@ -374,7 +470,7 @@ static void set_ioapic_affinity_irq(unsi
 		return;
 
 	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(tmp, cfg->domain, mask);
@@ -387,7 +483,7 @@ static void set_ioapic_affinity_irq(unsi
 	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
 	__target_IO_APIC_irq(irq, dest, cfg->vector);
-	desc->affinity = mask;
+	__set_desc_affinity(desc, mask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 #endif /* CONFIG_SMP */
@@ -397,16 +493,13 @@ static void set_ioapic_affinity_irq(unsi
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,20 +514,31 @@ static void add_pin_to_irq(unsigned int
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
 }
 
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int cpu = smp_processor_id();
+
+	/* first time to refer irq_cfg, so with new */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	cfg = desc->chip_data;
+	add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+}
+
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -451,7 +555,7 @@ static void __init replace_pin_at_irq(un
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
 static inline void io_apic_modify_irq(unsigned int irq,
@@ -809,7 +913,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1034,7 +1138,7 @@ void unlock_vector_lock(void)
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1050,9 +1154,6 @@ static int __assign_irq_vector(int irq,
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
-
-	cfg = irq_cfg(irq);
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
@@ -1113,24 +1214,22 @@ next:
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1148,14 +1247,16 @@ void __setup_vector_irq(int cpu)
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
+	} end_for_each_irq_desc();
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
@@ -1205,7 +1306,8 @@ static void ioapic_register_intr(int irq
 {
 	struct irq_desc *desc;
 
-	desc = irq_to_desc(irq);
+	/* could be first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1310,7 +1412,7 @@ static void setup_IO_APIC_irq(int apic,
 	cfg = irq_cfg(irq);
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1327,12 +1429,12 @@ static void setup_IO_APIC_irq(int apic,
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
 	ioapic_register_intr(irq, trigger);
-	if (irq < 16)
+	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic, pin, entry);
@@ -1434,6 +1536,7 @@ __apicdebuginit(void) print_IO_APIC(void
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1523,8 +1626,11 @@ __apicdebuginit(void) print_IO_APIC(void
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1535,7 +1641,7 @@ __apicdebuginit(void) print_IO_APIC(void
 			entry = entry->next;
 		}
 		printk("\n");
-	}
+	} end_for_each_irq_desc();
 
 	printk(KERN_INFO ".................................... done.\n");
 
@@ -2010,7 +2116,7 @@ static unsigned int startup_ioapic_irq(u
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
@@ -2095,10 +2201,10 @@ static void migrate_ioapic_irq(int irq,
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = irq_cfg(irq);
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -2125,7 +2231,7 @@ static void migrate_ioapic_irq(int irq,
 		cfg->move_in_progress = 0;
 	}
 
-	desc->affinity = mask;
+	__set_desc_affinity(desc, mask);
 }
 
 static int migrate_irq_remapped_level(int irq)
@@ -2178,7 +2284,7 @@ static void ir_irq_migration(struct work
 			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
@@ -2238,7 +2344,8 @@ unlock:
 
 static void irq_complete_move(unsigned int irq)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
 	if (likely(!cfg->move_in_progress))
@@ -2249,6 +2356,11 @@ static void irq_complete_move(unsigned i
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+		move_irq_desc(desc, me);
+		desc = NULL;
+
+		/* get the new one */
+		cfg = irq_cfg(irq);
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
@@ -2416,22 +2528,21 @@ static inline void init_IO_APIC_traps(vo
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < 16)
+			if (irq < NR_IRQS_LEGACY)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
@@ -2575,6 +2686,7 @@ int timer_through_8259 __initdata;
 static inline void __init check_timer(void)
 {
 	struct irq_cfg *cfg = irq_cfg(0);
+	int cpu = smp_processor_id();
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2589,7 +2701,7 @@ static inline void __init check_timer(vo
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2640,7 +2752,7 @@ static inline void __init check_timer(vo
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
 		unmask_IO_APIC_irq(0);
@@ -2669,7 +2781,7 @@ static inline void __init check_timer(vo
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
@@ -2888,22 +3000,23 @@ unsigned int create_irq_nr(unsigned int
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
-
-	irq_want = nr_irqs - 1;
+	struct irq_cfg *cfg_new = NULL;
+	struct irq_desc *desc_new = NULL;
+	int cpu;
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
+	cpu = smp_processor_id();
 	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -2911,6 +3024,9 @@ unsigned int create_irq_nr(unsigned int
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init clear it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
@@ -2930,14 +3046,22 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* store it, in case dynamic_irq_cleanup clear it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2952,12 +3076,12 @@ static int msi_compose_msg(struct pci_de
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3025,10 +3149,10 @@ static void set_msi_irq_affinity(unsigne
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = irq_cfg(irq);
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3041,7 +3165,7 @@ static void set_msi_irq_affinity(unsigne
 
 	write_msi_msg(irq, &msg);
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	__set_desc_affinity(desc, mask);
 }
 
 #ifdef CONFIG_INTR_REMAP
@@ -3064,10 +3188,10 @@ static void ir_set_msi_irq_affinity(unsi
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = irq_cfg(irq);
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3092,7 +3216,7 @@ static void ir_set_msi_irq_affinity(unsi
 	}
 
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	__set_desc_affinity(desc, mask);
 }
 #endif
 #endif /* CONFIG_SMP */
@@ -3176,7 +3300,7 @@ static int setup_msi_irq(struct pci_dev
 #endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
-	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for MSI/MSI-X\n", irq, irq);
 
 	return 0;
 }
@@ -3199,7 +3323,7 @@ int arch_setup_msi_irq(struct pci_dev *d
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
@@ -3240,7 +3364,7 @@ int arch_setup_msi_irqs(struct pci_dev *
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 	sub_handle = 0;
 	list_for_each_entry(desc, &dev->msi_list, list) {
 		irq = create_irq_nr(irq_want--);
@@ -3306,10 +3430,10 @@ static void dmar_msi_set_affinity(unsign
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = irq_cfg(irq);
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3322,7 +3446,7 @@ static void dmar_msi_set_affinity(unsign
 
 	dmar_msi_write(irq, &msg);
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	__set_desc_affinity(desc, mask);
 }
 #endif /* CONFIG_SMP */
 
@@ -3367,10 +3491,10 @@ static void hpet_msi_set_affinity(unsign
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = irq_cfg(irq);
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3383,7 +3507,7 @@ static void hpet_msi_set_affinity(unsign
 
 	hpet_msi_write(irq, &msg);
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	__set_desc_affinity(desc, mask);
 }
 #endif /* CONFIG_SMP */
 
@@ -3448,16 +3572,16 @@ static void set_ht_irq_affinity(unsigned
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = irq_cfg(irq);
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	__set_desc_affinity(desc, mask);
 }
 #endif
 
@@ -3478,13 +3602,13 @@ int arch_setup_ht_irq(unsigned int irq,
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3508,7 +3632,8 @@ int arch_setup_ht_irq(unsigned int irq,
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
 
-		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+		dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for HT\n",
+				 irq, irq);
 	}
 	return err;
 }
@@ -3530,7 +3655,9 @@ int arch_enable_uv_irq(char *irq_name, u
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3539,8 +3666,6 @@ int arch_enable_uv_irq(char *irq_name, u
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3611,8 +3736,6 @@ int __init probe_nr_irqs(void)
 	/* something wrong ? */
 	if (nr < nr_min)
 		nr = nr_min;
-	if (WARN_ON(nr > NR_IRQS))
-		nr = NR_IRQS;
 
 	return nr;
 }
@@ -3722,7 +3845,7 @@ int io_apic_set_pci_routing (int ioapic,
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
+	if (irq >= NR_IRQS_LEGACY)
 		add_pin_to_irq(irq, ioapic, pin);
 
 	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
@@ -3852,7 +3975,6 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
Index: linux-2.6/arch/x86/kernel/irqinit_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_32.c
+++ linux-2.6/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_64.c
+++ linux-2.6/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c
+++ linux-2.6/arch/x86/mm/init_32.c
@@ -66,6 +66,7 @@ static unsigned long __meminitdata table
 static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
+int after_bootmem;
 
 static __init void *alloc_low_page(unsigned long *phys)
 {
@@ -987,6 +988,8 @@ void __init mem_init(void)
 
 	set_highmem_pages_init();
 
+	after_bootmem = 1;
+
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
Index: linux-2.6/drivers/char/random.c
===================================================================
--- linux-2.6.orig/drivers/char/random.c
+++ linux-2.6/drivers/char/random.c
@@ -558,6 +558,8 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 
 static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
@@ -576,6 +578,33 @@ static void set_timer_rand_state(unsigne
 	irq_timer_state[irq] = state;
 }
 
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
 static struct timer_rand_state input_timer_state;
 
 /*
@@ -933,8 +962,10 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
+#ifndef CONFIG_HAVE_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
+#endif
 
 	state = get_timer_rand_state(irq);
 
Index: linux-2.6/drivers/pci/htirq.c
===================================================================
--- linux-2.6.orig/drivers/pci/htirq.c
+++ linux-2.6/drivers/pci/htirq.c
@@ -82,6 +82,18 @@ void unmask_ht_irq(unsigned int irq)
 	write_ht_irq_msg(irq, &msg);
 }
 
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -98,6 +110,7 @@ int __ht_create_irq(struct pci_dev *dev,
 	int max_irq;
 	int pos;
 	int irq;
+	unsigned int irq_want;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -125,7 +138,12 @@ int __ht_create_irq(struct pci_dev *dev,
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
+	irq_want = build_irq_for_pci_dev(dev);
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	irq = create_irq_nr(irq_want + idx);
+#else
 	irq = create_irq();
+#endif
 
 	if (irq <= 0) {
 		kfree(cfg);
Index: linux-2.6/drivers/pci/intr_remapping.c
===================================================================
--- linux-2.6.orig/drivers/pci/intr_remapping.c
+++ linux-2.6/drivers/pci/intr_remapping.c
@@ -19,17 +19,76 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+	struct irq_2_iommu *iommu;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	iommu = kzalloc_node(sizeof(*iommu), GFP_KERNEL, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+	return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	BUG_ON(!desc);
+
+	return desc->irq_2_iommu;
 }
 
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	/*
+	 * alloc irq desc if not allocated already.
+	 */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu_alloc_cpu(irq, -1);
+}
+
+#else /* !CONFIG_HAVE_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_2_iommu(irq);
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +145,11 @@ int alloc_irte(struct intel_iommu *iommu
 	if (!count)
 		return -1;
 
+#ifndef CONFIG_HAVE_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
+#endif
 
 	/*
 	 * start the IRTE search from index 0.
Index: linux-2.6/drivers/xen/events.c
===================================================================
--- linux-2.6.orig/drivers/xen/events.c
+++ linux-2.6/drivers/xen/events.c
@@ -141,8 +141,9 @@ static void init_evtchn_cpu_bindings(voi
 	int i;
 
 	/* By default all event channels notify CPU#0. */
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		desc->affinity = cpumask_of_cpu(0);
+	} end_for_each_irq_desc();
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +232,7 @@ static int find_unbound_irq(void)
 	int irq;
 
 	/* Only allocate from dynirq range */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		if (irq_bindcount[irq] == 0)
 			break;
 
@@ -792,7 +793,7 @@ void xen_irq_resume(void)
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +825,7 @@ void __init xen_init_IRQ(void)
 		mask_evtchn(i);
 
 	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-	for_each_irq_nr(i)
+	for (i = 0; i < nr_irqs; i++)
 		irq_bindcount[i] = 0;
 
 	irq_ctx_init(smp_processor_id());
Index: linux-2.6/fs/proc/stat.c
===================================================================
--- linux-2.6.orig/fs/proc/stat.c
+++ linux-2.6/fs/proc/stat.c
@@ -27,6 +27,9 @@ static int show_stat(struct seq_file *p,
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -44,10 +47,9 @@ static int show_stat(struct seq_file *p,
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_desc(j, desc) {
 			sum += kstat_irqs_cpu(j, i);
-
+		} end_for_each_irq_desc();
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -90,14 +92,17 @@ static int show_stat(struct seq_file *p,
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
+#else
 		seq_printf(p, " %u", per_irq_sum);
-	}
+#endif
+	} end_for_each_irq_desc();
 
 	seq_printf(p,
 		"\nctxt %llu\n"
Index: linux-2.6/fs/proc/interrupts.c
===================================================================
--- linux-2.6.orig/fs/proc/interrupts.c
+++ linux-2.6/fs/proc/interrupts.c
@@ -10,20 +10,31 @@
  */
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	rcu_read_lock();
+	return seq_list_start(&sparse_irqs_head, *pos);
+#else
 	return (*pos <= nr_irqs) ? pos : NULL;
+#endif
 }
 
 static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
 {
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	return seq_list_next(v, &sparse_irqs_head, pos);
+#else
 	(*pos)++;
 	if (*pos > nr_irqs)
 		return NULL;
 	return pos;
+#endif
 }
 
 static void int_seq_stop(struct seq_file *f, void *v)
 {
-	/* Nothing to do */
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	rcu_read_unlock();
+#endif
 }
 
 static const struct seq_operations int_seq_ops = {
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h
+++ linux-2.6/include/linux/interrupt.h
@@ -18,6 +18,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -129,6 +129,8 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +157,15 @@ struct irq_chip {
  */
 struct irq_desc {
 	unsigned int		irq;
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	struct list_head	list;
+	struct list_head	hash_entry;
+	struct timer_rand_state *timer_rand_state;
+	unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+	struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -182,14 +193,59 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern void arch_sparse_irq_init_work(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+					struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *desc);
+
+#ifndef CONFIG_HAVE_SPARSE_IRQ
 
+/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
+#ifdef CONFIG_GENERIC_HARDIRQS
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
+
+#define end_for_each_irq_desc()
+#endif
+
+static inline void early_sparse_irq_init_work(void)
 {
-	return (irq < nr_irqs) ? irq_desc + irq : NULL;
 }
 
+#else
+
+void early_sparse_irq_init_work(void);
+extern struct list_head sparse_irqs_head;
+#define for_each_irq_desc(irqX, desc)					\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.next), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.next), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.next), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define for_each_irq_desc_reverse(irqX, desc)				\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.prev), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.prev), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.prev), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define end_for_each_irq_desc() rcu_read_unlock()
+
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()]++)
+#endif
+
 /*
  * Migration helpers for obsolete names, they will go away:
  */
Index: linux-2.6/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.orig/include/linux/kernel_stat.h
+++ linux-2.6/include/linux/kernel_stat.h
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, ksta
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_
 {
 	kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_HAVE_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
Index: linux-2.6/kernel/irq/autoprobe.c
===================================================================
--- linux-2.6.orig/kernel/irq/autoprobe.c
+++ linux-2.6/kernel/irq/autoprobe.c
@@ -57,7 +57,7 @@ unsigned long probe_irq_on(void)
 			desc->chip->startup(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/* Wait for longstanding interrupts to trigger. */
 	msleep(20);
@@ -75,7 +75,7 @@ unsigned long probe_irq_on(void)
 				desc->status |= IRQ_PENDING;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/*
 	 * Wait for spurious interrupts to trigger
@@ -99,7 +99,7 @@ unsigned long probe_irq_on(void)
 					mask |= 1 << i;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	return mask;
 }
@@ -135,7 +135,7 @@ unsigned int probe_irq_mask(unsigned lon
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	return mask & val;
@@ -179,7 +179,7 @@ int probe_irq_off(unsigned long val)
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	if (nr_of_irqs > 1)
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -24,9 +24,11 @@
  */
 void dynamic_irq_init(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	/* first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -15,9 +15,16 @@
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,296 @@ void handle_bad_irq(unsigned int irq, st
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+	.irq = -1U,
+	.status = IRQ_DISABLED,
+	.chip = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth = 1,
+	.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity = CPU_MASK_ALL
+#endif
+};
+
+static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+	unsigned long bytes;
+	char *ptr;
+	int node;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	ptr = kzalloc_node(bytes, GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+
+	desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+#ifdef CONFIG_SMP
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+}
+
+static void free_kstat_irqs(struct irq_desc *desc)
+{
+	kfree(desc->kstat_irqs);
+	desc->kstat_irqs = NULL;
+}
+#endif
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+	desc->irq = irq;
+#ifdef CONFIG_SMP
+	desc->cpu = cpu;
+#endif
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	arch_init_chip_data(desc, cpu);
+}
+
+#ifdef CONFIG_SMP
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *desc)
+{
+	free_kstat_irqs(desc);
+	arch_free_chip_data(desc);
+}
+#endif
+/*
+ * Protect the sparse_irqs_free freelist:
+ */
+static DEFINE_SPINLOCK(sparse_irq_lock);
+LIST_HEAD(sparse_irqs_head);
+
+/*
+ * The sparse irqs are in a hash-table as well, for fast lookup:
+ */
+#define SPARSEIRQHASH_BITS          (13 - 1)
+#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
+#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
+#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))
+
+static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
+
+static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
+	[0 ... NR_IRQS_LEGACY-1] = {
+		.irq = -1U,
+		.status = IRQ_DISABLED,
+		.chip = &no_irq_chip,
+		.handle_irq = handle_bad_irq,
+		.depth = 1,
+		.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+		.affinity = CPU_MASK_ALL
+#endif
+	}
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+
+void __init __attribute__((weak)) arch_sparse_irq_init_work(void)
+{
+}
+
+void __init early_sparse_irq_init_work(void)
+{
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	/* init_work to init list for sparseirq */
+	for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
+		INIT_LIST_HEAD(sparseirqhash_table + i);
+
+	desc = irq_desc_legacy;
+	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+	for (i = 0; i < legacy_count; i++) {
+		struct list_head *hash_head;
+
+		hash_head = sparseirqhashentry(i);
+		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		list_add_tail(&desc[i].hash_entry, hash_head);
+		list_add_tail(&desc[i].list, &sparse_irqs_head);
+	}
+
+	arch_sparse_irq_init_work();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+
+	hash_head = sparseirqhashentry(irq);
+
+	/*
+	 * We can walk the hash lockfree, because the hash only
+	 * grows, and we are careful when adding entries to the end:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			return desc;
+	}
+
+	return NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		return desc;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	if (!desc)
+		goto out_unlock;
+	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	init_one_irq_desc(irq, desc, cpu);
+
+	/*
+	 * We use RCU's safe list-add method to make
+	 * parallel walking of the hash-list safe:
+	 */
+	list_add_tail_rcu(&desc->hash_entry, hash_head);
+	/*
+	 * Add it to the global list:
+	 */
+	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc_alloc_cpu(irq, -1);
+}
+
+#ifdef CONFIG_SMP
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq && old_desc != desc)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	if (!desc) {
+		/* allocation failed: keep using the old descriptor */
+		desc = old_desc;
+		goto out_unlock;
+	}
+	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
+		 irq, irq, cpu, node);
+
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
+	list_replace_rcu(&old_desc->list, &desc->list);
+
+	/* free the old one */
+	free_one_irq_desc(old_desc);
+	if (irq >= NR_IRQS_LEGACY)
+		kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	old_cpu = desc->cpu;
+
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -62,6 +359,27 @@ struct irq_desc irq_desc[NR_IRQS] __cach
 	}
 };
 
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_desc[irq];
+
+	return NULL;
+}
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu)
+{
+	return old_desc;
+}
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
@@ -261,17 +579,25 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_HAVE_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	} end_for_each_irq_desc();
+#endif
 }
 #endif
+
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc->kstat_irqs[cpu];
+}
+EXPORT_SYMBOL(kstat_irqs_cpu);
+#endif
+
Index: linux-2.6/arch/x86/kernel/irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq.c
+++ linux-2.6/arch/x86/kernel/irq.c
@@ -99,25 +99,20 @@ static int show_other_interrupts(struct
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i, j;
 	struct irqaction *action;
 	struct irq_desc *desc;
 
-	if (i > nr_irqs)
-		return 0;
-
-	if (i == nr_irqs)
-		return show_other_interrupts(p);
-
-	/* print header */
-	if (i == 0) {
+	desc = list_entry(v, struct irq_desc, list);
+	i = desc->irq;
+	if (&desc->list == sparse_irqs_head.next) {
+		/* print header */
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	desc = irq_to_desc(i);
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
@@ -148,6 +143,10 @@ int show_interrupts(struct seq_file *p,
 	seq_putc(p, '\n');
 out:
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+	if (&desc->list == sparse_irqs_head.prev)
+		show_other_interrupts(p);
+
 	return 0;
 }
 
Index: linux-2.6/include/linux/irqnr.h
===================================================================
--- linux-2.6.orig/include/linux/irqnr.h
+++ linux-2.6/include/linux/irqnr.h
@@ -7,18 +7,11 @@
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
+# define end_for_each_irq_desc()
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
-	     irq >= 0; irq--, desc--)
+static inline void early_sparse_irq_init_work(void)
+{
+}
 #endif
 
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #endif
Index: linux-2.6/arch/x86/kernel/irq_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_32.c
+++ linux-2.6/arch/x86/kernel/irq_32.c
@@ -254,7 +254,7 @@ void fixup_irqs(cpumask_t map)
 			desc->chip->set_affinity(irq, mask);
 		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 #if 0
 	barrier();
Index: linux-2.6/arch/x86/kernel/irq_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_64.c
+++ linux-2.6/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ void fixup_irqs(cpumask_t map)
 			printk("Broke affinity for irq %i\n", irq);
 		else if (!set_affinity)
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 	/* That doesn't seem sufficient.  Give it 1ms. */
 	local_irq_enable();
Index: linux-2.6/kernel/irq/proc.c
===================================================================
--- linux-2.6.orig/kernel/irq/proc.c
+++ linux-2.6/kernel/irq/proc.c
@@ -243,7 +243,8 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc)
+	for_each_irq_desc(irq, desc) {
 		register_irq_proc(irq, desc);
+	} end_for_each_irq_desc();
 }
 
Index: linux-2.6/kernel/irq/spurious.c
===================================================================
--- linux-2.6.orig/kernel/irq/spurious.c
+++ linux-2.6/kernel/irq/spurious.c
@@ -99,7 +99,7 @@ static int misrouted_irq(int irq)
 
 		if (try_one_irq(i, desc))
 			ok = 1;
-	}
+	} end_for_each_irq_desc();
 	/* So the caller can adjust the irq error counts */
 	return ok;
 }
@@ -122,7 +122,7 @@ static void poll_spurious_irqs(unsigned
 			continue;
 
 		try_one_irq(i, desc);
-	}
+	} end_for_each_irq_desc();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -611,6 +611,8 @@ asmlinkage void __init start_kernel(void
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some list before init_ISA_irqs() */
+	early_sparse_irq_init_work();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
Index: linux-2.6/arch/x86/include/asm/io_apic.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/io_apic.h
+++ linux-2.6/arch/x86/include/asm/io_apic.h
@@ -192,6 +192,7 @@ extern int io_apic_set_pci_routing(int i
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
+struct irq_desc;
 #ifdef CONFIG_X86_64
 extern int save_mask_IO_APIC_setup(void);
 extern void restore_IO_APIC_setup(void);
@@ -199,7 +200,6 @@ extern void reinit_intr_remapped_IO_APIC
 #endif
 
 extern int probe_nr_irqs(void);
-
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
 static const int timer_through_8259 = 0;
Index: linux-2.6/arch/x86/include/asm/irq_vectors.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/irq_vectors.h
+++ linux-2.6/arch/x86/include/asm/irq_vectors.h
@@ -101,6 +101,8 @@
 #define LAST_VM86_IRQ		15
 #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
 
+#define NR_IRQS_LEGACY		16
+
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-09  7:05                     ` [RFC PATCH] sparse_irq aka dyn_irq Yinghai Lu
@ 2008-11-09  7:38                       ` Ingo Molnar
  2008-11-09  8:03                         ` Yinghai Lu
  2008-11-09  8:36                         ` [RFC PATCH] sparse_irq aka dyn_irq H. Peter Anvin
  2008-11-09  7:50                       ` Cyrill Gorcunov
  1 sibling, 2 replies; 66+ messages in thread
From: Ingo Molnar @ 2008-11-09  7:38 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton,
	linux-kernel@vger.kernel.org


General impression: very nice patch!

A lot of the structural problems have been addressed: the descriptor 
lookup is now hashed, the dynarray stuff got cleaned up / eliminated, 
the irq_desc->chip_data binding is very nice as well.

(And the patch needs to be split up like it was in the past, once all 
review feedback has been seen and addressed.)

> +config HAVE_SPARSE_IRQ
> +	bool
> +	default y

i think it should be made user-configurable - at least initially. It 
should not cause extra complications, right?

> +	if (irq < NR_IRQS_LEGACY) {

please s/NR_IRQS_LEGACY/NR_IRQS_X86_LEGACY - this is never used 
outside of x86 code.

> +		cfg_new = desc_new->chip_data;

the chip_data binding is a nice touch.

> -	irq_want = build_irq_for_pci_dev(dev) + 0x100;
> +	irq_want = build_irq_for_pci_dev(dev) + 0xfff;

please replace magic constant with a properly named constant.

> -	if (WARN_ON(nr > NR_IRQS))
> -		nr = NR_IRQS;

this will have to stay for the !SPARSE_IRQ case.

> +++ linux-2.6/arch/x86/mm/init_32.c
> @@ -66,6 +66,7 @@ static unsigned long __meminitdata table
>  static unsigned long __meminitdata table_top;
>  
>  static int __initdata after_init_bootmem;
> +int after_bootmem;
>  
>  static __init void *alloc_low_page(unsigned long *phys)
>  {
> @@ -987,6 +988,8 @@ void __init mem_init(void)
>  
>  	set_highmem_pages_init();
>  
> +	after_bootmem = 1;

this hack can go away once we have a proper percpu_alloc() that can be 
used early enough.

> +#ifndef CONFIG_HAVE_SPARSE_IRQ

i'd suggest s/HAVE_SPARSE_IRQ/SPARSE_IRQ - as the HAVE_* flags are for 
architecture code to signal the presence of a facility.

> +#ifndef CONFIG_HAVE_SPARSE_IRQ
>  	if (irq >= nr_irqs)
>  		return;
> +#endif

we should hide as many ugly #ifdefs as possible, and define nr_irqs to 
NR_IRQS in the !SPARSE_IRQ case.

> +++ linux-2.6/drivers/pci/htirq.c
> @@ -82,6 +82,18 @@ void unmask_ht_irq(unsigned int irq)
>  	write_ht_irq_msg(irq, &msg);
>  }
>  
> +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
> +{
> +	unsigned int irq;
> +
> +	irq = dev->bus->number;
> +	irq <<= 8;
> +	irq |= dev->devfn;
> +	irq <<= 12;
> +
> +	return irq;

magic constants should be named.

> +#ifdef CONFIG_HAVE_SPARSE_IRQ
> +	irq = create_irq_nr(irq_want + idx);
> +#else
>  	irq = create_irq();
> +#endif

please eliminate this #ifdef by adding one new API: 
create_irq_nr(idx), which just maps to the create_irq() API in the 
!SPARSE_IRQ case.

>  static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
>  {
> -	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
> +	struct irq_desc *desc;
> +
> +	desc = irq_to_desc(irq);
> +
> +	BUG_ON(!desc);
> +
> +	return desc->irq_2_iommu;

the BUG_ON() is not too friendly, please do something like this 
instead:

	if (WARN_ON_ONCE(!desc))
		return NULL;

> +#ifndef CONFIG_HAVE_SPARSE_IRQ
>  	/* protect irq_2_iommu_alloc later */
>  	if (irq >= nr_irqs)
>  		return -1;
> +#endif

this #ifdef can be eliminated too and turned into straight code via 
the #define nr_irqs NR_IRQS trick in the !SPARSE_IRQ case.

> -	for_each_irq_desc(i, desc)
> +	for_each_irq_desc(i, desc) {
>  		desc->affinity = cpumask_of_cpu(0);
> +	} end_for_each_irq_desc();

Sidenote: later on, once the patch is upstream, we should do a global 
rename:

 s/for_each_irq_desc/do_each_irq_desc
 s/end_for_each_irq_desc/while_each_irq_desc

as it's much harder to miss the "while" in a "do ..." loop, than it is 
to miss the "end" in a "for" loop.

> +#ifdef CONFIG_HAVE_SPARSE_IRQ
> +static struct irq_desc irq_desc_init = {
> +	.irq = -1U,
> +	.status = IRQ_DISABLED,
> +	.chip = &no_irq_chip,
> +	.handle_irq = handle_bad_irq,
> +	.depth = 1,
> +	.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
> +#ifdef CONFIG_SMP
> +	.affinity = CPU_MASK_ALL
> +#endif
> +};

please align structure fields vertically.

> +static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
> +	[0 ... NR_IRQS_LEGACY-1] = {
> +		.irq = -1U,
> +		.status = IRQ_DISABLED,
> +		.chip = &no_irq_chip,
> +		.handle_irq = handle_bad_irq,
> +		.depth = 1,
> +		.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
> +#ifdef CONFIG_SMP
> +		.affinity = CPU_MASK_ALL
> +#endif
> +	}
> +};

same here.

> @@ -199,7 +200,6 @@ extern void reinit_intr_remapped_IO_APIC
>  #endif
>  
>  extern int probe_nr_irqs(void);
> -
>  #else  /* !CONFIG_X86_IO_APIC */
>  #define io_apic_assign_pci_irqs 0
>  static const int timer_through_8259 = 0;

that's a spurious removal of a newline.

all in one, i cannot see fundamental problems in this patch.

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-09  7:05                     ` [RFC PATCH] sparse_irq aka dyn_irq Yinghai Lu
  2008-11-09  7:38                       ` Ingo Molnar
@ 2008-11-09  7:50                       ` Cyrill Gorcunov
  1 sibling, 0 replies; 66+ messages in thread
From: Cyrill Gorcunov @ 2008-11-09  7:50 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Andrew Morton,
	linux-kernel@vger.kernel.org

[Yinghai Lu - Sat, Nov 08, 2008 at 11:05:55PM -0800]
| 
| impact: new feature sparseirq
| 
| for sparse_irq, irq_desc, and irq_cfg is not using list_head to chain up
| also not add per_cpu_dyn_array... no user now
| 
| add some kind of hash table as Ingo suggesting.
| remove dyna_array, and enable sparse_irq by default, use kzalloc_node to get it
| use desc->chip_data for x86 to store irq_cfg
| make irq_desc to go with affinity aka irq_desc moving etc
| only call move_irq_desc in irq_complete_move() --- but it seems not trigger that moving.
| 
| Signed-off-by: Yinghai Lu <yinghai@kernel.org>
| 
| ---

Hi Yinghai,

from a quick glance (didn't read the whole patch)

...
|  
| -static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
| +static struct irq_cfg *get_one_free_irq_cfg(int cpu)
|  {
| -	return irq_cfg(irq);
| +	struct irq_cfg *cfg;
| +	int node;
| +
| +	if (cpu < 0)
| +		cpu = smp_processor_id();
| +	node = cpu_to_node(cpu);
| +
| +	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
| +	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
| +
| +	return cfg;
|  }
|  
| -/*
| - * Rough estimation of how many shared IRQs there are, can be changed
| - * anytime.
| - */
| -#define MAX_PLUS_SHARED_IRQS NR_IRQS
| -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
| +static void free_irq_cfg(struct irq_cfg *cfg)
| +{
| +	kfree(cfg);
| +}
| +
| +void arch_init_chip_data(struct irq_desc *desc, int cpu)
| +{
| +	struct irq_cfg *cfg;
| +
| +	cfg = desc->chip_data;
| +	if (!cfg)
| +		desc->chip_data = get_one_free_irq_cfg(cpu);
| +}
| +
| +static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
| +				 int cpu);
| +
| +void arch_init_copy_chip_data(struct irq_desc *old_desc,
| +				 struct irq_desc *desc, int cpu)
| +{
| +	struct irq_cfg *cfg;
| +	struct irq_cfg *old_cfg;
| +
| +	cfg = get_one_free_irq_cfg(cpu);
| +	desc->chip_data = cfg;
| +
| +	old_cfg = old_desc->chip_data;
| +
| +	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));

If cfg gets NULL here we will be NULL-dereferring
(cause of possible kzalloc_node fails).

| +
| +	init_copy_irq_2_pin(old_cfg, cfg, cpu);
| +}
| +
...

Am I missing something?

		- Cyrill -

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-09  7:38                       ` Ingo Molnar
@ 2008-11-09  8:03                         ` Yinghai Lu
  2008-11-10  9:40                           ` Ingo Molnar
  2008-11-09  8:36                         ` [RFC PATCH] sparse_irq aka dyn_irq H. Peter Anvin
  1 sibling, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-09  8:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton,
	linux-kernel@vger.kernel.org

On Sat, Nov 8, 2008 at 11:38 PM, Ingo Molnar <mingo@elte.hu> wrote:
>
> General impression: very nice patch!
>
> A lot of the structural problems have been addressed: the descriptor
> lookup is now hashed, the dynarray stuff got cleaned up / eliminated,
> the irq_desc->chip_data binding is very nice as well.
>
> (And the patch needs to be split up like it was in the past, once all
> review feedback has been seen and addressed.)
>
>> +config HAVE_SPARSE_IRQ
>> +     bool
>> +     default y
>
> i think it should be made user-configurable - at least initially. It
> should not cause extra complications, right?

io_apic.c will get more complicated.

>
>> +     if (irq < NR_IRQS_LEGACY) {
>
> please s/NR_IRQS_LEGACY/NR_IRQS_X86_LEGACY - this is never used
> outside of x86 code.

will use that in kernel/irq/handle.c too, because dyn_array is dumped.

>
>> +             cfg_new = desc_new->chip_data;
>
> the chip_data binding is a nice touch.
>
>> -     irq_want = build_irq_for_pci_dev(dev) + 0x100;
>> +     irq_want = build_irq_for_pci_dev(dev) + 0xfff;
>
> please replace magic constant with a properly named constant.
>
>> -     if (WARN_ON(nr > NR_IRQS))
>> -             nr = NR_IRQS;
>
> this will have to stay for the !SPARSE_IRQ case.

Yes

>
>> +++ linux-2.6/arch/x86/mm/init_32.c
>> @@ -66,6 +66,7 @@ static unsigned long __meminitdata table
>>  static unsigned long __meminitdata table_top;
>>
>>  static int __initdata after_init_bootmem;
>> +int after_bootmem;
>>
>>  static __init void *alloc_low_page(unsigned long *phys)
>>  {
>> @@ -987,6 +988,8 @@ void __init mem_init(void)
>>
>>       set_highmem_pages_init();
>>
>> +     after_bootmem = 1;
>
> this hack can go away once we have a proper percpu_alloc() that can be
> used early enough.

where is that fancy patch?
current percpu_alloc(), will keep big pointer in array..., instead of
put that pointer in percpu_area

64bit has that after_bootmem already.

>
>> +#ifndef CONFIG_HAVE_SPARSE_IRQ
>
> i'd suggest s/HAVE_SPARSE_IRQ/SPARSE_IRQ - as the HAVE_* flags are for
> architecture code to signal the presence of a facility.

OK

>
>> +#ifndef CONFIG_HAVE_SPARSE_IRQ
>>       if (irq >= nr_irqs)
>>               return;
>> +#endif
>
> we should hide as many ugly #ifdefs as possible, and define nr_irqs to
> NR_IRQS in the !SPARSE_IRQ case.
>
>> +++ linux-2.6/drivers/pci/htirq.c
>> @@ -82,6 +82,18 @@ void unmask_ht_irq(unsigned int irq)
>>       write_ht_irq_msg(irq, &msg);
>>  }
>>
>> +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
>> +{
>> +     unsigned int irq;
>> +
>> +     irq = dev->bus->number;
>> +     irq <<= 8;
>> +     irq |= dev->devfn;
>> +     irq <<= 12;
>> +
>> +     return irq;
>
> magic constants should be named.

should add more comment here.

>
>> +#ifdef CONFIG_HAVE_SPARSE_IRQ
>> +     irq = create_irq_nr(irq_want + idx);
>> +#else
>>       irq = create_irq();
>> +#endif
>
> please eliminate this #ifdef by adding one new API:
> create_irq_nr(idx), which just maps to the create_irq() API in the
> !SPARSE_IRQ case.
>
>>  static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
>>  {
>> -     return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
>> +     struct irq_desc *desc;
>> +
>> +     desc = irq_to_desc(irq);
>> +
>> +     BUG_ON(!desc);
>> +
>> +     return desc->irq_2_iommu;
>
> the BUG_ON() is not too friendly, please do something like this
> instead:
>
>        if (WARN_ON_ONCE(!desc))
>                return NULL;
>
>> +#ifndef CONFIG_HAVE_SPARSE_IRQ
>>       /* protect irq_2_iommu_alloc later */
>>       if (irq >= nr_irqs)
>>               return -1;
>> +#endif
>
> this #ifdef can be eliminated too and turned into straight code via
> the #define nr_irqs NR_IRQS trick in the !SPARSE_IRQ case.
>
>> -     for_each_irq_desc(i, desc)
>> +     for_each_irq_desc(i, desc) {
>>               desc->affinity = cpumask_of_cpu(0);
>> +     } end_for_each_irq_desc();
>
> Sidenote: later on, once the patch is upstream, we should do a global
> rename:
>
>  s/for_each_irq_desc/do_each_irq_desc
>  s/end_for_each_irq_desc/while_each_irq_desc
>
> as it's much harder to miss the "while" in a "do ..." loop, than it is
> to miss the "end" in a "for" loop.
>
>> +#ifdef CONFIG_HAVE_SPARSE_IRQ
>> +static struct irq_desc irq_desc_init = {
>> +     .irq = -1U,
>> +     .status = IRQ_DISABLED,
>> +     .chip = &no_irq_chip,
>> +     .handle_irq = handle_bad_irq,
>> +     .depth = 1,
>> +     .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
>> +#ifdef CONFIG_SMP
>> +     .affinity = CPU_MASK_ALL
>> +#endif
>> +};
>
> please align structure fields vertically.
>
>> +static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
>> +     [0 ... NR_IRQS_LEGACY-1] = {
>> +             .irq = -1U,
>> +             .status = IRQ_DISABLED,
>> +             .chip = &no_irq_chip,
>> +             .handle_irq = handle_bad_irq,
>> +             .depth = 1,
>> +             .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
>> +#ifdef CONFIG_SMP
>> +             .affinity = CPU_MASK_ALL
>> +#endif
>> +     }
>> +};
>
> same here.
>
>> @@ -199,7 +200,6 @@ extern void reinit_intr_remapped_IO_APIC
>>  #endif
>>
>>  extern int probe_nr_irqs(void);
>> -
>>  #else  /* !CONFIG_X86_IO_APIC */
>>  #define io_apic_assign_pci_irqs 0
>>  static const int timer_through_8259 = 0;
>
> that's a spurious removal of a newline.

.....

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-09  7:38                       ` Ingo Molnar
  2008-11-09  8:03                         ` Yinghai Lu
@ 2008-11-09  8:36                         ` H. Peter Anvin
  1 sibling, 0 replies; 66+ messages in thread
From: H. Peter Anvin @ 2008-11-09  8:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, Thomas Gleixner, Andrew Morton,
	linux-kernel@vger.kernel.org

Ingo Molnar wrote:
> 
>> +++ linux-2.6/arch/x86/mm/init_32.c
>> @@ -66,6 +66,7 @@ static unsigned long __meminitdata table
>>  static unsigned long __meminitdata table_top;
>>  
>>  static int __initdata after_init_bootmem;
>> +int after_bootmem;
>>  
>>  static __init void *alloc_low_page(unsigned long *phys)
>>  {
>> @@ -987,6 +988,8 @@ void __init mem_init(void)
>>  
>>  	set_highmem_pages_init();
>>  
>> +	after_bootmem = 1;
> 
> this hack can go away once we have a proper percpu_alloc() that can be 
> used early enough.
> 

Also, flags should be "bool".  We're not aggressively going after old
code to convert it, but new code should use "bool".

	-hpa

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-09  8:03                         ` Yinghai Lu
@ 2008-11-10  9:40                           ` Ingo Molnar
  2008-11-10  9:51                             ` [PATCH] sparse_irq aka dyn_irq v10 Yinghai Lu
  2008-11-10  9:55                             ` [RFC PATCH] sparse_irq aka dyn_irq Andrew Morton
  0 siblings, 2 replies; 66+ messages in thread
From: Ingo Molnar @ 2008-11-10  9:40 UTC (permalink / raw)
  To: Yinghai Lu, Andrew Morton
  Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton,
	linux-kernel@vger.kernel.org


(Andrew, please see the early_kzalloc() reference below)

* Yinghai Lu <yinghai@kernel.org> wrote:

> On Sat, Nov 8, 2008 at 11:38 PM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > General impression: very nice patch!
> >
> > A lot of the structural problems have been addressed: the descriptor
> > lookup is now hashed, the dynarray stuff got cleaned up / eliminated,
> > the irq_desc->chip_data binding is very nice as well.
> >
> > (And the patch needs to be split up like it was in the past, once all
> > review feedback has been seen and addressed.)
> >
> >> +config HAVE_SPARSE_IRQ
> >> +     bool
> >> +     default y
> >
> > i think it should be made user-configurable - at least initially. It
> > should not cause extra complications, right?
> 
> io_apic.c will get more complicated.

yes, with such constructs:

+#ifdef CONFIG_SPARSE_IRQ
+       struct irq_desc *desc;
+
+	/* first time to refer irq_cfg, so with new */
+       desc = irq_to_desc_alloc_cpu(irq, cpu);
+	cfg = desc->chip_data;
+#else
+       cfg = irq_cfg(irq);
+#endif

please introduce a proper helper that eliminates such complications. 
Any reason why chip_data could not be used in the !SPARSE_IRQ case? 
irq_cfg_alloc() perhaps?

> >> + if (irq < NR_IRQS_LEGACY) {
> >
> > please s/NR_IRQS_LEGACY/NR_IRQS_X86_LEGACY - this is never used
> > outside of x86 code.
> 
> will use that in kernel/irq/handle.c too, because dyn_array is dumped.

ah, i missed that. Okay - lets keep NR_IRQS_LEGACY then.

> >> @@ -987,6 +988,8 @@ void __init mem_init(void)
> >>
> >>       set_highmem_pages_init();
> >>
> >> +     after_bootmem = 1;
> >
> > this hack can go away once we have a proper percpu_alloc() that can be
> > used early enough.
> 
> where is that fancy patch? current percpu_alloc(), will keep big 
> pointer in array..., instead of put that pointer in percpu_area
> 
> 64bit has that after_bootmem already.

or at least introduce a "bootmem agnostic" allocator instead of 
open-coding the after_bootmem flag.

Something like:

  early_kzalloc()

?

Andrew, any preferences?

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] sparse_irq aka dyn_irq v10
  2008-11-10  9:40                           ` Ingo Molnar
@ 2008-11-10  9:51                             ` Yinghai Lu
  2008-11-10  9:53                               ` Ingo Molnar
  2008-11-10  9:55                             ` [RFC PATCH] sparse_irq aka dyn_irq Andrew Morton
  1 sibling, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-10  9:51 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton, Thomas Gleixner, H. Peter Anvin
  Cc: linux-kernel@vger.kernel.org

getting closer, irq_desc can be moved according to smp_affinity.

it is getting somewhat big now..., may split it into two patches: one will only have sparse irq but won't move irq_desc,
the second one will move irq_desc according to affinity.

YH

----------
From: Yinghai Lu <yinghai@kernel.org>
Subject: sparseirq v10

impact: new feature sparseirq

add some kind of hash table as Ingo suggesting.
remove dyna_array
when sparse_irq is used, use kzalloc_node to get irq_desc, irq_cfg
  use desc->chip_data for x86 to store irq_cfg
  make irq_desc to go with affinity aka irq_desc moving etc
  call move_irq_desc in irq_complete_move()
  need to add struct (irq_desc **descp) to ack_edge/level to make sure desc get updated
  legacy irq_desc is not moved, because they are allocated via static array

  for logical apic mode, need to add move_desc_in_progress_in_same_domain. otherwise it will not get moved. ==> also could need two phase to get irq_desc moved.
	for example: 0xff is old affinity, and need to set 0xf, and then set to 0xf0.

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so assume the user space program should update /proc/irq/XX/smp_affinity to 03 or 0f at first on boot
or we change irq_default_affinity ?

  for physical apic is much simple
on 4 sockets 16 cores system
irq_desc is moving..
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig                   |   11 
 arch/x86/include/asm/irq_vectors.h |    2 
 arch/x86/kernel/i8259.c            |   24 +
 arch/x86/kernel/io_apic.c          |  510 ++++++++++++++++++++++++++-----------
 arch/x86/kernel/irq.c              |   24 +
 arch/x86/kernel/irq_32.c           |    2 
 arch/x86/kernel/irq_64.c           |    2 
 arch/x86/kernel/irqinit_32.c       |    3 
 arch/x86/kernel/irqinit_64.c       |    3 
 arch/x86/kernel/uv_irq.c           |   22 +
 arch/x86/mm/init_32.c              |    3 
 drivers/char/random.c              |   31 ++
 drivers/pci/htirq.c                |   19 +
 drivers/pci/intr_remapping.c       |   66 ++++
 drivers/xen/events.c               |    9 
 fs/proc/interrupts.c               |   13 
 fs/proc/stat.c                     |   17 -
 include/linux/interrupt.h          |    2 
 include/linux/irq.h                |   71 ++++-
 include/linux/irqnr.h              |   15 -
 include/linux/kernel_stat.h        |   14 -
 init/main.c                        |    2 
 kernel/irq/autoprobe.c             |   10 
 kernel/irq/chip.c                  |   51 ++-
 kernel/irq/handle.c                |  384 ++++++++++++++++++++++++++-
 kernel/irq/proc.c                  |    3 
 kernel/irq/spurious.c              |    4 
 27 files changed, 1098 insertions(+), 219 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -236,6 +236,17 @@ config X86_HAS_BOOT_CPU_ID
 	def_bool y
 	depends on X86_VOYAGER
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on PCI_MSI || HT_IRQ
+	default y
+	help
+	  This enables support for sparse irq, esp for msi/msi-x. the irq
+	  number will be bus/dev/fn + 12bit. You may need if you have lots of
+	  cards supports msi-x installed.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -108,94 +108,220 @@ static int __init parse_noapic(char *str
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+	int apic, pin;
+	struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+	struct irq_pin_list *pin;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	pin = kzalloc_node(sizeof(*pin), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+	return pin;
+}
+
 struct irq_cfg {
-	unsigned int irq;
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_SPARSE_IRQ
+	u8 move_desc_in_progress_in_same_domain : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_early_irq_init_work(void)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int count;
+	int i;
+#ifdef CONFIG_SPARSE_IRQ
+	int count_desc = NR_IRQS_LEGACY;
+#else
+	int count_desc = NR_IRQS;
+#endif
+
+	cfg = irq_cfgx;
+	count = ARRAY_SIZE(irq_cfgx);
+
+	BUG_ON(count > count_desc);
 
+	for (i = 0; i < count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
+
+#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
+
+	return cfg;
 }
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 {
-	return irq_cfg(irq);
+	struct irq_cfg *cfg;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+	return cfg;
 }
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+static void free_irq_cfg(struct irq_cfg *cfg)
+{
+	kfree(cfg);
+}
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
 
-struct irq_pin_list {
-	int apic, pin;
-	struct irq_pin_list *next;
-};
+	cfg = desc->chip_data;
+	if (!cfg)
+		desc->chip_data = get_one_free_irq_cfg(cpu);
+}
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu);
 
-static void __init irq_2_pin_init(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	cfg = get_one_free_irq_cfg(cpu);
+	desc->chip_data = cfg;
 
-	irq_2_pin_ptr = &pin[0];
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_2_pin(struct irq_cfg *cfg);
+
+void arch_free_chip_data(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (cfg) {
+		free_irq_2_pin(cfg);
+		free_irq_cfg(cfg);
+		desc->chip_data = NULL;
+	}
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin);
+
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_pin_list *old_entry;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	old_entry = old_cfg->irq_2_pin;
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+	while (old_entry) {
+		add_pin_to_irq_cpu(cfg, cpu, old_entry->apic, old_entry->pin);
+		old_entry = old_entry->next;
+	}
 }
 
+static void free_irq_2_pin(struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry, *next;
+
+	entry = cfg->irq_2_pin;
+
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	cfg->irq_2_pin = NULL;
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means domain is not changed */
+		cpumask_t tmp;
+
+		cpus_and(tmp, desc->affinity, mask);
+		if (cpus_empty(tmp))
+			cfg->move_desc_in_progress_in_same_domain = 1;
+	}
+}
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+}
+#endif
+
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -359,7 +485,7 @@ static void __target_IO_APIC_irq(unsigne
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
@@ -373,10 +499,13 @@ static void set_ioapic_affinity_irq(unsi
 	if (cpus_empty(tmp))
 		return;
 
-	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
@@ -384,7 +513,6 @@ static void set_ioapic_affinity_irq(unsi
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
-	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
 	__target_IO_APIC_irq(irq, dest, cfg->vector);
 	desc->affinity = mask;
@@ -397,16 +525,13 @@ static void set_ioapic_affinity_irq(unsi
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,20 +546,31 @@ static void add_pin_to_irq(unsigned int
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
 }
 
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int cpu = smp_processor_id();
+
+	/* first time to refer irq_cfg, so with new */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	cfg = desc->chip_data;
+	add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+}
+
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -451,7 +587,7 @@ static void __init replace_pin_at_irq(un
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
 static inline void io_apic_modify_irq(unsigned int irq,
@@ -809,7 +945,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1034,7 +1170,7 @@ void unlock_vector_lock(void)
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1050,9 +1186,6 @@ static int __assign_irq_vector(int irq,
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
-
-	cfg = irq_cfg(irq);
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
@@ -1113,24 +1246,22 @@ next:
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1148,14 +1279,16 @@ void __setup_vector_irq(int cpu)
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
+	} end_for_each_irq_desc();
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
@@ -1205,7 +1338,8 @@ static void ioapic_register_intr(int irq
 {
 	struct irq_desc *desc;
 
-	desc = irq_to_desc(irq);
+	/* could be first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1310,7 +1444,7 @@ static void setup_IO_APIC_irq(int apic,
 	cfg = irq_cfg(irq);
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1327,12 +1461,12 @@ static void setup_IO_APIC_irq(int apic,
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
 	ioapic_register_intr(irq, trigger);
-	if (irq < 16)
+	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic, pin, entry);
@@ -1434,6 +1568,7 @@ __apicdebuginit(void) print_IO_APIC(void
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1523,8 +1658,10 @@ __apicdebuginit(void) print_IO_APIC(void
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1535,7 +1672,7 @@ __apicdebuginit(void) print_IO_APIC(void
 			entry = entry->next;
 		}
 		printk("\n");
-	}
+	} end_for_each_irq_desc();
 
 	printk(KERN_INFO ".................................... done.\n");
 
@@ -2010,7 +2147,7 @@ static unsigned int startup_ioapic_irq(u
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
@@ -2095,10 +2232,10 @@ static void migrate_ioapic_irq(int irq,
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = irq_cfg(irq);
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -2178,7 +2315,7 @@ static void ir_irq_migration(struct work
 			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
@@ -2236,19 +2373,40 @@ unlock:
 	irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = *descp;
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_SPARSE_IRQ
+		if (likely(!cfg->move_desc_in_progress_in_same_domain))
+			return;
+
+		/* domain is not changed, but affinity is changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_in_progress_in_same_domain = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_SPARSE_IRQ
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
@@ -2256,41 +2414,68 @@ static void irq_complete_move(unsigned i
 	}
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 #ifdef CONFIG_INTR_REMAP
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_x2apic_level(unsigned int irq, struct irq_desc **descp)
+#else
 static void ack_x2apic_level(unsigned int irq)
+#endif
 {
 	ack_x2APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_x2apic_edge(unsigned int irq, struct irq_desc **descp)
+#else
 static void ack_x2apic_edge(unsigned int irq)
+#endif
 {
 	ack_x2APIC_irq();
 }
 #endif
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_apic_edge(unsigned int irq, struct irq_desc **descp)
+{
+	irq_complete_move(descp);
+	move_native_irq(irq);
+	ack_APIC_irq();
+}
+#else
 static void ack_apic_edge(unsigned int irq)
 {
-	irq_complete_move(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	irq_complete_move(&desc);
 	move_native_irq(irq);
 	ack_APIC_irq();
 }
+#endif
 
 atomic_t irq_mis_count;
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_apic_level(unsigned int irq, struct irq_desc **descp)
+{
+#else
 static void ack_apic_level(unsigned int irq)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc **descp = &desc;
+#endif
 #ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
+	struct irq_cfg *cfg;
 #endif
 	int do_unmask_irq = 0;
 
-	irq_complete_move(irq);
+	irq_complete_move(descp);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely((*descp)->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
 		mask_IO_APIC_irq(irq);
 	}
@@ -2316,7 +2501,8 @@ static void ack_apic_level(unsigned int
 	* operation to prevent an edge-triggered interrupt escaping meanwhile.
 	* The idea is from Manfred Spraul.  --macro
 	*/
-	i = irq_cfg(irq)->vector;
+	cfg = (*descp)->chip_data;
+	i = cfg->vector;
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2416,22 +2602,21 @@ static inline void init_IO_APIC_traps(vo
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < 16)
+			if (irq < NR_IRQS_LEGACY)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
@@ -2454,7 +2639,11 @@ static void unmask_lapic_irq(unsigned in
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_lapic_irq (unsigned int irq, struct irq_desc **descp)
+#else
 static void ack_lapic_irq (unsigned int irq)
+#endif
 {
 	ack_APIC_irq();
 }
@@ -2575,6 +2764,7 @@ int timer_through_8259 __initdata;
 static inline void __init check_timer(void)
 {
 	struct irq_cfg *cfg = irq_cfg(0);
+	int cpu = smp_processor_id();
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2589,7 +2779,7 @@ static inline void __init check_timer(vo
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2640,7 +2830,7 @@ static inline void __init check_timer(vo
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
 		unmask_IO_APIC_irq(0);
@@ -2669,7 +2859,7 @@ static inline void __init check_timer(vo
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
@@ -2888,22 +3078,27 @@ unsigned int create_irq_nr(unsigned int
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
+	struct irq_cfg *cfg_new = NULL;
+	int cpu;
+	struct irq_desc *desc_new = NULL;
 
+#ifndef CONFIG_SPARSE_IRQ
 	irq_want = nr_irqs - 1;
+#endif
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
+	cpu = smp_processor_id();
 	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -2911,6 +3106,9 @@ unsigned int create_irq_nr(unsigned int
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init() cleared it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
@@ -2930,14 +3128,22 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* save it, in case dynamic_irq_cleanup() clears it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2952,12 +3158,12 @@ static int msi_compose_msg(struct pci_de
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3025,10 +3231,13 @@ static void set_msi_irq_affinity(unsigne
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3040,7 +3249,6 @@ static void set_msi_irq_affinity(unsigne
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 
@@ -3064,10 +3272,13 @@ static void ir_set_msi_irq_affinity(unsi
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3091,7 +3302,6 @@ static void ir_set_msi_irq_affinity(unsi
 		cfg->move_in_progress = 0;
 	}
 
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif
@@ -3176,7 +3386,7 @@ static int setup_msi_irq(struct pci_dev
 #endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
-	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for MSI/MSI-X\n", irq, irq);
 
 	return 0;
 }
@@ -3185,6 +3395,7 @@ static unsigned int build_irq_for_pci_de
 {
 	unsigned int irq;
 
+	/* use 8 bits (bus) + 8 bits (devfn) + 12 bits (index) */
 	irq = dev->bus->number;
 	irq <<= 8;
 	irq |= dev->devfn;
@@ -3199,7 +3410,7 @@ int arch_setup_msi_irq(struct pci_dev *d
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
@@ -3240,7 +3451,8 @@ int arch_setup_msi_irqs(struct pci_dev *
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	/* count down from 0xfff, the top of the 12-bit index range */
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 	sub_handle = 0;
 	list_for_each_entry(desc, &dev->msi_list, list) {
 		irq = create_irq_nr(irq_want--);
@@ -3306,10 +3518,13 @@ static void dmar_msi_set_affinity(unsign
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3321,7 +3536,6 @@ static void dmar_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -3367,10 +3581,13 @@ static void hpet_msi_set_affinity(unsign
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3382,7 +3599,6 @@ static void hpet_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -3448,15 +3664,17 @@ static void set_ht_irq_affinity(unsigned
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif
@@ -3478,13 +3696,13 @@ int arch_setup_ht_irq(unsigned int irq,
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3508,7 +3726,8 @@ int arch_setup_ht_irq(unsigned int irq,
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
 
-		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+		dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for HT\n",
+				 irq, irq);
 	}
 	return err;
 }
@@ -3530,7 +3749,9 @@ int arch_enable_uv_irq(char *irq_name, u
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3539,8 +3760,6 @@ int arch_enable_uv_irq(char *irq_name, u
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3594,6 +3813,7 @@ int __init io_apic_get_redir_entries (in
 
 int __init probe_nr_irqs(void)
 {
+#ifdef CONFIG_SPARSE_IRQ
 	int idx;
 	int nr = 0;
 #ifndef CONFIG_XEN
@@ -3611,10 +3831,11 @@ int __init probe_nr_irqs(void)
 	/* something wrong ? */
 	if (nr < nr_min)
 		nr = nr_min;
-	if (WARN_ON(nr > NR_IRQS))
-		nr = NR_IRQS;
 
 	return nr;
+#else
+	return NR_IRQS;
+#endif
 }
 
 /* --------------------------------------------------------------------------
@@ -3722,7 +3943,7 @@ int io_apic_set_pci_routing (int ioapic,
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
+	if (irq >= NR_IRQS_LEGACY)
 		add_pin_to_irq(irq, ioapic, pin);
 
 	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
@@ -3852,7 +4073,6 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
Index: linux-2.6/arch/x86/kernel/irqinit_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_32.c
+++ linux-2.6/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_64.c
+++ linux-2.6/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c
+++ linux-2.6/arch/x86/mm/init_32.c
@@ -66,6 +66,7 @@ static unsigned long __meminitdata table
 static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
+int after_bootmem;
 
 static __init void *alloc_low_page(unsigned long *phys)
 {
@@ -987,6 +988,8 @@ void __init mem_init(void)
 
 	set_highmem_pages_init();
 
+	after_bootmem = 1;
+
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
Index: linux-2.6/drivers/char/random.c
===================================================================
--- linux-2.6.orig/drivers/char/random.c
+++ linux-2.6/drivers/char/random.c
@@ -558,6 +558,8 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
+#ifndef CONFIG_SPARSE_IRQ
+
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 
 static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
@@ -576,6 +578,33 @@ static void set_timer_rand_state(unsigne
 	irq_timer_state[irq] = state;
 }
 
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
 static struct timer_rand_state input_timer_state;
 
 /*
@@ -933,8 +962,10 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
+#ifndef CONFIG_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
+#endif
 
 	state = get_timer_rand_state(irq);
 
Index: linux-2.6/drivers/pci/htirq.c
===================================================================
--- linux-2.6.orig/drivers/pci/htirq.c
+++ linux-2.6/drivers/pci/htirq.c
@@ -82,6 +82,19 @@ void unmask_ht_irq(unsigned int irq)
 	write_ht_irq_msg(irq, &msg);
 }
 
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	/* use 8 bits (bus) + 8 bits (devfn) + 12 bits (index) */
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -98,6 +111,7 @@ int __ht_create_irq(struct pci_dev *dev,
 	int max_irq;
 	int pos;
 	int irq;
+	unsigned int irq_want;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -125,7 +139,12 @@ int __ht_create_irq(struct pci_dev *dev,
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
+	irq_want = build_irq_for_pci_dev(dev);
+#ifdef CONFIG_SPARSE_IRQ
+	irq = create_irq_nr(irq_want + idx);
+#else
 	irq = create_irq();
+#endif
 
 	if (irq <= 0) {
 		kfree(cfg);
Index: linux-2.6/drivers/pci/intr_remapping.c
===================================================================
--- linux-2.6.orig/drivers/pci/intr_remapping.c
+++ linux-2.6/drivers/pci/intr_remapping.c
@@ -19,17 +19,77 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+	struct irq_2_iommu *iommu;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	iommu = kzalloc_node(sizeof(*iommu), GFP_KERNEL, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+	return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (WARN_ON_ONCE(!desc))
+		return NULL;
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	/*
+	 * alloc irq desc if not allocated already.
+	 */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu_alloc_cpu(irq, -1);
 }
 
+#else /* !CONFIG_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_2_iommu(irq);
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +146,11 @@ int alloc_irte(struct intel_iommu *iommu
 	if (!count)
 		return -1;
 
+#ifndef CONFIG_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
+#endif
 
 	/*
 	 * start the IRTE search from index 0.
Index: linux-2.6/drivers/xen/events.c
===================================================================
--- linux-2.6.orig/drivers/xen/events.c
+++ linux-2.6/drivers/xen/events.c
@@ -141,8 +141,9 @@ static void init_evtchn_cpu_bindings(voi
 	int i;
 
 	/* By default all event channels notify CPU#0. */
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		desc->affinity = cpumask_of_cpu(0);
+	} end_for_each_irq_desc();
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +232,7 @@ static int find_unbound_irq(void)
 	int irq;
 
 	/* Only allocate from dynirq range */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		if (irq_bindcount[irq] == 0)
 			break;
 
@@ -792,7 +793,7 @@ void xen_irq_resume(void)
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +825,7 @@ void __init xen_init_IRQ(void)
 		mask_evtchn(i);
 
 	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-	for_each_irq_nr(i)
+	for (i = 0; i < nr_irqs; i++)
 		irq_bindcount[i] = 0;
 
 	irq_ctx_init(smp_processor_id());
Index: linux-2.6/fs/proc/stat.c
===================================================================
--- linux-2.6.orig/fs/proc/stat.c
+++ linux-2.6/fs/proc/stat.c
@@ -27,6 +27,9 @@ static int show_stat(struct seq_file *p,
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -44,10 +47,9 @@ static int show_stat(struct seq_file *p,
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_desc(j, desc) {
 			sum += kstat_irqs_cpu(j, i);
-
+		} end_for_each_irq_desc();
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -90,14 +92,17 @@ static int show_stat(struct seq_file *p,
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
+#ifdef CONFIG_SPARSE_IRQ
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
+#else
 		seq_printf(p, " %u", per_irq_sum);
-	}
+#endif
+	} end_for_each_irq_desc();
 
 	seq_printf(p,
 		"\nctxt %llu\n"
Index: linux-2.6/fs/proc/interrupts.c
===================================================================
--- linux-2.6.orig/fs/proc/interrupts.c
+++ linux-2.6/fs/proc/interrupts.c
@@ -10,20 +10,31 @@
  */
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
+#ifdef CONFIG_SPARSE_IRQ
+	rcu_read_lock();
+	return seq_list_start(&sparse_irqs_head, *pos);
+#else
 	return (*pos <= nr_irqs) ? pos : NULL;
+#endif
 }
 
 static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
 {
+#ifdef CONFIG_SPARSE_IRQ
+	return seq_list_next(v, &sparse_irqs_head, pos);
+#else
 	(*pos)++;
 	if (*pos > nr_irqs)
 		return NULL;
 	return pos;
+#endif
 }
 
 static void int_seq_stop(struct seq_file *f, void *v)
 {
-	/* Nothing to do */
+#ifdef CONFIG_SPARSE_IRQ
+	rcu_read_unlock();
+#endif
 }
 
 static const struct seq_operations int_seq_ops = {
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h
+++ linux-2.6/include/linux/interrupt.h
@@ -18,6 +18,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -106,11 +106,17 @@ struct irq_chip {
 	void		(*enable)(unsigned int irq);
 	void		(*disable)(unsigned int irq);
 
-	void		(*ack)(unsigned int irq);
 	void		(*mask)(unsigned int irq);
-	void		(*mask_ack)(unsigned int irq);
 	void		(*unmask)(unsigned int irq);
+#ifdef CONFIG_SPARSE_IRQ
+	void		(*ack)(unsigned int irq, struct irq_desc **descp);
+	void		(*mask_ack)(unsigned int irq, struct irq_desc **descp);
+	void		(*eoi)(unsigned int irq, struct irq_desc **descp);
+#else
+	void		(*ack)(unsigned int irq);
+	void		(*mask_ack)(unsigned int irq);
 	void		(*eoi)(unsigned int irq);
+#endif
 
 	void		(*end)(unsigned int irq);
 	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
@@ -129,6 +135,8 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +163,15 @@ struct irq_chip {
  */
 struct irq_desc {
 	unsigned int		irq;
+#ifdef CONFIG_SPARSE_IRQ
+	struct list_head	list;
+	struct list_head	hash_entry;
+	struct timer_rand_state *timer_rand_state;
+	unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+	struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -182,14 +199,60 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern void arch_early_irq_init_work(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+					struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *desc);
+
+#ifndef CONFIG_SPARSE_IRQ
 
+/* could be removed if we get rid of all irq_desc references */
 extern struct irq_desc irq_desc[NR_IRQS];
 
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
+#ifdef CONFIG_GENERIC_HARDIRQS
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
+
+#define end_for_each_irq_desc()
+#endif
+
+static inline void early_irq_init_work(void)
 {
-	return (irq < nr_irqs) ? irq_desc + irq : NULL;
+	arch_early_irq_init_work();
 }
 
+#else
+
+void early_irq_init_work(void);
+extern struct list_head sparse_irqs_head;
+#define for_each_irq_desc(irqX, desc)					\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.next), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.next), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.next), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define for_each_irq_desc_reverse(irqX, desc)				\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.prev), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.prev), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.prev), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define end_for_each_irq_desc() rcu_read_unlock()
+
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()]++)
+#endif
+
 /*
  * Migration helpers for obsolete names, they will go away:
  */
Index: linux-2.6/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.orig/include/linux/kernel_stat.h
+++ linux-2.6/include/linux/kernel_stat.h
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, ksta
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_
 {
 	kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
Index: linux-2.6/kernel/irq/autoprobe.c
===================================================================
--- linux-2.6.orig/kernel/irq/autoprobe.c
+++ linux-2.6/kernel/irq/autoprobe.c
@@ -57,7 +57,7 @@ unsigned long probe_irq_on(void)
 			desc->chip->startup(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/* Wait for longstanding interrupts to trigger. */
 	msleep(20);
@@ -75,7 +75,7 @@ unsigned long probe_irq_on(void)
 				desc->status |= IRQ_PENDING;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/*
 	 * Wait for spurious interrupts to trigger
@@ -99,7 +99,7 @@ unsigned long probe_irq_on(void)
 					mask |= 1 << i;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	return mask;
 }
@@ -135,7 +135,7 @@ unsigned int probe_irq_mask(unsigned lon
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	return mask & val;
@@ -179,7 +179,7 @@ int probe_irq_off(unsigned long val)
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	if (nr_of_irqs > 1)
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -24,9 +24,11 @@
  */
 void dynamic_irq_init(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	/* may be the first use of this irq_desc; allocate it if needed */
+	desc = irq_to_desc_alloc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
@@ -282,13 +284,23 @@ void irq_chip_set_defaults(struct irq_ch
 		chip->end = dummy_irq_chip.end;
 }
 
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc **descp, int irq)
 {
-	if (desc->chip->mask_ack)
+	struct irq_desc *desc = *descp;
+
+	if (desc->chip->mask_ack) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->mask_ack(irq, descp);
+#else
 		desc->chip->mask_ack(irq);
-	else {
+#endif
+	} else {
 		desc->chip->mask(irq);
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->ack(irq, descp);
+#else
 		desc->chip->ack(irq);
+#endif
 	}
 }
 
@@ -351,7 +363,7 @@ handle_level_irq(unsigned int irq, struc
 	irqreturn_t action_ret;
 
 	spin_lock(&desc->lock);
-	mask_ack_irq(desc, irq);
+	mask_ack_irq(&desc, irq);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -428,7 +440,11 @@ handle_fasteoi_irq(unsigned int irq, str
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 out:
+#ifdef CONFIG_SPARSE_IRQ
+	desc->chip->eoi(irq, &desc);
+#else
 	desc->chip->eoi(irq);
+#endif
 
 	spin_unlock(&desc->lock);
 }
@@ -464,13 +480,17 @@ handle_edge_irq(unsigned int irq, struct
 	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
-		mask_ack_irq(desc, irq);
+		mask_ack_irq(&desc, irq);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
+#ifdef CONFIG_SPARSE_IRQ
+	desc->chip->ack(irq, &desc);
+#else
 	desc->chip->ack(irq);
+#endif
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -524,15 +544,25 @@ handle_percpu_irq(unsigned int irq, stru
 
 	kstat_incr_irqs_this_cpu(irq, desc);
 
-	if (desc->chip->ack)
+	if (desc->chip->ack) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->ack(irq, &desc);
+#else
 		desc->chip->ack(irq);
+#endif
+	}
 
 	action_ret = handle_IRQ_event(irq, desc->action);
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
-	if (desc->chip->eoi)
+	if (desc->chip->eoi) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->eoi(irq, &desc);
+#else
 		desc->chip->eoi(irq);
+#endif
+	}
 }
 
 void
@@ -567,8 +597,9 @@ __set_irq_handler(unsigned int irq, irq_
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip)
-			mask_ack_irq(desc, irq);
+		if (desc->chip != &no_irq_chip) {
+			mask_ack_irq(&desc, irq);
+		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -15,9 +15,16 @@
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,299 @@ void handle_bad_irq(unsigned int irq, st
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+	.irq	    = -1U,
+	.status	    = IRQ_DISABLED,
+	.chip	    = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth      = 1,
+	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity   = CPU_MASK_ALL
+#endif
+};
+
+static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+	unsigned long bytes;
+	char *ptr;
+	int node;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	ptr = kzalloc_node(bytes, GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+
+	desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+#ifdef CONFIG_SMP
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+}
+
+static void free_kstat_irqs(struct irq_desc *desc)
+{
+	kfree(desc->kstat_irqs);
+	desc->kstat_irqs = NULL;
+}
+#endif
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+	desc->irq = irq;
+#ifdef CONFIG_SMP
+	desc->cpu = cpu;
+#endif
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	arch_init_chip_data(desc, cpu);
+}
+
+#ifdef CONFIG_SMP
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *desc)
+{
+	free_kstat_irqs(desc);
+	arch_free_chip_data(desc);
+}
+#endif
+/*
+ * Protect the sparse_irqs_free freelist:
+ */
+static DEFINE_SPINLOCK(sparse_irq_lock);
+LIST_HEAD(sparse_irqs_head);
+
+/*
+ * The sparse irqs are in a hash-table as well, for fast lookup:
+ */
+#define SPARSEIRQHASH_BITS          (13 - 1)
+#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
+#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
+#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))
+
+static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
+
+static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
+	[0 ... NR_IRQS_LEGACY-1] = {
+		.irq	    = -1U,
+		.status	    = IRQ_DISABLED,
+		.chip	    = &no_irq_chip,
+		.handle_irq = handle_bad_irq,
+		.depth	    = 1,
+		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+		.affinity   = CPU_MASK_ALL
+#endif
+	}
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+
+void __init __attribute__((weak)) arch_early_irq_init_work(void)
+{
+}
+
+void __init early_irq_init_work(void)
+{
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	/* init_work to init list for sparseirq */
+	for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
+		INIT_LIST_HEAD(sparseirqhash_table + i);
+
+	desc = irq_desc_legacy;
+	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+	for (i = 0; i < legacy_count; i++) {
+		struct list_head *hash_head;
+
+		hash_head = sparseirqhashentry(i);
+		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		list_add_tail(&desc[i].hash_entry, hash_head);
+		list_add_tail(&desc[i].list, &sparse_irqs_head);
+	}
+
+	arch_early_irq_init_work();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+
+	hash_head = sparseirqhashentry(irq);
+
+	/*
+	 * We can walk the hash lockfree, because the hash only
+	 * grows, and we are careful when adding entries to the end:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			return desc;
+	}
+
+	return NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		return desc;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	init_one_irq_desc(irq, desc, cpu);
+
+	/*
+	 * We use RCU's safe list-add method to make
+	 * parallel walking of the hash-list safe:
+	 */
+	list_add_tail_rcu(&desc->hash_entry, hash_head);
+	/*
+	 * Add it to the global list:
+	 */
+	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc_alloc_cpu(irq, -1);
+}
+
+#ifdef CONFIG_SMP
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq && old_desc != desc)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
+		 irq, irq, cpu, node);
+
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
+	list_replace_rcu(&old_desc->list, &desc->list);
+
+	/* free the old one */
+	free_one_irq_desc(old_desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	old_cpu = desc->cpu;
+
+	/* those all static, do move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -62,18 +362,49 @@ struct irq_desc irq_desc[NR_IRQS] __cach
 	}
 };
 
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_desc[irq];
+
+	return NULL;
+}
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu)
+{
+	return old_desc;
+}
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
  */
-static void ack_bad(unsigned int irq)
+static void ack_bad_desc(unsigned int irq, struct irq_desc *desc)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-
 	print_irq_desc(irq, desc);
 	ack_bad_irq(irq);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_bad_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	ack_bad_desc(irq, *descp);
+}
+#else
+static void ack_bad_wrapper(unsigned int irq)
+{
+	ack_bad_desc(irq, irq_to_desc(irq));
+}
+#endif
+
 /*
  * NOP functions
  */
@@ -81,6 +412,15 @@ static void noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void noop_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void noop_wrapper(unsigned int irq)
+#endif
+{
+	noop(irq);
+}
+
 static unsigned int noop_ret(unsigned int irq)
 {
 	return 0;
@@ -95,7 +435,7 @@ struct irq_chip no_irq_chip = {
 	.shutdown	= noop,
 	.enable		= noop,
 	.disable	= noop,
-	.ack		= ack_bad,
+	.ack		= ack_bad_wrapper,
 	.end		= noop,
 };
 
@@ -109,7 +449,7 @@ struct irq_chip dummy_irq_chip = {
 	.shutdown	= noop,
 	.enable		= noop,
 	.disable	= noop,
-	.ack		= noop,
+	.ack		= noop_wrapper,
 	.mask		= noop,
 	.unmask		= noop,
 	.end		= noop,
@@ -179,8 +519,13 @@ unsigned int __do_IRQ(unsigned int irq)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->chip->ack)
+		if (desc->chip->ack) {
+#ifdef CONFIG_SPARSE_IRQ
+			desc->chip->ack(irq, &desc);
+#else
 			desc->chip->ack(irq);
+#endif
+		}
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -191,8 +536,13 @@ unsigned int __do_IRQ(unsigned int irq)
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->chip->ack)
+	if (desc->chip->ack) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->ack(irq, &desc);
+#else
 		desc->chip->ack(irq);
+#endif
+	}
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
@@ -261,17 +611,25 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	} end_for_each_irq_desc();
+#endif
+}
+#endif
+
+#ifdef CONFIG_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc->kstat_irqs[cpu];
 }
 #endif
+EXPORT_SYMBOL(kstat_irqs_cpu);
+
Index: linux-2.6/arch/x86/kernel/irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq.c
+++ linux-2.6/arch/x86/kernel/irq.c
@@ -99,25 +99,37 @@ static int show_other_interrupts(struct
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i, j;
 	struct irqaction *action;
 	struct irq_desc *desc;
+	int head = 0;
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc = list_entry(v, struct irq_desc, list);
+	i = desc->irq;
+	if (&desc->list == sparse_irqs_head.next)
+		head = 1;
+#else
+	i = *(loff_t *) v;
 	if (i > nr_irqs)
 		return 0;
 
 	if (i == nr_irqs)
 		return show_other_interrupts(p);
+	if (i == 0)
+		head = 1;
+
+	desc = irq_to_desc(i);
+#endif
 
 	/* print header */
-	if (i == 0) {
+	if (head) {
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	desc = irq_to_desc(i);
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
@@ -148,6 +160,12 @@ int show_interrupts(struct seq_file *p,
 	seq_putc(p, '\n');
 out:
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+#ifdef CONFIG_SPARSE_IRQ
+	if (&desc->list == sparse_irqs_head.prev)
+		show_other_interrupts(p);
+#endif
+
 	return 0;
 }
 
Index: linux-2.6/include/linux/irqnr.h
===================================================================
--- linux-2.6.orig/include/linux/irqnr.h
+++ linux-2.6/include/linux/irqnr.h
@@ -7,18 +7,11 @@
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
+# define end_for_each_irq_desc()
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
-	     irq >= 0; irq--, desc--)
+static inline early_sparse_irq_init_work(void)
+{
+}
 #endif
 
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #endif
Index: linux-2.6/arch/x86/kernel/irq_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_32.c
+++ linux-2.6/arch/x86/kernel/irq_32.c
@@ -254,7 +254,7 @@ void fixup_irqs(cpumask_t map)
 			desc->chip->set_affinity(irq, mask);
 		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 #if 0
 	barrier();
Index: linux-2.6/arch/x86/kernel/irq_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_64.c
+++ linux-2.6/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ void fixup_irqs(cpumask_t map)
 			printk("Broke affinity for irq %i\n", irq);
 		else if (!set_affinity)
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 	/* That doesn't seem sufficient.  Give it 1ms. */
 	local_irq_enable();
Index: linux-2.6/kernel/irq/proc.c
===================================================================
--- linux-2.6.orig/kernel/irq/proc.c
+++ linux-2.6/kernel/irq/proc.c
@@ -243,7 +243,8 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc)
+	for_each_irq_desc(irq, desc) {
 		register_irq_proc(irq, desc);
+	} end_for_each_irq_desc();
 }
 
Index: linux-2.6/kernel/irq/spurious.c
===================================================================
--- linux-2.6.orig/kernel/irq/spurious.c
+++ linux-2.6/kernel/irq/spurious.c
@@ -99,7 +99,7 @@ static int misrouted_irq(int irq)
 
 		if (try_one_irq(i, desc))
 			ok = 1;
-	}
+	} end_for_each_irq_desc();
 	/* So the caller can adjust the irq error counts */
 	return ok;
 }
@@ -122,7 +122,7 @@ static void poll_spurious_irqs(unsigned
 			continue;
 
 		try_one_irq(i, desc);
-	}
+	} end_for_each_irq_desc();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -611,6 +611,8 @@ asmlinkage void __init start_kernel(void
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some links before init_ISA_irqs() */
+	early_irq_init_work();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
Index: linux-2.6/arch/x86/include/asm/irq_vectors.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/irq_vectors.h
+++ linux-2.6/arch/x86/include/asm/irq_vectors.h
@@ -101,6 +101,8 @@
 #define LAST_VM86_IRQ		15
 #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
 
+#define NR_IRQS_LEGACY		16
+
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
Index: linux-2.6/arch/x86/kernel/i8259.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/i8259.c
+++ linux-2.6/arch/x86/kernel/i8259.c
@@ -36,12 +36,21 @@ static int i8259A_auto_eoi;
 DEFINE_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
 
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_and_ack_8259A_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void mask_and_ack_8259A_wrapper(unsigned int irq)
+#endif
+{
+	mask_and_ack_8259A(irq);
+}
+
 struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
 	.mask		= disable_8259A_irq,
 	.disable	= disable_8259A_irq,
 	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
+	.mask_ack	= mask_and_ack_8259A_wrapper,
 };
 
 /*
@@ -78,6 +87,15 @@ void disable_8259A_irq(unsigned int irq)
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void disable_8259A_irq_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void disable_8259A_irq_wrapper(unsigned int irq)
+#endif
+{
+	disable_8259A_irq(irq);
+}
+
 void enable_8259A_irq(unsigned int irq)
 {
 	unsigned int mask = ~(1 << irq);
@@ -348,9 +366,9 @@ void init_8259A(int auto_eoi)
 		 * In AEOI mode we just have to mask the interrupt
 		 * when acking.
 		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
+		i8259A_chip.mask_ack = disable_8259A_irq_wrapper;
 	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
+		i8259A_chip.mask_ack = mask_and_ack_8259A_wrapper;
 
 	udelay(100);		/* wait for 8259A to initialize */
 
Index: linux-2.6/arch/x86/kernel/uv_irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/uv_irq.c
+++ linux-2.6/arch/x86/kernel/uv_irq.c
@@ -18,6 +18,15 @@ static void uv_noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_noop_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void uv_noop_wrapper(unsigned int irq)
+#endif
+{
+	uv_noop(irq);
+}
+
 static unsigned int uv_noop_ret(unsigned int irq)
 {
 	return 0;
@@ -28,16 +37,25 @@ static void uv_ack_apic(unsigned int irq
 	ack_APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_ack_apic_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void uv_ack_apic_wrapper(unsigned int irq)
+#endif
+{
+	uv_ack_apic(irq);
+}
+
 struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
 	.enable		= uv_noop,
 	.disable	= uv_noop,
-	.ack		= uv_noop,
+	.ack		= uv_noop_wrapper,
 	.mask		= uv_noop,
 	.unmask		= uv_noop,
-	.eoi		= uv_ack_apic,
+	.eoi		= uv_ack_apic_wrapper,
 	.end		= uv_noop,
 };
 


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v10
  2008-11-10  9:51                             ` [PATCH] sparse_irq aka dyn_irq v10 Yinghai Lu
@ 2008-11-10  9:53                               ` Ingo Molnar
  2008-11-10  9:55                                 ` Yinghai Lu
  0 siblings, 1 reply; 66+ messages in thread
From: Ingo Molnar @ 2008-11-10  9:53 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org


* Yinghai Lu <yinghai@kernel.org> wrote:

> +#ifdef CONFIG_SPARSE_IRQ
> +static void uv_ack_apic_wrapper(unsigned int irq, struct irq_desc **descp)
> +#else
> +static void uv_ack_apic_wrapper(unsigned int irq)
> +#endif

hm, why not change it to the new prototype unconditionally? (just pass 
in NULL or so)

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-10  9:40                           ` Ingo Molnar
  2008-11-10  9:51                             ` [PATCH] sparse_irq aka dyn_irq v10 Yinghai Lu
@ 2008-11-10  9:55                             ` Andrew Morton
  2008-11-10 10:00                               ` Yinghai Lu
  1 sibling, 1 reply; 66+ messages in thread
From: Andrew Morton @ 2008-11-10  9:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, 10 Nov 2008 10:40:33 +0100 Ingo Molnar <mingo@elte.hu> wrote:

> > >> @@ -987,6 +988,8 @@ void __init mem_init(void)
> > >>
> > >>       set_highmem_pages_init();
> > >>
> > >> +     after_bootmem = 1;
> > >
> > > this hack can go away once we have a proper percpu_alloc() that can be
> > > used early enough.
> > 
> > where is that fancy patch? current percpu_alloc(), will keep big 
> > pointer in array..., instead of put that pointer in percpu_area
> > 
> > 64bit has that after_bootmem already.
> 
> or at least introduce a "bootmem agnostic" allocator instead of 
> open-coding the after_bootmem flag.
> 
> Something like:
> 
>   early_kzalloc()
> 
> ?
> 
> Andrew, any preferences?

My mind reading ain't what it was, and this after_bootmem flag is
write-only in this patch.

So what's all this about?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v10
  2008-11-10  9:53                               ` Ingo Molnar
@ 2008-11-10  9:55                                 ` Yinghai Lu
  2008-11-10  9:57                                   ` Ingo Molnar
  0 siblings, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-10  9:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org

Ingo Molnar wrote:
> * Yinghai Lu <yinghai@kernel.org> wrote:
> 
>> +#ifdef CONFIG_SPARSE_IRQ
>> +static void uv_ack_apic_wrapper(unsigned int irq, struct irq_desc **descp)
>> +#else
>> +static void uv_ack_apic_wrapper(unsigned int irq)
>> +#endif
> 
> hm, why not change it to the new prototype unconditionally? (just pass 
> in NULL or so)

that is sitting on irq_chip, and if change that, we need to go over all those kind of funcs and structure of other platforms.

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v10
  2008-11-10  9:55                                 ` Yinghai Lu
@ 2008-11-10  9:57                                   ` Ingo Molnar
  0 siblings, 0 replies; 66+ messages in thread
From: Ingo Molnar @ 2008-11-10  9:57 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org


* Yinghai Lu <yinghai@kernel.org> wrote:

> Ingo Molnar wrote:
> > * Yinghai Lu <yinghai@kernel.org> wrote:
> > 
> >> +#ifdef CONFIG_SPARSE_IRQ
> >> +static void uv_ack_apic_wrapper(unsigned int irq, struct irq_desc **descp)
> >> +#else
> >> +static void uv_ack_apic_wrapper(unsigned int irq)
> >> +#endif
> > 
> > hm, why not change it to the new prototype unconditionally? (just pass 
> > in NULL or so)
> 
> that is sitting on irq_chip, and if change that, we need to go over 
> all those kind of funcs and structure of other platforms.

okay, lets not go there just yet.

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-10  9:55                             ` [RFC PATCH] sparse_irq aka dyn_irq Andrew Morton
@ 2008-11-10 10:00                               ` Yinghai Lu
  2008-11-10 10:03                                 ` Ingo Molnar
  0 siblings, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-10 10:00 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org

Andrew Morton wrote:
> On Mon, 10 Nov 2008 10:40:33 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> 
>>>>> @@ -987,6 +988,8 @@ void __init mem_init(void)
>>>>>
>>>>>       set_highmem_pages_init();
>>>>>
>>>>> +     after_bootmem = 1;
>>>> this hack can go away once we have a proper percpu_alloc() that can be
>>>> used early enough.
>>> where is that fancy patch? current percpu_alloc(), will keep big 
>>> pointer in array..., instead of put that pointer in percpu_area
>>>
>>> 64bit has that after_bootmem already.
>> or at least introduce a "bootmem agnostic" allocator instead of 
>> open-coding the after_bootmem flag.
>>
>> Something like:
>>
>>   early_kzalloc()
>>
>> ?
>>
>> Andrew, any preferences?
> 
> My mind reading ain't what it was, and this after_bootmem flag is
> write-only in this patch.
> 
> So what's all this about?

if i use alloc_bootmem to get some memory, and later after_bootmem, can I use kfree to free it?

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-10 10:00                               ` Yinghai Lu
@ 2008-11-10 10:03                                 ` Ingo Molnar
  2008-11-10 10:05                                   ` Yinghai Lu
                                                     ` (2 more replies)
  0 siblings, 3 replies; 66+ messages in thread
From: Ingo Molnar @ 2008-11-10 10:03 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org


* Yinghai Lu <yinghai@kernel.org> wrote:

> Andrew Morton wrote:
> > On Mon, 10 Nov 2008 10:40:33 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> > 
> >>>>> @@ -987,6 +988,8 @@ void __init mem_init(void)
> >>>>>
> >>>>>       set_highmem_pages_init();
> >>>>>
> >>>>> +     after_bootmem = 1;
> >>>> this hack can go away once we have a proper percpu_alloc() that can be
> >>>> used early enough.
> >>> where is that fancy patch? current percpu_alloc(), will keep big 
> >>> pointer in array..., instead of put that pointer in percpu_area
> >>>
> >>> 64bit has that after_bootmem already.
> >> or at least introduce a "bootmem agnostic" allocator instead of 
> >> open-coding the after_bootmem flag.
> >>
> >> Something like:
> >>
> >>   early_kzalloc()
> >>
> >> ?
> >>
> >> Andrew, any preferences?
> > 
> > My mind reading ain't what it was, and this after_bootmem flag is
> > write-only in this patch.
> > 
> > So what's all this about?
> 
> if i use alloc_bootmem to get some memory, and later after_bootmem, 
> can I use kfree to free it?

hm, no. If we used alloc_bootmem(), then we must not free it after 
after_bootmem has been set.

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-10 10:03                                 ` Ingo Molnar
@ 2008-11-10 10:05                                   ` Yinghai Lu
  2008-11-10 10:09                                     ` Ingo Molnar
  2008-11-11  6:28                                   ` [PATCH] sparse_irq aka dyn_irq v11 Yinghai Lu
       [not found]                                   ` <491A9F87.8040403@kernel.org>
  2 siblings, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-10 10:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org

Ingo Molnar wrote:
> * Yinghai Lu <yinghai@kernel.org> wrote:
> 
>> Andrew Morton wrote:
>>> On Mon, 10 Nov 2008 10:40:33 +0100 Ingo Molnar <mingo@elte.hu> wrote:
>>>
>>>>>>> @@ -987,6 +988,8 @@ void __init mem_init(void)
>>>>>>>
>>>>>>>       set_highmem_pages_init();
>>>>>>>
>>>>>>> +     after_bootmem = 1;
>>>>>> this hack can go away once we have a proper percpu_alloc() that can be
>>>>>> used early enough.
>>>>> where is that fancy patch? current percpu_alloc(), will keep big 
>>>>> pointer in array..., instead of put that pointer in percpu_area
>>>>>
>>>>> 64bit has that after_bootmem already.
>>>> or at least introduce a "bootmem agnostic" allocator instead of 
>>>> open-coding the after_bootmem flag.
>>>>
>>>> Something like:
>>>>
>>>>   early_kzalloc()
>>>>
>>>> ?
>>>>
>>>> Andrew, any preferences?
>>> My mind reading ain't what it was, and this after_bootmem flag is
>>> write-only in this patch.
>>>
>>> So what's all this about?
>> if i use alloc_bootmem to get some memory, and later after_bootmem, 
>> can I use kfree to free it?
> 
> hm, no. If we used alloc_bootmem(), then we must not free it after 
> after_bootmem has been set.

ok, let keep irq_desc for legacy irqs not movable...

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-10 10:05                                   ` Yinghai Lu
@ 2008-11-10 10:09                                     ` Ingo Molnar
  2008-11-10 19:47                                       ` Yinghai Lu
  0 siblings, 1 reply; 66+ messages in thread
From: Ingo Molnar @ 2008-11-10 10:09 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org


* Yinghai Lu <yinghai@kernel.org> wrote:

> Ingo Molnar wrote:
> > * Yinghai Lu <yinghai@kernel.org> wrote:
> > 
> >> Andrew Morton wrote:
> >>> On Mon, 10 Nov 2008 10:40:33 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> >>>
> >>>>>>> @@ -987,6 +988,8 @@ void __init mem_init(void)
> >>>>>>>
> >>>>>>>       set_highmem_pages_init();
> >>>>>>>
> >>>>>>> +     after_bootmem = 1;
> >>>>>> this hack can go away once we have a proper percpu_alloc() that can be
> >>>>>> used early enough.
> >>>>> where is that fancy patch? current percpu_alloc(), will keep big 
> >>>>> pointer in array..., instead of put that pointer in percpu_area
> >>>>>
> >>>>> 64bit has that after_bootmem already.
> >>>> or at least introduce a "bootmem agnostic" allocator instead of 
> >>>> open-coding the after_bootmem flag.
> >>>>
> >>>> Something like:
> >>>>
> >>>>   early_kzalloc()
> >>>>
> >>>> ?
> >>>>
> >>>> Andrew, any preferences?
> >>> My mind reading ain't what it was, and this after_bootmem flag is
> >>> write-only in this patch.
> >>>
> >>> So what's all this about?
> >> if i use alloc_bootmem to get some memory, and later after_bootmem, 
> >> can I use kfree to free it?
> > 
> > hm, no. If we used alloc_bootmem(), then we must not free it after 
> > after_bootmem has been set.
> 
> ok, let keep irq_desc for legacy irqs not movable...

most of them are movable right now, correct? If we restrict their 
movability now that might surprise existing usecases negatively.

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC PATCH] sparse_irq aka dyn_irq
  2008-11-10 10:09                                     ` Ingo Molnar
@ 2008-11-10 19:47                                       ` Yinghai Lu
  0 siblings, 0 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-10 19:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, Nov 10, 2008 at 2:09 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Yinghai Lu <yinghai@kernel.org> wrote:
>
>> Ingo Molnar wrote:
>> > * Yinghai Lu <yinghai@kernel.org> wrote:
>> >
>> >> Andrew Morton wrote:
>> >>> On Mon, 10 Nov 2008 10:40:33 +0100 Ingo Molnar <mingo@elte.hu> wrote:
>> >>>
>> >>>>>>> @@ -987,6 +988,8 @@ void __init mem_init(void)
>> >>>>>>>
>> >>>>>>>       set_highmem_pages_init();
>> >>>>>>>
>> >>>>>>> +     after_bootmem = 1;
>> >>>>>> this hack can go away once we have a proper percpu_alloc() that can be
>> >>>>>> used early enough.
>> >>>>> where is that fancy patch? current percpu_alloc(), will keep big
>> >>>>> pointer in array..., instead of put that pointer in percpu_area
>> >>>>>
>> >>>>> 64bit has that after_bootmem already.
>> >>>> or at least introduce a "bootmem agnostic" allocator instead of
>> >>>> open-coding the after_bootmem flag.
>> >>>>
>> >>>> Something like:
>> >>>>
>> >>>>   early_kzalloc()
>> >>>>
>> >>>> ?
>> >>>>
>> >>>> Andrew, any preferences?
>> >>> My mind reading ain't what it was, and this after_bootmem flag is
>> >>> write-only in this patch.
>> >>>
>> >>> So what's all this about?
>> >> if i use alloc_bootmem to get some memory, and later after_bootmem,
>> >> can I use kfree to free it?
>> >
>> > hm, no. If we used alloc_bootmem(), then we must not free it after
>> > after_bootmem has been set.
>>
>> ok, let keep irq_desc for legacy irqs not movable...
>
> most of them are movable right now, correct? If we restrict their
> movability now that might surprise existing usecases negatively.

i mean irq_desc will not be allocated one by one on new cpus...

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] sparse_irq aka dyn_irq v11
  2008-11-10 10:03                                 ` Ingo Molnar
  2008-11-10 10:05                                   ` Yinghai Lu
@ 2008-11-11  6:28                                   ` Yinghai Lu
       [not found]                                   ` <491A9F87.8040403@kernel.org>
  2 siblings, 0 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-11  6:28 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton, Thomas Gleixner, H. Peter Anvin
  Cc: linux-kernel@vger.kernel.org

Done, please check it.
Fixed the compile problems on every config.

---

From: Yinghai Lu <yinghai@kernel.org>
Subject: sparseirq v11

impact: new feature sparseirq

add some kind of hash table, as Ingo suggested.
remove dyna_array
when sparse_irq is used, use kzalloc_node to get irq_desc, irq_cfg
  use desc->chip_data for x86 to store irq_cfg
  make irq_desc to go with affinity aka irq_desc moving etc
  call move_irq_desc in irq_complete_move()
  need to add (struct irq_desc **descp) to ack_edge/level to make sure desc gets updated
  try to pass desc and cfg as much as possible to avoid list lookups.
  legacy irq_desc entries are not moved, because they are allocated via a static array

  for logical apic mode, we need to add move_desc_in_progress_in_same_domain, otherwise it will not get moved. ==> it could also need two phases to get irq_desc moved.
	for example: 0xff is old affinity, and need to set 0xf, and then set to 0xf0.
	[ or we need to change domain definition to cpus on the same node ? ]

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so we assume the user space program should update /proc/irq/XX/smp_affinity to 03 or 0f first at boot,
or should we change irq_default_affinity?

  for physical apic mode it is much simpler
on 4 sockets 16 cores system
irq_desc is moving..
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig                   |   11 
 arch/x86/include/asm/hpet.h        |    5 
 arch/x86/include/asm/irq_vectors.h |    2 
 arch/x86/kernel/hpet.c             |    8 
 arch/x86/kernel/i8259.c            |   37 +-
 arch/x86/kernel/io_apic.c          |  665 ++++++++++++++++++++++++++-----------
 arch/x86/kernel/irq.c              |   24 +
 arch/x86/kernel/irq_32.c           |    2 
 arch/x86/kernel/irq_64.c           |   16 
 arch/x86/kernel/irqinit_32.c       |    3 
 arch/x86/kernel/irqinit_64.c       |    3 
 arch/x86/kernel/uv_irq.c           |   26 +
 arch/x86/mm/init_32.c              |    3 
 drivers/char/random.c              |   31 +
 drivers/pci/htirq.c                |   27 +
 drivers/pci/intel-iommu.c          |    8 
 drivers/pci/intr_remapping.c       |   62 +++
 drivers/pci/msi.c                  |   42 +-
 drivers/xen/events.c               |    9 
 fs/proc/interrupts.c               |   13 
 fs/proc/stat.c                     |   17 
 include/linux/dmar.h               |    5 
 include/linux/htirq.h              |    6 
 include/linux/interrupt.h          |    2 
 include/linux/irq.h                |   78 ++++
 include/linux/irqnr.h              |   15 
 include/linux/kernel_stat.h        |   14 
 include/linux/msi.h                |    6 
 init/main.c                        |    2 
 kernel/irq/autoprobe.c             |   10 
 kernel/irq/chip.c                  |   82 +++-
 kernel/irq/handle.c                |  388 ++++++++++++++++++++-
 kernel/irq/migration.c             |   18 +
 kernel/irq/proc.c                  |    3 
 kernel/irq/spurious.c              |    4 
 35 files changed, 1365 insertions(+), 282 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/Kconfig	2011-02-05 16:34:42.000000000 -0800
@@ -236,6 +236,17 @@
 	def_bool y
 	depends on X86_VOYAGER
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on PCI_MSI || HT_IRQ
+	default y
+	help
+	  This enables support for sparse irq, esp for msi/msi-x. the irq
+	  number will be bus/dev/fn + 12bit. You may need if you have lots of
+	  cards supports msi-x installed.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/kernel/io_apic.c	2011-02-05 17:16:28.000000000 -0800
@@ -108,94 +108,240 @@
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+	int apic, pin;
+	struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+	struct irq_pin_list *pin;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	pin = kzalloc_node(sizeof(*pin), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+	return pin;
+}
+
 struct irq_cfg {
-	unsigned int irq;
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_SPARSE_IRQ
+	u8 move_desc_in_progress_in_same_domain : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_early_irq_init_work(void)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int count;
+	int i;
+#ifdef CONFIG_SPARSE_IRQ
+	int count_desc = NR_IRQS_LEGACY;
+#else
+	int count_desc = NR_IRQS;
+#endif
+
+	cfg = irq_cfgx;
+	count = ARRAY_SIZE(irq_cfgx);
 
+	BUG_ON(count > count_desc);
+
+	for (i = 0; i < count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
+
+#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
+
+	return cfg;
 }
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 {
-	return irq_cfg(irq);
+	struct irq_cfg *cfg;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+	return cfg;
 }
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+static void free_irq_cfg(struct irq_cfg *cfg)
+{
+	kfree(cfg);
+}
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
 
-struct irq_pin_list {
-	int apic, pin;
-	struct irq_pin_list *next;
-};
+	cfg = desc->chip_data;
+	if (!cfg)
+		desc->chip_data = get_one_free_irq_cfg(cpu);
+}
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu);
 
-static void __init irq_2_pin_init(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
+
+	cfg = get_one_free_irq_cfg(cpu);
+	desc->chip_data = cfg;
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	old_cfg = old_desc->chip_data;
 
-	irq_2_pin_ptr = &pin[0];
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+static void free_irq_2_pin(struct irq_cfg *cfg);
+
+void arch_free_chip_data(struct irq_desc *desc)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_cfg *cfg;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	cfg = desc->chip_data;
+	if (cfg) {
+		free_irq_2_pin(cfg);
+		free_irq_cfg(cfg);
+		desc->chip_data = NULL;
+	}
+}
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
+{
+	struct irq_pin_list *old_entry, *tail, *entry;
+
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
+
+	entry = get_one_free_irq_2_pin(cpu);
+	entry->apic = old_entry->apic;
+	entry->pin = old_entry->pin;
+	cfg->irq_2_pin = entry;
+	tail = entry;
+	old_entry = old_entry->next;
+
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		entry->apic = old_entry->apic;
+		entry->pin = old_entry->pin;
+		tail->next = entry;
+		tail = entry;
+		old_entry = old_entry->next;
+	}
+
+	tail->next = NULL;
+}
+
+static void free_irq_2_pin(struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry, *next;
+
+	entry = cfg->irq_2_pin;
+
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	cfg->irq_2_pin = NULL;
+}
+
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+#ifdef CONFIG_SMP
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means domain is not changed */
+		cpumask_t tmp;
+
+		cpus_and(tmp, desc->affinity, mask);
+		if (cpus_empty(tmp))
+			cfg->move_desc_in_progress_in_same_domain = 1;
+	}
+#endif
+}
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
 }
 
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+}
+
+#endif
+
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -237,11 +383,10 @@
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(unsigned int irq, struct irq_cfg *cfg)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	entry = cfg->irq_2_pin;
@@ -323,13 +468,12 @@
 }
 
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
 	int apic, pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
 
-	cfg = irq_cfg(irq);
 	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
@@ -359,7 +503,7 @@
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
@@ -373,10 +517,13 @@
 	if (cpus_empty(tmp))
 		return;
 
-	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
@@ -384,9 +531,8 @@
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
-	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__target_IO_APIC_irq(irq, dest, cfg->vector);
+	__target_IO_APIC_irq(irq, dest, cfg);
 	desc->affinity = mask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
@@ -397,16 +543,13 @@
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,20 +564,31 @@
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
 }
 
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int cpu = smp_processor_id();
+
+	/* first time to refer irq_cfg, so with new */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	cfg = desc->chip_data;
+	add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+}
+
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -451,18 +605,16 @@
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
 				int mask_and, int mask_or,
 				void (*final)(struct irq_pin_list *entry))
 {
 	int pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	cfg = irq_cfg(irq);
 	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
 		unsigned int reg;
 		pin = entry->pin;
@@ -475,9 +627,9 @@
 	}
 }
 
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
 #ifdef CONFIG_X86_64
@@ -492,44 +644,62 @@
 	readl(&io_apic->data);
 }
 
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
 			IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
 			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
 
-static void mask_IO_APIC_irq (unsigned int irq)
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_IO_APIC_irq(unsigned int irq, struct irq_desc **descp)
+{
+#else
+static void mask_IO_APIC_irq(unsigned int irq)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc **descp = &desc;
+#endif
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
+	BUG_ON(!cfg);
+
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__mask_IO_APIC_irq(irq);
+	__mask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+#ifdef CONFIG_SPARSE_IRQ
+static void unmask_IO_APIC_irq(unsigned int irq, struct irq_desc **descp)
 {
+#else
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc **descp = &desc;
+#endif
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_IO_APIC_irq(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -809,7 +979,7 @@
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1034,7 +1204,7 @@
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1050,16 +1220,13 @@
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
 
-	cfg = irq_cfg(irq);
+	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+		return -EBUSY;
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-		return -EBUSY;
-
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_t tmp;
@@ -1113,24 +1280,22 @@
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1148,14 +1313,16 @@
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
+	} end_for_each_irq_desc();
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
@@ -1205,7 +1372,8 @@
 {
 	struct irq_desc *desc;
 
-	desc = irq_to_desc(irq);
+	/* could be first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1310,7 +1478,7 @@
 	cfg = irq_cfg(irq);
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1327,12 +1495,12 @@
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
 	ioapic_register_intr(irq, trigger);
-	if (irq < 16)
+	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic, pin, entry);
@@ -1434,6 +1602,7 @@
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1523,8 +1692,10 @@
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1535,7 +1706,7 @@
 			entry = entry->next;
 		}
 		printk("\n");
-	}
+	} end_for_each_irq_desc();
 
 	printk(KERN_INFO ".................................... done.\n");
 
@@ -2008,14 +2179,16 @@
 {
 	int was_pending = 0;
 	unsigned long flags;
+	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_IO_APIC_irq(irq);
+	cfg = irq_cfg(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
@@ -2078,10 +2251,9 @@
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void migrate_ioapic_irq(int irq, struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
 	int modify_ioapic_rte;
@@ -2095,18 +2267,19 @@
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	desc = irq_to_desc(irq);
 	modify_ioapic_rte = desc->status & IRQ_LEVEL;
 	if (modify_ioapic_rte) {
 		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		__target_IO_APIC_irq(irq, dest, cfg);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -2128,14 +2301,18 @@
 	desc->affinity = mask;
 }
 
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level(int irq, struct irq_desc *desc)
 {
 	int ret = -1;
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
 
+#ifdef CONFIG_SPARSE_IRQ
+	mask_IO_APIC_irq(irq, &desc);
+#else
 	mask_IO_APIC_irq(irq);
+#endif
 
-	if (io_apic_level_ack_pending(irq)) {
+	if (io_apic_level_ack_pending(irq, cfg)) {
 		/*
 		 * Interrupt in progress. Migrating irq now will change the
 		 * vector information in the IO-APIC RTE and that will confuse
@@ -2147,14 +2324,19 @@
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq(irq, desc->pending_mask);
+	migrate_ioapic_irq(irq, desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
 	cpus_clear(desc->pending_mask);
 
 unmask:
+#ifdef CONFIG_SPARSE_IRQ
+	unmask_IO_APIC_irq(irq, &desc);
+#else
 	unmask_IO_APIC_irq(irq);
+#endif
+
 	return ret;
 }
 
@@ -2178,7 +2360,7 @@
 			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
@@ -2191,11 +2373,11 @@
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
 		desc->pending_mask = mask;
-		migrate_irq_remapped_level(irq);
+		migrate_irq_remapped_level(irq, desc);
 		return;
 	}
 
-	migrate_ioapic_irq(irq, mask);
+	migrate_ioapic_irq(irq, desc, mask);
 }
 #endif
 
@@ -2236,19 +2418,40 @@
 	irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = *descp;
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_SPARSE_IRQ
+		if (likely(!cfg->move_desc_in_progress_in_same_domain))
+			return;
+
+		/* domain is not change, but affinity is changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_in_progress_in_same_domain = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_SPARSE_IRQ
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
@@ -2256,43 +2459,76 @@
 	}
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 #ifdef CONFIG_INTR_REMAP
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_x2apic_level(unsigned int irq, struct irq_desc **descp)
+#else
 static void ack_x2apic_level(unsigned int irq)
+#endif
 {
 	ack_x2APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_x2apic_edge(unsigned int irq, struct irq_desc **descp)
+#else
 static void ack_x2apic_edge(unsigned int irq)
+#endif
 {
 	ack_x2APIC_irq();
 }
 #endif
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_apic_edge(unsigned int irq, struct irq_desc **descp)
+{
+	irq_complete_move(descp);
+#ifdef CONFIG_SMP
+	move_native_irq(irq, descp);
+#endif
+	ack_APIC_irq();
+}
+#else
 static void ack_apic_edge(unsigned int irq)
 {
-	irq_complete_move(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	irq_complete_move(&desc);
 	move_native_irq(irq);
 	ack_APIC_irq();
 }
+#endif
 
 atomic_t irq_mis_count;
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_apic_level(unsigned int irq, struct irq_desc **descp)
+{
+#else
 static void ack_apic_level(unsigned int irq)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc **descp = &desc;
+#endif
 #ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
 #endif
+	struct irq_cfg *cfg;
 	int do_unmask_irq = 0;
 
-	irq_complete_move(irq);
+	irq_complete_move(descp);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely((*descp)->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
+#ifdef CONFIG_SPARSE_IRQ
+		mask_IO_APIC_irq(irq, descp);
+#else
 		mask_IO_APIC_irq(irq);
+#endif
 	}
 #endif
 
@@ -2316,7 +2552,8 @@
 	* operation to prevent an edge-triggered interrupt escaping meanwhile.
 	* The idea is from Manfred Spraul.  --macro
 	*/
-	i = irq_cfg(irq)->vector;
+	cfg = (*descp)->chip_data;
+	i = cfg->vector;
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2355,17 +2592,27 @@
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(irq))
+		cfg = (*descp)->chip_data;
+#ifdef CONFIG_SPARSE_IRQ
+		if (!io_apic_level_ack_pending(irq, cfg)) {
+# ifdef CONFIG_SMP
+			move_masked_irq(irq, descp);
+# endif
+		}
+		unmask_IO_APIC_irq(irq, descp);
+#else
+		if (!io_apic_level_ack_pending(irq, cfg))
 			move_masked_irq(irq);
 		unmask_IO_APIC_irq(irq);
+#endif
 	}
 
 #ifdef CONFIG_X86_32
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		__mask_and_edge_IO_APIC_irq(cfg);
+		__unmask_and_level_IO_APIC_irq(cfg);
 		spin_unlock(&ioapic_lock);
 	}
 #endif
@@ -2416,29 +2663,32 @@
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < 16)
+			if (irq < NR_IRQS_LEGACY)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
  * The local APIC irq-chip implementation:
  */
 
-static void mask_lapic_irq(unsigned int irq)
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_lapic_irq (unsigned int irq, struct irq_desc **descp)
+#else
+static void mask_lapic_irq (unsigned int irq)
+#endif
 {
 	unsigned long v;
 
@@ -2446,7 +2696,11 @@
 	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void unmask_lapic_irq(unsigned int irq)
+#ifdef CONFIG_SPARSE_IRQ
+static void unmask_lapic_irq (unsigned int irq, struct irq_desc **descp)
+#else
+static void unmask_lapic_irq (unsigned int irq)
+#endif
 {
 	unsigned long v;
 
@@ -2454,7 +2708,11 @@
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_lapic_irq (unsigned int irq, struct irq_desc **descp)
+#else
 static void ack_lapic_irq (unsigned int irq)
+#endif
 {
 	ack_APIC_irq();
 }
@@ -2574,7 +2832,11 @@
  */
 static inline void __init check_timer(void)
 {
+#ifdef CONFIG_SPARSE_IRQ
+	struct irq_desc *desc = irq_to_desc(0);
+#endif
 	struct irq_cfg *cfg = irq_cfg(0);
+	int cpu = smp_processor_id();
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2589,7 +2851,7 @@
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2640,10 +2902,14 @@
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
+#ifdef CONFIG_SPARSE_IRQ
+		unmask_IO_APIC_irq(0, &desc);
+#else
 		unmask_IO_APIC_irq(0);
+#endif
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
@@ -2669,9 +2935,13 @@
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+#ifdef CONFIG_SPARSE_IRQ
+		unmask_IO_APIC_irq(0, &desc);
+#else
 		unmask_IO_APIC_irq(0);
+#endif
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2888,22 +3158,27 @@
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
+	struct irq_cfg *cfg_new = NULL;
+	int cpu;
+	struct irq_desc *desc_new = NULL;
 
+#ifndef CONFIG_SPARSE_IRQ
 	irq_want = nr_irqs - 1;
+#endif
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
+	cpu = smp_processor_id();
 	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -2911,6 +3186,9 @@
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init() cleared it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
@@ -2930,14 +3208,22 @@
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* store it, in case dynamic_irq_cleanup() clears it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2952,12 +3238,12 @@
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3025,10 +3311,13 @@
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3040,7 +3329,6 @@
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 
@@ -3064,10 +3352,13 @@
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3091,7 +3382,6 @@
 		cfg->move_in_progress = 0;
 	}
 
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif
@@ -3176,7 +3466,7 @@
 #endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
-	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for MSI/MSI-X\n", irq, irq);
 
 	return 0;
 }
@@ -3185,6 +3475,7 @@
 {
 	unsigned int irq;
 
+	/* use 8bits (bus) + 8bits (devfn) + 12 bits */
 	irq = dev->bus->number;
 	irq <<= 8;
 	irq |= dev->devfn;
@@ -3199,7 +3490,7 @@
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
@@ -3240,7 +3531,8 @@
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	/* count from the top 0xfff in 12 bits range */
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 	sub_handle = 0;
 	list_for_each_entry(desc, &dev->msi_list, list) {
 		irq = create_irq_nr(irq_want--);
@@ -3306,10 +3598,13 @@
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3321,7 +3616,6 @@
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -3367,10 +3661,13 @@
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3382,7 +3679,6 @@
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -3448,15 +3744,17 @@
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif
@@ -3478,13 +3776,13 @@
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3508,7 +3806,8 @@
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
 
-		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+		dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for HT\n",
+				 irq, irq);
 	}
 	return err;
 }
@@ -3530,7 +3829,9 @@
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3539,8 +3840,6 @@
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3594,6 +3893,7 @@
 
 int __init probe_nr_irqs(void)
 {
+#ifdef CONFIG_SPARSE_IRQ
 	int idx;
 	int nr = 0;
 #ifndef CONFIG_XEN
@@ -3611,10 +3911,11 @@
 	/* something wrong ? */
 	if (nr < nr_min)
 		nr = nr_min;
-	if (WARN_ON(nr > NR_IRQS))
-		nr = NR_IRQS;
 
 	return nr;
+#else
+	return NR_IRQS;
+#endif
 }
 
 /* --------------------------------------------------------------------------
@@ -3722,7 +4023,7 @@
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
+	if (irq >= NR_IRQS_LEGACY)
 		add_pin_to_irq(irq, ioapic, pin);
 
 	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
@@ -3852,7 +4153,6 @@
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
Index: linux-2.6/arch/x86/kernel/irqinit_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_32.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/kernel/irqinit_32.c	2011-02-05 16:34:42.000000000 -0800
@@ -68,8 +68,7 @@
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_64.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/kernel/irqinit_64.c	2011-02-05 16:34:42.000000000 -0800
@@ -142,8 +142,7 @@
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/mm/init_32.c	2011-02-05 16:34:42.000000000 -0800
@@ -66,6 +66,7 @@
 static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
+int after_bootmem;
 
 static __init void *alloc_low_page(unsigned long *phys)
 {
@@ -987,6 +988,8 @@
 
 	set_highmem_pages_init();
 
+	after_bootmem = 1;
+
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
Index: linux-2.6/drivers/char/random.c
===================================================================
--- linux-2.6.orig/drivers/char/random.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/drivers/char/random.c	2011-02-05 16:34:42.000000000 -0800
@@ -558,6 +558,8 @@
 	unsigned dont_count_entropy:1;
 };
 
+#ifndef CONFIG_SPARSE_IRQ
+
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 
 static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
@@ -576,6 +578,33 @@
 	irq_timer_state[irq] = state;
 }
 
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
 static struct timer_rand_state input_timer_state;
 
 /*
@@ -933,8 +962,10 @@
 {
 	struct timer_rand_state *state;
 
+#ifndef CONFIG_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
+#endif
 
 	state = get_timer_rand_state(irq);
 
Index: linux-2.6/drivers/pci/htirq.c
===================================================================
--- linux-2.6.orig/drivers/pci/htirq.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/drivers/pci/htirq.c	2011-02-05 16:34:42.000000000 -0800
@@ -58,7 +58,11 @@
 	*msg = cfg->msg;
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+void mask_ht_irq(unsigned int irq, struct irq_desc **descp)
+#else
 void mask_ht_irq(unsigned int irq)
+#endif
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
@@ -70,7 +74,11 @@
 	write_ht_irq_msg(irq, &msg);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+void unmask_ht_irq(unsigned int irq, struct irq_desc **descp)
+#else
 void unmask_ht_irq(unsigned int irq)
+#endif
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
@@ -82,6 +90,19 @@
 	write_ht_irq_msg(irq, &msg);
 }
 
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	/* use 8bits (bus) + 8bits (devfn) + 12 bits */
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -98,6 +119,7 @@
 	int max_irq;
 	int pos;
 	int irq;
+	unsigned int irq_want;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -125,7 +147,12 @@
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
+	irq_want = build_irq_for_pci_dev(dev);
+#ifdef CONFIG_SPARSE_IRQ
+	irq = create_irq_nr(irq_want + idx);
+#else
 	irq = create_irq();
+#endif
 
 	if (irq <= 0) {
 		kfree(cfg);
Index: linux-2.6/drivers/pci/intr_remapping.c
===================================================================
--- linux-2.6.orig/drivers/pci/intr_remapping.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/drivers/pci/intr_remapping.c	2011-02-05 16:34:42.000000000 -0800
@@ -19,17 +19,73 @@
 	u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+	struct irq_2_iommu *iommu;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	iommu = kzalloc_node(sizeof(*iommu), GFP_KERNEL, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+	return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (WARN_ON_ONCE(!desc))
+		return NULL;
+
+	return desc->irq_2_iommu;
 }
 
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	/*
+	 * alloc irq desc if not allocated already.
+	 */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu_alloc_cpu(irq, -1);
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +142,11 @@
 	if (!count)
 		return -1;
 
+#ifndef CONFIG_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
+#endif
 
 	/*
 	 * start the IRTE search from index 0.
Index: linux-2.6/drivers/xen/events.c
===================================================================
--- linux-2.6.orig/drivers/xen/events.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/drivers/xen/events.c	2011-02-05 16:34:42.000000000 -0800
@@ -141,8 +141,9 @@
 	int i;
 
 	/* By default all event channels notify CPU#0. */
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		desc->affinity = cpumask_of_cpu(0);
+	} end_for_each_irq_desc();
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +232,7 @@
 	int irq;
 
 	/* Only allocate from dynirq range */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		if (irq_bindcount[irq] == 0)
 			break;
 
@@ -792,7 +793,7 @@
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +825,7 @@
 		mask_evtchn(i);
 
 	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-	for_each_irq_nr(i)
+	for (i = 0; i < nr_irqs; i++)
 		irq_bindcount[i] = 0;
 
 	irq_ctx_init(smp_processor_id());
Index: linux-2.6/fs/proc/stat.c
===================================================================
--- linux-2.6.orig/fs/proc/stat.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/fs/proc/stat.c	2011-02-05 16:34:42.000000000 -0800
@@ -27,6 +27,9 @@
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -44,10 +47,9 @@
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_desc(j, desc) {
 			sum += kstat_irqs_cpu(j, i);
-
+		} end_for_each_irq_desc();
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -90,14 +92,17 @@
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
+#ifdef CONFIG_SPARSE_IRQ
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
+#else
 		seq_printf(p, " %u", per_irq_sum);
-	}
+#endif
+	} end_for_each_irq_desc();
 
 	seq_printf(p,
 		"\nctxt %llu\n"
Index: linux-2.6/fs/proc/interrupts.c
===================================================================
--- linux-2.6.orig/fs/proc/interrupts.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/fs/proc/interrupts.c	2011-02-05 16:34:42.000000000 -0800
@@ -10,20 +10,31 @@
  */
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
+#ifdef CONFIG_SPARSE_IRQ
+	rcu_read_lock();
+	return seq_list_start(&sparse_irqs_head, *pos);
+#else
 	return (*pos <= nr_irqs) ? pos : NULL;
+#endif
 }
 
 static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
 {
+#ifdef CONFIG_SPARSE_IRQ
+	return seq_list_next(v, &sparse_irqs_head, pos);
+#else
 	(*pos)++;
 	if (*pos > nr_irqs)
 		return NULL;
 	return pos;
+#endif
 }
 
 static void int_seq_stop(struct seq_file *f, void *v)
 {
-	/* Nothing to do */
+#ifdef CONFIG_SPARSE_IRQ
+	rcu_read_unlock();
+#endif
 }
 
 static const struct seq_operations int_seq_ops = {
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/include/linux/interrupt.h	2011-02-05 16:34:42.000000000 -0800
@@ -18,6 +18,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/include/linux/irq.h	2011-02-05 17:04:19.000000000 -0800
@@ -106,11 +106,19 @@
 	void		(*enable)(unsigned int irq);
 	void		(*disable)(unsigned int irq);
 
-	void		(*ack)(unsigned int irq);
+#ifdef CONFIG_SPARSE_IRQ
+	void		(*mask)(unsigned int irq, struct irq_desc **descp);
+	void		(*unmask)(unsigned int irq, struct irq_desc **descp);
+	void		(*ack)(unsigned int irq, struct irq_desc **descp);
+	void		(*mask_ack)(unsigned int irq, struct irq_desc **descp);
+	void		(*eoi)(unsigned int irq, struct irq_desc **descp);
+#else
 	void		(*mask)(unsigned int irq);
-	void		(*mask_ack)(unsigned int irq);
 	void		(*unmask)(unsigned int irq);
+	void		(*ack)(unsigned int irq);
+	void		(*mask_ack)(unsigned int irq);
 	void		(*eoi)(unsigned int irq);
+#endif
 
 	void		(*end)(unsigned int irq);
 	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
@@ -129,6 +137,8 @@
 	const char	*typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +165,15 @@
  */
 struct irq_desc {
 	unsigned int		irq;
+#ifdef CONFIG_SPARSE_IRQ
+	struct list_head	list;
+	struct list_head	hash_entry;
+	struct timer_rand_state *timer_rand_state;
+	unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+	struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -182,13 +201,54 @@
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern void arch_early_irq_init_work(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+					struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *desc);
+
+#ifndef CONFIG_SPARSE_IRQ
 
+/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
-{
-	return (irq < nr_irqs) ? irq_desc + irq : NULL;
-}
+#ifdef CONFIG_GENERIC_HARDIRQS
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
+
+#define end_for_each_irq_desc()
+#endif
+
+#else
+
+void early_irq_init_work(void);
+extern struct list_head sparse_irqs_head;
+#define for_each_irq_desc(irqX, desc)					\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.next), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.next), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.next), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define for_each_irq_desc_reverse(irqX, desc)				\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.prev), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.prev), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.prev), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define end_for_each_irq_desc() rcu_read_unlock()
+
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()]++)
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
@@ -211,8 +271,13 @@
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 
+#ifdef CONFIG_SPARSE_IRQ
+void move_native_irq(int irq, struct irq_desc **descp);
+void move_masked_irq(int irq, struct irq_desc **descp);
+#else
 void move_native_irq(int irq);
 void move_masked_irq(int irq);
+#endif
 
 #else /* CONFIG_GENERIC_PENDING_IRQ */
 
Index: linux-2.6/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.orig/include/linux/kernel_stat.h	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/include/linux/kernel_stat.h	2011-02-05 16:34:42.000000000 -0800
@@ -28,7 +28,9 @@
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@
 {
 	kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
Index: linux-2.6/kernel/irq/autoprobe.c
===================================================================
--- linux-2.6.orig/kernel/irq/autoprobe.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/kernel/irq/autoprobe.c	2011-02-05 16:34:42.000000000 -0800
@@ -57,7 +57,7 @@
 			desc->chip->startup(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/* Wait for longstanding interrupts to trigger. */
 	msleep(20);
@@ -75,7 +75,7 @@
 				desc->status |= IRQ_PENDING;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/*
 	 * Wait for spurious interrupts to trigger
@@ -99,7 +99,7 @@
 					mask |= 1 << i;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	return mask;
 }
@@ -135,7 +135,7 @@
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	return mask & val;
@@ -179,7 +179,7 @@
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	if (nr_of_irqs > 1)
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/kernel/irq/chip.c	2011-02-05 16:34:42.000000000 -0800
@@ -24,9 +24,11 @@
  */
 void dynamic_irq_init(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	/* first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
@@ -223,7 +225,11 @@
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc->chip->unmask(irq, &desc);
+#else
 	desc->chip->unmask(irq);
+#endif
 	desc->status &= ~IRQ_MASKED;
 }
 
@@ -252,7 +258,11 @@
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc->chip->mask(irq, &desc);
+#else
 	desc->chip->mask(irq);
+#endif
 	desc->status |= IRQ_MASKED;
 }
 
@@ -282,13 +292,24 @@
 		chip->end = dummy_irq_chip.end;
 }
 
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc **descp, int irq)
 {
-	if (desc->chip->mask_ack)
+	struct irq_desc *desc = *descp;
+
+	if (desc->chip->mask_ack) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->mask_ack(irq, descp);
+#else
 		desc->chip->mask_ack(irq);
-	else {
+#endif
+	} else {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->mask(irq, descp);
+		desc->chip->ack(irq, descp);
+#else
 		desc->chip->mask(irq);
 		desc->chip->ack(irq);
+#endif
 	}
 }
 
@@ -351,7 +372,7 @@
 	irqreturn_t action_ret;
 
 	spin_lock(&desc->lock);
-	mask_ack_irq(desc, irq);
+	mask_ack_irq(&desc, irq);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -375,8 +396,13 @@
 
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
-	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->unmask(irq, &desc);
+#else
 		desc->chip->unmask(irq);
+#endif
+	}
 out_unlock:
 	spin_unlock(&desc->lock);
 }
@@ -412,8 +438,13 @@
 	action = desc->action;
 	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
 		desc->status |= IRQ_PENDING;
-		if (desc->chip->mask)
+		if (desc->chip->mask) {
+#ifdef CONFIG_SPARSE_IRQ
+			desc->chip->mask(irq, &desc);
+#else
 			desc->chip->mask(irq);
+#endif
+		}
 		goto out;
 	}
 
@@ -428,7 +459,11 @@
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 out:
+#ifdef CONFIG_SPARSE_IRQ
+	desc->chip->eoi(irq, &desc);
+#else
 	desc->chip->eoi(irq);
+#endif
 
 	spin_unlock(&desc->lock);
 }
@@ -464,13 +499,17 @@
 	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
-		mask_ack_irq(desc, irq);
+		mask_ack_irq(&desc, irq);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
+#ifdef CONFIG_SPARSE_IRQ
+	desc->chip->ack(irq, &desc);
+#else
 	desc->chip->ack(irq);
+#endif
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -480,7 +519,11 @@
 		irqreturn_t action_ret;
 
 		if (unlikely(!action)) {
+#ifdef CONFIG_SPARSE_IRQ
+			desc->chip->mask(irq, &desc);
+#else
 			desc->chip->mask(irq);
+#endif
 			goto out_unlock;
 		}
 
@@ -492,7 +535,11 @@
 		if (unlikely((desc->status &
 			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
 			      (IRQ_PENDING | IRQ_MASKED))) {
+#ifdef CONFIG_SPARSE_IRQ
+			desc->chip->unmask(irq, &desc);
+#else
 			desc->chip->unmask(irq);
+#endif
 			desc->status &= ~IRQ_MASKED;
 		}
 
@@ -524,15 +571,25 @@
 
 	kstat_incr_irqs_this_cpu(irq, desc);
 
-	if (desc->chip->ack)
+	if (desc->chip->ack) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->ack(irq, &desc);
+#else
 		desc->chip->ack(irq);
+#endif
+	}
 
 	action_ret = handle_IRQ_event(irq, desc->action);
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
-	if (desc->chip->eoi)
+	if (desc->chip->eoi) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->eoi(irq, &desc);
+#else
 		desc->chip->eoi(irq);
+#endif
+	}
 }
 
 void
@@ -567,8 +624,9 @@
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip)
-			mask_ack_irq(desc, irq);
+		if (desc->chip != &no_irq_chip) {
+			mask_ack_irq(&desc, irq);
+		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/kernel/irq/handle.c	2011-02-05 17:00:31.000000000 -0800
@@ -15,9 +15,16 @@
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,299 @@
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+void __init __attribute__((weak)) arch_early_irq_init_work(void)
+{
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+	.irq	    = -1U,
+	.status	    = IRQ_DISABLED,
+	.chip	    = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth      = 1,
+	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity   = CPU_MASK_ALL
+#endif
+};
+
+static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+	unsigned long bytes;
+	char *ptr;
+	int node;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	ptr = kzalloc_node(bytes, GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+
+	desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+#ifdef CONFIG_SMP
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+}
+
+static void free_kstat_irqs(struct irq_desc *desc)
+{
+	kfree(desc->kstat_irqs);
+	desc->kstat_irqs = NULL;
+}
+#endif
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+	desc->irq = irq;
+#ifdef CONFIG_SMP
+	desc->cpu = cpu;
+#endif
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	arch_init_chip_data(desc, cpu);
+}
+
+#ifdef CONFIG_SMP
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *desc)
+{
+	free_kstat_irqs(desc);
+	arch_free_chip_data(desc);
+}
+#endif
+/*
+ * Protect the sparse_irqs_free freelist:
+ */
+static DEFINE_SPINLOCK(sparse_irq_lock);
+LIST_HEAD(sparse_irqs_head);
+
+/*
+ * The sparse irqs are in a hash-table as well, for fast lookup:
+ */
+#define SPARSEIRQHASH_BITS          (13 - 1)
+#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
+#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
+#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))
+
+static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
+
+static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
+	[0 ... NR_IRQS_LEGACY-1] = {
+		.irq	    = -1U,
+		.status	    = IRQ_DISABLED,
+		.chip	    = &no_irq_chip,
+		.handle_irq = handle_bad_irq,
+		.depth	    = 1,
+		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+		.affinity   = CPU_MASK_ALL
+#endif
+	}
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+
+void __init early_irq_init_work(void)
+{
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	/* init_work to init list for sparseirq */
+	for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
+		INIT_LIST_HEAD(sparseirqhash_table + i);
+
+	desc = irq_desc_legacy;
+	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+	for (i = 0; i < legacy_count; i++) {
+		struct list_head *hash_head;
+
+		hash_head = sparseirqhashentry(i);
+		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		list_add_tail(&desc[i].hash_entry, hash_head);
+		list_add_tail(&desc[i].list, &sparse_irqs_head);
+	}
+
+	arch_early_irq_init_work();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+
+	hash_head = sparseirqhashentry(irq);
+
+	/*
+	 * We can walk the hash lockfree, because the hash only
+	 * grows, and we are careful when adding entries to the end:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			return desc;
+	}
+
+	return NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		return desc;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	init_one_irq_desc(irq, desc, cpu);
+
+	/*
+	 * We use RCU's safe list-add method to make
+	 * parallel walking of the hash-list safe:
+	 */
+	list_add_tail_rcu(&desc->hash_entry, hash_head);
+	/*
+	 * Add it to the global list:
+	 */
+	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc_alloc_cpu(irq, -1);
+}
+
+#ifdef CONFIG_SMP
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq && old_desc != desc)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
+		 irq, irq, cpu, node);
+
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
+	list_replace_rcu(&old_desc->list, &desc->list);
+
+	/* free the old one */
+	free_one_irq_desc(old_desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* those all static, do move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -62,18 +362,49 @@
 	}
 };
 
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_desc[irq];
+
+	return NULL;
+}
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu)
+{
+	return old_desc;
+}
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
  */
-static void ack_bad(unsigned int irq)
+static void ack_bad_desc(unsigned int irq, struct irq_desc *desc)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-
 	print_irq_desc(irq, desc);
 	ack_bad_irq(irq);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_bad_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	ack_bad_desc(irq, *descp);
+}
+#else
+static void ack_bad_wrapper(unsigned int irq)
+{
+	ack_bad_desc(irq, irq_to_desc(irq));
+}
+#endif
+
 /*
  * NOP functions
  */
@@ -81,6 +412,15 @@
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void noop_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void noop_wrapper(unsigned int irq)
+#endif
+{
+	noop(irq);
+}
+
 static unsigned int noop_ret(unsigned int irq)
 {
 	return 0;
@@ -95,7 +435,7 @@
 	.shutdown	= noop,
 	.enable		= noop,
 	.disable	= noop,
-	.ack		= ack_bad,
+	.ack		= ack_bad_wrapper,
 	.end		= noop,
 };
 
@@ -109,9 +449,9 @@
 	.shutdown	= noop,
 	.enable		= noop,
 	.disable	= noop,
-	.ack		= noop,
-	.mask		= noop,
-	.unmask		= noop,
+	.ack		= noop_wrapper,
+	.mask		= noop_wrapper,
+	.unmask		= noop_wrapper,
 	.end		= noop,
 };
 
@@ -179,8 +519,13 @@
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->chip->ack)
+		if (desc->chip->ack) {
+#ifdef CONFIG_SPARSE_IRQ
+			desc->chip->ack(irq, &desc);
+#else
 			desc->chip->ack(irq);
+#endif
+		}
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -191,8 +536,13 @@
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->chip->ack)
+	if (desc->chip->ack) {
+#ifdef CONFIG_SPARSE_IRQ
+		desc->chip->ack(irq, &desc);
+#else
 		desc->chip->ack(irq);
+#endif
+	}
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
@@ -261,17 +611,25 @@
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	} end_for_each_irq_desc();
+#endif
 }
 #endif
+
+#ifdef CONFIG_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc->kstat_irqs[cpu];
+}
+#endif
+EXPORT_SYMBOL(kstat_irqs_cpu);
+
Index: linux-2.6/arch/x86/kernel/irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/kernel/irq.c	2011-02-05 16:34:42.000000000 -0800
@@ -99,25 +99,37 @@
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i, j;
 	struct irqaction *action;
 	struct irq_desc *desc;
+	int head = 0;
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc = list_entry(v, struct irq_desc, list);
+	i = desc->irq;
+	if (&desc->list == sparse_irqs_head.next)
+		head = 1;
+#else
+	i = *(loff_t *) v;
 	if (i > nr_irqs)
 		return 0;
 
 	if (i == nr_irqs)
 		return show_other_interrupts(p);
+	if (i == 0)
+		head = 1;
+
+	desc = irq_to_desc(i);
+#endif
 
 	/* print header */
-	if (i == 0) {
+	if (head) {
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	desc = irq_to_desc(i);
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
@@ -148,6 +160,12 @@
 	seq_putc(p, '\n');
 out:
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+#ifdef CONFIG_SPARSE_IRQ
+	if (&desc->list == sparse_irqs_head.prev)
+		show_other_interrupts(p);
+#endif
+
 	return 0;
 }
 
Index: linux-2.6/include/linux/irqnr.h
===================================================================
--- linux-2.6.orig/include/linux/irqnr.h	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/include/linux/irqnr.h	2011-02-05 16:34:42.000000000 -0800
@@ -7,18 +7,11 @@
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
+# define end_for_each_irq_desc()
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
-	     irq >= 0; irq--, desc--)
+static inline void early_sparse_irq_init_work(void)
+{
+}
 #endif
 
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #endif
Index: linux-2.6/arch/x86/kernel/irq_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_32.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/kernel/irq_32.c	2011-02-05 16:34:42.000000000 -0800
@@ -254,7 +254,7 @@
 			desc->chip->set_affinity(irq, mask);
 		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 #if 0
 	barrier();
Index: linux-2.6/arch/x86/kernel/irq_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_64.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/kernel/irq_64.c	2011-02-05 16:34:42.000000000 -0800
@@ -112,16 +112,26 @@
 			mask = map;
 		}
 
-		if (desc->chip->mask)
+		if (desc->chip->mask) {
+#ifdef CONFIG_SPARSE_IRQ
+			desc->chip->mask(irq, &desc);
+#else
 			desc->chip->mask(irq);
+#endif
+		}
 
 		if (desc->chip->set_affinity)
 			desc->chip->set_affinity(irq, mask);
 		else if (!(warned++))
 			set_affinity = 0;
 
-		if (desc->chip->unmask)
+		if (desc->chip->unmask) {
+#ifdef CONFIG_SPARSE_IRQ
+			desc->chip->unmask(irq, &desc);
+#else
 			desc->chip->unmask(irq);
+#endif
+		}
 
 		spin_unlock(&desc->lock);
 
@@ -129,7 +139,7 @@
 			printk("Broke affinity for irq %i\n", irq);
 		else if (!set_affinity)
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 	/* That doesn't seem sufficient.  Give it 1ms. */
 	local_irq_enable();
Index: linux-2.6/kernel/irq/proc.c
===================================================================
--- linux-2.6.orig/kernel/irq/proc.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/kernel/irq/proc.c	2011-02-05 16:34:42.000000000 -0800
@@ -243,7 +243,8 @@
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc)
+	for_each_irq_desc(irq, desc) {
 		register_irq_proc(irq, desc);
+	} end_for_each_irq_desc();
 }
 
Index: linux-2.6/kernel/irq/spurious.c
===================================================================
--- linux-2.6.orig/kernel/irq/spurious.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/kernel/irq/spurious.c	2011-02-05 16:34:42.000000000 -0800
@@ -99,7 +99,7 @@
 
 		if (try_one_irq(i, desc))
 			ok = 1;
-	}
+	} end_for_each_irq_desc();
 	/* So the caller can adjust the irq error counts */
 	return ok;
 }
@@ -122,7 +122,7 @@
 			continue;
 
 		try_one_irq(i, desc);
-	}
+	} end_for_each_irq_desc();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/init/main.c	2011-02-05 17:04:54.000000000 -0800
@@ -541,6 +541,15 @@
 {
 }
 
+void __init __attribute__((weak)) arch_early_irq_init_work(void)
+{
+}
+
+void __init __attribute__((weak)) early_irq_init_work(void)
+{
+	arch_early_irq_init_work();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -611,6 +620,8 @@
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some links before init_ISA_irqs() */
+	early_irq_init_work();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
Index: linux-2.6/arch/x86/include/asm/irq_vectors.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/irq_vectors.h	2011-02-05 16:33:24.000000000 -0800
+++ linux-2.6/arch/x86/include/asm/irq_vectors.h	2011-02-05 16:34:42.000000000 -0800
@@ -101,6 +101,8 @@
 #define LAST_VM86_IRQ		15
 #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
 
+#define NR_IRQS_LEGACY		16
+
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
Index: linux-2.6/arch/x86/kernel/i8259.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/i8259.c	2011-01-26 18:47:17.000000000 -0800
+++ linux-2.6/arch/x86/kernel/i8259.c	2011-02-05 16:34:42.000000000 -0800
@@ -36,12 +36,39 @@
 DEFINE_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
 
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_and_ack_8259A_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void mask_and_ack_8259A_wrapper(unsigned int irq)
+#endif
+{
+	mask_and_ack_8259A(irq);
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static void disable_8259A_irq_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void disable_8259A_irq_wrapper(unsigned int irq)
+#endif
+{
+	disable_8259A_irq(irq);
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static void enable_8259A_irq_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void enable_8259A_irq_wrapper(unsigned int irq)
+#endif
+{
+	enable_8259A_irq(irq);
+}
+
 struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
+	.mask		= disable_8259A_irq_wrapper,
 	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
+	.unmask		= enable_8259A_irq_wrapper,
+	.mask_ack	= mask_and_ack_8259A_wrapper,
 };
 
 /*
@@ -348,9 +375,9 @@
 		 * In AEOI mode we just have to mask the interrupt
 		 * when acking.
 		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
+		i8259A_chip.mask_ack = disable_8259A_irq_wrapper;
 	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
+		i8259A_chip.mask_ack = mask_and_ack_8259A_wrapper;
 
 	udelay(100);		/* wait for 8259A to initialize */
 
Index: linux-2.6/arch/x86/kernel/uv_irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/uv_irq.c	2011-01-26 18:47:17.000000000 -0800
+++ linux-2.6/arch/x86/kernel/uv_irq.c	2011-02-05 16:34:42.000000000 -0800
@@ -18,6 +18,15 @@
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_noop_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void uv_noop_wrapper(unsigned int irq)
+#endif
+{
+	uv_noop(irq);
+}
+
 static unsigned int uv_noop_ret(unsigned int irq)
 {
 	return 0;
@@ -28,16 +37,25 @@
 	ack_APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_ack_apic_wrapper(unsigned int irq, struct irq_desc **descp)
+#else
+static void uv_ack_apic_wrapper(unsigned int irq)
+#endif
+{
+	uv_ack_apic(irq);
+}
+
 struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
 	.enable		= uv_noop,
 	.disable	= uv_noop,
-	.ack		= uv_noop,
-	.mask		= uv_noop,
-	.unmask		= uv_noop,
-	.eoi		= uv_ack_apic,
+	.ack		= uv_noop_wrapper,
+	.mask		= uv_noop_wrapper,
+	.unmask		= uv_noop_wrapper,
+	.eoi		= uv_ack_apic_wrapper,
 	.end		= uv_noop,
 };
 
Index: linux-2.6/drivers/pci/msi.c
===================================================================
--- linux-2.6.orig/drivers/pci/msi.c	2011-01-26 18:47:18.000000000 -0800
+++ linux-2.6/drivers/pci/msi.c	2011-02-05 16:34:42.000000000 -0800
@@ -103,11 +103,11 @@
 	}
 }
 
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(struct irq_desc *desc)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = desc->msi_desc;
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -135,11 +135,11 @@
  * Returns 1 if it succeeded in masking the interrupt and 0 if the device
  * doesn't support MSI masking.
  */
-static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
+static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = desc->msi_desc;
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -252,16 +252,30 @@
 	entry->msg = *msg;
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+void mask_msi_irq(unsigned int irq, struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+#else
 void mask_msi_irq(unsigned int irq)
 {
-	msi_set_mask_bits(irq, 1, 1);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+#endif
+	msi_set_mask_bits(desc, 1, 1);
+	msix_flush_writes(desc);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+void unmask_msi_irq(unsigned int irq, struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+#else
 void unmask_msi_irq(unsigned int irq)
 {
-	msi_set_mask_bits(irq, 1, 0);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+#endif
+	msi_set_mask_bits(desc, 1, 0);
+	msix_flush_writes(desc);
 }
 
 static int msi_free_irqs(struct pci_dev* dev);
@@ -303,9 +317,11 @@
 	pci_intx_for_msi(dev, 0);
 	msi_set_enable(dev, 0);
 	write_msi_msg(dev->irq, &entry->msg);
-	if (entry->msi_attrib.maskbit)
-		msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask,
+	if (entry->msi_attrib.maskbit) {
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
 				  entry->msi_attrib.masked);
+	}
 
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 	control &= ~PCI_MSI_FLAGS_QSIZE;
@@ -327,8 +343,9 @@
 	msix_set_enable(dev, 0);
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
+		struct irq_desc *desc = irq_to_desc(entry->irq);
 		write_msi_msg(entry->irq, &entry->msg);
-		msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked);
+		msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
 	}
 
 	BUG_ON(list_empty(&dev->msi_list));
@@ -596,7 +613,8 @@
 	/* Return the the pci reset with msi irqs unmasked */
 	if (entry->msi_attrib.maskbit) {
 		u32 mask = entry->msi_attrib.maskbits_mask;
-		msi_set_mask_bits(dev->irq, mask, ~mask);
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, mask, ~mask);
 	}
 	if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
 		return;
Index: linux-2.6/include/linux/msi.h
===================================================================
--- linux-2.6.orig/include/linux/msi.h	2011-01-26 18:47:20.000000000 -0800
+++ linux-2.6/include/linux/msi.h	2011-02-05 16:34:42.000000000 -0800
@@ -10,8 +10,14 @@
 };
 
 /* Helper functions */
+#ifdef CONFIG_SPARSE_IRQ
+struct irq_desc;
+extern void mask_msi_irq(unsigned int irq, struct irq_desc **descp);
+extern void unmask_msi_irq(unsigned int irq, struct irq_desc **descp);
+#else
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
+#endif
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/include/asm/hpet.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/hpet.h	2011-01-26 18:47:17.000000000 -0800
+++ linux-2.6/arch/x86/include/asm/hpet.h	2011-02-05 16:34:42.000000000 -0800
@@ -72,8 +72,13 @@
 extern unsigned long hpet_readl(unsigned long a);
 extern void force_hpet_resume(void);
 
+#ifdef CONFIG_SPARSE_IRQ
+extern void hpet_msi_unmask(unsigned int irq, struct irq_desc **descp);
+extern void hpet_msi_mask(unsigned int irq, struct irq_desc **descp);
+#else
 extern void hpet_msi_unmask(unsigned int irq);
 extern void hpet_msi_mask(unsigned int irq);
+#endif
 extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
 extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/kernel/hpet.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/hpet.c	2011-02-05 16:33:53.000000000 -0800
+++ linux-2.6/arch/x86/kernel/hpet.c	2011-02-05 16:34:42.000000000 -0800
@@ -347,7 +347,11 @@
 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
 
+#ifdef CONFIG_SPARSE_IRQ
+void hpet_msi_unmask(unsigned int irq, struct irq_desc **descp)
+#else
 void hpet_msi_unmask(unsigned int irq)
+#endif
 {
 	struct hpet_dev *hdev = get_irq_data(irq);
 	unsigned long cfg;
@@ -358,7 +362,11 @@
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+void hpet_msi_mask(unsigned int irq, struct irq_desc **descp)
+#else
 void hpet_msi_mask(unsigned int irq)
+#endif
 {
 	unsigned long cfg;
 	struct hpet_dev *hdev = get_irq_data(irq);
Index: linux-2.6/include/linux/htirq.h
===================================================================
--- linux-2.6.orig/include/linux/htirq.h	2011-01-26 18:47:20.000000000 -0800
+++ linux-2.6/include/linux/htirq.h	2011-02-05 16:34:42.000000000 -0800
@@ -9,8 +9,14 @@
 /* Helper functions.. */
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
+#ifdef CONFIG_SPARSE_IRQ
+struct irq_desc;
+void mask_ht_irq(unsigned int irq, struct irq_desc **descp);
+void unmask_ht_irq(unsigned int irq, struct irq_desc **descp);
+#else
 void mask_ht_irq(unsigned int irq);
 void unmask_ht_irq(unsigned int irq);
+#endif
 
 /* The arch hook for getting things started */
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
Index: linux-2.6/kernel/irq/migration.c
===================================================================
--- linux-2.6.orig/kernel/irq/migration.c	2011-02-04 11:27:22.000000000 -0800
+++ linux-2.6/kernel/irq/migration.c	2011-02-05 16:34:42.000000000 -0800
@@ -1,9 +1,15 @@
 
 #include <linux/irq.h>
 
+#ifdef CONFIG_SPARSE_IRQ
+void move_masked_irq(int irq, struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+#else
 void move_masked_irq(int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
+#endif
 	cpumask_t tmp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -47,9 +53,15 @@
 	cpus_clear(desc->pending_mask);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+void move_native_irq(int irq, struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+#else
 void move_native_irq(int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
+#endif
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -57,8 +69,14 @@
 	if (unlikely(desc->status & IRQ_DISABLED))
 		return;
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc->chip->mask(irq, descp);
+	move_masked_irq(irq, descp);
+	desc->chip->unmask(irq, descp);
+#else
 	desc->chip->mask(irq);
 	move_masked_irq(irq);
 	desc->chip->unmask(irq);
+#endif
 }
 
Index: linux-2.6/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.orig/drivers/pci/intel-iommu.c	2011-01-26 18:47:18.000000000 -0800
+++ linux-2.6/drivers/pci/intel-iommu.c	2011-02-05 16:34:42.000000000 -0800
@@ -751,7 +751,11 @@
 		return fault_reason_strings[fault_reason];
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+void dmar_msi_unmask(unsigned int irq, struct irq_desc **descp)
+#else
 void dmar_msi_unmask(unsigned int irq)
+#endif
 {
 	struct intel_iommu *iommu = get_irq_data(irq);
 	unsigned long flag;
@@ -764,7 +768,11 @@
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+void dmar_msi_mask(unsigned int irq, struct irq_desc **descp)
+#else
 void dmar_msi_mask(unsigned int irq)
+#endif
 {
 	unsigned long flag;
 	struct intel_iommu *iommu = get_irq_data(irq);
Index: linux-2.6/include/linux/dmar.h
===================================================================
--- linux-2.6.orig/include/linux/dmar.h	2011-01-26 18:47:20.000000000 -0800
+++ linux-2.6/include/linux/dmar.h	2011-02-05 16:34:42.000000000 -0800
@@ -122,8 +122,13 @@
 /* Can't use the common MSI interrupt functions
  * since DMAR is not a pci device
  */
+#ifdef CONFIG_SPARSE_IRQ
+extern void dmar_msi_unmask(unsigned int irq, struct irq_desc **descp);
+extern void dmar_msi_mask(unsigned int irq, struct irq_desc **descp);
+#else
 extern void dmar_msi_unmask(unsigned int irq);
 extern void dmar_msi_mask(unsigned int irq);
+#endif
 extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);


^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] sparse_irq aka dyn_irq v13
       [not found]                                     ` <20081112120814.GG11352@elte.hu>
@ 2008-11-13  7:01                                       ` Yinghai Lu
  2008-11-13  9:53                                         ` Ingo Molnar
  2008-11-13 20:16                                       ` Yinghai Lu
  1 sibling, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-13  7:01 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Andrew Morton
  Cc: linux-kernel@vger.kernel.org

reduce the number of #ifdefs

YH

---

From: Yinghai Lu <yinghai@kernel.org>
Subject: sparseirq v13

impact: new feature sparseirq

add some kind of hash table, as Ingo suggested.
remove dyna_array

when sparse_irq is used (CONFIG_SPARSE_IRQ), use kzalloc_node to get irq_desc, irq_cfg
  use desc->chip_data for x86 to store irq_cfg

if CONFIG_MOVE_IRQ_DESC is set
  make irq_desc to go with affinity aka irq_desc moving etc
  call move_irq_desc in irq_complete_move()
  need to add (struct irq_desc **descp) to ack_edge/level to make sure desc gets updated
  try to pass desc and cfg as much as possible to avoid list lookups.
  legacy irq_descs are not moved, because they are allocated via a static array

for logical apic mode, we need to add move_desc_in_progress_in_same_domain; otherwise the desc will not get moved. ==> it may also need two phases to get the irq_desc moved.
	for example: 0xff is the old affinity; we need to set it to 0xf first, and then set it to 0xf0.
	[ or do we need to change the domain definition to cpus on the same node? ]

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so we assume the user-space program should update /proc/irq/XX/smp_affinity to 03 or 0f first at boot,
or should we change irq_default_affinity?

for physical apic mode it is much simpler.
on a 4-socket, 16-core system
the irq_desc is moved...
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig                   |   20 +
 arch/x86/include/asm/hpet.h        |    7 
 arch/x86/include/asm/irq_vectors.h |    2 
 arch/x86/kernel/hpet.c             |   19 -
 arch/x86/kernel/i8259.c            |   29 +
 arch/x86/kernel/io_apic.c          |  688 ++++++++++++++++++++++++++-----------
 arch/x86/kernel/irq.c              |   24 +
 arch/x86/kernel/irq_32.c           |    2 
 arch/x86/kernel/irq_64.c           |    6 
 arch/x86/kernel/irqinit_32.c       |    3 
 arch/x86/kernel/irqinit_64.c       |    3 
 arch/x86/kernel/uv_irq.c           |   28 +
 arch/x86/mm/init_32.c              |    3 
 drivers/char/random.c              |   31 +
 drivers/pci/htirq.c                |   38 +-
 drivers/pci/intel-iommu.c          |   19 -
 drivers/pci/intr_remapping.c       |   62 +++
 drivers/pci/msi.c                  |   49 +-
 drivers/xen/events.c               |    9 
 fs/proc/interrupts.c               |   18 
 fs/proc/stat.c                     |   17 
 include/linux/dmar.h               |    7 
 include/linux/htirq.h              |    8 
 include/linux/interrupt.h          |    2 
 include/linux/irq.h                |   85 ++++
 include/linux/irqnr.h              |   15 
 include/linux/kernel_stat.h        |   14 
 include/linux/msi.h                |    8 
 init/main.c                        |   11 
 kernel/irq/autoprobe.c             |   10 
 kernel/irq/chip.c                  |   43 +-
 kernel/irq/handle.c                |  373 +++++++++++++++++++-
 kernel/irq/migration.c             |   29 +
 kernel/irq/proc.c                  |    3 
 kernel/irq/spurious.c              |    4 
 35 files changed, 1377 insertions(+), 312 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -240,6 +240,26 @@ config X86_HAS_BOOT_CPU_ID
 	def_bool y
 	depends on X86_VOYAGER
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on PCI_MSI || HT_IRQ
+	default y
+	help
+	  This enables support for sparse irqs, especially for MSI/MSI-X. The
+	  irq number will be bus/dev/fn + 12 bits. You may need this if you
+	  have lots of cards supporting MSI-X installed.
+
+	  If you don't know what to do here, say Y.
+
+config MOVE_IRQ_DESC
+	bool "Move irq desc when changing irq smp_affinity"
+	depends on SPARSE_IRQ && SMP
+	default y
+	help
+	  This enables moving the irq_desc to the cpu/node where the irq is handled.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -108,93 +108,236 @@ static int __init parse_noapic(char *str
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+	int apic, pin;
+	struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+	struct irq_pin_list *pin;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	pin = kzalloc_node(sizeof(*pin), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+	return pin;
+}
+
 struct irq_cfg {
-	unsigned int irq;
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_MOVE_IRQ_DESC
+	u8 move_desc_in_progress_in_same_domain : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_early_irq_init_work(void)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int count;
+	int i;
+#ifdef CONFIG_SPARSE_IRQ
+	int count_desc = NR_IRQS_LEGACY;
+#else
+	int count_desc = NR_IRQS;
+#endif
+
+	cfg = irq_cfgx;
+	count = ARRAY_SIZE(irq_cfgx);
+
+	BUG_ON(count > count_desc);
 
+	for (i = 0; i < count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
+
+#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
+
+	return cfg;
 }
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 {
-	return irq_cfg(irq);
+	struct irq_cfg *cfg;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+	return cfg;
 }
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+	cfg = desc->chip_data;
+	if (!cfg)
+		desc->chip_data = get_one_free_irq_cfg(cpu);
+}
 
-struct irq_pin_list {
-	int apic, pin;
-	struct irq_pin_list *next;
-};
+#ifdef CONFIG_MOVE_IRQ_DESC
+
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
+{
+	struct irq_pin_list *old_entry, *tail, *entry;
+
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+	entry = get_one_free_irq_2_pin(cpu);
+	entry->apic = old_entry->apic;
+	entry->pin = old_entry->pin;
+	cfg->irq_2_pin = entry;
+	tail = entry;
+	old_entry = old_entry->next;
 
-static void __init irq_2_pin_init(void)
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		entry->apic = old_entry->apic;
+		entry->pin = old_entry->pin;
+		tail->next = entry;
+		tail = entry;
+		old_entry = old_entry->next;
+	}
+
+	tail->next = NULL;
+}
+
+static void free_irq_2_pin(struct irq_cfg *cfg)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_pin_list *entry, *next;
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	entry = cfg->irq_2_pin;
 
-	irq_2_pin_ptr = &pin[0];
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	cfg->irq_2_pin = NULL;
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	cfg = get_one_free_irq_cfg(cpu);
+	desc->chip_data = cfg;
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *cfg)
+{
+	kfree(cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (cfg) {
+		free_irq_2_pin(cfg);
+		free_irq_cfg(cfg);
+		desc->chip_data = NULL;
+	}
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means that domain is not changed */
+		cpumask_t tmp;
+
+		cpus_and(tmp, desc->affinity, mask);
+		if (cpus_empty(tmp))
+			cfg->move_desc_in_progress_in_same_domain = 1;
+	}
 }
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+#ifndef CONFIG_MOVE_IRQ_DESC
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+}
+#endif
 
 struct io_apic {
 	unsigned int index;
@@ -237,11 +380,10 @@ static inline void io_apic_modify(unsign
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(unsigned int irq, struct irq_cfg *cfg)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	entry = cfg->irq_2_pin;
@@ -323,13 +465,12 @@ static void ioapic_mask_entry(int apic,
 }
 
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
 	int apic, pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
 
-	cfg = irq_cfg(irq);
 	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
@@ -359,7 +500,7 @@ static void __target_IO_APIC_irq(unsigne
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
@@ -373,10 +514,13 @@ static void set_ioapic_affinity_irq(unsi
 	if (cpus_empty(tmp))
 		return;
 
-	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
@@ -384,9 +528,8 @@ static void set_ioapic_affinity_irq(unsi
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
-	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__target_IO_APIC_irq(irq, dest, cfg->vector);
+	__target_IO_APIC_irq(irq, dest, cfg);
 	desc->affinity = mask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
@@ -397,16 +540,13 @@ static void set_ioapic_affinity_irq(unsi
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,20 +561,31 @@ static void add_pin_to_irq(unsigned int
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
 }
 
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int cpu = smp_processor_id();
+
+	/* first time to refer irq_cfg, so with new */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	cfg = desc->chip_data;
+	add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+}
+
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -451,18 +602,16 @@ static void __init replace_pin_at_irq(un
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
 				int mask_and, int mask_or,
 				void (*final)(struct irq_pin_list *entry))
 {
 	int pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	cfg = irq_cfg(irq);
 	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
 		unsigned int reg;
 		pin = entry->pin;
@@ -475,9 +624,9 @@ static inline void io_apic_modify_irq(un
 	}
 }
 
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
 #ifdef CONFIG_X86_64
@@ -492,47 +641,69 @@ void io_apic_sync(struct irq_pin_list *e
 	readl(&io_apic->data);
 }
 
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
 			IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
 			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irqx(unsigned int irq, struct irq_desc **descp)
 {
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
+	BUG_ON(!cfg);
+
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__mask_IO_APIC_irq(irq);
+	__mask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irqx(unsigned int irq, struct irq_desc **descp)
 {
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_IO_APIC_irq(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define mask_IO_APIC_irq mask_IO_APIC_irqx
+#define unmask_IO_APIC_irq unmask_IO_APIC_irqx
+#else
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_IO_APIC_irqx(irq, &desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_IO_APIC_irqx(irq, &desc);
+}
+#endif
+
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
@@ -809,7 +980,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1034,7 +1205,7 @@ void unlock_vector_lock(void)
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1050,16 +1221,13 @@ static int __assign_irq_vector(int irq,
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
 
-	cfg = irq_cfg(irq);
+	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+		return -EBUSY;
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-		return -EBUSY;
-
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_t tmp;
@@ -1113,24 +1281,22 @@ next:
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1148,14 +1314,16 @@ void __setup_vector_irq(int cpu)
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
+	} end_for_each_irq_desc();
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
@@ -1205,7 +1373,8 @@ static void ioapic_register_intr(int irq
 {
 	struct irq_desc *desc;
 
-	desc = irq_to_desc(irq);
+	/* could be first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1310,7 +1479,7 @@ static void setup_IO_APIC_irq(int apic,
 	cfg = irq_cfg(irq);
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1327,12 +1496,12 @@ static void setup_IO_APIC_irq(int apic,
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
 	ioapic_register_intr(irq, trigger);
-	if (irq < 16)
+	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic, pin, entry);
@@ -1434,6 +1603,7 @@ __apicdebuginit(void) print_IO_APIC(void
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1523,8 +1693,10 @@ __apicdebuginit(void) print_IO_APIC(void
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1535,7 +1707,7 @@ __apicdebuginit(void) print_IO_APIC(void
 			entry = entry->next;
 		}
 		printk("\n");
-	}
+	} end_for_each_irq_desc();
 
 	printk(KERN_INFO ".................................... done.\n");
 
@@ -2008,14 +2180,16 @@ static unsigned int startup_ioapic_irq(u
 {
 	int was_pending = 0;
 	unsigned long flags;
+	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_IO_APIC_irq(irq);
+	cfg = irq_cfg(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
@@ -2078,10 +2252,9 @@ static DECLARE_DELAYED_WORK(ir_migration
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void migrate_ioapic_irq(int irq, struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
 	int modify_ioapic_rte;
@@ -2095,18 +2268,19 @@ static void migrate_ioapic_irq(int irq,
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	desc = irq_to_desc(irq);
 	modify_ioapic_rte = desc->status & IRQ_LEVEL;
 	if (modify_ioapic_rte) {
 		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		__target_IO_APIC_irq(irq, dest, cfg);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -2128,14 +2302,14 @@ static void migrate_ioapic_irq(int irq,
 	desc->affinity = mask;
 }
 
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level(int irq, struct irq_desc *desc)
 {
 	int ret = -1;
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
 
-	mask_IO_APIC_irq(irq);
+	mask_IO_APIC_irqx(irq, &desc);
 
-	if (io_apic_level_ack_pending(irq)) {
+	if (io_apic_level_ack_pending(irq, cfg)) {
 		/*
 		 * Interrupt in progress. Migrating irq now will change the
 		 * vector information in the IO-APIC RTE and that will confuse
@@ -2147,14 +2321,15 @@ static int migrate_irq_remapped_level(in
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq(irq, desc->pending_mask);
+	migrate_ioapic_irq(irq, desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
 	cpus_clear(desc->pending_mask);
 
 unmask:
-	unmask_IO_APIC_irq(irq);
+	unmask_IO_APIC_irqx(irq, &desc);
+
 	return ret;
 }
 
@@ -2178,7 +2353,7 @@ static void ir_irq_migration(struct work
 			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
@@ -2191,11 +2366,11 @@ static void set_ir_ioapic_affinity_irq(u
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
 		desc->pending_mask = mask;
-		migrate_irq_remapped_level(irq);
+		migrate_irq_remapped_level(irq, desc);
 		return;
 	}
 
-	migrate_ioapic_irq(irq, mask);
+	migrate_ioapic_irq(irq, desc, mask);
 }
 #endif
 
@@ -2236,19 +2411,40 @@ unlock:
 	irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = *descp;
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_MOVE_IRQ_DESC
+		if (likely(!cfg->move_desc_in_progress_in_same_domain))
+			return;
+
+		/* the domain is unchanged, but the affinity has changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_in_progress_in_same_domain = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
@@ -2256,9 +2452,24 @@ static void irq_complete_move(unsigned i
 	}
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
+
 #ifdef CONFIG_INTR_REMAP
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_x2apic_levelx(unsigned int irq, struct irq_desc **descp)
+{
+	ack_x2APIC_irq();
+}
+
+static void ack_x2apic_edgex(unsigned int irq, struct irq_desc **descp)
+{
+	ack_x2APIC_irq();
+}
+
+#define ack_x2apic_level ack_x2apic_levelx
+#define ack_x2apic_edge ack_x2apic_edgex
+#else
 static void ack_x2apic_level(unsigned int irq)
 {
 	ack_x2APIC_irq();
@@ -2270,29 +2481,44 @@ static void ack_x2apic_edge(unsigned int
 }
 #endif
 
-static void ack_apic_edge(unsigned int irq)
+#endif
+
+static void ack_apic_edgex(unsigned int irq, struct irq_desc **descp)
 {
-	irq_complete_move(irq);
-	move_native_irq(irq);
+	irq_complete_move(descp);
+#ifdef CONFIG_SMP
+	move_native_irqx(irq, descp);
+#endif
 	ack_APIC_irq();
 }
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_apic_edge ack_apic_edgex
+#else
+static void ack_apic_edge(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_apic_edgex(irq, &desc);
+}
+#endif
 
 atomic_t irq_mis_count;
 
-static void ack_apic_level(unsigned int irq)
+static void ack_apic_levelx(unsigned int irq, struct irq_desc **descp)
 {
 #ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
 #endif
+	struct irq_cfg *cfg;
 	int do_unmask_irq = 0;
 
-	irq_complete_move(irq);
+	irq_complete_move(descp);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely((*descp)->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
-		mask_IO_APIC_irq(irq);
+		mask_IO_APIC_irqx(irq, descp);
 	}
 #endif
 
@@ -2316,7 +2542,8 @@ static void ack_apic_level(unsigned int
 	* operation to prevent an edge-triggered interrupt escaping meanwhile.
 	* The idea is from Manfred Spraul.  --macro
 	*/
-	i = irq_cfg(irq)->vector;
+	cfg = (*descp)->chip_data;
+	i = cfg->vector;
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2355,22 +2582,37 @@ static void ack_apic_level(unsigned int
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(irq))
-			move_masked_irq(irq);
-		unmask_IO_APIC_irq(irq);
+		cfg = (*descp)->chip_data;
+		if (!io_apic_level_ack_pending(irq, cfg)) {
+# ifdef CONFIG_SMP
+			move_masked_irqx(irq, descp);
+# endif
+		}
+		unmask_IO_APIC_irqx(irq, descp);
 	}
 
 #ifdef CONFIG_X86_32
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		__mask_and_edge_IO_APIC_irq(cfg);
+		__unmask_and_level_IO_APIC_irq(cfg);
 		spin_unlock(&ioapic_lock);
 	}
 #endif
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_apic_level ack_apic_levelx
+#else
+static void ack_apic_level(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_apic_levelx(irq, &desc);
+}
+#endif
+
 static struct irq_chip ioapic_chip __read_mostly = {
 	.name		= "IO-APIC",
 	.startup	= startup_ioapic_irq,
@@ -2416,29 +2658,28 @@ static inline void init_IO_APIC_traps(vo
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < 16)
+			if (irq < NR_IRQS_LEGACY)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
  * The local APIC irq-chip implementation:
  */
 
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irqx(unsigned int irq)
 {
 	unsigned long v;
 
@@ -2446,7 +2687,7 @@ static void mask_lapic_irq(unsigned int
 	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irqx(unsigned int irq)
 {
 	unsigned long v;
 
@@ -2454,11 +2695,30 @@ static void unmask_lapic_irq(unsigned in
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irqx(unsigned int irq)
 {
 	ack_APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_lapic_irq(unsigned int irq, struct irq_desc **descp)
+{
+	mask_lapic_irqx(irq);
+}
+static void unmask_lapic_irq(unsigned int irq, struct irq_desc **descp)
+{
+	unmask_lapic_irqx(irq);
+}
+static void ack_lapic_irq(unsigned int irq, struct irq_desc **descp)
+{
+	ack_lapic_irqx(irq);
+}
+#else
+#define mask_lapic_irq mask_lapic_irqx
+#define unmask_lapic_irq unmask_lapic_irqx
+#define ack_lapic_irq ack_lapic_irqx
+#endif
+
 static struct irq_chip lapic_chip __read_mostly = {
 	.name		= "local-APIC",
 	.mask		= mask_lapic_irq,
@@ -2574,7 +2834,9 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_desc *desc = irq_to_desc(0);
+	struct irq_cfg *cfg = desc->chip_data;
+	int cpu = smp_processor_id();
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2589,7 +2851,7 @@ static inline void __init check_timer(vo
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2640,10 +2902,10 @@ static inline void __init check_timer(vo
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irqx(0, &desc);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
@@ -2669,9 +2931,9 @@ static inline void __init check_timer(vo
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irqx(0, &desc);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2888,22 +3150,27 @@ unsigned int create_irq_nr(unsigned int
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
+	struct irq_cfg *cfg_new = NULL;
+	int cpu;
+	struct irq_desc *desc_new = NULL;
 
+#ifndef CONFIG_SPARSE_IRQ
 	irq_want = nr_irqs - 1;
+#endif
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
+	cpu = smp_processor_id();
 	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -2911,6 +3178,9 @@ unsigned int create_irq_nr(unsigned int
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init cleared it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
@@ -2930,14 +3200,22 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* store it, in case dynamic_irq_cleanup clears it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2952,12 +3230,12 @@ static int msi_compose_msg(struct pci_de
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3025,10 +3303,13 @@ static void set_msi_irq_affinity(unsigne
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3040,7 +3321,6 @@ static void set_msi_irq_affinity(unsigne
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 
@@ -3064,10 +3344,13 @@ static void ir_set_msi_irq_affinity(unsi
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3091,7 +3374,6 @@ static void ir_set_msi_irq_affinity(unsi
 		cfg->move_in_progress = 0;
 	}
 
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif
@@ -3176,7 +3458,7 @@ static int setup_msi_irq(struct pci_dev
 #endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
-	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for MSI/MSI-X\n", irq, irq);
 
 	return 0;
 }
@@ -3185,6 +3467,7 @@ static unsigned int build_irq_for_pci_de
 {
 	unsigned int irq;
 
+	/* use 8 bits (bus) + 8 bits (devfn) + 12 bits (per-device index) */
 	irq = dev->bus->number;
 	irq <<= 8;
 	irq |= dev->devfn;
@@ -3199,7 +3482,7 @@ int arch_setup_msi_irq(struct pci_dev *d
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
@@ -3240,7 +3523,8 @@ int arch_setup_msi_irqs(struct pci_dev *
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	/* count down from the top (0xfff) of the 12-bit per-device range */
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 	sub_handle = 0;
 	list_for_each_entry(desc, &dev->msi_list, list) {
 		irq = create_irq_nr(irq_want--);
@@ -3306,10 +3590,13 @@ static void dmar_msi_set_affinity(unsign
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3321,7 +3608,6 @@ static void dmar_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -3367,10 +3653,13 @@ static void hpet_msi_set_affinity(unsign
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3382,7 +3671,6 @@ static void hpet_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -3448,15 +3736,17 @@ static void set_ht_irq_affinity(unsigned
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif
@@ -3478,13 +3768,13 @@ int arch_setup_ht_irq(unsigned int irq,
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3508,7 +3798,8 @@ int arch_setup_ht_irq(unsigned int irq,
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
 
-		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+		dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for HT\n",
+				 irq, irq);
 	}
 	return err;
 }
@@ -3530,7 +3821,9 @@ int arch_enable_uv_irq(char *irq_name, u
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3539,8 +3832,6 @@ int arch_enable_uv_irq(char *irq_name, u
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3594,6 +3885,7 @@ int __init io_apic_get_redir_entries (in
 
 int __init probe_nr_irqs(void)
 {
+#ifdef CONFIG_SPARSE_IRQ
 	int idx;
 	int nr = 0;
 #ifndef CONFIG_XEN
@@ -3611,10 +3903,11 @@ int __init probe_nr_irqs(void)
 	/* something wrong ? */
 	if (nr < nr_min)
 		nr = nr_min;
-	if (WARN_ON(nr > NR_IRQS))
-		nr = NR_IRQS;
 
 	return nr;
+#else
+	return NR_IRQS;
+#endif
 }
 
 /* --------------------------------------------------------------------------
@@ -3722,7 +4015,7 @@ int io_apic_set_pci_routing (int ioapic,
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
+	if (irq >= NR_IRQS_LEGACY)
 		add_pin_to_irq(irq, ioapic, pin);
 
 	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
@@ -3852,7 +4145,6 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
Index: linux-2.6/arch/x86/kernel/irqinit_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_32.c
+++ linux-2.6/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_64.c
+++ linux-2.6/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c
+++ linux-2.6/arch/x86/mm/init_32.c
@@ -66,6 +66,7 @@ static unsigned long __meminitdata table
 static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
+int after_bootmem;
 
 static __init void *alloc_low_page(unsigned long *phys)
 {
@@ -987,6 +988,8 @@ void __init mem_init(void)
 
 	set_highmem_pages_init();
 
+	after_bootmem = 1;
+
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
Index: linux-2.6/drivers/char/random.c
===================================================================
--- linux-2.6.orig/drivers/char/random.c
+++ linux-2.6/drivers/char/random.c
@@ -558,6 +558,8 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
+#ifndef CONFIG_SPARSE_IRQ
+
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 
 static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
@@ -576,6 +578,33 @@ static void set_timer_rand_state(unsigne
 	irq_timer_state[irq] = state;
 }
 
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
 static struct timer_rand_state input_timer_state;
 
 /*
@@ -933,8 +962,10 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
+#ifndef CONFIG_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
+#endif
 
 	state = get_timer_rand_state(irq);
 
Index: linux-2.6/drivers/pci/htirq.c
===================================================================
--- linux-2.6.orig/drivers/pci/htirq.c
+++ linux-2.6/drivers/pci/htirq.c
@@ -58,7 +58,7 @@ void fetch_ht_irq_msg(unsigned int irq,
 	*msg = cfg->msg;
 }
 
-void mask_ht_irq(unsigned int irq)
+void mask_ht_irqx(unsigned int irq, struct irq_desc **descp)
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
@@ -70,7 +70,7 @@ void mask_ht_irq(unsigned int irq)
 	write_ht_irq_msg(irq, &msg);
 }
 
-void unmask_ht_irq(unsigned int irq)
+void unmask_ht_irqx(unsigned int irq, struct irq_desc **descp)
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
@@ -82,6 +82,34 @@ void unmask_ht_irq(unsigned int irq)
 	write_ht_irq_msg(irq, &msg);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void mask_ht_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_ht_irqx(irq, &desc);
+}
+void unmask_ht_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_ht_irqx(irq, &desc);
+}
+#endif
+
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	/* use 8bits (bus) + 8bits (devfn) + 12 bits */
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -98,6 +126,7 @@ int __ht_create_irq(struct pci_dev *dev,
 	int max_irq;
 	int pos;
 	int irq;
+	unsigned int irq_want;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -125,7 +154,12 @@ int __ht_create_irq(struct pci_dev *dev,
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
+	irq_want = build_irq_for_pci_dev(dev);
+#ifdef CONFIG_SPARSE_IRQ
+	irq = create_irq_nr(irq_want + idx);
+#else
 	irq = create_irq();
+#endif
 
 	if (irq <= 0) {
 		kfree(cfg);
Index: linux-2.6/drivers/pci/intr_remapping.c
===================================================================
--- linux-2.6.orig/drivers/pci/intr_remapping.c
+++ linux-2.6/drivers/pci/intr_remapping.c
@@ -19,17 +19,73 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+	struct irq_2_iommu *iommu;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	iommu = kzalloc_node(sizeof(*iommu), GFP_KERNEL, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+	return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (WARN_ON_ONCE(!desc))
+		return NULL;
+
+	return desc->irq_2_iommu;
 }
 
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	/*
+	 * alloc irq desc if not allocated already.
+	 */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu_alloc_cpu(irq, -1);
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +142,11 @@ int alloc_irte(struct intel_iommu *iommu
 	if (!count)
 		return -1;
 
+#ifndef CONFIG_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
+#endif
 
 	/*
 	 * start the IRTE search from index 0.
Index: linux-2.6/drivers/xen/events.c
===================================================================
--- linux-2.6.orig/drivers/xen/events.c
+++ linux-2.6/drivers/xen/events.c
@@ -141,8 +141,9 @@ static void init_evtchn_cpu_bindings(voi
 	int i;
 
 	/* By default all event channels notify CPU#0. */
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		desc->affinity = cpumask_of_cpu(0);
+	} end_for_each_irq_desc();
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +232,7 @@ static int find_unbound_irq(void)
 	int irq;
 
 	/* Only allocate from dynirq range */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		if (irq_bindcount[irq] == 0)
 			break;
 
@@ -792,7 +793,7 @@ void xen_irq_resume(void)
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +825,7 @@ void __init xen_init_IRQ(void)
 		mask_evtchn(i);
 
 	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-	for_each_irq_nr(i)
+	for (i = 0; i < nr_irqs; i++)
 		irq_bindcount[i] = 0;
 
 	irq_ctx_init(smp_processor_id());
Index: linux-2.6/fs/proc/stat.c
===================================================================
--- linux-2.6.orig/fs/proc/stat.c
+++ linux-2.6/fs/proc/stat.c
@@ -27,6 +27,9 @@ static int show_stat(struct seq_file *p,
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -44,10 +47,9 @@ static int show_stat(struct seq_file *p,
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_desc(j, desc) {
 			sum += kstat_irqs_cpu(j, i);
-
+		} end_for_each_irq_desc();
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -90,14 +92,17 @@ static int show_stat(struct seq_file *p,
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
+#ifdef CONFIG_SPARSE_IRQ
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
+#else
 		seq_printf(p, " %u", per_irq_sum);
-	}
+#endif
+	} end_for_each_irq_desc();
 
 	seq_printf(p,
 		"\nctxt %llu\n"
Index: linux-2.6/fs/proc/interrupts.c
===================================================================
--- linux-2.6.orig/fs/proc/interrupts.c
+++ linux-2.6/fs/proc/interrupts.c
@@ -8,6 +8,23 @@
 /*
  * /proc/interrupts
  */
+#ifdef CONFIG_SPARSE_IRQ
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+	rcu_read_lock();
+	return seq_list_start(&sparse_irqs_head, *pos);
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &sparse_irqs_head, pos);
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+	rcu_read_unlock();
+}
+#else
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
 	return (*pos <= nr_irqs) ? pos : NULL;
@@ -25,6 +42,7 @@ static void int_seq_stop(struct seq_file
 {
 	/* Nothing to do */
 }
+#endif
 
 static const struct seq_operations int_seq_ops = {
 	.start = int_seq_start,
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h
+++ linux-2.6/include/linux/interrupt.h
@@ -18,6 +18,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -106,11 +106,19 @@ struct irq_chip {
 	void		(*enable)(unsigned int irq);
 	void		(*disable)(unsigned int irq);
 
+#ifdef CONFIG_SPARSE_IRQ
+	void		(*ack)(unsigned int irq, struct irq_desc **descp);
+	void		(*mask)(unsigned int irq, struct irq_desc **descp);
+	void		(*mask_ack)(unsigned int irq, struct irq_desc **descp);
+	void		(*unmask)(unsigned int irq, struct irq_desc **descp);
+	void		(*eoi)(unsigned int irq, struct irq_desc **descp);
+#else
 	void		(*ack)(unsigned int irq);
 	void		(*mask)(unsigned int irq);
 	void		(*mask_ack)(unsigned int irq);
 	void		(*unmask)(unsigned int irq);
 	void		(*eoi)(unsigned int irq);
+#endif
 
 	void		(*end)(unsigned int irq);
 	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
@@ -129,6 +137,8 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +165,15 @@ struct irq_chip {
  */
 struct irq_desc {
 	unsigned int		irq;
+#ifdef CONFIG_SPARSE_IRQ
+	struct list_head	list;
+	struct list_head	hash_entry;
+	struct timer_rand_state *timer_rand_state;
+	unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+	struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -182,13 +201,67 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern void arch_early_irq_init_work(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+					struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *desc);
+
+#ifndef CONFIG_SPARSE_IRQ
 
+/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
-{
-	return (irq < nr_irqs) ? irq_desc + irq : NULL;
-}
+#ifdef CONFIG_GENERIC_HARDIRQS
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
+
+#define end_for_each_irq_desc()
+#endif
+
+#define desc_chip_ack(irq, descp) desc->chip->ack(irq)
+#define desc_chip_mask(irq, descp) desc->chip->mask(irq)
+#define desc_chip_mask_ack(irq, descp) desc->chip->mask_ack(irq)
+#define desc_chip_unmask(irq, descp) desc->chip->unmask(irq)
+#define desc_chip_eoi(irq, descp) desc->chip->eoi(irq)
+
+#else
+
+void early_irq_init_work(void);
+extern struct list_head sparse_irqs_head;
+#define for_each_irq_desc(irqX, desc)					\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.next), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.next), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.next), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define for_each_irq_desc_reverse(irqX, desc)				\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.prev), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.prev), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.prev), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define end_for_each_irq_desc() rcu_read_unlock()
+
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()]++)
+
+#define desc_chip_ack(irq, descp) desc->chip->ack(irq, descp)
+#define desc_chip_mask(irq, descp) desc->chip->mask(irq, descp)
+#define desc_chip_mask_ack(irq, descp) desc->chip->mask_ack(irq, descp)
+#define desc_chip_unmask(irq, descp) desc->chip->unmask(irq, descp)
+#define desc_chip_eoi(irq, descp) desc->chip->eoi(irq, descp)
+
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
@@ -211,8 +284,12 @@ extern int setup_irq(unsigned int irq, s
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 
+void move_native_irqx(int irq, struct irq_desc **descp);
+void move_masked_irqx(int irq, struct irq_desc **descp);
+#ifndef CONFIG_SPARSE_IRQ
 void move_native_irq(int irq);
 void move_masked_irq(int irq);
+#endif
 
 #else /* CONFIG_GENERIC_PENDING_IRQ */
 
Index: linux-2.6/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.orig/include/linux/kernel_stat.h
+++ linux-2.6/include/linux/kernel_stat.h
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, ksta
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_
 {
 	kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
Index: linux-2.6/kernel/irq/autoprobe.c
===================================================================
--- linux-2.6.orig/kernel/irq/autoprobe.c
+++ linux-2.6/kernel/irq/autoprobe.c
@@ -57,7 +57,7 @@ unsigned long probe_irq_on(void)
 			desc->chip->startup(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/* Wait for longstanding interrupts to trigger. */
 	msleep(20);
@@ -75,7 +75,7 @@ unsigned long probe_irq_on(void)
 				desc->status |= IRQ_PENDING;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/*
 	 * Wait for spurious interrupts to trigger
@@ -99,7 +99,7 @@ unsigned long probe_irq_on(void)
 					mask |= 1 << i;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	return mask;
 }
@@ -135,7 +135,7 @@ unsigned int probe_irq_mask(unsigned lon
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	return mask & val;
@@ -179,7 +179,7 @@ int probe_irq_off(unsigned long val)
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	if (nr_of_irqs > 1)
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -24,9 +24,11 @@
  */
 void dynamic_irq_init(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	/* first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
@@ -223,7 +225,7 @@ static void default_enable(unsigned int
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	desc->chip->unmask(irq);
+	desc_chip_unmask(irq, &desc);
 	desc->status &= ~IRQ_MASKED;
 }
 
@@ -252,7 +254,7 @@ static void default_shutdown(unsigned in
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	desc->chip->mask(irq);
+	desc_chip_mask(irq, &desc);
 	desc->status |= IRQ_MASKED;
 }
 
@@ -282,13 +284,15 @@ void irq_chip_set_defaults(struct irq_ch
 		chip->end = dummy_irq_chip.end;
 }
 
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc **descp, int irq)
 {
+	struct irq_desc *desc = *descp;
+
 	if (desc->chip->mask_ack)
-		desc->chip->mask_ack(irq);
+		desc_chip_mask_ack(irq, descp);
 	else {
-		desc->chip->mask(irq);
-		desc->chip->ack(irq);
+		desc_chip_mask(irq, descp);
+		desc_chip_ack(irq, descp);
 	}
 }
 
@@ -351,7 +355,7 @@ handle_level_irq(unsigned int irq, struc
 	irqreturn_t action_ret;
 
 	spin_lock(&desc->lock);
-	mask_ack_irq(desc, irq);
+	mask_ack_irq(&desc, irq);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -376,7 +380,7 @@ handle_level_irq(unsigned int irq, struc
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
-		desc->chip->unmask(irq);
+		desc_chip_unmask(irq, &desc);
 out_unlock:
 	spin_unlock(&desc->lock);
 }
@@ -413,7 +417,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
 		desc->status |= IRQ_PENDING;
 		if (desc->chip->mask)
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 		goto out;
 	}
 
@@ -428,7 +432,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 out:
-	desc->chip->eoi(irq);
+	desc_chip_eoi(irq, &desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -464,13 +468,13 @@ handle_edge_irq(unsigned int irq, struct
 	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
-		mask_ack_irq(desc, irq);
+		mask_ack_irq(&desc, irq);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
-	desc->chip->ack(irq);
+	desc_chip_ack(irq, &desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -480,7 +484,7 @@ handle_edge_irq(unsigned int irq, struct
 		irqreturn_t action_ret;
 
 		if (unlikely(!action)) {
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 			goto out_unlock;
 		}
 
@@ -492,7 +496,7 @@ handle_edge_irq(unsigned int irq, struct
 		if (unlikely((desc->status &
 			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
 			      (IRQ_PENDING | IRQ_MASKED))) {
-			desc->chip->unmask(irq);
+			desc_chip_unmask(irq, &desc);
 			desc->status &= ~IRQ_MASKED;
 		}
 
@@ -525,14 +529,14 @@ handle_percpu_irq(unsigned int irq, stru
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	if (desc->chip->ack)
-		desc->chip->ack(irq);
+		desc_chip_ack(irq, &desc);
 
 	action_ret = handle_IRQ_event(irq, desc->action);
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
 	if (desc->chip->eoi)
-		desc->chip->eoi(irq);
+		desc_chip_eoi(irq, &desc);
 }
 
 void
@@ -567,8 +571,9 @@ __set_irq_handler(unsigned int irq, irq_
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip)
-			mask_ack_irq(desc, irq);
+		if (desc->chip != &no_irq_chip) {
+			mask_ack_irq(&desc, irq);
+		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -15,9 +15,16 @@
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,299 @@ void handle_bad_irq(unsigned int irq, st
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+void __init __attribute__((weak)) arch_early_irq_init_work(void)
+{
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+	.irq	    = -1U,
+	.status	    = IRQ_DISABLED,
+	.chip	    = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth      = 1,
+	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity   = CPU_MASK_ALL
+#endif
+};
+
+static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+	unsigned long bytes;
+	char *ptr;
+	int node;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	ptr = kzalloc_node(bytes, GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+
+	desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+}
+
+static void free_kstat_irqs(struct irq_desc *desc)
+{
+	kfree(desc->kstat_irqs);
+	desc->kstat_irqs = NULL;
+}
+#endif
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+	desc->irq = irq;
+#ifdef CONFIG_SMP
+	desc->cpu = cpu;
+#endif
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	arch_init_chip_data(desc, cpu);
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *desc)
+{
+	free_kstat_irqs(desc);
+	arch_free_chip_data(desc);
+}
+#endif
+/*
+ * Protect the sparse_irqs_free freelist:
+ */
+static DEFINE_SPINLOCK(sparse_irq_lock);
+LIST_HEAD(sparse_irqs_head);
+
+/*
+ * The sparse irqs are in a hash-table as well, for fast lookup:
+ */
+#define SPARSEIRQHASH_BITS          (13 - 1)
+#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
+#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
+#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))
+
+static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
+
+static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
+	[0 ... NR_IRQS_LEGACY-1] = {
+		.irq	    = -1U,
+		.status	    = IRQ_DISABLED,
+		.chip	    = &no_irq_chip,
+		.handle_irq = handle_bad_irq,
+		.depth	    = 1,
+		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+		.affinity   = CPU_MASK_ALL
+#endif
+	}
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+
+void __init early_irq_init_work(void)
+{
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	/* init_work to init list for sparseirq */
+	for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
+		INIT_LIST_HEAD(sparseirqhash_table + i);
+
+	desc = irq_desc_legacy;
+	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+	for (i = 0; i < legacy_count; i++) {
+		struct list_head *hash_head;
+
+		hash_head = sparseirqhashentry(i);
+		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		list_add_tail(&desc[i].hash_entry, hash_head);
+		list_add_tail(&desc[i].list, &sparse_irqs_head);
+	}
+
+	arch_early_irq_init_work();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+
+	hash_head = sparseirqhashentry(irq);
+
+	/*
+	 * We can walk the hash lockfree, because the hash only
+	 * grows, and we are careful when adding entries to the end:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			return desc;
+	}
+
+	return NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		return desc;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	init_one_irq_desc(irq, desc, cpu);
+
+	/*
+	 * We use RCU's safe list-add method to make
+	 * parallel walking of the hash-list safe:
+	 */
+	list_add_tail_rcu(&desc->hash_entry, hash_head);
+	/*
+	 * Add it to the global list:
+	 */
+	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc_alloc_cpu(irq, -1);
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq && old_desc != desc)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
+		 irq, irq, cpu, node);
+
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
+	list_replace_rcu(&old_desc->list, &desc->list);
+
+	/* free the old one */
+	free_one_irq_desc(old_desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* those all static, do move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -62,17 +362,47 @@ struct irq_desc irq_desc[NR_IRQS] __cach
 	}
 };
 
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_desc[irq];
+
+	return NULL;
+}
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu)
+{
+	return old_desc;
+}
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
  */
+static void ack_badx(unsigned int irq, struct irq_desc **descp)
+{
+	print_irq_desc(irq, *descp);
+	ack_bad_irq(irq);
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_bad ack_badx
+#else
 static void ack_bad(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	print_irq_desc(irq, desc);
-	ack_bad_irq(irq);
+	ack_badx(irq, &desc);
 }
+#endif
 
 /*
  * NOP functions
@@ -81,6 +411,15 @@ static void noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void noopx(unsigned int irq, struct irq_desc **descp)
+{
+	noop(irq);
+}
+#else
+#define noopx noop
+#endif
+
 static unsigned int noop_ret(unsigned int irq)
 {
 	return 0;
@@ -109,9 +448,9 @@ struct irq_chip dummy_irq_chip = {
 	.shutdown	= noop,
 	.enable		= noop,
 	.disable	= noop,
-	.ack		= noop,
-	.mask		= noop,
-	.unmask		= noop,
+	.ack		= noopx,
+	.mask		= noopx,
+	.unmask		= noopx,
 	.end		= noop,
 };
 
@@ -180,7 +519,7 @@ unsigned int __do_IRQ(unsigned int irq)
 		 * No locking required for CPU-local interrupts:
 		 */
 		if (desc->chip->ack)
-			desc->chip->ack(irq);
+			desc_chip_ack(irq, &desc);
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -192,7 +531,7 @@ unsigned int __do_IRQ(unsigned int irq)
 
 	spin_lock(&desc->lock);
 	if (desc->chip->ack)
-		desc->chip->ack(irq);
+		desc_chip_ack(irq, &desc);
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
@@ -261,17 +600,25 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	} end_for_each_irq_desc();
+#endif
 }
 #endif
+
+#ifdef CONFIG_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc->kstat_irqs[cpu];
+}
+#endif
+EXPORT_SYMBOL(kstat_irqs_cpu);
+
Index: linux-2.6/arch/x86/kernel/irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq.c
+++ linux-2.6/arch/x86/kernel/irq.c
@@ -99,25 +99,37 @@ static int show_other_interrupts(struct
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i, j;
 	struct irqaction *action;
 	struct irq_desc *desc;
+	int head = 0;
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc = list_entry(v, struct irq_desc, list);
+	i = desc->irq;
+	if (&desc->list == sparse_irqs_head.next)
+		head = 1;
+#else
+	i = *(loff_t *) v;
 	if (i > nr_irqs)
 		return 0;
 
 	if (i == nr_irqs)
 		return show_other_interrupts(p);
+	if (i == 0)
+		head = 1;
+
+	desc = irq_to_desc(i);
+#endif
 
 	/* print header */
-	if (i == 0) {
+	if (head) {
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	desc = irq_to_desc(i);
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
@@ -148,6 +160,12 @@ int show_interrupts(struct seq_file *p,
 	seq_putc(p, '\n');
 out:
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+#ifdef CONFIG_SPARSE_IRQ
+	if (&desc->list == sparse_irqs_head.prev)
+		show_other_interrupts(p);
+#endif
+
 	return 0;
 }
 
Index: linux-2.6/include/linux/irqnr.h
===================================================================
--- linux-2.6.orig/include/linux/irqnr.h
+++ linux-2.6/include/linux/irqnr.h
@@ -7,18 +7,11 @@
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
+# define end_for_each_irq_desc()
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
-	     irq >= 0; irq--, desc--)
+static inline early_sparse_irq_init_work(void)
+{
+}
 #endif
 
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #endif
Index: linux-2.6/arch/x86/kernel/irq_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_32.c
+++ linux-2.6/arch/x86/kernel/irq_32.c
@@ -254,7 +254,7 @@ void fixup_irqs(cpumask_t map)
 			desc->chip->set_affinity(irq, mask);
 		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 #if 0
 	barrier();
Index: linux-2.6/arch/x86/kernel/irq_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_64.c
+++ linux-2.6/arch/x86/kernel/irq_64.c
@@ -113,7 +113,7 @@ void fixup_irqs(cpumask_t map)
 		}
 
 		if (desc->chip->mask)
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 
 		if (desc->chip->set_affinity)
 			desc->chip->set_affinity(irq, mask);
@@ -121,7 +121,7 @@ void fixup_irqs(cpumask_t map)
 			set_affinity = 0;
 
 		if (desc->chip->unmask)
-			desc->chip->unmask(irq);
+			desc_chip_unmask(irq, &desc);
 
 		spin_unlock(&desc->lock);
 
@@ -129,7 +129,7 @@ void fixup_irqs(cpumask_t map)
 			printk("Broke affinity for irq %i\n", irq);
 		else if (!set_affinity)
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 	/* That doesn't seem sufficient.  Give it 1ms. */
 	local_irq_enable();
Index: linux-2.6/kernel/irq/proc.c
===================================================================
--- linux-2.6.orig/kernel/irq/proc.c
+++ linux-2.6/kernel/irq/proc.c
@@ -243,7 +243,8 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc)
+	for_each_irq_desc(irq, desc) {
 		register_irq_proc(irq, desc);
+	} end_for_each_irq_desc();
 }
 
Index: linux-2.6/kernel/irq/spurious.c
===================================================================
--- linux-2.6.orig/kernel/irq/spurious.c
+++ linux-2.6/kernel/irq/spurious.c
@@ -99,7 +99,7 @@ static int misrouted_irq(int irq)
 
 		if (try_one_irq(i, desc))
 			ok = 1;
-	}
+	} end_for_each_irq_desc();
 	/* So the caller can adjust the irq error counts */
 	return ok;
 }
@@ -122,7 +122,7 @@ static void poll_spurious_irqs(unsigned
 			continue;
 
 		try_one_irq(i, desc);
-	}
+	} end_for_each_irq_desc();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -542,6 +542,15 @@ void __init __weak thread_info_cache_ini
 {
 }
 
+void __init __attribute__((weak)) arch_early_irq_init_work(void)
+{
+}
+
+void __init __attribute__((weak)) early_irq_init_work(void)
+{
+	arch_early_irq_init_work();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -612,6 +621,8 @@ asmlinkage void __init start_kernel(void
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some links before init_ISA_irqs() */
+	early_irq_init_work();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
Index: linux-2.6/arch/x86/include/asm/irq_vectors.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/irq_vectors.h
+++ linux-2.6/arch/x86/include/asm/irq_vectors.h
@@ -101,6 +101,8 @@
 #define LAST_VM86_IRQ		15
 #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
 
+#define NR_IRQS_LEGACY		16
+
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
Index: linux-2.6/arch/x86/kernel/i8259.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/i8259.c
+++ linux-2.6/arch/x86/kernel/i8259.c
@@ -36,12 +36,31 @@ static int i8259A_auto_eoi;
 DEFINE_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
 
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_and_ack_8259A_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	mask_and_ack_8259A(irq);
+}
+static void disable_8259A_irq_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	disable_8259A_irq(irq);
+}
+static void enable_8259A_irq_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	enable_8259A_irq(irq);
+}
+#else
+#define mask_and_ack_8259A_wrapper mask_and_ack_8259A
+#define disable_8259A_irq_wrapper disable_8259A_irq
+#define enable_8259A_irq_wrapper enable_8259A_irq
+#endif
+
 struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
+	.mask		= disable_8259A_irq_wrapper,
 	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
+	.unmask		= enable_8259A_irq_wrapper,
+	.mask_ack	= mask_and_ack_8259A_wrapper,
 };
 
 /*
@@ -348,9 +367,9 @@ void init_8259A(int auto_eoi)
 		 * In AEOI mode we just have to mask the interrupt
 		 * when acking.
 		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
+		i8259A_chip.mask_ack = disable_8259A_irq_wrapper;
 	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
+		i8259A_chip.mask_ack = mask_and_ack_8259A_wrapper;
 
 	udelay(100);		/* wait for 8259A to initialize */
 
Index: linux-2.6/arch/x86/kernel/uv_irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/uv_irq.c
+++ linux-2.6/arch/x86/kernel/uv_irq.c
@@ -18,6 +18,16 @@ static void uv_noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_noop_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	uv_noop(irq);
+}
+
+#else
+#define uv_noop_wrapper uv_noop
+#endif
+
 static unsigned int uv_noop_ret(unsigned int irq)
 {
 	return 0;
@@ -28,16 +38,26 @@ static void uv_ack_apic(unsigned int irq
 	ack_APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_ack_apic_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	uv_ack_apic(irq);
+}
+
+#else
+#define uv_ack_apic_wrapper uv_ack_apic
+#endif
+
 struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
 	.enable		= uv_noop,
 	.disable	= uv_noop,
-	.ack		= uv_noop,
-	.mask		= uv_noop,
-	.unmask		= uv_noop,
-	.eoi		= uv_ack_apic,
+	.ack		= uv_noop_wrapper,
+	.mask		= uv_noop_wrapper,
+	.unmask		= uv_noop_wrapper,
+	.eoi		= uv_ack_apic_wrapper,
 	.end		= uv_noop,
 };
 
Index: linux-2.6/drivers/pci/msi.c
===================================================================
--- linux-2.6.orig/drivers/pci/msi.c
+++ linux-2.6/drivers/pci/msi.c
@@ -103,11 +103,11 @@ static void msix_set_enable(struct pci_d
 	}
 }
 
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(struct irq_desc *desc)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = desc->msi_desc;
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -135,11 +135,11 @@ static void msix_flush_writes(unsigned i
  * Returns 1 if it succeeded in masking the interrupt and 0 if the device
  * doesn't support MSI masking.
  */
-static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
+static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = desc->msi_desc;
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -252,17 +252,36 @@ void write_msi_msg(unsigned int irq, str
 	entry->msg = *msg;
 }
 
-void mask_msi_irq(unsigned int irq)
+void mask_msi_irqx(unsigned int irq, struct irq_desc **descp)
 {
-	msi_set_mask_bits(irq, 1, 1);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = *descp;
+
+	msi_set_mask_bits(desc, 1, 1);
+	msix_flush_writes(desc);
 }
 
+void unmask_msi_irqx(unsigned int irq, struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+
+	msi_set_mask_bits(desc, 1, 0);
+	msix_flush_writes(desc);
+}
+
+#ifndef CONFIG_SPARSE_IRQ
+void mask_msi_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_msi_irqx(irq, &desc);
+}
 void unmask_msi_irq(unsigned int irq)
 {
-	msi_set_mask_bits(irq, 1, 0);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_msi_irqx(irq, &desc);
 }
+#endif
 
 static int msi_free_irqs(struct pci_dev* dev);
 
@@ -303,9 +322,11 @@ static void __pci_restore_msi_state(stru
 	pci_intx_for_msi(dev, 0);
 	msi_set_enable(dev, 0);
 	write_msi_msg(dev->irq, &entry->msg);
-	if (entry->msi_attrib.maskbit)
-		msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask,
+	if (entry->msi_attrib.maskbit) {
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
 				  entry->msi_attrib.masked);
+	}
 
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 	control &= ~PCI_MSI_FLAGS_QSIZE;
@@ -327,8 +348,9 @@ static void __pci_restore_msix_state(str
 	msix_set_enable(dev, 0);
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
+		struct irq_desc *desc = irq_to_desc(entry->irq);
 		write_msi_msg(entry->irq, &entry->msg);
-		msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked);
+		msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
 	}
 
 	BUG_ON(list_empty(&dev->msi_list));
@@ -596,7 +618,8 @@ void pci_msi_shutdown(struct pci_dev* de
 	/* Return the the pci reset with msi irqs unmasked */
 	if (entry->msi_attrib.maskbit) {
 		u32 mask = entry->msi_attrib.maskbits_mask;
-		msi_set_mask_bits(dev->irq, mask, ~mask);
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, mask, ~mask);
 	}
 	if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
 		return;
Index: linux-2.6/include/linux/msi.h
===================================================================
--- linux-2.6.orig/include/linux/msi.h
+++ linux-2.6/include/linux/msi.h
@@ -10,8 +10,16 @@ struct msi_msg {
 };
 
 /* Helper functions */
+#ifdef CONFIG_SPARSE_IRQ
+struct irq_desc;
+extern void mask_msi_irqx(unsigned int irq, struct irq_desc **descp);
+extern void unmask_msi_irqx(unsigned int irq, struct irq_desc **descp);
+#define mask_msi_irq mask_msi_irqx
+#define unmask_msi_irq unmask_msi_irqx
+#else
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
+#endif
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/include/asm/hpet.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/hpet.h
+++ linux-2.6/arch/x86/include/asm/hpet.h
@@ -72,8 +72,15 @@ extern void hpet_disable(void);
 extern unsigned long hpet_readl(unsigned long a);
 extern void force_hpet_resume(void);
 
+#ifdef CONFIG_SPARSE_IRQ
+extern void hpet_msi_unmaskx(unsigned int irq, struct irq_desc **descp);
+extern void hpet_msi_maskx(unsigned int irq, struct irq_desc **descp);
+#define hpet_msi_unmask hpet_msi_unmaskx
+#define hpet_msi_mask hpet_msi_maskx
+#else
 extern void hpet_msi_unmask(unsigned int irq);
 extern void hpet_msi_mask(unsigned int irq);
+#endif
 extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
 extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/kernel/hpet.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/hpet.c
+++ linux-2.6/arch/x86/kernel/hpet.c
@@ -347,7 +347,7 @@ static int hpet_legacy_next_event(unsign
 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
 
-void hpet_msi_unmask(unsigned int irq)
+void hpet_msi_unmaskx(unsigned int irq, struct irq_desc **descp)
 {
 	struct hpet_dev *hdev = get_irq_data(irq);
 	unsigned long cfg;
@@ -358,7 +358,7 @@ void hpet_msi_unmask(unsigned int irq)
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
-void hpet_msi_mask(unsigned int irq)
+void hpet_msi_maskx(unsigned int irq, struct irq_desc **descp)
 {
 	unsigned long cfg;
 	struct hpet_dev *hdev = get_irq_data(irq);
@@ -369,6 +369,21 @@ void hpet_msi_mask(unsigned int irq)
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void hpet_msi_unmask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_unmaskx(irq, &desc);
+}
+void hpet_msi_mask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_maskx(irq, &desc);
+}
+#endif
+
 void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
 {
 	struct hpet_dev *hdev = get_irq_data(irq);
Index: linux-2.6/include/linux/htirq.h
===================================================================
--- linux-2.6.orig/include/linux/htirq.h
+++ linux-2.6/include/linux/htirq.h
@@ -9,8 +9,16 @@ struct ht_irq_msg {
 /* Helper functions.. */
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
+#ifdef CONFIG_SPARSE_IRQ
+struct irq_desc;
+void mask_ht_irqx(unsigned int irq, struct irq_desc **descp);
+void unmask_ht_irqx(unsigned int irq, struct irq_desc **descp);
+#define mask_ht_irq mask_ht_irqx
+#define unmask_ht_irq unmask_ht_irqx
+#else
 void mask_ht_irq(unsigned int irq);
 void unmask_ht_irq(unsigned int irq);
+#endif
 
 /* The arch hook for getting things started */
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
Index: linux-2.6/kernel/irq/migration.c
===================================================================
--- linux-2.6.orig/kernel/irq/migration.c
+++ linux-2.6/kernel/irq/migration.c
@@ -1,9 +1,9 @@
 
 #include <linux/irq.h>
 
-void move_masked_irq(int irq)
+void move_masked_irqx(int irq, struct irq_desc **descp)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = *descp;
 	cpumask_t tmp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -47,9 +47,9 @@ void move_masked_irq(int irq)
 	cpus_clear(desc->pending_mask);
 }
 
-void move_native_irq(int irq)
+void move_native_irqx(int irq, struct irq_desc **descp)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = *descp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -57,8 +57,23 @@ void move_native_irq(int irq)
 	if (unlikely(desc->status & IRQ_DISABLED))
 		return;
 
-	desc->chip->mask(irq);
-	move_masked_irq(irq);
-	desc->chip->unmask(irq);
+	desc_chip_mask(irq, descp);
+	move_masked_irqx(irq, descp);
+	desc_chip_unmask(irq, descp);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void move_masked_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	move_masked_irqx(irq, &desc);
+}
+
+void move_native_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	move_native_irqx(irq, &desc);
+}
+#endif
Index: linux-2.6/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.orig/drivers/pci/intel-iommu.c
+++ linux-2.6/drivers/pci/intel-iommu.c
@@ -751,7 +751,7 @@ const char *dmar_get_fault_reason(u8 fau
 		return fault_reason_strings[fault_reason];
 }
 
-void dmar_msi_unmask(unsigned int irq)
+void dmar_msi_unmaskx(unsigned int irq, struct irq_desc **descp)
 {
 	struct intel_iommu *iommu = get_irq_data(irq);
 	unsigned long flag;
@@ -764,7 +764,7 @@ void dmar_msi_unmask(unsigned int irq)
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
-void dmar_msi_mask(unsigned int irq)
+void dmar_msi_maskx(unsigned int irq, struct irq_desc **descp)
 {
 	unsigned long flag;
 	struct intel_iommu *iommu = get_irq_data(irq);
@@ -777,6 +777,21 @@ void dmar_msi_mask(unsigned int irq)
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void dmar_msi_unmask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_unmaskx(irq, &desc);
+}
+void dmar_msi_mask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_maskx(irq, &desc);
+}
+#endif
+
 void dmar_msi_write(int irq, struct msi_msg *msg)
 {
 	struct intel_iommu *iommu = get_irq_data(irq);
Index: linux-2.6/include/linux/dmar.h
===================================================================
--- linux-2.6.orig/include/linux/dmar.h
+++ linux-2.6/include/linux/dmar.h
@@ -122,8 +122,15 @@ extern const char *dmar_get_fault_reason
 /* Can't use the common MSI interrupt functions
  * since DMAR is not a pci device
  */
+#ifdef CONFIG_SPARSE_IRQ
+extern void dmar_msi_unmaskx(unsigned int irq, struct irq_desc **descp);
+extern void dmar_msi_maskx(unsigned int irq, struct irq_desc **descp);
+#define dmar_msi_unmask dmar_msi_unmaskx
+#define dmar_msi_mask dmar_msi_maskx
+#else
 extern void dmar_msi_unmask(unsigned int irq);
 extern void dmar_msi_mask(unsigned int irq);
+#endif
 extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13  7:01                                       ` [PATCH] sparse_irq aka dyn_irq v13 Yinghai Lu
@ 2008-11-13  9:53                                         ` Ingo Molnar
  2008-11-13 20:06                                           ` Yinghai Lu
  0 siblings, 1 reply; 66+ messages in thread
From: Ingo Molnar @ 2008-11-13  9:53 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton,
	linux-kernel@vger.kernel.org


* Yinghai Lu <yinghai@kernel.org> wrote:

> reduce the #ifdef numbers

> Index: linux-2.6/arch/x86/include/asm/hpet.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/hpet.h
> +++ linux-2.6/arch/x86/include/asm/hpet.h
> @@ -72,8 +72,15 @@ extern void hpet_disable(void);
>  extern unsigned long hpet_readl(unsigned long a);
>  extern void force_hpet_resume(void);
>  
> +#ifdef CONFIG_SPARSE_IRQ
> +extern void hpet_msi_unmaskx(unsigned int irq, struct irq_desc **descp);
> +extern void hpet_msi_maskx(unsigned int irq, struct irq_desc **descp);
> +#define hpet_msi_unmask hpet_msi_unmaskx
> +#define hpet_msi_mask hpet_msi_maskx
> +#else
>  extern void hpet_msi_unmask(unsigned int irq);
>  extern void hpet_msi_mask(unsigned int irq);
> +#endif
>  extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
>  extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);

please use inlines instead of #define's.

> Index: linux-2.6/arch/x86/kernel/hpet.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/hpet.c
> +++ linux-2.6/arch/x86/kernel/hpet.c
> @@ -347,7 +347,7 @@ static int hpet_legacy_next_event(unsign
>  static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
>  static struct hpet_dev	*hpet_devs;
>  
> -void hpet_msi_unmask(unsigned int irq)
> +void hpet_msi_unmaskx(unsigned int irq, struct irq_desc **descp)
>  {
>  	struct hpet_dev *hdev = get_irq_data(irq);
>  	unsigned long cfg;
> @@ -358,7 +358,7 @@ void hpet_msi_unmask(unsigned int irq)
>  	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
>  }
>  
> -void hpet_msi_mask(unsigned int irq)
> +void hpet_msi_maskx(unsigned int irq, struct irq_desc **descp)

please name it hpet_msi_mask_desc() - 'maskx' sounds quirky.

>  {
>  	unsigned long cfg;
>  	struct hpet_dev *hdev = get_irq_data(irq);
> @@ -369,6 +369,21 @@ void hpet_msi_mask(unsigned int irq)
>  	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
>  }
>  
> +#ifndef CONFIG_SPARSE_IRQ
> +void hpet_msi_unmask(unsigned int irq)
> +{
> +	struct irq_desc *desc = irq_to_desc(irq);
> +
> +	hpet_msi_unmaskx(irq, &desc);
> +}
> +void hpet_msi_mask(unsigned int irq)
> +{
> +	struct irq_desc *desc = irq_to_desc(irq);
> +
> +	hpet_msi_maskx(irq, &desc);
> +}
> +#endif
> +
>  void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
>  {
>  	struct hpet_dev *hdev = get_irq_data(irq);

it still looks ugly to me: couldn't we make it completely #ifdef-free, 
by just adding the new API variants?

i.e. leave these present unconditionally:

>  extern void hpet_msi_unmask(unsigned int irq);
>  extern void hpet_msi_mask(unsigned int irq);

and just _add_ these (unconditionally):

> +extern void hpet_msi_unmask_desc(unsigned int irq, struct irq_desc **descp);
> +extern void hpet_msi_mask_desc(unsigned int irq, struct irq_desc **descp);

that gives us zero #ifdefs and much nicer to read patches. Am I 
missing something about why this isn't possible?

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13  9:53                                         ` Ingo Molnar
@ 2008-11-13 20:06                                           ` Yinghai Lu
  0 siblings, 0 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-13 20:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton,
	linux-kernel@vger.kernel.org

Ingo Molnar wrote:

> it still looks ugly to me: couldnt we make it completely #ifdef-free, 
> by just adding the new API variants?
> 
> i.e. leave these present unconditionally:
> 
>>  extern void hpet_msi_unmask(unsigned int irq);
>>  extern void hpet_msi_mask(unsigned int irq);
> 
> and just _add_ these (unconditionally):
> 
>> +extern void hpet_msi_unmask_desc(unsigned int irq, struct irq_desc **descp);
>> +extern void hpet_msi_mask_desc(unsigned int irq, struct irq_desc **descp);
> 
> that gives us zero #ifdefs and much nicer to read patches. Am i 
> missing something why this isnt possible?

in io_apic.c

struct irq_chip hpet_msi_type = {
        .name = "HPET_MSI",
        .unmask = hpet_msi_unmask,
        .mask = hpet_msi_mask,
        .ack = ack_apic_edge,
#ifdef CONFIG_SMP
        .set_affinity = hpet_msi_set_affinity,
#endif
        .retrigger = ioapic_retrigger_irq,
};

so those ack, mask, unmask, eoi fields will have different function prototypes depending on whether sparseirq is enabled or not.

if you want to remove those #ifdefs, we need to go over all irq_chip definitions and make all those fields take struct irq_desc *desc (or **descp) instead of unsigned int irq.

actually we only need to pass the desc struct instead of irq, because we can get desc->irq from it in that case.

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] sparse_irq aka dyn_irq v13
       [not found]                                     ` <20081112120814.GG11352@elte.hu>
  2008-11-13  7:01                                       ` [PATCH] sparse_irq aka dyn_irq v13 Yinghai Lu
@ 2008-11-13 20:16                                       ` Yinghai Lu
  2008-11-13 21:18                                         ` Andrew Morton
  1 sibling, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-13 20:16 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Andrew Morton
  Cc: linux-kernel@vger.kernel.org, Mike Travis

-- adding CC Mike

---------

reduce the #ifdef numbers

YH

---

From: Yinghai Lu <yinghai@kernel.org>
Subject: sparseirq v13

impact: new feature sparseirq

add some kind of hash table as Ingo suggested.
remove dyna_array

when sparse_irq is used (CONFIG_SPARSE_IRQ), use kzalloc_node to get irq_desc, irq_cfg
  use desc->chip_data for x86 to store irq_cfg

if CONFIG_MOVE_IRQ_DESC is set
  make irq_desc follow affinity, aka irq_desc moving etc.
  call move_irq_desc in irq_complete_move()
  need to add (struct irq_desc **descp) to ack_edge/level to make sure desc gets updated
  try to pass desc/cfg around as much as possible to avoid list lookups.
  legacy irq_descs are not moved, because they are allocated via a static array

for logical apic mode, we need to add move_desc_in_progress_in_same_domain, otherwise the desc will not get moved. ==> we could also need two phases to get the irq_desc moved.
	for example: 0xff is the old affinity, and we need to set 0xf first, and then set 0xf0.
	[ or do we need to change the domain definition to "cpus on the same node" ? ]

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so assume the user space program should update /proc/irq/XX/smp_affinity to 03 or 0f at first on boot
or we change irq_default_affinity ?

for physical apic mode it is much simpler
on a 4 socket, 16 core system
irq_desc is moved as expected..
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig                   |   20 +
 arch/x86/include/asm/hpet.h        |    7 
 arch/x86/include/asm/irq_vectors.h |    2 
 arch/x86/kernel/hpet.c             |   19 -
 arch/x86/kernel/i8259.c            |   29 +
 arch/x86/kernel/io_apic.c          |  688 ++++++++++++++++++++++++++-----------
 arch/x86/kernel/irq.c              |   24 +
 arch/x86/kernel/irq_32.c           |    2 
 arch/x86/kernel/irq_64.c           |    6 
 arch/x86/kernel/irqinit_32.c       |    3 
 arch/x86/kernel/irqinit_64.c       |    3 
 arch/x86/kernel/uv_irq.c           |   28 +
 arch/x86/mm/init_32.c              |    3 
 drivers/char/random.c              |   31 +
 drivers/pci/htirq.c                |   38 +-
 drivers/pci/intel-iommu.c          |   19 -
 drivers/pci/intr_remapping.c       |   62 +++
 drivers/pci/msi.c                  |   49 +-
 drivers/xen/events.c               |    9 
 fs/proc/interrupts.c               |   18 
 fs/proc/stat.c                     |   17 
 include/linux/dmar.h               |    7 
 include/linux/htirq.h              |    8 
 include/linux/interrupt.h          |    2 
 include/linux/irq.h                |   85 ++++
 include/linux/irqnr.h              |   15 
 include/linux/kernel_stat.h        |   14 
 include/linux/msi.h                |    8 
 init/main.c                        |   11 
 kernel/irq/autoprobe.c             |   10 
 kernel/irq/chip.c                  |   43 +-
 kernel/irq/handle.c                |  373 +++++++++++++++++++-
 kernel/irq/migration.c             |   29 +
 kernel/irq/proc.c                  |    3 
 kernel/irq/spurious.c              |    4 
 35 files changed, 1377 insertions(+), 312 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -240,6 +240,26 @@ config X86_HAS_BOOT_CPU_ID
 	def_bool y
 	depends on X86_VOYAGER
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on PCI_MSI || HT_IRQ
+	default y
+	help
+	  This enables support for sparse irq, esp for msi/msi-x. the irq
+	  number will be bus/dev/fn + 12bit. You may need if you have lots of
+	  cards supports msi-x installed.
+
+	  If you don't know what to do here, say Y.
+
+config MOVE_IRQ_DESC
+	bool "Move irq desc when changing irq smp_affinity"
+	depends on SPARSE_IRQ && SMP
+	default y
+	help
+	  This enables moving irq_desc to cpu/node that irq will use handled.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -108,93 +108,236 @@ static int __init parse_noapic(char *str
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+	int apic, pin;
+	struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+	struct irq_pin_list *pin;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	pin = kzalloc_node(sizeof(*pin), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+	return pin;
+}
+
 struct irq_cfg {
-	unsigned int irq;
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_MOVE_IRQ_DESC
+	u8 move_desc_in_progress_in_same_domain : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_early_irq_init_work(void)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int count;
+	int i;
+#ifdef CONFIG_SPARSE_IRQ
+	int count_desc = NR_IRQS_LEGACY;
+#else
+	int count_desc = NR_IRQS;
+#endif
+
+	cfg = irq_cfgx;
+	count = ARRAY_SIZE(irq_cfgx);
+
+	BUG_ON(count > count_desc);
 
+	for (i = 0; i < count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
+
+#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
+
+	return cfg;
 }
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 {
-	return irq_cfg(irq);
+	struct irq_cfg *cfg;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+	return cfg;
 }
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+	cfg = desc->chip_data;
+	if (!cfg)
+		desc->chip_data = get_one_free_irq_cfg(cpu);
+}
 
-struct irq_pin_list {
-	int apic, pin;
-	struct irq_pin_list *next;
-};
+#ifdef CONFIG_MOVE_IRQ_DESC
+
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
+{
+	struct irq_pin_list *old_entry, *tail, *entry;
+
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+	entry = get_one_free_irq_2_pin(cpu);
+	entry->apic = old_entry->apic;
+	entry->pin = old_entry->pin;
+	cfg->irq_2_pin = entry;
+	tail = entry;
+	old_entry = old_entry->next;
 
-static void __init irq_2_pin_init(void)
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		entry->apic = old_entry->apic;
+		entry->pin = old_entry->pin;
+		tail->next = entry;
+		tail = entry;
+		old_entry = old_entry->next;
+	}
+
+	tail->next = NULL;
+}
+
+static void free_irq_2_pin(struct irq_cfg *cfg)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_pin_list *entry, *next;
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	entry = cfg->irq_2_pin;
 
-	irq_2_pin_ptr = &pin[0];
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	cfg->irq_2_pin = NULL;
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	cfg = get_one_free_irq_cfg(cpu);
+	desc->chip_data = cfg;
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *cfg)
+{
+	kfree(cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (cfg) {
+		free_irq_2_pin(cfg);
+		free_irq_cfg(cfg);
+		desc->chip_data = NULL;
+	}
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means that domain is not changed */
+		cpumask_t tmp;
+
+		cpus_and(tmp, desc->affinity, mask);
+		if (cpus_empty(tmp))
+			cfg->move_desc_in_progress_in_same_domain = 1;
+	}
 }
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+#ifndef CONFIG_MOVE_IRQ_DESC
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+}
+#endif
 
 struct io_apic {
 	unsigned int index;
@@ -237,11 +380,10 @@ static inline void io_apic_modify(unsign
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(unsigned int irq, struct irq_cfg *cfg)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	entry = cfg->irq_2_pin;
@@ -323,13 +465,12 @@ static void ioapic_mask_entry(int apic,
 }
 
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
 	int apic, pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
 
-	cfg = irq_cfg(irq);
 	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
@@ -359,7 +500,7 @@ static void __target_IO_APIC_irq(unsigne
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
@@ -373,10 +514,13 @@ static void set_ioapic_affinity_irq(unsi
 	if (cpus_empty(tmp))
 		return;
 
-	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
@@ -384,9 +528,8 @@ static void set_ioapic_affinity_irq(unsi
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
-	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__target_IO_APIC_irq(irq, dest, cfg->vector);
+	__target_IO_APIC_irq(irq, dest, cfg);
 	desc->affinity = mask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
@@ -397,16 +540,13 @@ static void set_ioapic_affinity_irq(unsi
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,20 +561,31 @@ static void add_pin_to_irq(unsigned int
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
 }
 
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int cpu = smp_processor_id();
+
+	/* first time to refer irq_cfg, so with new */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	cfg = desc->chip_data;
+	add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+}
+
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -451,18 +602,16 @@ static void __init replace_pin_at_irq(un
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
 				int mask_and, int mask_or,
 				void (*final)(struct irq_pin_list *entry))
 {
 	int pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	cfg = irq_cfg(irq);
 	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
 		unsigned int reg;
 		pin = entry->pin;
@@ -475,9 +624,9 @@ static inline void io_apic_modify_irq(un
 	}
 }
 
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
 #ifdef CONFIG_X86_64
@@ -492,47 +641,69 @@ void io_apic_sync(struct irq_pin_list *e
 	readl(&io_apic->data);
 }
 
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
 			IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
 			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irqx(unsigned int irq, struct irq_desc **descp)
 {
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
+	BUG_ON(!cfg);
+
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__mask_IO_APIC_irq(irq);
+	__mask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irqx(unsigned int irq, struct irq_desc **descp)
 {
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_IO_APIC_irq(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define mask_IO_APIC_irq mask_IO_APIC_irqx
+#define unmask_IO_APIC_irq unmask_IO_APIC_irqx
+#else
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_IO_APIC_irqx(irq, &desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_IO_APIC_irqx(irq, &desc);
+}
+#endif
+
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
@@ -809,7 +980,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1034,7 +1205,7 @@ void unlock_vector_lock(void)
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1050,16 +1221,13 @@ static int __assign_irq_vector(int irq,
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
 
-	cfg = irq_cfg(irq);
+	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+		return -EBUSY;
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-		return -EBUSY;
-
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_t tmp;
@@ -1113,24 +1281,22 @@ next:
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1148,14 +1314,16 @@ void __setup_vector_irq(int cpu)
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
+	} end_for_each_irq_desc();
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
@@ -1205,7 +1373,8 @@ static void ioapic_register_intr(int irq
 {
 	struct irq_desc *desc;
 
-	desc = irq_to_desc(irq);
+	/* could be first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1310,7 +1479,7 @@ static void setup_IO_APIC_irq(int apic,
 	cfg = irq_cfg(irq);
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1327,12 +1496,12 @@ static void setup_IO_APIC_irq(int apic,
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
 	ioapic_register_intr(irq, trigger);
-	if (irq < 16)
+	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic, pin, entry);
@@ -1434,6 +1603,7 @@ __apicdebuginit(void) print_IO_APIC(void
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1523,8 +1693,10 @@ __apicdebuginit(void) print_IO_APIC(void
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1535,7 +1707,7 @@ __apicdebuginit(void) print_IO_APIC(void
 			entry = entry->next;
 		}
 		printk("\n");
-	}
+	} end_for_each_irq_desc();
 
 	printk(KERN_INFO ".................................... done.\n");
 
@@ -2008,14 +2180,16 @@ static unsigned int startup_ioapic_irq(u
 {
 	int was_pending = 0;
 	unsigned long flags;
+	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_IO_APIC_irq(irq);
+	cfg = irq_cfg(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
@@ -2078,10 +2252,9 @@ static DECLARE_DELAYED_WORK(ir_migration
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void migrate_ioapic_irq(int irq, struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
 	int modify_ioapic_rte;
@@ -2095,18 +2268,19 @@ static void migrate_ioapic_irq(int irq,
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	desc = irq_to_desc(irq);
 	modify_ioapic_rte = desc->status & IRQ_LEVEL;
 	if (modify_ioapic_rte) {
 		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		__target_IO_APIC_irq(irq, dest, cfg);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -2128,14 +2302,14 @@ static void migrate_ioapic_irq(int irq,
 	desc->affinity = mask;
 }
 
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level(int irq, struct irq_desc *desc)
 {
 	int ret = -1;
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
 
-	mask_IO_APIC_irq(irq);
+	mask_IO_APIC_irqx(irq, &desc);
 
-	if (io_apic_level_ack_pending(irq)) {
+	if (io_apic_level_ack_pending(irq, cfg)) {
 		/*
 		 * Interrupt in progress. Migrating irq now will change the
 		 * vector information in the IO-APIC RTE and that will confuse
@@ -2147,14 +2321,15 @@ static int migrate_irq_remapped_level(in
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq(irq, desc->pending_mask);
+	migrate_ioapic_irq(irq, desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
 	cpus_clear(desc->pending_mask);
 
 unmask:
-	unmask_IO_APIC_irq(irq);
+	unmask_IO_APIC_irqx(irq, &desc);
+
 	return ret;
 }
 
@@ -2178,7 +2353,7 @@ static void ir_irq_migration(struct work
 			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
@@ -2191,11 +2366,11 @@ static void set_ir_ioapic_affinity_irq(u
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
 		desc->pending_mask = mask;
-		migrate_irq_remapped_level(irq);
+		migrate_irq_remapped_level(irq, desc);
 		return;
 	}
 
-	migrate_ioapic_irq(irq, mask);
+	migrate_ioapic_irq(irq, desc, mask);
 }
 #endif
 
@@ -2236,19 +2411,40 @@ unlock:
 	irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = *descp;
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_MOVE_IRQ_DESC
+		if (likely(!cfg->move_desc_in_progress_in_same_domain))
+			return;
+
+		/* domain is not change, but affinity is changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_in_progress_in_same_domain = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
@@ -2256,9 +2452,24 @@ static void irq_complete_move(unsigned i
 	}
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
+
 #ifdef CONFIG_INTR_REMAP
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_x2apic_levelx(unsigned int irq, struct irq_desc **descp)
+{
+	ack_x2APIC_irq();
+}
+
+static void ack_x2apic_edgex(unsigned int irq, struct irq_desc **descp)
+{
+	ack_x2APIC_irq();
+}
+
+#define ack_x2apic_level ack_x2apic_levelx
+#define ack_x2apic_edge ack_x2apic_edgex
+#else
 static void ack_x2apic_level(unsigned int irq)
 {
 	ack_x2APIC_irq();
@@ -2270,29 +2481,44 @@ static void ack_x2apic_edge(unsigned int
 }
 #endif
 
-static void ack_apic_edge(unsigned int irq)
+#endif
+
+static void ack_apic_edgex(unsigned int irq, struct irq_desc **descp)
 {
-	irq_complete_move(irq);
-	move_native_irq(irq);
+	irq_complete_move(descp);
+#ifdef CONFIG_SMP
+	move_native_irqx(irq, descp);
+#endif
 	ack_APIC_irq();
 }
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_apic_edge ack_apic_edgex
+#else
+static void ack_apic_edge(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_apic_edgex(irq, &desc);
+}
+#endif
 
 atomic_t irq_mis_count;
 
-static void ack_apic_level(unsigned int irq)
+static void ack_apic_levelx(unsigned int irq, struct irq_desc **descp)
 {
 #ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
 #endif
+	struct irq_cfg *cfg;
 	int do_unmask_irq = 0;
 
-	irq_complete_move(irq);
+	irq_complete_move(descp);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely((*descp)->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
-		mask_IO_APIC_irq(irq);
+		mask_IO_APIC_irqx(irq, descp);
 	}
 #endif
 
@@ -2316,7 +2542,8 @@ static void ack_apic_level(unsigned int
 	* operation to prevent an edge-triggered interrupt escaping meanwhile.
 	* The idea is from Manfred Spraul.  --macro
 	*/
-	i = irq_cfg(irq)->vector;
+	cfg = (*descp)->chip_data;
+	i = cfg->vector;
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2355,22 +2582,37 @@ static void ack_apic_level(unsigned int
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(irq))
-			move_masked_irq(irq);
-		unmask_IO_APIC_irq(irq);
+		cfg = (*descp)->chip_data;
+		if (!io_apic_level_ack_pending(irq, cfg)) {
+# ifdef CONFIG_SMP
+			move_masked_irqx(irq, descp);
+# endif
+		}
+		unmask_IO_APIC_irqx(irq, descp);
 	}
 
 #ifdef CONFIG_X86_32
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		__mask_and_edge_IO_APIC_irq(cfg);
+		__unmask_and_level_IO_APIC_irq(cfg);
 		spin_unlock(&ioapic_lock);
 	}
 #endif
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_apic_level ack_apic_levelx
+#else
+static void ack_apic_level(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_apic_levelx(irq, &desc);
+}
+#endif
+
 static struct irq_chip ioapic_chip __read_mostly = {
 	.name		= "IO-APIC",
 	.startup	= startup_ioapic_irq,
@@ -2416,29 +2658,28 @@ static inline void init_IO_APIC_traps(vo
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < 16)
+			if (irq < NR_IRQS_LEGACY)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
  * The local APIC irq-chip implementation:
  */
 
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irqx(unsigned int irq)
 {
 	unsigned long v;
 
@@ -2446,7 +2687,7 @@ static void mask_lapic_irq(unsigned int
 	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irqx(unsigned int irq)
 {
 	unsigned long v;
 
@@ -2454,11 +2695,30 @@ static void unmask_lapic_irq(unsigned in
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irqx(unsigned int irq)
 {
 	ack_APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_lapic_irq(unsigned int irq, struct irq_desc **descp)
+{
+	mask_lapic_irqx(irq);
+}
+static void unmask_lapic_irq(unsigned int irq, struct irq_desc **descp)
+{
+	unmask_lapic_irqx(irq);
+}
+static void ack_lapic_irq(unsigned int irq, struct irq_desc **descp)
+{
+	ack_lapic_irqx(irq);
+}
+#else
+#define mask_lapic_irq mask_lapic_irqx
+#define unmask_lapic_irq unmask_lapic_irqx
+#define ack_lapic_irq ack_lapic_irqx
+#endif
+
 static struct irq_chip lapic_chip __read_mostly = {
 	.name		= "local-APIC",
 	.mask		= mask_lapic_irq,
@@ -2574,7 +2834,9 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_desc *desc = irq_to_desc(0);
+	struct irq_cfg *cfg = desc->chip_data;
+	int cpu = smp_processor_id();
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2589,7 +2851,7 @@ static inline void __init check_timer(vo
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2640,10 +2902,10 @@ static inline void __init check_timer(vo
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irqx(0, &desc);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
@@ -2669,9 +2931,9 @@ static inline void __init check_timer(vo
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irqx(0, &desc);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2888,22 +3150,27 @@ unsigned int create_irq_nr(unsigned int
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
+	struct irq_cfg *cfg_new = NULL;
+	int cpu;
+	struct irq_desc *desc_new = NULL;
 
+#ifndef CONFIG_SPARSE_IRQ
 	irq_want = nr_irqs - 1;
+#endif
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
+	cpu = smp_processor_id();
 	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -2911,6 +3178,9 @@ unsigned int create_irq_nr(unsigned int
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init cleared it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
@@ -2930,14 +3200,22 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* store it, in case dynamic_irq_cleanup cleared it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2952,12 +3230,12 @@ static int msi_compose_msg(struct pci_de
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3025,10 +3303,13 @@ static void set_msi_irq_affinity(unsigne
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3040,7 +3321,6 @@ static void set_msi_irq_affinity(unsigne
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 
@@ -3064,10 +3344,13 @@ static void ir_set_msi_irq_affinity(unsi
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3091,7 +3374,6 @@ static void ir_set_msi_irq_affinity(unsi
 		cfg->move_in_progress = 0;
 	}
 
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif
@@ -3176,7 +3458,7 @@ static int setup_msi_irq(struct pci_dev
 #endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
-	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for MSI/MSI-X\n", irq, irq);
 
 	return 0;
 }
@@ -3185,6 +3467,7 @@ static unsigned int build_irq_for_pci_de
 {
 	unsigned int irq;
 
+	/* use 8bits (bus) + 8bits (devfn) + 12 bits */
 	irq = dev->bus->number;
 	irq <<= 8;
 	irq |= dev->devfn;
@@ -3199,7 +3482,7 @@ int arch_setup_msi_irq(struct pci_dev *d
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
@@ -3240,7 +3523,8 @@ int arch_setup_msi_irqs(struct pci_dev *
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	/* count down from the top (0xfff) of the 12-bit index range */
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 	sub_handle = 0;
 	list_for_each_entry(desc, &dev->msi_list, list) {
 		irq = create_irq_nr(irq_want--);
@@ -3306,10 +3590,13 @@ static void dmar_msi_set_affinity(unsign
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3321,7 +3608,6 @@ static void dmar_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -3367,10 +3653,13 @@ static void hpet_msi_set_affinity(unsign
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3382,7 +3671,6 @@ static void hpet_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -3448,15 +3736,17 @@ static void set_ht_irq_affinity(unsigned
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
 #endif
@@ -3478,13 +3768,13 @@ int arch_setup_ht_irq(unsigned int irq,
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3508,7 +3798,8 @@ int arch_setup_ht_irq(unsigned int irq,
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
 
-		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+		dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for HT\n",
+				 irq, irq);
 	}
 	return err;
 }
@@ -3530,7 +3821,9 @@ int arch_enable_uv_irq(char *irq_name, u
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3539,8 +3832,6 @@ int arch_enable_uv_irq(char *irq_name, u
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3594,6 +3885,7 @@ int __init io_apic_get_redir_entries (in
 
 int __init probe_nr_irqs(void)
 {
+#ifdef CONFIG_SPARSE_IRQ
 	int idx;
 	int nr = 0;
 #ifndef CONFIG_XEN
@@ -3611,10 +3903,11 @@ int __init probe_nr_irqs(void)
 	/* something wrong ? */
 	if (nr < nr_min)
 		nr = nr_min;
-	if (WARN_ON(nr > NR_IRQS))
-		nr = NR_IRQS;
 
 	return nr;
+#else
+	return NR_IRQS;
+#endif
 }
 
 /* --------------------------------------------------------------------------
@@ -3722,7 +4015,7 @@ int io_apic_set_pci_routing (int ioapic,
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
+	if (irq >= NR_IRQS_LEGACY)
 		add_pin_to_irq(irq, ioapic, pin);
 
 	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
@@ -3852,7 +4145,6 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
Index: linux-2.6/arch/x86/kernel/irqinit_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_32.c
+++ linux-2.6/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_64.c
+++ linux-2.6/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c
+++ linux-2.6/arch/x86/mm/init_32.c
@@ -66,6 +66,7 @@ static unsigned long __meminitdata table
 static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
+int after_bootmem;
 
 static __init void *alloc_low_page(unsigned long *phys)
 {
@@ -987,6 +988,8 @@ void __init mem_init(void)
 
 	set_highmem_pages_init();
 
+	after_bootmem = 1;
+
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
Index: linux-2.6/drivers/char/random.c
===================================================================
--- linux-2.6.orig/drivers/char/random.c
+++ linux-2.6/drivers/char/random.c
@@ -558,6 +558,8 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
+#ifndef CONFIG_SPARSE_IRQ
+
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 
 static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
@@ -576,6 +578,33 @@ static void set_timer_rand_state(unsigne
 	irq_timer_state[irq] = state;
 }
 
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
 static struct timer_rand_state input_timer_state;
 
 /*
@@ -933,8 +962,10 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
+#ifndef CONFIG_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
+#endif
 
 	state = get_timer_rand_state(irq);
 
Index: linux-2.6/drivers/pci/htirq.c
===================================================================
--- linux-2.6.orig/drivers/pci/htirq.c
+++ linux-2.6/drivers/pci/htirq.c
@@ -58,7 +58,7 @@ void fetch_ht_irq_msg(unsigned int irq,
 	*msg = cfg->msg;
 }
 
-void mask_ht_irq(unsigned int irq)
+void mask_ht_irqx(unsigned int irq, struct irq_desc **descp)
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
@@ -70,7 +70,7 @@ void mask_ht_irq(unsigned int irq)
 	write_ht_irq_msg(irq, &msg);
 }
 
-void unmask_ht_irq(unsigned int irq)
+void unmask_ht_irqx(unsigned int irq, struct irq_desc **descp)
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
@@ -82,6 +82,34 @@ void unmask_ht_irq(unsigned int irq)
 	write_ht_irq_msg(irq, &msg);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void mask_ht_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_ht_irqx(irq, &desc);
+}
+void unmask_ht_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_ht_irqx(irq, &desc);
+}
+#endif
+
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	/* use 8bits (bus) + 8bits (devfn) + 12 bits */
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -98,6 +126,7 @@ int __ht_create_irq(struct pci_dev *dev,
 	int max_irq;
 	int pos;
 	int irq;
+	unsigned int irq_want;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -125,7 +154,12 @@ int __ht_create_irq(struct pci_dev *dev,
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
+	irq_want = build_irq_for_pci_dev(dev);
+#ifdef CONFIG_SPARSE_IRQ
+	irq = create_irq_nr(irq_want + idx);
+#else
 	irq = create_irq();
+#endif
 
 	if (irq <= 0) {
 		kfree(cfg);
Index: linux-2.6/drivers/pci/intr_remapping.c
===================================================================
--- linux-2.6.orig/drivers/pci/intr_remapping.c
+++ linux-2.6/drivers/pci/intr_remapping.c
@@ -19,17 +19,73 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+	struct irq_2_iommu *iommu;
+	int node;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	iommu = kzalloc_node(sizeof(*iommu), GFP_KERNEL, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+	return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (WARN_ON_ONCE(!desc))
+		return NULL;
+
+	return desc->irq_2_iommu;
 }
 
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	/*
+	 * alloc irq desc if not allocated already.
+	 */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu_alloc_cpu(irq, -1);
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +142,11 @@ int alloc_irte(struct intel_iommu *iommu
 	if (!count)
 		return -1;
 
+#ifndef CONFIG_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
+#endif
 
 	/*
 	 * start the IRTE search from index 0.
Index: linux-2.6/drivers/xen/events.c
===================================================================
--- linux-2.6.orig/drivers/xen/events.c
+++ linux-2.6/drivers/xen/events.c
@@ -141,8 +141,9 @@ static void init_evtchn_cpu_bindings(voi
 	int i;
 
 	/* By default all event channels notify CPU#0. */
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		desc->affinity = cpumask_of_cpu(0);
+	} end_for_each_irq_desc();
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +232,7 @@ static int find_unbound_irq(void)
 	int irq;
 
 	/* Only allocate from dynirq range */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		if (irq_bindcount[irq] == 0)
 			break;
 
@@ -792,7 +793,7 @@ void xen_irq_resume(void)
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +825,7 @@ void __init xen_init_IRQ(void)
 		mask_evtchn(i);
 
 	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-	for_each_irq_nr(i)
+	for (i = 0; i < nr_irqs; i++)
 		irq_bindcount[i] = 0;
 
 	irq_ctx_init(smp_processor_id());
Index: linux-2.6/fs/proc/stat.c
===================================================================
--- linux-2.6.orig/fs/proc/stat.c
+++ linux-2.6/fs/proc/stat.c
@@ -27,6 +27,9 @@ static int show_stat(struct seq_file *p,
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -44,10 +47,9 @@ static int show_stat(struct seq_file *p,
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_desc(j, desc) {
 			sum += kstat_irqs_cpu(j, i);
-
+		} end_for_each_irq_desc();
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -90,14 +92,17 @@ static int show_stat(struct seq_file *p,
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
+#ifdef CONFIG_SPARSE_IRQ
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
+#else
 		seq_printf(p, " %u", per_irq_sum);
-	}
+#endif
+	} end_for_each_irq_desc();
 
 	seq_printf(p,
 		"\nctxt %llu\n"
Index: linux-2.6/fs/proc/interrupts.c
===================================================================
--- linux-2.6.orig/fs/proc/interrupts.c
+++ linux-2.6/fs/proc/interrupts.c
@@ -8,6 +8,23 @@
 /*
  * /proc/interrupts
  */
+#ifdef CONFIG_SPARSE_IRQ
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+	rcu_read_lock();
+	return seq_list_start(&sparse_irqs_head, *pos);
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &sparse_irqs_head, pos);
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+	rcu_read_unlock();
+}
+#else
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
 	return (*pos <= nr_irqs) ? pos : NULL;
@@ -25,6 +42,7 @@ static void int_seq_stop(struct seq_file
 {
 	/* Nothing to do */
 }
+#endif
 
 static const struct seq_operations int_seq_ops = {
 	.start = int_seq_start,
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h
+++ linux-2.6/include/linux/interrupt.h
@@ -18,6 +18,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -106,11 +106,19 @@ struct irq_chip {
 	void		(*enable)(unsigned int irq);
 	void		(*disable)(unsigned int irq);
 
+#ifdef CONFIG_SPARSE_IRQ
+	void		(*ack)(unsigned int irq, struct irq_desc **descp);
+	void		(*mask)(unsigned int irq, struct irq_desc **descp);
+	void		(*mask_ack)(unsigned int irq, struct irq_desc **descp);
+	void		(*unmask)(unsigned int irq, struct irq_desc **descp);
+	void		(*eoi)(unsigned int irq, struct irq_desc **descp);
+#else
 	void		(*ack)(unsigned int irq);
 	void		(*mask)(unsigned int irq);
 	void		(*mask_ack)(unsigned int irq);
 	void		(*unmask)(unsigned int irq);
 	void		(*eoi)(unsigned int irq);
+#endif
 
 	void		(*end)(unsigned int irq);
 	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
@@ -129,6 +137,8 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +165,15 @@ struct irq_chip {
  */
 struct irq_desc {
 	unsigned int		irq;
+#ifdef CONFIG_SPARSE_IRQ
+	struct list_head	list;
+	struct list_head	hash_entry;
+	struct timer_rand_state *timer_rand_state;
+	unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+	struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -182,13 +201,67 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern void arch_early_irq_init_work(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+					struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *desc);
+
+#ifndef CONFIG_SPARSE_IRQ
 
+/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
-{
-	return (irq < nr_irqs) ? irq_desc + irq : NULL;
-}
+#ifdef CONFIG_GENERIC_HARDIRQS
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
+
+#define end_for_each_irq_desc()
+#endif
+
+#define desc_chip_ack(irq, descp) desc->chip->ack(irq)
+#define desc_chip_mask(irq, descp) desc->chip->mask(irq)
+#define desc_chip_mask_ack(irq, descp) desc->chip->mask_ack(irq)
+#define desc_chip_unmask(irq, descp) desc->chip->unmask(irq)
+#define desc_chip_eoi(irq, descp) desc->chip->eoi(irq)
+
+#else
+
+void early_irq_init_work(void);
+extern struct list_head sparse_irqs_head;
+#define for_each_irq_desc(irqX, desc)					\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.next), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.next), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.next), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define for_each_irq_desc_reverse(irqX, desc)				\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.prev), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.prev), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.prev), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define end_for_each_irq_desc() rcu_read_unlock()
+
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()]++)
+
+#define desc_chip_ack(irq, descp) desc->chip->ack(irq, descp)
+#define desc_chip_mask(irq, descp) desc->chip->mask(irq, descp)
+#define desc_chip_mask_ack(irq, descp) desc->chip->mask_ack(irq, descp)
+#define desc_chip_unmask(irq, descp) desc->chip->unmask(irq, descp)
+#define desc_chip_eoi(irq, descp) desc->chip->eoi(irq, descp)
+
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
@@ -211,8 +284,12 @@ extern int setup_irq(unsigned int irq, s
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 
+void move_native_irqx(int irq, struct irq_desc **descp);
+void move_masked_irqx(int irq, struct irq_desc **descp);
+#ifndef CONFIG_SPARSE_IRQ
 void move_native_irq(int irq);
 void move_masked_irq(int irq);
+#endif
 
 #else /* CONFIG_GENERIC_PENDING_IRQ */
 
Index: linux-2.6/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.orig/include/linux/kernel_stat.h
+++ linux-2.6/include/linux/kernel_stat.h
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, ksta
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_
 {
 	kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
Index: linux-2.6/kernel/irq/autoprobe.c
===================================================================
--- linux-2.6.orig/kernel/irq/autoprobe.c
+++ linux-2.6/kernel/irq/autoprobe.c
@@ -57,7 +57,7 @@ unsigned long probe_irq_on(void)
 			desc->chip->startup(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/* Wait for longstanding interrupts to trigger. */
 	msleep(20);
@@ -75,7 +75,7 @@ unsigned long probe_irq_on(void)
 				desc->status |= IRQ_PENDING;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/*
 	 * Wait for spurious interrupts to trigger
@@ -99,7 +99,7 @@ unsigned long probe_irq_on(void)
 					mask |= 1 << i;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	return mask;
 }
@@ -135,7 +135,7 @@ unsigned int probe_irq_mask(unsigned lon
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	return mask & val;
@@ -179,7 +179,7 @@ int probe_irq_off(unsigned long val)
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	if (nr_of_irqs > 1)
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -24,9 +24,11 @@
  */
 void dynamic_irq_init(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	/* first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
@@ -223,7 +225,7 @@ static void default_enable(unsigned int
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	desc->chip->unmask(irq);
+	desc_chip_unmask(irq, &desc);
 	desc->status &= ~IRQ_MASKED;
 }
 
@@ -252,7 +254,7 @@ static void default_shutdown(unsigned in
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	desc->chip->mask(irq);
+	desc_chip_mask(irq, &desc);
 	desc->status |= IRQ_MASKED;
 }
 
@@ -282,13 +284,15 @@ void irq_chip_set_defaults(struct irq_ch
 		chip->end = dummy_irq_chip.end;
 }
 
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc **descp, int irq)
 {
+	struct irq_desc *desc = *descp;
+
 	if (desc->chip->mask_ack)
-		desc->chip->mask_ack(irq);
+		desc_chip_mask_ack(irq, descp);
 	else {
-		desc->chip->mask(irq);
-		desc->chip->ack(irq);
+		desc_chip_mask(irq, descp);
+		desc_chip_ack(irq, descp);
 	}
 }
 
@@ -351,7 +355,7 @@ handle_level_irq(unsigned int irq, struc
 	irqreturn_t action_ret;
 
 	spin_lock(&desc->lock);
-	mask_ack_irq(desc, irq);
+	mask_ack_irq(&desc, irq);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -376,7 +380,7 @@ handle_level_irq(unsigned int irq, struc
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
-		desc->chip->unmask(irq);
+		desc_chip_unmask(irq, &desc);
 out_unlock:
 	spin_unlock(&desc->lock);
 }
@@ -413,7 +417,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
 		desc->status |= IRQ_PENDING;
 		if (desc->chip->mask)
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 		goto out;
 	}
 
@@ -428,7 +432,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 out:
-	desc->chip->eoi(irq);
+	desc_chip_eoi(irq, &desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -464,13 +468,13 @@ handle_edge_irq(unsigned int irq, struct
 	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
-		mask_ack_irq(desc, irq);
+		mask_ack_irq(&desc, irq);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
-	desc->chip->ack(irq);
+	desc_chip_ack(irq, &desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -480,7 +484,7 @@ handle_edge_irq(unsigned int irq, struct
 		irqreturn_t action_ret;
 
 		if (unlikely(!action)) {
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 			goto out_unlock;
 		}
 
@@ -492,7 +496,7 @@ handle_edge_irq(unsigned int irq, struct
 		if (unlikely((desc->status &
 			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
 			      (IRQ_PENDING | IRQ_MASKED))) {
-			desc->chip->unmask(irq);
+			desc_chip_unmask(irq, &desc);
 			desc->status &= ~IRQ_MASKED;
 		}
 
@@ -525,14 +529,14 @@ handle_percpu_irq(unsigned int irq, stru
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	if (desc->chip->ack)
-		desc->chip->ack(irq);
+		desc_chip_ack(irq, &desc);
 
 	action_ret = handle_IRQ_event(irq, desc->action);
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
 	if (desc->chip->eoi)
-		desc->chip->eoi(irq);
+		desc_chip_eoi(irq, &desc);
 }
 
 void
@@ -567,8 +571,9 @@ __set_irq_handler(unsigned int irq, irq_
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip)
-			mask_ack_irq(desc, irq);
+		if (desc->chip != &no_irq_chip) {
+			mask_ack_irq(&desc, irq);
+		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -15,9 +15,16 @@
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,299 @@ void handle_bad_irq(unsigned int irq, st
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+void __init __attribute__((weak)) arch_early_irq_init_work(void)
+{
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+	.irq	    = -1U,
+	.status	    = IRQ_DISABLED,
+	.chip	    = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth      = 1,
+	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity   = CPU_MASK_ALL
+#endif
+};
+
+static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+	unsigned long bytes;
+	char *ptr;
+	int node;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	ptr = kzalloc_node(bytes, GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+
+	desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+}
+
+static void free_kstat_irqs(struct irq_desc *desc)
+{
+	kfree(desc->kstat_irqs);
+	desc->kstat_irqs = NULL;
+}
+#endif
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+	desc->irq = irq;
+#ifdef CONFIG_SMP
+	desc->cpu = cpu;
+#endif
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	arch_init_chip_data(desc, cpu);
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *desc)
+{
+	free_kstat_irqs(desc);
+	arch_free_chip_data(desc);
+}
+#endif
+/*
+ * Protect the sparse_irqs_free freelist:
+ */
+static DEFINE_SPINLOCK(sparse_irq_lock);
+LIST_HEAD(sparse_irqs_head);
+
+/*
+ * The sparse irqs are in a hash-table as well, for fast lookup:
+ */
+#define SPARSEIRQHASH_BITS          (13 - 1)
+#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
+#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
+#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))
+
+static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
+
+static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
+	[0 ... NR_IRQS_LEGACY-1] = {
+		.irq	    = -1U,
+		.status	    = IRQ_DISABLED,
+		.chip	    = &no_irq_chip,
+		.handle_irq = handle_bad_irq,
+		.depth	    = 1,
+		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+		.affinity   = CPU_MASK_ALL
+#endif
+	}
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+
+void __init early_irq_init_work(void)
+{
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	/* init_work to init list for sparseirq */
+	for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
+		INIT_LIST_HEAD(sparseirqhash_table + i);
+
+	desc = irq_desc_legacy;
+	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+	for (i = 0; i < legacy_count; i++) {
+		struct list_head *hash_head;
+
+		hash_head = sparseirqhashentry(i);
+		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		list_add_tail(&desc[i].hash_entry, hash_head);
+		list_add_tail(&desc[i].list, &sparse_irqs_head);
+	}
+
+	arch_early_irq_init_work();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+
+	hash_head = sparseirqhashentry(irq);
+
+	/*
+	 * We can walk the hash lockfree, because the hash only
+	 * grows, and we are careful when adding entries to the end:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			return desc;
+	}
+
+	return NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		return desc;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	init_one_irq_desc(irq, desc, cpu);
+
+	/*
+	 * We use RCU's safe list-add method to make
+	 * parallel walking of the hash-list safe:
+	 */
+	list_add_tail_rcu(&desc->hash_entry, hash_head);
+	/*
+	 * Add it to the global list:
+	 */
+	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc_alloc_cpu(irq, -1);
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry)
+		if (desc->irq == irq && old_desc != desc)
+			goto out_unlock;
+
+	if (cpu < 0)
+		cpu = smp_processor_id();
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
+	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
+		 irq, irq, cpu, node);
+
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
+	list_replace_rcu(&old_desc->list, &desc->list);
+
+	/* free the old one */
+	free_one_irq_desc(old_desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* those are all statically allocated, don't move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -62,17 +362,47 @@ struct irq_desc irq_desc[NR_IRQS] __cach
 	}
 };
 
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_desc[irq];
+
+	return NULL;
+}
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu)
+{
+	return old_desc;
+}
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
  */
+static void ack_badx(unsigned int irq, struct irq_desc **descp)
+{
+	print_irq_desc(irq, *descp);
+	ack_bad_irq(irq);
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_bad ack_badx
+#else
 static void ack_bad(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	print_irq_desc(irq, desc);
-	ack_bad_irq(irq);
+	ack_badx(irq, &desc);
 }
+#endif
 
 /*
  * NOP functions
@@ -81,6 +411,15 @@ static void noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void noopx(unsigned int irq, struct irq_desc **descp)
+{
+	noop(irq);
+}
+#else
+#define noopx noop
+#endif
+
 static unsigned int noop_ret(unsigned int irq)
 {
 	return 0;
@@ -109,9 +448,9 @@ struct irq_chip dummy_irq_chip = {
 	.shutdown	= noop,
 	.enable		= noop,
 	.disable	= noop,
-	.ack		= noop,
-	.mask		= noop,
-	.unmask		= noop,
+	.ack		= noopx,
+	.mask		= noopx,
+	.unmask		= noopx,
 	.end		= noop,
 };
 
@@ -180,7 +519,7 @@ unsigned int __do_IRQ(unsigned int irq)
 		 * No locking required for CPU-local interrupts:
 		 */
 		if (desc->chip->ack)
-			desc->chip->ack(irq);
+			desc_chip_ack(irq, &desc);
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -192,7 +531,7 @@ unsigned int __do_IRQ(unsigned int irq)
 
 	spin_lock(&desc->lock);
 	if (desc->chip->ack)
-		desc->chip->ack(irq);
+		desc_chip_ack(irq, &desc);
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
@@ -261,17 +600,25 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	} end_for_each_irq_desc();
+#endif
 }
 #endif
+
+#ifdef CONFIG_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc ? desc->kstat_irqs[cpu] : 0;
+}
+EXPORT_SYMBOL(kstat_irqs_cpu);
+#endif
+
Index: linux-2.6/arch/x86/kernel/irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq.c
+++ linux-2.6/arch/x86/kernel/irq.c
@@ -99,25 +99,37 @@ static int show_other_interrupts(struct
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i, j;
 	struct irqaction *action;
 	struct irq_desc *desc;
+	int head = 0;
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc = list_entry(v, struct irq_desc, list);
+	i = desc->irq;
+	if (&desc->list == sparse_irqs_head.next)
+		head = 1;
+#else
+	i = *(loff_t *) v;
 	if (i > nr_irqs)
 		return 0;
 
 	if (i == nr_irqs)
 		return show_other_interrupts(p);
+	if (i == 0)
+		head = 1;
+
+	desc = irq_to_desc(i);
+#endif
 
 	/* print header */
-	if (i == 0) {
+	if (head) {
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	desc = irq_to_desc(i);
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
@@ -148,6 +160,12 @@ int show_interrupts(struct seq_file *p,
 	seq_putc(p, '\n');
 out:
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+#ifdef CONFIG_SPARSE_IRQ
+	if (&desc->list == sparse_irqs_head.prev)
+		show_other_interrupts(p);
+#endif
+
 	return 0;
 }
 
Index: linux-2.6/include/linux/irqnr.h
===================================================================
--- linux-2.6.orig/include/linux/irqnr.h
+++ linux-2.6/include/linux/irqnr.h
@@ -7,18 +7,11 @@
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
+# define end_for_each_irq_desc()
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
-	     irq >= 0; irq--, desc--)
+static inline void early_sparse_irq_init_work(void)
+{
+}
 #endif
 
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #endif
Index: linux-2.6/arch/x86/kernel/irq_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_32.c
+++ linux-2.6/arch/x86/kernel/irq_32.c
@@ -254,7 +254,7 @@ void fixup_irqs(cpumask_t map)
 			desc->chip->set_affinity(irq, mask);
 		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 #if 0
 	barrier();
Index: linux-2.6/arch/x86/kernel/irq_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_64.c
+++ linux-2.6/arch/x86/kernel/irq_64.c
@@ -113,7 +113,7 @@ void fixup_irqs(cpumask_t map)
 		}
 
 		if (desc->chip->mask)
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 
 		if (desc->chip->set_affinity)
 			desc->chip->set_affinity(irq, mask);
@@ -121,7 +121,7 @@ void fixup_irqs(cpumask_t map)
 			set_affinity = 0;
 
 		if (desc->chip->unmask)
-			desc->chip->unmask(irq);
+			desc_chip_unmask(irq, &desc);
 
 		spin_unlock(&desc->lock);
 
@@ -129,7 +129,7 @@ void fixup_irqs(cpumask_t map)
 			printk("Broke affinity for irq %i\n", irq);
 		else if (!set_affinity)
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 	/* That doesn't seem sufficient.  Give it 1ms. */
 	local_irq_enable();
Index: linux-2.6/kernel/irq/proc.c
===================================================================
--- linux-2.6.orig/kernel/irq/proc.c
+++ linux-2.6/kernel/irq/proc.c
@@ -243,7 +243,8 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc)
+	for_each_irq_desc(irq, desc) {
 		register_irq_proc(irq, desc);
+	} end_for_each_irq_desc();
 }
 
Index: linux-2.6/kernel/irq/spurious.c
===================================================================
--- linux-2.6.orig/kernel/irq/spurious.c
+++ linux-2.6/kernel/irq/spurious.c
@@ -99,7 +99,7 @@ static int misrouted_irq(int irq)
 
 		if (try_one_irq(i, desc))
 			ok = 1;
-	}
+	} end_for_each_irq_desc();
 	/* So the caller can adjust the irq error counts */
 	return ok;
 }
@@ -122,7 +122,7 @@ static void poll_spurious_irqs(unsigned
 			continue;
 
 		try_one_irq(i, desc);
-	}
+	} end_for_each_irq_desc();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -542,6 +542,15 @@ void __init __weak thread_info_cache_ini
 {
 }
 
+void __init __attribute__((weak)) arch_early_irq_init_work(void)
+{
+}
+
+void __init __attribute__((weak)) early_irq_init_work(void)
+{
+	arch_early_irq_init_work();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -612,6 +621,8 @@ asmlinkage void __init start_kernel(void
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some links before init_ISA_irqs() */
+	early_irq_init_work();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
Index: linux-2.6/arch/x86/include/asm/irq_vectors.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/irq_vectors.h
+++ linux-2.6/arch/x86/include/asm/irq_vectors.h
@@ -101,6 +101,8 @@
 #define LAST_VM86_IRQ		15
 #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
 
+#define NR_IRQS_LEGACY		16
+
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
Index: linux-2.6/arch/x86/kernel/i8259.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/i8259.c
+++ linux-2.6/arch/x86/kernel/i8259.c
@@ -36,12 +36,31 @@ static int i8259A_auto_eoi;
 DEFINE_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
 
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_and_ack_8259A_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	mask_and_ack_8259A(irq);
+}
+static void disable_8259A_irq_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	disable_8259A_irq(irq);
+}
+static void enable_8259A_irq_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	enable_8259A_irq(irq);
+}
+#else
+#define mask_and_ack_8259A_wrapper mask_and_ack_8259A
+#define disable_8259A_irq_wrapper disable_8259A_irq
+#define enable_8259A_irq_wrapper enable_8259A_irq
+#endif
+
 struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
+	.mask		= disable_8259A_irq_wrapper,
 	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
+	.unmask		= enable_8259A_irq_wrapper,
+	.mask_ack	= mask_and_ack_8259A_wrapper,
 };
 
 /*
@@ -348,9 +367,9 @@ void init_8259A(int auto_eoi)
 		 * In AEOI mode we just have to mask the interrupt
 		 * when acking.
 		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
+		i8259A_chip.mask_ack = disable_8259A_irq_wrapper;
 	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
+		i8259A_chip.mask_ack = mask_and_ack_8259A_wrapper;
 
 	udelay(100);		/* wait for 8259A to initialize */
 
Index: linux-2.6/arch/x86/kernel/uv_irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/uv_irq.c
+++ linux-2.6/arch/x86/kernel/uv_irq.c
@@ -18,6 +18,16 @@ static void uv_noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_noop_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	uv_noop(irq);
+}
+
+#else
+#define uv_noop_wrapper uv_noop
+#endif
+
 static unsigned int uv_noop_ret(unsigned int irq)
 {
 	return 0;
@@ -28,16 +38,26 @@ static void uv_ack_apic(unsigned int irq
 	ack_APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_ack_apic_wrapper(unsigned int irq, struct irq_desc **descp)
+{
+	uv_ack_apic(irq);
+}
+
+#else
+#define uv_ack_apic_wrapper uv_ack_apic
+#endif
+
 struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
 	.enable		= uv_noop,
 	.disable	= uv_noop,
-	.ack		= uv_noop,
-	.mask		= uv_noop,
-	.unmask		= uv_noop,
-	.eoi		= uv_ack_apic,
+	.ack		= uv_noop_wrapper,
+	.mask		= uv_noop_wrapper,
+	.unmask		= uv_noop_wrapper,
+	.eoi		= uv_ack_apic_wrapper,
 	.end		= uv_noop,
 };
 
Index: linux-2.6/drivers/pci/msi.c
===================================================================
--- linux-2.6.orig/drivers/pci/msi.c
+++ linux-2.6/drivers/pci/msi.c
@@ -103,11 +103,11 @@ static void msix_set_enable(struct pci_d
 	}
 }
 
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(struct irq_desc *desc)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = desc->msi_desc;
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -135,11 +135,11 @@ static void msix_flush_writes(unsigned i
  * Returns 1 if it succeeded in masking the interrupt and 0 if the device
  * doesn't support MSI masking.
  */
-static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
+static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = desc->msi_desc;
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -252,17 +252,36 @@ void write_msi_msg(unsigned int irq, str
 	entry->msg = *msg;
 }
 
-void mask_msi_irq(unsigned int irq)
+void mask_msi_irqx(unsigned int irq, struct irq_desc **descp)
 {
-	msi_set_mask_bits(irq, 1, 1);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = *descp;
+
+	msi_set_mask_bits(desc, 1, 1);
+	msix_flush_writes(desc);
 }
 
+void unmask_msi_irqx(unsigned int irq, struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+
+	msi_set_mask_bits(desc, 1, 0);
+	msix_flush_writes(desc);
+}
+
+#ifndef CONFIG_SPARSE_IRQ
+void mask_msi_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_msi_irqx(irq, &desc);
+}
 void unmask_msi_irq(unsigned int irq)
 {
-	msi_set_mask_bits(irq, 1, 0);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_msi_irqx(irq, &desc);
 }
+#endif
 
 static int msi_free_irqs(struct pci_dev* dev);
 
@@ -303,9 +322,11 @@ static void __pci_restore_msi_state(stru
 	pci_intx_for_msi(dev, 0);
 	msi_set_enable(dev, 0);
 	write_msi_msg(dev->irq, &entry->msg);
-	if (entry->msi_attrib.maskbit)
-		msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask,
+	if (entry->msi_attrib.maskbit) {
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
 				  entry->msi_attrib.masked);
+	}
 
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 	control &= ~PCI_MSI_FLAGS_QSIZE;
@@ -327,8 +348,9 @@ static void __pci_restore_msix_state(str
 	msix_set_enable(dev, 0);
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
+		struct irq_desc *desc = irq_to_desc(entry->irq);
 		write_msi_msg(entry->irq, &entry->msg);
-		msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked);
+		msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
 	}
 
 	BUG_ON(list_empty(&dev->msi_list));
@@ -596,7 +618,8 @@ void pci_msi_shutdown(struct pci_dev* de
 	/* Return the the pci reset with msi irqs unmasked */
 	if (entry->msi_attrib.maskbit) {
 		u32 mask = entry->msi_attrib.maskbits_mask;
-		msi_set_mask_bits(dev->irq, mask, ~mask);
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, mask, ~mask);
 	}
 	if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
 		return;
Index: linux-2.6/include/linux/msi.h
===================================================================
--- linux-2.6.orig/include/linux/msi.h
+++ linux-2.6/include/linux/msi.h
@@ -10,8 +10,16 @@ struct msi_msg {
 };
 
 /* Helper functions */
+#ifdef CONFIG_SPARSE_IRQ
+struct irq_desc;
+extern void mask_msi_irqx(unsigned int irq, struct irq_desc **descp);
+extern void unmask_msi_irqx(unsigned int irq, struct irq_desc **descp);
+#define mask_msi_irq mask_msi_irqx
+#define unmask_msi_irq unmask_msi_irqx
+#else
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
+#endif
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/include/asm/hpet.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/hpet.h
+++ linux-2.6/arch/x86/include/asm/hpet.h
@@ -72,8 +72,15 @@ extern void hpet_disable(void);
 extern unsigned long hpet_readl(unsigned long a);
 extern void force_hpet_resume(void);
 
+#ifdef CONFIG_SPARSE_IRQ
+extern void hpet_msi_unmaskx(unsigned int irq, struct irq_desc **descp);
+extern void hpet_msi_maskx(unsigned int irq, struct irq_desc **descp);
+#define hpet_msi_unmask hpet_msi_unmaskx
+#define hpet_msi_mask hpet_msi_maskx
+#else
 extern void hpet_msi_unmask(unsigned int irq);
 extern void hpet_msi_mask(unsigned int irq);
+#endif
 extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
 extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/kernel/hpet.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/hpet.c
+++ linux-2.6/arch/x86/kernel/hpet.c
@@ -347,7 +347,7 @@ static int hpet_legacy_next_event(unsign
 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
 
-void hpet_msi_unmask(unsigned int irq)
+void hpet_msi_unmaskx(unsigned int irq, struct irq_desc **descp)
 {
 	struct hpet_dev *hdev = get_irq_data(irq);
 	unsigned long cfg;
@@ -358,7 +358,7 @@ void hpet_msi_unmask(unsigned int irq)
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
-void hpet_msi_mask(unsigned int irq)
+void hpet_msi_maskx(unsigned int irq, struct irq_desc **descp)
 {
 	unsigned long cfg;
 	struct hpet_dev *hdev = get_irq_data(irq);
@@ -369,6 +369,21 @@ void hpet_msi_mask(unsigned int irq)
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void hpet_msi_unmask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_unmaskx(irq, &desc);
+}
+void hpet_msi_mask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_maskx(irq, &desc);
+}
+#endif
+
 void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
 {
 	struct hpet_dev *hdev = get_irq_data(irq);
Index: linux-2.6/include/linux/htirq.h
===================================================================
--- linux-2.6.orig/include/linux/htirq.h
+++ linux-2.6/include/linux/htirq.h
@@ -9,8 +9,16 @@ struct ht_irq_msg {
 /* Helper functions.. */
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
+#ifdef CONFIG_SPARSE_IRQ
+struct irq_desc;
+void mask_ht_irqx(unsigned int irq, struct irq_desc **descp);
+void unmask_ht_irqx(unsigned int irq, struct irq_desc **descp);
+#define mask_ht_irq mask_ht_irqx
+#define unmask_ht_irq unmask_ht_irqx
+#else
 void mask_ht_irq(unsigned int irq);
 void unmask_ht_irq(unsigned int irq);
+#endif
 
 /* The arch hook for getting things started */
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
Index: linux-2.6/kernel/irq/migration.c
===================================================================
--- linux-2.6.orig/kernel/irq/migration.c
+++ linux-2.6/kernel/irq/migration.c
@@ -1,9 +1,9 @@
 
 #include <linux/irq.h>
 
-void move_masked_irq(int irq)
+void move_masked_irqx(int irq, struct irq_desc **descp)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = *descp;
 	cpumask_t tmp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -47,9 +47,9 @@ void move_masked_irq(int irq)
 	cpus_clear(desc->pending_mask);
 }
 
-void move_native_irq(int irq)
+void move_native_irqx(int irq, struct irq_desc **descp)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = *descp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -57,8 +57,23 @@ void move_native_irq(int irq)
 	if (unlikely(desc->status & IRQ_DISABLED))
 		return;
 
-	desc->chip->mask(irq);
-	move_masked_irq(irq);
-	desc->chip->unmask(irq);
+	desc_chip_mask(irq, descp);
+	move_masked_irqx(irq, descp);
+	desc_chip_unmask(irq, descp);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void move_masked_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	move_masked_irqx(irq, &desc);
+}
+
+void move_native_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	move_native_irqx(irq, &desc);
+}
+#endif
Index: linux-2.6/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.orig/drivers/pci/intel-iommu.c
+++ linux-2.6/drivers/pci/intel-iommu.c
@@ -751,7 +751,7 @@ const char *dmar_get_fault_reason(u8 fau
 		return fault_reason_strings[fault_reason];
 }
 
-void dmar_msi_unmask(unsigned int irq)
+void dmar_msi_unmaskx(unsigned int irq, struct irq_desc **descp)
 {
 	struct intel_iommu *iommu = get_irq_data(irq);
 	unsigned long flag;
@@ -764,7 +764,7 @@ void dmar_msi_unmask(unsigned int irq)
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
-void dmar_msi_mask(unsigned int irq)
+void dmar_msi_maskx(unsigned int irq, struct irq_desc **descp)
 {
 	unsigned long flag;
 	struct intel_iommu *iommu = get_irq_data(irq);
@@ -777,6 +777,21 @@ void dmar_msi_mask(unsigned int irq)
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void dmar_msi_unmask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_unmaskx(irq, &desc);
+}
+void dmar_msi_mask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_maskx(irq, &desc);
+}
+#endif
+
 void dmar_msi_write(int irq, struct msi_msg *msg)
 {
 	struct intel_iommu *iommu = get_irq_data(irq);
Index: linux-2.6/include/linux/dmar.h
===================================================================
--- linux-2.6.orig/include/linux/dmar.h
+++ linux-2.6/include/linux/dmar.h
@@ -122,8 +122,15 @@ extern const char *dmar_get_fault_reason
 /* Can't use the common MSI interrupt functions
  * since DMAR is not a pci device
  */
+#ifdef CONFIG_SPARSE_IRQ
+extern void dmar_msi_unmaskx(unsigned int irq, struct irq_desc **descp);
+extern void dmar_msi_maskx(unsigned int irq, struct irq_desc **descp);
+#define dmar_msi_unmask dmar_msi_unmaskx
+#define dmar_msi_mask dmar_msi_maskx
+#else
 extern void dmar_msi_unmask(unsigned int irq);
 extern void dmar_msi_mask(unsigned int irq);
+#endif
 extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 20:16                                       ` Yinghai Lu
@ 2008-11-13 21:18                                         ` Andrew Morton
  2008-11-13 21:21                                           ` Ingo Molnar
                                                             ` (3 more replies)
  0 siblings, 4 replies; 66+ messages in thread
From: Andrew Morton @ 2008-11-13 21:18 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: mingo, tglx, hpa, linux-kernel, travis

On Thu, 13 Nov 2008 12:16:56 -0800
Yinghai Lu <yinghai@kernel.org> wrote:

> From: Yinghai Lu <yinghai@kernel.org>
> Subject: sparseirq v13

My overall view on this is that it takes some of the kernel's most
fragile and most problem-dense code and makes it much more complex, and
by adding new configuration options it significantly worsens our
testing coverage.

The patch is HHHHHUUUUUUUUUUGGGGGEEE!  Did it really need to be a
single megapatch?

Other architectures want (or have) sparse interrupts.  Are those guys
paying attention here?

I don't have a clue what all this does.  I hope those who will work on
this code are sufficiently familiar with it all to be able to maintain
it when there are close to zero comments in some of our most tricky and
problem-prone code.

>
> ...
>
> +config SPARSE_IRQ
> +	bool "Support sparse irq numbering"
> +	depends on PCI_MSI || HT_IRQ
> +	default y
> +	help
> +	  This enables support for sparse irq, esp for msi/msi-x. the irq
> +	  number will be bus/dev/fn + 12bit. You may need if you have lots of
> +	  cards supports msi-x installed.
> +
> +	  If you don't know what to do here, say Y.
> +
> +config MOVE_IRQ_DESC
> +	bool "Move irq desc when changing irq smp_affinity"
> +	depends on SPARSE_IRQ && SMP
> +	default y
> +	help
> +	  This enables moving irq_desc to cpu/node that irq will use handled.
> +
> +	  If you don't know what to do here, say Y.

Do these reeeealy have to exist?  How are users to know which to
choose?  Which option will distros choose and why did we make them have
to decide?

>
> ...
>
> +{
> +	struct irq_pin_list *pin;
> +	int node;
> +
> +	if (cpu < 0)
> +		cpu = smp_processor_id();
> +	node = cpu_to_node(cpu);
> +
> +	pin = kzalloc_node(sizeof(*pin), GFP_KERNEL, node);

It's a bug to call smp_processor_id() from preemptible code and it's a
bug to use GFP_KERNEL in non-preemptible code.  How can this be?

> +	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
> +
> +	return pin;
> +}
>
>
> ...
>
> -static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
> +static struct irq_cfg *get_one_free_irq_cfg(int cpu)
>  {
> -	return irq_cfg(irq);
> +	struct irq_cfg *cfg;
> +	int node;
> +
> +	if (cpu < 0)
> +		cpu = smp_processor_id();
> +	node = cpu_to_node(cpu);
> +
> +	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);

See above.

> +	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
> +
> +	return cfg;
>  }

So all callers of this function must test the return value and if it is
NULL, take appropriate action.

That is something which should have been documented in this function's
interface description.  Only that doesn't exist.

The one caller which I checked (arch_init_copy_chip_data()) fails to
check for this and will oops.

>
> ...
>
> +static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
> +{
> +	struct irq_cfg *cfg = desc->chip_data;
> +
> +	if (!cfg->move_in_progress) {
> +		/* it means that domain is not changed */
> +		cpumask_t tmp;
> +
> +		cpus_and(tmp, desc->affinity, mask);
> +		if (cpus_empty(tmp))
> +			cfg->move_desc_in_progress_in_same_domain = 1;
> +	}

Aren't we trying to avoid on-stack cpumask_t's?

I'd have though that this one could be eliminated via the use of
cpus_intersects()?

>  }
> +#endif
>
> ...
>
>  static struct irq_chip lapic_chip __read_mostly = {
>  	.name		= "local-APIC",
>  	.mask		= mask_lapic_irq,
> @@ -2574,7 +2834,9 @@ int timer_through_8259 __initdata;
>   */
>  static inline void __init check_timer(void)

An inlined __init function makes little sense.

>
> ...
>
> @@ -3306,10 +3590,13 @@ static void dmar_msi_set_affinity(unsign
>  	if (cpus_empty(tmp))
>  		return;

`tmp' is always a bad choice of identifier.

>
> ...
>
> --- linux-2.6.orig/arch/x86/mm/init_32.c
> +++ linux-2.6/arch/x86/mm/init_32.c
> @@ -66,6 +66,7 @@ static unsigned long __meminitdata table
>  static unsigned long __meminitdata table_top;
>  
>  static int __initdata after_init_bootmem;
> +int after_bootmem;

This isn't a very well-chosen identifier for an x86-specific global.

>
> ...
>
> @@ -98,6 +126,7 @@ int __ht_create_irq(struct pci_dev *dev,
>  	int max_irq;
>  	int pos;
>  	int irq;
> +	unsigned int irq_want;
>  
>  	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
>  	if (!pos)
> @@ -125,7 +154,12 @@ int __ht_create_irq(struct pci_dev *dev,
>  	cfg->msg.address_lo = 0xffffffff;
>  	cfg->msg.address_hi = 0xffffffff;
>  
> +	irq_want = build_irq_for_pci_dev(dev);
> +#ifdef CONFIG_SPARSE_IRQ
> +	irq = create_irq_nr(irq_want + idx);
> +#else
>  	irq = create_irq();
> +#endif

irq_want is unused if CONFIG_SPARSE_IRQ=n.

>  	if (irq <= 0) {
>  		kfree(cfg);
> Index: linux-2.6/drivers/pci/intr_remapping.c
> ===================================================================
> --- linux-2.6.orig/drivers/pci/intr_remapping.c
> +++ linux-2.6/drivers/pci/intr_remapping.c
> @@ -19,17 +19,73 @@ struct irq_2_iommu {
>  	u8  irte_mask;
>  };
>  
> -static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
> +#ifdef CONFIG_SPARSE_IRQ
> +static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
> +{
> +	struct irq_2_iommu *iommu;
> +	int node;
> +
> +	if (cpu < 0)
> +		cpu = smp_processor_id();
> +	node = cpu_to_node(cpu);
> +
> +	iommu = kzalloc_node(sizeof(*iommu), GFP_KERNEL, node);

See above.

> +	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
> +
> +	return iommu;
> +}
>  
>
> ...
>
> --- linux-2.6.orig/fs/proc/stat.c
> +++ linux-2.6/fs/proc/stat.c
> @@ -27,6 +27,9 @@ static int show_stat(struct seq_file *p,
>  	u64 sum = 0;
>  	struct timespec boottime;
>  	unsigned int per_irq_sum;
> +#ifdef CONFIG_GENERIC_HARDIRQS
> +	struct irq_desc *desc;
> +#endif
>  
>  	user = nice = system = idle = iowait =
>  		irq = softirq = steal = cputime64_zero;
> @@ -44,10 +47,9 @@ static int show_stat(struct seq_file *p,
>  		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
>  		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
>  		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
> -
> -		for_each_irq_nr(j)
> +		for_each_irq_desc(j, desc) {

This won't compile if CONFIG_GENERIC_HARDIRQS=n, I suspect.

>  			sum += kstat_irqs_cpu(j, i);
> -
> +		} end_for_each_irq_desc();
>  		sum += arch_irq_stat_cpu(i);
>  	}
>  	sum += arch_irq_stat();
> @@ -90,14 +92,17 @@ static int show_stat(struct seq_file *p,
>  	seq_printf(p, "intr %llu", (unsigned long long)sum);
>  
>  	/* sum again ? it could be updated? */
> -	for_each_irq_nr(j) {
> +	for_each_irq_desc(j, desc) {

ditto.

>
> ...
>
> +#define end_for_each_irq_desc()
> +#endif
> +
> +#define desc_chip_ack(irq, descp) desc->chip->ack(irq)
> +#define desc_chip_mask(irq, descp) desc->chip->mask(irq)
> +#define desc_chip_mask_ack(irq, descp) desc->chip->mask_ack(irq)
> +#define desc_chip_unmask(irq, descp) desc->chip->unmask(irq)
> +#define desc_chip_eoi(irq, descp) desc->chip->eoi(irq)

Was it necessary to implement these as macros?

>
> ...
>
> @@ -49,6 +56,299 @@ void handle_bad_irq(unsigned int irq, st
>  int nr_irqs = NR_IRQS;
>  EXPORT_SYMBOL_GPL(nr_irqs);
>  
> +void __init __attribute__((weak)) arch_early_irq_init_work(void)
> +{
> +}
> +
> +#ifdef CONFIG_SPARSE_IRQ
> +static struct irq_desc irq_desc_init = {
> +	.irq	    = -1U,

Plain old `-1' would be better here.  It works in all cases and the
reader doesn't need to go and check that this field really is an
unsigned int and it won't need editing if that field gets changed to
long.

> +	.status	    = IRQ_DISABLED,
> +	.chip	    = &no_irq_chip,
> +	.handle_irq = handle_bad_irq,
> +	.depth      = 1,
> +	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
> +#ifdef CONFIG_SMP
> +	.affinity   = CPU_MASK_ALL
> +#endif
> +};
> +
> +static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
> +{
> +	unsigned long bytes;
> +	char *ptr;
> +	int node;
> +
> +	/* Compute how many bytes we need per irq and allocate them */
> +	bytes = nr * sizeof(unsigned int);
> +
> +	if (cpu < 0)
> +		cpu = smp_processor_id();
> +
> +	node = cpu_to_node(cpu);
> +	ptr = kzalloc_node(bytes, GFP_KERNEL, node);

See above.

> +	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
> +
> +	desc->kstat_irqs = (unsigned int *)ptr;
> +}
> +
>
> ...
>
> +#endif
> +/*
> + * Protect the sparse_irqs_free freelist:
> + */
> +static DEFINE_SPINLOCK(sparse_irq_lock);
> +LIST_HEAD(sparse_irqs_head);

It's strange that the list is global and is accessed from other .c
files, but the lock which protects it is static.

> +/*
> + * The sparse irqs are in a hash-table as well, for fast lookup:
> + */
> +#define SPARSEIRQHASH_BITS          (13 - 1)
> +#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
> +#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
> +#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))

Why implement these via macros?

> +static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
> +
> +static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
> +	[0 ... NR_IRQS_LEGACY-1] = {
> +		.irq	    = -1U,
> +		.status	    = IRQ_DISABLED,
> +		.chip	    = &no_irq_chip,
> +		.handle_irq = handle_bad_irq,
> +		.depth	    = 1,
> +		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
> +#ifdef CONFIG_SMP
> +		.affinity   = CPU_MASK_ALL
> +#endif
> +	}
> +};
> +
> +/* FIXME: use bootmem alloc ...*/
> +static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];

Do these need to be 32-bit?  Maybe they'll fit in 16-bit, dunno.

> +void __init early_irq_init_work(void)

The use of "_work" implies that this function is invoked by
schedule_work().  But it isn't.

> +{
> +	struct irq_desc *desc;
> +	int legacy_count;
> +	int i;
> +
> +	/* init_work to init list for sparseirq */
> +	for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
> +		INIT_LIST_HEAD(sparseirqhash_table + i);
> +
> +	desc = irq_desc_legacy;
> +	legacy_count = ARRAY_SIZE(irq_desc_legacy);
> +
> +	for (i = 0; i < legacy_count; i++) {
> +		struct list_head *hash_head;
> +
> +		hash_head = sparseirqhashentry(i);
> +		desc[i].irq = i;
> +		desc[i].kstat_irqs = kstat_irqs_legacy[i];
> +		list_add_tail(&desc[i].hash_entry, hash_head);
> +		list_add_tail(&desc[i].list, &sparse_irqs_head);
> +	}
> +
> +	arch_early_irq_init_work();
> +}
> +
>
> ...
>
> +struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
> +{
> +	struct irq_desc *desc;
> +	struct list_head *hash_head;
> +	unsigned long flags;
> +	int node;
> +
> +	desc = irq_to_desc(irq);
> +	if (desc)
> +		return desc;
> +
> +	hash_head = sparseirqhashentry(irq);
> +
> +	spin_lock_irqsave(&sparse_irq_lock, flags);
> +
> +	/*
> +	 * We have to do the hash-walk again, to avoid races
> +	 * with another CPU:
> +	 */
> +	list_for_each_entry(desc, hash_head, hash_entry)
> +		if (desc->irq == irq)
> +			goto out_unlock;
> +
> +	if (cpu < 0)
> +		cpu = smp_processor_id();
> +
> +	node = cpu_to_node(cpu);
> +	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);

Oh for gawd's sake.  PLEASE read Documentation/SubmitChecklist. 
Carefully.  We've already discussed this.

You cannot do a GFP_KERNEL allocation under spin_lock_irqsave(). 

> +	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
> +		 irq, irq, cpu, node);
> +	init_one_irq_desc(irq, desc, cpu);
> +
> +	/*
> +	 * We use RCU's safe list-add method to make
> +	 * parallel walking of the hash-list safe:
> +	 */
> +	list_add_tail_rcu(&desc->hash_entry, hash_head);
> +	/*
> +	 * Add it to the global list:
> +	 */
> +	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
> +
> +out_unlock:
> +	spin_unlock_irqrestore(&sparse_irq_lock, flags);
> +
> +	return desc;
> +}
> +
> +struct irq_desc *irq_to_desc_alloc(unsigned int irq)
> +{
> +	return irq_to_desc_alloc_cpu(irq, -1);
> +}
> +
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
> +						int cpu)
> +{
> +	struct irq_desc *desc;
> +	unsigned int irq;
> +	struct list_head *hash_head;
> +	unsigned long flags;
> +	int node;
> +
> +	irq = old_desc->irq;
> +
> +	hash_head = sparseirqhashentry(irq);
> +
> +	spin_lock_irqsave(&sparse_irq_lock, flags);
> +	/*
> +	 * We have to do the hash-walk again, to avoid races
> +	 * with another CPU:
> +	 */
> +	list_for_each_entry(desc, hash_head, hash_entry)
> +		if (desc->irq == irq && old_desc != desc)
> +			goto out_unlock;
> +
> +	if (cpu < 0)
> +		cpu = smp_processor_id();
> +
> +	node = cpu_to_node(cpu);
> +	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);

Ditto.

Also, the return value from the memory allocation attempt is not
checked.

> +	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
> +		 irq, irq, cpu, node);
> +
> +	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
> +
> +	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
> +	list_replace_rcu(&old_desc->list, &desc->list);
> +
> +	/* free the old one */
> +	free_one_irq_desc(old_desc);
> +	kfree(old_desc);
> +
> +out_unlock:
> +	spin_unlock_irqrestore(&sparse_irq_lock, flags);
> +
> +	return desc;
> +}
> +
>
> ...
>
> --- linux-2.6.orig/init/main.c
> +++ linux-2.6/init/main.c
> @@ -542,6 +542,15 @@ void __init __weak thread_info_cache_ini
>  {
>  }
>  
> +void __init __attribute__((weak)) arch_early_irq_init_work(void)
> +{
> +}
> +
> +void __init __attribute__((weak)) early_irq_init_work(void)
> +{
> +	arch_early_irq_init_work();
> +}

Please use __weak

>
> ...
>



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 21:18                                         ` Andrew Morton
@ 2008-11-13 21:21                                           ` Ingo Molnar
  2008-11-13 22:01                                           ` Yinghai Lu
                                                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 66+ messages in thread
From: Ingo Molnar @ 2008-11-13 21:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Yinghai Lu, tglx, hpa, linux-kernel, travis


* Andrew Morton <akpm@linux-foundation.org> wrote:

> The patch is HHHHHUUUUUUUUUUGGGGGEEE!  Did it really need to be a 
> single megapatch?

i asked Yinghai to do that for now, instead of sending a 50-patch 
series 10 times over - so that we can see the overall structure.

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 21:18                                         ` Andrew Morton
  2008-11-13 21:21                                           ` Ingo Molnar
@ 2008-11-13 22:01                                           ` Yinghai Lu
  2008-11-13 22:05                                             ` Ingo Molnar
  2008-11-13 22:13                                             ` Andrew Morton
  2008-11-13 22:19                                           ` [PATCH] sparse_irq aka dyn_irq v13 Paul Mackerras
  2008-11-16 20:58                                           ` Benjamin Herrenschmidt
  3 siblings, 2 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-13 22:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: mingo, tglx, hpa, linux-kernel, travis

On Thu, Nov 13, 2008 at 1:18 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Thu, 13 Nov 2008 12:16:56 -0800
> Yinghai Lu <yinghai@kernel.org> wrote:
>
>> From: Yinghai Lu <yinghai@kernel.org>
>> Subject: sparseirq v13
>
> My overall view on this is that it takes some of the kernel's most
> fragile and most problem-dense code and makes it much more complex, and
> by adding new configuration options it significantly worsens our
> testing coverage.
>
> The patch is HHHHHUUUUUUUUUUGGGGGEEE!  Did it really need to be a
> single megapatch?
>
> Other architectures want (or have) sparse interrupts.  Are those guys
> paying attention here?
>
> I don't have a clue what all this does.  I hope those who will work on
> this code are sufficiently familiar with it all to be able to maintain
> it when there are close to zero comments in some of our most tricky and
> problem-prone code.
>
>>
>> ...
>>
>> +config SPARSE_IRQ
>> +     bool "Support sparse irq numbering"
>> +     depends on PCI_MSI || HT_IRQ
>> +     default y
>> +     help
>> +       This enables support for sparse irq, esp for msi/msi-x. the irq
>> +       number will be bus/dev/fn + 12bit. You may need if you have lots of
>> +       cards supports msi-x installed.
>> +
>> +       If you don't know what to do here, say Y.
>> +
>> +config MOVE_IRQ_DESC
>> +     bool "Move irq desc when changing irq smp_affinity"
>> +     depends on SPARSE_IRQ && SMP
>> +     default y
>> +     help
>> +       This enables moving irq_desc to cpu/node that irq will use handled.
>> +
>> +       If you don't know what to do here, say Y.
>
> Do these reeeealy have to exist?  How are users to know which to
> choose?  Which option will distros choose and why did we make them have
> to decide?

want to use that as marker, so later could split the patch to small
ones by enabling by steps.

>
>>
>> ...
>>
>> +{
>> +     struct irq_pin_list *pin;
>> +     int node;
>> +
>> +     if (cpu < 0)
>> +             cpu = smp_processor_id();
>> +     node = cpu_to_node(cpu);
>> +
>> +     pin = kzalloc_node(sizeof(*pin), GFP_KERNEL, node);
>
> It's a bug to call smp_processor_id() from preemptible code and it's a
> bug to use GFP_KERNEL in non-preemptible code.  How can this be?

the code should only be executed in the boot stage, when only the BSP is running, or
in interrupt context
via irq_complete_move

>
>> +     printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
>> +
>> +     return pin;
>> +}
>>
>>
>> ...
>>
>> -static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
>> +static struct irq_cfg *get_one_free_irq_cfg(int cpu)
>>  {
>> -     return irq_cfg(irq);
>> +     struct irq_cfg *cfg;
>> +     int node;
>> +
>> +     if (cpu < 0)
>> +             cpu = smp_processor_id();
>> +     node = cpu_to_node(cpu);
>> +
>> +     cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
>
> See above.
>
>> +     printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
>> +
>> +     return cfg;
>>  }
>
> So all callers of this function must test the return value and if it is
> NULL, take appropriate action.
>
> That is something which should have been documented in this function's
> interface description.  Only that doesn't exist.
>
> The one caller which I checked (arch_init_copy_chip_data()) fails to
> check for this and will oops.

will add one function to wrap it.

>
>>
>> ...
>>
>> +static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
>> +{
>> +     struct irq_cfg *cfg = desc->chip_data;
>> +
>> +     if (!cfg->move_in_progress) {
>> +             /* it means that domain is not changed */
>> +             cpumask_t tmp;
>> +
>> +             cpus_and(tmp, desc->affinity, mask);
>> +             if (cpus_empty(tmp))
>> +                     cfg->move_desc_in_progress_in_same_domain = 1;
>> +     }
>
> Aren't we trying to avoid on-stack cpumask_t's?
>
> I'd have though that this one could be eliminated via the use of
> cpus_intersects()?

will change to that.

there some other in __assign_irq_vector...

>
>>  }
>> +#endif
>>
>> ...
>>
>>  static struct irq_chip lapic_chip __read_mostly = {
>>       .name           = "local-APIC",
>>       .mask           = mask_lapic_irq,
>> @@ -2574,7 +2834,9 @@ int timer_through_8259 __initdata;
>>   */
>>  static inline void __init check_timer(void)
>
> An inlined __init function makes little sense.

should that be done in another patch...?

>
>>
>> ...
>>
>> @@ -3306,10 +3590,13 @@ static void dmar_msi_set_affinity(unsign
>>       if (cpus_empty(tmp))
>>               return;
>
> `tmp' is always a bad choice of identifier.

other patch?

>
>>
>> ...
>>
>> --- linux-2.6.orig/arch/x86/mm/init_32.c
>> +++ linux-2.6/arch/x86/mm/init_32.c
>> @@ -66,6 +66,7 @@ static unsigned long __meminitdata table
>>  static unsigned long __meminitdata table_top;
>>
>>  static int __initdata after_init_bootmem;
>> +int after_bootmem;
>
> This isn't a very well-chosen identifier for an x86-specific global.

it is not used, will remove that.

>
>>
>> ...
>>
>> @@ -98,6 +126,7 @@ int __ht_create_irq(struct pci_dev *dev,
>>       int max_irq;
>>       int pos;
>>       int irq;
>> +     unsigned int irq_want;
>>
>>       pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
>>       if (!pos)
>> @@ -125,7 +154,12 @@ int __ht_create_irq(struct pci_dev *dev,
>>       cfg->msg.address_lo = 0xffffffff;
>>       cfg->msg.address_hi = 0xffffffff;
>>
>> +     irq_want = build_irq_for_pci_dev(dev);
>> +#ifdef CONFIG_SPARSE_IRQ
>> +     irq = create_irq_nr(irq_want + idx);
>> +#else
>>       irq = create_irq();
>> +#endif
>
> irq_want is unused if CONFIG_SPARSE_IRQ=n.
>
>>       if (irq <= 0) {
>>               kfree(cfg);
>> Index: linux-2.6/drivers/pci/intr_remapping.c
>> ===================================================================
>> --- linux-2.6.orig/drivers/pci/intr_remapping.c
>> +++ linux-2.6/drivers/pci/intr_remapping.c
>> @@ -19,17 +19,73 @@ struct irq_2_iommu {
>>       u8  irte_mask;
>>  };
>>
>> -static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
>> +#ifdef CONFIG_SPARSE_IRQ
>> +static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
>> +{
>> +     struct irq_2_iommu *iommu;
>> +     int node;
>> +
>> +     if (cpu < 0)
>> +             cpu = smp_processor_id();
>> +     node = cpu_to_node(cpu);
>> +
>> +     iommu = kzalloc_node(sizeof(*iommu), GFP_KERNEL, node);
>
> See above.
>
>> +     printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
>> +
>> +     return iommu;
>> +}
>>
>>
>> ...
>>
>> --- linux-2.6.orig/fs/proc/stat.c
>> +++ linux-2.6/fs/proc/stat.c
>> @@ -27,6 +27,9 @@ static int show_stat(struct seq_file *p,
>>       u64 sum = 0;
>>       struct timespec boottime;
>>       unsigned int per_irq_sum;
>> +#ifdef CONFIG_GENERIC_HARDIRQS
>> +     struct irq_desc *desc;
>> +#endif
>>
>>       user = nice = system = idle = iowait =
>>               irq = softirq = steal = cputime64_zero;
>> @@ -44,10 +47,9 @@ static int show_stat(struct seq_file *p,
>>               softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
>>               steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
>>               guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
>> -
>> -             for_each_irq_nr(j)
>> +             for_each_irq_desc(j, desc) {
>
> This won't compile if CONFIG_GENERIC_HARDIRQS=n, I suspect.
>
we have

#ifndef CONFIG_GENERIC_HARDIRQS
#include <asm/irq.h>
# define nr_irqs                NR_IRQS

# define for_each_irq_desc(irq, desc)           \
        for (irq = 0; irq < nr_irqs; irq++)
# define end_for_each_irq_desc()


>>                       sum += kstat_irqs_cpu(j, i);
>> -
>> +             } end_for_each_irq_desc();
>>               sum += arch_irq_stat_cpu(i);
>>       }
>>       sum += arch_irq_stat();
>> @@ -90,14 +92,17 @@ static int show_stat(struct seq_file *p,
>>       seq_printf(p, "intr %llu", (unsigned long long)sum);
>>
>>       /* sum again ? it could be updated? */
>> -     for_each_irq_nr(j) {
>> +     for_each_irq_desc(j, desc) {
>
> ditto.
>
>>
>> ...
>>
>> +#define end_for_each_irq_desc()
>> +#endif
>> +
>> +#define desc_chip_ack(irq, descp) desc->chip->ack(irq)
>> +#define desc_chip_mask(irq, descp) desc->chip->mask(irq)
>> +#define desc_chip_mask_ack(irq, descp) desc->chip->mask_ack(irq)
>> +#define desc_chip_unmask(irq, descp) desc->chip->unmask(irq)
>> +#define desc_chip_eoi(irq, descp) desc->chip->eoi(irq)
>
> Was it necessary to implement these as macros?

trying to avoid a bunch of #ifdefs with different parameters that need to be passed.

>
>>
>> ...
>>
>> @@ -49,6 +56,299 @@ void handle_bad_irq(unsigned int irq, st
>>  int nr_irqs = NR_IRQS;
>>  EXPORT_SYMBOL_GPL(nr_irqs);
>>
>> +void __init __attribute__((weak)) arch_early_irq_init_work(void)
>> +{
>> +}
>> +
>> +#ifdef CONFIG_SPARSE_IRQ
>> +static struct irq_desc irq_desc_init = {
>> +     .irq        = -1U,
>
> Plain old `-1' would be better here.  It works in all cases and the
> reader doesn't need to go and check that this field really is an
> unsigned int and it won't need editing if that field gets changed to
> long.

ok.

>
>> +     .status     = IRQ_DISABLED,
>> +     .chip       = &no_irq_chip,
>> +     .handle_irq = handle_bad_irq,
>> +     .depth      = 1,
>> +     .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
>> +#ifdef CONFIG_SMP
>> +     .affinity   = CPU_MASK_ALL
>> +#endif
>> +};
>> +
>> +static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
>> +{
>> +     unsigned long bytes;
>> +     char *ptr;
>> +     int node;
>> +
>> +     /* Compute how many bytes we need per irq and allocate them */
>> +     bytes = nr * sizeof(unsigned int);
>> +
>> +     if (cpu < 0)
>> +             cpu = smp_processor_id();
>> +
>> +     node = cpu_to_node(cpu);
>> +     ptr = kzalloc_node(bytes, GFP_KERNEL, node);
>
> See above.
>
>> +     printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
>> +
>> +     desc->kstat_irqs = (unsigned int *)ptr;
>> +}
>> +
>>
>> ...
>>
>> +#endif
>> +/*
>> + * Protect the sparse_irqs_free freelist:
>> + */
>> +static DEFINE_SPINLOCK(sparse_irq_lock);
>> +LIST_HEAD(sparse_irqs_head);
>
> It's strange that the list is global and is accessed from other .c
> files, but the lock which protects it is static.

the lock is only taken when appending to the list; additions use RCU list-add-tail.

>
>> +/*
>> + * The sparse irqs are in a hash-table as well, for fast lookup:
>> + */
>> +#define SPARSEIRQHASH_BITS          (13 - 1)
>> +#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
>> +#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
>> +#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))
>
> Why implement these via macros?

copied from sched.c

>
>> +static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
>> +
>> +static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
>> +     [0 ... NR_IRQS_LEGACY-1] = {
>> +             .irq        = -1U,
>> +             .status     = IRQ_DISABLED,
>> +             .chip       = &no_irq_chip,
>> +             .handle_irq = handle_bad_irq,
>> +             .depth      = 1,
>> +             .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
>> +#ifdef CONFIG_SMP
>> +             .affinity   = CPU_MASK_ALL
>> +#endif
>> +     }
>> +};
>> +
>> +/* FIXME: use bootmem alloc ...*/
>> +static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
>
> Do these need to be 32-bit?  Maybe they'll fit in 16-bit, dunno.
>

struct irq_desc {
        unsigned int            irq;
#ifdef CONFIG_SPARSE_IRQ
        struct list_head        list;
        struct list_head        hash_entry;
        struct timer_rand_state *timer_rand_state;
        unsigned int            *kstat_irqs;

>> +void __init early_irq_init_work(void)
>
> The use of "_work" implies that this function is invoked by
> schedule_work().  But it isn't

ok, will remove it.

>
>> +{
>> +     struct irq_desc *desc;
>> +     int legacy_count;
>> +     int i;
>> +
>> +     /* init_work to init list for sparseirq */
>> +     for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
>> +             INIT_LIST_HEAD(sparseirqhash_table + i);
>> +
>> +     desc = irq_desc_legacy;
>> +     legacy_count = ARRAY_SIZE(irq_desc_legacy);
>> +
>> +     for (i = 0; i < legacy_count; i++) {
>> +             struct list_head *hash_head;
>> +
>> +             hash_head = sparseirqhashentry(i);
>> +             desc[i].irq = i;
>> +             desc[i].kstat_irqs = kstat_irqs_legacy[i];
>> +             list_add_tail(&desc[i].hash_entry, hash_head);
>> +             list_add_tail(&desc[i].list, &sparse_irqs_head);
>> +     }
>> +
>> +     arch_early_irq_init_work();
>> +}
>> +
>>
>> ...
>>
>> +struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
>> +{
>> +     struct irq_desc *desc;
>> +     struct list_head *hash_head;
>> +     unsigned long flags;
>> +     int node;
>> +
>> +     desc = irq_to_desc(irq);
>> +     if (desc)
>> +             return desc;
>> +
>> +     hash_head = sparseirqhashentry(irq);
>> +
>> +     spin_lock_irqsave(&sparse_irq_lock, flags);
>> +
>> +     /*
>> +      * We have to do the hash-walk again, to avoid races
>> +      * with another CPU:
>> +      */
>> +     list_for_each_entry(desc, hash_head, hash_entry)
>> +             if (desc->irq == irq)
>> +                     goto out_unlock;
>> +
>> +     if (cpu < 0)
>> +             cpu = smp_processor_id();
>> +
>> +     node = cpu_to_node(cpu);
>> +     desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
>
> Oh for gawd's sake.  PLEASE read Documentation/SubmitChecklist.
> Carefully.  We've already discussed this.
there are 13 errors with checkpatch scripts. seems all about macro definition.

>
> You cannot do a GFP_KERNEL allocation under spin_lock_irqsave().
>
>> +     printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
>> +              irq, irq, cpu, node);
>> +     init_one_irq_desc(irq, desc, cpu);
>> +
>> +     /*
>> +      * We use RCU's safe list-add method to make
>> +      * parallel walking of the hash-list safe:
>> +      */
>> +     list_add_tail_rcu(&desc->hash_entry, hash_head);
>> +     /*
>> +      * Add it to the global list:
>> +      */
>> +     list_add_tail_rcu(&desc->list, &sparse_irqs_head);
>> +
>> +out_unlock:
>> +     spin_unlock_irqrestore(&sparse_irq_lock, flags);
>> +
>> +     return desc;
>> +}
>> +
>> +struct irq_desc *irq_to_desc_alloc(unsigned int irq)
>> +{
>> +     return irq_to_desc_alloc_cpu(irq, -1);
>> +}
>> +
>> +#ifdef CONFIG_MOVE_IRQ_DESC
>> +static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
>> +                                             int cpu)
>> +{
>> +     struct irq_desc *desc;
>> +     unsigned int irq;
>> +     struct list_head *hash_head;
>> +     unsigned long flags;
>> +     int node;
>> +
>> +     irq = old_desc->irq;
>> +
>> +     hash_head = sparseirqhashentry(irq);
>> +
>> +     spin_lock_irqsave(&sparse_irq_lock, flags);
>> +     /*
>> +      * We have to do the hash-walk again, to avoid races
>> +      * with another CPU:
>> +      */
>> +     list_for_each_entry(desc, hash_head, hash_entry)
>> +             if (desc->irq == irq && old_desc != desc)
>> +                     goto out_unlock;
>> +
>> +     if (cpu < 0)
>> +             cpu = smp_processor_id();
>> +
>> +     node = cpu_to_node(cpu);
>> +     desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
>
> Ditto.
>
> Also, the return value from the memory allocation attempt is not
> checked.
>
>> +     printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
>> +              irq, irq, cpu, node);
>> +
>> +     init_copy_one_irq_desc(irq, old_desc, desc, cpu);
>> +
>> +     list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
>> +     list_replace_rcu(&old_desc->list, &desc->list);
>> +
>> +     /* free the old one */
>> +     free_one_irq_desc(old_desc);
>> +     kfree(old_desc);
>> +
>> +out_unlock:
>> +     spin_unlock_irqrestore(&sparse_irq_lock, flags);
>> +
>> +     return desc;
>> +}
>> +
>>
>> ...
>>
>> --- linux-2.6.orig/init/main.c
>> +++ linux-2.6/init/main.c
>> @@ -542,6 +542,15 @@ void __init __weak thread_info_cache_ini
>>  {
>>  }
>>
>> +void __init __attribute__((weak)) arch_early_irq_init_work(void)
>> +{
>> +}
>> +
>> +void __init __attribute__((weak)) early_irq_init_work(void)
>> +{
>> +     arch_early_irq_init_work();
>> +}
>
> Please use __weak

ok

thanks for reviewing.

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 22:01                                           ` Yinghai Lu
@ 2008-11-13 22:05                                             ` Ingo Molnar
  2008-11-13 22:13                                             ` Andrew Morton
  1 sibling, 0 replies; 66+ messages in thread
From: Ingo Molnar @ 2008-11-13 22:05 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: Andrew Morton, tglx, hpa, linux-kernel, travis


* Yinghai Lu <yinghai@kernel.org> wrote:


> >> +     /*
> >> +      * We have to do the hash-walk again, to avoid races
> >> +      * with another CPU:
> >> +      */
> >> +     list_for_each_entry(desc, hash_head, hash_entry)
> >> +             if (desc->irq == irq)
> >> +                     goto out_unlock;

should be:

> >> +     list_for_each_entry(desc, hash_head, hash_entry) {
> >> +             if (desc->irq == irq)
> >> +                     goto out_unlock;
> >> +     }

and:

> >> +     desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
> >
> > Oh for gawd's sake.  PLEASE read Documentation/SubmitChecklist. 
> > Carefully.  We've already discussed this.
> >
> > You cannot do a GFP_KERNEL allocation under spin_lock_irqsave().

yes.

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 22:01                                           ` Yinghai Lu
  2008-11-13 22:05                                             ` Ingo Molnar
@ 2008-11-13 22:13                                             ` Andrew Morton
  2008-11-13 22:41                                               ` Yinghai Lu
  1 sibling, 1 reply; 66+ messages in thread
From: Andrew Morton @ 2008-11-13 22:13 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: mingo, tglx, hpa, linux-kernel, travis

On Thu, 13 Nov 2008 14:01:06 -0800
"Yinghai Lu" <yinghai@kernel.org> wrote:

> On Thu, Nov 13, 2008 at 1:18 PM, Andrew Morton
> <akpm@linux-foundation.org> wrote:
> > On Thu, 13 Nov 2008 12:16:56 -0800
> > Yinghai Lu <yinghai@kernel.org> wrote:
> >
> >> From: Yinghai Lu <yinghai@kernel.org>
> >> Subject: sparseirq v13
> >
> > My overall view on this is that it takes some of the kernel's most
> > fragile and most problem-dense code and makes it much more complex, and
> > by adding new configuration options it significantly worsens our
> > testing coverage.
> >
> > The patch is HHHHHUUUUUUUUUUGGGGGEEE!  Did it really need to be a
> > single megapatch?
> >
> > Other architectures want (or have) sparse interrupts.  Are those guys
> > paying attention here?
> >
> > I don't have a clue what all this does.  I hope those who will work on
> > this code are sufficiently familiar with it all to be able to maintain
> > it when there are close to zero comments in some of our most tricky and
> > problem-prone code.
> >
>
> ...
>
> >> +static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
> >
> > Do these need to be 32-bit?  Maybe they'll fit in 16-bit, dunno.
> >
> 
> struct irq_desc {
>         unsigned int            irq;
> #ifdef CONFIG_SPARSE_IRQ
>         struct list_head        list;
>         struct list_head        hash_entry;
>         struct timer_rand_state *timer_rand_state;
>         unsigned int            *kstat_irqs;

That doesn't address my question.

The above array can be very large.  Can we halve its size by using
16-bit quantities?  Will this code ever encounter IRQ numbers larger
than 65536?

> >> +struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
> >> +{
> >> +     struct irq_desc *desc;
> >> +     struct list_head *hash_head;
> >> +     unsigned long flags;
> >> +     int node;
> >> +
> >> +     desc = irq_to_desc(irq);
> >> +     if (desc)
> >> +             return desc;
> >> +
> >> +     hash_head = sparseirqhashentry(irq);
> >> +
> >> +     spin_lock_irqsave(&sparse_irq_lock, flags);
> >> +
> >> +     /*
> >> +      * We have to do the hash-walk again, to avoid races
> >> +      * with another CPU:
> >> +      */
> >> +     list_for_each_entry(desc, hash_head, hash_entry)
> >> +             if (desc->irq == irq)
> >> +                     goto out_unlock;
> >> +
> >> +     if (cpu < 0)
> >> +             cpu = smp_processor_id();
> >> +
> >> +     node = cpu_to_node(cpu);
> >> +     desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
> >
> > Oh for gawd's sake.  PLEASE read Documentation/SubmitChecklist.
> > Carefully.  We've already discussed this.
> there are 13 errors with checkpatch scripts. seems all about macro definition.

This has nothing to do with checkpatch.  Documentation/SubmitChecklist
covers much more than that.  In particular it describes various steps
which should be taken when runtime testing new code submissions.

> >
> > You cannot do a GFP_KERNEL allocation under spin_lock_irqsave().

Steps which would have detected this bug.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 21:18                                         ` Andrew Morton
  2008-11-13 21:21                                           ` Ingo Molnar
  2008-11-13 22:01                                           ` Yinghai Lu
@ 2008-11-13 22:19                                           ` Paul Mackerras
  2008-11-13 22:23                                             ` David Miller
  2008-11-16 20:58                                           ` Benjamin Herrenschmidt
  3 siblings, 1 reply; 66+ messages in thread
From: Paul Mackerras @ 2008-11-13 22:19 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Yinghai Lu, mingo, tglx, hpa, linux-kernel, travis

Andrew Morton writes:

> Other architectures want (or have) sparse interrupts.  Are those guys
> paying attention here?

On powerpc we have a mapping from virtual irq numbers (in the range 0
to NR_IRQS-1) to physical irq numbers (which can be anything) and back
again.  I think our approach is simpler than what's being proposed
here, though we don't try to keep the irqdescs node-local as this
patch seems to (fortunately our big systems aren't so NUMA-ish as to
make that necessary).

Paul.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 22:19                                           ` [PATCH] sparse_irq aka dyn_irq v13 Paul Mackerras
@ 2008-11-13 22:23                                             ` David Miller
  2008-11-13 23:11                                               ` Mike Travis
  0 siblings, 1 reply; 66+ messages in thread
From: David Miller @ 2008-11-13 22:23 UTC (permalink / raw)
  To: paulus; +Cc: akpm, yinghai, mingo, tglx, hpa, linux-kernel, travis

From: Paul Mackerras <paulus@samba.org>
Date: Fri, 14 Nov 2008 09:19:13 +1100

> Andrew Morton writes:
> 
> > Other architectures want (or have) sparse interrupts.  Are those guys
> > paying attention here?
> 
> On powerpc we have a mapping from virtual irq numbers (in the range 0
> to NR_IRQS-1) to physical irq numbers (which can be anything) and back
> again.  I think our approach is simpler than what's being proposed
> here, though we don't try to keep the irqdescs node-local as this
> patch seems to (fortunately our big systems aren't so NUMA-ish as to
> make that necessary).

This is exactly what sparc64 does as well, same as powerpc, and
as Paul said it's so much incredibly simpler than the dyn_irq stuff.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 22:13                                             ` Andrew Morton
@ 2008-11-13 22:41                                               ` Yinghai Lu
  2008-11-13 22:58                                                 ` Andrew Morton
  0 siblings, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-13 22:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: mingo, tglx, hpa, linux-kernel, travis

Andrew Morton wrote:

>> ...
>>
>>>> +static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
>>> Do these need to be 32-bit?  Maybe they'll fit in 16-bit, dunno.
>>>
>> struct irq_desc {
>>         unsigned int            irq;
>> #ifdef CONFIG_SPARSE_IRQ
>>         struct list_head        list;
>>         struct list_head        hash_entry;
>>         struct timer_rand_state *timer_rand_state;
>>         unsigned int            *kstat_irqs;
> 
> That doesn't address my question.
> 
> The above array can be very large.  Can we halve its size by using
> 16-bit quantities?  Will this code ever encounter IRQ numbers larger
> than 65536?
> 

NR_CPUS=4096, NR_IRQS_LEGACY=16, and that array will be 256k bytes

later could change that alloc_bootmem, so NR_CPUS will be replaced to nr_cpu_ids

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 22:41                                               ` Yinghai Lu
@ 2008-11-13 22:58                                                 ` Andrew Morton
  2008-11-13 23:15                                                   ` Mike Travis
  2008-11-14  6:29                                                   ` [PATCH] sparse_irq aka dyn_irq v14 Yinghai Lu
  0 siblings, 2 replies; 66+ messages in thread
From: Andrew Morton @ 2008-11-13 22:58 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: mingo, tglx, hpa, linux-kernel, travis

On Thu, 13 Nov 2008 14:41:36 -0800
Yinghai Lu <yinghai@kernel.org> wrote:

> Andrew Morton wrote:
> 
> >> ...
> >>
> >>>> +static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
> >>> Do these need to be 32-bit?  Maybe they'll fit in 16-bit, dunno.
> >>>
> >> struct irq_desc {
> >>         unsigned int            irq;
> >> #ifdef CONFIG_SPARSE_IRQ
> >>         struct list_head        list;
> >>         struct list_head        hash_entry;
> >>         struct timer_rand_state *timer_rand_state;
> >>         unsigned int            *kstat_irqs;
> > 
> > That doesn't address my question.
> > 
> > The above array can be very large.  Can we halve its size by using
> > 16-bit quantities?  Will this code ever encounter IRQ numbers larger
> > than 65536?
> > 
> 
> NR_CPUS=4096, NR_IRQS_LEGACY=16, and that array will be 256k bytes
> 
> later could change that alloc_bootmem, so NR_CPUS will be replaced to nr_cpu_ids

Do the entries in that array need to be 32-bit?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 22:23                                             ` David Miller
@ 2008-11-13 23:11                                               ` Mike Travis
  2008-11-13 23:14                                                 ` David Miller
  0 siblings, 1 reply; 66+ messages in thread
From: Mike Travis @ 2008-11-13 23:11 UTC (permalink / raw)
  To: David Miller; +Cc: paulus, akpm, yinghai, mingo, tglx, hpa, linux-kernel

David Miller wrote:
> From: Paul Mackerras <paulus@samba.org>
> Date: Fri, 14 Nov 2008 09:19:13 +1100
> 
>> Andrew Morton writes:
>>
>>> Other architectures want (or have) sparse interrupts.  Are those guys
>>> paying attention here?
>> On powerpc we have a mapping from virtual irq numbers (in the range 0
>> to NR_IRQS-1) to physical irq numbers (which can be anything) and back
>> again.  I think our approach is simpler than what's being proposed
>> here, though we don't try to keep the irqdescs node-local as this
>> patch seems to (fortunately our big systems aren't so NUMA-ish as to
>> make that necessary).
> 
> This is exactly what sparc64 does as well, same as powerpc, and
> as Paul said it's so much incredibly simpler than the dyn_irq stuff.

One problem is that pre-defining a static NR_IRQ count is almost always
wrong when the NR_CPUS count is large, and should be adjusted as resources
require.

Large UV systems will take a performance hit from off-node accesses
when the CPU count (or more likely the NODE count) reaches some
threshold.  So keeping as much interrupt context close to the
interrupting source is a good thing.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 23:11                                               ` Mike Travis
@ 2008-11-13 23:14                                                 ` David Miller
  2008-11-14  0:15                                                   ` Mike Travis
  0 siblings, 1 reply; 66+ messages in thread
From: David Miller @ 2008-11-13 23:14 UTC (permalink / raw)
  To: travis; +Cc: paulus, akpm, yinghai, mingo, tglx, hpa, linux-kernel

From: Mike Travis <travis@sgi.com>
Date: Thu, 13 Nov 2008 15:11:29 -0800

> David Miller wrote:
> > From: Paul Mackerras <paulus@samba.org>
> > Date: Fri, 14 Nov 2008 09:19:13 +1100
> > 
> >> Andrew Morton writes:
> >>
> >>> Other architectures want (or have) sparse interrupts.  Are those guys
> >>> paying attention here?
> >> On powerpc we have a mapping from virtual irq numbers (in the range 0
> >> to NR_IRQS-1) to physical irq numbers (which can be anything) and back
> >> again.  I think our approach is simpler than what's being proposed
> >> here, though we don't try to keep the irqdescs node-local as this
> >> patch seems to (fortunately our big systems aren't so NUMA-ish as to
> >> make that necessary).
> > 
> > This is exactly what sparc64 does as well, same as powerpc, and
> > as Paul said it's so much incredibly simpler than the dyn_irq stuff.
> 
> One problem is that pre-defining a static NR_IRQ count is almost always
> wrong when the NR_CPUS count is large, and should be adjusted as resources
> require.

We use a value of 256 and I've been booting linux on 128 cpu sparc64
systems with lots of PCI-E host controllers (and others have booted it
on even larger ones).  All of which have several NUMA domains.

It's not an issue.

> Large UV systems will take a performance hit from off-node accesses
> when the CPU count (or more likely the NODE count) reaches some
> threshold.  So keeping as much interrupt context close to the
> interrupting source is a good thing.

Just because the same piece of information is repeated over and
over again doesn't mean it really matters.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 22:58                                                 ` Andrew Morton
@ 2008-11-13 23:15                                                   ` Mike Travis
  2008-11-13 23:24                                                     ` Yinghai Lu
  2008-11-14  6:29                                                   ` [PATCH] sparse_irq aka dyn_irq v14 Yinghai Lu
  1 sibling, 1 reply; 66+ messages in thread
From: Mike Travis @ 2008-11-13 23:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Yinghai Lu, mingo, tglx, hpa, linux-kernel

Andrew Morton wrote:
> On Thu, 13 Nov 2008 14:41:36 -0800
> Yinghai Lu <yinghai@kernel.org> wrote:
> 
>> Andrew Morton wrote:
>>
>>>> ...
>>>>
>>>>>> +static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
>>>>> Do these need to be 32-bit?  Maybe they'll fit in 16-bit, dunno.
>>>>>
>>>> struct irq_desc {
>>>>         unsigned int            irq;
>>>> #ifdef CONFIG_SPARSE_IRQ
>>>>         struct list_head        list;
>>>>         struct list_head        hash_entry;
>>>>         struct timer_rand_state *timer_rand_state;
>>>>         unsigned int            *kstat_irqs;
>>> That doesn't address my question.
>>>
>>> The above array can be very large.  Can we halve its size by using
>>> 16-bit quantities?  Will this code ever encounter IRQ numbers larger
>>> than 65536?
>>>
>> NR_CPUS=4096, NR_IRQS_LEGACY=16, and that array will be 256k bytes
>>
>> later could change that alloc_bootmem, so NR_CPUS will be replaced to nr_cpu_ids
> 
> Do the entries in that array need to be 32-bit?

That's a good question.  If there are 4k or 16k cpus, would the number of
irq's being "kstat'd" need to be > 64k?  (I'm not exactly sure why there
are NR_CPUS * NR_IRQS_LEGACY kstat entries per IRQ?)

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 23:15                                                   ` Mike Travis
@ 2008-11-13 23:24                                                     ` Yinghai Lu
  2008-11-14  0:20                                                       ` Mike Travis
  0 siblings, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-13 23:24 UTC (permalink / raw)
  To: Mike Travis; +Cc: Andrew Morton, mingo, tglx, hpa, linux-kernel

Mike Travis wrote:
> Andrew Morton wrote:
>> On Thu, 13 Nov 2008 14:41:36 -0800
>> Yinghai Lu <yinghai@kernel.org> wrote:
>>
> 
> That's a good question.  If there are 4k or 16k cpus, would the number of
> irq's being "kstat'd" need to be > 64k?  (I'm not exactly sure why there
> are NR_CPUS * NR_IRQS_LEGACY kstat entries per IRQ?)

can be replaced with nr_cpu_ids * NR_IRQS_LEGACY...

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 23:14                                                 ` David Miller
@ 2008-11-14  0:15                                                   ` Mike Travis
  2008-11-14  0:21                                                     ` David Miller
  0 siblings, 1 reply; 66+ messages in thread
From: Mike Travis @ 2008-11-14  0:15 UTC (permalink / raw)
  To: David Miller; +Cc: paulus, akpm, yinghai, mingo, tglx, hpa, linux-kernel

David Miller wrote:
> From: Mike Travis <travis@sgi.com>
> Date: Thu, 13 Nov 2008 15:11:29 -0800
> 
>> David Miller wrote:
>>> From: Paul Mackerras <paulus@samba.org>
>>> Date: Fri, 14 Nov 2008 09:19:13 +1100
>>>
>>>> Andrew Morton writes:
>>>>
>>>>> Other architectures want (or have) sparse interrupts.  Are those guys
>>>>> paying attention here?
>>>> On powerpc we have a mapping from virtual irq numbers (in the range 0
>>>> to NR_IRQS-1) to physical irq numbers (which can be anything) and back
>>>> again.  I think our approach is simpler than what's being proposed
>>>> here, though we don't try to keep the irqdescs node-local as this
>>>> patch seems to (fortunately our big systems aren't so NUMA-ish as to
>>>> make that necessary).
>>> This is exactly what sparc64 does as well, same as powerpc, and
>>> as Paul said it's so much incredibly simpler than the dyn_irq stuff.


>> One problem is that pre-defining a static NR_IRQ count is almost always
>> wrong when the NR_CPUS count is large, and should be adjusted as resources
>> require.
> 
> We use a value of 256 and I've been booting linux on 128 cpu sparc64
> systems with lots of PCI-E host controllers (and others have booted it
> on even larger ones).  All of which have several NUMA domains.
> 
> It's not an issue.

Are you saying that having a fixed count of IRQ's is not an issue?  With
NR_CPUS=4096 what would you fix it to?  (Currently it's NR_CPUS * 32
but that might not be sufficient.)  Would NR_CPUS=16384 make it an issue?

> 
>> Large UV systems will take a performance hit from off-node accesses
>> when the CPU count (or more likely the NODE count) reaches some
>> threshold.  So keeping as much interrupt context close to the
>> interrupting source is a good thing.
> 
> Just because the same piece of information is repeated over and
> over again doesn't mean it really matters.

Which information is repeated over and over?  I was under the
impression that each and every interrupt writes to the irq_desc
entry for that irq?  If this is in a big list on node 0, that is
data passing over the system bus.

Or am I missing what you're getting at?

Thanks,
Mike




^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 23:24                                                     ` Yinghai Lu
@ 2008-11-14  0:20                                                       ` Mike Travis
  2008-11-14  0:29                                                         ` Yinghai Lu
  0 siblings, 1 reply; 66+ messages in thread
From: Mike Travis @ 2008-11-14  0:20 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: Andrew Morton, mingo, tglx, hpa, linux-kernel

Yinghai Lu wrote:
> Mike Travis wrote:
>> Andrew Morton wrote:
>>> On Thu, 13 Nov 2008 14:41:36 -0800
>>> Yinghai Lu <yinghai@kernel.org> wrote:
>>>
>> That's a good question.  If there are 4k or 16k cpus, would the number of
>> irq's being "kstat'd" need to be > 64k?  (I'm not exactly sure why there
>> are NR_CPUS * NR_IRQS_LEGACY kstat entries per IRQ?)
> 
> can be replaced with nr_cpu_ids * NR_IRQS_LEGACY...
> 
> YH

Ok, but why does each irq need a list of all other irq's in the system?
Can they be collapsed into one array or percpu list?

Thanks,
Mike 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-14  0:15                                                   ` Mike Travis
@ 2008-11-14  0:21                                                     ` David Miller
  2008-11-14  0:39                                                       ` Mike Travis
  0 siblings, 1 reply; 66+ messages in thread
From: David Miller @ 2008-11-14  0:21 UTC (permalink / raw)
  To: travis; +Cc: paulus, akpm, yinghai, mingo, tglx, hpa, linux-kernel

From: Mike Travis <travis@sgi.com>
Date: Thu, 13 Nov 2008 16:15:12 -0800

> David Miller wrote:
> > From: Mike Travis <travis@sgi.com>
> > Date: Thu, 13 Nov 2008 15:11:29 -0800
> > 
> > We use a value of 256 and I've been booting linux on 128 cpu sparc64
> > systems with lots of PCI-E host controllers (and others have booted it
> > on even larger ones).  All of which have several NUMA domains.
> > 
> > It's not an issue.
> 
> Are you saying that having a fixed count of IRQ's is not an issue?  With
> NR_CPUS=4096 what would you fix it to?  (Currently it's NR_CPUS * 32
> but that might not be sufficient.)  Would NR_CPUS=16384 make it an issue?

Nope, and nope.  I frequently run kernels with NR_CPUS set to huge
values.

It seems that the issue of x86 is that it has its IRQ count tied to
the number of cpus, that's not very intelligent.  Perhaps that part
should be rearranged somehow?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-14  0:20                                                       ` Mike Travis
@ 2008-11-14  0:29                                                         ` Yinghai Lu
  0 siblings, 0 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-14  0:29 UTC (permalink / raw)
  To: Mike Travis; +Cc: Andrew Morton, mingo, tglx, hpa, linux-kernel

Mike Travis wrote:
> Yinghai Lu wrote:
>> Mike Travis wrote:
>>> Andrew Morton wrote:
>>>> On Thu, 13 Nov 2008 14:41:36 -0800
>>>> Yinghai Lu <yinghai@kernel.org> wrote:
>>>>
>>> That's a good question.  If there are 4k or 16k cpus, would the number of
>>> irq's being "kstat'd" need to be > 64k?  (I'm not exactly sure why there
>>> are NR_CPUS * NR_IRQS_LEGACY kstat entries per IRQ?)
>> can be replaced with nr_cpu_ids * NR_IRQS_LEGACY...
>>
>> YH
> 
> Ok, but why does each irq need a list of all other irq's in the system?
> Can they be collapsed into one array or percpu list?

use a big pointer array like struct irq_desc *desc[1<<32] ? that means 32g bytes... irq = bus/dev/fn/idx

or struct irq_desc *desc[NR_CPUS * NR_VECTORS] or per_cpu etc, it mean 4096*256*8 ... irq = cpu/idx


YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-14  0:21                                                     ` David Miller
@ 2008-11-14  0:39                                                       ` Mike Travis
  2008-11-14  2:37                                                         ` David Miller
  0 siblings, 1 reply; 66+ messages in thread
From: Mike Travis @ 2008-11-14  0:39 UTC (permalink / raw)
  To: David Miller; +Cc: paulus, akpm, yinghai, mingo, tglx, hpa, linux-kernel

David Miller wrote:
> From: Mike Travis <travis@sgi.com>
> Date: Thu, 13 Nov 2008 16:15:12 -0800
> 
>> David Miller wrote:
>>> From: Mike Travis <travis@sgi.com>
>>> Date: Thu, 13 Nov 2008 15:11:29 -0800
>>>
>>> We use a value of 256 and I've been booting linux on 128 cpu sparc64
>>> systems with lots of PCI-E host controllers (and others have booted it
>>> on even larger ones).  All of which have several NUMA domains.
>>>
>>> It's not an issue.
>> Are you saying that having a fixed count of IRQ's is not an issue?  With
>> NR_CPUS=4096 what would you fix it to?  (Currently it's NR_CPUS * 32
>> but that might not be sufficient.)  Would NR_CPUS=16384 make it an issue?
> 
> Nope, and nope.  I frequently run kernels with NR_CPUS set to huge
> values.
> 
> It seems that the issue of x86 is that it has it's IRQ count tied to
> the number of cpus, that's not very intelligent.  Perhaps that part
> should be rearranged somehow?

Yes, you're probably right but it is what it is.  Most of the irq vectors
have more to do with cpus than with i/o devices (the system vectors, ipi,
kdb and gru [a uv thing] interrupt vectors come first to mind.)  These do
by necessity need to grow with NR_CPUS, if you're fixing the total IRQ count.

There's been a couple of different proposals to attempt to disassociate
i/o and system vectors though even attempting to guess at the number of
i/o devices is tricky.  Every one of the 512 nodes on a UV system *may*
have a number of i/o devices attached to them though practically this
will be rare. 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-14  0:39                                                       ` Mike Travis
@ 2008-11-14  2:37                                                         ` David Miller
  2008-11-14  3:06                                                           ` Mike Travis
  0 siblings, 1 reply; 66+ messages in thread
From: David Miller @ 2008-11-14  2:37 UTC (permalink / raw)
  To: travis; +Cc: paulus, akpm, yinghai, mingo, tglx, hpa, linux-kernel

From: Mike Travis <travis@sgi.com>
Date: Thu, 13 Nov 2008 16:39:51 -0800

> There's been a couple of different proposals to attempt to disassociate
> i/o and system vectors though even attempting to guess at the number of
> i/o devices is tricky.  Every one of the 512 nodes on a UV system *may*
> have a number of i/o devices attached to them though practically this
> will be rare. 

Practicality is what really matters.

We can even make the constant number a config option, so that if
someone has a system that actually triggers past this limit even dist
vendors can simply bump the config option value.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-14  2:37                                                         ` David Miller
@ 2008-11-14  3:06                                                           ` Mike Travis
  0 siblings, 0 replies; 66+ messages in thread
From: Mike Travis @ 2008-11-14  3:06 UTC (permalink / raw)
  To: David Miller; +Cc: paulus, akpm, yinghai, mingo, tglx, hpa, linux-kernel

David Miller wrote:

> Practicality is what really matters.
> 
> We can even make the constant number a config option, so that if
> someone has a system that actually triggers past this limit even dist
> vendors can simply bump the config option value.

Unfortunately the reality is that distros spend months certifying the base
kernel for both application and security conformance.  And many (most?) of
the large system customers will only run this kernel.  Even changing root
startup options is considered invalidating the conformance testing.  That
means the kernel has to adapt to the resource needs of the system being
serviced.

Ideally, linux shouldn't have any fixed resources.  A system built to run
on a massively parallel system should also boot and run on a smart phone
(not well perhaps... ;-)

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] sparse_irq aka dyn_irq v14
  2008-11-13 22:58                                                 ` Andrew Morton
  2008-11-13 23:15                                                   ` Mike Travis
@ 2008-11-14  6:29                                                   ` Yinghai Lu
  2008-11-14  6:46                                                     ` Andrew Morton
  1 sibling, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-14  6:29 UTC (permalink / raw)
  To: Andrew Morton, mingo, tglx, hpa; +Cc: linux-kernel, travis

address some of Andrew's concerns.

also, according to Ingo, change _irqx to _irq_desc etc.

Thanks

Yinghai Lu

-----

From: Yinghai Lu <yinghai@kernel.org>
Subject: sparseirq v14

impact: new feature sparseirq

add some kind of hash table as Ingo suggested.
remove dyna_array

when sparse_irq is used (CONFIG_SPARSE_IRQ), use kzalloc_node to get irq_desc, irq_cfg
  use desc->chip_data for x86 to store irq_cfg

if CONFIG_MOVE_IRQ_DESC is set
  make irq_desc to go with affinity aka irq_desc moving etc
  call move_irq_desc in irq_complete_move()
  need to add struct (irq_desc **descp) to ack_edge/level to make sure desc get updated
  try to pass desc and cfg as much as possible to avoid list lookups. 
  legacy irq_desc is not moved, because they are allocated via static array

for logical apic mode, we need to add move_desc_in_progress_in_same_domain; otherwise the irq_desc will not get moved. ==> it may also need two phases to get the irq_desc moved.
	for example: 0xff is the old affinity; set it to 0xf first, and then set it to 0xf0.
	[ or do we need to change the domain definition to cpus on the same node? ]

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so we assume the user-space program should update /proc/irq/XX/smp_affinity to 03 or 0f first at boot,
or should we change irq_default_affinity instead?

for physical apic mode it is much simpler
on 4 sockets 16 cores system
irq_desc is moving..
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig                   |   20 
 arch/x86/include/asm/hpet.h        |    7 
 arch/x86/include/asm/irq_vectors.h |    2 
 arch/x86/kernel/hpet.c             |   23 -
 arch/x86/kernel/i8259.c            |   29 +
 arch/x86/kernel/io_apic.c          |  846 ++++++++++++++++++++++++++-----------
 arch/x86/kernel/irq.c              |   24 -
 arch/x86/kernel/irq_32.c           |    4 
 arch/x86/kernel/irq_64.c           |    8 
 arch/x86/kernel/irqinit_32.c       |    3 
 arch/x86/kernel/irqinit_64.c       |    3 
 arch/x86/kernel/uv_irq.c           |   27 +
 arch/x86/mm/init_32.c              |    3 
 drivers/char/random.c              |   31 +
 drivers/pci/htirq.c                |   44 +
 drivers/pci/intel-iommu.c          |   23 -
 drivers/pci/intr_remapping.c       |   60 ++
 drivers/pci/msi.c                  |   71 ++-
 drivers/xen/events.c               |    9 
 fs/proc/interrupts.c               |   18 
 fs/proc/stat.c                     |   17 
 include/linux/dmar.h               |    7 
 include/linux/htirq.h              |    8 
 include/linux/interrupt.h          |    2 
 include/linux/irq.h                |   95 +++-
 include/linux/irqnr.h              |   15 
 include/linux/kernel_stat.h        |   14 
 include/linux/msi.h                |   10 
 init/main.c                        |   11 
 kernel/irq/autoprobe.c             |   10 
 kernel/irq/chip.c                  |   40 -
 kernel/irq/handle.c                |  369 +++++++++++++++-
 kernel/irq/manage.c                |    6 
 kernel/irq/migration.c             |   34 +
 kernel/irq/proc.c                  |    3 
 kernel/irq/spurious.c              |    4 
 36 files changed, 1527 insertions(+), 373 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -240,6 +240,26 @@ config X86_HAS_BOOT_CPU_ID
 	def_bool y
 	depends on X86_VOYAGER
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on PCI_MSI || HT_IRQ
+	default y
+	help
+	  This enables support for sparse irqs, especially for msi/msi-x. The
+	  irq number will be bus/dev/fn + 12bit. You may need this if you have
+	  lots of cards supporting msi-x installed.
+
+	  If you don't know what to do here, say Y.
+
+config MOVE_IRQ_DESC
+	bool "Move irq desc when changing irq smp_affinity"
+	depends on SPARSE_IRQ && SMP
+	default y
+	help
+	  This enables moving the irq_desc to the cpu/node where the irq will be handled.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -108,94 +108,232 @@ static int __init parse_noapic(char *str
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+	int apic, pin;
+	struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+	struct irq_pin_list *pin;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+	BUG_ON(!pin);
+
+	return pin;
+}
+
 struct irq_cfg {
-	unsigned int irq;
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_MOVE_IRQ_DESC
+	u8 move_desc_in_progress_in_same_domain : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_early_irq_init(void)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int count;
+	int i;
+#ifdef CONFIG_SPARSE_IRQ
+	int count_desc = NR_IRQS_LEGACY;
+#else
+	int count_desc = NR_IRQS;
+#endif
+
+	cfg = irq_cfgx;
+	count = ARRAY_SIZE(irq_cfgx);
+
+	BUG_ON(count > count_desc);
 
+	for (i = 0; i < count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
+
+#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
+
+	return cfg;
+}
+
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+{
+	struct irq_cfg *cfg;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+	BUG_ON(!cfg);
+
+	return cfg;
 }
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
-	return irq_cfg(irq);
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (!cfg)
+		desc->chip_data = get_one_free_irq_cfg(cpu);
 }
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+#ifdef CONFIG_MOVE_IRQ_DESC
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
+{
+	struct irq_pin_list *old_entry, *tail, *entry;
 
-struct irq_pin_list {
-	int apic, pin;
-	struct irq_pin_list *next;
-};
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+	entry = get_one_free_irq_2_pin(cpu);
+	entry->apic = old_entry->apic;
+	entry->pin = old_entry->pin;
+	cfg->irq_2_pin = entry;
+	tail = entry;
+	old_entry = old_entry->next;
 
-static void __init irq_2_pin_init(void)
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		entry->apic = old_entry->apic;
+		entry->pin = old_entry->pin;
+		tail->next = entry;
+		tail = entry;
+		old_entry = old_entry->next;
+	}
+
+	tail->next = NULL;
+}
+
+static void free_irq_2_pin(struct irq_cfg *cfg)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_pin_list *entry, *next;
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	entry = cfg->irq_2_pin;
 
-	irq_2_pin_ptr = &pin[0];
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	cfg->irq_2_pin = NULL;
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	cfg = get_one_free_irq_cfg(cpu);
+	desc->chip_data = cfg;
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *cfg)
+{
+	kfree(cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (cfg) {
+		free_irq_2_pin(cfg);
+		free_irq_cfg(cfg);
+		desc->chip_data = NULL;
+	}
 }
 
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means that domain is not changed */
+		if (!cpus_intersects(desc->affinity, mask))
+			cfg->move_desc_in_progress_in_same_domain = 1;
+	}
+}
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+#ifndef CONFIG_MOVE_IRQ_DESC
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+}
+#endif
+
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -237,11 +375,10 @@ static inline void io_apic_modify(unsign
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	entry = cfg->irq_2_pin;
@@ -323,13 +460,12 @@ static void ioapic_mask_entry(int apic,
 }
 
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
 	int apic, pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
 
-	cfg = irq_cfg(irq);
 	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
@@ -359,24 +495,27 @@ static void __target_IO_APIC_irq(unsigne
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
@@ -384,12 +523,24 @@ static void set_ioapic_affinity_irq(unsi
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
-	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__target_IO_APIC_irq(irq, dest, cfg->vector);
+	__target_IO_APIC_irq(irq, dest, cfg);
 	desc->affinity = mask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define set_ioapic_affinity_irq set_ioapic_affinity_irq_desc
+#else
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	set_ioapic_affinity_irq_desc(desc, mask);
+}
+#endif
 #endif /* CONFIG_SMP */
 
 /*
@@ -397,16 +548,13 @@ static void set_ioapic_affinity_irq(unsi
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,7 +569,7 @@ static void add_pin_to_irq(unsigned int
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
@@ -430,11 +578,10 @@ static void add_pin_to_irq(unsigned int
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -451,18 +598,16 @@ static void __init replace_pin_at_irq(un
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
 				int mask_and, int mask_or,
 				void (*final)(struct irq_pin_list *entry))
 {
 	int pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	cfg = irq_cfg(irq);
 	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
 		unsigned int reg;
 		pin = entry->pin;
@@ -475,9 +620,9 @@ static inline void io_apic_modify_irq(un
 	}
 }
 
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
 #ifdef CONFIG_X86_64
@@ -492,47 +637,69 @@ void io_apic_sync(struct irq_pin_list *e
 	readl(&io_apic->data);
 }
 
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
 			IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
 			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc **descp)
 {
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
+	BUG_ON(!cfg);
+
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__mask_IO_APIC_irq(irq);
+	__mask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc **descp)
 {
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_IO_APIC_irq(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define mask_IO_APIC_irq mask_IO_APIC_irq_desc
+#define unmask_IO_APIC_irq unmask_IO_APIC_irq_desc
+#else
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_IO_APIC_irq_desc(&desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_IO_APIC_irq_desc(&desc);
+}
+#endif
+
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
@@ -809,7 +976,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1034,7 +1201,7 @@ void unlock_vector_lock(void)
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1050,16 +1217,13 @@ static int __assign_irq_vector(int irq,
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
 
-	cfg = irq_cfg(irq);
+	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+		return -EBUSY;
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-		return -EBUSY;
-
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_t tmp;
@@ -1113,24 +1277,22 @@ next:
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1148,14 +1310,16 @@ void __setup_vector_irq(int cpu)
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
+	} end_for_each_irq_desc();
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
@@ -1201,11 +1365,8 @@ static inline int IO_APIC_irq_trigger(in
 }
 #endif
 
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
 {
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1297,7 +1458,7 @@ static int setup_ioapic_entry(int apic,
 	return 0;
 }
 
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
 			      int trigger, int polarity)
 {
 	struct irq_cfg *cfg;
@@ -1307,10 +1468,10 @@ static void setup_IO_APIC_irq(int apic,
 	if (!IO_APIC_IRQ(irq))
 		return;
 
-	cfg = irq_cfg(irq);
+	cfg = desc->chip_data;
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1327,12 +1488,12 @@ static void setup_IO_APIC_irq(int apic,
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
-	ioapic_register_intr(irq, trigger);
-	if (irq < 16)
+	ioapic_register_intr(irq, desc, trigger);
+	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic, pin, entry);
@@ -1342,6 +1503,9 @@ static void __init setup_IO_APIC_irqs(vo
 {
 	int apic, pin, idx, irq;
 	int notcon = 0;
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int cpu = boot_cpu_id;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -1373,9 +1537,11 @@ static void __init setup_IO_APIC_irqs(vo
 			if (multi_timer_check(apic, irq))
 				continue;
 #endif
-			add_pin_to_irq(irq, apic, pin);
+			desc = irq_to_desc_alloc_cpu(irq, cpu);
+			cfg = desc->chip_data;
+			add_pin_to_irq_cpu(cfg, cpu, apic, pin);
 
-			setup_IO_APIC_irq(apic, pin, irq,
+			setup_IO_APIC_irq(apic, pin, irq, desc,
 					irq_trigger(idx), irq_polarity(idx));
 		}
 	}
@@ -1434,6 +1600,7 @@ __apicdebuginit(void) print_IO_APIC(void
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1523,8 +1690,10 @@ __apicdebuginit(void) print_IO_APIC(void
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1535,7 +1704,7 @@ __apicdebuginit(void) print_IO_APIC(void
 			entry = entry->next;
 		}
 		printk("\n");
-	}
+	} end_for_each_irq_desc();
 
 	printk(KERN_INFO ".................................... done.\n");
 
@@ -2008,14 +2177,16 @@ static unsigned int startup_ioapic_irq(u
 {
 	int was_pending = 0;
 	unsigned long flags;
+	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_IO_APIC_irq(irq);
+	cfg = irq_cfg(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
@@ -2078,35 +2249,37 @@ static DECLARE_DELAYED_WORK(ir_migration
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
 	int modify_ioapic_rte;
 	unsigned int dest;
 	unsigned long flags;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
+	irq = desc->irq;
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	desc = irq_to_desc(irq);
 	modify_ioapic_rte = desc->status & IRQ_LEVEL;
 	if (modify_ioapic_rte) {
 		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		__target_IO_APIC_irq(irq, dest, cfg);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -2128,14 +2301,14 @@ static void migrate_ioapic_irq(int irq,
 	desc->affinity = mask;
 }
 
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 {
 	int ret = -1;
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
 
-	mask_IO_APIC_irq(irq);
+	mask_IO_APIC_irq_desc(&desc);
 
-	if (io_apic_level_ack_pending(irq)) {
+	if (io_apic_level_ack_pending(cfg)) {
 		/*
 		 * Interrupt in progress. Migrating irq now will change the
 		 * vector information in the IO-APIC RTE and that will confuse
@@ -2147,14 +2320,15 @@ static int migrate_irq_remapped_level(in
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq(irq, desc->pending_mask);
+	migrate_ioapic_irq_desc(desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
 	cpus_clear(desc->pending_mask);
 
 unmask:
-	unmask_IO_APIC_irq(irq);
+	unmask_IO_APIC_irq_desc(&desc);
+
 	return ret;
 }
 
@@ -2175,29 +2349,37 @@ static void ir_irq_migration(struct work
 				continue;
 			}
 
-			desc->chip->set_affinity(irq, desc->pending_mask);
+			desc_chip_set_affinity(irq, desc, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
 		desc->pending_mask = mask;
-		migrate_irq_remapped_level(irq);
+		migrate_irq_remapped_level_desc(desc);
 		return;
 	}
 
-	migrate_ioapic_irq(irq, mask);
+	migrate_ioapic_irq_desc(desc, mask);
+}
+#ifdef CONFIG_SPARSE_IRQ
+#define set_ir_ioapic_affinity_irq set_ir_ioapic_affinity_irq_desc
+#else
+static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif
+#endif
 
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
 {
@@ -2236,19 +2418,40 @@ unlock:
 	irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = *descp;
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_MOVE_IRQ_DESC
+		if (likely(!cfg->move_desc_in_progress_in_same_domain))
+			return;
+
+		/* domain is not change, but affinity is changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_in_progress_in_same_domain = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
@@ -2256,9 +2459,24 @@ static void irq_complete_move(unsigned i
 	}
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
+
 #ifdef CONFIG_INTR_REMAP
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_x2apic_level_desc(struct irq_desc **descp)
+{
+	ack_x2APIC_irq();
+}
+
+static void ack_x2apic_edge_desc(struct irq_desc **descp)
+{
+	ack_x2APIC_irq();
+}
+
+#define ack_x2apic_level ack_x2apic_level_desc
+#define ack_x2apic_edge ack_x2apic_edge_desc
+#else
 static void ack_x2apic_level(unsigned int irq)
 {
 	ack_x2APIC_irq();
@@ -2270,29 +2488,34 @@ static void ack_x2apic_edge(unsigned int
 }
 #endif
 
-static void ack_apic_edge(unsigned int irq)
+#endif
+
+static void ack_apic_edge_desc(struct irq_desc **descp)
 {
-	irq_complete_move(irq);
-	move_native_irq(irq);
+	irq_complete_move(descp);
+#ifdef CONFIG_SMP
+	move_native_irq_desc(descp);
+#endif
 	ack_APIC_irq();
 }
 
 atomic_t irq_mis_count;
 
-static void ack_apic_level(unsigned int irq)
+static void ack_apic_level_desc(struct irq_desc **descp)
 {
 #ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
 #endif
+	struct irq_cfg *cfg;
 	int do_unmask_irq = 0;
 
-	irq_complete_move(irq);
+	irq_complete_move(descp);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely((*descp)->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
-		mask_IO_APIC_irq(irq);
+		mask_IO_APIC_irq_desc(descp);
 	}
 #endif
 
@@ -2316,7 +2539,8 @@ static void ack_apic_level(unsigned int
 	* operation to prevent an edge-triggered interrupt escaping meanwhile.
 	* The idea is from Manfred Spraul.  --macro
 	*/
-	i = irq_cfg(irq)->vector;
+	cfg = (*descp)->chip_data;
+	i = cfg->vector;
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2355,22 +2579,44 @@ static void ack_apic_level(unsigned int
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(irq))
-			move_masked_irq(irq);
-		unmask_IO_APIC_irq(irq);
+		cfg = (*descp)->chip_data;
+		if (!io_apic_level_ack_pending(cfg)) {
+# ifdef CONFIG_SMP
+			move_masked_irq_desc(descp);
+# endif
+		}
+		unmask_IO_APIC_irq_desc(descp);
 	}
 
 #ifdef CONFIG_X86_32
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		__mask_and_edge_IO_APIC_irq(cfg);
+		__unmask_and_level_IO_APIC_irq(cfg);
 		spin_unlock(&ioapic_lock);
 	}
 #endif
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_apic_edge ack_apic_edge_desc
+#define ack_apic_level ack_apic_level_desc
+#else
+static void ack_apic_edge(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_apic_edge_desc(&desc);
+}
+static void ack_apic_level(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_apic_level_desc(&desc);
+}
+#endif
+
 static struct irq_chip ioapic_chip __read_mostly = {
 	.name		= "IO-APIC",
 	.startup	= startup_ioapic_irq,
@@ -2416,29 +2662,28 @@ static inline void init_IO_APIC_traps(vo
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < 16)
+			if (irq < NR_IRQS_LEGACY)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
  * The local APIC irq-chip implementation:
  */
 
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irq_desc(struct irq_desc **descp)
 {
 	unsigned long v;
 
@@ -2446,7 +2691,7 @@ static void mask_lapic_irq(unsigned int
 	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq_desc(struct irq_desc **descp)
 {
 	unsigned long v;
 
@@ -2454,11 +2699,36 @@ static void unmask_lapic_irq(unsigned in
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq_desc(struct irq_desc **descp)
 {
 	ack_APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define mask_lapic_irq mask_lapic_irq_desc
+#define unmask_lapic_irq unmask_lapic_irq_desc
+#define ack_lapic_irq ack_lapic_irq_desc
+#else
+static void mask_lapic_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_lapic_irq_desc(&desc);
+}
+static void unmask_lapic_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_lapic_irq_desc(&desc);
+}
+static void ack_lapic_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_lapic_irq_desc(&desc);
+}
+#endif
+
 static struct irq_chip lapic_chip __read_mostly = {
 	.name		= "local-APIC",
 	.mask		= mask_lapic_irq,
@@ -2466,11 +2736,8 @@ static struct irq_chip lapic_chip __read
 	.ack		= ack_lapic_irq,
 };
 
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
 {
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
 	desc->status &= ~IRQ_LEVEL;
 	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
 				      "edge");
@@ -2574,7 +2841,9 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_desc *desc = irq_to_desc(0);
+	struct irq_cfg *cfg = desc->chip_data;
+	int cpu = boot_cpu_id;
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2589,7 +2858,7 @@ static inline void __init check_timer(vo
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2640,10 +2909,10 @@ static inline void __init check_timer(vo
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irq_desc(&desc);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
@@ -2669,9 +2938,9 @@ static inline void __init check_timer(vo
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irq_desc(&desc);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2703,7 +2972,7 @@ static inline void __init check_timer(vo
 	apic_printk(APIC_QUIET, KERN_INFO
 		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
-	lapic_register_intr(0);
+	lapic_register_intr(0, desc);
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
@@ -2888,22 +3157,26 @@ unsigned int create_irq_nr(unsigned int
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
+	struct irq_cfg *cfg_new = NULL;
+	int cpu = boot_cpu_id;
+	struct irq_desc *desc_new = NULL;
 
+#ifndef CONFIG_SPARSE_IRQ
 	irq_want = nr_irqs - 1;
+#endif
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -2911,6 +3184,9 @@ unsigned int create_irq_nr(unsigned int
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init cleared it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
@@ -2930,14 +3206,22 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* store it, in case dynamic_irq_cleanup clears it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2952,12 +3236,12 @@ static int msi_compose_msg(struct pci_de
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3013,61 +3297,75 @@ static int msi_compose_msg(struct pci_de
 }
 
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq = desc->irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	read_msi_msg(irq, &msg);
+	read_msi_msg_desc(desc, &msg);
 
 	msg.data &= ~MSI_DATA_VECTOR_MASK;
 	msg.data |= MSI_DATA_VECTOR(cfg->vector);
 	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
-	write_msi_msg(irq, &msg);
-	desc = irq_to_desc(irq);
+	write_msi_msg_desc(desc, &msg);
 	desc->affinity = mask;
 }
+#ifdef CONFIG_SPARSE_IRQ
+#define set_msi_irq_affinity set_msi_irq_affinity_desc
+#else
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
 
+	set_msi_irq_affinity_desc(desc, mask);
+}
+#endif
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
  * done in the process context using interrupt-remapping hardware.
  */
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void ir_set_msi_irq_affinity_desc(struct irq_desc *desc,
+					 cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
-	struct irq_desc *desc;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
+	irq = desc->irq;
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3091,9 +3389,20 @@ static void ir_set_msi_irq_affinity(unsi
 		cfg->move_in_progress = 0;
 	}
 
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define ir_set_msi_irq_affinity ir_set_msi_irq_affinity_desc
+#else
+static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ir_set_msi_irq_affinity_desc(desc, mask);
+}
+#endif
+
 #endif
 #endif /* CONFIG_SMP */
 
@@ -3152,7 +3461,7 @@ static int msi_alloc_irte(struct pci_dev
 }
 #endif
 
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
 	int ret;
 	struct msi_msg msg;
@@ -3161,7 +3470,7 @@ static int setup_msi_irq(struct pci_dev
 	if (ret < 0)
 		return ret;
 
-	set_irq_msi(irq, desc);
+	set_irq_msi(irq, msidesc);
 	write_msi_msg(irq, &msg);
 
 #ifdef CONFIG_INTR_REMAP
@@ -3176,7 +3485,7 @@ static int setup_msi_irq(struct pci_dev
 #endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
-	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for MSI/MSI-X\n", irq, irq);
 
 	return 0;
 }
@@ -3185,6 +3494,7 @@ static unsigned int build_irq_for_pci_de
 {
 	unsigned int irq;
 
+	/* use 8bits (bus) + 8bits (devfn) + 12 bits */
 	irq = dev->bus->number;
 	irq <<= 8;
 	irq |= dev->devfn;
@@ -3193,13 +3503,13 @@ static unsigned int build_irq_for_pci_de
 	return irq;
 }
 
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
 {
 	unsigned int irq;
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
@@ -3214,7 +3524,7 @@ int arch_setup_msi_irq(struct pci_dev *d
 		goto error;
 no_ir:
 #endif
-	ret = setup_msi_irq(dev, desc, irq);
+	ret = setup_msi_irq(dev, msidesc, irq);
 	if (ret < 0) {
 		destroy_irq(irq);
 		return ret;
@@ -3232,7 +3542,7 @@ int arch_setup_msi_irqs(struct pci_dev *
 {
 	unsigned int irq;
 	int ret, sub_handle;
-	struct msi_desc *desc;
+	struct msi_desc *msidesc;
 	unsigned int irq_want;
 
 #ifdef CONFIG_INTR_REMAP
@@ -3240,9 +3550,10 @@ int arch_setup_msi_irqs(struct pci_dev *
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	/* count down from the top (0xfff) of the 12-bit range */
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 	sub_handle = 0;
-	list_for_each_entry(desc, &dev->msi_list, list) {
+	list_for_each_entry(msidesc, &dev->msi_list, list) {
 		irq = create_irq_nr(irq_want--);
 		if (irq == 0)
 			return -1;
@@ -3275,7 +3586,7 @@ int arch_setup_msi_irqs(struct pci_dev *
 		}
 no_ir:
 #endif
-		ret = setup_msi_irq(dev, desc, irq);
+		ret = setup_msi_irq(dev, msidesc, irq);
 		if (ret < 0)
 			goto error;
 		sub_handle++;
@@ -3294,22 +3605,25 @@ void arch_teardown_msi_irq(unsigned int
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3321,9 +3635,20 @@ static void dmar_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define dmar_msi_set_affinity dmar_msi_set_affinity_desc
+#else
+static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_set_affinity_desc(desc, mask);
+}
+#endif
+
 #endif /* CONFIG_SMP */
 
 struct irq_chip dmar_msi_type = {
@@ -3355,22 +3680,25 @@ int arch_setup_dmar_msi(unsigned int irq
 #ifdef CONFIG_HPET_TIMER
 
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3382,9 +3710,19 @@ static void hpet_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define hpet_msi_set_affinity hpet_msi_set_affinity_desc
+#else
+static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_set_affinity_desc(desc, mask);
+}
+#endif
 #endif /* CONFIG_SMP */
 
 struct irq_chip hpet_msi_type = {
@@ -3437,28 +3775,40 @@ static void target_ht_irq(unsigned int i
 	write_ht_irq_msg(irq, &msg);
 }
 
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq = desc->irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define set_ht_irq_affinity set_ht_irq_affinity_desc
+#else
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	set_ht_irq_affinity_desc(desc, mask);
+}
+#endif
 #endif
 
 static struct irq_chip ht_irq_chip = {
@@ -3478,13 +3828,13 @@ int arch_setup_ht_irq(unsigned int irq,
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3508,7 +3858,8 @@ int arch_setup_ht_irq(unsigned int irq,
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
 
-		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+		dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for HT\n",
+				 irq, irq);
 	}
 	return err;
 }
@@ -3530,7 +3881,9 @@ int arch_enable_uv_irq(char *irq_name, u
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3539,8 +3892,6 @@ int arch_enable_uv_irq(char *irq_name, u
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3594,6 +3945,7 @@ int __init io_apic_get_redir_entries (in
 
 int __init probe_nr_irqs(void)
 {
+#ifdef CONFIG_SPARSE_IRQ
 	int idx;
 	int nr = 0;
 #ifndef CONFIG_XEN
@@ -3611,10 +3963,11 @@ int __init probe_nr_irqs(void)
 	/* something wrong ? */
 	if (nr < nr_min)
 		nr = nr_min;
-	if (WARN_ON(nr > NR_IRQS))
-		nr = NR_IRQS;
 
 	return nr;
+#else
+	return NR_IRQS;
+#endif
 }
 
 /* --------------------------------------------------------------------------
@@ -3713,19 +4066,27 @@ int __init io_apic_get_version(int ioapi
 
 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
 {
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int cpu = boot_cpu_id;
+
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
 			ioapic);
 		return -EINVAL;
 	}
 
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
-		add_pin_to_irq(irq, ioapic, pin);
+	if (irq >= NR_IRQS_LEGACY) {
+		cfg = desc->chip_data;
+		add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+	}
 
-	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
 
 	return 0;
 }
@@ -3779,9 +4140,10 @@ void __init setup_ioapic_dest(void)
 			 * when you have too many devices, because at that time only boot
 			 * cpu is online.
 			 */
-			cfg = irq_cfg(irq);
+			desc = irq_to_desc(irq);
+			cfg = desc->chip_data;
 			if (!cfg->vector) {
-				setup_IO_APIC_irq(ioapic, pin, irq,
+				setup_IO_APIC_irq(ioapic, pin, irq, desc,
 						  irq_trigger(irq_entry),
 						  irq_polarity(irq_entry));
 				continue;
@@ -3791,7 +4153,6 @@ void __init setup_ioapic_dest(void)
 			/*
 			 * Honour affinities which have been set in early boot
 			 */
-			desc = irq_to_desc(irq);
 			if (desc->status &
 			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
 				mask = desc->affinity;
@@ -3800,10 +4161,10 @@ void __init setup_ioapic_dest(void)
 
 #ifdef CONFIG_INTR_REMAP
 			if (intr_remapping_enabled)
-				set_ir_ioapic_affinity_irq(irq, mask);
+				set_ir_ioapic_affinity_irq_desc(desc, mask);
 			else
 #endif
-				set_ioapic_affinity_irq(irq, mask);
+				set_ioapic_affinity_irq_desc(desc, mask);
 		}
 
 	}
@@ -3852,7 +4213,6 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
Index: linux-2.6/arch/x86/kernel/irqinit_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_32.c
+++ linux-2.6/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_64.c
+++ linux-2.6/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c
+++ linux-2.6/arch/x86/mm/init_32.c
@@ -66,6 +66,7 @@ static unsigned long __meminitdata table
 static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
+int after_bootmem;
 
 static __init void *alloc_low_page(unsigned long *phys)
 {
@@ -987,6 +988,8 @@ void __init mem_init(void)
 
 	set_highmem_pages_init();
 
+	after_bootmem = 1;
+
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
Index: linux-2.6/drivers/char/random.c
===================================================================
--- linux-2.6.orig/drivers/char/random.c
+++ linux-2.6/drivers/char/random.c
@@ -558,6 +558,8 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
+#ifndef CONFIG_SPARSE_IRQ
+
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 
 static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
@@ -576,6 +578,33 @@ static void set_timer_rand_state(unsigne
 	irq_timer_state[irq] = state;
 }
 
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
 static struct timer_rand_state input_timer_state;
 
 /*
@@ -933,8 +962,10 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
+#ifndef CONFIG_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
+#endif
 
 	state = get_timer_rand_state(irq);
 
Index: linux-2.6/drivers/pci/htirq.c
===================================================================
--- linux-2.6.orig/drivers/pci/htirq.c
+++ linux-2.6/drivers/pci/htirq.c
@@ -58,30 +58,62 @@ void fetch_ht_irq_msg(unsigned int irq,
 	*msg = cfg->msg;
 }
 
-void mask_ht_irq(unsigned int irq)
+void mask_ht_irq_desc(struct irq_desc **descp)
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
+	unsigned int irq = (*descp)->irq;
 
-	cfg = get_irq_data(irq);
+	cfg = get_irq_desc_data(*descp);
 
 	msg = cfg->msg;
 	msg.address_lo |= 1;
 	write_ht_irq_msg(irq, &msg);
 }
 
-void unmask_ht_irq(unsigned int irq)
+void unmask_ht_irq_desc(struct irq_desc **descp)
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
+	unsigned int irq = (*descp)->irq;
 
-	cfg = get_irq_data(irq);
+	cfg = get_irq_desc_data(*descp);
 
 	msg = cfg->msg;
 	msg.address_lo &= ~1;
 	write_ht_irq_msg(irq, &msg);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void mask_ht_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_ht_irq_desc(&desc);
+}
+void unmask_ht_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_ht_irq_desc(&desc);
+}
+
+#else
+
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	/* use 8bits (bus) + 8bits (devfn) + 12 bits */
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+#endif
+
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -125,7 +157,11 @@ int __ht_create_irq(struct pci_dev *dev,
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
+#ifdef CONFIG_SPARSE_IRQ
+	irq = create_irq_nr(idx + build_irq_for_pci_dev(dev));
+#else
 	irq = create_irq();
+#endif
 
 	if (irq <= 0) {
 		kfree(cfg);
Index: linux-2.6/drivers/pci/intr_remapping.c
===================================================================
--- linux-2.6.orig/drivers/pci/intr_remapping.c
+++ linux-2.6/drivers/pci/intr_remapping.c
@@ -19,17 +19,71 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+	struct irq_2_iommu *iommu;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+	return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (WARN_ON_ONCE(!desc))
+		return NULL;
+
+	return desc->irq_2_iommu;
 }
 
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	/*
+	 * alloc irq desc if not allocated already.
+	 */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +140,11 @@ int alloc_irte(struct intel_iommu *iommu
 	if (!count)
 		return -1;
 
+#ifndef CONFIG_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
+#endif
 
 	/*
 	 * start the IRTE search from index 0.
Index: linux-2.6/drivers/xen/events.c
===================================================================
--- linux-2.6.orig/drivers/xen/events.c
+++ linux-2.6/drivers/xen/events.c
@@ -141,8 +141,9 @@ static void init_evtchn_cpu_bindings(voi
 	int i;
 
 	/* By default all event channels notify CPU#0. */
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		desc->affinity = cpumask_of_cpu(0);
+	} end_for_each_irq_desc();
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +232,7 @@ static int find_unbound_irq(void)
 	int irq;
 
 	/* Only allocate from dynirq range */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		if (irq_bindcount[irq] == 0)
 			break;
 
@@ -792,7 +793,7 @@ void xen_irq_resume(void)
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +825,7 @@ void __init xen_init_IRQ(void)
 		mask_evtchn(i);
 
 	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-	for_each_irq_nr(i)
+	for (i = 0; i < nr_irqs; i++)
 		irq_bindcount[i] = 0;
 
 	irq_ctx_init(smp_processor_id());
Index: linux-2.6/fs/proc/stat.c
===================================================================
--- linux-2.6.orig/fs/proc/stat.c
+++ linux-2.6/fs/proc/stat.c
@@ -27,6 +27,9 @@ static int show_stat(struct seq_file *p,
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -44,10 +47,9 @@ static int show_stat(struct seq_file *p,
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_desc(j, desc) {
 			sum += kstat_irqs_cpu(j, i);
-
+		} end_for_each_irq_desc();
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -90,14 +92,17 @@ static int show_stat(struct seq_file *p,
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
+#ifdef CONFIG_SPARSE_IRQ
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
+#else
 		seq_printf(p, " %u", per_irq_sum);
-	}
+#endif
+	} end_for_each_irq_desc();
 
 	seq_printf(p,
 		"\nctxt %llu\n"
Index: linux-2.6/fs/proc/interrupts.c
===================================================================
--- linux-2.6.orig/fs/proc/interrupts.c
+++ linux-2.6/fs/proc/interrupts.c
@@ -8,6 +8,23 @@
 /*
  * /proc/interrupts
  */
+#ifdef CONFIG_SPARSE_IRQ
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+	rcu_read_lock();
+	return seq_list_start(&sparse_irqs_head, *pos);
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &sparse_irqs_head, pos);
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+	rcu_read_unlock();
+}
+#else
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
 	return (*pos <= nr_irqs) ? pos : NULL;
@@ -25,6 +42,7 @@ static void int_seq_stop(struct seq_file
 {
 	/* Nothing to do */
 }
+#endif
 
 static const struct seq_operations int_seq_ops = {
 	.start = int_seq_start,
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h
+++ linux-2.6/include/linux/interrupt.h
@@ -18,6 +18,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -106,14 +106,23 @@ struct irq_chip {
 	void		(*enable)(unsigned int irq);
 	void		(*disable)(unsigned int irq);
 
+#ifdef CONFIG_SPARSE_IRQ
+	void		(*ack)(struct irq_desc **descp);
+	void		(*mask)(struct irq_desc **descp);
+	void		(*mask_ack)(struct irq_desc **descp);
+	void		(*unmask)(struct irq_desc **descp);
+	void		(*eoi)(struct irq_desc **descp);
+	void		(*set_affinity)(struct irq_desc *desc, cpumask_t dest);
+#else
 	void		(*ack)(unsigned int irq);
 	void		(*mask)(unsigned int irq);
 	void		(*mask_ack)(unsigned int irq);
 	void		(*unmask)(unsigned int irq);
 	void		(*eoi)(unsigned int irq);
+	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
+#endif
 
 	void		(*end)(unsigned int irq);
-	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
 	int		(*retrigger)(unsigned int irq);
 	int		(*set_type)(unsigned int irq, unsigned int flow_type);
 	int		(*set_wake)(unsigned int irq, unsigned int on);
@@ -129,6 +138,8 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +166,15 @@ struct irq_chip {
  */
 struct irq_desc {
 	unsigned int		irq;
+#ifdef CONFIG_SPARSE_IRQ
+	struct list_head	list;
+	struct list_head	hash_entry;
+	struct timer_rand_state *timer_rand_state;
+	unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+	struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -182,13 +202,69 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern void arch_early_irq_init(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+					struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *desc);
 
+#ifndef CONFIG_SPARSE_IRQ
+
+/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
-{
-	return (irq < nr_irqs) ? irq_desc + irq : NULL;
-}
+#ifdef CONFIG_GENERIC_HARDIRQS
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
+
+#define end_for_each_irq_desc()
+#endif
+
+#define desc_chip_ack(irq, descp) ((*(descp))->chip->ack(irq))
+#define desc_chip_mask(irq, descp) ((*(descp))->chip->mask(irq))
+#define desc_chip_mask_ack(irq, descp) ((*(descp))->chip->mask_ack(irq))
+#define desc_chip_unmask(irq, descp) ((*(descp))->chip->unmask(irq))
+#define desc_chip_eoi(irq, descp) ((*(descp))->chip->eoi(irq))
+#define desc_chip_set_affinity(irq, descx, mask) ((descx)->chip->set_affinity(irq, mask))
+
+#else
+
+void early_irq_init(void);
+extern struct list_head sparse_irqs_head;
+#define for_each_irq_desc(irqX, desc)					\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.next), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.next), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.next), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define for_each_irq_desc_reverse(irqX, desc)				\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.prev), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.prev), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.prev), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define end_for_each_irq_desc() rcu_read_unlock()
+
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()]++)
+
+#define desc_chip_ack(irq, descp) ((*(descp))->chip->ack(descp))
+#define desc_chip_mask(irq, descp) ((*(descp))->chip->mask(descp))
+#define desc_chip_mask_ack(irq, descp) ((*(descp))->chip->mask_ack(descp))
+#define desc_chip_unmask(irq, descp) ((*(descp))->chip->unmask(descp))
+#define desc_chip_eoi(irq, descp) ((*(descp))->chip->eoi(descp))
+#define desc_chip_set_affinity(irq, descx, mask) ((descx)->chip->set_affinity((descx), mask))
+
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
@@ -211,8 +287,12 @@ extern int setup_irq(unsigned int irq, s
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 
+void move_native_irq_desc(struct irq_desc **descp);
+void move_masked_irq_desc(struct irq_desc **descp);
+#ifndef CONFIG_SPARSE_IRQ
 void move_native_irq(int irq);
 void move_masked_irq(int irq);
+#endif
 
 #else /* CONFIG_GENERIC_PENDING_IRQ */
 
@@ -381,6 +461,11 @@ extern int set_irq_msi(unsigned int irq,
 #define get_irq_data(irq)	(irq_to_desc(irq)->handler_data)
 #define get_irq_msi(irq)	(irq_to_desc(irq)->msi_desc)
 
+#define get_irq_desc_chip(desc)		((desc)->chip)
+#define get_irq_desc_chip_data(desc)	((desc)->chip_data)
+#define get_irq_desc_data(desc)		((desc)->handler_data)
+#define get_irq_desc_msi(desc)		((desc)->msi_desc)
+
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
 #endif /* !CONFIG_S390 */
Index: linux-2.6/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.orig/include/linux/kernel_stat.h
+++ linux-2.6/include/linux/kernel_stat.h
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, ksta
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_
 {
 	kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
Index: linux-2.6/kernel/irq/autoprobe.c
===================================================================
--- linux-2.6.orig/kernel/irq/autoprobe.c
+++ linux-2.6/kernel/irq/autoprobe.c
@@ -57,7 +57,7 @@ unsigned long probe_irq_on(void)
 			desc->chip->startup(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/* Wait for longstanding interrupts to trigger. */
 	msleep(20);
@@ -75,7 +75,7 @@ unsigned long probe_irq_on(void)
 				desc->status |= IRQ_PENDING;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/*
 	 * Wait for spurious interrupts to trigger
@@ -99,7 +99,7 @@ unsigned long probe_irq_on(void)
 					mask |= 1 << i;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	return mask;
 }
@@ -135,7 +135,7 @@ unsigned int probe_irq_mask(unsigned lon
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	return mask & val;
@@ -179,7 +179,7 @@ int probe_irq_off(unsigned long val)
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	if (nr_of_irqs > 1)
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -24,9 +24,11 @@
  */
 void dynamic_irq_init(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	/* first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
@@ -223,7 +225,7 @@ static void default_enable(unsigned int
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	desc->chip->unmask(irq);
+	desc_chip_unmask(irq, &desc);
 	desc->status &= ~IRQ_MASKED;
 }
 
@@ -252,7 +254,7 @@ static void default_shutdown(unsigned in
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	desc->chip->mask(irq);
+	desc_chip_mask(irq, &desc);
 	desc->status |= IRQ_MASKED;
 }
 
@@ -282,13 +284,15 @@ void irq_chip_set_defaults(struct irq_ch
 		chip->end = dummy_irq_chip.end;
 }
 
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc **descp, int irq)
 {
+	struct irq_desc *desc = *descp;
+
 	if (desc->chip->mask_ack)
-		desc->chip->mask_ack(irq);
+		desc_chip_mask_ack(irq, descp);
 	else {
-		desc->chip->mask(irq);
-		desc->chip->ack(irq);
+		desc_chip_mask(irq, descp);
+		desc_chip_ack(irq, descp);
 	}
 }
 
@@ -351,7 +355,7 @@ handle_level_irq(unsigned int irq, struc
 	irqreturn_t action_ret;
 
 	spin_lock(&desc->lock);
-	mask_ack_irq(desc, irq);
+	mask_ack_irq(&desc, irq);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -376,7 +380,7 @@ handle_level_irq(unsigned int irq, struc
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
-		desc->chip->unmask(irq);
+		desc_chip_unmask(irq, &desc);
 out_unlock:
 	spin_unlock(&desc->lock);
 }
@@ -413,7 +417,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
 		desc->status |= IRQ_PENDING;
 		if (desc->chip->mask)
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 		goto out;
 	}
 
@@ -428,7 +432,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 out:
-	desc->chip->eoi(irq);
+	desc_chip_eoi(irq, &desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -464,13 +468,13 @@ handle_edge_irq(unsigned int irq, struct
 	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
-		mask_ack_irq(desc, irq);
+		mask_ack_irq(&desc, irq);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
-	desc->chip->ack(irq);
+	desc_chip_ack(irq, &desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -480,7 +484,7 @@ handle_edge_irq(unsigned int irq, struct
 		irqreturn_t action_ret;
 
 		if (unlikely(!action)) {
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 			goto out_unlock;
 		}
 
@@ -492,7 +496,7 @@ handle_edge_irq(unsigned int irq, struct
 		if (unlikely((desc->status &
 			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
 			      (IRQ_PENDING | IRQ_MASKED))) {
-			desc->chip->unmask(irq);
+			desc_chip_unmask(irq, &desc);
 			desc->status &= ~IRQ_MASKED;
 		}
 
@@ -525,14 +529,14 @@ handle_percpu_irq(unsigned int irq, stru
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	if (desc->chip->ack)
-		desc->chip->ack(irq);
+		desc_chip_ack(irq, &desc);
 
 	action_ret = handle_IRQ_event(irq, desc->action);
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
 	if (desc->chip->eoi)
-		desc->chip->eoi(irq);
+		desc_chip_eoi(irq, &desc);
 }
 
 void
@@ -568,7 +572,7 @@ __set_irq_handler(unsigned int irq, irq_
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
 		if (desc->chip != &no_irq_chip)
-			mask_ack_irq(desc, irq);
+			mask_ack_irq(&desc, irq);
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -15,9 +15,16 @@
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,294 @@ void handle_bad_irq(unsigned int irq, st
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+void __init __attribute__((weak)) arch_early_irq_init(void)
+{
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+	.irq	    = -1,
+	.status	    = IRQ_DISABLED,
+	.chip	    = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth      = 1,
+	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity   = CPU_MASK_ALL
+#endif
+};
+
+static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+	unsigned long bytes;
+	char *ptr;
+	int node;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	node = cpu_to_node(cpu);
+	ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+	BUG_ON(!ptr);
+
+	desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+}
+
+static void free_kstat_irqs(struct irq_desc *desc)
+{
+	kfree(desc->kstat_irqs);
+	desc->kstat_irqs = NULL;
+}
+#endif
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+	desc->irq = irq;
+#ifdef CONFIG_SMP
+	desc->cpu = cpu;
+#endif
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	arch_init_chip_data(desc, cpu);
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *desc)
+{
+	free_kstat_irqs(desc);
+	arch_free_chip_data(desc);
+}
+#endif
+/*
+ * Protect modifications of the sparse irq hash-table and
+ * global list (there is no freelist; descs are never freed here):
+static DEFINE_SPINLOCK(sparse_irq_lock);
+LIST_HEAD(sparse_irqs_head);
+
+/*
+ * The sparse irqs are in a hash-table as well, for fast lookup:
+ */
+#define SPARSEIRQHASH_BITS          (13 - 1)
+#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
+#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
+#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))
+
+static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
+
+static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
+	[0 ... NR_IRQS_LEGACY-1] = {
+		.irq	    = -1,
+		.status	    = IRQ_DISABLED,
+		.chip	    = &no_irq_chip,
+		.handle_irq = handle_bad_irq,
+		.depth	    = 1,
+		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+		.affinity   = CPU_MASK_ALL
+#endif
+	}
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+
+void __init early_irq_init(void)
+{
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	/* init list for sparseirq */
+	for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
+		INIT_LIST_HEAD(sparseirqhash_table + i);
+
+	desc = irq_desc_legacy;
+	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+	for (i = 0; i < legacy_count; i++) {
+		struct list_head *hash_head;
+
+		hash_head = sparseirqhashentry(i);
+		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		list_add_tail(&desc[i].hash_entry, hash_head);
+		list_add_tail(&desc[i].list, &sparse_irqs_head);
+	}
+
+	arch_early_irq_init();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+
+	hash_head = sparseirqhashentry(irq);
+
+	/*
+	 * We can walk the hash lockfree, because the hash only
+	 * grows, and we are careful when adding entries to the end:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			return desc;
+	}
+
+	return NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		return desc;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			goto out_unlock;
+	}
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	BUG_ON(!desc);
+	init_one_irq_desc(irq, desc, cpu);
+
+	/*
+	 * We use RCU's safe list-add method to make
+	 * parallel walking of the hash-list safe:
+	 */
+	list_add_tail_rcu(&desc->hash_entry, hash_head);
+	/*
+	 * Add it to the global list:
+	 */
+	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc_alloc_cpu(irq, boot_cpu_id);
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq && old_desc != desc)
+			goto out_unlock;
+	}
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	BUG_ON(!desc);
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
+	list_replace_rcu(&old_desc->list, &desc->list);
+
+	/* free the old one */
+	free_one_irq_desc(old_desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* legacy descs are statically allocated: never move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -62,17 +357,49 @@ struct irq_desc irq_desc[NR_IRQS] __cach
 	}
 };
 
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_desc[irq];
+
+	return NULL;
+}
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu)
+{
+	return old_desc;
+}
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
  */
+static void ack_bad_desc(struct irq_desc **descp)
+{
+	unsigned int irq = (*descp)->irq;
+
+	print_irq_desc(irq, *descp);
+	ack_bad_irq(irq);
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_bad ack_bad_desc
+#else
 static void ack_bad(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	print_irq_desc(irq, desc);
-	ack_bad_irq(irq);
+	ack_bad_desc(&desc);
 }
+#endif
 
 /*
  * NOP functions
@@ -81,6 +408,14 @@ static void noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void noop_desc(struct irq_desc **descp)
+{
+}
+#else
+#define noop_desc noop
+#endif
+
 static unsigned int noop_ret(unsigned int irq)
 {
 	return 0;
@@ -109,9 +444,9 @@ struct irq_chip dummy_irq_chip = {
 	.shutdown	= noop,
 	.enable		= noop,
 	.disable	= noop,
-	.ack		= noop,
-	.mask		= noop,
-	.unmask		= noop,
+	.ack		= noop_desc,
+	.mask		= noop_desc,
+	.unmask		= noop_desc,
 	.end		= noop,
 };
 
@@ -180,7 +515,7 @@ unsigned int __do_IRQ(unsigned int irq)
 		 * No locking required for CPU-local interrupts:
 		 */
 		if (desc->chip->ack)
-			desc->chip->ack(irq);
+			desc_chip_ack(irq, &desc);
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -192,7 +527,7 @@ unsigned int __do_IRQ(unsigned int irq)
 
 	spin_lock(&desc->lock);
 	if (desc->chip->ack)
-		desc->chip->ack(irq);
+		desc_chip_ack(irq, &desc);
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
@@ -261,17 +596,25 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	} end_for_each_irq_desc();
+#endif
 }
 #endif
+
+#ifdef CONFIG_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc ? desc->kstat_irqs[cpu] : 0;
+}
+EXPORT_SYMBOL(kstat_irqs_cpu);
+#endif
+
Index: linux-2.6/arch/x86/kernel/irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq.c
+++ linux-2.6/arch/x86/kernel/irq.c
@@ -99,25 +99,37 @@ static int show_other_interrupts(struct
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i, j;
 	struct irqaction *action;
 	struct irq_desc *desc;
+	int head = 0;
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc = list_entry(v, struct irq_desc, list);
+	i = desc->irq;
+	if (&desc->list == sparse_irqs_head.next)
+		head = 1;
+#else
+	i = *(loff_t *) v;
 	if (i > nr_irqs)
 		return 0;
 
 	if (i == nr_irqs)
 		return show_other_interrupts(p);
+	if (i == 0)
+		head = 1;
+
+	desc = irq_to_desc(i);
+#endif
 
 	/* print header */
-	if (i == 0) {
+	if (head) {
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	desc = irq_to_desc(i);
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
@@ -148,6 +160,12 @@ int show_interrupts(struct seq_file *p,
 	seq_putc(p, '\n');
 out:
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+#ifdef CONFIG_SPARSE_IRQ
+	if (&desc->list == sparse_irqs_head.prev)
+		show_other_interrupts(p);
+#endif
+
 	return 0;
 }
 
Index: linux-2.6/include/linux/irqnr.h
===================================================================
--- linux-2.6.orig/include/linux/irqnr.h
+++ linux-2.6/include/linux/irqnr.h
@@ -7,18 +7,11 @@
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
+# define end_for_each_irq_desc()
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
-	     irq >= 0; irq--, desc--)
+static inline void early_sparse_irq_init(void)
+{
+}
 #endif
 
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #endif
Index: linux-2.6/arch/x86/kernel/irq_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_32.c
+++ linux-2.6/arch/x86/kernel/irq_32.c
@@ -251,10 +251,10 @@ void fixup_irqs(cpumask_t map)
 			mask = map;
 		}
 		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, mask);
+			desc_chip_set_affinity(irq, desc, mask);
 		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 #if 0
 	barrier();
Index: linux-2.6/arch/x86/kernel/irq_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_64.c
+++ linux-2.6/arch/x86/kernel/irq_64.c
@@ -113,15 +113,15 @@ void fixup_irqs(cpumask_t map)
 		}
 
 		if (desc->chip->mask)
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 
 		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, mask);
+			desc_chip_set_affinity(irq, desc, mask);
 		else if (!(warned++))
 			set_affinity = 0;
 
 		if (desc->chip->unmask)
-			desc->chip->unmask(irq);
+			desc_chip_unmask(irq, &desc);
 
 		spin_unlock(&desc->lock);
 
@@ -129,7 +129,7 @@ void fixup_irqs(cpumask_t map)
 			printk("Broke affinity for irq %i\n", irq);
 		else if (!set_affinity)
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 	/* That doesn't seem sufficient.  Give it 1ms. */
 	local_irq_enable();
Index: linux-2.6/kernel/irq/proc.c
===================================================================
--- linux-2.6.orig/kernel/irq/proc.c
+++ linux-2.6/kernel/irq/proc.c
@@ -243,7 +243,8 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc)
+	for_each_irq_desc(irq, desc) {
 		register_irq_proc(irq, desc);
+	} end_for_each_irq_desc();
 }
 
Index: linux-2.6/kernel/irq/spurious.c
===================================================================
--- linux-2.6.orig/kernel/irq/spurious.c
+++ linux-2.6/kernel/irq/spurious.c
@@ -99,7 +99,7 @@ static int misrouted_irq(int irq)
 
 		if (try_one_irq(i, desc))
 			ok = 1;
-	}
+	} end_for_each_irq_desc();
 	/* So the caller can adjust the irq error counts */
 	return ok;
 }
@@ -122,7 +122,7 @@ static void poll_spurious_irqs(unsigned
 			continue;
 
 		try_one_irq(i, desc);
-	}
+	} end_for_each_irq_desc();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -542,6 +542,15 @@ void __init __weak thread_info_cache_ini
 {
 }
 
+void __init __weak arch_early_irq_init(void)
+{
+}
+
+void __init __weak early_irq_init(void)
+{
+	arch_early_irq_init();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -612,6 +621,8 @@ asmlinkage void __init start_kernel(void
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some links before init_ISA_irqs() */
+	early_irq_init();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
Index: linux-2.6/arch/x86/include/asm/irq_vectors.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/irq_vectors.h
+++ linux-2.6/arch/x86/include/asm/irq_vectors.h
@@ -101,6 +101,8 @@
 #define LAST_VM86_IRQ		15
 #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
 
+#define NR_IRQS_LEGACY		16
+
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
Index: linux-2.6/arch/x86/kernel/i8259.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/i8259.c
+++ linux-2.6/arch/x86/kernel/i8259.c
@@ -36,12 +36,31 @@ static int i8259A_auto_eoi;
 DEFINE_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
 
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_and_ack_8259A_desc(struct irq_desc **descp)
+{
+	mask_and_ack_8259A((*descp)->irq);
+}
+static void disable_8259A_irq_desc(struct irq_desc **descp)
+{
+	disable_8259A_irq((*descp)->irq);
+}
+static void enable_8259A_irq_desc(struct irq_desc **descp)
+{
+	enable_8259A_irq((*descp)->irq);
+}
+#else
+#define mask_and_ack_8259A_desc mask_and_ack_8259A
+#define disable_8259A_irq_desc disable_8259A_irq
+#define enable_8259A_irq_desc enable_8259A_irq
+#endif
+
 struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
+	.mask		= disable_8259A_irq_desc,
 	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
+	.unmask		= enable_8259A_irq_desc,
+	.mask_ack	= mask_and_ack_8259A_desc,
 };
 
 /*
@@ -348,9 +367,9 @@ void init_8259A(int auto_eoi)
 		 * In AEOI mode we just have to mask the interrupt
 		 * when acking.
 		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
+		i8259A_chip.mask_ack = disable_8259A_irq_desc;
 	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
+		i8259A_chip.mask_ack = mask_and_ack_8259A_desc;
 
 	udelay(100);		/* wait for 8259A to initialize */
 
Index: linux-2.6/arch/x86/kernel/uv_irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/uv_irq.c
+++ linux-2.6/arch/x86/kernel/uv_irq.c
@@ -18,26 +18,45 @@ static void uv_noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_noop_desc(struct irq_desc **descp)
+{
+}
+
+#else
+#define uv_noop_desc uv_noop
+#endif
+
 static unsigned int uv_noop_ret(unsigned int irq)
 {
 	return 0;
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_ack_apic_desc(struct irq_desc **descp)
+{
+	ack_APIC_irq();
+}
+
+#else
 static void uv_ack_apic(unsigned int irq)
 {
 	ack_APIC_irq();
 }
 
+#define uv_ack_apic_desc uv_ack_apic
+#endif
+
 struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
 	.enable		= uv_noop,
 	.disable	= uv_noop,
-	.ack		= uv_noop,
-	.mask		= uv_noop,
-	.unmask		= uv_noop,
-	.eoi		= uv_ack_apic,
+	.ack		= uv_noop_desc,
+	.mask		= uv_noop_desc,
+	.unmask		= uv_noop_desc,
+	.eoi		= uv_ack_apic_desc,
 	.end		= uv_noop,
 };
 
Index: linux-2.6/drivers/pci/msi.c
===================================================================
--- linux-2.6.orig/drivers/pci/msi.c
+++ linux-2.6/drivers/pci/msi.c
@@ -103,11 +103,11 @@ static void msix_set_enable(struct pci_d
 	}
 }
 
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(struct irq_desc *desc)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = get_irq_desc_msi(desc);
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -135,11 +135,11 @@ static void msix_flush_writes(unsigned i
  * Returns 1 if it succeeded in masking the interrupt and 0 if the device
  * doesn't support MSI masking.
  */
-static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
+static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = get_irq_desc_msi(desc);
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -172,9 +172,9 @@ static int msi_set_mask_bits(unsigned in
 	return 1;
 }
 
-void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
-	struct msi_desc *entry = get_irq_msi(irq);
+	struct msi_desc *entry = get_irq_desc_msi(desc);
 	switch(entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
 	{
@@ -211,9 +211,16 @@ void read_msi_msg(unsigned int irq, stru
 	}
 }
 
-void write_msi_msg(unsigned int irq, struct msi_msg *msg)
+void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	read_msi_msg_desc(desc, msg);
+}
+
+void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
-	struct msi_desc *entry = get_irq_msi(irq);
+	struct msi_desc *entry = get_irq_desc_msi(desc);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
 	{
@@ -252,17 +259,43 @@ void write_msi_msg(unsigned int irq, str
 	entry->msg = *msg;
 }
 
-void mask_msi_irq(unsigned int irq)
+void write_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
-	msi_set_mask_bits(irq, 1, 1);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	write_msi_msg_desc(desc, msg);
 }
 
+void mask_msi_irq_desc(struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+
+	msi_set_mask_bits(desc, 1, 1);
+	msix_flush_writes(desc);
+}
+
+void unmask_msi_irq_desc(struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+
+	msi_set_mask_bits(desc, 1, 0);
+	msix_flush_writes(desc);
+}
+
+#ifndef CONFIG_SPARSE_IRQ
+void mask_msi_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_msi_irq_desc(&desc);
+}
 void unmask_msi_irq(unsigned int irq)
 {
-	msi_set_mask_bits(irq, 1, 0);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_msi_irq_desc(&desc);
 }
+#endif
 
 static int msi_free_irqs(struct pci_dev* dev);
 
@@ -303,9 +336,11 @@ static void __pci_restore_msi_state(stru
 	pci_intx_for_msi(dev, 0);
 	msi_set_enable(dev, 0);
 	write_msi_msg(dev->irq, &entry->msg);
-	if (entry->msi_attrib.maskbit)
-		msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask,
+	if (entry->msi_attrib.maskbit) {
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
 				  entry->msi_attrib.masked);
+	}
 
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 	control &= ~PCI_MSI_FLAGS_QSIZE;
@@ -327,8 +362,9 @@ static void __pci_restore_msix_state(str
 	msix_set_enable(dev, 0);
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
+		struct irq_desc *desc = irq_to_desc(entry->irq);
 		write_msi_msg(entry->irq, &entry->msg);
-		msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked);
+		msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
 	}
 
 	BUG_ON(list_empty(&dev->msi_list));
@@ -596,7 +632,8 @@ void pci_msi_shutdown(struct pci_dev* de
 	/* Return the the pci reset with msi irqs unmasked */
 	if (entry->msi_attrib.maskbit) {
 		u32 mask = entry->msi_attrib.maskbits_mask;
-		msi_set_mask_bits(dev->irq, mask, ~mask);
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, mask, ~mask);
 	}
 	if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
 		return;
Index: linux-2.6/include/linux/msi.h
===================================================================
--- linux-2.6.orig/include/linux/msi.h
+++ linux-2.6/include/linux/msi.h
@@ -10,8 +10,18 @@ struct msi_msg {
 };
 
 /* Helper functions */
+struct irq_desc;
+#ifdef CONFIG_SPARSE_IRQ
+extern void mask_msi_irq_desc(struct irq_desc **descp);
+extern void unmask_msi_irq_desc(struct irq_desc **descp);
+#define mask_msi_irq mask_msi_irq_desc
+#define unmask_msi_irq unmask_msi_irq_desc
+#else
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
+#endif
+extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
+extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/include/asm/hpet.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/hpet.h
+++ linux-2.6/arch/x86/include/asm/hpet.h
@@ -72,8 +72,15 @@ extern void hpet_disable(void);
 extern unsigned long hpet_readl(unsigned long a);
 extern void force_hpet_resume(void);
 
+#ifdef CONFIG_SPARSE_IRQ
+extern void hpet_msi_unmask_desc(struct irq_desc **descp);
+extern void hpet_msi_mask_desc(struct irq_desc **descp);
+#define hpet_msi_unmask hpet_msi_unmask_desc
+#define hpet_msi_mask hpet_msi_mask_desc
+#else
 extern void hpet_msi_unmask(unsigned int irq);
 extern void hpet_msi_mask(unsigned int irq);
+#endif
 extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
 extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/kernel/hpet.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/hpet.c
+++ linux-2.6/arch/x86/kernel/hpet.c
@@ -347,9 +347,9 @@ static int hpet_legacy_next_event(unsign
 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
 
-void hpet_msi_unmask(unsigned int irq)
+void hpet_msi_unmask_desc(struct irq_desc **descp)
 {
-	struct hpet_dev *hdev = get_irq_data(irq);
+	struct hpet_dev *hdev = get_irq_desc_data(*descp);
 	unsigned long cfg;
 
 	/* unmask it */
@@ -358,10 +358,10 @@ void hpet_msi_unmask(unsigned int irq)
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
-void hpet_msi_mask(unsigned int irq)
+void hpet_msi_mask_desc(struct irq_desc **descp)
 {
 	unsigned long cfg;
-	struct hpet_dev *hdev = get_irq_data(irq);
+	struct hpet_dev *hdev = get_irq_desc_data(*descp);
 
 	/* mask it */
 	cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -369,6 +369,21 @@ void hpet_msi_mask(unsigned int irq)
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void hpet_msi_unmask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_unmask_desc(&desc);
+}
+void hpet_msi_mask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_mask_desc(&desc);
+}
+#endif
+
 void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
 {
 	struct hpet_dev *hdev = get_irq_data(irq);
Index: linux-2.6/include/linux/htirq.h
===================================================================
--- linux-2.6.orig/include/linux/htirq.h
+++ linux-2.6/include/linux/htirq.h
@@ -9,8 +9,16 @@ struct ht_irq_msg {
 /* Helper functions.. */
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
+#ifdef CONFIG_SPARSE_IRQ
+struct irq_desc;
+void mask_ht_irq_desc(struct irq_desc **descp);
+void unmask_ht_irq_desc(struct irq_desc **descp);
+#define mask_ht_irq mask_ht_irq_desc
+#define unmask_ht_irq unmask_ht_irq_desc
+#else
 void mask_ht_irq(unsigned int irq);
 void unmask_ht_irq(unsigned int irq);
+#endif
 
 /* The arch hook for getting things started */
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
Index: linux-2.6/kernel/irq/migration.c
===================================================================
--- linux-2.6.orig/kernel/irq/migration.c
+++ linux-2.6/kernel/irq/migration.c
@@ -1,9 +1,9 @@
 
 #include <linux/irq.h>
 
-void move_masked_irq(int irq)
+void move_masked_irq_desc(struct irq_desc **descp)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = *descp;
 	cpumask_t tmp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -42,14 +42,17 @@ void move_masked_irq(int irq)
 	 * masking the irqs.
 	 */
 	if (likely(!cpus_empty(tmp))) {
-		desc->chip->set_affinity(irq,tmp);
+		desc_chip_set_affinity(desc->irq, desc, tmp);
 	}
 	cpus_clear(desc->pending_mask);
 }
 
-void move_native_irq(int irq)
+void move_native_irq_desc(struct irq_desc **descp)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = *descp;
+#ifndef CONFIG_SPARSE_IRQ
+	unsigned int irq = desc->irq;
+#endif
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -57,8 +60,23 @@ void move_native_irq(int irq)
 	if (unlikely(desc->status & IRQ_DISABLED))
 		return;
 
-	desc->chip->mask(irq);
-	move_masked_irq(irq);
-	desc->chip->unmask(irq);
+	desc_chip_mask(irq, descp);
+	move_masked_irq_desc(descp);
+	desc_chip_unmask(irq, descp);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void move_masked_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	move_masked_irq_desc(&desc);
+}
+
+void move_native_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	move_native_irq_desc(&desc);
+}
+#endif
Index: linux-2.6/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.orig/drivers/pci/intel-iommu.c
+++ linux-2.6/drivers/pci/intel-iommu.c
@@ -751,9 +751,9 @@ const char *dmar_get_fault_reason(u8 fau
 		return fault_reason_strings[fault_reason];
 }
 
-void dmar_msi_unmask(unsigned int irq)
+void dmar_msi_unmask_desc(struct irq_desc **descp)
 {
-	struct intel_iommu *iommu = get_irq_data(irq);
+	struct intel_iommu *iommu = get_irq_desc_data(*descp);
 	unsigned long flag;
 
 	/* unmask it */
@@ -764,10 +764,10 @@ void dmar_msi_unmask(unsigned int irq)
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
-void dmar_msi_mask(unsigned int irq)
+void dmar_msi_mask_desc(struct irq_desc **descp)
 {
 	unsigned long flag;
-	struct intel_iommu *iommu = get_irq_data(irq);
+	struct intel_iommu *iommu = get_irq_desc_data(*descp);
 
 	/* mask it */
 	spin_lock_irqsave(&iommu->register_lock, flag);
@@ -777,6 +777,21 @@ void dmar_msi_mask(unsigned int irq)
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void dmar_msi_unmask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_unmask_desc(&desc);
+}
+void dmar_msi_mask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_mask_desc(&desc);
+}
+#endif
+
 void dmar_msi_write(int irq, struct msi_msg *msg)
 {
 	struct intel_iommu *iommu = get_irq_data(irq);
Index: linux-2.6/include/linux/dmar.h
===================================================================
--- linux-2.6.orig/include/linux/dmar.h
+++ linux-2.6/include/linux/dmar.h
@@ -122,8 +122,15 @@ extern const char *dmar_get_fault_reason
 /* Can't use the common MSI interrupt functions
  * since DMAR is not a pci device
  */
+#ifdef CONFIG_SPARSE_IRQ
+extern void dmar_msi_unmask_desc(struct irq_desc **descp);
+extern void dmar_msi_mask_desc(struct irq_desc **descp);
+#define dmar_msi_unmask dmar_msi_unmask_desc
+#define dmar_msi_mask dmar_msi_mask_desc
+#else
 extern void dmar_msi_unmask(unsigned int irq);
 extern void dmar_msi_mask(unsigned int irq);
+#endif
 extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);
Index: linux-2.6/kernel/irq/manage.c
===================================================================
--- linux-2.6.orig/kernel/irq/manage.c
+++ linux-2.6/kernel/irq/manage.c
@@ -92,14 +92,14 @@ int irq_set_affinity(unsigned int irq, c
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
 		desc->affinity = cpumask;
-		desc->chip->set_affinity(irq, cpumask);
+		desc_chip_set_affinity(irq, desc, cpumask);
 	} else {
 		desc->status |= IRQ_MOVE_PENDING;
 		desc->pending_mask = cpumask;
 	}
 #else
 	desc->affinity = cpumask;
-	desc->chip->set_affinity(irq, cpumask);
+	desc_chip_set_affinity(irq, desc, cpumask);
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
 	spin_unlock_irqrestore(&desc->lock, flags);
@@ -131,7 +131,7 @@ int do_irq_select_affinity(unsigned int
 	}
 
 	desc->affinity = mask;
-	desc->chip->set_affinity(irq, mask);
+	desc_chip_set_affinity(irq, desc, mask);
 
 	return 0;
 }

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v14
  2008-11-14  6:29                                                   ` [PATCH] sparse_irq aka dyn_irq v14 Yinghai Lu
@ 2008-11-14  6:46                                                     ` Andrew Morton
  2008-11-15  9:05                                                       ` Yinghai Lu
  0 siblings, 1 reply; 66+ messages in thread
From: Andrew Morton @ 2008-11-14  6:46 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: mingo, tglx, hpa, linux-kernel, travis

On Thu, 13 Nov 2008 22:29:21 -0800 Yinghai Lu <yinghai@kernel.org> wrote:

> address some Andrew's concerns.
> 
> ...
>
> +static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
> +{
> +	struct irq_pin_list *pin;
> +	int node;
> +
> +	node = cpu_to_node(cpu);
> +
> +	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
> +	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
> +	BUG_ON(!pin);
> +
> +	return pin;
> +}

GFP_ATOMIC allocation attempts are unreliable - much more so than
GFP_KERNEL.  GFP_ATOMIC allocations can and do fail.  With the above
code, such a failure will crash the machine.

The code should handle this error and recover gracefully.

>
> ...
>
> +static struct irq_cfg *get_one_free_irq_cfg(int cpu)
> +{
> +	struct irq_cfg *cfg;
> +	int node;
> +
> +	node = cpu_to_node(cpu);
> +
> +	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
> +	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
> +	BUG_ON(!cfg);
> +
> +	return cfg;
>  }

Ditto

> ...
>
> +static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
> +{
> +	struct irq_2_iommu *iommu;
> +	int node;
> +
> +	node = cpu_to_node(cpu);
> +
> +	iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
> +	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
> +
> +	return iommu;
> +}

I spent some time trying to work out whether the callers handle failure
here but I got lost in a twisty maze.

> ...
>
> +static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
> +{
> +	unsigned long bytes;
> +	char *ptr;
> +	int node;
> +
> +	/* Compute how many bytes we need per irq and allocate them */
> +	bytes = nr * sizeof(unsigned int);
> +
> +	node = cpu_to_node(cpu);
> +	ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
> +	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
> +	BUG_ON(!ptr);
> +
> +	desc->kstat_irqs = (unsigned int *)ptr;
> +}

Ditto.

>
> ...
>
> +struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
> +{
> +	struct irq_desc *desc;
> +	struct list_head *hash_head;
> +	unsigned long flags;
> +	int node;
> +
> +	desc = irq_to_desc(irq);
> +	if (desc)
> +		return desc;
> +
> +	hash_head = sparseirqhashentry(irq);
> +
> +	spin_lock_irqsave(&sparse_irq_lock, flags);
> +
> +	/*
> +	 * We have to do the hash-walk again, to avoid races
> +	 * with another CPU:
> +	 */
> +	list_for_each_entry(desc, hash_head, hash_entry) {
> +		if (desc->irq == irq)
> +			goto out_unlock;
> +	}
> +
> +	node = cpu_to_node(cpu);
> +	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
> +	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
> +		 irq, irq, cpu, node);
> +	BUG_ON(!desc);

Ditto.

> +	init_one_irq_desc(irq, desc, cpu);
> +
> +	/*
> +	 * We use RCU's safe list-add method to make
> +	 * parallel walking of the hash-list safe:
> +	 */
> +	list_add_tail_rcu(&desc->hash_entry, hash_head);
> +	/*
> +	 * Add it to the global list:
> +	 */
> +	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
> +
> +out_unlock:
> +	spin_unlock_irqrestore(&sparse_irq_lock, flags);
> +
> +	return desc;
> +}
> +
>
> ...
>
> +static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
> +						int cpu)
> +{
> +	struct irq_desc *desc;
> +	unsigned int irq;
> +	struct list_head *hash_head;
> +	unsigned long flags;
> +	int node;
> +
> +	irq = old_desc->irq;
> +
> +	hash_head = sparseirqhashentry(irq);
> +
> +	spin_lock_irqsave(&sparse_irq_lock, flags);
> +	/*
> +	 * We have to do the hash-walk again, to avoid races
> +	 * with another CPU:
> +	 */
> +	list_for_each_entry(desc, hash_head, hash_entry) {
> +		if (desc->irq == irq && old_desc != desc)
> +			goto out_unlock;
> +	}
> +
> +	node = cpu_to_node(cpu);
> +	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
> +	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
> +		 irq, irq, cpu, node);
> +	BUG_ON(!desc);

Ditto.

> +	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
> +
> +	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
> +	list_replace_rcu(&old_desc->list, &desc->list);
> +
> +	/* free the old one */
> +	free_one_irq_desc(old_desc);
> +	kfree(old_desc);
> +
> +out_unlock:
> +	spin_unlock_irqrestore(&sparse_irq_lock, flags);
> +
> +	return desc;
> +}
> +



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v14
  2008-11-14  6:46                                                     ` Andrew Morton
@ 2008-11-15  9:05                                                       ` Yinghai Lu
  0 siblings, 0 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-15  9:05 UTC (permalink / raw)
  To: Andrew Morton, mingo, tglx, hpa; +Cc: linux-kernel, travis

Please check it. This version addresses the handling of kzalloc_node return values.

YH

---

From: Yinghai Lu <yinghai@kernel.org>
Subject: sparseirq v15

impact: new feature sparseirq

add a hash table, as Ingo suggested.
remove dyna_array

when sparse_irq is used (CONFIG_SPARSE_IRQ), use kzalloc_node to allocate irq_desc and irq_cfg
  use desc->chip_data on x86 to store the irq_cfg

if CONFIG_MOVE_IRQ_DESC is set
  make the irq_desc follow the affinity, i.e. irq_desc moving etc.
  call move_irq_desc in irq_complete_move()
  need to pass (struct irq_desc **descp) to ack_edge/level to make sure the desc pointer gets updated
  try to pass desc and cfg around as much as possible, to avoid list lookups.
  legacy irq_descs are not moved, because they are allocated in a static array

for logical apic mode, we need to add move_desc_in_progress_in_same_domain; otherwise the desc will not get moved. ==> it could also take two phases to get the irq_desc moved.
	for example: 0xff is the old affinity; first set it to 0xf, and then set it to 0xf0.
	[ or do we need to change the domain definition to cpus on the same node? ]

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so we assume the user-space program should update /proc/irq/XX/smp_affinity to 03 or 0f first at boot,
or should we change irq_default_affinity?

for physical apic mode it is much simpler
on 4 sockets 16 cores system
irq_desc is moving..
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig                   |   20 
 arch/x86/include/asm/hpet.h        |    7 
 arch/x86/include/asm/irq_vectors.h |    2 
 arch/x86/kernel/hpet.c             |   23 
 arch/x86/kernel/i8259.c            |   29 +
 arch/x86/kernel/io_apic.c          |  880 ++++++++++++++++++++++++++-----------
 arch/x86/kernel/irq.c              |   24 -
 arch/x86/kernel/irq_32.c           |    4 
 arch/x86/kernel/irq_64.c           |    8 
 arch/x86/kernel/irqinit_32.c       |    3 
 arch/x86/kernel/irqinit_64.c       |    3 
 arch/x86/kernel/uv_irq.c           |   27 -
 arch/x86/mm/init_32.c              |    3 
 drivers/char/random.c              |   31 +
 drivers/pci/htirq.c                |   44 +
 drivers/pci/intel-iommu.c          |   23 
 drivers/pci/intr_remapping.c       |   72 ++-
 drivers/pci/msi.c                  |   71 ++
 drivers/xen/events.c               |    9 
 fs/proc/interrupts.c               |   18 
 fs/proc/stat.c                     |   17 
 include/linux/dmar.h               |    7 
 include/linux/htirq.h              |    8 
 include/linux/interrupt.h          |    2 
 include/linux/irq.h                |   95 +++
 include/linux/irqnr.h              |   15 
 include/linux/kernel_stat.h        |   14 
 include/linux/msi.h                |   10 
 init/main.c                        |   11 
 kernel/irq/autoprobe.c             |   10 
 kernel/irq/chip.c                  |   40 -
 kernel/irq/handle.c                |  386 +++++++++++++++-
 kernel/irq/manage.c                |    6 
 kernel/irq/migration.c             |   34 +
 kernel/irq/proc.c                  |    3 
 kernel/irq/spurious.c              |    4 
 36 files changed, 1590 insertions(+), 373 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -240,6 +240,26 @@ config X86_HAS_BOOT_CPU_ID
 	def_bool y
 	depends on X86_VOYAGER
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on PCI_MSI || HT_IRQ
+	default y
+	help
+	  This enables support for sparse irqs, especially for MSI/MSI-X. The
+	  irq number will be bus/dev/fn + 12 bits. You may need this if you
+	  have lots of cards supporting MSI-X installed.
+
+	  If you don't know what to do here, say Y.
+
+config MOVE_IRQ_DESC
+	bool "Move irq desc when changing irq smp_affinity"
+	depends on SPARSE_IRQ && SMP
+	default y
+	help
+	  This enables moving the irq_desc to the cpu/node where the irq is handled.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -108,94 +108,261 @@ static int __init parse_noapic(char *str
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+	int apic, pin;
+	struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+	struct irq_pin_list *pin;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+	return pin;
+}
+
 struct irq_cfg {
-	unsigned int irq;
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_MOVE_IRQ_DESC
+	u8 move_desc_in_progress_in_same_domain : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_early_irq_init(void)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int count;
+	int i;
+#ifdef CONFIG_SPARSE_IRQ
+	int count_desc = NR_IRQS_LEGACY;
+#else
+	int count_desc = NR_IRQS;
+#endif
+
+	cfg = irq_cfgx;
+	count = ARRAY_SIZE(irq_cfgx);
+
+	BUG_ON(count > count_desc);
+
+	for (i = 0; i < count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
 
+#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
+
+	return cfg;
+}
+
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+{
+	struct irq_cfg *cfg;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+	return cfg;
 }
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
-	return irq_cfg(irq);
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (!cfg) {
+		desc->chip_data = get_one_free_irq_cfg(cpu);
+		if (!desc->chip_data) {
+			printk(KERN_ERR "can not alloc irq_cfg\n");
+			BUG_ON(1);
+		}
+	}
 }
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+#ifdef CONFIG_MOVE_IRQ_DESC
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
+{
+	struct irq_pin_list *old_entry, *head, *tail, *entry;
 
-struct irq_pin_list {
-	int apic, pin;
-	struct irq_pin_list *next;
-};
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
+
+	entry = get_one_free_irq_2_pin(cpu);
+	if (!entry)
+		return;
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+	entry->apic = old_entry->apic;
+	entry->pin = old_entry->pin;
+	head = entry;
+	tail = entry;
+	old_entry = old_entry->next;
+
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		if (!entry) {
+			entry = head;
+			while (entry) {
+				head = entry->next;
+				kfree(entry);
+				entry = head;
+			}
+			/* still use the old one */
+			return;
+		}
+		entry->apic = old_entry->apic;
+		entry->pin = old_entry->pin;
+		tail->next = entry;
+		tail = entry;
+		old_entry = old_entry->next;
+	}
 
-static void __init irq_2_pin_init(void)
+	tail->next = NULL;
+	cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_pin_list *entry, *next;
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+		return;
 
-	irq_2_pin_ptr = &pin[0];
+	entry = old_cfg->irq_2_pin;
+
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	old_cfg->irq_2_pin = NULL;
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	cfg = get_one_free_irq_cfg(cpu);
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+	if (!cfg)
+		return;
+
+	desc->chip_data = cfg;
+
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+	kfree(old_cfg);
 }
 
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	struct irq_cfg *old_cfg, *cfg;
+
+	old_cfg = old_desc->chip_data;
+	cfg = desc->chip_data;
+
+	if (old_cfg == cfg)
+		return;
+
+	if (old_cfg) {
+		free_irq_2_pin(old_cfg, cfg);
+		free_irq_cfg(old_cfg);
+		old_desc->chip_data = NULL;
+	}
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means that domain is not changed */
+		if (!cpus_intersects(desc->affinity, mask))
+			cfg->move_desc_in_progress_in_same_domain = 1;
+	}
+}
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+#ifndef CONFIG_MOVE_IRQ_DESC
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+}
+#endif
+
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -237,11 +404,10 @@ static inline void io_apic_modify(unsign
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	entry = cfg->irq_2_pin;
@@ -323,13 +489,12 @@ static void ioapic_mask_entry(int apic,
 }
 
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
 	int apic, pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
 
-	cfg = irq_cfg(irq);
 	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
@@ -359,24 +524,27 @@ static void __target_IO_APIC_irq(unsigne
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
@@ -384,12 +552,24 @@ static void set_ioapic_affinity_irq(unsi
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
-	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__target_IO_APIC_irq(irq, dest, cfg->vector);
+	__target_IO_APIC_irq(irq, dest, cfg);
 	desc->affinity = mask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define set_ioapic_affinity_irq set_ioapic_affinity_irq_desc
+#else
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	set_ioapic_affinity_irq_desc(desc, mask);
+}
+#endif
 #endif /* CONFIG_SMP */
 
 /*
@@ -397,16 +577,18 @@ static void set_ioapic_affinity_irq(unsi
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
+		if (!entry) {
+			printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+					apic, pin);
+			return;
+		}
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,7 +603,7 @@ static void add_pin_to_irq(unsigned int
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
@@ -430,11 +612,10 @@ static void add_pin_to_irq(unsigned int
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -451,18 +632,16 @@ static void __init replace_pin_at_irq(un
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
 				int mask_and, int mask_or,
 				void (*final)(struct irq_pin_list *entry))
 {
 	int pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	cfg = irq_cfg(irq);
 	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
 		unsigned int reg;
 		pin = entry->pin;
@@ -475,9 +654,9 @@ static inline void io_apic_modify_irq(un
 	}
 }
 
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
 #ifdef CONFIG_X86_64
@@ -492,47 +671,69 @@ void io_apic_sync(struct irq_pin_list *e
 	readl(&io_apic->data);
 }
 
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
 			IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
 			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc **descp)
 {
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
+	BUG_ON(!cfg);
+
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__mask_IO_APIC_irq(irq);
+	__mask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc **descp)
 {
+	struct irq_cfg *cfg = (*descp)->chip_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_IO_APIC_irq(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define mask_IO_APIC_irq mask_IO_APIC_irq_desc
+#define unmask_IO_APIC_irq unmask_IO_APIC_irq_desc
+#else
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_IO_APIC_irq_desc(&desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_IO_APIC_irq_desc(&desc);
+}
+#endif
+
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
@@ -809,7 +1010,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1034,7 +1235,7 @@ void unlock_vector_lock(void)
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1050,16 +1251,13 @@ static int __assign_irq_vector(int irq,
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
 
-	cfg = irq_cfg(irq);
+	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+		return -EBUSY;
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-		return -EBUSY;
-
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_t tmp;
@@ -1113,24 +1311,22 @@ next:
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1148,14 +1344,16 @@ void __setup_vector_irq(int cpu)
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
+	} end_for_each_irq_desc();
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
@@ -1201,11 +1399,8 @@ static inline int IO_APIC_irq_trigger(in
 }
 #endif
 
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
 {
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1297,7 +1492,7 @@ static int setup_ioapic_entry(int apic,
 	return 0;
 }
 
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
 			      int trigger, int polarity)
 {
 	struct irq_cfg *cfg;
@@ -1307,10 +1502,10 @@ static void setup_IO_APIC_irq(int apic,
 	if (!IO_APIC_IRQ(irq))
 		return;
 
-	cfg = irq_cfg(irq);
+	cfg = desc->chip_data;
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1327,12 +1522,12 @@ static void setup_IO_APIC_irq(int apic,
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
-	ioapic_register_intr(irq, trigger);
-	if (irq < 16)
+	ioapic_register_intr(irq, desc, trigger);
+	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic, pin, entry);
@@ -1342,6 +1537,9 @@ static void __init setup_IO_APIC_irqs(vo
 {
 	int apic, pin, idx, irq;
 	int notcon = 0;
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int cpu = boot_cpu_id;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -1373,9 +1571,11 @@ static void __init setup_IO_APIC_irqs(vo
 			if (multi_timer_check(apic, irq))
 				continue;
 #endif
-			add_pin_to_irq(irq, apic, pin);
+			desc = irq_to_desc_alloc_cpu(irq, cpu);
+			cfg = desc->chip_data;
+			add_pin_to_irq_cpu(cfg, cpu, apic, pin);
 
-			setup_IO_APIC_irq(apic, pin, irq,
+			setup_IO_APIC_irq(apic, pin, irq, desc,
 					irq_trigger(idx), irq_polarity(idx));
 		}
 	}
@@ -1434,6 +1634,7 @@ __apicdebuginit(void) print_IO_APIC(void
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1523,8 +1724,10 @@ __apicdebuginit(void) print_IO_APIC(void
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1535,7 +1738,7 @@ __apicdebuginit(void) print_IO_APIC(void
 			entry = entry->next;
 		}
 		printk("\n");
-	}
+	} end_for_each_irq_desc();
 
 	printk(KERN_INFO ".................................... done.\n");
 
@@ -2008,14 +2211,16 @@ static unsigned int startup_ioapic_irq(u
 {
 	int was_pending = 0;
 	unsigned long flags;
+	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_IO_APIC_irq(irq);
+	cfg = irq_cfg(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
@@ -2078,35 +2283,37 @@ static DECLARE_DELAYED_WORK(ir_migration
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
 	int modify_ioapic_rte;
 	unsigned int dest;
 	unsigned long flags;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
+	irq = desc->irq;
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	desc = irq_to_desc(irq);
 	modify_ioapic_rte = desc->status & IRQ_LEVEL;
 	if (modify_ioapic_rte) {
 		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		__target_IO_APIC_irq(irq, dest, cfg);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -2128,14 +2335,14 @@ static void migrate_ioapic_irq(int irq,
 	desc->affinity = mask;
 }
 
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 {
 	int ret = -1;
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
 
-	mask_IO_APIC_irq(irq);
+	mask_IO_APIC_irq_desc(&desc);
 
-	if (io_apic_level_ack_pending(irq)) {
+	if (io_apic_level_ack_pending(cfg)) {
 		/*
 		 * Interrupt in progress. Migrating irq now will change the
 		 * vector information in the IO-APIC RTE and that will confuse
@@ -2147,14 +2354,15 @@ static int migrate_irq_remapped_level(in
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq(irq, desc->pending_mask);
+	migrate_ioapic_irq_desc(desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
 	cpus_clear(desc->pending_mask);
 
 unmask:
-	unmask_IO_APIC_irq(irq);
+	unmask_IO_APIC_irq_desc(&desc);
+
 	return ret;
 }
 
@@ -2175,29 +2383,37 @@ static void ir_irq_migration(struct work
 				continue;
 			}
 
-			desc->chip->set_affinity(irq, desc->pending_mask);
+			desc_chip_set_affinity(irq, desc, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
 		desc->pending_mask = mask;
-		migrate_irq_remapped_level(irq);
+		migrate_irq_remapped_level_desc(desc);
 		return;
 	}
 
-	migrate_ioapic_irq(irq, mask);
+	migrate_ioapic_irq_desc(desc, mask);
+}
+#ifdef CONFIG_SPARSE_IRQ
+#define set_ir_ioapic_affinity_irq set_ir_ioapic_affinity_irq_desc
+#else
+static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif
+#endif
 
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
 {
@@ -2236,19 +2452,40 @@ unlock:
 	irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = *descp;
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_MOVE_IRQ_DESC
+		if (likely(!cfg->move_desc_in_progress_in_same_domain))
+			return;
+
+		/* it means the domain is not changed, but the affinity is changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_in_progress_in_same_domain = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
@@ -2256,9 +2493,24 @@ static void irq_complete_move(unsigned i
 	}
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
+
 #ifdef CONFIG_INTR_REMAP
+#ifdef CONFIG_SPARSE_IRQ
+static void ack_x2apic_level_desc(struct irq_desc **descp)
+{
+	ack_x2APIC_irq();
+}
+
+static void ack_x2apic_edge_desc(struct irq_desc **descp)
+{
+	ack_x2APIC_irq();
+}
+
+#define ack_x2apic_level ack_x2apic_level_desc
+#define ack_x2apic_edge ack_x2apic_edge_desc
+#else
 static void ack_x2apic_level(unsigned int irq)
 {
 	ack_x2APIC_irq();
@@ -2270,29 +2522,34 @@ static void ack_x2apic_edge(unsigned int
 }
 #endif
 
-static void ack_apic_edge(unsigned int irq)
+#endif
+
+static void ack_apic_edge_desc(struct irq_desc **descp)
 {
-	irq_complete_move(irq);
-	move_native_irq(irq);
+	irq_complete_move(descp);
+#ifdef CONFIG_SMP
+	move_native_irq_desc(descp);
+#endif
 	ack_APIC_irq();
 }
 
 atomic_t irq_mis_count;
 
-static void ack_apic_level(unsigned int irq)
+static void ack_apic_level_desc(struct irq_desc **descp)
 {
 #ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
 #endif
+	struct irq_cfg *cfg;
 	int do_unmask_irq = 0;
 
-	irq_complete_move(irq);
+	irq_complete_move(descp);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely((*descp)->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
-		mask_IO_APIC_irq(irq);
+		mask_IO_APIC_irq_desc(descp);
 	}
 #endif
 
@@ -2316,7 +2573,8 @@ static void ack_apic_level(unsigned int
 	* operation to prevent an edge-triggered interrupt escaping meanwhile.
 	* The idea is from Manfred Spraul.  --macro
 	*/
-	i = irq_cfg(irq)->vector;
+	cfg = (*descp)->chip_data;
+	i = cfg->vector;
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2355,22 +2613,44 @@ static void ack_apic_level(unsigned int
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(irq))
-			move_masked_irq(irq);
-		unmask_IO_APIC_irq(irq);
+		cfg = (*descp)->chip_data;
+		if (!io_apic_level_ack_pending(cfg)) {
+# ifdef CONFIG_SMP
+			move_masked_irq_desc(descp);
+# endif
+		}
+		unmask_IO_APIC_irq_desc(descp);
 	}
 
 #ifdef CONFIG_X86_32
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		__mask_and_edge_IO_APIC_irq(cfg);
+		__unmask_and_level_IO_APIC_irq(cfg);
 		spin_unlock(&ioapic_lock);
 	}
 #endif
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_apic_edge ack_apic_edge_desc
+#define ack_apic_level ack_apic_level_desc
+#else
+static void ack_apic_edge(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_apic_edge_desc(&desc);
+}
+static void ack_apic_level(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_apic_level_desc(&desc);
+}
+#endif
+
 static struct irq_chip ioapic_chip __read_mostly = {
 	.name		= "IO-APIC",
 	.startup	= startup_ioapic_irq,
@@ -2416,29 +2696,28 @@ static inline void init_IO_APIC_traps(vo
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < 16)
+			if (irq < NR_IRQS_LEGACY)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
-	}
+	} end_for_each_irq_desc();
 }
 
 /*
  * The local APIC irq-chip implementation:
  */
 
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irq_desc(struct irq_desc **descp)
 {
 	unsigned long v;
 
@@ -2446,7 +2725,7 @@ static void mask_lapic_irq(unsigned int
 	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq_desc(struct irq_desc **descp)
 {
 	unsigned long v;
 
@@ -2454,11 +2733,36 @@ static void unmask_lapic_irq(unsigned in
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq_desc(struct irq_desc **descp)
 {
 	ack_APIC_irq();
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+#define mask_lapic_irq mask_lapic_irq_desc
+#define unmask_lapic_irq unmask_lapic_irq_desc
+#define ack_lapic_irq ack_lapic_irq_desc
+#else
+static void mask_lapic_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_lapic_irq_desc(&desc);
+}
+static void unmask_lapic_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_lapic_irq_desc(&desc);
+}
+static void ack_lapic_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_lapic_irq_desc(&desc);
+}
+#endif
+
 static struct irq_chip lapic_chip __read_mostly = {
 	.name		= "local-APIC",
 	.mask		= mask_lapic_irq,
@@ -2466,11 +2770,8 @@ static struct irq_chip lapic_chip __read
 	.ack		= ack_lapic_irq,
 };
 
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
 {
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
 	desc->status &= ~IRQ_LEVEL;
 	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
 				      "edge");
@@ -2574,7 +2875,9 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_desc *desc = irq_to_desc(0);
+	struct irq_cfg *cfg = desc->chip_data;
+	int cpu = boot_cpu_id;
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2589,7 +2892,7 @@ static inline void __init check_timer(vo
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2640,10 +2943,10 @@ static inline void __init check_timer(vo
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irq_desc(&desc);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
@@ -2669,9 +2972,9 @@ static inline void __init check_timer(vo
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irq_desc(&desc);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2703,7 +3006,7 @@ static inline void __init check_timer(vo
 	apic_printk(APIC_QUIET, KERN_INFO
 		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
-	lapic_register_intr(0);
+	lapic_register_intr(0, desc);
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
@@ -2888,22 +3191,26 @@ unsigned int create_irq_nr(unsigned int
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
+	struct irq_cfg *cfg_new = NULL;
+	int cpu = boot_cpu_id;
+	struct irq_desc *desc_new = NULL;
 
+#ifndef CONFIG_SPARSE_IRQ
 	irq_want = nr_irqs - 1;
+#endif
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -2911,6 +3218,9 @@ unsigned int create_irq_nr(unsigned int
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init cleared it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
@@ -2930,14 +3240,22 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* store it, in case dynamic_irq_cleanup clears it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2952,12 +3270,12 @@ static int msi_compose_msg(struct pci_de
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3013,61 +3331,75 @@ static int msi_compose_msg(struct pci_de
 }
 
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq = desc->irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	read_msi_msg(irq, &msg);
+	read_msi_msg_desc(desc, &msg);
 
 	msg.data &= ~MSI_DATA_VECTOR_MASK;
 	msg.data |= MSI_DATA_VECTOR(cfg->vector);
 	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
-	write_msi_msg(irq, &msg);
-	desc = irq_to_desc(irq);
+	write_msi_msg_desc(desc, &msg);
 	desc->affinity = mask;
 }
+#ifdef CONFIG_SPARSE_IRQ
+#define set_msi_irq_affinity set_msi_irq_affinity_desc
+#else
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
 
+	set_msi_irq_affinity_desc(desc, mask);
+}
+#endif
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
  * done in the process context using interrupt-remapping hardware.
  */
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void ir_set_msi_irq_affinity_desc(struct irq_desc *desc,
+					 cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
-	struct irq_desc *desc;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
+	irq = desc->irq;
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3091,9 +3423,20 @@ static void ir_set_msi_irq_affinity(unsi
 		cfg->move_in_progress = 0;
 	}
 
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define ir_set_msi_irq_affinity ir_set_msi_irq_affinity_desc
+#else
+static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ir_set_msi_irq_affinity_desc(desc, mask);
+}
+#endif
+
 #endif
 #endif /* CONFIG_SMP */
 
@@ -3152,7 +3495,7 @@ static int msi_alloc_irte(struct pci_dev
 }
 #endif
 
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
 	int ret;
 	struct msi_msg msg;
@@ -3161,7 +3504,7 @@ static int setup_msi_irq(struct pci_dev
 	if (ret < 0)
 		return ret;
 
-	set_irq_msi(irq, desc);
+	set_irq_msi(irq, msidesc);
 	write_msi_msg(irq, &msg);
 
 #ifdef CONFIG_INTR_REMAP
@@ -3176,7 +3519,7 @@ static int setup_msi_irq(struct pci_dev
 #endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
-	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for MSI/MSI-X\n", irq, irq);
 
 	return 0;
 }
@@ -3185,6 +3528,7 @@ static unsigned int build_irq_for_pci_de
 {
 	unsigned int irq;
 
+	/* use 8 bits (bus) + 8 bits (devfn) + 12 bits */
 	irq = dev->bus->number;
 	irq <<= 8;
 	irq |= dev->devfn;
@@ -3193,13 +3537,13 @@ static unsigned int build_irq_for_pci_de
 	return irq;
 }
 
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
 {
 	unsigned int irq;
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
@@ -3214,7 +3558,7 @@ int arch_setup_msi_irq(struct pci_dev *d
 		goto error;
 no_ir:
 #endif
-	ret = setup_msi_irq(dev, desc, irq);
+	ret = setup_msi_irq(dev, msidesc, irq);
 	if (ret < 0) {
 		destroy_irq(irq);
 		return ret;
@@ -3232,7 +3576,7 @@ int arch_setup_msi_irqs(struct pci_dev *
 {
 	unsigned int irq;
 	int ret, sub_handle;
-	struct msi_desc *desc;
+	struct msi_desc *msidesc;
 	unsigned int irq_want;
 
 #ifdef CONFIG_INTR_REMAP
@@ -3240,9 +3584,10 @@ int arch_setup_msi_irqs(struct pci_dev *
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	/* count down from the top of the 12-bit range (0xfff) */
+	irq_want = build_irq_for_pci_dev(dev) + 0xfff;
 	sub_handle = 0;
-	list_for_each_entry(desc, &dev->msi_list, list) {
+	list_for_each_entry(msidesc, &dev->msi_list, list) {
 		irq = create_irq_nr(irq_want--);
 		if (irq == 0)
 			return -1;
@@ -3275,7 +3620,7 @@ int arch_setup_msi_irqs(struct pci_dev *
 		}
 no_ir:
 #endif
-		ret = setup_msi_irq(dev, desc, irq);
+		ret = setup_msi_irq(dev, msidesc, irq);
 		if (ret < 0)
 			goto error;
 		sub_handle++;
@@ -3294,22 +3639,25 @@ void arch_teardown_msi_irq(unsigned int
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3321,9 +3669,20 @@ static void dmar_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define dmar_msi_set_affinity dmar_msi_set_affinity_desc
+#else
+static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_set_affinity_desc(desc, mask);
+}
+#endif
+
 #endif /* CONFIG_SMP */
 
 struct irq_chip dmar_msi_type = {
@@ -3355,22 +3714,25 @@ int arch_setup_dmar_msi(unsigned int irq
 #ifdef CONFIG_HPET_TIMER
 
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3382,9 +3744,19 @@ static void hpet_msi_set_affinity(unsign
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define hpet_msi_set_affinity hpet_msi_set_affinity_desc
+#else
+static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_set_affinity_desc(desc, mask);
+}
+#endif
 #endif /* CONFIG_SMP */
 
 struct irq_chip hpet_msi_type = {
@@ -3437,28 +3809,40 @@ static void target_ht_irq(unsigned int i
 	write_ht_irq_msg(irq, &msg);
 }
 
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq = desc->irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+#define set_ht_irq_affinity set_ht_irq_affinity_desc
+#else
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	set_ht_irq_affinity_desc(desc, mask);
+}
+#endif
 #endif
 
 static struct irq_chip ht_irq_chip = {
@@ -3478,13 +3862,13 @@ int arch_setup_ht_irq(unsigned int irq,
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3508,7 +3892,8 @@ int arch_setup_ht_irq(unsigned int irq,
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
 
-		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+		dev_printk(KERN_DEBUG, &dev->dev, "irq %d aka 0x%08x for HT\n",
+				 irq, irq);
 	}
 	return err;
 }
@@ -3530,7 +3915,9 @@ int arch_enable_uv_irq(char *irq_name, u
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3539,8 +3926,6 @@ int arch_enable_uv_irq(char *irq_name, u
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3594,6 +3979,7 @@ int __init io_apic_get_redir_entries (in
 
 int __init probe_nr_irqs(void)
 {
+#ifdef CONFIG_SPARSE_IRQ
 	int idx;
 	int nr = 0;
 #ifndef CONFIG_XEN
@@ -3611,10 +3997,11 @@ int __init probe_nr_irqs(void)
 	/* something wrong ? */
 	if (nr < nr_min)
 		nr = nr_min;
-	if (WARN_ON(nr > NR_IRQS))
-		nr = NR_IRQS;
 
 	return nr;
+#else
+	return NR_IRQS;
+#endif
 }
 
 /* --------------------------------------------------------------------------
@@ -3713,19 +4100,27 @@ int __init io_apic_get_version(int ioapi
 
 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
 {
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int cpu = boot_cpu_id;
+
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
 			ioapic);
 		return -EINVAL;
 	}
 
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
-		add_pin_to_irq(irq, ioapic, pin);
+	if (irq >= NR_IRQS_LEGACY) {
+		cfg = desc->chip_data;
+		add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+	}
 
-	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
 
 	return 0;
 }
@@ -3779,9 +4174,10 @@ void __init setup_ioapic_dest(void)
 			 * when you have too many devices, because at that time only boot
 			 * cpu is online.
 			 */
-			cfg = irq_cfg(irq);
+			desc = irq_to_desc(irq);
+			cfg = desc->chip_data;
 			if (!cfg->vector) {
-				setup_IO_APIC_irq(ioapic, pin, irq,
+				setup_IO_APIC_irq(ioapic, pin, irq, desc,
 						  irq_trigger(irq_entry),
 						  irq_polarity(irq_entry));
 				continue;
@@ -3791,7 +4187,6 @@ void __init setup_ioapic_dest(void)
 			/*
 			 * Honour affinities which have been set in early boot
 			 */
-			desc = irq_to_desc(irq);
 			if (desc->status &
 			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
 				mask = desc->affinity;
@@ -3800,10 +4195,10 @@ void __init setup_ioapic_dest(void)
 
 #ifdef CONFIG_INTR_REMAP
 			if (intr_remapping_enabled)
-				set_ir_ioapic_affinity_irq(irq, mask);
+				set_ir_ioapic_affinity_irq_desc(desc, mask);
 			else
 #endif
-				set_ioapic_affinity_irq(irq, mask);
+				set_ioapic_affinity_irq_desc(desc, mask);
 		}
 
 	}
@@ -3852,7 +4247,6 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
Index: linux-2.6/arch/x86/kernel/irqinit_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_32.c
+++ linux-2.6/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/kernel/irqinit_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit_64.c
+++ linux-2.6/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c
+++ linux-2.6/arch/x86/mm/init_32.c
@@ -66,6 +66,7 @@ static unsigned long __meminitdata table
 static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
+int after_bootmem;
 
 static __init void *alloc_low_page(unsigned long *phys)
 {
@@ -987,6 +988,8 @@ void __init mem_init(void)
 
 	set_highmem_pages_init();
 
+	after_bootmem = 1;
+
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
Index: linux-2.6/drivers/char/random.c
===================================================================
--- linux-2.6.orig/drivers/char/random.c
+++ linux-2.6/drivers/char/random.c
@@ -558,6 +558,8 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
+#ifndef CONFIG_SPARSE_IRQ
+
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 
 static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
@@ -576,6 +578,33 @@ static void set_timer_rand_state(unsigne
 	irq_timer_state[irq] = state;
 }
 
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
 static struct timer_rand_state input_timer_state;
 
 /*
@@ -933,8 +962,10 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
+#ifndef CONFIG_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
+#endif
 
 	state = get_timer_rand_state(irq);
 
Index: linux-2.6/drivers/pci/htirq.c
===================================================================
--- linux-2.6.orig/drivers/pci/htirq.c
+++ linux-2.6/drivers/pci/htirq.c
@@ -58,30 +58,62 @@ void fetch_ht_irq_msg(unsigned int irq,
 	*msg = cfg->msg;
 }
 
-void mask_ht_irq(unsigned int irq)
+void mask_ht_irq_desc(struct irq_desc **descp)
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
+	unsigned int irq = (*descp)->irq;
 
-	cfg = get_irq_data(irq);
+	cfg = get_irq_desc_data(*descp);
 
 	msg = cfg->msg;
 	msg.address_lo |= 1;
 	write_ht_irq_msg(irq, &msg);
 }
 
-void unmask_ht_irq(unsigned int irq)
+void unmask_ht_irq_desc(struct irq_desc **descp)
 {
 	struct ht_irq_cfg *cfg;
 	struct ht_irq_msg msg;
+	unsigned int irq = (*descp)->irq;
 
-	cfg = get_irq_data(irq);
+	cfg = get_irq_desc_data(*descp);
 
 	msg = cfg->msg;
 	msg.address_lo &= ~1;
 	write_ht_irq_msg(irq, &msg);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void mask_ht_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_ht_irq_desc(&desc);
+}
+void unmask_ht_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_ht_irq_desc(&desc);
+}
+
+#else
+
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	/* encode as: 8 bits (bus) + 8 bits (devfn) + 12 bits (per-device index) */
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+#endif
+
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -125,7 +157,11 @@ int __ht_create_irq(struct pci_dev *dev,
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
+#ifdef CONFIG_SPARSE_IRQ
+	irq = create_irq_nr(idx + build_irq_for_pci_dev(dev));
+#else
 	irq = create_irq();
+#endif
 
 	if (irq <= 0) {
 		kfree(cfg);
Index: linux-2.6/drivers/pci/intr_remapping.c
===================================================================
--- linux-2.6.orig/drivers/pci/intr_remapping.c
+++ linux-2.6/drivers/pci/intr_remapping.c
@@ -19,17 +19,71 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+	struct irq_2_iommu *iommu;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+	return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (WARN_ON_ONCE(!desc))
+		return NULL;
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	/*
+	 * Allocate the irq_desc for this irq if it is not allocated already.
+	 */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
 }
 
+#else /* !CONFIG_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +140,11 @@ int alloc_irte(struct intel_iommu *iommu
 	if (!count)
 		return -1;
 
+#ifndef CONFIG_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
+#endif
 
 	/*
 	 * start the IRTE search from index 0.
@@ -130,6 +186,12 @@ int alloc_irte(struct intel_iommu *iommu
 		table->base[i].present = 1;
 
 	irq_iommu = irq_2_iommu_alloc(irq);
+	if (!irq_iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		printk(KERN_ERR "can't allocate irq_2_iommu\n");
+		return -1;
+	}
+
 	irq_iommu->iommu = iommu;
 	irq_iommu->irte_index =  index;
 	irq_iommu->sub_handle = 0;
@@ -177,6 +239,12 @@ int set_irte_irq(int irq, struct intel_i
 
 	irq_iommu = irq_2_iommu_alloc(irq);
 
+	if (!irq_iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		printk(KERN_ERR "can't allocate irq_2_iommu\n");
+		return -1;
+	}
+
 	irq_iommu->iommu = iommu;
 	irq_iommu->irte_index = index;
 	irq_iommu->sub_handle = subhandle;
Index: linux-2.6/drivers/xen/events.c
===================================================================
--- linux-2.6.orig/drivers/xen/events.c
+++ linux-2.6/drivers/xen/events.c
@@ -141,8 +141,9 @@ static void init_evtchn_cpu_bindings(voi
 	int i;
 
 	/* By default all event channels notify CPU#0. */
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		desc->affinity = cpumask_of_cpu(0);
+	} end_for_each_irq_desc();
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +232,7 @@ static int find_unbound_irq(void)
 	int irq;
 
 	/* Only allocate from dynirq range */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		if (irq_bindcount[irq] == 0)
 			break;
 
@@ -792,7 +793,7 @@ void xen_irq_resume(void)
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +825,7 @@ void __init xen_init_IRQ(void)
 		mask_evtchn(i);
 
 	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-	for_each_irq_nr(i)
+	for (i = 0; i < nr_irqs; i++)
 		irq_bindcount[i] = 0;
 
 	irq_ctx_init(smp_processor_id());
Index: linux-2.6/fs/proc/stat.c
===================================================================
--- linux-2.6.orig/fs/proc/stat.c
+++ linux-2.6/fs/proc/stat.c
@@ -27,6 +27,9 @@ static int show_stat(struct seq_file *p,
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -44,10 +47,9 @@ static int show_stat(struct seq_file *p,
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_desc(j, desc) {
 			sum += kstat_irqs_cpu(j, i);
-
+		} end_for_each_irq_desc();
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -90,14 +92,17 @@ static int show_stat(struct seq_file *p,
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
+#ifdef CONFIG_SPARSE_IRQ
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
+#else
 		seq_printf(p, " %u", per_irq_sum);
-	}
+#endif
+	} end_for_each_irq_desc();
 
 	seq_printf(p,
 		"\nctxt %llu\n"
Index: linux-2.6/fs/proc/interrupts.c
===================================================================
--- linux-2.6.orig/fs/proc/interrupts.c
+++ linux-2.6/fs/proc/interrupts.c
@@ -8,6 +8,23 @@
 /*
  * /proc/interrupts
  */
+#ifdef CONFIG_SPARSE_IRQ
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+	rcu_read_lock();
+	return seq_list_start(&sparse_irqs_head, *pos);
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &sparse_irqs_head, pos);
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+	rcu_read_unlock();
+}
+#else
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
 	return (*pos <= nr_irqs) ? pos : NULL;
@@ -25,6 +42,7 @@ static void int_seq_stop(struct seq_file
 {
 	/* Nothing to do */
 }
+#endif
 
 static const struct seq_operations int_seq_ops = {
 	.start = int_seq_start,
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h
+++ linux-2.6/include/linux/interrupt.h
@@ -18,6 +18,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -106,14 +106,23 @@ struct irq_chip {
 	void		(*enable)(unsigned int irq);
 	void		(*disable)(unsigned int irq);
 
+#ifdef CONFIG_SPARSE_IRQ
+	void		(*ack)(struct irq_desc **descp);
+	void		(*mask)(struct irq_desc **descp);
+	void		(*mask_ack)(struct irq_desc **descp);
+	void		(*unmask)(struct irq_desc **descp);
+	void		(*eoi)(struct irq_desc **descp);
+	void		(*set_affinity)(struct irq_desc *desc, cpumask_t dest);
+#else
 	void		(*ack)(unsigned int irq);
 	void		(*mask)(unsigned int irq);
 	void		(*mask_ack)(unsigned int irq);
 	void		(*unmask)(unsigned int irq);
 	void		(*eoi)(unsigned int irq);
+	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
+#endif
 
 	void		(*end)(unsigned int irq);
-	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
 	int		(*retrigger)(unsigned int irq);
 	int		(*set_type)(unsigned int irq, unsigned int flow_type);
 	int		(*set_wake)(unsigned int irq, unsigned int on);
@@ -129,6 +138,8 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +166,15 @@ struct irq_chip {
  */
 struct irq_desc {
 	unsigned int		irq;
+#ifdef CONFIG_SPARSE_IRQ
+	struct list_head	list;
+	struct list_head	hash_entry;
+	struct timer_rand_state *timer_rand_state;
+	unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+	struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -182,13 +202,69 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern void arch_early_irq_init(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+					struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
 
+#ifndef CONFIG_SPARSE_IRQ
+
+/* could be removed if we get rid of all irq_desc references */
 extern struct irq_desc irq_desc[NR_IRQS];
 
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
-{
-	return (irq < nr_irqs) ? irq_desc + irq : NULL;
-}
+#ifdef CONFIG_GENERIC_HARDIRQS
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
+
+#define end_for_each_irq_desc()
+#endif
+
+#define desc_chip_ack(irq, descp) desc->chip->ack(irq)
+#define desc_chip_mask(irq, descp) desc->chip->mask(irq)
+#define desc_chip_mask_ack(irq, descp) desc->chip->mask_ack(irq)
+#define desc_chip_unmask(irq, descp) desc->chip->unmask(irq)
+#define desc_chip_eoi(irq, descp) desc->chip->eoi(irq)
+#define desc_chip_set_affinity(irq, descx, mask) desc->chip->set_affinity(irq, mask)
+
+#else
+
+void early_irq_init(void);
+extern struct list_head sparse_irqs_head;
+#define for_each_irq_desc(irqX, desc)					\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.next), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.next), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.next), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define for_each_irq_desc_reverse(irqX, desc)				\
+	rcu_read_lock();						\
+	for (desc = list_entry(rcu_dereference(sparse_irqs_head.prev), typeof(*desc), list), irqX = desc->irq; \
+		prefetch(desc->list.prev), &desc->list != &sparse_irqs_head; \
+		desc = list_entry(rcu_dereference(desc->list.prev), typeof(*desc), list), irqX = desc ? desc->irq : -1U)
+
+#define end_for_each_irq_desc() rcu_read_unlock()
+
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()]++)
+
+#define desc_chip_ack(irq, descp) desc->chip->ack(descp)
+#define desc_chip_mask(irq, descp) desc->chip->mask(descp)
+#define desc_chip_mask_ack(irq, descp) desc->chip->mask_ack(descp)
+#define desc_chip_unmask(irq, descp) desc->chip->unmask(descp)
+#define desc_chip_eoi(irq, descp) desc->chip->eoi(descp)
+#define desc_chip_set_affinity(irq, descx, mask) desc->chip->set_affinity(descx, mask)
+
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
@@ -211,8 +287,12 @@ extern int setup_irq(unsigned int irq, s
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 
+void move_native_irq_desc(struct irq_desc **descp);
+void move_masked_irq_desc(struct irq_desc **descp);
+#ifndef CONFIG_SPARSE_IRQ
 void move_native_irq(int irq);
 void move_masked_irq(int irq);
+#endif
 
 #else /* CONFIG_GENERIC_PENDING_IRQ */
 
@@ -381,6 +461,11 @@ extern int set_irq_msi(unsigned int irq,
 #define get_irq_data(irq)	(irq_to_desc(irq)->handler_data)
 #define get_irq_msi(irq)	(irq_to_desc(irq)->msi_desc)
 
+#define get_irq_desc_chip(desc)		((desc)->chip)
+#define get_irq_desc_chip_data(desc)	((desc)->chip_data)
+#define get_irq_desc_data(desc)		((desc)->handler_data)
+#define get_irq_desc_msi(desc)		((desc)->msi_desc)
+
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
 #endif /* !CONFIG_S390 */
Index: linux-2.6/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.orig/include/linux/kernel_stat.h
+++ linux-2.6/include/linux/kernel_stat.h
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, ksta
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_
 {
 	kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
Index: linux-2.6/kernel/irq/autoprobe.c
===================================================================
--- linux-2.6.orig/kernel/irq/autoprobe.c
+++ linux-2.6/kernel/irq/autoprobe.c
@@ -57,7 +57,7 @@ unsigned long probe_irq_on(void)
 			desc->chip->startup(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/* Wait for longstanding interrupts to trigger. */
 	msleep(20);
@@ -75,7 +75,7 @@ unsigned long probe_irq_on(void)
 				desc->status |= IRQ_PENDING;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	/*
 	 * Wait for spurious interrupts to trigger
@@ -99,7 +99,7 @@ unsigned long probe_irq_on(void)
 					mask |= 1 << i;
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 
 	return mask;
 }
@@ -135,7 +135,7 @@ unsigned int probe_irq_mask(unsigned lon
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	return mask & val;
@@ -179,7 +179,7 @@ int probe_irq_off(unsigned long val)
 			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
-	}
+	} end_for_each_irq_desc();
 	mutex_unlock(&probing_active);
 
 	if (nr_of_irqs > 1)
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -24,9 +24,11 @@
  */
 void dynamic_irq_init(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	/* first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
@@ -223,7 +225,7 @@ static void default_enable(unsigned int
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	desc->chip->unmask(irq);
+	desc_chip_unmask(irq, &desc);
 	desc->status &= ~IRQ_MASKED;
 }
 
@@ -252,7 +254,7 @@ static void default_shutdown(unsigned in
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	desc->chip->mask(irq);
+	desc_chip_mask(irq, &desc);
 	desc->status |= IRQ_MASKED;
 }
 
@@ -282,13 +284,15 @@ void irq_chip_set_defaults(struct irq_ch
 		chip->end = dummy_irq_chip.end;
 }
 
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc **descp, int irq)
 {
+	struct irq_desc *desc = *descp;
+
 	if (desc->chip->mask_ack)
-		desc->chip->mask_ack(irq);
+		desc_chip_mask_ack(irq, descp);
 	else {
-		desc->chip->mask(irq);
-		desc->chip->ack(irq);
+		desc_chip_mask(irq, descp);
+		desc_chip_ack(irq, descp);
 	}
 }
 
@@ -351,7 +355,7 @@ handle_level_irq(unsigned int irq, struc
 	irqreturn_t action_ret;
 
 	spin_lock(&desc->lock);
-	mask_ack_irq(desc, irq);
+	mask_ack_irq(&desc, irq);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -376,7 +380,7 @@ handle_level_irq(unsigned int irq, struc
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
-		desc->chip->unmask(irq);
+		desc_chip_unmask(irq, &desc);
 out_unlock:
 	spin_unlock(&desc->lock);
 }
@@ -413,7 +417,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
 		desc->status |= IRQ_PENDING;
 		if (desc->chip->mask)
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 		goto out;
 	}
 
@@ -428,7 +432,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 out:
-	desc->chip->eoi(irq);
+	desc_chip_eoi(irq, &desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -464,13 +468,13 @@ handle_edge_irq(unsigned int irq, struct
 	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
-		mask_ack_irq(desc, irq);
+		mask_ack_irq(&desc, irq);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
-	desc->chip->ack(irq);
+	desc_chip_ack(irq, &desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -480,7 +484,7 @@ handle_edge_irq(unsigned int irq, struct
 		irqreturn_t action_ret;
 
 		if (unlikely(!action)) {
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 			goto out_unlock;
 		}
 
@@ -492,7 +496,7 @@ handle_edge_irq(unsigned int irq, struct
 		if (unlikely((desc->status &
 			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
 			      (IRQ_PENDING | IRQ_MASKED))) {
-			desc->chip->unmask(irq);
+			desc_chip_unmask(irq, &desc);
 			desc->status &= ~IRQ_MASKED;
 		}
 
@@ -525,14 +529,14 @@ handle_percpu_irq(unsigned int irq, stru
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	if (desc->chip->ack)
-		desc->chip->ack(irq);
+		desc_chip_ack(irq, &desc);
 
 	action_ret = handle_IRQ_event(irq, desc->action);
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
 	if (desc->chip->eoi)
-		desc->chip->eoi(irq);
+		desc_chip_eoi(irq, &desc);
 }
 
 void
@@ -568,7 +572,7 @@ __set_irq_handler(unsigned int irq, irq_
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
 		if (desc->chip != &no_irq_chip)
-			mask_ack_irq(desc, irq);
+			mask_ack_irq(&desc, irq);
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -15,9 +15,16 @@
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,311 @@ void handle_bad_irq(unsigned int irq, st
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+void __init __attribute__((weak)) arch_early_irq_init(void)
+{
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+	.irq	    = -1,
+	.status	    = IRQ_DISABLED,
+	.chip	    = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth      = 1,
+	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity   = CPU_MASK_ALL
+#endif
+};
+
+static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+	unsigned long bytes;
+	char *ptr;
+	int node;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	node = cpu_to_node(cpu);
+	ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+
+	if (ptr)
+		desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	if (desc->kstat_irqs != old_desc->kstat_irqs) {
+		/* Compute how many bytes we need per irq and allocate them */
+		bytes = nr * sizeof(unsigned int);
+
+		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+	}
+}
+
+static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	if (old_desc->kstat_irqs == desc->kstat_irqs)
+		return;
+
+	kfree(old_desc->kstat_irqs);
+	old_desc->kstat_irqs = NULL;
+}
+#endif
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+	desc->irq = irq;
+#ifdef CONFIG_SMP
+	desc->cpu = cpu;
+#endif
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	if (!desc->kstat_irqs) {
+		printk(KERN_ERR "can not alloc kstat_irqs\n");
+		BUG_ON(1);
+	}
+	arch_init_chip_data(desc, cpu);
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	free_kstat_irqs(old_desc, desc);
+	arch_free_chip_data(old_desc, desc);
+}
+#endif
+/*
+ * Protect the sparse_irqs_free freelist:
+ */
+static DEFINE_SPINLOCK(sparse_irq_lock);
+LIST_HEAD(sparse_irqs_head);
+
+/*
+ * The sparse irqs are in a hash-table as well, for fast lookup:
+ */
+#define SPARSEIRQHASH_BITS          (13 - 1)
+#define SPARSEIRQHASH_SIZE          (1UL << SPARSEIRQHASH_BITS)
+#define __sparseirqhashfn(key)      hash_long((unsigned long)key, SPARSEIRQHASH_BITS)
+#define sparseirqhashentry(key)     (sparseirqhash_table + __sparseirqhashfn((key)))
+
+static struct list_head sparseirqhash_table[SPARSEIRQHASH_SIZE];
+
+static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
+	[0 ... NR_IRQS_LEGACY-1] = {
+		.irq	    = -1,
+		.status	    = IRQ_DISABLED,
+		.chip	    = &no_irq_chip,
+		.handle_irq = handle_bad_irq,
+		.depth	    = 1,
+		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+		.affinity   = CPU_MASK_ALL
+#endif
+	}
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+
+void __init early_irq_init(void)
+{
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	/* init list for sparseirq */
+	for (i = 0; i < SPARSEIRQHASH_SIZE; i++)
+		INIT_LIST_HEAD(sparseirqhash_table + i);
+
+	desc = irq_desc_legacy;
+	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+	for (i = 0; i < legacy_count; i++) {
+		struct list_head *hash_head;
+
+		hash_head = sparseirqhashentry(i);
+		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		list_add_tail(&desc[i].hash_entry, hash_head);
+		list_add_tail(&desc[i].list, &sparse_irqs_head);
+	}
+
+	arch_early_irq_init();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+
+	hash_head = sparseirqhashentry(irq);
+
+	/*
+	 * We can walk the hash lockfree, because the hash only
+	 * grows, and we are careful when adding entries to the end:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			return desc;
+	}
+
+	return NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		return desc;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq)
+			goto out_unlock;
+	}
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d aka %#x on cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	if (!desc) {
+		printk(KERN_ERR "can not alloc irq_desc\n");
+		BUG_ON(1);
+	}
+	init_one_irq_desc(irq, desc, cpu);
+
+	/*
+	 * We use RCU's safe list-add method to make
+	 * parallel walking of the hash-list safe:
+	 */
+	list_add_tail_rcu(&desc->hash_entry, hash_head);
+	/*
+	 * Add it to the global list:
+	 */
+	list_add_tail_rcu(&desc->list, &sparse_irqs_head);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc_alloc_cpu(irq, boot_cpu_id);
+}
+
+#ifdef CONFIG_MOVE_IRQ_DESC
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	struct list_head *hash_head;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	hash_head = sparseirqhashentry(irq);
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+	/*
+	 * We have to do the hash-walk again, to avoid races
+	 * with another CPU:
+	 */
+	list_for_each_entry(desc, hash_head, hash_entry) {
+		if (desc->irq == irq && old_desc != desc)
+			goto out_unlock;
+	}
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  move irq_desc for %d aka %#x to cpu %d node %d\n",
+		 irq, irq, cpu, node);
+	if (!desc) {
+		printk(KERN_ERR "can not get new irq_desc for moving\n");
+		/* still use old one */
+		desc = old_desc;
+		goto out_unlock;
+	}
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	list_replace_rcu(&old_desc->hash_entry, &desc->hash_entry);
+	list_replace_rcu(&old_desc->list, &desc->list);
+
+	/* free the old one */
+	free_one_irq_desc(old_desc, desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* those all static, do move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -62,17 +374,49 @@ struct irq_desc irq_desc[NR_IRQS] __cach
 	}
 };
 
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_desc[irq];
+
+	return NULL;
+}
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu)
+{
+	return old_desc;
+}
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
  */
+static void ack_bad_desc(struct irq_desc **descp)
+{
+	unsigned int irq = (*descp)->irq;
+
+	print_irq_desc(irq, *descp);
+	ack_bad_irq(irq);
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+#define ack_bad ack_bad_desc
+#else
 static void ack_bad(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	print_irq_desc(irq, desc);
-	ack_bad_irq(irq);
+	ack_bad_desc(&desc);
 }
+#endif
 
 /*
  * NOP functions
@@ -81,6 +425,14 @@ static void noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void noop_desc(struct irq_desc **descp)
+{
+}
+#else
+#define noop_desc noop
+#endif
+
 static unsigned int noop_ret(unsigned int irq)
 {
 	return 0;
@@ -109,9 +461,9 @@ struct irq_chip dummy_irq_chip = {
 	.shutdown	= noop,
 	.enable		= noop,
 	.disable	= noop,
-	.ack		= noop,
-	.mask		= noop,
-	.unmask		= noop,
+	.ack		= noop_desc,
+	.mask		= noop_desc,
+	.unmask		= noop_desc,
 	.end		= noop,
 };
 
@@ -180,7 +532,7 @@ unsigned int __do_IRQ(unsigned int irq)
 		 * No locking required for CPU-local interrupts:
 		 */
 		if (desc->chip->ack)
-			desc->chip->ack(irq);
+			desc_chip_ack(irq, &desc);
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -192,7 +544,7 @@ unsigned int __do_IRQ(unsigned int irq)
 
 	spin_lock(&desc->lock);
 	if (desc->chip->ack)
-		desc->chip->ack(irq);
+		desc_chip_ack(irq, &desc);
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
@@ -261,17 +613,25 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	} end_for_each_irq_desc();
+#endif
 }
 #endif
+
+#ifdef CONFIG_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc->kstat_irqs[cpu];
+}
+#endif
+EXPORT_SYMBOL(kstat_irqs_cpu);
+
Index: linux-2.6/arch/x86/kernel/irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq.c
+++ linux-2.6/arch/x86/kernel/irq.c
@@ -99,25 +99,37 @@ static int show_other_interrupts(struct
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i, j;
 	struct irqaction *action;
 	struct irq_desc *desc;
+	int head = 0;
 
+#ifdef CONFIG_SPARSE_IRQ
+	desc = list_entry(v, struct irq_desc, list);
+	i = desc->irq;
+	if (&desc->list == sparse_irqs_head.next)
+		head = 1;
+#else
+	i = *(loff_t *) v;
 	if (i > nr_irqs)
 		return 0;
 
 	if (i == nr_irqs)
 		return show_other_interrupts(p);
+	if (i == 0)
+		head = 1;
+
+	desc = irq_to_desc(i);
+#endif
 
 	/* print header */
-	if (i == 0) {
+	if (head) {
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	desc = irq_to_desc(i);
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
@@ -148,6 +160,12 @@ int show_interrupts(struct seq_file *p,
 	seq_putc(p, '\n');
 out:
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+#ifdef CONFIG_SPARSE_IRQ
+	if (&desc->list == sparse_irqs_head.prev)
+		show_other_interrupts(p);
+#endif
+
 	return 0;
 }
 
Index: linux-2.6/include/linux/irqnr.h
===================================================================
--- linux-2.6.orig/include/linux/irqnr.h
+++ linux-2.6/include/linux/irqnr.h
@@ -7,18 +7,11 @@
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
+# define end_for_each_irq_desc()
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
-	     irq >= 0; irq--, desc--)
+static inline early_sparse_irq_init(void)
+{
+}
 #endif
 
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #endif
Index: linux-2.6/arch/x86/kernel/irq_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_32.c
+++ linux-2.6/arch/x86/kernel/irq_32.c
@@ -251,10 +251,10 @@ void fixup_irqs(cpumask_t map)
 			mask = map;
 		}
 		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, mask);
+			desc_chip_set_affinity(irq, desc, mask);
 		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 #if 0
 	barrier();
Index: linux-2.6/arch/x86/kernel/irq_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irq_64.c
+++ linux-2.6/arch/x86/kernel/irq_64.c
@@ -113,15 +113,15 @@ void fixup_irqs(cpumask_t map)
 		}
 
 		if (desc->chip->mask)
-			desc->chip->mask(irq);
+			desc_chip_mask(irq, &desc);
 
 		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, mask);
+			desc_chip_set_affinity(irq, desc, mask);
 		else if (!(warned++))
 			set_affinity = 0;
 
 		if (desc->chip->unmask)
-			desc->chip->unmask(irq);
+			desc_chip_unmask(irq, &desc);
 
 		spin_unlock(&desc->lock);
 
@@ -129,7 +129,7 @@ void fixup_irqs(cpumask_t map)
 			printk("Broke affinity for irq %i\n", irq);
 		else if (!set_affinity)
 			printk("Cannot set affinity for irq %i\n", irq);
-	}
+	} end_for_each_irq_desc();
 
 	/* That doesn't seem sufficient.  Give it 1ms. */
 	local_irq_enable();
Index: linux-2.6/kernel/irq/proc.c
===================================================================
--- linux-2.6.orig/kernel/irq/proc.c
+++ linux-2.6/kernel/irq/proc.c
@@ -243,7 +243,8 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc)
+	for_each_irq_desc(irq, desc) {
 		register_irq_proc(irq, desc);
+	} end_for_each_irq_desc();
 }
 
Index: linux-2.6/kernel/irq/spurious.c
===================================================================
--- linux-2.6.orig/kernel/irq/spurious.c
+++ linux-2.6/kernel/irq/spurious.c
@@ -99,7 +99,7 @@ static int misrouted_irq(int irq)
 
 		if (try_one_irq(i, desc))
 			ok = 1;
-	}
+	} end_for_each_irq_desc();
 	/* So the caller can adjust the irq error counts */
 	return ok;
 }
@@ -122,7 +122,7 @@ static void poll_spurious_irqs(unsigned
 			continue;
 
 		try_one_irq(i, desc);
-	}
+	} end_for_each_irq_desc();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -542,6 +542,15 @@ void __init __weak thread_info_cache_ini
 {
 }
 
+void __init __weak arch_early_irq_init(void)
+{
+}
+
+void __init __weak early_irq_init(void)
+{
+	arch_early_irq_init();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -612,6 +621,8 @@ asmlinkage void __init start_kernel(void
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some links before init_ISA_irqs() */
+	early_irq_init();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
Index: linux-2.6/arch/x86/include/asm/irq_vectors.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/irq_vectors.h
+++ linux-2.6/arch/x86/include/asm/irq_vectors.h
@@ -101,6 +101,8 @@
 #define LAST_VM86_IRQ		15
 #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
 
+#define NR_IRQS_LEGACY		16
+
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
Index: linux-2.6/arch/x86/kernel/i8259.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/i8259.c
+++ linux-2.6/arch/x86/kernel/i8259.c
@@ -36,12 +36,31 @@ static int i8259A_auto_eoi;
 DEFINE_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
 
+#ifdef CONFIG_SPARSE_IRQ
+static void mask_and_ack_8259A_desc(struct irq_desc **descp)
+{
+	mask_and_ack_8259A((*descp)->irq);
+}
+static void disable_8259A_irq_desc(struct irq_desc **descp)
+{
+	disable_8259A_irq((*descp)->irq);
+}
+static void enable_8259A_irq_desc(struct irq_desc **descp)
+{
+	enable_8259A_irq((*descp)->irq);
+}
+#else
+#define mask_and_ack_8259A_desc mask_and_ack_8259A
+#define disable_8259A_irq_desc disable_8259A_irq
+#define enable_8259A_irq_desc enable_8259A_irq
+#endif
+
 struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
+	.mask		= disable_8259A_irq_desc,
 	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
+	.unmask		= enable_8259A_irq_desc,
+	.mask_ack	= mask_and_ack_8259A_desc,
 };
 
 /*
@@ -348,9 +367,9 @@ void init_8259A(int auto_eoi)
 		 * In AEOI mode we just have to mask the interrupt
 		 * when acking.
 		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
+		i8259A_chip.mask_ack = disable_8259A_irq_desc;
 	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
+		i8259A_chip.mask_ack = mask_and_ack_8259A_desc;
 
 	udelay(100);		/* wait for 8259A to initialize */
 
Index: linux-2.6/arch/x86/kernel/uv_irq.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/uv_irq.c
+++ linux-2.6/arch/x86/kernel/uv_irq.c
@@ -18,26 +18,45 @@ static void uv_noop(unsigned int irq)
 {
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_noop_desc(struct irq_desc **descp)
+{
+}
+
+#else
+#define uv_noop_desc uv_noop
+#endif
+
 static unsigned int uv_noop_ret(unsigned int irq)
 {
 	return 0;
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+static void uv_ack_apic_desc(struct irq_desc **descp)
+{
+	ack_APIC_irq();
+}
+
+#else
 static void uv_ack_apic(unsigned int irq)
 {
 	ack_APIC_irq();
 }
 
+#define uv_ack_apic_desc uv_ack_apic
+#endif
+
 struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
 	.enable		= uv_noop,
 	.disable	= uv_noop,
-	.ack		= uv_noop,
-	.mask		= uv_noop,
-	.unmask		= uv_noop,
-	.eoi		= uv_ack_apic,
+	.ack		= uv_noop_desc,
+	.mask		= uv_noop_desc,
+	.unmask		= uv_noop_desc,
+	.eoi		= uv_ack_apic_desc,
 	.end		= uv_noop,
 };
 
Index: linux-2.6/drivers/pci/msi.c
===================================================================
--- linux-2.6.orig/drivers/pci/msi.c
+++ linux-2.6/drivers/pci/msi.c
@@ -103,11 +103,11 @@ static void msix_set_enable(struct pci_d
 	}
 }
 
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(struct irq_desc *desc)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = get_irq_desc_msi(desc);
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -135,11 +135,11 @@ static void msix_flush_writes(unsigned i
  * Returns 1 if it succeeded in masking the interrupt and 0 if the device
  * doesn't support MSI masking.
  */
-static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
+static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = get_irq_desc_msi(desc);
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -172,9 +172,9 @@ static int msi_set_mask_bits(unsigned in
 	return 1;
 }
 
-void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
-	struct msi_desc *entry = get_irq_msi(irq);
+	struct msi_desc *entry = get_irq_desc_msi(desc);
 	switch(entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
 	{
@@ -211,9 +211,16 @@ void read_msi_msg(unsigned int irq, stru
 	}
 }
 
-void write_msi_msg(unsigned int irq, struct msi_msg *msg)
+void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	read_msi_msg_desc(desc, msg);
+}
+
+void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
-	struct msi_desc *entry = get_irq_msi(irq);
+	struct msi_desc *entry = get_irq_desc_msi(desc);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
 	{
@@ -252,17 +259,43 @@ void write_msi_msg(unsigned int irq, str
 	entry->msg = *msg;
 }
 
-void mask_msi_irq(unsigned int irq)
+void write_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
-	msi_set_mask_bits(irq, 1, 1);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	write_msi_msg_desc(desc, msg);
 }
 
+void mask_msi_irq_desc(struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+
+	msi_set_mask_bits(desc, 1, 1);
+	msix_flush_writes(desc);
+}
+
+void unmask_msi_irq_desc(struct irq_desc **descp)
+{
+	struct irq_desc *desc = *descp;
+
+	msi_set_mask_bits(desc, 1, 0);
+	msix_flush_writes(desc);
+}
+
+#ifndef CONFIG_SPARSE_IRQ
+void mask_msi_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_msi_irq_desc(&desc);
+}
 void unmask_msi_irq(unsigned int irq)
 {
-	msi_set_mask_bits(irq, 1, 0);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_msi_irq_desc(&desc);
 }
+#endif
 
 static int msi_free_irqs(struct pci_dev* dev);
 
@@ -303,9 +336,11 @@ static void __pci_restore_msi_state(stru
 	pci_intx_for_msi(dev, 0);
 	msi_set_enable(dev, 0);
 	write_msi_msg(dev->irq, &entry->msg);
-	if (entry->msi_attrib.maskbit)
-		msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask,
+	if (entry->msi_attrib.maskbit) {
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
 				  entry->msi_attrib.masked);
+	}
 
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 	control &= ~PCI_MSI_FLAGS_QSIZE;
@@ -327,8 +362,9 @@ static void __pci_restore_msix_state(str
 	msix_set_enable(dev, 0);
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
+		struct irq_desc *desc = irq_to_desc(entry->irq);
 		write_msi_msg(entry->irq, &entry->msg);
-		msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked);
+		msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
 	}
 
 	BUG_ON(list_empty(&dev->msi_list));
@@ -596,7 +632,8 @@ void pci_msi_shutdown(struct pci_dev* de
 	/* Return the the pci reset with msi irqs unmasked */
 	if (entry->msi_attrib.maskbit) {
 		u32 mask = entry->msi_attrib.maskbits_mask;
-		msi_set_mask_bits(dev->irq, mask, ~mask);
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, mask, ~mask);
 	}
 	if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
 		return;
Index: linux-2.6/include/linux/msi.h
===================================================================
--- linux-2.6.orig/include/linux/msi.h
+++ linux-2.6/include/linux/msi.h
@@ -10,8 +10,18 @@ struct msi_msg {
 };
 
 /* Helper functions */
+struct irq_desc;
+#ifdef CONFIG_SPARSE_IRQ
+extern void mask_msi_irq_desc(struct irq_desc **descp);
+extern void unmask_msi_irq_desc(struct irq_desc **descp);
+#define mask_msi_irq mask_msi_irq_desc
+#define unmask_msi_irq unmask_msi_irq_desc
+#else
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
+#endif
+extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
+extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/include/asm/hpet.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/hpet.h
+++ linux-2.6/arch/x86/include/asm/hpet.h
@@ -72,8 +72,15 @@ extern void hpet_disable(void);
 extern unsigned long hpet_readl(unsigned long a);
 extern void force_hpet_resume(void);
 
+#ifdef CONFIG_SPARSE_IRQ
+extern void hpet_msi_unmask_desc(struct irq_desc **descp);
+extern void hpet_msi_mask_desc(struct irq_desc **descp);
+#define hpet_msi_unmask hpet_msi_unmask_desc
+#define hpet_msi_mask hpet_msi_mask_desc
+#else
 extern void hpet_msi_unmask(unsigned int irq);
 extern void hpet_msi_mask(unsigned int irq);
+#endif
 extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
 extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
 
Index: linux-2.6/arch/x86/kernel/hpet.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/hpet.c
+++ linux-2.6/arch/x86/kernel/hpet.c
@@ -347,9 +347,9 @@ static int hpet_legacy_next_event(unsign
 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
 
-void hpet_msi_unmask(unsigned int irq)
+void hpet_msi_unmask_desc(struct irq_desc **descp)
 {
-	struct hpet_dev *hdev = get_irq_data(irq);
+	struct hpet_dev *hdev = get_irq_desc_data(*descp);
 	unsigned long cfg;
 
 	/* unmask it */
@@ -358,10 +358,10 @@ void hpet_msi_unmask(unsigned int irq)
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
-void hpet_msi_mask(unsigned int irq)
+void hpet_msi_mask_desc(struct irq_desc **descp)
 {
 	unsigned long cfg;
-	struct hpet_dev *hdev = get_irq_data(irq);
+	struct hpet_dev *hdev = get_irq_desc_data(*descp);
 
 	/* mask it */
 	cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -369,6 +369,21 @@ void hpet_msi_mask(unsigned int irq)
 	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void hpet_msi_unmask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_unmask_desc(&desc);
+}
+void hpet_msi_mask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	hpet_msi_mask_desc(&desc);
+}
+#endif
+
 void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
 {
 	struct hpet_dev *hdev = get_irq_data(irq);
Index: linux-2.6/include/linux/htirq.h
===================================================================
--- linux-2.6.orig/include/linux/htirq.h
+++ linux-2.6/include/linux/htirq.h
@@ -9,8 +9,16 @@ struct ht_irq_msg {
 /* Helper functions.. */
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
+#ifdef CONFIG_SPARSE_IRQ
+struct irq_desc;
+void mask_ht_irq_desc(struct irq_desc **descp);
+void unmask_ht_irq_desc(struct irq_desc **descp);
+#define mask_ht_irq mask_ht_irq_desc
+#define unmask_ht_irq unmask_ht_irq_desc
+#else
 void mask_ht_irq(unsigned int irq);
 void unmask_ht_irq(unsigned int irq);
+#endif
 
 /* The arch hook for getting things started */
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
Index: linux-2.6/kernel/irq/migration.c
===================================================================
--- linux-2.6.orig/kernel/irq/migration.c
+++ linux-2.6/kernel/irq/migration.c
@@ -1,9 +1,9 @@
 
 #include <linux/irq.h>
 
-void move_masked_irq(int irq)
+void move_masked_irq_desc(struct irq_desc **descp)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = *descp;
 	cpumask_t tmp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -42,14 +42,17 @@ void move_masked_irq(int irq)
 	 * masking the irqs.
 	 */
 	if (likely(!cpus_empty(tmp))) {
-		desc->chip->set_affinity(irq,tmp);
+		desc_chip_set_affinity(desc->irq, desc, tmp);
 	}
 	cpus_clear(desc->pending_mask);
 }
 
-void move_native_irq(int irq)
+void move_native_irq_desc(struct irq_desc **descp)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = *descp;
+#ifndef CONFIG_SPARSE_IRQ
+	unsigned int irq = desc->irq;
+#endif
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -57,8 +60,23 @@ void move_native_irq(int irq)
 	if (unlikely(desc->status & IRQ_DISABLED))
 		return;
 
-	desc->chip->mask(irq);
-	move_masked_irq(irq);
-	desc->chip->unmask(irq);
+	desc_chip_mask(irq, descp);
+	move_masked_irq_desc(descp);
+	desc_chip_unmask(irq, descp);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void move_masked_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	move_masked_irq_desc(&desc);
+}
+
+void move_native_irq(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	move_native_irq_desc(&desc);
+}
+#endif
Index: linux-2.6/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.orig/drivers/pci/intel-iommu.c
+++ linux-2.6/drivers/pci/intel-iommu.c
@@ -751,9 +751,9 @@ const char *dmar_get_fault_reason(u8 fau
 		return fault_reason_strings[fault_reason];
 }
 
-void dmar_msi_unmask(unsigned int irq)
+void dmar_msi_unmask_desc(struct irq_desc **descp)
 {
-	struct intel_iommu *iommu = get_irq_data(irq);
+	struct intel_iommu *iommu = get_irq_desc_data(*descp);
 	unsigned long flag;
 
 	/* unmask it */
@@ -764,10 +764,10 @@ void dmar_msi_unmask(unsigned int irq)
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
-void dmar_msi_mask(unsigned int irq)
+void dmar_msi_mask_desc(struct irq_desc **descp)
 {
 	unsigned long flag;
-	struct intel_iommu *iommu = get_irq_data(irq);
+	struct intel_iommu *iommu = get_irq_desc_data(*descp);
 
 	/* mask it */
 	spin_lock_irqsave(&iommu->register_lock, flag);
@@ -777,6 +777,21 @@ void dmar_msi_mask(unsigned int irq)
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
+#ifndef CONFIG_SPARSE_IRQ
+void dmar_msi_unmask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_unmask_desc(&desc);
+}
+void dmar_msi_mask(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	dmar_msi_mask_desc(&desc);
+}
+#endif
+
 void dmar_msi_write(int irq, struct msi_msg *msg)
 {
 	struct intel_iommu *iommu = get_irq_data(irq);
Index: linux-2.6/include/linux/dmar.h
===================================================================
--- linux-2.6.orig/include/linux/dmar.h
+++ linux-2.6/include/linux/dmar.h
@@ -122,8 +122,15 @@ extern const char *dmar_get_fault_reason
 /* Can't use the common MSI interrupt functions
  * since DMAR is not a pci device
  */
+#ifdef CONFIG_SPARSE_IRQ
+extern void dmar_msi_unmask_desc(struct irq_desc **descp);
+extern void dmar_msi_mask_desc(struct irq_desc **descp);
+#define dmar_msi_unmask dmar_msi_unmask_desc
+#define dmar_msi_mask dmar_msi_mask_desc
+#else
 extern void dmar_msi_unmask(unsigned int irq);
 extern void dmar_msi_mask(unsigned int irq);
+#endif
 extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);
Index: linux-2.6/kernel/irq/manage.c
===================================================================
--- linux-2.6.orig/kernel/irq/manage.c
+++ linux-2.6/kernel/irq/manage.c
@@ -92,14 +92,14 @@ int irq_set_affinity(unsigned int irq, c
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
 		desc->affinity = cpumask;
-		desc->chip->set_affinity(irq, cpumask);
+		desc_chip_set_affinity(irq, desc, cpumask);
 	} else {
 		desc->status |= IRQ_MOVE_PENDING;
 		desc->pending_mask = cpumask;
 	}
 #else
 	desc->affinity = cpumask;
-	desc->chip->set_affinity(irq, cpumask);
+	desc_chip_set_affinity(irq, desc, cpumask);
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
 	spin_unlock_irqrestore(&desc->lock, flags);
@@ -131,7 +131,7 @@ int do_irq_select_affinity(unsigned int
 	}
 
 	desc->affinity = mask;
-	desc->chip->set_affinity(irq, mask);
+	desc_chip_set_affinity(irq, desc, mask);
 
 	return 0;
 }


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-13 21:18                                         ` Andrew Morton
                                                             ` (2 preceding siblings ...)
  2008-11-13 22:19                                           ` [PATCH] sparse_irq aka dyn_irq v13 Paul Mackerras
@ 2008-11-16 20:58                                           ` Benjamin Herrenschmidt
  2008-11-16 23:44                                             ` Yinghai Lu
  3 siblings, 1 reply; 66+ messages in thread
From: Benjamin Herrenschmidt @ 2008-11-16 20:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Yinghai Lu, mingo, tglx, hpa, linux-kernel, travis


> Other architectures want (or have) sparse interrupts.  Are those guys
> paying attention here?
> 
> I don't have a clue what all this does.  I hope those who will work on
> this code are sufficiently familiar with it all to be able to maintain
> it when there are close to zero comments in some of our most tricky and
> problem-prone code.

Indeed, I'm a bit scared by the patch as it is ...

On powerpc, we have a nice and simple virtual irq layer that also
probably better fits the needs of architectures that can have multiple
PICs with different number spaces cascaded in various ways in a single
machine too.

We provide a basic construct that defines an HW IRQ domain (we call it
irq_host, it can have a 1:1 relationship to a HW irq_chip but not
always), and when HW IRQs are discovered and made available to linux,
they are "mapped" to virtual irq numbers by mapping them to a tuple
(irq_host, hw_number).

The HW numbering can be arbitrarily anything for that controller
unrelated to the "linux" virtual number, so all the problems of making
IRQ 0 invalid and reserving 1...15 for legacy 8259 etc... are taken care
of.

For fast mapping of HW->Linux numbers, we provide various schemes that a
given IRQ host implementation can choose (from no translation, some HW
PICs or hypervisor can be directly given the linux number and pass it
back to us) to linear mapping to a lockless radix tree for large sparse
irq domains.

At this stage, I don't feel like changing to this sparse_irq which I
haven't fully understood, is complicated, and doesn't seem to provide
the features I need anyway.

The only thing that I see possibly of interest is getting rid of the
irq_desc array itself, toward something more dynamically resizeable, in
which case our powerpc remapping scheme could probably also benefit by
also getting rid in a similar way of our equivalent array used for
mapping the virq's to host,hw_number tuples.

The irq_desc per-cpu also seem to be an interesting idea.

In any case, the patch is too big and unless I missed something, totally
under-documented in what it does, why it does it, how it does it, etc...
with a bit more of that, I might take more time to look closely and find
out that indeed it's something I can make use of, maybe with a few
tweaks.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 20:58                                           ` Benjamin Herrenschmidt
@ 2008-11-16 23:44                                             ` Yinghai Lu
  2008-11-16 23:48                                               ` H. Peter Anvin
  0 siblings, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-16 23:44 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Andrew Morton, mingo, tglx, hpa, linux-kernel, travis

Benjamin Herrenschmidt wrote:
> 
> In any case, the patch is too big and unless I missed something, totally
> under-documented in what it does, why it does it, how it does it, etc...
> with a bit more of that, I might take more time to look closely and find
> out that indeed it's something I can make use of, maybe with a few
> tweaks.

will cut it into smaller ones. try to have one big one to make review easier.

starting case
1. IBM x3950 have io apic pins > 224?
2. SGI big box need 8192 irqs 

want to make the same kernel with NR_CPUS = 4096 usable on regular sized systems as well.

points are
1. removing the irq_desc array and NR_IRQS. Small systems will waste less RAM in that array, and big systems can get more irqs — e.g. a system with 8 PCIe cards where every card needs 40 MSI vectors, or with a lot of IO-APIC controllers, etc.
2. make irq number is bus/devfn/idx, and every dev func will use 12bit range, irq number is relatively fixed not like current MSI irq creating is some kind of floating from NR_IRQS too. 
3. irq_desc is going with /proc/irq/xxx/smp_affinity.

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 23:44                                             ` Yinghai Lu
@ 2008-11-16 23:48                                               ` H. Peter Anvin
  2008-11-16 23:54                                                 ` Yinghai Lu
                                                                   ` (2 more replies)
  0 siblings, 3 replies; 66+ messages in thread
From: H. Peter Anvin @ 2008-11-16 23:48 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

Yinghai Lu wrote:
> 2. make irq number is bus/devfn/idx, and every dev func will use 12bit range, irq number is relatively fixed not like current MSI irq creating is some kind of floating from NR_IRQS too. 

2 is *STILL WRONG*, dammit!

You keep bringing this one up, but our PCI addressing is
*DOMAIN*/bus/devfn -- it falls flat on its face when you have more than
16 PCI domains.  CAN WE PLEASE STOP WITH THIS FOOLISHNESS NOW!

	-hpa

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 23:48                                               ` H. Peter Anvin
@ 2008-11-16 23:54                                                 ` Yinghai Lu
  2008-11-16 23:59                                                   ` H. Peter Anvin
  2008-11-17  4:22                                                   ` Benjamin Herrenschmidt
  2008-11-17  1:51                                                 ` Mike Travis
  2008-11-17  4:22                                                 ` Benjamin Herrenschmidt
  2 siblings, 2 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-16 23:54 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>> 2. make irq number is bus/devfn/idx, and every dev func will use 12bit range, irq number is relatively fixed not like current MSI irq creating is some kind of floating from NR_IRQS too. 
> 
> 2 is *STILL WRONG*, dammit!
> 
> You keep bringing this one up, but our PCI addressing is
> *DOMAIN*/bus/devfn -- it falls flat on its face when you have more than
> 16 PCI domains.  CAN WE PLEASE STOP WITH THIS FOOLISHNESS NOW!

you want to u64 instead of unsigned int for irq?

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 23:54                                                 ` Yinghai Lu
@ 2008-11-16 23:59                                                   ` H. Peter Anvin
  2008-11-17  0:21                                                     ` Yinghai Lu
  2008-11-17  4:25                                                     ` Benjamin Herrenschmidt
  2008-11-17  4:22                                                   ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 66+ messages in thread
From: H. Peter Anvin @ 2008-11-16 23:59 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

Yinghai Lu wrote:
> H. Peter Anvin wrote:
>> Yinghai Lu wrote:
>>> 2. make irq number is bus/devfn/idx, and every dev func will use 12bit range, irq number is relatively fixed not like current MSI irq creating is some kind of floating from NR_IRQS too. 
>> 2 is *STILL WRONG*, dammit!
>>
>> You keep bringing this one up, but our PCI addressing is
>> *DOMAIN*/bus/devfn -- it falls flat on its face when you have more than
>> 16 PCI domains.  CAN WE PLEASE STOP WITH THIS FOOLISHNESS NOW!
> 
> you want to u64 instead of unsigned int for irq?
> 

No, I think the whole notion of a static *numeric* identifier for an IRQ
when it's something like MSI-X is simply pointless.  I think we should
assign IRQ numbers beyond the legacy range dynamically.

I really don't think anyone gives a hoot about the IRQ number for any
IRQ above the 0-15 legacy range, even including the "APIC" numbers 16+.

	-hpa

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 23:59                                                   ` H. Peter Anvin
@ 2008-11-17  0:21                                                     ` Yinghai Lu
  2008-11-17  0:26                                                       ` H. Peter Anvin
  2008-11-17 20:25                                                       ` Jeremy Fitzhardinge
  2008-11-17  4:25                                                     ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 66+ messages in thread
From: Yinghai Lu @ 2008-11-17  0:21 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>> H. Peter Anvin wrote:
>>> Yinghai Lu wrote:
>>>> 2. make irq number is bus/devfn/idx, and every dev func will use 12bit range, irq number is relatively fixed not like current MSI irq creating is some kind of floating from NR_IRQS too. 
>>> 2 is *STILL WRONG*, dammit!
>>>
>>> You keep bringing this one up, but our PCI addressing is
>>> *DOMAIN*/bus/devfn -- it falls flat on its face when you have more than
>>> 16 PCI domains.  CAN WE PLEASE STOP WITH THIS FOOLISHNESS NOW!
>> you want to u64 instead of unsigned int for irq?
>>
> 
> No, I think the whole notion of a static *numeric* identifier for an IRQ
> when it's something like MSI-X is simply pointless.  I think we should
> assign IRQ numbers beyond the legacy range dynamically.
> 
> I really don't think anyone gives a hoot about the IRQ number for any
> IRQ above the 0-15 legacy range, even including the "APIC" numbers 16+.

you want to change ioapic/pin to irq mapping too?

so INTx and MSI will call create_irq_nr to get one irq for 16, and following first come and first serve rule.

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  0:21                                                     ` Yinghai Lu
@ 2008-11-17  0:26                                                       ` H. Peter Anvin
  2008-11-17  0:36                                                         ` Yinghai Lu
  2008-11-17  4:26                                                         ` Benjamin Herrenschmidt
  2008-11-17 20:25                                                       ` Jeremy Fitzhardinge
  1 sibling, 2 replies; 66+ messages in thread
From: H. Peter Anvin @ 2008-11-17  0:26 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

Yinghai Lu wrote:
>>
>> I really don't think anyone gives a hoot about the IRQ number for any
>> IRQ above the 0-15 legacy range, even including the "APIC" numbers 16+.
> 
> you want to change ioapic/pin to irq mapping too?
> so INTx and MSI will call create_irq_nr to get one irq for 16, and following first come and first serve rule.
> 

I personally don't think there is any issue with changing ioapic/pin to
IRQ mapping.  Other people may disagree.  My opinion is that IRQ numbers
16-23 are somewhat useful when you're dealing with a single IOAPIC, but
after that it's all a blur.

It would, however, be a good idea if IOAPICs had their numbers assigned
at detection time, as opposed to when the interrupt is registered, thus
making it a stable number for a single boot, at least.  The same is
probably true for MSI(-X); we could assign it a range of numbers when
the device is enumerated (as opposed to when a driver is activated), but
I don't know to what extent that is likely to cause more troubles than
it solves.

	-hpa

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  0:26                                                       ` H. Peter Anvin
@ 2008-11-17  0:36                                                         ` Yinghai Lu
  2008-11-17  0:48                                                           ` H. Peter Anvin
  2008-11-17  4:26                                                         ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-17  0:36 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

H. Peter Anvin wrote:
> 
> It would, however, be a good idea if IOAPICs had their numbers assigned
> at detection time, as opposed to when the interrupt is registered, thus
> making it a stable number for a single boot, at least.  The same is
> probably true for MSI(-X); we could assign it a range of numbers when
> the device is enumerated (as opposed to when a driver is activated), but
> I don't know to what extent that is likely to cause more troubles than
> it solves.

how to find the range for MSIX, one device/func may need a lot. for example, niu driver could use 20 MSI-X for one port. (one dev/func)
some could use 256. we only know that when driver is loaded.
So as Eric said, just try to use 12bits (4k range) for them.


YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  0:36                                                         ` Yinghai Lu
@ 2008-11-17  0:48                                                           ` H. Peter Anvin
  2008-11-17  0:58                                                             ` Yinghai Lu
  0 siblings, 1 reply; 66+ messages in thread
From: H. Peter Anvin @ 2008-11-17  0:48 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

Yinghai Lu wrote:
> H. Peter Anvin wrote:
>> It would, however, be a good idea if IOAPICs had their numbers assigned
>> at detection time, as opposed to when the interrupt is registered, thus
>> making it a stable number for a single boot, at least.  The same is
>> probably true for MSI(-X); we could assign it a range of numbers when
>> the device is enumerated (as opposed to when a driver is activated), but
>> I don't know to what extent that is likely to cause more troubles than
>> it solves.
> 
> how to find the range for MSIX, one device/func may need a lot. for example, niu driver could use 20 MSI-X for one port. (one dev/func)
> some could use 256. we only know that when driver is loaded.
> So as Eric said, just try to use 12bits (4k range) for them.
> 

You can know how many vectors are exported in generic code.  However,
using 4k per should be fine.

	-hpa

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  0:48                                                           ` H. Peter Anvin
@ 2008-11-17  0:58                                                             ` Yinghai Lu
  2008-11-17  1:00                                                               ` H. Peter Anvin
  0 siblings, 1 reply; 66+ messages in thread
From: Yinghai Lu @ 2008-11-17  0:58 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>> H. Peter Anvin wrote:
>>> It would, however, be a good idea if IOAPICs had their numbers assigned
>>> at detection time, as opposed to when the interrupt is registered, thus
>>> making it a stable number for a single boot, at least.  The same is
>>> probably true for MSI(-X); we could assign it a range of numbers when
>>> the device is enumerated (as opposed to when a driver is activated), but
>>> I don't know to what extent that is likely to cause more troubles than
>>> it solves.
>> how to find the range for MSIX, one device/func may need a lot. for example, niu driver could use 20 MSI-X for one port. (one dev/func)
>> some could use 256. we only know that when driver is loaded.
>> So as Eric said, just try to use 12bits (4k range) for them.
>>
> 
> You can know how many vectors are exported in generic code.  However,
> using 4k per should be fine.

so we have one list to map domain/bus/dev/func to bits [31,12] in irq ?

YH

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  0:58                                                             ` Yinghai Lu
@ 2008-11-17  1:00                                                               ` H. Peter Anvin
  2008-11-17  2:03                                                                 ` Mike Travis
  2008-11-17  4:27                                                                 ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 66+ messages in thread
From: H. Peter Anvin @ 2008-11-17  1:00 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Benjamin Herrenschmidt, Andrew Morton, mingo, tglx, linux-kernel,
	travis

Yinghai Lu wrote:
> 
> so we have one list to map domain/bus/dev/func to bits [31,12] in irq ?
> 

That works, although having a more generic allocation mechanism which
isn't so tied to MSI-X would make more sense.

	-hpa

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 23:48                                               ` H. Peter Anvin
  2008-11-16 23:54                                                 ` Yinghai Lu
@ 2008-11-17  1:51                                                 ` Mike Travis
  2008-11-17  4:39                                                   ` H. Peter Anvin
  2008-11-17  4:22                                                 ` Benjamin Herrenschmidt
  2 siblings, 1 reply; 66+ messages in thread
From: Mike Travis @ 2008-11-17  1:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Benjamin Herrenschmidt, Andrew Morton, mingo, tglx,
	linux-kernel

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>> 2. make irq number is bus/devfn/idx, and every dev func will use 12bit range, irq number is relatively fixed not like current MSI irq creating is some kind of floating from NR_IRQS too. 
> 
> 2 is *STILL WRONG*, dammit!
> 
> You keep bringing this one up, but our PCI addressing is
> *DOMAIN*/bus/devfn -- it falls flat on its face when you have more than
> 16 PCI domains.  CAN WE PLEASE STOP WITH THIS FOOLISHNESS NOW!
> 
> 	-hpa

Hmm, I was going to bring this up as well... ;-)

X was changed quite a while ago to domain/bus/dev/func, which was a lot easier to
deal with than trying to "offset" the bus with domain * some large number + bus.
Currently max nodes is 512 so it deserves its own field. ;-)

Btw, are you suggesting combining device and function?  That might be inadvertently
squeezing something that shouldn't be.

Thanks,
Mike

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  1:00                                                               ` H. Peter Anvin
@ 2008-11-17  2:03                                                                 ` Mike Travis
  2008-11-17  4:27                                                                 ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 66+ messages in thread
From: Mike Travis @ 2008-11-17  2:03 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Benjamin Herrenschmidt, Andrew Morton, mingo, tglx,
	linux-kernel

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>> so we have one list to map domain/bus/dev/func to bits [31,12] in irq ?
>>
> 
> That works, although having a more generic allocation mechanism which
> isn't so tied to MSI-X would make more sense.
> 
> 	-hpa

Hmm, I see I picked up this conversation quite late.  I would agree that
trying to map any set of fixed bit fields to domain/bus/device/function
would be problematic (esp. since I don't understand how MSI's figure into
the deal.)

Is there some reason why we need that?

Thanks,
Mike

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 23:48                                               ` H. Peter Anvin
  2008-11-16 23:54                                                 ` Yinghai Lu
  2008-11-17  1:51                                                 ` Mike Travis
@ 2008-11-17  4:22                                                 ` Benjamin Herrenschmidt
  2008-11-17  4:42                                                   ` H. Peter Anvin
  2 siblings, 1 reply; 66+ messages in thread
From: Benjamin Herrenschmidt @ 2008-11-17  4:22 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Andrew Morton, mingo, tglx, linux-kernel, travis

On Sun, 2008-11-16 at 15:48 -0800, H. Peter Anvin wrote:
> Yinghai Lu wrote:
> > 2. make irq number is bus/devfn/idx, and every dev func will use 12bit range, irq number is relatively fixed not like current MSI irq creating is some kind of floating from NR_IRQS too. 
> 
> 2 is *STILL WRONG*, dammit!
> 
> You keep bringing this one up, but our PCI addressing is
> *DOMAIN*/bus/devfn -- it falls flat on its face when you have more than
> 16 PCI domains.  CAN WE PLEASE STOP WITH THIS FOOLISHNESS NOW!

Besides, the relationship between PCI location and IRQ numbers seems to
be an x86 thingy ... don't bring that into the generic code please !

IRQ numbers are arbitrary, some platforms make up numbers out of the
blue, or they can be hypervisor internal tokens etc...

The only sane way to handle this generically IMHO is to do what we do
on powerpc (and I think sparc64) which is to totally disconnect the HW
number from the "linux" number.

Ben.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 23:54                                                 ` Yinghai Lu
  2008-11-16 23:59                                                   ` H. Peter Anvin
@ 2008-11-17  4:22                                                   ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 66+ messages in thread
From: Benjamin Herrenschmidt @ 2008-11-17  4:22 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: H. Peter Anvin, Andrew Morton, mingo, tglx, linux-kernel, travis


> > You keep bringing this one up, but our PCI addressing is
> > *DOMAIN*/bus/devfn -- it falls flat on its face when you have more than
> > 16 PCI domains.  CAN WE PLEASE STOP WITH THIS FOOLISHNESS NOW!
> 
> you want to u64 instead of unsigned int for irq?

No, separate the HW number from the linux logical number. The latter can remain
u32 or even u16 if you want :-)

Ben.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-16 23:59                                                   ` H. Peter Anvin
  2008-11-17  0:21                                                     ` Yinghai Lu
@ 2008-11-17  4:25                                                     ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 66+ messages in thread
From: Benjamin Herrenschmidt @ 2008-11-17  4:25 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Andrew Morton, mingo, tglx, linux-kernel, travis

On Sun, 2008-11-16 at 15:59 -0800, H. Peter Anvin wrote:
> 
> No, I think the whole notion of a static *numeric* identifier for an IRQ
> when it's something like MSI-X is simply pointless.  I think we should
> assign IRQ numbers beyond the legacy range dynamically.

Yup, exactly. Which is what we do on other platforms :-)

I think there is some value in getting rid of the irq_desc static array,
and to a certain extent having the ability to have irq_desc's be per-cpu
allocated, but I think that patch tries to mix up way too many different
things, including a dubious attempt at tying the interrupt subsystem
into a specific implementation choice of x86 platforms for numbering.

Linux interrupts should just be a dynamically allocated number space,
with an exception for the 16 first ones (0 = illegal, 1...15 = legacy)
and that should be -separate- from the actual HW number of one on a
given PIC. In fact, powerpc handles multiple HW interrupt domain numbers
just fine that way which is very useful for embedded platforms with
funky cascaded PIC setups..
 
Ben.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  0:26                                                       ` H. Peter Anvin
  2008-11-17  0:36                                                         ` Yinghai Lu
@ 2008-11-17  4:26                                                         ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 66+ messages in thread
From: Benjamin Herrenschmidt @ 2008-11-17  4:26 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Andrew Morton, mingo, tglx, linux-kernel, travis

On Sun, 2008-11-16 at 16:26 -0800, H. Peter Anvin wrote:
> It would, however, be a good idea if IOAPICs had their numbers assigned
> at detection time, as opposed to when the interrupt is registered, thus
> making it a stable number for a single boot, at least.  The same is
> probably true for MSI(-X); we could assign it a range of numbers when
> the device is enumerated (as opposed to when a driver is activated), but
> I don't know to what extent that is likely to cause more troubles than
> it solves.

On powerpc, we establish the mapping at PCI probe time so it's
reasonably stable. In fact, we even try to use virq == hwirq number if
the hwirq number fits and is available though I'm tempted to remove that
"feature" as it can make things more confusing in the end with some
interrupts having matching HW numbers and some not.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  1:00                                                               ` H. Peter Anvin
  2008-11-17  2:03                                                                 ` Mike Travis
@ 2008-11-17  4:27                                                                 ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 66+ messages in thread
From: Benjamin Herrenschmidt @ 2008-11-17  4:27 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Andrew Morton, mingo, tglx, linux-kernel, travis

On Sun, 2008-11-16 at 17:00 -0800, H. Peter Anvin wrote:
> Yinghai Lu wrote:
> > 
> > so we have one list to map domain/bus/dev/func to bits [31,12] in irq ?
> > 
> 
> That works, although having a more generic allocation mechanism which
> isn't so tied to MSI-X would make more sense.

None of that should be related to the linux interrupt number.

Ben.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  1:51                                                 ` Mike Travis
@ 2008-11-17  4:39                                                   ` H. Peter Anvin
  0 siblings, 0 replies; 66+ messages in thread
From: H. Peter Anvin @ 2008-11-17  4:39 UTC (permalink / raw)
  To: Mike Travis
  Cc: Yinghai Lu, Benjamin Herrenschmidt, Andrew Morton, mingo, tglx,
	linux-kernel

Mike Travis wrote:
> 
> X was changed quite a while ago to domain/bus/dev/func, which was a lot easier to
> deal with than trying to "offset" the bus with domain * some large number + bus.
> Currently max nodes is 512 so it deserves it's own field. ;-)
> 
> Btw, are you suggesting combining device and function?  That might be inadvertently
> squeezing something that shouldn't be.
> 

devfn = (device << 3) + function

It's a standard construct inside Linux.

	-hpa

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  4:22                                                 ` Benjamin Herrenschmidt
@ 2008-11-17  4:42                                                   ` H. Peter Anvin
  2008-11-17  6:52                                                     ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 66+ messages in thread
From: H. Peter Anvin @ 2008-11-17  4:42 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Yinghai Lu, Andrew Morton, mingo, tglx, linux-kernel, travis

Benjamin Herrenschmidt wrote:
> 
> IRQ numbers are arbitrary, some platforms make up numbers out of the
> blue, or they can be hypervisor internal tokens etc...
> 
> The only sane way to handle this generically IMHO is to do what we do
> on powerpc (and I think sparc64) which is to totally disconnect the HW
> number from the "linux" number.
> 

Yes, that's what I want to see, too.  On x86, it's important to preserve
the first 16 (BIOS-compatible, XT-PIC) numbers, as they are widely used
as a user interface, but for the rest, there is no point.

It is probably desirable to do that by overlaying the first (primary,
south bridge) IO-APIC, which also takes care of the "semi-legacy" IRQ
16-23 numbers.

	-hpa

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  4:42                                                   ` H. Peter Anvin
@ 2008-11-17  6:52                                                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 66+ messages in thread
From: Benjamin Herrenschmidt @ 2008-11-17  6:52 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Andrew Morton, mingo, tglx, linux-kernel, travis

On Sun, 2008-11-16 at 20:42 -0800, H. Peter Anvin wrote:
> Benjamin Herrenschmidt wrote:
> > 
> > IRQ numbers are arbitrary, some platforms make up numbers out of the
> > blue, or they can be hypervisor internal tokens etc...
> > 
> > The only sane way to handle this generically IMHO is to do what we do
> > on powerpc (and I think sparc64) which is to totally disconnect the HW
> > number from the "linux" number.
> > 
> 
> Yes, that's what I want to see, too.  On x86, it's important to preserve
> the first 16 (BIOS-compatible, XT-PIC) numbers, as they are widely used
> as a user interface, but for the rest, there is no point.

Right, and I do that on powerpc by reserving those numbers so they get
automatically assigned to and only to the PIC that comes up as claiming
the legacy number space (if any, if none, then they remain unassigned to
avoid problems with old crappy modules hard coding IRQ numbers and
trying to request them).

> It is probably desirable to do that by overlaying the first (primary,
> south bridge) IO-APIC, which also takes care of the "semi-legacy" IRQ
> 16-23 numbers.

Would make sense indeed to treat those numbers specifically.

Ben.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] sparse_irq aka dyn_irq v13
  2008-11-17  0:21                                                     ` Yinghai Lu
  2008-11-17  0:26                                                       ` H. Peter Anvin
@ 2008-11-17 20:25                                                       ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 66+ messages in thread
From: Jeremy Fitzhardinge @ 2008-11-17 20:25 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: H. Peter Anvin, Benjamin Herrenschmidt, Andrew Morton, mingo,
	tglx, linux-kernel, travis

Yinghai Lu wrote:
> H. Peter Anvin wrote:
>   
>> Yinghai Lu wrote:
>>     
>>> H. Peter Anvin wrote:
>>>       
>>>> Yinghai Lu wrote:
>>>>         
>>>>> 2. make irq number is bus/devfn/idx, and every dev func will use 12bit range, irq number is relatively fixed not like current MSI irq creating is some kind of floating from NR_IRQS too. 
>>>>>           
>>>> 2 is *STILL WRONG*, dammit!
>>>>
>>>> You keep bringing this one up, but our PCI addressing is
>>>> *DOMAIN*/bus/devfn -- it falls flat on its face when you have more than
>>>> 16 PCI domains.  CAN WE PLEASE STOP WITH THIS FOOLISHNESS NOW!
>>>>         
>>> you want to u64 instead of unsigned int for irq?
>>>
>>>       
>> No, I think the whole notion of a static *numeric* identifier for an IRQ
>> when it's something like MSI-X is simply pointless.  I think we should
>> assign IRQ numbers beyond the legacy range dynamically.
>>
>> I really don't think anyone gives a hoot about the IRQ number for any
>> IRQ above the 0-15 legacy range, even including the "APIC" numbers 16+.
>>     
>
> you want to change ioapic/pin to irq mapping too?
>   

I would like to see that.  I'm already doing this in the Xen dom0 
interrupt code that I posted the other day.

    J

^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, other threads:[~2008-11-17 20:25 UTC | newest]

Thread overview: 66+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20081023143721.GA25783@elte.hu>
     [not found] ` <49012399.4010100@kernel.org>
     [not found]   ` <20081027164135.GD19476@elte.hu>
     [not found]     ` <4912B2FE.7030804@kernel.org>
     [not found]       ` <20081106101715.GA4022@elte.hu>
     [not found]         ` <4913B45C.1000009@kernel.org>
     [not found]           ` <20081107081249.GB4435@elte.hu>
     [not found]             ` <4913F9AA.80500@kernel.org>
     [not found]               ` <20081107084240.GG4435@elte.hu>
     [not found]                 ` <491434FB.2050904@kernel.org>
     [not found]                   ` <20081107124957.GA21709@elte.hu>
2008-11-09  7:05                     ` [RFC PATCH] sparse_irq aka dyn_irq Yinghai Lu
2008-11-09  7:38                       ` Ingo Molnar
2008-11-09  8:03                         ` Yinghai Lu
2008-11-10  9:40                           ` Ingo Molnar
2008-11-10  9:51                             ` [PATCH] sparse_irq aka dyn_irq v10 Yinghai Lu
2008-11-10  9:53                               ` Ingo Molnar
2008-11-10  9:55                                 ` Yinghai Lu
2008-11-10  9:57                                   ` Ingo Molnar
2008-11-10  9:55                             ` [RFC PATCH] sparse_irq aka dyn_irq Andrew Morton
2008-11-10 10:00                               ` Yinghai Lu
2008-11-10 10:03                                 ` Ingo Molnar
2008-11-10 10:05                                   ` Yinghai Lu
2008-11-10 10:09                                     ` Ingo Molnar
2008-11-10 19:47                                       ` Yinghai Lu
2008-11-11  6:28                                   ` [PATCH] sparse_irq aka dyn_irq v11 Yinghai Lu
     [not found]                                   ` <491A9F87.8040403@kernel.org>
     [not found]                                     ` <20081112120814.GG11352@elte.hu>
2008-11-13  7:01                                       ` [PATCH] sparse_irq aka dyn_irq v13 Yinghai Lu
2008-11-13  9:53                                         ` Ingo Molnar
2008-11-13 20:06                                           ` Yinghai Lu
2008-11-13 20:16                                       ` Yinghai Lu
2008-11-13 21:18                                         ` Andrew Morton
2008-11-13 21:21                                           ` Ingo Molnar
2008-11-13 22:01                                           ` Yinghai Lu
2008-11-13 22:05                                             ` Ingo Molnar
2008-11-13 22:13                                             ` Andrew Morton
2008-11-13 22:41                                               ` Yinghai Lu
2008-11-13 22:58                                                 ` Andrew Morton
2008-11-13 23:15                                                   ` Mike Travis
2008-11-13 23:24                                                     ` Yinghai Lu
2008-11-14  0:20                                                       ` Mike Travis
2008-11-14  0:29                                                         ` Yinghai Lu
2008-11-14  6:29                                                   ` [PATCH] sparse_irq aka dyn_irq v14 Yinghai Lu
2008-11-14  6:46                                                     ` Andrew Morton
2008-11-15  9:05                                                       ` Yinghai Lu
2008-11-13 22:19                                           ` [PATCH] sparse_irq aka dyn_irq v13 Paul Mackerras
2008-11-13 22:23                                             ` David Miller
2008-11-13 23:11                                               ` Mike Travis
2008-11-13 23:14                                                 ` David Miller
2008-11-14  0:15                                                   ` Mike Travis
2008-11-14  0:21                                                     ` David Miller
2008-11-14  0:39                                                       ` Mike Travis
2008-11-14  2:37                                                         ` David Miller
2008-11-14  3:06                                                           ` Mike Travis
2008-11-16 20:58                                           ` Benjamin Herrenschmidt
2008-11-16 23:44                                             ` Yinghai Lu
2008-11-16 23:48                                               ` H. Peter Anvin
2008-11-16 23:54                                                 ` Yinghai Lu
2008-11-16 23:59                                                   ` H. Peter Anvin
2008-11-17  0:21                                                     ` Yinghai Lu
2008-11-17  0:26                                                       ` H. Peter Anvin
2008-11-17  0:36                                                         ` Yinghai Lu
2008-11-17  0:48                                                           ` H. Peter Anvin
2008-11-17  0:58                                                             ` Yinghai Lu
2008-11-17  1:00                                                               ` H. Peter Anvin
2008-11-17  2:03                                                                 ` Mike Travis
2008-11-17  4:27                                                                 ` Benjamin Herrenschmidt
2008-11-17  4:26                                                         ` Benjamin Herrenschmidt
2008-11-17 20:25                                                       ` Jeremy Fitzhardinge
2008-11-17  4:25                                                     ` Benjamin Herrenschmidt
2008-11-17  4:22                                                   ` Benjamin Herrenschmidt
2008-11-17  1:51                                                 ` Mike Travis
2008-11-17  4:39                                                   ` H. Peter Anvin
2008-11-17  4:22                                                 ` Benjamin Herrenschmidt
2008-11-17  4:42                                                   ` H. Peter Anvin
2008-11-17  6:52                                                     ` Benjamin Herrenschmidt
2008-11-09  8:36                         ` [RFC PATCH] sparse_irq aka dyn_irq H. Peter Anvin
2008-11-09  7:50                       ` Cyrill Gorcunov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox