Xen spinlock questions

All of lore.kernel.org
 help / color / mirror / Atom feed

* Xen spinlock questions
@ 2008-08-04 10:18 Jan Beulich
  2008-08-04 10:24 ` Keir Fraser
  2008-08-04 19:36 ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 29+ messages in thread
From: Jan Beulich @ 2008-08-04 10:18 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 1348 bytes --]

Jeremy,

considering to utilize your pv-ops spinlock implementation for our kernels,
I'd appreciate your opinion on the following thoughts:

1) While the goal of the per-CPU kicker irq appears to be to avoid all CPUs
waiting for a particular lock to get kicked simultaneously, I think this
doesn't have the desired effect. This is because Xen doesn't track what
event channel you poll for (through SCHEDOP_poll), and rather kicks all CPUs
polling for any event channel.

2) While on native not re-enabling interrupts in __raw_spin_lock_flags()
may be tolerable (but perhaps questionable), not doing so at least on
the slow path here seems suspicious.

3) Introducing yet another per-CPU IRQ for this purpose further
tightens scalability. Using a single, IRQF_PER_CPU IRQ should be
sufficient here, as long as it gets properly multiplexed onto individual
event channels (of which we have far more than IRQs). I have a patch
queued for the traditional tree that does just that conversion for the
reschedule and call-function IPIs, which I had long planned to get
submitted (but so far wasn't able to due to lack of testing done on the
migration aspects of it), and once successful was planning on trying to
do something similar for the timer IRQ. I am attaching that (2.6.26 based)
patch just for reference.

Thanks, Jan


[-- Attachment #2: xen-ipi-per-cpu-irq.patch --]
[-- Type: text/plain, Size: 17002 bytes --]

From: jbeulich@novell.com
Subject: fold IPIs onto a single IRQ each
Patch-mainline: obsolete

If this is reasonable, a similar change should be done for per-CPU VIRQs.

Index: head-2008-07-21/arch/x86/kernel/genapic_xen_64.c
===================================================================
--- head-2008-07-21.orig/arch/x86/kernel/genapic_xen_64.c	2008-07-15 14:50:28.000000000 +0200
+++ head-2008-07-21/arch/x86/kernel/genapic_xen_64.c	2008-07-22 08:25:52.000000000 +0200
@@ -24,13 +24,9 @@
 #include <asm/genapic.h>
 #include <xen/evtchn.h>
 
-DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
-
 static inline void __send_IPI_one(unsigned int cpu, int vector)
 {
-	int irq = per_cpu(ipi_to_irq, cpu)[vector];
-	BUG_ON(irq < 0);
-	notify_remote_via_irq(irq);
+	notify_remote_via_ipi(vector, cpu);
 }
 
 void xen_send_IPI_shortcut(unsigned int shortcut, int vector)
Index: head-2008-07-21/arch/x86/kernel/ipi-xen.c
===================================================================
--- head-2008-07-21.orig/arch/x86/kernel/ipi-xen.c	2008-07-21 17:42:48.000000000 +0200
+++ head-2008-07-21/arch/x86/kernel/ipi-xen.c	2008-07-22 08:25:52.000000000 +0200
@@ -49,15 +49,6 @@ static inline int __prepare_ICR2(unsigne
 }
 #else
 #include <xen/evtchn.h>
-
-DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
-
-static inline void __send_IPI_one(unsigned int cpu, int vector)
-{
-	int irq = per_cpu(ipi_to_irq, cpu)[vector];
-	BUG_ON(irq < 0);
-	notify_remote_via_irq(irq);
-}
 #endif
 
 void __send_IPI_shortcut(unsigned int shortcut, int vector)
@@ -91,14 +82,14 @@ void __send_IPI_shortcut(unsigned int sh
 
 	switch (shortcut) {
 	case APIC_DEST_SELF:
-		__send_IPI_one(smp_processor_id(), vector);
+		notify_remote_via_ipi(vector, smp_processor_id());
 		break;
 	case APIC_DEST_ALLBUT:
 		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
 			if (cpu == smp_processor_id())
 				continue;
 			if (cpu_isset(cpu, cpu_online_map)) {
-				__send_IPI_one(cpu, vector);
+				notify_remote_via_ipi(vector, cpu);
 			}
 		}
 		break;
@@ -168,7 +159,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu
 #else
 	for (cpu = 0; cpu < NR_CPUS; ++cpu)
 		if (cpu_isset(cpu, cpumask))
-			__send_IPI_one(cpu, vector);
+			notify_remote_via_ipi(vector, cpu);
 #endif
 	local_irq_restore(flags);
 }
Index: head-2008-07-21/drivers/xen/Kconfig
===================================================================
--- head-2008-07-21.orig/drivers/xen/Kconfig	2008-07-22 08:23:14.000000000 +0200
+++ head-2008-07-21/drivers/xen/Kconfig	2008-07-22 08:25:52.000000000 +0200
@@ -4,6 +4,7 @@
 
 config XEN
 	bool
+	select IRQ_PER_CPU if SMP
 
 if XEN
 config XEN_INTERFACE_VERSION
@@ -290,6 +291,9 @@ config HAVE_IRQ_IGNORE_UNHANDLED
 config GENERIC_HARDIRQS_NO__DO_IRQ
 	def_bool y
 
+config IRQ_PER_CPU
+	bool
+
 config NO_IDLE_HZ
 	def_bool y
 
Index: head-2008-07-21/drivers/xen/core/evtchn.c
===================================================================
--- head-2008-07-21.orig/drivers/xen/core/evtchn.c	2008-07-22 08:25:47.000000000 +0200
+++ head-2008-07-21/drivers/xen/core/evtchn.c	2008-07-22 08:25:52.000000000 +0200
@@ -57,6 +57,22 @@ static DEFINE_SPINLOCK(irq_mapping_updat
 static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
 	[0 ...  NR_EVENT_CHANNELS-1] = -1 };
 
+/* IRQ <-> IPI mapping. */
+#ifndef NR_IPIS
+#define NR_IPIS 1
+#endif
+#if defined(CONFIG_SMP) && defined(CONFIG_X86)
+static int ipi_to_irq[NR_IPIS] __read_mostly = {[0 ... NR_IPIS-1] = -1};
+static DEFINE_PER_CPU(int[NR_IPIS], ipi_to_evtchn) = {[0 ... NR_IPIS-1] = -1};
+#else
+#define PER_CPU_IPI_IRQ
+#endif
+#if !defined(CONFIG_SMP) || !defined(PER_CPU_IPI_IRQ)
+#define BUG_IF_IPI(irq) BUG_ON(type_from_irq(irq) == IRQT_IPI)
+#else
+#define BUG_IF_IPI(irq) ((void)(irq))
+#endif
+
 /* Packed IRQ information: binding type, sub-type index, and event channel. */
 static u32 irq_info[NR_IRQS];
 
@@ -83,10 +99,12 @@ static inline u32 mk_irq_info(u32 type, 
  * Accessors for packed IRQ information.
  */
 
+#ifdef PER_CPU_IPI_IRQ
 static inline unsigned int evtchn_from_irq(int irq)
 {
 	return (u16)(irq_info[irq]);
 }
+#endif
 
 static inline unsigned int index_from_irq(int irq)
 {
@@ -98,14 +116,27 @@ static inline unsigned int type_from_irq
 	return (u8)(irq_info[irq] >> 24);
 }
 
+#ifndef PER_CPU_IPI_IRQ
+static inline unsigned int evtchn_from_per_cpu_irq(unsigned int irq, unsigned int cpu)
+{
+	BUG_ON(type_from_irq(irq) != IRQT_IPI);
+	return per_cpu(ipi_to_evtchn, cpu)[index_from_irq(irq)];
+}
+
+static inline unsigned int evtchn_from_irq(unsigned int irq)
+{
+	if (type_from_irq(irq) != IRQT_IPI)
+		return (u16)(irq_info[irq]);
+	return evtchn_from_per_cpu_irq(irq, smp_processor_id());
+}
+#endif
+
 /* IRQ <-> VIRQ mapping. */
 DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
 
-/* IRQ <-> IPI mapping. */
-#ifndef NR_IPIS
-#define NR_IPIS 1
-#endif
+#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
 DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1};
+#endif
 
 /* Reference counts for bindings to IRQs. */
 static int irq_bindcount[NR_IRQS];
@@ -133,8 +164,14 @@ static void bind_evtchn_to_cpu(unsigned 
 
 	BUG_ON(!test_bit(chn, s->evtchn_mask));
 
-	if (irq != -1)
-		irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+	if (irq != -1) {
+		struct irq_desc *desc = irq_desc + irq;
+
+		if (!(desc->status & IRQ_PER_CPU))
+			desc->affinity = cpumask_of_cpu(cpu);
+		else
+			cpu_set(cpu, desc->affinity);
+	}
 
 	clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
 	set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
@@ -428,6 +465,7 @@ static int bind_virq_to_irq(unsigned int
 	return irq;
 }
 
+#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
 static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 {
 	struct evtchn_bind_ipi bind_ipi;
@@ -459,6 +497,7 @@ static int bind_ipi_to_irq(unsigned int 
 	spin_unlock(&irq_mapping_update_lock);
 	return irq;
 }
+#endif
 
 static void unbind_from_irq(unsigned int irq)
 {
@@ -466,6 +505,7 @@ static void unbind_from_irq(unsigned int
 	unsigned int cpu;
 	int evtchn = evtchn_from_irq(irq);
 
+	BUG_IF_IPI(irq);
 	spin_lock(&irq_mapping_update_lock);
 
 	if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
@@ -479,10 +519,12 @@ static void unbind_from_irq(unsigned int
 			per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
 				[index_from_irq(irq)] = -1;
 			break;
+#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
 		case IRQT_IPI:
 			per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
 				[index_from_irq(irq)] = -1;
 			break;
+#endif
 		default:
 			break;
 		}
@@ -501,6 +543,44 @@ static void unbind_from_irq(unsigned int
 	spin_unlock(&irq_mapping_update_lock);
 }
 
+#if defined(CONFIG_SMP) && !defined(PER_CPU_IPI_IRQ)
+void unbind_from_per_cpu_irq(unsigned int irq, unsigned int cpu)
+{
+	struct evtchn_close close;
+	int evtchn = evtchn_from_per_cpu_irq(irq, cpu);
+
+	spin_lock(&irq_mapping_update_lock);
+
+	if (VALID_EVTCHN(evtchn)) {
+		struct irq_desc *desc = irq_desc + irq;
+
+		cpu_clear(cpu, desc->affinity);
+		BUG_ON(irq_bindcount[irq] <= 1);
+		irq_bindcount[irq]--;
+
+		close.port = evtchn;
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
+			BUG();
+
+		switch (type_from_irq(irq)) {
+		case IRQT_IPI:
+			per_cpu(ipi_to_evtchn, cpu)[index_from_irq(irq)] = -1;
+			break;
+		default:
+			BUG();
+			break;
+		}
+
+		/* Closed ports are implicitly re-bound to VCPU0. */
+		bind_evtchn_to_cpu(evtchn, 0);
+
+		evtchn_to_irq[evtchn] = -1;
+	}
+
+	spin_unlock(&irq_mapping_update_lock);
+}
+#endif /* CONFIG_SMP && !PER_CPU_IPI_IRQ */
+
 int bind_caller_port_to_irqhandler(
 	unsigned int caller_port,
 	irq_handler_t handler,
@@ -595,6 +675,8 @@ int bind_virq_to_irqhandler(
 }
 EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
 
+#ifdef CONFIG_SMP
+#ifdef PER_CPU_IPI_IRQ
 int bind_ipi_to_irqhandler(
 	unsigned int ipi,
 	unsigned int cpu,
@@ -617,7 +699,72 @@ int bind_ipi_to_irqhandler(
 
 	return irq;
 }
-EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
+#else
+int __cpuinit bind_ipi_to_irqaction(
+	unsigned int ipi,
+	unsigned int cpu,
+	struct irqaction *action)
+{
+	struct evtchn_bind_ipi bind_ipi;
+	int evtchn, irq, retval = 0;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	if (per_cpu(ipi_to_evtchn, cpu)[ipi] != -1) {
+		spin_unlock(&irq_mapping_update_lock);
+		return -EBUSY;
+	}
+
+	if ((irq = ipi_to_irq[ipi]) == -1) {
+		if ((irq = find_unbound_irq()) < 0) {
+			spin_unlock(&irq_mapping_update_lock);
+			return irq;
+		}
+
+		/* Extra reference so count will never drop to zero. */
+		irq_bindcount[irq]++;
+
+		ipi_to_irq[ipi] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, 0);
+		irq_desc[irq].handle_irq = handle_percpu_irq;
+		retval = 1;
+	}
+
+	bind_ipi.vcpu = cpu;
+	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+					&bind_ipi) != 0)
+		BUG();
+
+	evtchn = bind_ipi.port;
+	evtchn_to_irq[evtchn] = irq;
+	per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn;
+
+	bind_evtchn_to_cpu(evtchn, cpu);
+
+	irq_bindcount[irq]++;
+
+	spin_unlock(&irq_mapping_update_lock);
+
+	if (retval == 0) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		unmask_evtchn(evtchn);
+		local_irq_restore(flags);
+	} else {
+		action->flags |= IRQF_PERCPU;
+		retval = setup_irq(irq, action);
+		if (retval) {
+			unbind_from_per_cpu_irq(irq, cpu);
+			BUG_ON(retval > 0);
+			irq = retval;
+		}
+	}
+
+	return irq;
+}
+#endif /* PER_CPU_IPI_IRQ */
+#endif /* CONFIG_SMP */
 
 void unbind_from_irqhandler(unsigned int irq, void *dev_id)
 {
@@ -643,6 +790,7 @@ static void rebind_irq_to_cpu(unsigned i
 {
 	int evtchn = evtchn_from_irq(irq);
 
+	BUG_IF_IPI(irq);
 	if (VALID_EVTCHN(evtchn))
 		rebind_evtchn_to_cpu(evtchn, tcpu);
 }
@@ -726,6 +874,7 @@ static struct irq_chip dynirq_chip = {
 	.unmask   = unmask_dynirq,
 	.mask_ack = ack_dynirq,
 	.ack      = ack_dynirq,
+	.eoi      = end_dynirq,
 	.end      = end_dynirq,
 #ifdef CONFIG_SMP
 	.set_affinity = set_affinity_irq,
@@ -870,10 +1019,21 @@ int irq_ignore_unhandled(unsigned int ir
 	return !!(irq_status.flags & XENIRQSTAT_shared);
 }
 
+#if defined(CONFIG_SMP) && !defined(PER_CPU_IPI_IRQ)
+void notify_remote_via_ipi(unsigned int ipi, unsigned int cpu)
+{
+	int evtchn = evtchn_from_per_cpu_irq(ipi_to_irq[ipi], cpu);
+
+	if (VALID_EVTCHN(evtchn))
+		notify_remote_via_evtchn(evtchn);
+}
+#endif
+
 void notify_remote_via_irq(int irq)
 {
 	int evtchn = evtchn_from_irq(irq);
 
+	BUG_IF_IPI(irq);
 	if (VALID_EVTCHN(evtchn))
 		notify_remote_via_evtchn(evtchn);
 }
@@ -881,6 +1041,7 @@ EXPORT_SYMBOL_GPL(notify_remote_via_irq)
 
 int irq_to_evtchn_port(int irq)
 {
+	BUG_IF_IPI(irq);
 	return evtchn_from_irq(irq);
 }
 EXPORT_SYMBOL_GPL(irq_to_evtchn_port);
@@ -958,11 +1119,16 @@ static void restore_cpu_virqs(unsigned i
 
 static void restore_cpu_ipis(unsigned int cpu)
 {
+#ifdef CONFIG_SMP
 	struct evtchn_bind_ipi bind_ipi;
 	int ipi, irq, evtchn;
 
 	for (ipi = 0; ipi < NR_IPIS; ipi++) {
+#ifdef PER_CPU_IPI_IRQ
 		if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
+#else
+		if ((irq = ipi_to_irq[ipi]) == -1)
+#endif
 			continue;
 
 		BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
@@ -976,13 +1142,19 @@ static void restore_cpu_ipis(unsigned in
 
 		/* Record the new mapping. */
 		evtchn_to_irq[evtchn] = irq;
+#ifdef PER_CPU_IPI_IRQ
 		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+#else
+		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, 0);
+		per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn;
+#endif
 		bind_evtchn_to_cpu(evtchn, cpu);
 
 		/* Ready for use. */
 		unmask_evtchn(evtchn);
 
 	}
+#endif
 }
 
 static int evtchn_resume(struct sys_device *dev)
@@ -1007,8 +1179,17 @@ static int evtchn_resume(struct sys_devi
 
 	for_each_possible_cpu(cpu) {
 		restore_cpu_virqs(cpu);
+#ifdef PER_CPU_IPI_IRQ
 		restore_cpu_ipis(cpu);
+#else
+		/* No IPI <-> event-channel mappings. */
+		for (irq = 0; irq < NR_IPIS; ++irq)
+			per_cpu(ipi_to_evtchn, cpu)[irq] = -1;
+#endif
 	}
+#ifndef PER_CPU_IPI_IRQ
+	restore_cpu_ipis(smp_processor_id());
+#endif
 
 	return 0;
 }
Index: head-2008-07-21/drivers/xen/core/smpboot.c
===================================================================
--- head-2008-07-21.orig/drivers/xen/core/smpboot.c	2008-07-22 16:17:27.000000000 +0200
+++ head-2008-07-21/drivers/xen/core/smpboot.c	2008-07-22 16:19:31.000000000 +0200
@@ -52,10 +52,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 #endif
 
-static DEFINE_PER_CPU(int, resched_irq);
-static DEFINE_PER_CPU(int, callfunc_irq);
-static char resched_name[NR_CPUS][15];
-static char callfunc_name[NR_CPUS][15];
+static int resched_irq = -1, callfunc_irq = -1;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
@@ -112,31 +109,36 @@ remove_siblinginfo(unsigned int cpu)
 
 static int __cpuinit xen_smp_intr_init(unsigned int cpu)
 {
+	static struct irqaction resched_action = {
+		.handler =	smp_reschedule_interrupt,
+		.flags =	IRQF_DISABLED,
+		.name =		"resched"
+	}, callfunc_action = {
+		.handler =	smp_call_function_interrupt,
+		.flags =	IRQF_DISABLED,
+		.name =		"callfunc"
+	};
 	int rc;
 
-	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
-
-	sprintf(resched_name[cpu], "resched%u", cpu);
-	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
-				    cpu,
-				    smp_reschedule_interrupt,
-				    IRQF_DISABLED|IRQF_NOBALANCING,
-				    resched_name[cpu],
-				    NULL);
+	rc = bind_ipi_to_irqaction(RESCHEDULE_VECTOR,
+				   cpu,
+				   &resched_action);
 	if (rc < 0)
 		goto fail;
-	per_cpu(resched_irq, cpu) = rc;
-
-	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
-	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
-				    cpu,
-				    smp_call_function_interrupt,
-				    IRQF_DISABLED|IRQF_NOBALANCING,
-				    callfunc_name[cpu],
-				    NULL);
+	if (resched_irq < 0)
+		resched_irq = rc;
+	else
+		BUG_ON(resched_irq != rc);
+
+	rc = bind_ipi_to_irqaction(CALL_FUNCTION_VECTOR,
+				   cpu,
+				   &callfunc_action);
 	if (rc < 0)
 		goto fail;
-	per_cpu(callfunc_irq, cpu) = rc;
+	if (callfunc_irq < 0)
+		callfunc_irq = rc;
+	else
+		BUG_ON(callfunc_irq != rc);
 
 	if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
 		goto fail;
@@ -144,10 +146,10 @@ static int __cpuinit xen_smp_intr_init(u
 	return 0;
 
  fail:
-	if (per_cpu(resched_irq, cpu) >= 0)
-		unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
-	if (per_cpu(callfunc_irq, cpu) >= 0)
-		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+	if (resched_irq >= 0)
+		unbind_from_per_cpu_irq(resched_irq, cpu);
+	if (callfunc_irq >= 0)
+		unbind_from_per_cpu_irq(callfunc_irq, cpu);
 	return rc;
 }
 
@@ -157,8 +159,8 @@ static void __cpuexit xen_smp_intr_exit(
 	if (cpu != 0)
 		local_teardown_timer(cpu);
 
-	unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
-	unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+	unbind_from_per_cpu_irq(resched_irq, cpu);
+	unbind_from_per_cpu_irq(callfunc_irq, cpu);
 }
 #endif
 
Index: head-2008-07-21/include/xen/evtchn.h
===================================================================
--- head-2008-07-21.orig/include/xen/evtchn.h	2008-07-22 08:25:47.000000000 +0200
+++ head-2008-07-21/include/xen/evtchn.h	2008-07-22 08:25:52.000000000 +0200
@@ -78,6 +78,8 @@ int bind_virq_to_irqhandler(
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
+#if defined(CONFIG_SMP) && !defined(MODULE)
+#ifndef CONFIG_X86
 int bind_ipi_to_irqhandler(
 	unsigned int ipi,
 	unsigned int cpu,
@@ -85,6 +87,13 @@ int bind_ipi_to_irqhandler(
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
+#else
+int bind_ipi_to_irqaction(
+	unsigned int ipi,
+	unsigned int cpu,
+	struct irqaction *action);
+#endif
+#endif
 
 /*
  * Common unbind function for all event sources. Takes IRQ to unbind from.
@@ -93,6 +102,11 @@ int bind_ipi_to_irqhandler(
  */
 void unbind_from_irqhandler(unsigned int irq, void *dev_id);
 
+#if defined(CONFIG_SMP) && !defined(MODULE) && defined(CONFIG_X86)
+/* Specialized unbind function for per-CPU IRQs. */
+void unbind_from_per_cpu_irq(unsigned int irq, unsigned int cpu);
+#endif
+
 #ifndef CONFIG_XEN
 void irq_resume(void);
 #endif
@@ -152,4 +166,8 @@ int clear_pirq_hw_action(int pirq);
 #define PIRQ_END 5
 #define PIRQ_ACK 6
 
+#if defined(CONFIG_SMP) && !defined(MODULE) && defined(CONFIG_X86)
+void notify_remote_via_ipi(unsigned int ipi, unsigned int cpu);
+#endif
+
 #endif /* __ASM_EVTCHN_H__ */

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-04 10:18 Xen spinlock questions Jan Beulich
@ 2008-08-04 10:24 ` Keir Fraser
  2008-08-11 12:22   ` Jan Beulich
  2008-08-04 19:36 ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 29+ messages in thread
From: Keir Fraser @ 2008-08-04 10:24 UTC (permalink / raw)
  To: Jan Beulich, Jeremy Fitzhardinge; +Cc: xen-devel

On 4/8/08 11:18, "Jan Beulich" <jbeulich@novell.com> wrote:

> 1) While the goal of the per-CPU kicker irq appears to be to avoid all CPUs
> waiting for a particular lock to get kicked simultaneously, I think this
> doesn't have the desired effect. This is because Xen doesn't track what
> event channel you poll for (through SCHEDOP_poll), and rather kicks all CPUs
> polling for any event channel.

Yes, this is true. We could easily do something better for VCPUs polling a
single event channel though, but there hasn't been a need up to now. I
suppose it depends how often we have multiple VCPUs stuck waiting for
spinlocks. I can sort out a Xen-side patch if someone wanted to measure the
benefits from more selective wakeup from poll.

 -- Keir

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-04 10:24 ` Keir Fraser
@ 2008-08-11 12:22   ` Jan Beulich
  2008-08-11 12:25     ` Keir Fraser
                       ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Jan Beulich @ 2008-08-11 12:22 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Jeremy Fitzhardinge, xen-devel

[-- Attachment #1: Type: text/plain, Size: 1140 bytes --]

>>> Keir Fraser <keir.fraser@eu.citrix.com> 04.08.08 12:24 >>>
>On 4/8/08 11:18, "Jan Beulich" <jbeulich@novell.com> wrote:
>
>> 1) While the goal of the per-CPU kicker irq appears to be to avoid all CPUs
>> waiting for a particular lock to get kicked simultaneously, I think this
>> doesn't have the desired effect. This is because Xen doesn't track what
>> event channel you poll for (through SCHEDOP_poll), and rather kicks all CPUs
>> polling for any event channel.
>
>Yes, this is true. We could easily do something better for VCPUs polling a
>single event channel though, but there hasn't been a need up to now. I
>suppose it depends how often we have multiple VCPUs stuck waiting for
>spinlocks. I can sort out a Xen-side patch if someone wanted to measure the
>benefits from more selective wakeup from poll.

Running kernel builds on 8 vCPU-s competing for 4 pCPU-s shows a 10%
improvement in performance with the individual wakeup (patch attached
- probably sub-optimal, but I didn't seem to be able to think of a lock-less
mechanism to achieve the desired behavior), using ticket locks in the
kernel.

Jan


[-- Attachment #2: poll-single-port.patch --]
[-- Type: text/plain, Size: 5743 bytes --]

Index: 2008-08-06/xen/common/domain.c
===================================================================
--- 2008-08-06.orig/xen/common/domain.c	2008-08-07 11:03:33.000000000 +0200
+++ 2008-08-06/xen/common/domain.c	2008-08-07 12:04:00.000000000 +0200
@@ -209,6 +209,7 @@ struct domain *domain_create(
     atomic_set(&d->refcnt, 1);
     spin_lock_init(&d->domain_lock);
     spin_lock_init(&d->page_alloc_lock);
+    spin_lock_init(&d->poll_lock);
     spin_lock_init(&d->shutdown_lock);
     spin_lock_init(&d->hypercall_deadlock_mutex);
     INIT_LIST_HEAD(&d->page_list);
@@ -653,7 +654,7 @@ void vcpu_reset(struct vcpu *v)
 
     v->fpu_initialised = 0;
     v->fpu_dirtied     = 0;
-    v->is_polling      = 0;
+    v->poll_evtchn     = 0;
     v->is_initialised  = 0;
     v->nmi_pending     = 0;
     v->mce_pending     = 0;
Index: 2008-08-06/xen/common/event_channel.c
===================================================================
--- 2008-08-06.orig/xen/common/event_channel.c	2008-08-07 11:03:33.000000000 +0200
+++ 2008-08-06/xen/common/event_channel.c	2008-08-07 15:24:17.000000000 +0200
@@ -545,6 +545,7 @@ out:
 static int evtchn_set_pending(struct vcpu *v, int port)
 {
     struct domain *d = v->domain;
+    unsigned long flags;
 
     /*
      * The following bit operations must happen in strict order.
@@ -564,19 +565,36 @@ static int evtchn_set_pending(struct vcp
     }
     
     /* Check if some VCPU might be polling for this event. */
-    if ( unlikely(d->is_polling) )
+    if ( likely(!d->is_polling) )
+        return 0;
+
+    spin_lock_irqsave(&d->poll_lock, flags);
+
+    if ( likely(d->is_polling) )
     {
-        d->is_polling = 0;
+        bool_t is_polling = 0;
+
+        d->is_polling = -1;
         smp_mb(); /* check vcpu poll-flags /after/ clearing domain poll-flag */
         for_each_vcpu ( d, v )
         {
-            if ( !v->is_polling )
+            int poll_evtchn = v->poll_evtchn;
+
+            if ( !poll_evtchn )
+                continue;
+            if ( poll_evtchn > 0 && poll_evtchn != port )
+            {
+                is_polling = 1;
                 continue;
-            v->is_polling = 0;
+            }
+            v->poll_evtchn = 0;
             vcpu_unblock(v);
         }
+        cmpxchg(&d->is_polling, -1, is_polling);
     }
 
+    spin_unlock_irqrestore(&d->poll_lock, flags);
+
     return 0;
 }
 
Index: 2008-08-06/xen/common/schedule.c
===================================================================
--- 2008-08-06.orig/xen/common/schedule.c	2008-08-07 11:03:33.000000000 +0200
+++ 2008-08-06/xen/common/schedule.c	2008-08-07 15:27:07.000000000 +0200
@@ -348,7 +348,7 @@ static long do_poll(struct sched_poll *s
         return -EFAULT;
 
     set_bit(_VPF_blocked, &v->pause_flags);
-    v->is_polling = 1;
+    v->poll_evtchn = -1;
     d->is_polling = 1;
 
     /* Check for events /after/ setting flags: avoids wakeup waiting race. */
@@ -369,6 +369,9 @@ static long do_poll(struct sched_poll *s
             goto out;
     }
 
+    if ( i == 1 )
+        v->poll_evtchn = port;
+
     if ( sched_poll->timeout != 0 )
         set_timer(&v->poll_timer, sched_poll->timeout);
 
@@ -378,7 +381,7 @@ static long do_poll(struct sched_poll *s
     return 0;
 
  out:
-    v->is_polling = 0;
+    v->poll_evtchn = 0;
     clear_bit(_VPF_blocked, &v->pause_flags);
     return rc;
 }
@@ -758,10 +761,10 @@ static void poll_timer_fn(void *data)
 {
     struct vcpu *v = data;
 
-    if ( !v->is_polling )
+    if ( !v->poll_evtchn )
         return;
 
-    v->is_polling = 0;
+    v->poll_evtchn = 0;
     vcpu_unblock(v);
 }
 
Index: 2008-08-06/xen/include/xen/sched.h
===================================================================
--- 2008-08-06.orig/xen/include/xen/sched.h	2008-08-07 11:03:33.000000000 +0200
+++ 2008-08-06/xen/include/xen/sched.h	2008-08-07 10:07:56.000000000 +0200
@@ -106,8 +106,6 @@ struct vcpu 
     bool_t           fpu_initialised;
     /* Has the FPU been used since it was last saved? */
     bool_t           fpu_dirtied;
-    /* Is this VCPU polling any event channels (SCHEDOP_poll)? */
-    bool_t           is_polling;
     /* Initialization completed for this VCPU? */
     bool_t           is_initialised;
     /* Currently running on a CPU? */
@@ -137,6 +135,11 @@ struct vcpu 
     unsigned long    pause_flags;
     atomic_t         pause_count;
 
+    /* Is this VCPU polling any event channels (SCHEDOP_poll)?
+     * Positive values indicate a single, negative values multiple channels
+     * being polled. */
+    int              poll_evtchn;
+
     /* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
     u16              virq_to_evtchn[NR_VIRQS];
     spinlock_t       virq_lock;
@@ -210,7 +213,7 @@ struct domain
     /* Is this guest being debugged by dom0? */
     bool_t           debugger_attached;
     /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
-    bool_t           is_polling;
+    signed char      is_polling;
     /* Is this guest dying (i.e., a zombie)? */
     enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
     /* Domain is paused by controller software? */
@@ -218,6 +221,9 @@ struct domain
     /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
     bool_t           is_pinned;
 
+    /* Protects is_polling modification in evtchn_set_pending(). */
+    spinlock_t       poll_lock;
+
     /* Guest has shut down (inc. reason code)? */
     spinlock_t       shutdown_lock;
     bool_t           is_shutting_down; /* in process of shutting down? */

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-11 12:22   ` Jan Beulich
@ 2008-08-11 12:25     ` Keir Fraser
  2008-08-11 14:41     ` Keir Fraser
  2008-08-11 18:10     ` Jeremy Fitzhardinge
  2 siblings, 0 replies; 29+ messages in thread
From: Keir Fraser @ 2008-08-11 12:25 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Jeremy Fitzhardinge, xen-devel

On 11/8/08 13:22, "Jan Beulich" <jbeulich@novell.com> wrote:

>> Yes, this is true. We could easily do something better for VCPUs polling a
>> single event channel though, but there hasn't been a need up to now. I
>> suppose it depends how often we have multiple VCPUs stuck waiting for
>> spinlocks. I can sort out a Xen-side patch if someone wanted to measure the
>> benefits from more selective wakeup from poll.
> 
> Running kernel builds on 8 vCPU-s competing for 4 pCPU-s shows a 10%
> improvement in performance with the individual wakeup (patch attached
> - probably sub-optimal, but I didn't seem to be able to think of a lock-less
> mechanism to achieve the desired behavior), using ticket locks in the
> kernel.

Hardly a real-world benchmark. :-)

 -- Keir

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-11 12:22   ` Jan Beulich
  2008-08-11 12:25     ` Keir Fraser
@ 2008-08-11 14:41     ` Keir Fraser
  2008-08-11 18:11       ` Jeremy Fitzhardinge
  2008-08-11 18:10     ` Jeremy Fitzhardinge
  2 siblings, 1 reply; 29+ messages in thread
From: Keir Fraser @ 2008-08-11 14:41 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Jeremy Fitzhardinge, xen-devel

[-- Attachment #1: Type: text/plain, Size: 427 bytes --]

On 11/8/08 13:22, "Jan Beulich" <jbeulich@novell.com> wrote:

> Running kernel builds on 8 vCPU-s competing for 4 pCPU-s shows a 10%
> improvement in performance with the individual wakeup (patch attached
> - probably sub-optimal, but I didn't seem to be able to think of a lock-less
> mechanism to achieve the desired behavior), using ticket locks in the
> kernel.

Could you try with the attached lock-free patch?

 -- Keir


[-- Attachment #2: poll.patch --]
[-- Type: application/octet-stream, Size: 5055 bytes --]

diff -r 265d42af434b xen/common/domain.c
--- a/xen/common/domain.c	Mon Aug 11 15:01:34 2008 +0100
+++ b/xen/common/domain.c	Mon Aug 11 15:25:48 2008 +0100
@@ -651,9 +651,10 @@ void vcpu_reset(struct vcpu *v)
 
     set_bit(_VPF_down, &v->pause_flags);
 
+    clear_bit(v->vcpu_id, d->poll_mask);
+
     v->fpu_initialised = 0;
     v->fpu_dirtied     = 0;
-    v->is_polling      = 0;
     v->is_initialised  = 0;
     v->nmi_pending     = 0;
     v->mce_pending     = 0;
diff -r 265d42af434b xen/common/event_channel.c
--- a/xen/common/event_channel.c	Mon Aug 11 15:01:34 2008 +0100
+++ b/xen/common/event_channel.c	Mon Aug 11 15:32:09 2008 +0100
@@ -545,6 +545,7 @@ static int evtchn_set_pending(struct vcp
 static int evtchn_set_pending(struct vcpu *v, int port)
 {
     struct domain *d = v->domain;
+    int vcpuid;
 
     /*
      * The following bit operations must happen in strict order.
@@ -564,17 +565,18 @@ static int evtchn_set_pending(struct vcp
     }
     
     /* Check if some VCPU might be polling for this event. */
-    if ( unlikely(d->is_polling) )
-    {
-        d->is_polling = 0;
-        smp_mb(); /* check vcpu poll-flags /after/ clearing domain poll-flag */
-        for_each_vcpu ( d, v )
-        {
-            if ( !v->is_polling )
-                continue;
-            v->is_polling = 0;
+    if ( likely(bitmap_empty(d->poll_mask, MAX_VIRT_CPUS)) )
+        return 0;
+
+    /* Wake any interested (or potentially interested) pollers. */
+    for ( vcpuid = find_first_bit(d->poll_mask, MAX_VIRT_CPUS);
+          vcpuid < MAX_VIRT_CPUS;
+          vcpuid = find_next_bit(d->poll_mask, MAX_VIRT_CPUS, vcpuid+1) )
+    {
+        v = d->vcpu[vcpuid];
+        if ( ((v->poll_evtchn < 0) || (v->poll_evtchn == port)) &&
+             test_and_clear_bit(vcpuid, d->poll_mask) )
             vcpu_unblock(v);
-        }
     }
 
     return 0;
diff -r 265d42af434b xen/common/schedule.c
--- a/xen/common/schedule.c	Mon Aug 11 15:01:34 2008 +0100
+++ b/xen/common/schedule.c	Mon Aug 11 15:32:44 2008 +0100
@@ -348,11 +348,11 @@ static long do_poll(struct sched_poll *s
         return -EFAULT;
 
     set_bit(_VPF_blocked, &v->pause_flags);
-    v->is_polling = 1;
-    d->is_polling = 1;
+    v->poll_evtchn = -1;
+    set_bit(v->vcpu_id, d->poll_mask);
 
     /* Check for events /after/ setting flags: avoids wakeup waiting race. */
-    smp_wmb();
+    smp_mb();
 
     for ( i = 0; i < sched_poll->nr_ports; i++ )
     {
@@ -369,6 +369,9 @@ static long do_poll(struct sched_poll *s
             goto out;
     }
 
+    if ( sched_poll->nr_ports == 1 )
+        v->poll_evtchn = port;
+
     if ( sched_poll->timeout != 0 )
         set_timer(&v->poll_timer, sched_poll->timeout);
 
@@ -378,7 +381,7 @@ static long do_poll(struct sched_poll *s
     return 0;
 
  out:
-    v->is_polling = 0;
+    clear_bit(v->vcpu_id, d->poll_mask);
     clear_bit(_VPF_blocked, &v->pause_flags);
     return rc;
 }
@@ -758,11 +761,8 @@ static void poll_timer_fn(void *data)
 {
     struct vcpu *v = data;
 
-    if ( !v->is_polling )
-        return;
-
-    v->is_polling = 0;
-    vcpu_unblock(v);
+    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
+        vcpu_unblock(v);
 }
 
 /* Initialise the data structures. */
diff -r 265d42af434b xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Mon Aug 11 15:01:34 2008 +0100
+++ b/xen/include/xen/sched.h	Mon Aug 11 15:25:04 2008 +0100
@@ -106,8 +106,6 @@ struct vcpu
     bool_t           fpu_initialised;
     /* Has the FPU been used since it was last saved? */
     bool_t           fpu_dirtied;
-    /* Is this VCPU polling any event channels (SCHEDOP_poll)? */
-    bool_t           is_polling;
     /* Initialization completed for this VCPU? */
     bool_t           is_initialised;
     /* Currently running on a CPU? */
@@ -136,6 +134,13 @@ struct vcpu
 
     unsigned long    pause_flags;
     atomic_t         pause_count;
+
+    /*
+     * Valid only if this VCPU is specified in d->poll_mask:
+     * Positive values indicate a single port is being polled; -1 indicates
+     * multiple ports may be being polled.
+     */
+    int              poll_evtchn;
 
     /* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
     u16              virq_to_evtchn[NR_VIRQS];
@@ -209,14 +214,15 @@ struct domain
     struct domain   *target;
     /* Is this guest being debugged by dom0? */
     bool_t           debugger_attached;
-    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
-    bool_t           is_polling;
     /* Is this guest dying (i.e., a zombie)? */
     enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
     /* Domain is paused by controller software? */
     bool_t           is_paused_by_controller;
     /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
     bool_t           is_pinned;
+
+    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
+    DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);
 
     /* Guest has shut down (inc. reason code)? */
     spinlock_t       shutdown_lock;

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-11 14:41     ` Keir Fraser
@ 2008-08-11 18:11       ` Jeremy Fitzhardinge
  2008-08-11 18:31         ` Keir Fraser
  0 siblings, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-11 18:11 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> Could you try with the attached lock-free patch?
>   

Just to confirm: this will definitely cause SCHED_poll to return if any 
event is delivered to the polling vcpu, right?  It won't restart the 
poll after event delivery?

Looks good otherwise.

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-11 18:11       ` Jeremy Fitzhardinge
@ 2008-08-11 18:31         ` Keir Fraser
  2008-08-11 18:49           ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 29+ messages in thread
From: Keir Fraser @ 2008-08-11 18:31 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel

On 11/8/08 19:11, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:

> Keir Fraser wrote:
>> Could you try with the attached lock-free patch?
>>   
> 
> Just to confirm: this will definitely cause SCHED_poll to return if any
> event is delivered to the polling vcpu, right?  It won't restart the
> poll after event delivery?

That's right. The hypercall returns if any event is delivered to the vcpu,
or if any port in the poll set is pending. The fact we don't clear the vcpu
from the poll_mask if unblocked for event delivery isn't a correctness
issue, but it will cause unnecessary extra work in future invocations of
evtchn_set_pending(). Perhaps vcpu_unblock() should clear it.

 -- Keir

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-11 18:31         ` Keir Fraser
@ 2008-08-11 18:49           ` Jeremy Fitzhardinge
  2008-08-12  9:00             ` Keir Fraser
  0 siblings, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-11 18:49 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> That's right. The hypercall returns if any event is delivered to the vcpu,
> or if any port in the poll set is pending. The fact we don't clear the vcpu
> from the poll_mask if unblocked for event delivery isn't a correctness
> issue, but it will cause unnecessary extra work in future invocations of
> evtchn_set_pending(). Perhaps vcpu_unblock() should clear it.
>   

That seems reasonable.  In this use-case, it's quite likely that if the 
poll is interrupted by event delivery, on return it will find that the 
spinlock is now free and never re-enter the poll.

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-11 18:49           ` Jeremy Fitzhardinge
@ 2008-08-12  9:00             ` Keir Fraser
  2008-08-12 16:33               ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 29+ messages in thread
From: Keir Fraser @ 2008-08-12  9:00 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Jan Beulich; +Cc: xen-devel

On 11/8/08 19:49, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:

> Keir Fraser wrote:
>> That's right. The hypercall returns if any event is delivered to the vcpu,
>> or if any port in the poll set is pending. The fact we don't clear the vcpu
>> from the poll_mask if unblocked for event delivery isn't a correctness
>> issue, but it will cause unnecessary extra work in future invocations of
>> evtchn_set_pending(). Perhaps vcpu_unblock() should clear it.
>>   
> 
> That seems reasonable.  In this use-case, it's quite likely that if the
> poll is interrupted by event delivery, on return it will find that the
> spinlock is now free and never re-enter the poll.

Attached is a new version of the patch which clears the vcpu from poll_mask
when it is unblocked for any reason. Jan: please can you give this one a
spin if you get time.

 -- Keir

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-12  9:00             ` Keir Fraser
@ 2008-08-12 16:33               ` Jeremy Fitzhardinge
  2008-08-12 17:00                 ` Keir Fraser
  0 siblings, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-12 16:33 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> On 11/8/08 19:49, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:
>
>   
>> Keir Fraser wrote:
>>     
>>> That's right. The hypercall returns if any event is delivered to the vcpu,
>>> or if any port in the poll set is pending. The fact we don't clear the vcpu
>>> from the poll_mask if unblocked for event delivery isn't a correctness
>>> issue, but it will cause unnecessary extra work in future invocations of
>>> evtchn_set_pending(). Perhaps vcpu_unblock() should clear it.
>>>   
>>>       
>> That seems reasonable.  In this use-case, it's quite likely that if the
>> poll is interrupted by event delivery, on return it will find that the
>> spinlock is now free and never re-enter the poll.
>>     
>
> Attached is a new version of the patch which clears the vcpu from poll_mask
> when it is unblocked for any reason. Jan: please can you give this one a
> spin if you get time.
>   

Forgot to attach?

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-12 16:33               ` Jeremy Fitzhardinge
@ 2008-08-12 17:00                 ` Keir Fraser
  2008-08-15 12:15                   ` Jan Beulich
  0 siblings, 1 reply; 29+ messages in thread
From: Keir Fraser @ 2008-08-12 17:00 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 526 bytes --]

On 12/8/08 17:33, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:

>>> That seems reasonable.  In this use-case, it's quite likely that if the
>>> poll is interrupted by event delivery, on return it will find that the
>>> spinlock is now free and never re-enter the poll.
>>>     
>> 
>> Attached is a new version of the patch which clears the vcpu from poll_mask
>> when it is unblocked for any reason. Jan: please can you give this one a
>> spin if you get time.
>>   
> 
> Forgot to attach?

Sorry, attached now.

 -- Keir


[-- Attachment #2: poll2.patch --]
[-- Type: application/octet-stream, Size: 6684 bytes --]

diff -r 8b1ebe5e8fd7 xen/common/domain.c
--- a/xen/common/domain.c	Mon Aug 11 16:51:02 2008 +0100
+++ b/xen/common/domain.c	Tue Aug 12 09:39:39 2008 +0100
@@ -651,9 +651,10 @@ void vcpu_reset(struct vcpu *v)
 
     set_bit(_VPF_down, &v->pause_flags);
 
+    clear_bit(v->vcpu_id, d->poll_mask);
+
     v->fpu_initialised = 0;
     v->fpu_dirtied     = 0;
-    v->is_polling      = 0;
     v->is_initialised  = 0;
     v->nmi_pending     = 0;
     v->mce_pending     = 0;
diff -r 8b1ebe5e8fd7 xen/common/event_channel.c
--- a/xen/common/event_channel.c	Mon Aug 11 16:51:02 2008 +0100
+++ b/xen/common/event_channel.c	Tue Aug 12 09:39:39 2008 +0100
@@ -545,6 +545,7 @@ static int evtchn_set_pending(struct vcp
 static int evtchn_set_pending(struct vcpu *v, int port)
 {
     struct domain *d = v->domain;
+    int vcpuid;
 
     /*
      * The following bit operations must happen in strict order.
@@ -564,17 +565,18 @@ static int evtchn_set_pending(struct vcp
     }
     
     /* Check if some VCPU might be polling for this event. */
-    if ( unlikely(d->is_polling) )
-    {
-        d->is_polling = 0;
-        smp_mb(); /* check vcpu poll-flags /after/ clearing domain poll-flag */
-        for_each_vcpu ( d, v )
-        {
-            if ( !v->is_polling )
-                continue;
-            v->is_polling = 0;
+    if ( likely(bitmap_empty(d->poll_mask, MAX_VIRT_CPUS)) )
+        return 0;
+
+    /* Wake any interested (or potentially interested) pollers. */
+    for ( vcpuid = find_first_bit(d->poll_mask, MAX_VIRT_CPUS);
+          vcpuid < MAX_VIRT_CPUS;
+          vcpuid = find_next_bit(d->poll_mask, MAX_VIRT_CPUS, vcpuid+1) )
+    {
+        v = d->vcpu[vcpuid];
+        if ( ((v->poll_evtchn < 0) || (v->poll_evtchn == port)) &&
+             test_and_clear_bit(vcpuid, d->poll_mask) )
             vcpu_unblock(v);
-        }
     }
 
     return 0;
diff -r 8b1ebe5e8fd7 xen/common/schedule.c
--- a/xen/common/schedule.c	Mon Aug 11 16:51:02 2008 +0100
+++ b/xen/common/schedule.c	Tue Aug 12 09:58:03 2008 +0100
@@ -198,6 +198,21 @@ void vcpu_wake(struct vcpu *v)
     TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
 }
 
+void vcpu_unblock(struct vcpu *v)
+{
+    struct domain *d;
+
+    if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
+        return;
+
+    /* Unlikely test-and-clear: try to avoid exclusive access to cache line. */
+    d = v->domain;
+    if ( unlikely(test_bit(v->vcpu_id, d->poll_mask)) )
+         clear_bit(v->vcpu_id, d->poll_mask);
+
+    vcpu_wake(v);
+}
+
 static void vcpu_migrate(struct vcpu *v)
 {
     unsigned long flags;
@@ -348,11 +363,20 @@ static long do_poll(struct sched_poll *s
         return -EFAULT;
 
     set_bit(_VPF_blocked, &v->pause_flags);
-    v->is_polling = 1;
-    d->is_polling = 1;
+    v->poll_evtchn = -1;
+    set_bit(v->vcpu_id, d->poll_mask);
 
     /* Check for events /after/ setting flags: avoids wakeup waiting race. */
-    smp_wmb();
+    smp_mb();
+
+    /*
+     * Someone may have seen we are blocked but not that we are polling, or
+     * vice versa. We are certainly being woken, so clean up and bail. Beyond
+     * this point others can be guaranteed to clean up for us if they wake us.
+     */
+    if ( !test_bit(_VPF_blocked, &v->pause_flags) ||
+         !test_bit(v->vcpu_id, d->poll_mask) )
+        goto out;
 
     for ( i = 0; i < sched_poll->nr_ports; i++ )
     {
@@ -369,6 +393,9 @@ static long do_poll(struct sched_poll *s
             goto out;
     }
 
+    if ( sched_poll->nr_ports == 1 )
+        v->poll_evtchn = port;
+
     if ( sched_poll->timeout != 0 )
         set_timer(&v->poll_timer, sched_poll->timeout);
 
@@ -378,7 +405,7 @@ static long do_poll(struct sched_poll *s
     return 0;
 
  out:
-    v->is_polling = 0;
+    clear_bit(v->vcpu_id, d->poll_mask);
     clear_bit(_VPF_blocked, &v->pause_flags);
     return rc;
 }
@@ -758,11 +785,8 @@ static void poll_timer_fn(void *data)
 {
     struct vcpu *v = data;
 
-    if ( !v->is_polling )
-        return;
-
-    v->is_polling = 0;
-    vcpu_unblock(v);
+    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
+        vcpu_unblock(v);
 }
 
 /* Initialise the data structures. */
diff -r 8b1ebe5e8fd7 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Mon Aug 11 16:51:02 2008 +0100
+++ b/xen/include/xen/sched.h	Tue Aug 12 09:40:19 2008 +0100
@@ -106,8 +106,6 @@ struct vcpu
     bool_t           fpu_initialised;
     /* Has the FPU been used since it was last saved? */
     bool_t           fpu_dirtied;
-    /* Is this VCPU polling any event channels (SCHEDOP_poll)? */
-    bool_t           is_polling;
     /* Initialization completed for this VCPU? */
     bool_t           is_initialised;
     /* Currently running on a CPU? */
@@ -136,6 +134,13 @@ struct vcpu
 
     unsigned long    pause_flags;
     atomic_t         pause_count;
+
+    /*
+     * Valid only if this VCPU is specified in d->poll_mask:
+     * Positive values indicate a single port is being polled; -1 indicates
+     * multiple ports may be being polled.
+     */
+    int              poll_evtchn;
 
     /* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
     u16              virq_to_evtchn[NR_VIRQS];
@@ -209,14 +214,15 @@ struct domain
     struct domain   *target;
     /* Is this guest being debugged by dom0? */
     bool_t           debugger_attached;
-    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
-    bool_t           is_polling;
     /* Is this guest dying (i.e., a zombie)? */
     enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
     /* Domain is paused by controller software? */
     bool_t           is_paused_by_controller;
     /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
     bool_t           is_pinned;
+
+    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
+    DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);
 
     /* Guest has shut down (inc. reason code)? */
     spinlock_t       shutdown_lock;
@@ -507,6 +513,7 @@ static inline int vcpu_runnable(struct v
              atomic_read(&v->domain->pause_count));
 }
 
+void vcpu_unblock(struct vcpu *v);
 void vcpu_pause(struct vcpu *v);
 void vcpu_pause_nosync(struct vcpu *v);
 void domain_pause(struct domain *d);
@@ -523,12 +530,6 @@ void vcpu_unlock_affinity(struct vcpu *v
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 
-static inline void vcpu_unblock(struct vcpu *v)
-{
-    if ( test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
-        vcpu_wake(v);
-}
-
 #define IS_PRIV(_d) ((_d)->is_privileged)
 #define IS_PRIV_FOR(_d, _t) (IS_PRIV(_d) || ((_d)->target && (_d)->target == (_t)))
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-12 17:00                 ` Keir Fraser
@ 2008-08-15 12:15                   ` Jan Beulich
  2008-08-15 13:01                     ` Keir Fraser
  0 siblings, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2008-08-15 12:15 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Jeremy Fitzhardinge, xen-devel

>>> Keir Fraser <keir.fraser@eu.citrix.com> 12.08.08 19:00 >>>
>On 12/8/08 17:33, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:
>
>>>> That seems reasonable.  In this use-case, it's quite likely that if the
>>>> poll is interrupted by event delivery, on return it will find that the
>>>> spinlock is now free and never re-enter the poll.
>>>>     
>>> 
>>> Attached is a new version of the patch which clears the vcpu from poll_mask
>>> when it is unblocked for any reason. Jan: please can you give this one a
>>> spin if you get time.
>>>   
>> 
>> Forgot to attach?
>
>Sorry, attached now.

I can't really explain the results of testing with this version of the patch:
While the number of false wakeups got further reduced by somewhat
less than 20%, both time spent in the kernel and total execution time
went up (8% and 4% respectively) compared to my original (and from
all I can tell worse) version of the patch. Nothing else changed as far as
I'm aware.

Jan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-15 12:15                   ` Jan Beulich
@ 2008-08-15 13:01                     ` Keir Fraser
  2008-08-15 14:06                       ` Jan Beulich
  2008-08-15 17:12                       ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 29+ messages in thread
From: Keir Fraser @ 2008-08-15 13:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Jeremy Fitzhardinge, xen-devel

On 15/8/08 13:15, "Jan Beulich" <jbeulich@novell.com> wrote:

> I can't really explain the results of testing with this version of the patch:
> While the number of false wakeups got further reduced by somewhat
> less than 20%, both time spent in the kernel and total execution time
> went up (8% and 4% respectively) compared to my original (and from
> all I can tell worse) version of the patch. Nothing else changed as far as
> I'm aware.

That is certainly odd. Presumably consistent across a few runs? I can't
imagine where extra time would be being spent though...

 -- Keir

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-15 13:01                     ` Keir Fraser
@ 2008-08-15 14:06                       ` Jan Beulich
  2008-08-18 10:01                         ` Keir Fraser
  2008-08-15 17:12                       ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2008-08-15 14:06 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Jeremy Fitzhardinge, xen-devel

>>> Keir Fraser <keir.fraser@eu.citrix.com> 15.08.08 15:01 >>>
>On 15/8/08 13:15, "Jan Beulich" <jbeulich@novell.com> wrote:
>
>> I can't really explain the results of testing with this version of the patch:
>> While the number of false wakeups got further reduced by somewhat
>> less than 20%, both time spent in the kernel and total execution time
>> went up (8% and 4% respectively) compared to my original (and from
>> all I can tell worse) version of the patch. Nothing else changed as far as
>> I'm aware.
>
>That is certainly odd. Presumably consistent across a few runs? I can't
>imagine where extra time would be being spent though...

Yes, I did at least five runs in each environment.

Jan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-15 14:06                       ` Jan Beulich
@ 2008-08-18 10:01                         ` Keir Fraser
  0 siblings, 0 replies; 29+ messages in thread
From: Keir Fraser @ 2008-08-18 10:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Jeremy Fitzhardinge, xen-devel

On 15/8/08 15:06, "Jan Beulich" <jbeulich@novell.com> wrote:

>>> I can't really explain the results of testing with this version of the
>>> patch:
>>> While the number of false wakeups got further reduced by somewhat
>>> less than 20%, both time spent in the kernel and total execution time
>>> went up (8% and 4% respectively) compared to my original (and from
>>> all I can tell worse) version of the patch. Nothing else changed as far as
>>> I'm aware.
>> 
>> That is certainly odd. Presumably consistent across a few runs? I can't
>> imagine where extra time would be being spent though...
> 
> Yes, I did at least five runs in each environment.

It might be worth retrying with the vcpu_unblock() changes removed. It'll
still work, but poll_mask may have bits spuriously left set for arbitrary
time periods. However, vcpu_unblock() is the only thing I obviously make
more expensive than in your patch.

We could also possibly make the vcpu_unblock() check cheaper by testing
v->poll_evtchn for non-zero, and zero it, and clear from poll_mask. Reading
a vcpu-local field may be cheaper than getting access to a domain struct
cache line.

 -- Keir

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-15 13:01                     ` Keir Fraser
  2008-08-15 14:06                       ` Jan Beulich
@ 2008-08-15 17:12                       ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-15 17:12 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> On 15/8/08 13:15, "Jan Beulich" <jbeulich@novell.com> wrote:
>
>   
>> I can't really explain the results of testing with this version of the patch:
>> While the number of false wakeups got further reduced by somewhat
>> less than 20%, both time spent in the kernel and total execution time
>> went up (8% and 4% respectively) compared to my original (and from
>> all I can tell worse) version of the patch. Nothing else changed as far as
>> I'm aware.
>>     
>
> That is certainly odd. Presumably consistent across a few runs? I can't
> imagine where extra time would be being spent though...
>   

2 locked instructions in vcpu_unblock() when using polling?  Doesn't
seem like 8%-worth of time though.

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-11 12:22   ` Jan Beulich
  2008-08-11 12:25     ` Keir Fraser
  2008-08-11 14:41     ` Keir Fraser
@ 2008-08-11 18:10     ` Jeremy Fitzhardinge
  2008-08-13  7:17       ` Jan Beulich
  2 siblings, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-11 18:10 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

Jan Beulich wrote:
> Running kernel builds on 8 vCPU-s competing for 4 pCPU-s shows a 10%
> improvement in performance with the individual wakeup (patch attached
> - probably sub-optimal, but I didn't seem to be able to think of a lock-less
> mechanism to achieve the desired behavior), using ticket locks in the
> kernel.

What exactly are you measuring here?  Are you testing your 2.6.2x 
forward-port with an adaptation of pv spinlocks using ticket locks as 
the underlying locking mechanism?  Does the 10% improvement come 
compared to straightforward use of sched_poll, or compared to doing 
spurious wakeup detection?

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-11 18:10     ` Jeremy Fitzhardinge
@ 2008-08-13  7:17       ` Jan Beulich
  0 siblings, 0 replies; 29+ messages in thread
From: Jan Beulich @ 2008-08-13  7:17 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel, Keir Fraser

>>> Jeremy Fitzhardinge <jeremy@goop.org> 11.08.08 20:10 >>>
>Jan Beulich wrote:
>> Running kernel builds on 8 vCPU-s competing for 4 pCPU-s shows a 10%
>> improvement in performance with the individual wakeup (patch attached
>> - probably sub-optimal, but I didn't seem to be able to think of a lock-less
>> mechanism to achieve the desired behavior), using ticket locks in the
>> kernel.
>
>What exactly are you measuring here?  Are you testing your 2.6.2x 
>forward-port with an adaptation of pv spinlocks using ticket locks as 
>the underlying locking mechanism?  Does the 10% improvement come 

Yes.

>compared to straightforward use of sched_poll, or compared to doing 
>spurious wakeup detection?

Compared to spurious wakeup detection done in the kernel, similar to
how you had done it in the latest version of your respective patch that
I have seen.

Jan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-04 10:18 Xen spinlock questions Jan Beulich
  2008-08-04 10:24 ` Keir Fraser
@ 2008-08-04 19:36 ` Jeremy Fitzhardinge
  2008-08-05  6:32   ` Jan Beulich
  1 sibling, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-04 19:36 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

Jan Beulich wrote:
> Jeremy,
>
> considering to utilize your pv-ops spinlock implementation for our kernels,
> I'd appreciate your opinion on the following thoughts:
>
> 1) While the goal of the per-CPU kicker irq appears to be to avoid all CPUs
> waiting for a particular lock to get kicked simultaneously, I think this
> doesn't have the desired effect. This is because Xen doesn't track what
> event channel you poll for (through SCHEDOP_poll), and rather kicks all CPUs
> polling for any event channel.
>   

There's no problem with kicking all cpus waiting for a given lock, but 
it was intended to avoid kicking cpus waiting for some other lock.  I 
hadn't looked at the poll implementation that closely.  I guess using 
the per-cpu interrupt gives Xen some room to live up to the expectations 
we have for it ;)

> 2) While on native not re-enabling interrupts in __raw_spin_lock_flags()
> may be tolerable (but perhaps questionable), not doing so at least on
> the slow path here seems suspicious.
>   

I wasn't sure about that.  Is it OK to enable interrupts in the middle 
of a spinlock?  Can it be done unconditionally?

> 3) Introducing yet another per-CPU IRQ for this purpose further
> tightens scalability. Using a single, IRQF_PER_CPU IRQ should be
> sufficient here, as long as it gets properly multiplexed onto individual
> event channels (of which we have far more than IRQs). I have a patch
> queued for the traditional tree that does just that conversion for the
> reschedule and call-function IPIs, which I had long planned to get
> submitted (but so far wasn't able to due to lack of testing done on the
> migration aspects of it), and once successful was planning on trying to
> do something similar for the timer IRQ.

There's two lines of work I'm hoping to push to mitigate this:

One is the unification of 32 and 64-bit interrupt handling, so that they 
both have an underlying notion of a vector, which is what we map event 
channels to.  Since vectors can be mapped to a (irq,cpu) tuple, it would 
allow multiple per-cpu vectors/event channels to be mapped to a single 
irq, and do so generically for all event channel types.  That would mean 
we'd end up allocating one set of interrupts for time, function calls, 
spinlocks, etc, rather than percpu.

The other is eliminating NR_IRQ, and making irq allocation completely 
dynamic.

>  I am attaching that (2.6.26 based) patch just for reference.

 From a quick look, you're thinking along similar lines.

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-04 19:36 ` Jeremy Fitzhardinge
@ 2008-08-05  6:32   ` Jan Beulich
  2008-08-05  7:33     ` Keir Fraser
  2008-08-05 18:09     ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 29+ messages in thread
From: Jan Beulich @ 2008-08-05  6:32 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel, Keir Fraser

>> 2) While on native not re-enabling interrupts in __raw_spin_lock_flags()
>> may be tolerable (but perhaps questionable), not doing so at least on
>> the slow path here seems suspicious.
>>   
>
>I wasn't sure about that.  Is it OK to enable interrupts in the middle 
>of a spinlock?  Can it be done unconditionally?

That used to be done in the pre-ticket lock implementation, but of course
conditional upon the original interrupt flag.

Later yesterday I noticed another issue: The code setting lock_spinners
isn't interruption safe - you'll need to return the old value from
spinning_lock() and restore it in unspinning_lock().

Also I'm considering doing it ticket-based nevertheless, as "mix(ing) up
next cpu selection" won't really help fairness in xen_spin_unlock_slow().
Apart from definitely needing the wakeup to happen for just the target
CPU (Keir, I'd want the necessary support in Xen done for that to work
regardless of performance measurements with the traditional locking,
as it's known that with ticket locks performance suffers from wrong-
order CPU kicking), one thing we'd be in even more need for here than
old-style spin locks had been would be a directed yield (sub-)hypercall.
Has that ever been considered to become a schedop?

Jan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-05  6:32   ` Jan Beulich
@ 2008-08-05  7:33     ` Keir Fraser
  2008-08-05 18:09     ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 29+ messages in thread
From: Keir Fraser @ 2008-08-05  7:33 UTC (permalink / raw)
  To: Jan Beulich, Jeremy Fitzhardinge; +Cc: xen-devel

On 5/8/08 07:32, "Jan Beulich" <jbeulich@novell.com> wrote:

> one thing we'd be in even more need for here than
> old-style spin locks had been would be a directed yield (sub-)hypercall.
> Has that ever been considered to become a schedop?

It has. Someone will need to implement it and show a convincing performance
win with it with a real benchmark.

 -- Keir

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-05  6:32   ` Jan Beulich
  2008-08-05  7:33     ` Keir Fraser
@ 2008-08-05 18:09     ` Jeremy Fitzhardinge
  2008-08-06  7:15       ` Jan Beulich
  1 sibling, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-05 18:09 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

Jan Beulich wrote:
>>> 2) While on native not re-enabling interrupts in __raw_spin_lock_flags()
>>> may be tolerable (but perhaps questionable), not doing so at least on
>>> the slow path here seems suspicious.
>>>   
>>>       
>> I wasn't sure about that.  Is it OK to enable interrupts in the middle 
>> of a spinlock?  Can it be done unconditionally?
>>     
>
> That used to be done in the pre-ticket lock implementation, but of course
> conditional upon the original interrupt flag.
>   

Right, I see.  The spin_lock_flags stuff which the current lock 
implementation just ignores.  I'll add a new lock op for it.

> Later yesterday I noticed another issue: The code setting lock_spinners
> isn't interruption safe - you'll need to return the old value from
> spinning_lock() and restore it in unspinning_lock().
>   

Good catch.

> Also I'm considering doing it ticket-based nevertheless, as "mix(ing) up
> next cpu selection" won't really help fairness in xen_spin_unlock_slow().
>   

Why's that?  An alternative might be to just wake all cpus waiting for 
the lock up, and then let them fight it out.  It should be unusual that 
there's a significant number of waiters anyway, since contention is 
(should be) rare.

The main reason for ticket locks is to break the egregious unfairness 
that (some) bus protocols implement.  That level of fairness shouldn't 
be necessary here because once the cpus fall to blocking in the 
hypervisor, it's up to Xen to tie-break.

> Apart from definitely needing the wakeup to happen for just the target
> CPU (Keir, I'd want the necessary support in Xen done for that to work
> regardless of performance measurements with the traditional locking,
> as it's known that with ticket locks performance suffers from wrong-
> order CPU kicking), one thing we'd be in even more need for here than
> old-style spin locks had been would be a directed yield (sub-)hypercall.
> Has that ever been considered to become a schedop?
>   

Considered.  But the point of this exercise was to come up with 
something that would work with an unmodified hypervisor.

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-05 18:09     ` Jeremy Fitzhardinge
@ 2008-08-06  7:15       ` Jan Beulich
  2008-08-06  8:47         ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2008-08-06  7:15 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel, Keir Fraser

>>> Jeremy Fitzhardinge <jeremy@goop.org> 05.08.08 20:09 >>>
>Jan Beulich wrote:
>> Later yesterday I noticed another issue: The code setting lock_spinners
>> isn't interruption safe - you'll need to return the old value from
>> spinning_lock() and restore it in unspinning_lock().
>>   
>
>Good catch.

More on that: You'll really need two per-CPU variables afaics, one for the
current non-irq lock being spun upon, and one for the current irqs-disabled
one. The latter one might not need saving/restoring as long as you don't
re-enable interrupts, but the code might turn out cleaner when doing the
save/restore regardless, e.g. for me (doing ticket locking):

int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
{
	struct spinning *spinning;
	int rc;

	/* if kicker interrupt not initialized yet, just spin */
	if (spinlock_irq < 0)
		return 0;

	/* announce we're spinning */
	spinning = &__get_cpu_var(spinning);
	if (spinning->lock) {
		BUG_ON(!raw_irqs_disabled());
		spinning = &__get_cpu_var(spinning_irq);
		BUG_ON(spinning->lock);
	}
	spinning->ticket = token >> TICKET_SHIFT;
	smp_wmb();
	spinning->lock = lock;

	/* clear pending */
	xen_clear_irq_pending(spinlock_irq);

	/* check again make sure it didn't become free while
	   we weren't looking  */
	rc = __raw_spin_trylock(lock);
	if (!rc) {
		/* block until irq becomes pending */
		xen_poll_irq(spinlock_irq);
		kstat_this_cpu.irqs[spinlock_irq]++;
	}

	/* announce we're done */
	spinning->lock = NULL;

	return rc;
}

>> Also I'm considering doing it ticket-based nevertheless, as "mix(ing) up
>> next cpu selection" won't really help fairness in xen_spin_unlock_slow().
>>   
>
>Why's that?  An alternative might be to just wake all cpus waiting for 
>the lock up, and then let them fight it out.  It should be unusual that 
>there's a significant number of waiters anyway, since contention is 
>(should be) rare.

On an 8-core system I'm seeing between 20,000 (x86-64) and 35,000
(i686) wakeup interrupts per CPU. I'm not certain this still counts as rare.
Though that number may go down a little once the hypervisor doesn't
needlessly wake all polling vCPU-s anymore.

>The main reason for ticket locks is to break the egregious unfairness 
>that (some) bus protocols implement.  That level of fairness shouldn't 
>be necessary here because once the cpus fall to blocking in the 
>hypervisor, it's up to Xen to tie-break.

Why? The hypervisor doing the tie-break makes it possibly even more
unfair, whereas with tickets and a way to kick the next owner (and only
it) almost the same level of fairness as on native can be achieved. The
only heuristic to determine by measurement is the number of spin loops
before going into poll mode (which your original patch's description and
implementation for some reason disagree about - the description says
2^16 loops, the implementation uses 2^10). Obviously, the optimal
numbers may turn out different for byte and ticket locks.

And doing a wake-them-all approach isn't good here, as was
(supposedly, I wasn't there) explained in a talk on the last summit.

Jan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-06  7:15       ` Jan Beulich
@ 2008-08-06  8:47         ` Jeremy Fitzhardinge
  2008-08-06  9:35           ` Jan Beulich
  0 siblings, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-06  8:47 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

Jan Beulich wrote:
> More on that: You'll really need two per-CPU variables afaics, one for the
> current non-irq lock being spun upon, and one for the current irqs-disabled
> one. The latter one might not need saving/restoring as long as you don't
> re-enable interrupts, but the code might turn out cleaner when doing the
> save/restore regardless, e.g. for me (doing ticket locking):
>   

Not sure I follow.  How do you use the second array at the kick end of 
the process?

I ended up just storing the previous value locally, and then restoring 
it.  It assumes that locks will strictly nest, of course, but I think 
that's reasonable.

--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -135,25 +135,39 @@
 static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
 static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);

-static inline void spinning_lock(struct xen_spinlock *xl)
+/*
+ * Mark a cpu as interested in a lock.  Returns the CPU's previous
+ * lock of interest, in case we got preempted by an interrupt.
+ */
+static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
 {
-	__get_cpu_var(lock_spinners) = xl;
-	wmb();			/* set lock of interest before count */
+	struct xen_spinlock *prev;
+
+	prev = xchg(&__get_cpu_var(lock_spinners), xl);
+	/* xchg is a barrier */
+
 	asm(LOCK_PREFIX " incw %0"
 	    : "+m" (xl->spinners) : : "memory");
+
+	return prev;
 }

-static inline void unspinning_lock(struct xen_spinlock *xl)
+/*
+ * Mark a cpu as no longer interested in a lock.  Restores previous
+ * lock of interest (NULL for none).
+ */
+static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
 {
 	asm(LOCK_PREFIX " decw %0"
 	    : "+m" (xl->spinners) : : "memory");
-	wmb();			/* decrement count before clearing lock */
-	__get_cpu_var(lock_spinners) = NULL;
+	wmb();			/* decrement count before restoring lock */
+	__get_cpu_var(lock_spinners) = prev;
 }

 static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
 {
 	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+	struct xen_spinlock *prev;
 	int irq = __get_cpu_var(lock_kicker_irq);
 	int ret;
 	unsigned long flags;
@@ -163,7 +177,8 @@
 		return 0;

 	/* announce we're spinning */
-	spinning_lock(xl);
+	prev = spinning_lock(xl);
+
 	flags = __raw_local_save_flags();
 	if (irq_enable) {
 		ADD_STATS(taken_slow_irqenable, 1);
@@ -189,7 +204,7 @@

 out:
 	raw_local_irq_restore(flags);
-	unspinning_lock(xl);
+	unspinning_lock(xl, prev);
 	return ret;
 }

> int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
> {
> 	struct spinning *spinning;
> 	int rc;
>
> 	/* if kicker interrupt not initialized yet, just spin */
> 	if (spinlock_irq < 0)
> 		return 0;
>
> 	/* announce we're spinning */
> 	spinning = &__get_cpu_var(spinning);
> 	if (spinning->lock) {
> 		BUG_ON(!raw_irqs_disabled());
> 		spinning = &__get_cpu_var(spinning_irq);
> 		BUG_ON(spinning->lock);
> 	}
> 	spinning->ticket = token >> TICKET_SHIFT;
> 	smp_wmb();
> 	spinning->lock = lock;
>
> 	/* clear pending */
> 	xen_clear_irq_pending(spinlock_irq);
>
> 	/* check again make sure it didn't become free while
> 	   we weren't looking  */
> 	rc = __raw_spin_trylock(lock);
> 	if (!rc) {
> 		/* block until irq becomes pending */
> 		xen_poll_irq(spinlock_irq);
> 		kstat_this_cpu.irqs[spinlock_irq]++;
> 	}
>
> 	/* announce we're done */
> 	spinning->lock = NULL;
>
> 	return rc;
> }
>
>   
> On an 8-core system I'm seeing between 20,000 (x86-64) and 35,000
> (i686) wakeup interrupts per CPU. I'm not certain this still counts as rare.
> Though that number may go down a little once the hypervisor doesn't
> needlessly wake all polling vCPU-s anymore.
>   

What workload are you seeing that on?  20-35k interrupts over what time 
period?

In my tests, I only see it fall into the slow path a couple of thousand 
times per cpu for a kernbench run.

>> The main reason for ticket locks is to break the egregious unfairness 
>> that (some) bus protocols implement.  That level of fairness shouldn't 
>> be necessary here because once the cpus fall to blocking in the 
>> hypervisor, it's up to Xen to tie-break.
>>     
>
> Why? The hypervisor doing the tie-break makes it possibly even more
> unfair, whereas with tickets and a way to kick the next owner (and only
> it) almost the same level of fairness as on native can be achieved.

I don't think strict fairness is a particularly desirable property; it's 
certainly not an unambiguous win.  The important thing is solves is 
total starvation, and if the Xen scheduler ends up starving a CPU then 
that's a scheduler bug we can fix.  We can't fix the cache coherence 
protocols, so we need to use something like a ticket lock.

>  The
> only heuristic to determine by measurement is the number of spin loops
> before going into poll mode (which your original patch's description and
> implementation for some reason disagree about - the description says
> 2^16 loops, the implementation uses 2^10). Obviously, the optimal
> numbers may turn out different for byte and ticket locks.
>   

Yes, there's a bit of confusion about loop iterations vs cycles.  The 
original paper said that after 2^16 *cycles* 90% of locks have been 
taken, which I map (approximately) to 2^10 loop iterations (but 
originally I had 2^16 iterations).  I don't think it's all that 
critical; it probably depends too much on workload and precise system 
and VM configuration to really finely tune, and doesn't end up making 
all that much difference.

That said, I've implemented a pile of debugfs infrastructure for 
extracting lots of details about lock performance so there's some scope 
for tuning it (including being able to change the timeout on the fly to 
see how things change).

> And doing a wake-them-all approach isn't good here, as was
> (supposedly, I wasn't there) explained in a talk on the last summit.
>   

Well, yes, wake-all is a total disaster for ticket locks, since it just 
causes scheduler thrashing.  But for byte locks it doesn't matter all 
that much since whoever runs next will take the lock and make progress.  
The others will wake up and spin for a bit before sleeping; not great, 
but still better than plain spinning.

It might be worth using a smaller timeout for re-lock attempts after 
waking up, but that could also lead to starvation as a sleeper will be 
at a disadvantage against someone who's currently spinning on the lock, 
and so will tend to lose against new lock takers.

I think we can also mitigate poll's wake-all behaviour by seeing if our 
particular per-cpu interrupt is pending and drop back into poll 
immediately if not (ie, detect a spurious wakeup).

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-06  8:47         ` Jeremy Fitzhardinge
@ 2008-08-06  9:35           ` Jan Beulich
  2008-08-06 17:53             ` Jeremy Fitzhardinge
  2008-08-06 20:21             ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 29+ messages in thread
From: Jan Beulich @ 2008-08-06  9:35 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel, Keir Fraser

>>> Jeremy Fitzhardinge <jeremy@goop.org> 06.08.08 10:47 >>>
>Jan Beulich wrote:
>> More on that: You'll really need two per-CPU variables afaics, one for the
>> current non-irq lock being spun upon, and one for the current irqs-disabled
>> one. The latter one might not need saving/restoring as long as you don't
>> re-enable interrupts, but the code might turn out cleaner when doing the
>> save/restore regardless, e.g. for me (doing ticket locking):
>>   
>
>Not sure I follow.  How do you use the second array at the kick end of 
>the process?

I just look at both stored values.

>I ended up just storing the previous value locally, and then restoring 
>it.  It assumes that locks will strictly nest, of course, but I think 
>that's reasonable.

Storing the previous value locally is fine. But I don't think you can do with
just one 'currently spinning' pointer because of the kicking side
requirements - if an irq-save lock interrupted an non-irq one (with the
spinning pointer already set) and a remote CPU releases the lock and
wants to kick you, it won't be able to if the irq-save lock already replaced
the non-irq one. Nevertheless, if you're past the try-lock, you may end
up never getting the wakeup.

Since there can only be one non-irq and one irq-save lock a CPU is
currently spinning on (the latter as long as you don't re-enable interrupts),
two fields, otoh, are sufficient.

Btw., I also think that using an xchg() (and hence a locked transaction)
for updating the pointer isn't necessary.

>> On an 8-core system I'm seeing between 20,000 (x86-64) and 35,000
>> (i686) wakeup interrupts per CPU. I'm not certain this still counts as rare.
>> Though that number may go down a little once the hypervisor doesn't
>> needlessly wake all polling vCPU-s anymore.
>>   
>
>What workload are you seeing that on?  20-35k interrupts over what time 
>period?

Oh, sorry, I meant to say that's for a kernel build (-j12), taking about
400 wall seconds.

>In my tests, I only see it fall into the slow path a couple of thousand 
>times per cpu for a kernbench run.

Hmm, that's different then for me. Actually, I see a significant spike at
modpost stage 2, when all the .ko-s get linked. The (spinlock) interrupt
rate gets up to between 1,000 and 2,000 per CPU and second.

>That said, I've implemented a pile of debugfs infrastructure for 
>extracting lots of details about lock performance so there's some scope 
>for tuning it (including being able to change the timeout on the fly to 
>see how things change).

Yeah, that's gonna be useful to have.

>I think we can also mitigate poll's wake-all behaviour by seeing if our 
>particular per-cpu interrupt is pending and drop back into poll 
>immediately if not (ie, detect a spurious wakeup).

Oh, of course - I didn't consider this so far.

Jan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Xen spinlock questions
  2008-08-06  9:35           ` Jan Beulich
@ 2008-08-06 17:53             ` Jeremy Fitzhardinge
  2008-08-06 20:21             ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-06 17:53 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

Jan Beulich wrote:
> Storing the previous value locally is fine. But I don't think you can do with
> just one 'currently spinning' pointer because of the kicking side
> requirements - if an irq-save lock interrupted an non-irq one (with the
> spinning pointer already set) and a remote CPU releases the lock and
> wants to kick you, it won't be able to if the irq-save lock already replaced
> the non-irq one. Nevertheless, if you're past the try-lock, you may end
> up never getting the wakeup.
>
> Since there can only be one non-irq and one irq-save lock a CPU is
> currently spinning on (the latter as long as you don't re-enable interrupts),
> two fields, otoh, are sufficient.
>   

No, I think it's mostly OK.  The sequence is:

   1. mark that we're about to block
   2. clear pending state on irq
   3. test again with trylock
   4. block in poll

The worrisome interval is between 3-4, but it's only a problem if we end 
up entering the poll with the lock free and the interrupt non-pending.  
There are a few possibilities for what happens in that interval:

   1. the nested spinlock is fastpath only, and takes some other lock
      -> OK, because we're still marked as interested in this lock, and
      will be kicked
      -> if the nested spinlock takes the same lock as the outer one, it
      will end up doing a self-kick
   2. the nested spinlock is slowpath and is kicked
      -> OK, because it will leave the irq pending
   3. the nested spinlock is slowpath, but picks up the lock on retry
      -> bad, because it won't leave the irq pending.

The fix is to make sure that in case 4, it checks to see if it's 
interrupting a blocking lock (by checking if prev != NULL), and leave 
the irq pending if it is.

Updated patch below.  Compile tested only.

> Btw., I also think that using an xchg() (and hence a locked transaction)
> for updating the pointer isn't necessary.
>   

I was concerned about what would happen if an interrupt got between the 
fetch and store.  But thinking about it, I think you're right.

>>> On an 8-core system I'm seeing between 20,000 (x86-64) and 35,000
>>> (i686) wakeup interrupts per CPU. I'm not certain this still counts as rare.
>>> Though that number may go down a little once the hypervisor doesn't
>>> needlessly wake all polling vCPU-s anymore.
>>>   
>>>       
>> What workload are you seeing that on?  20-35k interrupts over what time 
>> period?
>>     
>
> Oh, sorry, I meant to say that's for a kernel build (-j12), taking about
> 400 wall seconds.
>
>   
>> In my tests, I only see it fall into the slow path a couple of thousand 
>> times per cpu for a kernbench run.
>>     
>
> Hmm, that's different then for me. Actually, I see a significant spike at
> modpost stage 2, when all the .ko-s get linked. The (spinlock) interrupt
> rate gets up to between 1,000 and 2,000 per CPU and second.
>   

defconfig? allmodconfig?

I'll measure again to confirm.

    J

diff -r 5b4b80c08799 arch/x86/xen/spinlock.c
--- a/arch/x86/xen/spinlock.c	Wed Aug 06 01:35:09 2008 -0700
+++ b/arch/x86/xen/spinlock.c	Wed Aug 06 10:51:27 2008 -0700
@@ -22,6 +22,7 @@
 	u64 taken_slow;
 	u64 taken_slow_pickup;
 	u64 taken_slow_irqenable;
+	u64 taken_slow_spurious;
 
 	u64 released;
 	u64 released_slow;
@@ -135,25 +136,41 @@
 static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
 static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
 
-static inline void spinning_lock(struct xen_spinlock *xl)
+/*
+ * Mark a cpu as interested in a lock.  Returns the CPU's previous
+ * lock of interest, in case we got preempted by an interrupt.
+ */
+static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
 {
+	struct xen_spinlock *prev;
+
+	prev = __get_cpu_var(lock_spinners);
 	__get_cpu_var(lock_spinners) = xl;
+
 	wmb();			/* set lock of interest before count */
+
 	asm(LOCK_PREFIX " incw %0"
 	    : "+m" (xl->spinners) : : "memory");
+
+	return prev;
 }
 
-static inline void unspinning_lock(struct xen_spinlock *xl)
+/*
+ * Mark a cpu as no longer interested in a lock.  Restores previous
+ * lock of interest (NULL for none).
+ */
+static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
 {
 	asm(LOCK_PREFIX " decw %0"
 	    : "+m" (xl->spinners) : : "memory");
-	wmb();			/* decrement count before clearing lock */
-	__get_cpu_var(lock_spinners) = NULL;
+	wmb();			/* decrement count before restoring lock */
+	__get_cpu_var(lock_spinners) = prev;
 }
 
 static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
 {
 	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+	struct xen_spinlock *prev;
 	int irq = __get_cpu_var(lock_kicker_irq);
 	int ret;
 	unsigned long flags;
@@ -163,33 +180,56 @@
 		return 0;
 
 	/* announce we're spinning */
-	spinning_lock(xl);
+	prev = spinning_lock(xl);
+
 	flags = __raw_local_save_flags();
 	if (irq_enable) {
 		ADD_STATS(taken_slow_irqenable, 1);
 		raw_local_irq_enable();
 	}
 
-	/* clear pending */
-	xen_clear_irq_pending(irq);
+	ADD_STATS(taken_slow, 1);
+		
+	do {
+		/* clear pending */
+		xen_clear_irq_pending(irq);
 
-	ADD_STATS(taken_slow, 1);
+		/* check again make sure it didn't become free while
+		   we weren't looking  */
+		ret = xen_spin_trylock(lock);
+		if (ret) {
+			ADD_STATS(taken_slow_pickup, 1);
+			
+			/*
+			 * If we interrupted another spinlock while it
+			 * was blocking, make sure it doesn't block
+			 * without rechecking the lock.
+			 */
+			if (prev != NULL)
+				xen_set_irq_pending(irq);
+			goto out;
+		}
 
-	/* check again make sure it didn't become free while
-	   we weren't looking  */
-	ret = xen_spin_trylock(lock);
-	if (ret) {
-		ADD_STATS(taken_slow_pickup, 1);
-		goto out;
-	}
+		/* 
+		 * Block until irq becomes pending.  If we're
+		 * interrupted at this point (after the trylock but
+		 * before entering the block), then the nested lock
+		 * handler guarantees that the irq will be left
+		 * pending if there's any chance the lock became free;
+		 * xen_poll_irq() returns immediately if the irq is
+		 * pending.
+		 */
+		xen_poll_irq(irq);
+		kstat_this_cpu.irqs[irq]++;
+		ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
+	} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
 
-	/* block until irq becomes pending */
-	xen_poll_irq(irq);
-	kstat_this_cpu.irqs[irq]++;
+	/* Leave the irq pending so that any interrupted blocker will
+	   recheck. */
 
 out:
 	raw_local_irq_restore(flags);
-	unspinning_lock(xl);
+	unspinning_lock(xl, prev);
 	return ret;
 }
 
@@ -333,6 +373,8 @@
 			   &spinlock_stats.taken_slow_pickup);
 	debugfs_create_u64("taken_slow_irqenable", 0444, d_spin_debug,
 			   &spinlock_stats.taken_slow_irqenable);
+	debugfs_create_u64("taken_slow_spurious", 0444, d_spin_debug,
+			   &spinlock_stats.taken_slow_spurious);
 
 	debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
 	debugfs_create_u64("released_slow", 0444, d_spin_debug,
diff -r 5b4b80c08799 drivers/xen/events.c
--- a/drivers/xen/events.c	Wed Aug 06 01:35:09 2008 -0700
+++ b/drivers/xen/events.c	Wed Aug 06 10:51:27 2008 -0700
@@ -162,6 +162,12 @@
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline int test_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	return sync_test_bit(port, &s->evtchn_pending[0]);
 }
 
 
@@ -748,6 +754,25 @@
 		clear_evtchn(evtchn);
 }
 
+void xen_set_irq_pending(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		set_evtchn(evtchn);
+}
+
+bool xen_test_irq_pending(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+	bool ret = false;
+
+	if (VALID_EVTCHN(evtchn))
+		ret = test_evtchn(evtchn);
+
+	return ret;
+}
+
 /* Poll waiting for an irq to become pending.  In the usual case, the
    irq will be disabled so it won't deliver an interrupt. */
 void xen_poll_irq(int irq)
diff -r 5b4b80c08799 include/xen/events.h
--- a/include/xen/events.h	Wed Aug 06 01:35:09 2008 -0700
+++ b/include/xen/events.h	Wed Aug 06 10:51:27 2008 -0700
@@ -47,6 +47,8 @@
 
 /* Clear an irq's pending state, in preparation for polling on it */
 void xen_clear_irq_pending(int irq);
+void xen_set_irq_pending(int irq);
+bool xen_test_irq_pending(int irq);
 
 /* Poll waiting for an irq to become pending.  In the usual case, the
    irq will be disabled so it won't deliver an interrupt. */

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: Xen spinlock questions
  2008-08-06  9:35           ` Jan Beulich
  2008-08-06 17:53             ` Jeremy Fitzhardinge
@ 2008-08-06 20:21             ` Jeremy Fitzhardinge
  2008-08-07  7:21               ` Jan Beulich
  1 sibling, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-06 20:21 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

Jan Beulich wrote:
>> In my tests, I only see it fall into the slow path a couple of thousand 
>> times per cpu for a kernbench run.
>>     
>
> Hmm, that's different then for me. Actually, I see a significant spike at
> modpost stage 2, when all the .ko-s get linked. The (spinlock) interrupt
> rate gets up to between 1,000 and 2,000 per CPU and second.
>   

BTW, how much contention for CPUs was there?

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: Xen spinlock questions
  2008-08-06 20:21             ` Jeremy Fitzhardinge
@ 2008-08-07  7:21               ` Jan Beulich
  2008-08-07 15:47                 ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2008-08-07  7:21 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel, Keir Fraser

>>> Jeremy Fitzhardinge <jeremy@goop.org> 06.08.08 22:21 >>>
>Jan Beulich wrote:
>>> In my tests, I only see it fall into the slow path a couple of thousand 
>>> times per cpu for a kernbench run.
>>>     
>>
>> Hmm, that's different then for me. Actually, I see a significant spike at
>> modpost stage 2, when all the .ko-s get linked. The (spinlock) interrupt
>> rate gets up to between 1,000 and 2,000 per CPU and second.
>>   
>
>BTW, how much contention for CPUs was there?

In Xen or the kernel? So far these are all Dom0-only measurements, so
in Xen there should be no contention.

To answer your other mail's question: It's defconfig builds I'm using.

Jan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: Xen spinlock questions
  2008-08-07  7:21               ` Jan Beulich
@ 2008-08-07 15:47                 ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-07 15:47 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

Jan Beulich wrote:
>>>> Jeremy Fitzhardinge <jeremy@goop.org> 06.08.08 22:21 >>>
>>>>         
>> Jan Beulich wrote:
>>     
>>>> In my tests, I only see it fall into the slow path a couple of thousand 
>>>> times per cpu for a kernbench run.
>>>>     
>>>>         
>>> Hmm, that's different then for me. Actually, I see a significant spike at
>>> modpost stage 2, when all the .ko-s get linked. The (spinlock) interrupt
>>> rate gets up to between 1,000 and 2,000 per CPU and second.
>>>   
>>>       
>> BTW, how much contention for CPUs was there?
>>     
>
> In Xen or the kernel? So far these are all Dom0-only measurements, so
> in Xen there should be no contention.
>   

OK.  The spinlock results are most interesting when there's contention 
for physical cpus between domains.

> To answer your other mail's question: It's defconfig builds I'm using.
>   
[SuSE defconfigs]  Presumably that has a lot enabled as modules.

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2008-08-18 10:01 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-08-04 10:18 Xen spinlock questions Jan Beulich
2008-08-04 10:24 ` Keir Fraser
2008-08-11 12:22   ` Jan Beulich
2008-08-11 12:25     ` Keir Fraser
2008-08-11 14:41     ` Keir Fraser
2008-08-11 18:11       ` Jeremy Fitzhardinge
2008-08-11 18:31         ` Keir Fraser
2008-08-11 18:49           ` Jeremy Fitzhardinge
2008-08-12  9:00             ` Keir Fraser
2008-08-12 16:33               ` Jeremy Fitzhardinge
2008-08-12 17:00                 ` Keir Fraser
2008-08-15 12:15                   ` Jan Beulich
2008-08-15 13:01                     ` Keir Fraser
2008-08-15 14:06                       ` Jan Beulich
2008-08-18 10:01                         ` Keir Fraser
2008-08-15 17:12                       ` Jeremy Fitzhardinge
2008-08-11 18:10     ` Jeremy Fitzhardinge
2008-08-13  7:17       ` Jan Beulich
2008-08-04 19:36 ` Jeremy Fitzhardinge
2008-08-05  6:32   ` Jan Beulich
2008-08-05  7:33     ` Keir Fraser
2008-08-05 18:09     ` Jeremy Fitzhardinge
2008-08-06  7:15       ` Jan Beulich
2008-08-06  8:47         ` Jeremy Fitzhardinge
2008-08-06  9:35           ` Jan Beulich
2008-08-06 17:53             ` Jeremy Fitzhardinge
2008-08-06 20:21             ` Jeremy Fitzhardinge
2008-08-07  7:21               ` Jan Beulich
2008-08-07 15:47                 ` Jeremy Fitzhardinge

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.