[RFC PATCH 2/3] Xen: rework NR_EVENT_CHANNELS related stuffs.

xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH 2/3] Xen: rework NR_EVENT_CHANNELS related stuffs.
@ 2012-12-31 18:37 Wei Liu
  2012-12-31 18:37 ` [RFC PATCH 3/3] Xen: implement 3-level event channel routines Wei Liu
  0 siblings, 1 reply; 4+ messages in thread
From: Wei Liu @ 2012-12-31 18:37 UTC (permalink / raw)
  To: xen-devel; +Cc: Wei Liu


Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c        |   44 +++++++++++++++++++++++++++++--------------
 drivers/xen/evtchn.c        |   16 +++++++++-------
 include/xen/events.h        |    3 +++
 include/xen/interface/xen.h |   17 ++++++++++++++++-
 4 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 835101f..f60ba76 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -52,7 +52,8 @@
 #include <xen/interface/hvm/params.h>
 
 /* N-level event channel, starting from 2 */
-static unsigned int evtchn_level = 2;
+unsigned int evtchn_level = 2;
+EXPORT_SYMBOL_GPL(evtchn_level);
 
 struct evtchn_ops {
 	unsigned long (*active_evtchns)(unsigned int,
@@ -130,8 +131,7 @@ static int *evtchn_to_irq;
 static unsigned long *pirq_eoi_map;
 static bool (*pirq_needs_eoi)(unsigned irq);
 
-static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG],
-		      cpu_evtchn_mask);
+static DEFINE_PER_CPU(unsigned long *, cpu_evtchn_mask);
 
 /* Xen will never allocate port zero for any purpose. */
 #define VALID_EVTCHN(chn)	((chn) != 0)
@@ -913,7 +913,7 @@ static int find_virq(unsigned int virq, unsigned int cpu)
 	int port, rc = -ENOENT;
 
 	memset(&status, 0, sizeof(status));
-	for (port = 0; port <= NR_EVENT_CHANNELS; port++) {
+	for (port = 0; port <= NR_EVENT_CHANNELS(evtchn_level); port++) {
 		status.dom = DOMID_SELF;
 		status.port = port;
 		rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
@@ -1138,7 +1138,7 @@ int evtchn_get(unsigned int evtchn)
 	struct irq_info *info;
 	int err = -ENOENT;
 
-	if (evtchn >= NR_EVENT_CHANNELS)
+	if (evtchn >= NR_EVENT_CHANNELS(evtchn_level))
 		return -EINVAL;
 
 	mutex_lock(&irq_mapping_update_lock);
@@ -1227,7 +1227,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 		       i % 8 == 0 ? "\n   " : " ");
 
 	printk("\nlocal cpu%d mask:\n   ", cpu);
-	for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
+	for (i = (NR_EVENT_CHANNELS(evtchn_level)/BITS_PER_LONG)-1; i >= 0; i--)
 		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
 		       cpu_evtchn[i],
 		       i % 8 == 0 ? "\n   " : " ");
@@ -1242,7 +1242,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	}
 
 	printk("\npending list:\n");
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
 		if (sync_test_bit(i, sh->evtchn_pending)) {
 			int word_idx = i / BITS_PER_LONG;
 			printk("  %d: event %d -> irq %d%s%s%s\n",
@@ -1709,14 +1709,14 @@ void xen_irq_resume(void)
 	init_evtchn_cpu_bindings();
 
 	/* New event-channel space is not 'live' yet. */
-	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS(evtchn_level); evtchn++)
 		eops->mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
 	list_for_each_entry(info, &xen_irq_list_head, list)
 		info->evtchn = 0; /* zap event-channel binding */
 
-	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS(evtchn_level); evtchn++)
 		evtchn_to_irq[evtchn] = -1;
 
 	for_each_possible_cpu(cpu) {
@@ -1824,21 +1824,37 @@ static struct evtchn_ops evtchn_ops_l2 __read_mostly = {
 void __init xen_init_IRQ(void)
 {
 	int i, rc;
+	int cpu;
 
-	evtchn_level = 2;
+	/* Setup 2-level event channel */
 	eops = &evtchn_ops_l2;
+	evtchn_level = 2;
 
-	/* Setup 2-level event channel */
-	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
+	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS(evtchn_level),
+				sizeof(*evtchn_to_irq),
 				GFP_KERNEL);
 	BUG_ON(!evtchn_to_irq);
-	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+
+	for_each_possible_cpu(cpu) {
+		void *p;
+		unsigned int nr = NR_EVENT_CHANNELS(evtchn_level)/BITS_PER_LONG;
+		p = kzalloc_node(sizeof(unsigned long) * nr,
+				 GFP_KERNEL,
+				 cpu_to_node(cpu));
+		if (!p)
+			p = kzalloc(sizeof(unsigned long) * nr,
+				    GFP_KERNEL);
+		BUG_ON(!p);
+		per_cpu(cpu_evtchn_mask, cpu) = p;
+	}
+
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++)
 		evtchn_to_irq[i] = -1;
 
 	init_evtchn_cpu_bindings();
 
 	/* No event channels are 'live' right now. */
-	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++)
 		eops->mask_evtchn(i);
 
 	pirq_needs_eoi = pirq_needs_eoi_flag;
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index b1f60a0..cb45ecf 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -232,7 +232,7 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
 	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
 		unsigned port = kbuf[i];
 
-		if (port < NR_EVENT_CHANNELS &&
+		if (port < NR_EVENT_CHANNELS(evtchn_level) &&
 		    get_port_user(port) == u &&
 		    !get_port_enabled(port)) {
 			set_port_enabled(port, true);
@@ -364,7 +364,7 @@ static long evtchn_ioctl(struct file *file,
 			break;
 
 		rc = -EINVAL;
-		if (unbind.port >= NR_EVENT_CHANNELS)
+		if (unbind.port >= NR_EVENT_CHANNELS(evtchn_level))
 			break;
 
 		spin_lock_irq(&port_user_lock);
@@ -392,7 +392,7 @@ static long evtchn_ioctl(struct file *file,
 		if (copy_from_user(&notify, uarg, sizeof(notify)))
 			break;
 
-		if (notify.port >= NR_EVENT_CHANNELS) {
+		if (notify.port >= NR_EVENT_CHANNELS(evtchn_level)) {
 			rc = -EINVAL;
 		} else if (get_port_user(notify.port) != u) {
 			rc = -ENOTCONN;
@@ -482,7 +482,7 @@ static int evtchn_release(struct inode *inode, struct file *filp)
 
 	free_page((unsigned long)u->ring);
 
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
 		if (get_port_user(i) != u)
 			continue;
 
@@ -491,7 +491,7 @@ static int evtchn_release(struct inode *inode, struct file *filp)
 
 	spin_unlock_irq(&port_user_lock);
 
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
 		if (get_port_user(i) != u)
 			continue;
 
@@ -528,7 +528,8 @@ static int __init evtchn_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
+	port_user = kcalloc(NR_EVENT_CHANNELS(evtchn_level),
+			    sizeof(*port_user), GFP_KERNEL);
 	if (port_user == NULL)
 		return -ENOMEM;
 
@@ -541,7 +542,8 @@ static int __init evtchn_init(void)
 		return err;
 	}
 
-	printk(KERN_INFO "Event-channel device installed.\n");
+	printk(KERN_INFO "Event-channel device installed."
+	       " Event-channel level: %d\n", evtchn_level);
 
 	return 0;
 }
diff --git a/include/xen/events.h b/include/xen/events.h
index 04399b2..bc10f22 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -109,4 +109,7 @@ int xen_irq_from_gsi(unsigned gsi);
 /* Determine whether to ignore this IRQ if it is passed to a guest. */
 int xen_test_irq_shared(int irq);
 
+/* N-level event channels */
+extern unsigned int evtchn_level;
+
 #endif	/* _XEN_EVENTS_H */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index a890804..c66e1ff 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -283,9 +283,24 @@ DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
 
 /*
  * Event channel endpoints per domain:
+ * 2-level:
  *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
+ * 3-level:
+ *  32k if a long is 32 bits; 256k if a long is 64 bits.
  */
-#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(unsigned long))
+#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;	\
+	switch (x) {					\
+	case 2:						\
+		__v = NR_EVENT_CHANNELS_L2; break;	\
+	case 3:						\
+		__v = NR_EVENT_CHANNELS_L3; break;	\
+	default:					\
+		BUG();					\
+	}						\
+	__v; })
+
 
 struct vcpu_time_info {
 	/*
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [RFC PATCH 3/3] Xen: implement 3-level event channel routines.
  2012-12-31 18:37 [RFC PATCH 2/3] Xen: rework NR_EVENT_CHANNELS related stuffs Wei Liu
@ 2012-12-31 18:37 ` Wei Liu
  0 siblings, 0 replies; 4+ messages in thread
From: Wei Liu @ 2012-12-31 18:37 UTC (permalink / raw)
  To: xen-devel; +Cc: Wei Liu


Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 arch/x86/xen/enlighten.c              |    7 +
 drivers/xen/events.c                  |  419 +++++++++++++++++++++++++++++++--
 include/xen/events.h                  |    2 +
 include/xen/interface/event_channel.h |   24 ++
 include/xen/interface/xen.h           |    2 +-
 5 files changed, 437 insertions(+), 17 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bc893e7..f471881 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -43,6 +43,7 @@
 #include <xen/hvm.h>
 #include <xen/hvc-console.h>
 #include <xen/acpi.h>
+#include <xen/events.h>
 
 #include <asm/paravirt.h>
 #include <asm/apic.h>
@@ -195,6 +196,9 @@ void xen_vcpu_restore(void)
 		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
 			BUG();
 	}
+
+	if (evtchn_level_param == 3)
+		xen_event_channel_setup_3level();
 }
 
 static void __init xen_banner(void)
@@ -1028,6 +1032,9 @@ void xen_setup_vcpu_info_placement(void)
 	for_each_possible_cpu(cpu)
 		xen_vcpu_setup(cpu);
 
+	if (evtchn_level_param == 3)
+		xen_event_channel_setup_3level();
+
 	/* xen_vcpu_setup managed to place the vcpu_info within the
 	   percpu area for all cpus, so make use of it */
 	if (have_vcpu_info_placement) {
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index f60ba76..adb94e9 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -52,9 +52,15 @@
 #include <xen/interface/hvm/params.h>
 
 /* N-level event channel, starting from 2 */
+unsigned int evtchn_level_param = -1;
 unsigned int evtchn_level = 2;
 EXPORT_SYMBOL_GPL(evtchn_level);
 
+/* 3-level event channel */
+DEFINE_PER_CPU(unsigned long [sizeof(unsigned long)*8], evtchn_sel_l2);
+unsigned long evtchn_pending[NR_EVENT_CHANNELS_L3/BITS_PER_LONG] __page_aligned_bss;
+unsigned long evtchn_mask[NR_EVENT_CHANNELS_L3/BITS_PER_LONG] __page_aligned_bss;
+
 struct evtchn_ops {
 	unsigned long (*active_evtchns)(unsigned int,
 					struct shared_info*, unsigned int);
@@ -142,6 +148,29 @@ static struct irq_chip xen_pirq_chip;
 static void enable_dynirq(struct irq_data *data);
 static void disable_dynirq(struct irq_data *data);
 
+static int __init parse_evtchn_level(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strcmp(arg, "3") == 0)
+		evtchn_level_param = 3;
+
+	return 0;
+}
+early_param("evtchn_level", parse_evtchn_level);
+
+static inline int __is_masked_l2(int chn)
+{
+	struct shared_info *sh = HYPERVISOR_shared_info;
+	return sync_test_and_set_bit(chn, sh->evtchn_mask);
+}
+
+static inline int __is_masked_l3(int chn)
+{
+	return sync_test_and_set_bit(chn, evtchn_mask);
+}
+
 /* Get info for IRQ */
 static struct irq_info *info_for_irq(unsigned irq)
 {
@@ -311,6 +340,15 @@ static inline unsigned long __active_evtchns_l2(unsigned int cpu,
 		~sh->evtchn_mask[idx];
 }
 
+static inline unsigned long __active_evtchns_l3(unsigned int cpu,
+						struct shared_info *sh,
+						unsigned int idx)
+{
+	return evtchn_pending[idx] &
+		per_cpu(cpu_evtchn_mask, cpu)[idx] &
+		~evtchn_mask[idx];
+}
+
 static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 {
 	int irq = evtchn_to_irq[chn];
@@ -351,18 +389,33 @@ static inline void __clear_evtchn_l2(int port)
 	sync_clear_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline void __clear_evtchn_l3(int port)
+{
+	sync_clear_bit(port, &evtchn_pending[0]);
+}
+
 static inline void __set_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	sync_set_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline void __set_evtchn_l3(int port)
+{
+	sync_set_bit(port, &evtchn_pending[0]);
+}
+
 static inline int __test_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	return sync_test_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline int __test_evtchn_l3(int port)
+{
+	return sync_test_bit(port, &evtchn_pending[0]);
+}
+
 /**
  * notify_remote_via_irq - send event to remote end of event channel via irq
  * @irq: irq of event channel to send event to
@@ -386,6 +439,11 @@ static void __mask_evtchn_l2(int port)
 	sync_set_bit(port, &s->evtchn_mask[0]);
 }
 
+static void __mask_evtchn_l3(int port)
+{
+	sync_set_bit(port, &evtchn_mask[0]);
+}
+
 static void __unmask_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
@@ -416,6 +474,36 @@ static void __unmask_evtchn_l2(int port)
 	put_cpu();
 }
 
+static void __unmask_evtchn_l3(int port)
+{
+	unsigned int cpu = get_cpu();
+	int l1cb = BITS_PER_LONG * BITS_PER_LONG;
+	int l2cb = BITS_PER_LONG;
+
+	if (unlikely(cpu != cpu_from_evtchn(port))) {
+		struct evtchn_unmask unmask = { .port = port };
+		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+	} else {
+		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+
+		sync_clear_bit(port, &evtchn_mask[0]);
+
+		/*
+		 * The following is basically the equivalent of
+		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+		 * the interrupt edge' if the channel is masked.
+		 */
+		if (sync_test_bit(port, &evtchn_pending[0]) &&
+		    !sync_test_and_set_bit(port / l2cb,
+					   &per_cpu(evtchn_sel_l2, cpu)[0]) &&
+		    !sync_test_and_set_bit(port / l1cb,
+					   &vcpu_info->evtchn_pending_sel))
+			vcpu_info->evtchn_upcall_pending = 1;
+	}
+
+	put_cpu();
+}
+
 static void xen_irq_init(unsigned irq)
 {
 	struct irq_info *info;
@@ -1181,6 +1269,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
 	notify_remote_via_irq(irq);
 }
 
+static DEFINE_SPINLOCK(debug_lock);
 static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 {
 	struct shared_info *sh = HYPERVISOR_shared_info;
@@ -1188,7 +1277,6 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
 	int i;
 	unsigned long flags;
-	static DEFINE_SPINLOCK(debug_lock);
 	struct vcpu_info *v;
 
 	spin_lock_irqsave(&debug_lock, flags);
@@ -1196,13 +1284,13 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	printk("\nvcpu %d\n  ", cpu);
 
 	for_each_online_cpu(i) {
-		int pending;
+		int masked;
 		v = per_cpu(xen_vcpu, i);
-		pending = (get_irq_regs() && i == cpu)
+		masked = (get_irq_regs() && i == cpu)
 			? xen_irqs_disabled(get_irq_regs())
 			: v->evtchn_upcall_mask;
 		printk("%d: masked=%d pending=%d event_sel %0*lx\n  ", i,
-		       pending, v->evtchn_upcall_pending,
+		       masked, v->evtchn_upcall_pending,
 		       (int)(sizeof(v->evtchn_pending_sel)*2),
 		       v->evtchn_pending_sel);
 	}
@@ -1227,7 +1315,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 		       i % 8 == 0 ? "\n   " : " ");
 
 	printk("\nlocal cpu%d mask:\n   ", cpu);
-	for (i = (NR_EVENT_CHANNELS(evtchn_level)/BITS_PER_LONG)-1; i >= 0; i--)
+	for (i = (NR_EVENT_CHANNELS(2)/BITS_PER_LONG)-1; i >= 0; i--)
 		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
 		       cpu_evtchn[i],
 		       i % 8 == 0 ? "\n   " : " ");
@@ -1242,7 +1330,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	}
 
 	printk("\npending list:\n");
-	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(2); i++) {
 		if (sync_test_bit(i, sh->evtchn_pending)) {
 			int word_idx = i / BITS_PER_LONG;
 			printk("  %d: event %d -> irq %d%s%s%s\n",
@@ -1262,15 +1350,110 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t __xen_debug_interrupt_l3(int irq, void *dev_id)
+{
+	int cpu = smp_processor_id();
+	unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
+	int i, j;
+	unsigned long flags;
+	struct vcpu_info *v;
+
+	spin_lock_irqsave(&debug_lock, flags);
+
+	printk("\nvcpu %d\n  ", cpu);
+
+	for_each_online_cpu(i) {
+		int masked;
+
+		v = per_cpu(xen_vcpu, i);
+		masked = (get_irq_regs() && i == cpu)
+			? xen_irqs_disabled(get_irq_regs())
+			: v->evtchn_upcall_mask;
+		printk("%d: masked=%d pending=%d event_sel_l1 %0*lx\n  ", i,
+		       masked, v->evtchn_upcall_pending,
+		       (int)(sizeof(v->evtchn_pending_sel)*2),
+		       v->evtchn_pending_sel);
+
+		printk("\nevtchn_sel_l2:\n   ");
+		for (j = (sizeof(unsigned long)*8)-1; j >= 0; j--)
+			printk("%0*lx%s",
+			       (int)(sizeof(evtchn_sel_l2[0])*2),
+			       per_cpu(evtchn_sel_l2, i)[j],
+			       j % 8 == 0 ? "\n   " : " ");
+	}
+
+	v = per_cpu(xen_vcpu, cpu);
+
+	printk("\npending:\n   ");
+	for (i = ARRAY_SIZE(evtchn_pending)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_pending[0])*2),
+		       evtchn_pending[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nglobal mask:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nglobally unmasked:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       evtchn_pending[i] & ~evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocal cpu%d mask:\n   ", cpu);
+	for (i = (NR_EVENT_CHANNELS(3)/BITS_PER_LONG)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
+		       cpu_evtchn[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocally unmasked:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--) {
+		unsigned long pending = evtchn_pending[i]
+			& ~evtchn_mask[i]
+			& cpu_evtchn[i];
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       pending, i % 8 == 0 ? "\n   " : " ");
+	}
+
+	printk("\npending list:\n");
+	for (i = 0; i < NR_EVENT_CHANNELS(3); i++) {
+		if (sync_test_bit(i, evtchn_pending)) {
+			int word_idx_l1 = i / (BITS_PER_LONG * BITS_PER_LONG);
+			int word_idx_l2 = i / BITS_PER_LONG;
+			printk("  %d: event %d -> irq %d%s%s%s%s\n",
+			       cpu_from_evtchn(i), i,
+			       evtchn_to_irq[i],
+			       sync_test_bit(word_idx_l1, &v->evtchn_pending_sel)
+					     ? "" : " l1-clear",
+			       sync_test_bit(word_idx_l2, per_cpu(evtchn_sel_l2, cpu))
+					     ? "" : " l2-clear",
+			       !sync_test_bit(i, evtchn_mask)
+					     ? "" : " globally-masked",
+			       sync_test_bit(i, cpu_evtchn)
+					     ? "" : " locally-masked");
+		}
+	}
+
+	spin_unlock_irqrestore(&debug_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 {
 	return eops->xen_debug_interrupt(irq, dev_id);
 }
 
 static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
+/* 2-level event channel does not use current_word_idx_l2 */
 static DEFINE_PER_CPU(unsigned int, current_word_idx);
+static DEFINE_PER_CPU(unsigned int, current_word_idx_l2);
 static DEFINE_PER_CPU(unsigned int, current_bit_idx);
 
+
 /*
  * Mask out the i least significant bits of w
  */
@@ -1303,7 +1486,8 @@ static void __xen_evtchn_do_upcall_l2(void)
 		if (__this_cpu_inc_return(xed_nesting_count) - 1)
 			goto out;
 
-#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+#ifndef CONFIG_X86
+		/* No need for a barrier -- XCHG is a barrier on x86. */
 		/* Clear master flag /before/ clearing selector flag. */
 		wmb();
 #endif
@@ -1392,6 +1576,155 @@ out:
 	put_cpu();
 }
 
+/*
+ * 3-level upcall: scan the L1 selector (vcpu_info->evtchn_pending_sel), then
+ * the per-cpu evtchn_sel_l2 word chosen by each L1 bit, then the active ports.
+ */
+void __xen_evtchn_do_upcall_l3(void)
+{
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+	unsigned count;
+	int start_word_idx_l1, start_word_idx_l2, start_bit_idx;
+	int word_idx_l1, word_idx_l2, bit_idx;
+	int i, j;
+	const unsigned long l1cb = BITS_PER_LONG * BITS_PER_LONG;
+	const unsigned long l2cb = BITS_PER_LONG;
+	int cpu = get_cpu();
+
+	do {
+		unsigned long pending_words_l1;
+
+		vcpu_info->evtchn_upcall_pending = 0;
+
+		if (__this_cpu_inc_return(xed_nesting_count) - 1)
+			goto out;
+#ifndef CONFIG_X86
+		/* No need for a barrier -- XCHG is a barrier on x86. */
+		/* Clear master flag /before/ clearing selector flag. */
+		wmb();
+#endif
+		/* Fetch and clear the L1 pending selector in one step. */
+		pending_words_l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
+
+		start_word_idx_l1 = __this_cpu_read(current_word_idx);
+		start_word_idx_l2 = __this_cpu_read(current_word_idx_l2);
+		start_bit_idx = __this_cpu_read(current_bit_idx);
+
+		word_idx_l1 = start_word_idx_l1;
+
+		/* loop through l1, try to pick up l2 */
+		for (i = 0; pending_words_l1 != 0; i++) {
+			unsigned long words_l1;
+			unsigned long pending_words_l2;
+
+			words_l1 = MASK_LSBS(pending_words_l1, word_idx_l1);
+
+			if (words_l1 == 0) {
+				word_idx_l1 = 0;
+				start_word_idx_l2 = 0;
+				continue;
+			}
+
+			word_idx_l1 = __ffs(words_l1);
+
+			/* evtchn_sel_l2 has one word per L1 bit; index it by L1 bit */
+			pending_words_l2 =
+				xchg(&per_cpu(evtchn_sel_l2, cpu)[word_idx_l1], 0);
+
+			word_idx_l2 = 0;
+			if (word_idx_l1 == start_word_idx_l1) {
+				if (i == 0)
+					word_idx_l2 = start_word_idx_l2;
+				else
+					word_idx_l2 &= (1UL << start_word_idx_l2) - 1;
+			}
+
+			for (j = 0; pending_words_l2 != 0; j++) {
+				unsigned long pending_bits;
+				unsigned long words_l2;
+				unsigned long idx;
+
+				words_l2 = MASK_LSBS(pending_words_l2,
+						     word_idx_l2);
+
+				if (words_l2 == 0) {
+					word_idx_l2 = 0;
+					bit_idx = 0;
+					continue;
+				}
+
+				word_idx_l2 = __ffs(words_l2);
+
+				idx = word_idx_l1*BITS_PER_LONG+word_idx_l2;
+				pending_bits =
+					eops->active_evtchns(cpu, NULL, idx);
+
+				bit_idx = 0;
+				if (word_idx_l2 == start_word_idx_l2) {
+					if (j == 0)
+						bit_idx = start_bit_idx;
+					else
+						bit_idx &= (1UL<<start_bit_idx)-1;
+				}
+
+				/* process port */
+				do {
+					unsigned long bits;
+					int port, irq;
+					struct irq_desc *desc;
+
+					bits = MASK_LSBS(pending_bits, bit_idx);
+
+					if (bits == 0)
+						break;
+
+					bit_idx = __ffs(bits);
+
+					port = word_idx_l1 * l1cb +
+						word_idx_l2 * l2cb +
+						bit_idx;
+
+					irq = evtchn_to_irq[port];
+
+					if (irq != -1) {
+						desc = irq_to_desc(irq);
+						if (desc)
+							generic_handle_irq_desc(irq, desc);
+					}
+
+					bit_idx = (bit_idx + 1) % BITS_PER_LONG;
+
+					/* Next caller resumes after this port; carry upwards on wrap. */
+					__this_cpu_write(current_bit_idx, bit_idx);
+					__this_cpu_write(current_word_idx_l2,
+							 bit_idx ? word_idx_l2 :
+							 (word_idx_l2+1) % BITS_PER_LONG);
+					__this_cpu_write(current_word_idx,
+							 (bit_idx || word_idx_l2 + 1 < BITS_PER_LONG) ?
+							 word_idx_l1 : (word_idx_l1+1) % BITS_PER_LONG);
+				} while (bit_idx != 0);
+
+				if ((word_idx_l2 != start_word_idx_l2) || (j != 0))
+					pending_words_l2 &= ~(1UL << word_idx_l2);
+
+				word_idx_l2 = (word_idx_l2 + 1) % BITS_PER_LONG;
+			}
+
+			if ((word_idx_l1 != start_word_idx_l1) || (i != 0))
+				pending_words_l1 &= ~(1UL << word_idx_l1);
+
+			word_idx_l1 = (word_idx_l1 + 1) % BITS_PER_LONG;
+		}
+
+		BUG_ON(!irqs_disabled());
+		count = __this_cpu_read(xed_nesting_count);
+		__this_cpu_write(xed_nesting_count, 0);
+	} while (count != 1 || vcpu_info->evtchn_upcall_pending);
+
+out:
+	put_cpu();
+}
+
 void xen_evtchn_do_upcall(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
@@ -1525,12 +1858,6 @@ static void mask_ack_dynirq(struct irq_data *data)
 	ack_dynirq(data);
 }
 
-static inline int __is_masked_l2(int chn)
-{
-	struct shared_info *sh = HYPERVISOR_shared_info;
-	return sync_test_and_set_bit(chn, sh->evtchn_mask);
-}
-
 static int retrigger_dynirq(struct irq_data *data)
 {
 	int evtchn = evtchn_from_irq(data->irq);
@@ -1821,14 +2148,74 @@ static struct evtchn_ops evtchn_ops_l2 __read_mostly = {
 	.xen_debug_interrupt = __xen_debug_interrupt_l2,
 };
 
+static struct evtchn_ops evtchn_ops_l3 __read_mostly = {
+	.active_evtchns = __active_evtchns_l3,
+	.clear_evtchn = __clear_evtchn_l3,
+	.set_evtchn = __set_evtchn_l3,
+	.test_evtchn = __test_evtchn_l3,
+	.mask_evtchn = __mask_evtchn_l3,
+	.unmask_evtchn = __unmask_evtchn_l3,
+	.is_masked = __is_masked_l3,
+	.xen_evtchn_do_upcall = __xen_evtchn_do_upcall_l3,
+	.xen_debug_interrupt = __xen_debug_interrupt_l3,
+};
+
+int xen_event_channel_setup_3level(void)
+{
+	evtchn_register_nlevel_t reg;
+	int i, nr_pages, cpu;
+	unsigned long mfns[nr_cpu_ids];
+	unsigned long offsets[nr_cpu_ids];
+	int rc = -EINVAL;
+
+	memset(&reg, 0, sizeof(reg));
+
+	reg.level = 3;
+	nr_pages = (sizeof(unsigned long) == 4 ? 1 : 8);
+
+	for (i = 0; i < nr_pages; i++) {
+		unsigned long offset = PAGE_SIZE * i;
+		reg.u.l3.evtchn_pending[i] =
+			arbitrary_virt_to_mfn(
+				(void *)((unsigned long)evtchn_pending+offset));
+		reg.u.l3.evtchn_mask[i] =
+			arbitrary_virt_to_mfn(
+				(void *)((unsigned long)evtchn_mask+offset));
+	}
+
+	reg.u.l3.l2sel_mfn = mfns;
+	reg.u.l3.l2sel_offset = offsets;
+	reg.u.l3.nr_vcpus = nr_cpu_ids;
+
+	for_each_possible_cpu(cpu) {
+		reg.u.l3.l2sel_mfn[cpu] =
+			arbitrary_virt_to_mfn(&per_cpu(evtchn_sel_l2, cpu));
+		reg.u.l3.l2sel_offset[cpu] =
+			offset_in_page(&per_cpu(evtchn_sel_l2, cpu));
+	}
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_register_nlevel, &reg);
+
+	if (rc == 0)
+		evtchn_level = 3;
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xen_event_channel_setup_3level);
+
 void __init xen_init_IRQ(void)
 {
 	int i, rc;
 	int cpu;
 
-	/* Setup 2-level event channel */
-	eops = &evtchn_ops_l2;
-	evtchn_level = 2;
+	switch (evtchn_level) {
+	case 2:
+		eops = &evtchn_ops_l2; break;
+	case 3:
+		eops = &evtchn_ops_l3; break;
+	default:
+		BUG();
+	}
 
 	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS(evtchn_level),
 				sizeof(*evtchn_to_irq),
diff --git a/include/xen/events.h b/include/xen/events.h
index bc10f22..87696fc 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -111,5 +111,7 @@ int xen_test_irq_shared(int irq);
 
 /* N-level event channels */
 extern unsigned int evtchn_level;
+extern unsigned int evtchn_level_param;
+int xen_event_channel_setup_3level(void);
 
 #endif	/* _XEN_EVENTS_H */
diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h
index f494292..f764d21 100644
--- a/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@ -190,6 +190,30 @@ struct evtchn_reset {
 };
 typedef struct evtchn_reset evtchn_reset_t;
 
+/*
+ * EVTCHNOP_register_nlevel: Register N level event channels.
+ * NOTES:
+ *   1. currently only 3-level is supported.
+ *   2. should fall back to basic 2-level if this call fails.
+ */
+#define EVTCHNOP_register_nlevel 11
+#define MAX_L3_PAGES 8		/* 8 pages for 64 bits */
+struct evtchn_register_3level {
+	unsigned long evtchn_pending[MAX_L3_PAGES];
+	unsigned long evtchn_mask[MAX_L3_PAGES];
+	unsigned long *l2sel_mfn;
+	unsigned long *l2sel_offset;
+	unsigned int nr_vcpus;
+};
+
+struct evtchn_register_nlevel {
+	uint32_t level;
+	union {
+		struct evtchn_register_3level l3;
+	} u;
+};
+typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
+
 struct evtchn_op {
 	uint32_t cmd; /* EVTCHNOP_* */
 	union {
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index c66e1ff..7cb9d8f 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -289,7 +289,7 @@ DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
  *  32k if a long is 32 bits; 256k if a long is 64 bits.
  */
 #define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
-#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(unsigned long))
+#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(unsigned long) * 8)
 #define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;	\
 	switch (x) {					\
 	case 2:						\
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [RFC PATCH 3/3] Xen: implement 3-level event channel routines.
  2012-12-31 18:38 Implement 3-level event channel routines in Linux Wei Liu
@ 2012-12-31 18:38 ` Wei Liu
  2013-01-02 14:57   ` David Vrabel
  0 siblings, 1 reply; 4+ messages in thread
From: Wei Liu @ 2012-12-31 18:38 UTC (permalink / raw)
  To: xen-devel; +Cc: Wei Liu, konrad.wilk


Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 arch/x86/xen/enlighten.c              |    7 +
 drivers/xen/events.c                  |  419 +++++++++++++++++++++++++++++++--
 include/xen/events.h                  |    2 +
 include/xen/interface/event_channel.h |   24 ++
 include/xen/interface/xen.h           |    2 +-
 5 files changed, 437 insertions(+), 17 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bc893e7..f471881 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -43,6 +43,7 @@
 #include <xen/hvm.h>
 #include <xen/hvc-console.h>
 #include <xen/acpi.h>
+#include <xen/events.h>
 
 #include <asm/paravirt.h>
 #include <asm/apic.h>
@@ -195,6 +196,9 @@ void xen_vcpu_restore(void)
 		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
 			BUG();
 	}
+
+	if (evtchn_level_param == 3)
+		xen_event_channel_setup_3level();
 }
 
 static void __init xen_banner(void)
@@ -1028,6 +1032,9 @@ void xen_setup_vcpu_info_placement(void)
 	for_each_possible_cpu(cpu)
 		xen_vcpu_setup(cpu);
 
+	if (evtchn_level_param == 3)
+		xen_event_channel_setup_3level();
+
 	/* xen_vcpu_setup managed to place the vcpu_info within the
 	   percpu area for all cpus, so make use of it */
 	if (have_vcpu_info_placement) {
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index f60ba76..adb94e9 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -52,9 +52,15 @@
 #include <xen/interface/hvm/params.h>
 
 /* N-level event channel, starting from 2 */
+unsigned int evtchn_level_param = -1;
 unsigned int evtchn_level = 2;
 EXPORT_SYMBOL_GPL(evtchn_level);
 
+/* 3-level event channel */
+DEFINE_PER_CPU(unsigned long [sizeof(unsigned long)*8], evtchn_sel_l2);
+unsigned long evtchn_pending[NR_EVENT_CHANNELS_L3/BITS_PER_LONG] __page_aligned_bss;
+unsigned long evtchn_mask[NR_EVENT_CHANNELS_L3/BITS_PER_LONG] __page_aligned_bss;
+
 struct evtchn_ops {
 	unsigned long (*active_evtchns)(unsigned int,
 					struct shared_info*, unsigned int);
@@ -142,6 +148,29 @@ static struct irq_chip xen_pirq_chip;
 static void enable_dynirq(struct irq_data *data);
 static void disable_dynirq(struct irq_data *data);
 
+static int __init parse_evtchn_level(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strcmp(arg, "3") == 0)
+		evtchn_level_param = 3;
+
+	return 0;
+}
+early_param("evtchn_level", parse_evtchn_level);
+
+static inline int __is_masked_l2(int chn)
+{
+	struct shared_info *sh = HYPERVISOR_shared_info;
+	return sync_test_and_set_bit(chn, sh->evtchn_mask);
+}
+
+static inline int __is_masked_l3(int chn)
+{
+	return sync_test_and_set_bit(chn, evtchn_mask);
+}
+
 /* Get info for IRQ */
 static struct irq_info *info_for_irq(unsigned irq)
 {
@@ -311,6 +340,15 @@ static inline unsigned long __active_evtchns_l2(unsigned int cpu,
 		~sh->evtchn_mask[idx];
 }
 
+static inline unsigned long __active_evtchns_l3(unsigned int cpu,
+						struct shared_info *sh,
+						unsigned int idx)
+{
+	return evtchn_pending[idx] &
+		per_cpu(cpu_evtchn_mask, cpu)[idx] &
+		~evtchn_mask[idx];
+}
+
 static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 {
 	int irq = evtchn_to_irq[chn];
@@ -351,18 +389,33 @@ static inline void __clear_evtchn_l2(int port)
 	sync_clear_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline void __clear_evtchn_l3(int port)
+{
+	sync_clear_bit(port, &evtchn_pending[0]);
+}
+
 static inline void __set_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	sync_set_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline void __set_evtchn_l3(int port)
+{
+	sync_set_bit(port, &evtchn_pending[0]);
+}
+
 static inline int __test_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	return sync_test_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline int __test_evtchn_l3(int port)
+{
+	return sync_test_bit(port, &evtchn_pending[0]);
+}
+
 /**
  * notify_remote_via_irq - send event to remote end of event channel via irq
  * @irq: irq of event channel to send event to
@@ -386,6 +439,11 @@ static void __mask_evtchn_l2(int port)
 	sync_set_bit(port, &s->evtchn_mask[0]);
 }
 
+static void __mask_evtchn_l3(int port)
+{
+	sync_set_bit(port, &evtchn_mask[0]);
+}
+
 static void __unmask_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
@@ -416,6 +474,36 @@ static void __unmask_evtchn_l2(int port)
 	put_cpu();
 }
 
+static void __unmask_evtchn_l3(int port)
+{
+	unsigned int cpu = get_cpu();
+	int l1cb = BITS_PER_LONG * BITS_PER_LONG;
+	int l2cb = BITS_PER_LONG;
+
+	if (unlikely(cpu != cpu_from_evtchn(port))) {
+		struct evtchn_unmask unmask = { .port = port };
+		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+	} else {
+		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+
+		sync_clear_bit(port, &evtchn_mask[0]);
+
+		/*
+		 * The following is basically the equivalent of
+		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+		 * the interrupt edge' if the channel is masked.
+		 */
+		if (sync_test_bit(port, &evtchn_pending[0]) &&
+		    !sync_test_and_set_bit(port / l2cb,
+					   &per_cpu(evtchn_sel_l2, cpu)[0]) &&
+		    !sync_test_and_set_bit(port / l1cb,
+					   &vcpu_info->evtchn_pending_sel))
+			vcpu_info->evtchn_upcall_pending = 1;
+	}
+
+	put_cpu();
+}
+
 static void xen_irq_init(unsigned irq)
 {
 	struct irq_info *info;
@@ -1181,6 +1269,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
 	notify_remote_via_irq(irq);
 }
 
+static DEFINE_SPINLOCK(debug_lock);
 static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 {
 	struct shared_info *sh = HYPERVISOR_shared_info;
@@ -1188,7 +1277,6 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
 	int i;
 	unsigned long flags;
-	static DEFINE_SPINLOCK(debug_lock);
 	struct vcpu_info *v;
 
 	spin_lock_irqsave(&debug_lock, flags);
@@ -1196,13 +1284,13 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	printk("\nvcpu %d\n  ", cpu);
 
 	for_each_online_cpu(i) {
-		int pending;
+		int masked;
 		v = per_cpu(xen_vcpu, i);
-		pending = (get_irq_regs() && i == cpu)
+		masked = (get_irq_regs() && i == cpu)
 			? xen_irqs_disabled(get_irq_regs())
 			: v->evtchn_upcall_mask;
 		printk("%d: masked=%d pending=%d event_sel %0*lx\n  ", i,
-		       pending, v->evtchn_upcall_pending,
+		       masked, v->evtchn_upcall_pending,
 		       (int)(sizeof(v->evtchn_pending_sel)*2),
 		       v->evtchn_pending_sel);
 	}
@@ -1227,7 +1315,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 		       i % 8 == 0 ? "\n   " : " ");
 
 	printk("\nlocal cpu%d mask:\n   ", cpu);
-	for (i = (NR_EVENT_CHANNELS(evtchn_level)/BITS_PER_LONG)-1; i >= 0; i--)
+	for (i = (NR_EVENT_CHANNELS(2)/BITS_PER_LONG)-1; i >= 0; i--)
 		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
 		       cpu_evtchn[i],
 		       i % 8 == 0 ? "\n   " : " ");
@@ -1242,7 +1330,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	}
 
 	printk("\npending list:\n");
-	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(2); i++) {
 		if (sync_test_bit(i, sh->evtchn_pending)) {
 			int word_idx = i / BITS_PER_LONG;
 			printk("  %d: event %d -> irq %d%s%s%s\n",
@@ -1262,15 +1350,110 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t __xen_debug_interrupt_l3(int irq, void *dev_id)
+{
+	int cpu = smp_processor_id();
+	unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
+	int i, j;
+	unsigned long flags;
+	struct vcpu_info *v;
+
+	spin_lock_irqsave(&debug_lock, flags);
+
+	printk("\nvcpu %d\n  ", cpu);
+
+	for_each_online_cpu(i) {
+		int masked;
+
+		v = per_cpu(xen_vcpu, i);
+		masked = (get_irq_regs() && i == cpu)
+			? xen_irqs_disabled(get_irq_regs())
+			: v->evtchn_upcall_mask;
+		printk("%d: masked=%d pending=%d event_sel_l1 %0*lx\n  ", i,
+		       masked, v->evtchn_upcall_pending,
+		       (int)(sizeof(v->evtchn_pending_sel)*2),
+		       v->evtchn_pending_sel);
+
+		printk("\nevtchn_sel_l2:\n   ");
+		for (j = (sizeof(unsigned long)*8)-1; j >= 0; j--)
+			printk("%0*lx%s",
+			       (int)(sizeof(evtchn_sel_l2[0])*2),
+			       per_cpu(evtchn_sel_l2, i)[j],
+			       j % 8 == 0 ? "\n   " : " ");
+	}
+
+	v = per_cpu(xen_vcpu, cpu);
+
+	printk("\npending:\n   ");
+	for (i = ARRAY_SIZE(evtchn_pending)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_pending[0])*2),
+		       evtchn_pending[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nglobal mask:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nglobally unmasked:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       evtchn_pending[i] & ~evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocal cpu%d mask:\n   ", cpu);
+	for (i = (NR_EVENT_CHANNELS(3)/BITS_PER_LONG)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
+		       cpu_evtchn[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocally unmasked:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--) {
+		unsigned long pending = evtchn_pending[i]
+			& ~evtchn_mask[i]
+			& cpu_evtchn[i];
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       pending, i % 8 == 0 ? "\n   " : " ");
+	}
+
+	printk("\npending list:\n");
+	for (i = 0; i < NR_EVENT_CHANNELS(3); i++) {
+		if (sync_test_bit(i, evtchn_pending)) {
+			int word_idx_l1 = i / (BITS_PER_LONG * BITS_PER_LONG);
+			int word_idx_l2 = i / BITS_PER_LONG;
+			printk("  %d: event %d -> irq %d%s%s%s%s\n",
+			       cpu_from_evtchn(i), i,
+			       evtchn_to_irq[i],
+			       sync_test_bit(word_idx_l1, &v->evtchn_pending_sel)
+					     ? "" : " l1-clear",
+			       sync_test_bit(word_idx_l2, per_cpu(evtchn_sel_l2, cpu))
+					     ? "" : " l2-clear",
+			       !sync_test_bit(i, evtchn_mask)
+					     ? "" : " globally-masked",
+			       sync_test_bit(i, cpu_evtchn)
+					     ? "" : " locally-masked");
+		}
+	}
+
+	spin_unlock_irqrestore(&debug_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 {
 	return eops->xen_debug_interrupt(irq, dev_id);
 }
 
 static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
+/* 2-level event channel does not use current_word_idx_l2 */
 static DEFINE_PER_CPU(unsigned int, current_word_idx);
+static DEFINE_PER_CPU(unsigned int, current_word_idx_l2);
 static DEFINE_PER_CPU(unsigned int, current_bit_idx);
 
+
 /*
  * Mask out the i least significant bits of w
  */
@@ -1303,7 +1486,8 @@ static void __xen_evtchn_do_upcall_l2(void)
 		if (__this_cpu_inc_return(xed_nesting_count) - 1)
 			goto out;
 
-#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+#ifndef CONFIG_X86
+		/* No need for a barrier -- XCHG is a barrier on x86. */
 		/* Clear master flag /before/ clearing selector flag. */
 		wmb();
 #endif
@@ -1392,6 +1576,155 @@ out:
 	put_cpu();
 }
 
+void __xen_evtchn_do_upcall_l3(void)
+{
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+	unsigned count;
+	int start_word_idx_l1, start_word_idx_l2, start_bit_idx;
+	int word_idx_l1, word_idx_l2, bit_idx;
+	int i, j;
+	unsigned long l1cb, l2cb;
+	int cpu = get_cpu();
+
+	l1cb = BITS_PER_LONG * BITS_PER_LONG;
+	l2cb = BITS_PER_LONG;
+
+	do {
+		unsigned long pending_words_l1;
+
+		vcpu_info->evtchn_upcall_pending = 0;
+
+		if (__this_cpu_inc_return(xed_nesting_count) - 1)
+			goto out;
+#ifndef CONFIG_X86
+		/* No need for a barrier -- XCHG is a barrier on x86. */
+		/* Clear master flag /before/ clearing selector flag. */
+		wmb();
+#endif
+		/* here we get l1 pending selector */
+		pending_words_l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
+
+		start_word_idx_l1 = __this_cpu_read(current_word_idx);
+		start_word_idx_l2 = __this_cpu_read(current_word_idx_l2);
+		start_bit_idx = __this_cpu_read(current_bit_idx);
+
+		word_idx_l1 = start_word_idx_l1;
+
+		/* loop through l1, try to pick up l2 */
+		for (i = 0; pending_words_l1 != 0; i++) {
+			unsigned long words_l1;
+			unsigned long pending_words_l2;
+			unsigned long pwl2idx;
+
+			words_l1 = MASK_LSBS(pending_words_l1, word_idx_l1);
+
+			if (words_l1 == 0) {
+				word_idx_l1 = 0;
+				start_word_idx_l2 = 0;
+				continue;
+			}
+
+			word_idx_l1 = __ffs(words_l1);
+
+			pwl2idx = word_idx_l1 * BITS_PER_LONG;
+
+			pending_words_l2 =
+				xchg(&per_cpu(evtchn_sel_l2, cpu)[pwl2idx],
+				     0);
+
+			word_idx_l2 = 0;
+			if (word_idx_l1 == start_word_idx_l1) {
+				if (i == 0)
+					word_idx_l2 = start_word_idx_l2;
+				else
+					word_idx_l2 &= (1UL << start_word_idx_l2) - 1;
+			}
+
+			for (j = 0; pending_words_l2 != 0; j++) {
+				unsigned long pending_bits;
+				unsigned long words_l2;
+				unsigned long idx;
+
+				words_l2 = MASK_LSBS(pending_words_l2,
+						     word_idx_l2);
+
+				if (words_l2 == 0) {
+					word_idx_l2 = 0;
+					bit_idx = 0;
+					continue;
+				}
+
+				word_idx_l2 = __ffs(words_l2);
+
+				idx = word_idx_l1*BITS_PER_LONG+word_idx_l2;
+				pending_bits =
+					eops->active_evtchns(cpu, NULL, idx);
+
+				bit_idx = 0;
+				if (word_idx_l2 == start_word_idx_l2) {
+					if (j == 0)
+						bit_idx = start_bit_idx;
+					else
+						bit_idx &= (1UL<<start_bit_idx)-1;
+				}
+
+				/* process port */
+				do {
+					unsigned long bits;
+					int port, irq;
+					struct irq_desc *desc;
+
+					bits = MASK_LSBS(pending_bits, bit_idx);
+
+					if (bits == 0)
+						break;
+
+					bit_idx = __ffs(bits);
+
+					port = word_idx_l1 * l1cb +
+						word_idx_l2 * l2cb +
+						bit_idx;
+
+					irq = evtchn_to_irq[port];
+
+					if (irq != -1) {
+						desc = irq_to_desc(irq);
+						if (desc)
+							generic_handle_irq_desc(irq, desc);
+					}
+
+					bit_idx = (bit_idx + 1) % BITS_PER_LONG;
+
+					__this_cpu_write(current_bit_idx, bit_idx);
+					__this_cpu_write(current_word_idx_l2,
+							 bit_idx ? word_idx_l2 :
+							 (word_idx_l2+1) % BITS_PER_LONG);
+					__this_cpu_write(current_word_idx_l2,
+							 word_idx_l2 ? word_idx_l1 :
+							 (word_idx_l1+1) % BITS_PER_LONG);
+				} while (bit_idx != 0);
+
+				if ((word_idx_l2 != start_word_idx_l2) || (j != 0))
+					pending_words_l2 &= ~(1UL << word_idx_l2);
+
+				word_idx_l2 = (word_idx_l2) % BITS_PER_LONG;
+			}
+
+			if ((word_idx_l1 != start_word_idx_l1) || (i != 0))
+				pending_words_l1 &= ~(1UL << word_idx_l1);
+
+			word_idx_l1 = (word_idx_l1) % BITS_PER_LONG;
+		}
+
+		BUG_ON(!irqs_disabled());
+		count = __this_cpu_read(xed_nesting_count);
+		__this_cpu_write(xed_nesting_count, 0);
+	} while (count != 1 || vcpu_info->evtchn_upcall_pending);
+
+out:
+	put_cpu();
+}
+
 void xen_evtchn_do_upcall(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
@@ -1525,12 +1858,6 @@ static void mask_ack_dynirq(struct irq_data *data)
 	ack_dynirq(data);
 }
 
-static inline int __is_masked_l2(int chn)
-{
-	struct shared_info *sh = HYPERVISOR_shared_info;
-	return sync_test_and_set_bit(chn, sh->evtchn_mask);
-}
-
 static int retrigger_dynirq(struct irq_data *data)
 {
 	int evtchn = evtchn_from_irq(data->irq);
@@ -1821,14 +2148,74 @@ static struct evtchn_ops evtchn_ops_l2 __read_mostly = {
 	.xen_debug_interrupt = __xen_debug_interrupt_l2,
 };
 
+static struct evtchn_ops evtchn_ops_l3 __read_mostly = {
+	.active_evtchns = __active_evtchns_l3,
+	.clear_evtchn = __clear_evtchn_l3,
+	.set_evtchn = __set_evtchn_l3,
+	.test_evtchn = __test_evtchn_l3,
+	.mask_evtchn = __mask_evtchn_l3,
+	.unmask_evtchn = __unmask_evtchn_l3,
+	.is_masked = __is_masked_l3,
+	.xen_evtchn_do_upcall = __xen_evtchn_do_upcall_l3,
+	.xen_debug_interrupt = __xen_debug_interrupt_l3,
+};
+
+int xen_event_channel_setup_3level(void)
+{
+	evtchn_register_nlevel_t reg;
+	int i, nr_pages, cpu;
+	unsigned long mfns[nr_cpu_ids];
+	unsigned long offsets[nr_cpu_ids];
+	int rc = -EINVAL;
+
+	memset(&reg, 0, sizeof(reg));
+
+	reg.level = 3;
+	nr_pages = (sizeof(unsigned long) == 4 ? 1 : 8);
+
+	for (i = 0; i < nr_pages; i++) {
+		unsigned long offset = PAGE_SIZE * i;
+		reg.u.l3.evtchn_pending[i] =
+			arbitrary_virt_to_mfn(
+				(void *)((unsigned long)evtchn_pending+offset));
+		reg.u.l3.evtchn_mask[i] =
+			arbitrary_virt_to_mfn(
+				(void *)((unsigned long)evtchn_mask+offset));
+	}
+
+	reg.u.l3.l2sel_mfn = mfns;
+	reg.u.l3.l2sel_offset = offsets;
+	reg.u.l3.nr_vcpus = nr_cpu_ids;
+
+	for_each_possible_cpu(cpu) {
+		reg.u.l3.l2sel_mfn[cpu] =
+			arbitrary_virt_to_mfn(&per_cpu(evtchn_sel_l2, cpu));
+		reg.u.l3.l2sel_offset[cpu] =
+			offset_in_page(&per_cpu(evtchn_sel_l2, cpu));
+	}
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_register_nlevel, &reg);
+
+	if (rc == 0)
+		evtchn_level = 3;
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xen_event_channel_setup_3level);
+
 void __init xen_init_IRQ(void)
 {
 	int i, rc;
 	int cpu;
 
-	/* Setup 2-level event channel */
-	eops = &evtchn_ops_l2;
-	evtchn_level = 2;
+	switch (evtchn_level) {
+	case 2:
+		eops = &evtchn_ops_l2; break;
+	case 3:
+		eops = &evtchn_ops_l3; break;
+	default:
+		BUG();
+	}
 
 	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS(evtchn_level),
 				sizeof(*evtchn_to_irq),
diff --git a/include/xen/events.h b/include/xen/events.h
index bc10f22..87696fc 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -111,5 +111,7 @@ int xen_test_irq_shared(int irq);
 
 /* N-level event channels */
 extern unsigned int evtchn_level;
+extern unsigned int evtchn_level_param;
+int xen_event_channel_setup_3level(void);
 
 #endif	/* _XEN_EVENTS_H */
diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h
index f494292..f764d21 100644
--- a/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@ -190,6 +190,30 @@ struct evtchn_reset {
 };
 typedef struct evtchn_reset evtchn_reset_t;
 
+/*
+ * EVTCHNOP_register_nlevel: Register N level event channels.
+ * NOTES:
+ *   1. currently only 3-level is supported.
+ *   2. should fall back to basic 2-level if this call fails.
+ */
+#define EVTCHNOP_register_nlevel 11
+#define MAX_L3_PAGES 8		/* 8 pages for 64 bits */
+struct evtchn_register_3level {
+	unsigned long evtchn_pending[MAX_L3_PAGES];
+	unsigned long evtchn_mask[MAX_L3_PAGES];
+	unsigned long *l2sel_mfn;
+	unsigned long *l2sel_offset;
+	unsigned int nr_vcpus;
+};
+
+struct evtchn_register_nlevel {
+	uint32_t level;
+	union {
+		struct evtchn_register_3level l3;
+	} u;
+};
+typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
+
 struct evtchn_op {
 	uint32_t cmd; /* EVTCHNOP_* */
 	union {
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index c66e1ff..7cb9d8f 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -289,7 +289,7 @@ DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
  *  32k if a long is 32 bits; 256k if a long is 64 bits.
  */
 #define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
-#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(unsigned long))
+#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
 #define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;	\
 	switch (x) {					\
 	case 2:						\
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH 3/3] Xen: implement 3-level event channel routines.
  2012-12-31 18:38 ` [RFC PATCH 3/3] Xen: implement 3-level event channel routines Wei Liu
@ 2013-01-02 14:57   ` David Vrabel
  0 siblings, 0 replies; 4+ messages in thread
From: David Vrabel @ 2013-01-02 14:57 UTC (permalink / raw)
  To: Wei Liu; +Cc: konrad.wilk, xen-devel

On 31/12/12 18:38, Wei Liu wrote:
> 

Changeset description?

> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  arch/x86/xen/enlighten.c              |    7 +
>  drivers/xen/events.c                  |  419 +++++++++++++++++++++++++++++++--
>  include/xen/events.h                  |    2 +
>  include/xen/interface/event_channel.h |   24 ++
>  include/xen/interface/xen.h           |    2 +-
>  5 files changed, 437 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index bc893e7..f471881 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -43,6 +43,7 @@
>  #include <xen/hvm.h>
>  #include <xen/hvc-console.h>
>  #include <xen/acpi.h>
> +#include <xen/events.h>
>  
>  #include <asm/paravirt.h>
>  #include <asm/apic.h>
> @@ -195,6 +196,9 @@ void xen_vcpu_restore(void)
>  		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
>  			BUG();
>  	}
> +
> +	if (evtchn_level_param == 3)
> +		xen_event_channel_setup_3level();

Why is this here?

>  }
>  
>  static void __init xen_banner(void)
> @@ -1028,6 +1032,9 @@ void xen_setup_vcpu_info_placement(void)
>  	for_each_possible_cpu(cpu)
>  		xen_vcpu_setup(cpu);
>  
> +	if (evtchn_level_param == 3)
> +		xen_event_channel_setup_3level();
> +

Why is this here instead of xen_init_IRQ()?

>  	/* xen_vcpu_setup managed to place the vcpu_info within the
>  	   percpu area for all cpus, so make use of it */
>  	if (have_vcpu_info_placement) {
> diff --git a/drivers/xen/events.c b/drivers/xen/events.c
> index f60ba76..adb94e9 100644
> --- a/drivers/xen/events.c
> +++ b/drivers/xen/events.c
[...]
> +
> +/* 2-level event channel does not use current_word_idx_l2 */
>  static DEFINE_PER_CPU(unsigned int, current_word_idx);
> +static DEFINE_PER_CPU(unsigned int, current_word_idx_l2);
>  static DEFINE_PER_CPU(unsigned int, current_bit_idx);

I suggest renaming these to current_word_idx_l3 and current_word_idx_l2.

The use of these variables really needs documentation, particularly why
they're used. I presume (but am not really sure) that they're to ensure
the average event latency is constant independent of which channel it is.

> +
>  /*
>   * Mask out the i least significant bits of w
>   */
> @@ -1303,7 +1486,8 @@ static void __xen_evtchn_do_upcall_l2(void)
>  		if (__this_cpu_inc_return(xed_nesting_count) - 1)
>  			goto out;
>  
> -#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
> +#ifndef CONFIG_X86
> +		/* No need for a barrier -- XCHG is a barrier on x86. */
>  		/* Clear master flag /before/ clearing selector flag. */
>  		wmb();
>  #endif
> @@ -1392,6 +1576,155 @@ out:
>  	put_cpu();
>  }
>  
> +void __xen_evtchn_do_upcall_l3(void)

This is one of my least favourite functions...  A comment describing the
algorithm used here would be nice.

> @@ -1821,14 +2148,74 @@ static struct evtchn_ops evtchn_ops_l2 __read_mostly = {
>  	.xen_debug_interrupt = __xen_debug_interrupt_l2,
>  };
>  
> +static struct evtchn_ops evtchn_ops_l3 __read_mostly = {

const

> +	.active_evtchns = __active_evtchns_l3,
> +	.clear_evtchn = __clear_evtchn_l3,
> +	.set_evtchn = __set_evtchn_l3,
> +	.test_evtchn = __test_evtchn_l3,
> +	.mask_evtchn = __mask_evtchn_l3,
> +	.unmask_evtchn = __unmask_evtchn_l3,
> +	.is_masked = __is_masked_l3,
> +	.xen_evtchn_do_upcall = __xen_evtchn_do_upcall_l3,
> +	.xen_debug_interrupt = __xen_debug_interrupt_l3,
> +};
> +
> +int xen_event_channel_setup_3level(void)
> +{
> +	evtchn_register_nlevel_t reg;
> +	int i, nr_pages, cpu;
> +	unsigned long mfns[nr_cpu_ids];
> +	unsigned long offsets[nr_cpu_ids];

These arrays are too large for the stack if the domain has many VCPUs.
With 256 VCPUs this uses a page of stack.

> diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h
> index f494292..f764d21 100644
> --- a/include/xen/interface/event_channel.h
> +++ b/include/xen/interface/event_channel.h
[...]

> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index c66e1ff..7cb9d8f 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
[...]

Put these in the patch sync'ing the headers.

David

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2013-01-02 14:57 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-12-31 18:37 [RFC PATCH 2/3] Xen: rework NR_EVENT_CHANNELS related stuffs Wei Liu
2012-12-31 18:37 ` [RFC PATCH 3/3] Xen: implement 3-level event channel routines Wei Liu
  -- strict thread matches above, loose matches on Subject: below --
2012-12-31 18:38 Implement 3-level event channel routines in Linux Wei Liu
2012-12-31 18:38 ` [RFC PATCH 3/3] Xen: implement 3-level event channel routines Wei Liu
2013-01-02 14:57   ` David Vrabel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).