From: Boris Ostrovsky
Subject: Re: [PATCH 16/16] xen/events: use the FIFO-based ABI if available
Date: Mon, 14 Oct 2013 15:30:43 -0400
Message-ID: <525C4663.70907@oracle.com>
References: <1381236555-27493-1-git-send-email-david.vrabel@citrix.com>
 <1381236555-27493-17-git-send-email-david.vrabel@citrix.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"; Format="flowed"
Content-Transfer-Encoding: 7bit
In-Reply-To: <1381236555-27493-17-git-send-email-david.vrabel@citrix.com>
Sender: xen-devel-bounces@lists.xen.org
Errors-To: xen-devel-bounces@lists.xen.org
To: David Vrabel
Cc: Jan Beulich, xen-devel@lists.xen.org
List-Id: xen-devel@lists.xenproject.org

On 10/08/2013 08:49 AM, David Vrabel wrote:
> From: David Vrabel
>
> Implement all the event channel port ops for the FIFO-based ABI.
>
> If the hypervisor supports the FIFO-based ABI, enable it by
> initializing the control block for the boot VCPU and subsequent VCPUs
> as they are brought up and on resume. The event array is expanded as
> required when event ports are setup.
>
> Signed-off-by: David Vrabel
> ---
>  drivers/xen/events/Makefile          |    1 +
>  drivers/xen/events/events.c          |    9 +-
>  drivers/xen/events/events_fifo.c     |  409 ++++++++++++++++++++++++++++++++++
>  drivers/xen/events/events_internal.h |    8 +
>  4 files changed, 426 insertions(+), 1 deletions(-)
>  create mode 100644 drivers/xen/events/events_fifo.c
>
> diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile
> index 3bed6e0..d0f1581 100644
> --- a/drivers/xen/events/Makefile
> +++ b/drivers/xen/events/Makefile
> @@ -1,2 +1,3 @@
>  obj-y += events.o
>  obj-y += events_2l.o
> +obj-y += events_fifo.o
> diff --git a/drivers/xen/events/events.c b/drivers/xen/events/events.c
> index fd65485..dec4d6d 100644
> --- a/drivers/xen/events/events.c
> +++ b/drivers/xen/events/events.c
> @@ -1538,6 +1538,7 @@ void xen_irq_resume(void)
>
>          /* New event-channel space is not 'live' yet. */
>          xen_evtchn_mask_all();
> +        xen_evtchn_resume();
>
>          /* No IRQ <-> event-channel mappings. */
>          list_for_each_entry(info, &xen_irq_list_head, list)
> @@ -1636,7 +1637,13 @@ void xen_callback_vector(void) {}
>
>  void __init xen_init_IRQ(void)
>  {
> -        xen_evtchn_2l_init();
> +        int ret;
> +
> +        ret = xen_evtchn_fifo_init();
> +        if (ret < 0) {
> +                printk(KERN_INFO "xen: falling back to n-level event channels");
> +                xen_evtchn_2l_init();
> +        }

Should we provide users with the ability to choose which mechanism to
use? Is there any advantage in staying with 2-level? Stability, I
guess, would be one.
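
If a manual override is wanted, a boot-time switch would be one way to
do it. Below is only a sketch to illustrate the idea -- the parameter
name and placement are made up, not something this patch defines:

    #include <linux/moduleparam.h>

    /* Illustrative only: the name is made up.  Booting with
     * "xen_fifo_events=0" would force the 2-level ABI. */
    static bool fifo_events = true;
    core_param(xen_fifo_events, fifo_events, bool, 0444);

    void __init xen_init_IRQ(void)
    {
            int ret = -EINVAL;

            if (fifo_events)
                    ret = xen_evtchn_fifo_init();
            if (ret < 0) {
                    printk(KERN_INFO
                           "xen: falling back to n-level event channels\n");
                    xen_evtchn_2l_init();
            }
            /* ... rest of xen_init_IRQ() unchanged ... */
    }

(While at it: the printk() in the hunk above appears to be missing its
trailing "\n".)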
>
>          evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()),
>                                  sizeof(*evtchn_to_irq), GFP_KERNEL);
> diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
> new file mode 100644
> index 0000000..c109639
> --- /dev/null
> +++ b/drivers/xen/events/events_fifo.c
> @@ -0,0 +1,409 @@
> +/*
> + * Xen event channels (FIFO-based ABI)
> + *
> + * Copyright (C) 2013 Citrix Systems R&D ltd.
> + *
> + * This source code is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation; either version 2 of the
> + * License, or (at your option) any later version.
> + *
> + * Or, when distributed separately from the Linux kernel or
> + * incorporated into other software packages, subject to the following
> + * license:
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this source file (the "Software"), to deal in the Software without
> + * restriction, including without limitation the rights to use, copy, modify,
> + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
> + * and to permit persons to whom the Software is furnished to do so, subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include
> +#include
> +#include
> +#include
> +#include
> +#include
> +#include
> +
> +#include
> +#include
> +#include
> +#include
> +
> +#include
> +#include
> +#include
> +#include
> +#include
> +
> +#include "events_internal.h"
> +
> +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t))
> +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE)
> +
> +struct evtchn_fifo_queue {
> +        uint32_t head[EVTCHN_FIFO_MAX_QUEUES];
> +};
> +
> +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block);
> +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue);
> +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly;
> +static unsigned event_array_pages __read_mostly;
> +
> +#define BM(w) ((unsigned long *)(w))

This could go into a header file (events_internal.h?) since 2-level
uses it as well.

> +
> +static inline event_word_t *event_word_from_port(unsigned port)
> +{
> +        unsigned i = port / EVENT_WORDS_PER_PAGE;
> +
> +        if (i >= event_array_pages)
> +                return NULL;
> +        return event_array[i] + port % EVENT_WORDS_PER_PAGE;
> +}
> +
> +static unsigned evtchn_fifo_max_channels(void)
> +{
> +        return EVTCHN_FIFO_NR_CHANNELS;
> +}
> +
> +static unsigned evtchn_fifo_nr_channels(void)
> +{
> +        return event_array_pages * EVENT_WORDS_PER_PAGE;
> +}
> +
> +static void free_unused_array_pages(void)
> +{
> +        unsigned i;
> +
> +        for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) {
> +                if (!event_array[i])
> +                        break;
> +                free_page((unsigned long)event_array[i]);
> +                event_array[i] = NULL;
> +        }
> +}
> +
> +static void init_array_page(event_word_t *array_page)
> +{
> +        unsigned i;
> +
> +        for (i = 0; i < EVENT_WORDS_PER_PAGE; i++)
> +                array_page[i] = 1 << EVTCHN_FIFO_MASKED;
> +}
> +
> +static int evtchn_fifo_setup(struct irq_info *info)
> +{
> +        unsigned port = info->evtchn;
> +        unsigned i;
> +        int ret = -ENOMEM;
> +
> +        i = port / EVENT_WORDS_PER_PAGE;

Something more descriptive than 'i' would be better.
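
Any descriptive name would do, e.g. (the name is made up):

    unsigned page_idx = port / EVENT_WORDS_PER_PAGE;

And on the BM() remark further up, this is roughly what I had in mind
-- a sketch only, final placement is of course up to you:

    /* events_internal.h: both events_2l.c and events_fifo.c currently
     * carry a private copy of this cast for the sync_*_bit() helpers,
     * so it could live here instead. */
    #define BM(w) ((unsigned long *)(w))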
> +
> +        if (i >= MAX_EVENT_ARRAY_PAGES)
> +                return -EINVAL;
> +
> +        while (i >= event_array_pages) {
> +                void *array_page;
> +                struct evtchn_expand_array expand_array;
> +
> +                /* Might already have a page if we've resumed. */
> +                array_page = event_array[event_array_pages];
> +                if (!array_page) {
> +                        array_page = (void *)__get_free_page(GFP_KERNEL);
> +                        if (array_page == NULL)
> +                                goto error;
> +                        event_array[event_array_pages] = array_page;
> +                }
> +
> +                /* Mask all events in this page before adding it. */
> +                init_array_page(array_page);
> +
> +                expand_array.array_gfn = virt_to_mfn(array_page);
> +
> +                ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array);
> +                if (ret < 0)
> +                        goto error;
> +
> +                event_array_pages++;

Should this increment happen in the 'if (!array_page)' clause?

> +        }
> +        return 0;
> +
> + error:
> +        if (event_array_pages == 0)
> +                panic("xen: unable to expand event array with initial page (%d)\n", ret);
> +        else
> +                printk(KERN_ERR "xen: unable to expand event array (%d)\n", ret);
> +        free_unused_array_pages();

Do you need to clean up in the hypervisor as well?

> +        return ret;
> +}
> +
> +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu)
> +{
> +        /* no-op */
> +}
> +
> +static void evtchn_fifo_clear_pending(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        sync_clear_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static void evtchn_fifo_set_pending(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        sync_set_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static bool evtchn_fifo_is_pending(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        return sync_test_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static bool evtchn_fifo_test_and_set_mask(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        return sync_test_and_set_bit(EVTCHN_FIFO_MASKED, BM(word));
> +}
> +
> +static void evtchn_fifo_mask(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        if (word)
> +                sync_set_bit(EVTCHN_FIFO_MASKED, BM(word));

You are testing 'word' here but not in the routines above (or below).

> +}
> +
> +static void evtchn_fifo_unmask(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +
> +        BUG_ON(!irqs_disabled());
> +
> +        sync_clear_bit(EVTCHN_FIFO_MASKED, BM(word));
> +        if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))) {
> +                struct evtchn_unmask unmask = { .port = port };
> +                (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
> +        }
> +}

2-level unmasking is somewhat more elaborate: it tries to avoid races
on pending events. Is this not a concern here?

> +
> +static uint32_t clear_linked(volatile event_word_t *word)
> +{
> +        event_word_t new, old, w;
> +
> +        w = *word;
> +
> +        do {
> +                old = w;
> +                new = (w & ~((1 << EVTCHN_FIFO_LINKED)
> +                             | EVTCHN_FIFO_LINK_MASK));
> +        } while ((w = sync_cmpxchg(word, old, new)) != old);
> +
> +        return w & EVTCHN_FIFO_LINK_MASK;
> +}
> +
> +static void handle_irq_for_port(unsigned port)
> +{
> +        int irq;
> +        struct irq_desc *desc;
> +
> +        irq = get_evtchn_to_irq(port);
> +        if (irq != -1) {
> +                desc = irq_to_desc(irq);
> +                if (desc)
> +                        generic_handle_irq_desc(irq, desc);
> +        }
> +}
> +
> +static void consume_one_event(unsigned cpu,
> +                              struct evtchn_fifo_control_block *control_block,
> +                              unsigned priority, uint32_t *ready)
> +{
> +        struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
> +        uint32_t head;
> +        unsigned port;
> +        event_word_t *word;
> +
> +        head = q->head[priority];
> +
> +        /* Reached the tail last time? Read the new HEAD from the
> +           control block. */

Comment style.
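
i.e., the usual multi-line form:

    /*
     * Reached the tail last time? Read the new HEAD from the
     * control block.
     */

And going back to the 'word' checks a bit further up: if the NULL test
is meant to stay, the single-word helpers could all share the same
pattern, e.g. (sketch only):

    static void evtchn_fifo_clear_pending(unsigned port)
    {
            event_word_t *word = event_word_from_port(port);

            if (!word)
                    return;
            sync_clear_bit(EVTCHN_FIFO_PENDING, BM(word));
    }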
> +        if (head == 0) {
> +                rmb(); /* Ensure word is up-to-date before reading head. */
> +                head = control_block->head[priority];
> +        }
> +
> +        port = head;
> +        word = event_word_from_port(port);

Do you need to check for 'word != NULL'? You don't check it in
clear_linked() (which is maybe where this should be done).

> +        head = clear_linked(word);
> +
> +        /*
> +         * If the link is non-zero, there are more events in the
> +         * queue, otherwise the queue is empty.
> +         *
> +         * If the queue is empty, clear this priority from our local
> +         * copy of the ready word.
> +         */
> +        if (head == 0)
> +                clear_bit(priority, BM(ready));
> +
> +        if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))
> +            && !sync_test_bit(EVTCHN_FIFO_MASKED, BM(word)))
> +                handle_irq_for_port(port);
> +
> +        q->head[priority] = head;
> +}
> +
> +static void evtchn_fifo_handle_events(unsigned cpu)
> +{
> +        struct evtchn_fifo_control_block *control_block;
> +        uint32_t ready;
> +        unsigned q;
> +
> +        control_block = per_cpu(cpu_control_block, cpu);
> +
> +        ready = xchg(&control_block->ready, 0);
> +
> +        while (ready) {
> +                q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES);
> +                consume_one_event(cpu, control_block, q, &ready);
> +                ready |= xchg(&control_block->ready, 0);
> +        }
> +}
> +
> +static void evtchn_fifo_resume(void)
> +{
> +        unsigned cpu;
> +
> +        for_each_possible_cpu(cpu) {
> +                void *control_block = per_cpu(cpu_control_block, cpu);
> +                struct evtchn_init_control init_control;
> +                int ret;
> +
> +                if (!control_block)
> +                        continue;
> +
> +                /*
> +                 * If this CPU is offline, take the opportunity to
> +                 * free the control block while it is not being
> +                 * used.
> +                 */
> +                if (!cpu_online(cpu)) {
> +                        free_page((unsigned long)control_block);
> +                        per_cpu(cpu_control_block, cpu) = NULL;
> +                        continue;
> +                }

Have you tested offlining/onlining CPUs (lots of them)? I am asking
because I see EVTCHNOP_init_control both here and in
init_control_block() but I don't see anything that would "deinit" the
control block for which you are freeing the page above.
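
If the intent is that an offline CPU simply keeps its control block
registered until it comes back online (or until the next
save/restore), it may be worth spelling that out in the hotplug
notifier further down. A sketch of what I mean -- not part of the
patch, and assuming there really is no hypervisor-side teardown to
call:

    case CPU_DEAD:
            /*
             * Deliberately keep cpu_control_block: nothing detaches it
             * on the hypervisor side, so freeing the page now would
             * leave Xen with a stale reference.  It is reused if the
             * CPU comes back online, and evtchn_fifo_resume() frees it
             * on the next save/restore if the CPU is still offline.
             */
            break;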
> +
> +                init_control.control_gfn = virt_to_mfn(control_block);
> +                init_control.offset = 0;
> +                init_control.vcpu = cpu;
> +
> +                ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control,
> +                                                  &init_control);
> +                if (ret < 0)
> +                        BUG();
> +        }
> +
> +        /*
> +         * The event array starts out as empty again and is extended
> +         * as normal when events are bound. The existing pages will
> +         * be reused.
> +         */
> +        event_array_pages = 0;
> +}
> +
> +static const struct evtchn_ops evtchn_ops_fifo = {
> +        .max_channels      = evtchn_fifo_max_channels,
> +        .nr_channels       = evtchn_fifo_nr_channels,
> +        .setup             = evtchn_fifo_setup,
> +        .bind_to_cpu       = evtchn_fifo_bind_to_cpu,
> +        .clear_pending     = evtchn_fifo_clear_pending,
> +        .set_pending       = evtchn_fifo_set_pending,
> +        .is_pending        = evtchn_fifo_is_pending,
> +        .test_and_set_mask = evtchn_fifo_test_and_set_mask,
> +        .mask              = evtchn_fifo_mask,
> +        .unmask            = evtchn_fifo_unmask,
> +        .handle_events     = evtchn_fifo_handle_events,
> +        .resume            = evtchn_fifo_resume,
> +};
> +
> +static int __cpuinit evtchn_fifo_init_control_block(unsigned cpu)
> +{
> +        struct page *control_block = NULL;
> +        struct evtchn_init_control init_control;
> +        int ret = -ENOMEM;
> +
> +        control_block = alloc_page(GFP_KERNEL|__GFP_ZERO);
> +        if (control_block == NULL)
> +                goto error;
> +
> +        init_control.control_gfn = virt_to_mfn(page_address(control_block));
> +        init_control.offset = 0;
> +        init_control.vcpu = cpu;
> +
> +        ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control);
> +        if (ret < 0)
> +                goto error;
> +
> +        per_cpu(cpu_control_block, cpu) = page_address(control_block);
> +
> +        return 0;
> +
> + error:
> +        __free_page(control_block);
> +        return ret;
> +}
> +
> +static int __cpuinit evtchn_fifo_cpu_notification(struct notifier_block *self,
> +                                                  unsigned long action,
> +                                                  void *hcpu)
> +{
> +        int cpu = (long)hcpu;

I don't understand this. Yes, it's everywhere in the kernel but it
looks weird.

> +        int ret = 0;
> +
> +        switch (action) {
> +        case CPU_UP_PREPARE:
> +                if (!per_cpu(cpu_control_block, cpu))
> +                        ret = evtchn_fifo_init_control_block(cpu);
> +                break;
> +        default:
> +                break;
> +        }

What happens when you offline a CPU?

-boris

> +        return ret < 0 ? NOTIFY_BAD : NOTIFY_OK;
> +}
> +
> +static struct notifier_block evtchn_fifo_cpu_notifier __cpuinitdata = {
> +        .notifier_call = evtchn_fifo_cpu_notification,
> +};
> +
> +int __init xen_evtchn_fifo_init(void)
> +{
> +        int cpu = get_cpu();
> +        int ret;
> +
> +        ret = evtchn_fifo_init_control_block(cpu);
> +        if (ret < 0)
> +                goto out;
> +
> +        printk(KERN_INFO "xen: switching to FIFO-based event channels\n");
> +
> +        evtchn_ops = &evtchn_ops_fifo;
> +
> +        register_cpu_notifier(&evtchn_fifo_cpu_notifier);
> +out:
> +        put_cpu();
> +        return ret;
> +}
> diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
> index a3d9aec..0af518f 100644
> --- a/drivers/xen/events/events_internal.h
> +++ b/drivers/xen/events/events_internal.h
> @@ -69,6 +69,7 @@ struct evtchn_ops {
>          void (*unmask)(unsigned port);
>
>          void (*handle_events)(unsigned cpu);
> +        void (*resume)(void);
>  };
>
>  extern const struct evtchn_ops *evtchn_ops;
> @@ -142,6 +143,13 @@ static inline void xen_evtchn_handle_events(unsigned cpu)
>          return evtchn_ops->handle_events(cpu);
>  }
>
> +static inline void xen_evtchn_resume(void)
> +{
> +        if (evtchn_ops->resume)
> +                evtchn_ops->resume();
> +}
> +
>  void xen_evtchn_2l_init(void);
> +int xen_evtchn_fifo_init(void);
>
>  #endif /* #ifndef __EVENTS_INTERNAL_H__ */
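
P.S. On the "int cpu = (long)hcpu;" cast above: the hotplug core hands
the CPU number over through the notifier's void * argument (kernel/cpu.c
does roughly "(void *)(long)cpu" on its side), so the callback is just
reversing that. If the signed-long bounce reads too oddly, the unsigned
spelling works just as well:

    unsigned int cpu = (unsigned long)hcpu;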