From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
To: David Vrabel <david.vrabel@citrix.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>,
Jan Beulich <jbeulich@suse.com>,
xen-devel@lists.xen.org
Subject: Re: [PATCH 13/13] xen/events: use the FIFO-based ABI if available
Date: Tue, 24 Sep 2013 11:50:36 -0400 [thread overview]
Message-ID: <20130924155036.GM4712@phenom.dumpdata.com> (raw)
In-Reply-To: <1379091601-30358-14-git-send-email-david.vrabel@citrix.com>
On Fri, Sep 13, 2013 at 06:00:01PM +0100, David Vrabel wrote:
> From: David Vrabel <david.vrabel@citrix.com>
>
> If the hypervisor supports the FIFO-based ABI, enable it by
> initializing the control block for the boot VCPU and subsequent VCPUs
> as they are brought up. The event array is expanded as required when
> event ports are setup.
>
> This implementation has some known limitations (which will be
> corrected in a subsequent patch):
>
> - The timer VIRQ which previously was treated as the highest priority
> event has the default priority.
>
> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
> ---
> drivers/xen/events/Makefile | 1 +
> drivers/xen/events/events.c | 7 +-
> drivers/xen/events/events_internal.h | 8 +
> drivers/xen/events/fifo.c | 395 ++++++++++++++++++++++++++++++++++
> 4 files changed, 410 insertions(+), 1 deletions(-)
> create mode 100644 drivers/xen/events/fifo.c
>
> diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile
> index aea331e..74644d0 100644
> --- a/drivers/xen/events/Makefile
> +++ b/drivers/xen/events/Makefile
> @@ -1,2 +1,3 @@
> obj-y += events.o
> +obj-y += fifo.o
> obj-y += n-level.o
> diff --git a/drivers/xen/events/events.c b/drivers/xen/events/events.c
> index 6c55b1f..561ae7b 100644
> --- a/drivers/xen/events/events.c
> +++ b/drivers/xen/events/events.c
> @@ -1518,6 +1518,7 @@ void xen_irq_resume(void)
>
> /* New event-channel space is not 'live' yet. */
> xen_evtchn_mask_all();
> + xen_evtchn_resume();
>
> /* No IRQ <-> event-channel mappings. */
> list_for_each_entry(info, &xen_irq_list_head, list)
> @@ -1618,7 +1619,11 @@ void __init xen_init_IRQ(void)
> {
> int ret;
>
> - xen_evtchn_init_nlevel();
> + ret = xen_evtchn_init_fifo();
> + if (ret < 0) {
> + printk(KERN_INFO "xen: falling back to n-level event channels");
> + xen_evtchn_init_nlevel();
> + }
>
> evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()),
> sizeof(*evtchn_to_irq), GFP_KERNEL);
> diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
> index 9d8b70c..f7fbcea 100644
> --- a/drivers/xen/events/events_internal.h
> +++ b/drivers/xen/events/events_internal.h
> @@ -69,6 +69,7 @@ struct evtchn_ops {
> void (*unmask)(int port);
>
> void (*handle_events)(int cpu);
> + void (*resume)(void);
> };
>
> extern const struct evtchn_ops *evtchn_ops;
> @@ -137,6 +138,13 @@ static inline void xen_evtchn_handle_events(int cpu)
> return evtchn_ops->handle_events(cpu);
> }
>
> +static inline void xen_evtchn_resume(void)
> +{
> + if (evtchn_ops->resume)
> + evtchn_ops->resume();
> +}
> +
> void xen_evtchn_init_nlevel(void);
> +int xen_evtchn_init_fifo(void);
>
> #endif /* #ifndef __EVENTS_INTERNAL_H__ */
> diff --git a/drivers/xen/events/fifo.c b/drivers/xen/events/fifo.c
> new file mode 100644
> index 0000000..8047430
> --- /dev/null
> +++ b/drivers/xen/events/fifo.c
> @@ -0,0 +1,395 @@
> +/*
> + * Xen event channels (FIFO-based ABI)
> + *
> + * Copyright (C) 2013 Citrix Systems R&D ltd.
> + *
> + * This source code is licensed under the GNU General Public License,
> + * Version 2 or later. See the file COPYING for more details.
> + */
> +
> +#include <linux/linkage.h>
> +#include <linux/interrupt.h>
> +#include <linux/irq.h>
> +#include <linux/module.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +#include <linux/cpu.h>
> +
> +#include <asm/sync_bitops.h>
> +#include <asm/xen/hypercall.h>
> +#include <asm/xen/hypervisor.h>
> +#include <asm/xen/page.h>
> +
> +#include <xen/xen.h>
> +#include <xen/xen-ops.h>
> +#include <xen/events.h>
> +#include <xen/interface/xen.h>
> +#include <xen/interface/event_channel.h>
> +
> +#include "events_internal.h"
> +
> +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t))
> +#define MAX_EVENT_ARRAY_PAGES ((1 << EVTCHN_FIFO_LINK_BITS) \
> + / EVENT_WORDS_PER_PAGE)
> +
> +struct evtchn_fifo_queue {
> + uint32_t head[EVTCHN_FIFO_MAX_QUEUES];
> +};
> +
> +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block);
> +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue);
> +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES];
> +static unsigned event_array_pages;
__read_mostly
> +
> +#define BM(w) ((unsigned long *)(w))
> +
> +static inline event_word_t *event_word_from_port(int port)
Should port be 'unsigned int'?
> +{
> + int i = port / EVENT_WORDS_PER_PAGE;
And this be 'unsigned' as well?
> +
> + if (i >= event_array_pages)
> + return NULL;
> + return event_array[i] + port % EVENT_WORDS_PER_PAGE;
> +}
> +
> +static unsigned fifo_max_channels(void)
> +{
> + return 1 << EVTCHN_FIFO_LINK_BITS;
> +}
> +
> +static unsigned fifo_nr_channels(void)
> +{
> + return event_array_pages * EVENT_WORDS_PER_PAGE;
> +}
> +
> +static void free_unused_array_pages(void)
> +{
> + unsigned i;
> +
> + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) {
> + if (!event_array[i])
> + break;
> + free_page((unsigned long)event_array[i]);
> + event_array[i] = NULL;
> + }
> +}
> +
> +static int fifo_setup(struct irq_info *info)
> +{
> + int port = info->evtchn;
unsigned int?
> + int i;
> + int ret = -ENOMEM;
> +
> + i = port / EVENT_WORDS_PER_PAGE;
> +
> + if (i >= MAX_EVENT_ARRAY_PAGES)
> + return -EINVAL;
> +
> + while (i >= event_array_pages) {
> + void *array_page;
> + struct evtchn_expand_array expand_array;
> +
> + /* Might already have a page if we've resumed. */
> + array_page = event_array[event_array_pages];
> + if (!array_page) {
> + array_page = (void *)get_zeroed_page(GFP_KERNEL);
> + if (array_page == NULL)
> + goto error;
> + event_array[event_array_pages] = array_page;
> + } else
> + memset(array_page, 0, PAGE_SIZE);
> +
> + expand_array.array_mfn = virt_to_mfn(array_page);
> +
> + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array);
> + if (ret < 0)
> + goto error;
> +
> + event_array_pages++;
> + }
> + return 0;
> +
> + error:
> + if (event_array_pages == 0)
> + panic("xen: unable to expand event array with initial page (%d)\n", ret);
Shouldn't we just printk and return a negative value so we can use
the old style event channels?
> + else
> + printk(KERN_ERR "xen: unable to expand event array (%d)\n", ret);
> + free_unused_array_pages();
> + return ret;
> +}
> +
> +static void fifo_bind_to_cpu(struct irq_info *info, int cpu)
> +{
> + /* no-op */
Perhaps you can also say: "The event array is shared between all of the
guests VCPUs." (from design-E).
> +}
> +
> +static void fifo_clear_pending(int port)
> +{
> + event_word_t *word = event_word_from_port(port);
> + sync_clear_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static void fifo_set_pending(int port)
> +{
> + event_word_t *word = event_word_from_port(port);
> + sync_set_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static bool fifo_is_pending(int port)
> +{
> + event_word_t *word = event_word_from_port(port);
> + return sync_test_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static bool fifo_test_and_set_mask(int port)
> +{
> + event_word_t *word = event_word_from_port(port);
> + return sync_test_and_set_bit(EVTCHN_FIFO_MASKED, BM(word));
> +}
> +
> +static void fifo_mask(int port)
> +{
> + event_word_t *word = event_word_from_port(port);
> + if (word)
> + sync_set_bit(EVTCHN_FIFO_MASKED, BM(word));
> +}
> +
> +static void fifo_unmask(int port)
> +{
> + unsigned int cpu = get_cpu();
> + bool do_hypercall = false;
> + bool evtchn_pending = false;
> +
> + BUG_ON(!irqs_disabled());
> +
> + if (unlikely((cpu != cpu_from_evtchn(port))))
> + do_hypercall = true;
Since the event array is shared across all of the vCPUs is this still
needed?
> + else {
> + event_word_t *word = event_word_from_port(port);
> +
> + sync_clear_bit(EVTCHN_FIFO_MASKED, BM(word));
> + evtchn_pending = sync_test_bit(EVTCHN_FIFO_PENDING, BM(word));
> + if (evtchn_pending)
> + do_hypercall = true;
> + }
> +
> + if (do_hypercall) {
> + struct evtchn_unmask unmask = { .port = port };
> + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
> + }
> +
> + put_cpu();
> +}
> +
> +static uint32_t clear_linked(volatile event_word_t *word)
> +{
> + event_word_t n, o, w;
How about just 'new', 'old', 'temp' ?
> +
> + w = *word;
> +
> + do {
> + o = w;
> + n = (w & ~((1 << EVTCHN_FIFO_LINKED) | EVTCHN_FIFO_LINK_MASK));
> + } while ( (w = sync_cmpxchg(word, o, n)) != o );
> +
> + return w & EVTCHN_FIFO_LINK_MASK;
> +}
> +
> +static void handle_irq_for_port(int port)
unsigned int ?
> +{
> + int irq;
> + struct irq_desc *desc;
> +
> + irq = get_evtchn_to_irq(port);
> + if (irq != -1) {
> + desc = irq_to_desc(irq);
> + if (desc)
> + generic_handle_irq_desc(irq, desc);
> + }
> +}
> +
> +static void consume_one_event(int cpu,
> + struct evtchn_fifo_control_block *control_block,
> + int priority, uint32_t *ready)
> +{
> + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
> + uint32_t head;
> + int port;
unsigned?
> + event_word_t *word;
> +
> + head = q->head[priority];
> +
> + /* Reached the tail last time? Read the new HEAD from the
> + control block. */
> + if (head == 0) {
> + rmb(); /* Ensure word is up-to-date before reading head. */
> + head = control_block->head[priority];
> + }
> +
> + port = head;
> + word = event_word_from_port(port);
> + head = clear_linked(word);
> +
> + /*
> + * If the link is non-zero, there are more events in the
> + * queue, otherwise the queue is empty.
> + *
> + * If the queue is empty, clear this priority from our local
> + * copy of the ready word.
> + */
> + if (head == 0)
> + clear_bit(priority, BM(ready));
> +
> + if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))
> + && !sync_test_bit(EVTCHN_FIFO_MASKED, BM(word)))
> + handle_irq_for_port(port);
> +
> + q->head[priority] = head;
> +}
> +
> +#define EVTCHN_FIFO_READY_MASK ((1 << EVTCHN_FIFO_MAX_QUEUES) - 1)
That could go to the header that defines EVTCHN_FIFO_MAX_QUEUES.
> +
> +static void fifo_handle_events(int cpu)
> +{
> + struct evtchn_fifo_control_block *control_block;
> + uint32_t ready;
> + int q;
> +
> + control_block = per_cpu(cpu_control_block, cpu);
> +
> + ready = xchg(&control_block->ready, 0);
> +
> + while (ready & EVTCHN_FIFO_READY_MASK) {
> + q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES);
> + consume_one_event(cpu, control_block, q, &ready);
> + ready |= xchg(&control_block->ready, 0);
Say priority 0 is where VIRQ_TIMER is set and the TIMER is set to
be incredibly short.
Couldn't this cause a scenario where the guest clears the ready in
consume_one_event (clear_bit(0, BM(ready)) and the hypervisor
at the same time can set the bit. We then process the IRQ (virq_timer).
Then we get to the xchg here and end up with the ready having the
priority 0 bit set again.
And then loop again and again..
> + }
> +}
> +
> +static void fifo_resume(void)
> +{
> + unsigned cpu;
> +
> + for_each_possible_cpu(cpu) {
> + void *control_block = per_cpu(cpu_control_block, cpu);
> + struct evtchn_init_control init_control;
> + int ret;
> +
> + if (!control_block)
> + continue;
> +
> + /*
> + * If this CPU is offline, take the opportunity to
> + * free the control block while it is not being
> + * used.
> + */
> + if (!cpu_online(cpu)) {
> + free_page((unsigned long)control_block);
> + per_cpu(cpu_control_block, cpu) = NULL;
> + continue;
> + }
> +
> + init_control.control_mfn = virt_to_mfn(control_block);
> + init_control.offset = 0;
> + init_control.vcpu = cpu;
> +
> + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control,
> + &init_control);
> + if (ret < 0)
> + BUG();
So if this is run after migration to an older hypevisor won't we
blow up here? Shouldn't we just exit with an return and use the
old style classis events?
> + }
> +
> + /*
> + * The event array starts out as empty again and is extended
> + * as normal when events are bound. The existing pages will
> + * be reused.
> + */
> + event_array_pages = 0;
> +}
> +
> +static const struct evtchn_ops evtchn_ops_fifo = {
> + .max_channels = fifo_max_channels,
> + .nr_channels = fifo_nr_channels,
> + .setup = fifo_setup,
> + .bind_to_cpu = fifo_bind_to_cpu,
> + .clear_pending = fifo_clear_pending,
> + .set_pending = fifo_set_pending,
> + .is_pending = fifo_is_pending,
> + .test_and_set_mask = fifo_test_and_set_mask,
> + .mask = fifo_mask,
> + .unmask = fifo_unmask,
> + .handle_events = fifo_handle_events,
> + .resume = fifo_resume,
> +};
> +
> +static int __cpuinit fifo_init_control_block(int cpu)
> +{
> + struct page *control_block = NULL;
> + struct evtchn_init_control init_control;
> + int ret = -ENOMEM;
> +
> + control_block = alloc_page(GFP_KERNEL|__GFP_ZERO);
> + if (control_block == NULL)
> + goto error;
> +
> + init_control.control_mfn = virt_to_mfn(page_address(control_block));
> + init_control.offset = 0;
> + init_control.vcpu = cpu;
> +
> + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control);
> + if (ret < 0)
> + goto error;
> +
> + per_cpu(cpu_control_block, cpu) = page_address(control_block);
> +
> + return 0;
> +
> + error:
> + __free_page(control_block);
> + return ret;
> +}
> +
> +static int __cpuinit fifo_cpu_notification(struct notifier_block *self,
> + unsigned long action, void *hcpu)
> +{
> + int cpu = (long)hcpu;
> + int ret = 0;
> +
> + switch (action) {
> + case CPU_UP_PREPARE:
> + if (!per_cpu(cpu_control_block, cpu))
> + ret = fifo_init_control_block(cpu);
> + break;
> + default:
> + break;
> + }
> + return ret < 0 ? NOTIFY_BAD : NOTIFY_OK;
I think you should return NOTIFY_OK.
Say you do this:
1) Launch guest on a new hypervisor (which has FIFO ABI)
2) Bring down all CPUs but one.
3) Migrate to an older hypervisor (no FIFI ABI)
4) Bring up all of the CPUs.
We shouldn't return NOTIFY_BAD b/c the hypervisor we are resuming
on is too old. We should just continue on without the FIFO mechanism
in place.
> +}
> +
> +static struct notifier_block fifo_cpu_notifier __cpuinitdata = {
> + .notifier_call = fifo_cpu_notification,
> +};
> +
> +
> +int __init xen_evtchn_init_fifo(void)
> +{
> + int cpu = get_cpu();
> + int ret;
> +
> + ret = fifo_init_control_block(cpu);
> + if (ret < 0)
> + goto error;
> +
> + printk(KERN_INFO "xen: switching to FIFO-based event channels\n");
> +
> + evtchn_ops = &evtchn_ops_fifo;
> +
> + register_cpu_notifier(&fifo_cpu_notifier);
> +
> + put_cpu();
> + return 0;
> +
> + error:
> + put_cpu();
> + return ret;
> +}
> --
> 1.7.2.5
>
next prev parent reply other threads:[~2013-09-24 15:50 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-09-13 16:59 (no subject) David Vrabel
2013-09-13 16:59 ` [PATCH 01/13] xen/events: refactor retrigger_dynirq() and resend_irq_on_evtchn() David Vrabel
2013-09-24 14:37 ` Konrad Rzeszutek Wilk
2013-09-24 16:04 ` David Vrabel
2013-09-13 16:59 ` [PATCH 02/13] xen/events: remove unnecessary init_evtchn_cpu_bindings() David Vrabel
2013-09-24 14:40 ` Konrad Rzeszutek Wilk
2013-09-24 16:06 ` David Vrabel
2013-09-24 16:49 ` Konrad Rzeszutek Wilk
2013-09-13 16:59 ` [PATCH 03/13] xen/events: introduce test_and_set_mask David Vrabel
2013-09-24 14:40 ` Konrad Rzeszutek Wilk
2013-09-13 16:59 ` [PATCH 04/13] xen/events: replace raw bit ops with functions David Vrabel
2013-09-24 14:41 ` Konrad Rzeszutek Wilk
2013-09-13 16:59 ` [PATCH 05/13] xen/events: move drivers/xen/events.c into drivers/xen/events/ David Vrabel
2013-09-13 16:59 ` [PATCH 06/13] xen/events: move 2-level specific code into its own file David Vrabel
2013-09-13 16:59 ` [PATCH 07/13] xen/events: add struct evtchn_ops for the low-level port operations David Vrabel
2013-09-24 14:44 ` Konrad Rzeszutek Wilk
2013-09-13 16:59 ` [PATCH 08/13] xen/events: allow setup of irq_info to fail David Vrabel
2013-09-24 14:47 ` Konrad Rzeszutek Wilk
2013-09-13 16:59 ` [PATCH 09/13] xen/events: add a evtchn_op for port setup David Vrabel
2013-09-24 14:47 ` Konrad Rzeszutek Wilk
2013-09-13 16:59 ` [PATCH 10/13] xen/events: Refactor evtchn_to_irq array to be dynamically allocated David Vrabel
2013-09-24 14:58 ` Konrad Rzeszutek Wilk
2013-09-26 10:53 ` David Vrabel
2013-09-26 12:51 ` Konrad Rzeszutek Wilk
2013-09-13 16:59 ` [PATCH 11/13] xen/events: add xen_evtchn_mask_all() David Vrabel
2013-09-24 14:58 ` Konrad Rzeszutek Wilk
2013-09-13 17:00 ` [PATCH 12/13] xen/events: Add the hypervisor interface for the FIFO-based event channels David Vrabel
2013-09-24 15:06 ` Konrad Rzeszutek Wilk
2013-09-26 11:06 ` David Vrabel
2013-09-24 15:08 ` Konrad Rzeszutek Wilk
2013-09-24 16:11 ` David Vrabel
2013-09-24 16:51 ` Konrad Rzeszutek Wilk
2013-09-13 17:00 ` [PATCH 13/13] xen/events: use the FIFO-based ABI if available David Vrabel
2013-09-24 15:50 ` Konrad Rzeszutek Wilk [this message]
2013-09-24 16:48 ` David Vrabel
2013-09-24 17:04 ` Konrad Rzeszutek Wilk
2013-09-13 17:03 ` [RFC PATCHv3 00/12] Linux: FIFO-based event channel ABI David Vrabel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130924155036.GM4712@phenom.dumpdata.com \
--to=konrad.wilk@oracle.com \
--cc=boris.ostrovsky@oracle.com \
--cc=david.vrabel@citrix.com \
--cc=jbeulich@suse.com \
--cc=xen-devel@lists.xen.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.