From: Boris Ostrovsky
Subject: Re: [PATCH 16/16] xen/events: use the FIFO-based ABI if available
Date: Mon, 14 Oct 2013 15:30:43 -0400
Message-ID: <525C4663.70907@oracle.com>
References: <1381236555-27493-1-git-send-email-david.vrabel@citrix.com>
 <1381236555-27493-17-git-send-email-david.vrabel@citrix.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"; Format="flowed"
Content-Transfer-Encoding: 7bit
In-Reply-To: <1381236555-27493-17-git-send-email-david.vrabel@citrix.com>
Sender: xen-devel-bounces@lists.xen.org
Errors-To: xen-devel-bounces@lists.xen.org
To: David Vrabel
Cc: Jan Beulich, xen-devel@lists.xen.org
List-Id: xen-devel@lists.xenproject.org

On 10/08/2013 08:49 AM, David Vrabel wrote:
> From: David Vrabel
>
> Implement all the event channel port ops for the FIFO-based ABI.
>
> If the hypervisor supports the FIFO-based ABI, enable it by
> initializing the control block for the boot VCPU and subsequent VCPUs
> as they are brought up and on resume. The event array is expanded as
> required when event ports are setup.
>
> Signed-off-by: David Vrabel
> ---
>  drivers/xen/events/Makefile          |    1 +
>  drivers/xen/events/events.c          |    9 +-
>  drivers/xen/events/events_fifo.c     |  409 ++++++++++++++++++++++++++++++++++
>  drivers/xen/events/events_internal.h |    8 +
>  4 files changed, 426 insertions(+), 1 deletions(-)
>  create mode 100644 drivers/xen/events/events_fifo.c
>
> diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile
> index 3bed6e0..d0f1581 100644
> --- a/drivers/xen/events/Makefile
> +++ b/drivers/xen/events/Makefile
> @@ -1,2 +1,3 @@
>  obj-y += events.o
>  obj-y += events_2l.o
> +obj-y += events_fifo.o
> diff --git a/drivers/xen/events/events.c b/drivers/xen/events/events.c
> index fd65485..dec4d6d 100644
> --- a/drivers/xen/events/events.c
> +++ b/drivers/xen/events/events.c
> @@ -1538,6 +1538,7 @@ void xen_irq_resume(void)
>
>          /* New event-channel space is not 'live' yet. */
>          xen_evtchn_mask_all();
> +        xen_evtchn_resume();
>
>          /* No IRQ <-> event-channel mappings. */
>          list_for_each_entry(info, &xen_irq_list_head, list)
> @@ -1636,7 +1637,13 @@ void xen_callback_vector(void) {}
>
>  void __init xen_init_IRQ(void)
>  {
> -        xen_evtchn_2l_init();
> +        int ret;
> +
> +        ret = xen_evtchn_fifo_init();
> +        if (ret < 0) {
> +                printk(KERN_INFO "xen: falling back to n-level event channels");
> +                xen_evtchn_2l_init();
> +        }

Should we provide users with the ability to choose which mechanism to
use? Is there any advantage in staying with 2-level? Stability, I
guess, would be one.
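
If a manual override is wanted, a boot-time switch would be one way to
do it. Below is only a sketch to illustrate the idea -- the parameter
name and placement are made up, not something this patch defines:

    #include <linux/moduleparam.h>

    /* Illustrative only: the name is made up.  Booting with
     * "xen_fifo_events=0" would force the 2-level ABI. */
    static bool fifo_events = true;
    core_param(xen_fifo_events, fifo_events, bool, 0444);

    void __init xen_init_IRQ(void)
    {
            int ret = -EINVAL;

            if (fifo_events)
                    ret = xen_evtchn_fifo_init();
            if (ret < 0) {
                    printk(KERN_INFO
                           "xen: falling back to n-level event channels\n");
                    xen_evtchn_2l_init();
            }
            /* ... rest of xen_init_IRQ() unchanged ... */
    }

(While at it: the printk() in the hunk above appears to be missing its
trailing "\n".)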
>
>          evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()),
>                                  sizeof(*evtchn_to_irq), GFP_KERNEL);
> diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
> new file mode 100644
> index 0000000..c109639
> --- /dev/null
> +++ b/drivers/xen/events/events_fifo.c
> @@ -0,0 +1,409 @@
> +/*
> + * Xen event channels (FIFO-based ABI)
> + *
> + * Copyright (C) 2013 Citrix Systems R&D ltd.
> + *
> + * This source code is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation; either version 2 of the
> + * License, or (at your option) any later version.
> + *
> + * Or, when distributed separately from the Linux kernel or
> + * incorporated into other software packages, subject to the following
> + * license:
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this source file (the "Software"), to deal in the Software without
> + * restriction, including without limitation the rights to use, copy, modify,
> + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
> + * and to permit persons to whom the Software is furnished to do so, subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include
> +#include
> +#include
> +#include
> +#include
> +#include
> +#include
> +
> +#include
> +#include
> +#include
> +#include
> +
> +#include
> +#include
> +#include
> +#include
> +#include
> +
> +#include "events_internal.h"
> +
> +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t))
> +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE)
> +
> +struct evtchn_fifo_queue {
> +        uint32_t head[EVTCHN_FIFO_MAX_QUEUES];
> +};
> +
> +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block);
> +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue);
> +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly;
> +static unsigned event_array_pages __read_mostly;
> +
> +#define BM(w) ((unsigned long *)(w))

This could go into a header file (events_internal.h?) since 2-level
uses it as well.

> +
> +static inline event_word_t *event_word_from_port(unsigned port)
> +{
> +        unsigned i = port / EVENT_WORDS_PER_PAGE;
> +
> +        if (i >= event_array_pages)
> +                return NULL;
> +        return event_array[i] + port % EVENT_WORDS_PER_PAGE;
> +}
> +
> +static unsigned evtchn_fifo_max_channels(void)
> +{
> +        return EVTCHN_FIFO_NR_CHANNELS;
> +}
> +
> +static unsigned evtchn_fifo_nr_channels(void)
> +{
> +        return event_array_pages * EVENT_WORDS_PER_PAGE;
> +}
> +
> +static void free_unused_array_pages(void)
> +{
> +        unsigned i;
> +
> +        for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) {
> +                if (!event_array[i])
> +                        break;
> +                free_page((unsigned long)event_array[i]);
> +                event_array[i] = NULL;
> +        }
> +}
> +
> +static void init_array_page(event_word_t *array_page)
> +{
> +        unsigned i;
> +
> +        for (i = 0; i < EVENT_WORDS_PER_PAGE; i++)
> +                array_page[i] = 1 << EVTCHN_FIFO_MASKED;
> +}
> +
> +static int evtchn_fifo_setup(struct irq_info *info)
> +{
> +        unsigned port = info->evtchn;
> +        unsigned i;
> +        int ret = -ENOMEM;
> +
> +        i = port / EVENT_WORDS_PER_PAGE;

Something more descriptive than 'i' would be better.
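
Any descriptive name would do, e.g. (the name is made up):

    unsigned page_idx = port / EVENT_WORDS_PER_PAGE;

And on the BM() remark further up, this is roughly what I had in mind
-- a sketch only, final placement is of course up to you:

    /* events_internal.h: both events_2l.c and events_fifo.c currently
     * carry a private copy of this cast for the sync_*_bit() helpers,
     * so it could live here instead. */
    #define BM(w) ((unsigned long *)(w))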
> +
> +        if (i >= MAX_EVENT_ARRAY_PAGES)
> +                return -EINVAL;
> +
> +        while (i >= event_array_pages) {
> +                void *array_page;
> +                struct evtchn_expand_array expand_array;
> +
> +                /* Might already have a page if we've resumed. */
> +                array_page = event_array[event_array_pages];
> +                if (!array_page) {
> +                        array_page = (void *)__get_free_page(GFP_KERNEL);
> +                        if (array_page == NULL)
> +                                goto error;
> +                        event_array[event_array_pages] = array_page;
> +                }
> +
> +                /* Mask all events in this page before adding it. */
> +                init_array_page(array_page);
> +
> +                expand_array.array_gfn = virt_to_mfn(array_page);
> +
> +                ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array);
> +                if (ret < 0)
> +                        goto error;
> +
> +                event_array_pages++;

Should this increment happen in the 'if (!array_page)' clause?

> +        }
> +        return 0;
> +
> + error:
> +        if (event_array_pages == 0)
> +                panic("xen: unable to expand event array with initial page (%d)\n", ret);
> +        else
> +                printk(KERN_ERR "xen: unable to expand event array (%d)\n", ret);
> +        free_unused_array_pages();

Do you need to clean up in the hypervisor as well?

> +        return ret;
> +}
> +
> +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu)
> +{
> +        /* no-op */
> +}
> +
> +static void evtchn_fifo_clear_pending(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        sync_clear_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static void evtchn_fifo_set_pending(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        sync_set_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static bool evtchn_fifo_is_pending(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        return sync_test_bit(EVTCHN_FIFO_PENDING, BM(word));
> +}
> +
> +static bool evtchn_fifo_test_and_set_mask(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        return sync_test_and_set_bit(EVTCHN_FIFO_MASKED, BM(word));
> +}
> +
> +static void evtchn_fifo_mask(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +        if (word)
> +                sync_set_bit(EVTCHN_FIFO_MASKED, BM(word));

You are testing 'word' here but not in the routines above (or below).

> +}
> +
> +static void evtchn_fifo_unmask(unsigned port)
> +{
> +        event_word_t *word = event_word_from_port(port);
> +
> +        BUG_ON(!irqs_disabled());
> +
> +        sync_clear_bit(EVTCHN_FIFO_MASKED, BM(word));
> +        if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))) {
> +                struct evtchn_unmask unmask = { .port = port };
> +                (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
> +        }
> +}

2-level unmasking is somewhat more elaborate: it tries to avoid races
on pending events. Is this not a concern here?

> +
> +static uint32_t clear_linked(volatile event_word_t *word)
> +{
> +        event_word_t new, old, w;
> +
> +        w = *word;
> +
> +        do {
> +                old = w;
> +                new = (w & ~((1 << EVTCHN_FIFO_LINKED)
> +                             | EVTCHN_FIFO_LINK_MASK));
> +        } while ((w = sync_cmpxchg(word, old, new)) != old);
> +
> +        return w & EVTCHN_FIFO_LINK_MASK;
> +}
> +
> +static void handle_irq_for_port(unsigned port)
> +{
> +        int irq;
> +        struct irq_desc *desc;
> +
> +        irq = get_evtchn_to_irq(port);
> +        if (irq != -1) {
> +                desc = irq_to_desc(irq);
> +                if (desc)
> +                        generic_handle_irq_desc(irq, desc);
> +        }
> +}
> +
> +static void consume_one_event(unsigned cpu,
> +                              struct evtchn_fifo_control_block *control_block,
> +                              unsigned priority, uint32_t *ready)
> +{
> +        struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
> +        uint32_t head;
> +        unsigned port;
> +        event_word_t *word;
> +
> +        head = q->head[priority];
> +
> +        /* Reached the tail last time? Read the new HEAD from the
> +           control block. */

Comment style.
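
i.e., the usual multi-line form:

    /*
     * Reached the tail last time? Read the new HEAD from the
     * control block.
     */

And going back to the 'word' checks a bit further up: if the NULL test
is meant to stay, the single-word helpers could all share the same
pattern, e.g. (sketch only):

    static void evtchn_fifo_clear_pending(unsigned port)
    {
            event_word_t *word = event_word_from_port(port);

            if (!word)
                    return;
            sync_clear_bit(EVTCHN_FIFO_PENDING, BM(word));
    }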
> +        if (head == 0) {
> +                rmb(); /* Ensure word is up-to-date before reading head. */
> +                head = control_block->head[priority];
> +        }
> +
> +        port = head;
> +        word = event_word_from_port(port);

Do you need to check for 'word != NULL'? You don't check it in
clear_linked() (which is maybe where this should be done).

> +        head = clear_linked(word);
> +
> +        /*
> +         * If the link is non-zero, there are more events in the
> +         * queue, otherwise the queue is empty.
> +         *
> +         * If the queue is empty, clear this priority from our local
> +         * copy of the ready word.
> +         */
> +        if (head == 0)
> +                clear_bit(priority, BM(ready));
> +
> +        if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))
> +            && !sync_test_bit(EVTCHN_FIFO_MASKED, BM(word)))
> +                handle_irq_for_port(port);
> +
> +        q->head[priority] = head;
> +}
> +
> +static void evtchn_fifo_handle_events(unsigned cpu)
> +{
> +        struct evtchn_fifo_control_block *control_block;
> +        uint32_t ready;
> +        unsigned q;
> +
> +        control_block = per_cpu(cpu_control_block, cpu);
> +
> +        ready = xchg(&control_block->ready, 0);
> +
> +        while (ready) {
> +                q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES);
> +                consume_one_event(cpu, control_block, q, &ready);
> +                ready |= xchg(&control_block->ready, 0);
> +        }
> +}
> +
> +static void evtchn_fifo_resume(void)
> +{
> +        unsigned cpu;
> +
> +        for_each_possible_cpu(cpu) {
> +                void *control_block = per_cpu(cpu_control_block, cpu);
> +                struct evtchn_init_control init_control;
> +                int ret;
> +
> +                if (!control_block)
> +                        continue;
> +
> +                /*
> +                 * If this CPU is offline, take the opportunity to
> +                 * free the control block while it is not being
> +                 * used.
> +                 */
> +                if (!cpu_online(cpu)) {
> +                        free_page((unsigned long)control_block);
> +                        per_cpu(cpu_control_block, cpu) = NULL;
> +                        continue;
> +                }

Have you tested offlining/onlining CPUs (lots of them)? I am asking
because I see EVTCHNOP_init_control both here and in
init_control_block() but I don't see anything that would "deinit" the
control block for which you are freeing the page above.
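
If the intent is that an offline CPU simply keeps its control block
registered until it comes back online (or until the next
save/restore), it may be worth spelling that out in the hotplug
notifier further down. A sketch of what I mean -- not part of the
patch, and assuming there really is no hypervisor-side teardown to
call:

    case CPU_DEAD:
            /*
             * Deliberately keep cpu_control_block: nothing detaches it
             * on the hypervisor side, so freeing the page now would
             * leave Xen with a stale reference.  It is reused if the
             * CPU comes back online, and evtchn_fifo_resume() frees it
             * on the next save/restore if the CPU is still offline.
             */
            break;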
> +
> +                init_control.control_gfn = virt_to_mfn(control_block);
> +                init_control.offset = 0;
> +                init_control.vcpu = cpu;
> +
> +                ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control,
> +                                                  &init_control);
> +                if (ret < 0)
> +                        BUG();
> +        }
> +
> +        /*
> +         * The event array starts out as empty again and is extended
> +         * as normal when events are bound. The existing pages will
> +         * be reused.
> +         */
> +        event_array_pages = 0;
> +}
> +
> +static const struct evtchn_ops evtchn_ops_fifo = {
> +        .max_channels      = evtchn_fifo_max_channels,
> +        .nr_channels       = evtchn_fifo_nr_channels,
> +        .setup             = evtchn_fifo_setup,
> +        .bind_to_cpu       = evtchn_fifo_bind_to_cpu,
> +        .clear_pending     = evtchn_fifo_clear_pending,
> +        .set_pending       = evtchn_fifo_set_pending,
> +        .is_pending        = evtchn_fifo_is_pending,
> +        .test_and_set_mask = evtchn_fifo_test_and_set_mask,
> +        .mask              = evtchn_fifo_mask,
> +        .unmask            = evtchn_fifo_unmask,
> +        .handle_events     = evtchn_fifo_handle_events,
> +        .resume            = evtchn_fifo_resume,
> +};
> +
> +static int __cpuinit evtchn_fifo_init_control_block(unsigned cpu)
> +{
> +        struct page *control_block = NULL;
> +        struct evtchn_init_control init_control;
> +        int ret = -ENOMEM;
> +
> +        control_block = alloc_page(GFP_KERNEL|__GFP_ZERO);
> +        if (control_block == NULL)
> +                goto error;
> +
> +        init_control.control_gfn = virt_to_mfn(page_address(control_block));
> +        init_control.offset = 0;
> +        init_control.vcpu = cpu;
> +
> +        ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control);
> +        if (ret < 0)
> +                goto error;
> +
> +        per_cpu(cpu_control_block, cpu) = page_address(control_block);
> +
> +        return 0;
> +
> + error:
> +        __free_page(control_block);
> +        return ret;
> +}
> +
> +static int __cpuinit evtchn_fifo_cpu_notification(struct notifier_block *self,
> +                                                  unsigned long action,
> +                                                  void *hcpu)
> +{
> +        int cpu = (long)hcpu;

I don't understand this. Yes, it's everywhere in the kernel but it
looks weird.

> +        int ret = 0;
> +
> +        switch (action) {
> +        case CPU_UP_PREPARE:
> +                if (!per_cpu(cpu_control_block, cpu))
> +                        ret = evtchn_fifo_init_control_block(cpu);
> +                break;
> +        default:
> +                break;
> +        }

What happens when you offline a CPU?

-boris

> +        return ret < 0 ? NOTIFY_BAD : NOTIFY_OK;
> +}
> +
> +static struct notifier_block evtchn_fifo_cpu_notifier __cpuinitdata = {
> +        .notifier_call = evtchn_fifo_cpu_notification,
> +};
> +
> +int __init xen_evtchn_fifo_init(void)
> +{
> +        int cpu = get_cpu();
> +        int ret;
> +
> +        ret = evtchn_fifo_init_control_block(cpu);
> +        if (ret < 0)
> +                goto out;
> +
> +        printk(KERN_INFO "xen: switching to FIFO-based event channels\n");
> +
> +        evtchn_ops = &evtchn_ops_fifo;
> +
> +        register_cpu_notifier(&evtchn_fifo_cpu_notifier);
> +out:
> +        put_cpu();
> +        return ret;
> +}
> diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
> index a3d9aec..0af518f 100644
> --- a/drivers/xen/events/events_internal.h
> +++ b/drivers/xen/events/events_internal.h
> @@ -69,6 +69,7 @@ struct evtchn_ops {
>          void (*unmask)(unsigned port);
>
>          void (*handle_events)(unsigned cpu);
> +        void (*resume)(void);
>  };
>
>  extern const struct evtchn_ops *evtchn_ops;
> @@ -142,6 +143,13 @@ static inline void xen_evtchn_handle_events(unsigned cpu)
>          return evtchn_ops->handle_events(cpu);
>  }
>
> +static inline void xen_evtchn_resume(void)
> +{
> +        if (evtchn_ops->resume)
> +                evtchn_ops->resume();
> +}
> +
>  void xen_evtchn_2l_init(void);
> +int xen_evtchn_fifo_init(void);
>
>  #endif /* #ifndef __EVENTS_INTERNAL_H__ */
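
P.S. On the "int cpu = (long)hcpu;" cast above: the hotplug core hands
the CPU number over through the notifier's void * argument (kernel/cpu.c
does roughly "(void *)(long)cpu" on its side), so the callback is just
reversing that. If the signed-long bounce reads too oddly, the unsigned
spelling works just as well:

    unsigned int cpu = (unsigned long)hcpu;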