From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Gregory Haskins" Subject: [PATCH] interrupt preemption support Date: Thu, 15 Mar 2007 17:10:56 -0400 Message-ID: <45F97019.BA47.005A.0@novell.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit To: Return-path: Content-Disposition: inline List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: kvm-devel-bounces-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org Errors-To: kvm-devel-bounces-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org List-Id: kvm.vger.kernel.org This patch adds baseline support for interrupt preemption. This lets one context (virtual SMP vcpu, PV driver, etc) preempt another vcpu which is currently in guest context when an interrupt is posted. The patch consists of the following: 1) Re-worked the vcpu mutex with a new "MPL" construct. This allows updates to the vcpu structure even if the vcpu is currently executing in guest mode. The previous code held a mutex during guest execution. 2) Exposed the previously static kvm_vcpu_ioctl_interrupt() as kvm_vcpu_interrupt so that other mechanisms (PV drivers) can issue interrupts just as the ioctl interface can 3) Utilizes the signal facility to interrupt an executing guest by virtue of the scheduler's IPI facility. All external interrupts (including IPI) will cause a VMEXIT (thanks for the hints, Avi) Whats missing: 1) SVN support- shouldnt break SVN..it just will serialize access if you try to post an interrupt. 2) IPI currently delivers SIGUSR1, which will cause QEMU to exit on reception. Note however that the code does not currently invoke the send_sig since there are not any parallel vcpu/PV tasks running, thus there is not an issue. Next step is to identify a good signal to use (perhaps we should start defining some RT-SIGS (e.g. KVM_SIG_VIRTUAL_INT = 33) in a header file somewhere. 
Signed-off by: Gregory Haskins Index: kvm-12/kernel/kvm.h =================================================================== --- kvm-12.orig/kernel/kvm.h +++ kvm-12/kernel/kvm.h @@ -12,6 +12,8 @@ #include #include +#include "mpl.h" + #include "vmx.h" #include #include @@ -219,15 +221,87 @@ enum { VCPU_SREG_LDTR, }; +typedef enum +{ + KVM_MPL_FREE, + KVM_MPL_VMCS, + KVM_MPL_VCPU, + KVM_MPL_FULL = KVM_MPL_VCPU /* Must be equal to the last entry */ +}kvm_mpl_locktypes_t; + +#define __KVM_MPL_LOCK(lock, min, max) \ + mpl_lock(lock, min, max) +#define __KVM_MPL_TRYLOCK(lock, min, max) \ + mpl_trylock(lock, min, max) +#define __KVM_MPL_UNLOCK(lock, restore) \ + mpl_unlock(lock, restore) + +// #define MPL_DEBUG + +#ifdef MPL_DEBUG +#define _KVM_MPL_LOCK(lock, min, max) \ +({ \ + int _ret; \ + printk(KERN_CRIT "MPL: Aquire lock %d:%d at %s:%d\n", \ + min, max, __FILE__, __LINE__); \ + dump_stack(); \ + _ret = __KVM_MPL_LOCK(lock, min, max); \ + printk(KERN_CRIT "MPL: Aquired with %d\n", _ret); \ + _ret; \ +}) + +#define _KVM_MPL_TRYLOCK(lock, min, max) \ +({ \ + int _ret; \ + printk(KERN_CRIT "MPL: Aquire trylock %d:%d at %s:%d\n", \ + min, max, __FILE__, __LINE__); \ + dump_stack(); \ + _ret = __KVM_MPL_TRYLOCK(lock, min, max); \ + printk(KERN_CRIT "MPL: Done with %d\n", _ret); \ + _ret; \ +}) +#define _KVM_MPL_UNLOCK(lock, restore) \ +({ \ + int _ret; \ + printk(KERN_CRIT "MPL: Unlock to %d at %s:%d\n", \ + restore, __FILE__, __LINE__); \ + dump_stack(); \ + _ret = __KVM_MPL_UNLOCK(lock, restore); \ + printk(KERN_CRIT "MPL: Unlocked with %d\n", _ret); \ + _ret; \ +}) +#else +#define _KVM_MPL_LOCK __KVM_MPL_LOCK +#define _KVM_MPL_TRYLOCK __KVM_MPL_TRYLOCK +#define _KVM_MPL_UNLOCK __KVM_MPL_UNLOCK +#endif + +/* These macros make the MPL lock behave like a standard mutex + * (which is the behavior we want in all but a few special cases) + */ +#define KVM_MPL_LOCK(lock) _KVM_MPL_LOCK(lock, KVM_MPL_FREE, KVM_MPL_FULL) +#define KVM_MPL_TRYLOCK(lock) \ +( 
(_KVM_MPL_TRYLOCK(lock, KVM_MPL_FREE, KVM_MPL_FULL) >= 0) ? 1 : 0 ) +#define KVM_MPL_UNLOCK(lock) _KVM_MPL_UNLOCK(lock, KVM_MPL_FREE) + +/* We use a seperate struct so that we can expand the task related info + * without adding noise to the main kvm_vcpu struct + */ +struct kvm_vcpu_task { + int running; + struct task_struct *owner; +}; + struct kvm_vcpu { struct kvm *kvm; union { struct vmcs *vmcs; struct vcpu_svm *svm; }; - struct mutex mutex; + mpl_t mpl; int cpu; int launched; + struct kvm_vcpu_task task; int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) @@ -485,6 +559,8 @@ void kvm_mmu_free_some_pages(struct kvm_ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_vcpu_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *intr); + static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { Index: kvm-12/kernel/kvm_main.c =================================================================== --- kvm-12.orig/kernel/kvm_main.c +++ kvm-12/kernel/kvm_main.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "x86_emulate.h" #include "segment_descriptor.h" @@ -257,7 +258,7 @@ EXPORT_SYMBOL_GPL(kvm_write_guest); */ static void vcpu_load(struct kvm_vcpu *vcpu) { - mutex_lock(&vcpu->mutex); + KVM_MPL_LOCK(&vcpu->mpl); kvm_arch_ops->vcpu_load(vcpu); } @@ -269,9 +270,9 @@ static struct kvm_vcpu *vcpu_load_slot(s { struct kvm_vcpu *vcpu = &kvm->vcpus[slot]; - mutex_lock(&vcpu->mutex); + KVM_MPL_LOCK(&vcpu->mpl); if (!vcpu->vmcs) { - mutex_unlock(&vcpu->mutex); + KVM_MPL_UNLOCK(&vcpu->mpl); return NULL; } kvm_arch_ops->vcpu_load(vcpu); @@ -281,7 +282,7 @@ static struct kvm_vcpu *vcpu_load_slot(s static void vcpu_put(struct kvm_vcpu *vcpu) { kvm_arch_ops->vcpu_put(vcpu); - mutex_unlock(&vcpu->mutex); + KVM_MPL_UNLOCK(&vcpu->mpl); } static struct kvm *kvm_create_vm(void) @@ -297,7 +298,7 @@ static struct kvm 
*kvm_create_vm(void) for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu = &kvm->vcpus[i]; - mutex_init(&vcpu->mutex); + mpl_init(&vcpu->mpl); vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -1523,6 +1524,15 @@ static int kvm_vcpu_ioctl_run(struct kvm vcpu->mmio_needed = 0; + if(!vcpu->task.owner) + vcpu->task.owner = current; + + if(vcpu->task.owner != current) + { + printk(KERN_WARNING "KVM: VCPU task owner change detected\n"); + vcpu->task.owner = current; + } + r = kvm_arch_ops->run(vcpu, kvm_run); vcpu_put(vcpu); @@ -1858,20 +1868,40 @@ static int kvm_vcpu_ioctl_translate(stru return 0; } -static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) +int kvm_vcpu_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) { - if (irq->irq < 0 || irq->irq >= 256) - return -EINVAL; - vcpu_load(vcpu); + int lockval; + + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + + /* We only care about locking the VCPU, not the VMCS */ + lockval = _KVM_MPL_LOCK(&vcpu->mpl, KVM_MPL_VMCS, KVM_MPL_VCPU); set_bit(irq->irq, vcpu->irq_pending); set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); - vcpu_put(vcpu); + /* If the vcpu is currently running, we want to send the + task an IPI (when applicable) to cause a VMEXIT */ + if(vcpu->task.running && vcpu->task.owner && + (vcpu->task.owner != current)) + { + send_sig(SIGUSR1, vcpu->task.owner, 0); + } + + /* Restore the lock to its previous value */ + _KVM_MPL_UNLOCK(&vcpu->mpl, lockval); return 0; } +EXPORT_SYMBOL_GPL(kvm_vcpu_interrupt); + +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + return kvm_vcpu_interrupt(vcpu, irq); +} static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -1954,10 +1984,10 @@ static int kvm_vm_ioctl_create_vcpu(stru vcpu = &kvm->vcpus[n]; - mutex_lock(&vcpu->mutex); + KVM_MPL_LOCK(&vcpu->mpl); if (vcpu->vmcs) { - mutex_unlock(&vcpu->mutex); + 
KVM_MPL_UNLOCK(&vcpu->mpl); return -EEXIST; } @@ -1990,7 +2020,7 @@ static int kvm_vm_ioctl_create_vcpu(stru out_free_vcpus: kvm_free_vcpu(vcpu); - mutex_unlock(&vcpu->mutex); + KVM_MPL_UNLOCK(&vcpu->mpl); out: return r; } @@ -2345,12 +2375,12 @@ static void decache_vcpus_on_cpu(int cpu * If it's not locked, check the last cpu it executed * on. */ - if (mutex_trylock(&vcpu->mutex)) { + if (KVM_MPL_TRYLOCK(&vcpu->mpl)) { if (vcpu->cpu == cpu) { kvm_arch_ops->vcpu_decache(vcpu); vcpu->cpu = -1; } - mutex_unlock(&vcpu->mutex); + KVM_MPL_UNLOCK(&vcpu->mpl); } } spin_unlock(&kvm_lock); Index: kvm-12/kernel/vmx.c =================================================================== --- kvm-12.orig/kernel/vmx.c +++ kvm-12/kernel/vmx.c @@ -1769,6 +1769,9 @@ again: save_msrs(vcpu->host_msrs, vcpu->nmsrs); load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + vcpu->task.running = 1; + _KVM_MPL_UNLOCK(&vcpu->mpl, KVM_MPL_VMCS); /* Downgrade the lock */ + asm ( /* Store host registers */ "pushf \n\t" @@ -1888,6 +1891,9 @@ again: [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); + _KVM_MPL_LOCK(&vcpu->mpl, KVM_MPL_VMCS, KVM_MPL_FULL); /* Reaquire */ + vcpu->task.running = 0; + ++kvm_stat.exits; save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); Index: kvm-12/kernel/mpl.h =================================================================== --- /dev/null +++ kvm-12/kernel/mpl.h @@ -0,0 +1,147 @@ +/* Multi-Phase Lock + * + * This is a construct which acts as a mutex with multiple levels. The first + * level (phase = 0) represents a "free" mutex that is available for + * consumption. Any other level (phase > 0) represents that one or more locks + * are being held. Each phase represents effectivly a new lock, where higher + * values offer finer grained locks, and lower values are coarser grained. + * + * This same construct could be represented with a unique mutex for each phase. + * However, in this model the burden of deadlock avoidance is placed on the + * developer. 
With MPL, deadlock is avoided because the locking order is + enforced by the construct as we move from coarser to finer grained locks. + E.g. you always must acquire lock 1 before 2, 2 before 3, etc. This is + enforced because locks are acquired via a range expression (e.g. 0:2 acquires + 0, 1, and 2, and in proper order) + + In the context of KVM, this is used to guard the VCPU structure. Level 1 + is the VMCS structure itself, and Level 2 is the entire VCPU + VMCS. Level + 2 represents what was previously considered vcpu->mutex. This lock is taken + whenever broad changes need to be made to the vcpu structure. However, + in the previous model, this lock was also held for the duration of a guest's + execution. Going forward, we want to support the notion of guest-preemption + and this requires finer granularity. So the new model has the RUN code + demote the MPL lock to VMCS mode prior to performing the VMENTER. This + allows code which only needs to modify the VCPU (but not the VMCS) to + acquire the partial lock. This is exploited to update pending interrupt + status while a VMCS is executing. + + This work is licensed under the terms of the GNU GPL, version 2. See + the COPYING file in the top-level directory. 
+ * + * Copyright (C) 2007 Novell + * + * Author: Gregory Haskins + * +*/ + +#ifndef __MPL_H +#define __MPL_H + +#include + +/* Condition Variable */ +typedef struct condvar { + wait_queue_head_t queue; +}condvar_t; + +static inline void condvar_init(condvar_t *cv) +{ + wait_queue_head_t __head = __WAIT_QUEUE_HEAD_INITIALIZER(cv->queue); + cv->queue = __head; +} + +static inline int condvar_wait(condvar_t *cv, struct mutex *m) +{ + DEFINE_WAIT(__wait); + int _ret = 0; + + /* first place ourselves on the waitqueue before releasing the lock */ + prepare_to_wait(&cv->queue, &__wait, TASK_UNINTERRUPTIBLE); + + /* now actually release the lock to unblock any potential signalers */ + mutex_unlock(m); + + /* finally, reschedule until we are re-awoken */ + schedule(); + finish_wait(&cv->queue, &__wait); + + /* if we get here, its because someone signaled us. reaquire the lock */ + mutex_lock(m); + + return _ret; +} + +static inline int condvar_signal(condvar_t *cv) +{ + wake_up(&cv->queue); + return 0; +} + +typedef struct mpl +{ + struct mutex mutex; + int phase; + condvar_t cv; + }mpl_t; + +static inline void mpl_init(mpl_t *lock) +{ + mutex_init(&lock->mutex); + lock->phase = 0; + condvar_init(&lock->cv); +} + +static inline int mpl_lock(mpl_t *lock, int min, int max) +{ + int _ret; + + mutex_lock(&lock->mutex); + + /* Wait for the phase to reach the low water */ + while(lock->phase > min) + condvar_wait(&lock->cv, &lock->mutex); + + BUG_ON(lock->phase > min); + + _ret = lock->phase; + lock->phase = max; + + mutex_unlock(&lock->mutex); + + return _ret; +} + +static inline int mpl_trylock(mpl_t *lock, int min, int max) +{ + int _ret = -EAGAIN; + + mutex_lock(&lock->mutex); + + if(lock->phase > min) + goto out; + + BUG_ON(lock->phase > min); + + _ret = lock->phase; + lock->phase = max; + + out: + mutex_unlock(&lock->mutex); + + return _ret; +} + +static inline int mpl_unlock(mpl_t *lock, int restore) +{ + mutex_lock(&lock->mutex); + + lock->phase = restore; + 
condvar_signal(&lock->cv); + + mutex_unlock(&lock->mutex); + + return 0; +} + +#endif /* __MPL_H */ ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV