* [PATCH] interrupt preemption support
@ 2007-03-15 21:10 Gregory Haskins
[not found] ` <45F97019.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: Gregory Haskins @ 2007-03-15 21:10 UTC (permalink / raw)
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
This patch adds baseline support for interrupt preemption. This lets one context (virtual SMP vcpu, PV driver, etc) preempt another vcpu which is currently in guest context when an interrupt is posted. The patch consists of the following:
1) Re-worked the vcpu mutex with a new "MPL" construct. This allows updates to the vcpu structure even if the vcpu is currently executing in guest mode. The previous code held a mutex during guest execution.
2) Exposed the previously static kvm_vcpu_ioctl_interrupt() as kvm_vcpu_interrupt so that other mechanisms (PV drivers) can issue interrupts just as the ioctl interface can
3) Utilizes the signal facility to interrupt an executing guest by virtue of the scheduler's IPI facility. All external interrupts (including IPI) will cause a VMEXIT (thanks for the hints, Avi)
Whats missing:
1) SVM support - shouldn't break SVM..it just will serialize access if you try to post an interrupt.
2) IPI currently delivers SIGUSR1, which will cause QEMU to exit on reception. Note however that the code does not currently invoke the send_sig since there are not any parallel vcpu/PV tasks running, thus there is not an issue. Next step is to identify a good signal to use (perhaps we should start defining some RT-SIGS (e.g. KVM_SIG_VIRTUAL_INT = 33) in a header file somewhere.
Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
Index: kvm-12/kernel/kvm.h
===================================================================
--- kvm-12.orig/kernel/kvm.h
+++ kvm-12/kernel/kvm.h
@@ -12,6 +12,8 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
+#include "mpl.h"
+
#include "vmx.h"
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -219,15 +221,87 @@ enum {
VCPU_SREG_LDTR,
};
+typedef enum
+{
+ KVM_MPL_FREE,
+ KVM_MPL_VMCS,
+ KVM_MPL_VCPU,
+ KVM_MPL_FULL = KVM_MPL_VCPU /* Must be equal to the last entry */
+}kvm_mpl_locktypes_t;
+
+#define __KVM_MPL_LOCK(lock, min, max) \
+ mpl_lock(lock, min, max)
+#define __KVM_MPL_TRYLOCK(lock, min, max) \
+ mpl_trylock(lock, min, max)
+#define __KVM_MPL_UNLOCK(lock, restore) \
+ mpl_unlock(lock, restore)
+
+// #define MPL_DEBUG
+
+#ifdef MPL_DEBUG
+#define _KVM_MPL_LOCK(lock, min, max) \
+({ \
+ int _ret; \
+ printk(KERN_CRIT "MPL: Aquire lock %d:%d at %s:%d\n", \
+ min, max, __FILE__, __LINE__); \
+ dump_stack(); \
+ _ret = __KVM_MPL_LOCK(lock, min, max); \
+ printk(KERN_CRIT "MPL: Aquired with %d\n", _ret); \
+ _ret; \
+})
+
+#define _KVM_MPL_TRYLOCK(lock, min, max) \
+({ \
+ int _ret; \
+ printk(KERN_CRIT "MPL: Aquire trylock %d:%d at %s:%d\n", \
+ min, max, __FILE__, __LINE__); \
+ dump_stack(); \
+ _ret = __KVM_MPL_TRYLOCK(lock, min, max); \
+ printk(KERN_CRIT "MPL: Done with %d\n", _ret); \
+ _ret; \
+})
+#define _KVM_MPL_UNLOCK(lock, restore) \
+({ \
+ int _ret; \
+ printk(KERN_CRIT "MPL: Unlock to %d at %s:%d\n", \
+ restore, __FILE__, __LINE__); \
+ dump_stack(); \
+ _ret = __KVM_MPL_UNLOCK(lock, restore); \
+ printk(KERN_CRIT "MPL: Unlocked with %d\n", _ret); \
+ _ret; \
+})
+#else
+#define _KVM_MPL_LOCK __KVM_MPL_LOCK
+#define _KVM_MPL_TRYLOCK __KVM_MPL_TRYLOCK
+#define _KVM_MPL_UNLOCK __KVM_MPL_UNLOCK
+#endif
+
+/* These macros make the MPL lock behave like a standard mutex
+ * (which is the behavior we want in all but a few special cases)
+ */
+#define KVM_MPL_LOCK(lock) _KVM_MPL_LOCK(lock, KVM_MPL_FREE, KVM_MPL_FULL)
+#define KVM_MPL_TRYLOCK(lock) \
+( (_KVM_MPL_TRYLOCK(lock, KVM_MPL_FREE, KVM_MPL_FULL) >= 0) ? 1 : 0 )
+#define KVM_MPL_UNLOCK(lock) _KVM_MPL_UNLOCK(lock, KVM_MPL_FREE)
+
+/* We use a separate struct so that we can expand the task related info
+ * without adding noise to the main kvm_vcpu struct
+ */
+struct kvm_vcpu_task {
+ int running;
+ struct task_struct *owner;
+};
+
struct kvm_vcpu {
struct kvm *kvm;
union {
struct vmcs *vmcs;
struct vcpu_svm *svm;
};
- struct mutex mutex;
+ mpl_t mpl;
int cpu;
int launched;
+ struct kvm_vcpu_task task;
int interrupt_window_open;
unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
@@ -485,6 +559,8 @@ void kvm_mmu_free_some_pages(struct kvm_
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int kvm_vcpu_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *intr);
+
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
Index: kvm-12/kernel/kvm_main.c
===================================================================
--- kvm-12.orig/kernel/kvm_main.c
+++ kvm-12/kernel/kvm_main.c
@@ -40,6 +40,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/signal.h>
#include "x86_emulate.h"
#include "segment_descriptor.h"
@@ -257,7 +258,7 @@ EXPORT_SYMBOL_GPL(kvm_write_guest);
*/
static void vcpu_load(struct kvm_vcpu *vcpu)
{
- mutex_lock(&vcpu->mutex);
+ KVM_MPL_LOCK(&vcpu->mpl);
kvm_arch_ops->vcpu_load(vcpu);
}
@@ -269,9 +270,9 @@ static struct kvm_vcpu *vcpu_load_slot(s
{
struct kvm_vcpu *vcpu = &kvm->vcpus[slot];
- mutex_lock(&vcpu->mutex);
+ KVM_MPL_LOCK(&vcpu->mpl);
if (!vcpu->vmcs) {
- mutex_unlock(&vcpu->mutex);
+ KVM_MPL_UNLOCK(&vcpu->mpl);
return NULL;
}
kvm_arch_ops->vcpu_load(vcpu);
@@ -281,7 +282,7 @@ static struct kvm_vcpu *vcpu_load_slot(s
static void vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_arch_ops->vcpu_put(vcpu);
- mutex_unlock(&vcpu->mutex);
+ KVM_MPL_UNLOCK(&vcpu->mpl);
}
static struct kvm *kvm_create_vm(void)
@@ -297,7 +298,7 @@ static struct kvm *kvm_create_vm(void)
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
struct kvm_vcpu *vcpu = &kvm->vcpus[i];
- mutex_init(&vcpu->mutex);
+ mpl_init(&vcpu->mpl);
vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -1523,6 +1524,15 @@ static int kvm_vcpu_ioctl_run(struct kvm
vcpu->mmio_needed = 0;
+ if(!vcpu->task.owner)
+ vcpu->task.owner = current;
+
+ if(vcpu->task.owner != current)
+ {
+ printk(KERN_WARNING "KVM: VCPU task owner change detected\n");
+ vcpu->task.owner = current;
+ }
+
r = kvm_arch_ops->run(vcpu, kvm_run);
vcpu_put(vcpu);
@@ -1858,20 +1868,40 @@ static int kvm_vcpu_ioctl_translate(stru
return 0;
}
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_interrupt *irq)
+int kvm_vcpu_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
{
- if (irq->irq < 0 || irq->irq >= 256)
- return -EINVAL;
- vcpu_load(vcpu);
+ int lockval;
+
+ if (irq->irq < 0 || irq->irq >= 256)
+ return -EINVAL;
+
+ /* We only care about locking the VCPU, not the VMCS */
+ lockval = _KVM_MPL_LOCK(&vcpu->mpl, KVM_MPL_VMCS, KVM_MPL_VCPU);
set_bit(irq->irq, vcpu->irq_pending);
set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
- vcpu_put(vcpu);
+ /* If the vcpu is currently running, we want to send the
+ task an IPI (when applicable) to cause a VMEXIT */
+ if(vcpu->task.running && vcpu->task.owner &&
+ (vcpu->task.owner != current))
+ {
+ send_sig(SIGUSR1, vcpu->task.owner, 0);
+ }
+
+ /* Restore the lock to its previous value */
+ _KVM_MPL_UNLOCK(&vcpu->mpl, lockval);
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_interrupt);
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ return kvm_vcpu_interrupt(vcpu, irq);
+}
static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
struct kvm_debug_guest *dbg)
@@ -1954,10 +1984,10 @@ static int kvm_vm_ioctl_create_vcpu(stru
vcpu = &kvm->vcpus[n];
- mutex_lock(&vcpu->mutex);
+ KVM_MPL_LOCK(&vcpu->mpl);
if (vcpu->vmcs) {
- mutex_unlock(&vcpu->mutex);
+ KVM_MPL_UNLOCK(&vcpu->mpl);
return -EEXIST;
}
@@ -1990,7 +2020,7 @@ static int kvm_vm_ioctl_create_vcpu(stru
out_free_vcpus:
kvm_free_vcpu(vcpu);
- mutex_unlock(&vcpu->mutex);
+ KVM_MPL_UNLOCK(&vcpu->mpl);
out:
return r;
}
@@ -2345,12 +2375,12 @@ static void decache_vcpus_on_cpu(int cpu
* If it's not locked, check the last cpu it executed
* on.
*/
- if (mutex_trylock(&vcpu->mutex)) {
+ if (KVM_MPL_TRYLOCK(&vcpu->mpl)) {
if (vcpu->cpu == cpu) {
kvm_arch_ops->vcpu_decache(vcpu);
vcpu->cpu = -1;
}
- mutex_unlock(&vcpu->mutex);
+ KVM_MPL_UNLOCK(&vcpu->mpl);
}
}
spin_unlock(&kvm_lock);
Index: kvm-12/kernel/vmx.c
===================================================================
--- kvm-12.orig/kernel/vmx.c
+++ kvm-12/kernel/vmx.c
@@ -1769,6 +1769,9 @@ again:
save_msrs(vcpu->host_msrs, vcpu->nmsrs);
load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
+ vcpu->task.running = 1;
+ _KVM_MPL_UNLOCK(&vcpu->mpl, KVM_MPL_VMCS); /* Downgrade the lock */
+
asm (
/* Store host registers */
"pushf \n\t"
@@ -1888,6 +1891,9 @@ again:
[cr2]"i"(offsetof(struct kvm_vcpu, cr2))
: "cc", "memory" );
+ _KVM_MPL_LOCK(&vcpu->mpl, KVM_MPL_VMCS, KVM_MPL_FULL); /* Reaquire */
+ vcpu->task.running = 0;
+
++kvm_stat.exits;
save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
Index: kvm-12/kernel/mpl.h
===================================================================
--- /dev/null
+++ kvm-12/kernel/mpl.h
@@ -0,0 +1,147 @@
+/* Multi-Phase Lock
+ *
+ * This is a construct which acts as a mutex with multiple levels. The first
+ * level (phase = 0) represents a "free" mutex that is available for
+ * consumption. Any other level (phase > 0) represents that one or more locks
+ * are being held. Each phase represents effectively a new lock, where higher
+ * values offer finer grained locks, and lower values are coarser grained.
+ *
+ * This same construct could be represented with a unique mutex for each phase.
+ * However, in this model the burden of deadlock avoidance is placed on the
+ * developer. With MPL, deadlock is avoided because the locking order is
+ * enforced by the construct as we move from coarser to finer grained locks.
+ * E.g. you always must acquire lock 1 before 2, 2 before 3, etc. This is
+ * enforced because locks are acquired via a range expression (e.g. 0:2 acquires
+ * 0, 1, and 2, and in proper order)
+ *
+ * In the context of KVM, this is used to guard the VCPU structure. Level 1
+ * is the VMCS structure itself, and Level 2 is the entire VCPU + VMCS. Level
+ * 2 represents what was previously considered vcpu->mutex. This lock is taken
+ * whenever broad changes need to be made to the vcpu structure. However,
+ * in the previous model, this lock was also held for the duration of a guests
+ * execution. Going forward, we want to support the notion of guest-preemption
+ * and this requires finer granularity. So the new model has the RUN code
+ * demote the MPL lock to VMCS mode prior to performing the VMENTER. This
+ * allows code which only needs to modify the VCPU (but not the VMCS) to
+ * acquire the partial lock. This is exploited to update pending interrupt
+ * status while a VMCS is executing.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Author: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+*/
+
+#ifndef __MPL_H
+#define __MPL_H
+
+#include <linux/wait.h>
+
+/* Condition Variable */
+typedef struct condvar {
+ wait_queue_head_t queue;
+}condvar_t;
+
+static inline void condvar_init(condvar_t *cv)
+{
+ wait_queue_head_t __head = __WAIT_QUEUE_HEAD_INITIALIZER(cv->queue);
+ cv->queue = __head;
+}
+
+static inline int condvar_wait(condvar_t *cv, struct mutex *m)
+{
+ DEFINE_WAIT(__wait);
+ int _ret = 0;
+
+ /* first place ourselves on the waitqueue before releasing the lock */
+ prepare_to_wait(&cv->queue, &__wait, TASK_UNINTERRUPTIBLE);
+
+ /* now actually release the lock to unblock any potential signalers */
+ mutex_unlock(m);
+
+ /* finally, reschedule until we are re-awoken */
+ schedule();
+ finish_wait(&cv->queue, &__wait);
+
+ /* if we get here, its because someone signaled us. reaquire the lock */
+ mutex_lock(m);
+
+ return _ret;
+}
+
+static inline int condvar_signal(condvar_t *cv)
+{
+ wake_up(&cv->queue);
+ return 0;
+}
+
+typedef struct mpl
+{
+ struct mutex mutex;
+ int phase;
+ condvar_t cv;
+ }mpl_t;
+
+static inline void mpl_init(mpl_t *lock)
+{
+ mutex_init(&lock->mutex);
+ lock->phase = 0;
+ condvar_init(&lock->cv);
+}
+
+static inline int mpl_lock(mpl_t *lock, int min, int max)
+{
+ int _ret;
+
+ mutex_lock(&lock->mutex);
+
+ /* Wait for the phase to reach the low water */
+ while(lock->phase > min)
+ condvar_wait(&lock->cv, &lock->mutex);
+
+ BUG_ON(lock->phase > min);
+
+ _ret = lock->phase;
+ lock->phase = max;
+
+ mutex_unlock(&lock->mutex);
+
+ return _ret;
+}
+
+static inline int mpl_trylock(mpl_t *lock, int min, int max)
+{
+ int _ret = -EAGAIN;
+
+ mutex_lock(&lock->mutex);
+
+ if(lock->phase > min)
+ goto out;
+
+ BUG_ON(lock->phase > min);
+
+ _ret = lock->phase;
+ lock->phase = max;
+
+ out:
+ mutex_unlock(&lock->mutex);
+
+ return _ret;
+}
+
+static inline int mpl_unlock(mpl_t *lock, int restore)
+{
+ mutex_lock(&lock->mutex);
+
+ lock->phase = restore;
+ condvar_signal(&lock->cv);
+
+ mutex_unlock(&lock->mutex);
+
+ return 0;
+}
+
+#endif /* __MPL_H */
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45F97019.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
@ 2007-03-16 7:03 ` Avi Kivity
[not found] ` <45FA414C.1040703-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2007-03-16 7:03 UTC (permalink / raw)
To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Gregory Haskins wrote:
> This patch adds baseline support for interrupt preemption. This lets one context (virtual SMP vcpu, PV driver, etc) preempt another vcpu which is currently in guest context when an interrupt is posted. The patch consists of the following:
>
> 1) Re-worked the vcpu mutex with a new "MPL" construct. This allows updates to the vcpu structure even if the vcpu is currently executing in guest mode. The previous code held a mutex during guest execution.
>
>
Whoa there. kvm can't just add new locking constructs to the kernel.
It has to be added to the kernel _first_, you need to justify it with
more than just kvm as a user, show correctness, performance, and
scalability.
Once it's accepted, kvm can use it.
Formal issues aside, why is this different from taking nested locks?
The paravirt network code congealing in Dor's repo has a spinlock
protecting the interrupt bits. The main execution path takes both the
vcpu mutex and the irq lock (when necessary), other paths take just the
irq lock. This has the added advantage of not requiring a mutex to
inject an interrupt, which is often necessary from (host) irq context.
> 2) Exposed the previously static kvm_vcpu_ioctl_interrupt() as kvm_vcpu_interrupt so that other mechanisms (PV drivers) can issue interrupts just as the ioctl interface can
>
It's not enough to issue an interrupt, there is a whole dance involved
in the guest side to ack it. You need to go through the apic, which is
currently emulated in userspace. We may push it to the kernel later.
>
> Index: kvm-12/kernel/kvm.h
>
Please base things off trunk. For kernel code, off the git repository,
not the bundled kernel module (which is mangled by the release process
in order to accommodate older kernels).
>
> +typedef enum
> +{
> + KVM_MPL_FREE,
> + KVM_MPL_VMCS,
> + KVM_MPL_VCPU,
> + KVM_MPL_FULL = KVM_MPL_VCPU /* Must be equal to the last entry */
> +}kvm_mpl_locktypes_t;
>
Either you or your mailer mangle whitespace horribly.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45FA414C.1040703-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-03-16 12:17 ` Gregory Haskins
[not found] ` <45FA4489.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: Gregory Haskins @ 2007-03-16 12:17 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
>>> On Fri, Mar 16, 2007 at 3:03 AM, in message <45FA414C.1040703-atKUWr5tajBWk0Htik3J/w@public.gmane.org>,
Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
> Gregory Haskins wrote:
>> This patch adds baseline support for interrupt preemption. This lets one
> context (virtual SMP vcpu, PV driver, etc) preempt another vcpu which is
> currently in guest context when an interrupt is posted. The patch consists
> of the following:
>>
>> 1) Re- worked the vcpu mutex with a new "MPL" construct. This allows updates
> to the vcpu structure even if the vcpu is currently executing in guest mode.
> The previous code held a mutex during guest execution.
>>
>>
>
> Whoa there. kvm can't just add new locking constructs to the kernel.
With all due respect, why not? ;) Of course it can. We aren't really adding anything exposed to the kernel outside of KVM or even adding a new one from scratch. It's simply derivative from the existing mutex/semaphore that we already use. As a philosophical point, surely not every construct we utilize in KVM has to be trickled down from the upstream kernel, or KVM itself would not exist ;)
> It has to be added to the kernel _first_, you need to justify it with
> more than just kvm as a user, show correctness, performance, and
> scalability.
>
> Once it's accepted, kvm can use it.
I understand if what I said may not change your argument against proving the construct against a larger audience, but I thought I would point it out that its really just a struct+mutex in case you only read the comment and not the code. I think it just sounds worse than it is.
>
> Formal issues aside, why is this different from taking nested locks?
This essentially *is* a set of nested locks, except that it also automatically protects against deadlock. Alternatively I could have written the code to simply have replaced the original vcpu->mutex with something like vcpu->vcms_mutex + vcpu->vcpu_mutex, and then replaced all the calls to mutex_lock(&vcpu->mutex) with a macro that grabs both (in proper order). However, I felt as though my solution was better for two reasons:
1) Its impossible to bungle the lock-ordering to induce deadlock, regardless of nesting depth
2) Its non-contended acquisition overhead is O(1) regardless of nesting depth, whereas true nested locks are O(n) (where n = depth)
In my experience, nested locks work ok if there is only one developer on the project. When you get project newbies (like me w.r.t. KVM ;) ) hacking code in after the fact, bad things can happen pretty easily. I thought this might perhaps be a good way to prevent that. If it turns out you guys dont like it, its no big deal to rip it out and go the more traditional route. I wanted your feedback. Thats why I submitted this now before I had all the issues worked out. ;)
> The paravirt network code congealing in Dor's repo has a spinlock
> protecting the interrupt bits. The main execution path takes both the
> vcpu mutex and the irq lock (when necessary), other paths take just the
> irq lock. This has the added advantage of not requiring a mutex to
> inject an interrupt, which is often necessary from (host) irq context.
>
Keep in mind that my primary intention was to fix the kvm_vcpu_(ioctl)_interrupt() function to use finer grained locking than the previous vcpu->mutex that it used to use. Because the old lock was a mutex, the new nested lock was also based on a mutex. So its true that its not interrupt friendly, but it wasn't to begin with. Whether it actually needs to be is something that I do not yet know (see my comments below).
>
>> 2) Exposed the previously static kvm_vcpu_ioctl_interrupt() as
> kvm_vcpu_interrupt so that other mechanisms (PV drivers) can issue interrupts
> just as the ioctl interface can
>>
>
> It's not enough to issue an interrupt, there is a whole dance involved
> in the guest side to ack it. You need to go through the apic, which is
> currently emulated in userspace. We may push it to the kernel later.
I added this interface as a stab at accommodating your request for PV driver support without fully understanding your requirements. Based on your comments, I assume that the code that invokes the INTERRUPT ioctl must have already updated the APIC model? I will revert this change to make it a static ioctl function again until I can understand how the PV drivers can update the apic (userspace or kernel, whichever it ends up being).
>
>>
>> Index: kvm- 12/kernel/kvm.h
>>
>
> Please base things off trunk. For kernel code, off the git repository,
> not the bundled kernel module (which is mangled by the release process
> in order to accommodate older kernels).
Note that it actually is based off of (approx) trunk, but my quilt series starts with the kvm-12 tarball and thus the directory name. I think my series was patched up to r4524. You comment about git vs tarball is legitimate. I will have to refactor my workflow to utilize git instead of the tarball somehow.
>
>>
>> +typedef enum
>> +{
>> + KVM_MPL_FREE,
>> + KVM_MPL_VMCS,
>> + KVM_MPL_VCPU,
>> + KVM_MPL_FULL = KVM_MPL_VCPU /* Must be equal to the last entry */
>> +}kvm_mpl_locktypes_t;
>>
>
> Either you or your mailer mangle whitespace horribly.
Its probably a combination of both. Do you guys just use the standard "linux/linus" settings for emacs (or equivalent). Sorry about that. I recently rebuilt my devel machine and don't have my .emacs brought over yet. I will fix this.
-Greg
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45FA4489.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
@ 2007-03-18 5:46 ` Avi Kivity
[not found] ` <45FCD226.2010603-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2007-03-18 5:46 UTC (permalink / raw)
To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Gregory Haskins wrote:
>> Whoa there. kvm can't just add new locking constructs to the kernel.
>>
>
> With all due respect, why not? ;) Of course it can. We arent really adding anything exposed to the kernel outside of KVM or even adding a new one from scratch. Its simply derivative from the existing mutex/semphore that we already use.
Still it's not good practice IMO. If the locking construct is useful,
the entire kernel should benefit. If it isn't, we don't want it. And
we want all the locking gurus to have a look, not just one.
> As a philosophical point, surely not every construct we utilize in KVM has to be trickled down from the upstream kernel, or KVM itself would not exist ;)
>
>
We try not to introduce anything new, unless it is directly related to
virtualization. We want to make our code as boring as possible (but not
boringer :)
That's important in not raising the barrier of entry above the minimum
possible.
>> It has to be added to the kernel _first_, you need to justify it with
>> more than just kvm as a user, show correctness, performance, and
>> scalability.
>>
>> Once it's accepted, kvm can use it.
>>
>
> I understand if what I said may not change your argument against proving the construct against a larger audience, but I thought I would point it out that its really just a struct+mutex in case you only read the comment and not the code. I think it just sounds worse than it is.
>
This will happen to anyone reading the code. People are used to seeing
nested locks. If they see something new, they have to drop whatever
exciting feature they were developing and go understand it.
>
>> Formal issues aside, why is this different from taking nested locks?
>>
>
> This essentially *is* a set of nested locks, except that it also automatically protects against deadlock. Alternatively I could have written the code to simply have replaced the original vcpu->mutex with something like vcpu->vcms_mutex + vcpu->vcpu_mutex, and then replaced all the calls to mutex_lock(&vcpu->mutex) with a macro that grabs both (in proper order).
Won't that lock out the interrupt user? We want only access to the vcpu
to be blocked during guest execution, except for the interrupt area
(which is just mutually excluded) and for the mmu area (likewise).
> However, I felt as though my solution was better for two reasons:
>
> 1) Its impossible to bungle the lock-ordering to induce deadlock, regardless of nesting depth
> 2) Its non-contended acquisition overhead is O(1) regardless of nesting depth, whereas true nested locks are O(n) (where n = depth)
>
First I need to understand how it fits.
> In my experience, nested locks work ok if there is only one developer on the project. When you get project newbies (like me w.r.t. KVM ;) ) hacking code in after the fact, bad things can happen pretty easily.
Right, we need to document the mess.
> I thought this might perhaps be a good way to prevent that. If it turns out you guys dont like it, its no big deal to rip it out and go the more traditional route. I wanted your feedback. Thats why I submitted this now before I had all the issues worked out. ;)
>
Sure, early & often.
> Keep in mind that my primary intention was to fix the kvm_vcpu_(ioctl)_interrupt() function to use finer grained locking than the previous vcpu->mutex that it used to use. Because the old lock was a mutex, the new nested lock was also based on a mutex. So its true that its not interrupt friendly, but it wasn't to begin with. Whether it actually needs to be is something that I do not yet know (see my comments below).
>
>
It's needed all right.
>> It's not enough to issue an interrupt, there is a whole dance involved
>> in the guest side to ack it. You need to go through the apic, which is
>> currently emulated in userspace. We may push it to the kernel later.
>>
>
> I added this interface as a stab at accommodating your request for PV driver support without fully understanding your requirements. Based on your comments, I assume that the code that invokes the INTERRUPT ioctl must have already updated the APIC model?
>
Yes.
> Its probably a combination of both. Do you guys just use the standard "linux/linus" settings for emacs (or equivalent).
>
I use the script suggested in Documentation/CodingStyle, plus the
show-trailing-whitespace customization variable, which saves me from
committing the sin of leaving trailing whitespace every now and then.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45FCD226.2010603-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-03-19 13:54 ` Gregory Haskins
[not found] ` <45FE4FB1.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: Gregory Haskins @ 2007-03-19 13:54 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Hi Avi,
You make good points. I will convert to a nested lock design and resubmit. Should I use two mutexes, or a mutex and spinlock?
Also, do you have any suggestions on the signum I should use to IPI the running guest? Should I use one of the normal signals (SIGUSR) or should I start a block of defined signals in the RT range (>32)?
-Greg
>>> On Sun, Mar 18, 2007 at 1:46 AM, in message <45FCD226.2010603-atKUWr5tajBWk0Htik3J/w@public.gmane.org>,
Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
> Gregory Haskins wrote:
>
>
>
>>> Whoa there. kvm can't just add new locking constructs to the kernel.
>>>
>>
>> With all due respect, why not? ;) Of course it can. We arent really adding
> anything exposed to the kernel outside of KVM or even adding a new one from
> scratch. Its simply derivative from the existing mutex/semphore that we
> already use.
>
> Still it's not good practice IMO. If the locking construct is useful,
> the entire kernel should benefit. If it isn't, we don't want it. And
> we want all the locking gurus to have a look, not just one.
>
>> As a philosophical point, surely not ever construct we utilize in KVM has to
> be trickled down from the upstream kernel, or KVM itself would not exist ;)
>>
>>
>
> We try not to introduce anything new, unless it is directly related to
> virtualization. We want to make our code as boring as possible (but not
> boringer :)
>
> That's important in not raising the barrier of entry above the minimum
> possible.
>
>>> It has to be added to the kernel _first_, you need to justify it with
>>> more than just kvm as a user, show correctness, performance, and
>>> scalability.
>>>
>>> Once it's accepted, kvm can use it.
>>>
>>
>> I understand if what I said may not change your argument against proving the
> construct against a larger audience, but I thought I would point it out that
> its really just a struct+mutex in case you only read the comment and not the
> code. I think it just sounds worse than it is.
>>
>
> This will happen to anyone reading the code. People are used to seeing
> nested locks. If they see something new, they have to drop whatever
> exciting feature they were developing and go understand it.
>
>
>>
>>> Formal issues aside, why is this different from taking nested locks?
>>>
>>
>> This essentially *is* a set of nested locks, except that it also
> automatically protects against deadlock. Alternatively I could have written
> the code to simply have replaced the original vcpu- >mutex with something like
> vcpu- >vcms_mutex + vcpu- >vcpu_mutex, and then replaced all the calls to
> mutex_lock(&vcpu- >mutex) with a macro that grabs both (in proper order).
>
> Won't that lock out the interrupt user? We want only access to the vcpu
> to be blocked during guest execution, except for the interrupt area
> (which is just mutually excluded) and for the mmu area (likewise).
>
>> However, I felt as though my solution was better for two reasons:
>>
>> 1) Its impossible to bungle the lock- ordering to induce deadlock, regardless
> of nesting depth
>> 2) Its non- contended acquisition overhead is O(1) regardless of nesting
> depth, whereas true nested locks are O(n) (where n = depth)
>>
>
> First I need to understand how it fits.
>
>> In my experience, nested locks work ok if there is only one developer on the
> project. When you get project newbies (like me w.r.t. KVM ;) ) hacking code
> in after the fact, bad things can happen pretty easily.
>
> Right, we need to document the mess.
>
>> I thought this might perhaps be a good way to prevent that. If it turns out
> you guys dont like it, its no big deal to rip it out and go the more
> traditional route. I wanted your feedback. Thats why I submitted this now
> before I had all the issues worked out. ;)
>>
>
> Sure, early & often.
>
>> Keep in mind that my primary intention was to fix the
> kvm_vcpu_(ioctl)_interrupt() function to use finer grained locking than the
> previous vcpu- >mutex that it used to use. Because the old lock was a mutex,
> the new nested lock was also based on a mutex. So its true that its not
> interrupt friendly, but it wasn't to begin with. Whether it actually needs
> to be is something that I do not yet know (see my comments below).
>>
>>
>
> It's needed all right.
>
>>> It's not enough to issue an interrupt, there is a whole dance involved
>>> in the guest side to ack it. You need to go through the apic, which is
>>> currently emulated in userspace. We may push it to the kernel later.
>>>
>>
>> I added this interface as a stab at accommodating your request for PV driver
> support without fully understanding your requirements. Based on your
> comments, I assume that the code that invokes the INTERRUPT ioctl must have
> already updated the APIC model?
>>
>
> Yes.
>
>> Its probably a combination of both. Do you guys just use the standard
> "linux/linus" settings for emacs (or equivalent).
>>
>
> I use the script suggested in Documentation/CodingStyle, plus the
> show- trailing- whitespace customization variable, which saves me from
> committing the sin of leaving trailing whitespace every now and then.
>
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45FE4FB1.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
@ 2007-03-19 14:00 ` Avi Kivity
[not found] ` <45FE9793.3060204-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2007-03-19 14:00 UTC (permalink / raw)
To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Gregory Haskins wrote:
> Hi Avi,
> You make good points. I will convert to a nest lock design and resubmit. Should I use two mutexes, or a mutex and spinlock?
>
> Also, do you have any suggestions on the signum I should use to IPI the running guest? Should I use one of the normal signals (SIGUSR) or should I start a block of defined signals in the RT range (>32)?
>
For a short term solution, where the apic is in userspace, we can just
say ipi == signal, and not require any locking. Qemu will catch the
signal and call the appropriate apic function. The signal number should
be set from userspace.
Longer term, with the apic in the kernel, we'll need to protect the
apic/interrupt data structures with a spinlock, and use kick_process()
or similar instead of a signal.
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45FE9793.3060204-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-03-19 14:03 ` Avi Kivity
[not found] ` <45FE9827.5030200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2007-03-19 14:03 UTC (permalink / raw)
To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Avi Kivity wrote:
> Gregory Haskins wrote:
>> Hi Avi,
>> You make good points. I will convert to a nest lock design and
>> resubmit. Should I use two mutexes, or a mutex and spinlock?
>>
>> Also, do you have any suggestions on the signum I should use to IPI
>> the running guest? Should I use one of the normal signals (SIGUSR)
>> or should I start a block of defined signals in the RT range (>32)?
>>
>
> For a short term solution, where the apic is in userspace, we can just
> say ipi == signal, and not require any locking. Qemu will catch the
> signal and call the appropriate apic function. The signal number
> should be set from userspace.
>
Note that as long as the apic code is in userspace, the sending side is
also in userspace, so all the IPI related stuff doesn't touch the kernel.
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45FE9827.5030200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-03-19 14:09 ` Gregory Haskins
[not found] ` <45FE5346.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-03-28 22:05 ` Dor Laor
1 sibling, 1 reply; 10+ messages in thread
From: Gregory Haskins @ 2007-03-19 14:09 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
>>> On Mon, Mar 19, 2007 at 10:03 AM, in message <45FE9827.5030200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>,
Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
> Avi Kivity wrote:
>> Gregory Haskins wrote:
>>> Hi Avi,
>>> You make good points. I will convert to a nest lock design and
>>> resubmit. Should I use two mutexes, or a mutex and spinlock?
>>>
>>> Also, do you have any suggestions on the signum I should use to IPI
>>> the running guest? Should I use one of the normal signals (SIGUSR)
>>> or should I start a block of defined signals in the RT range (>32)?
>>>
>>
>> For a short term solution, where the apic is in userspace, we can just
>> say ipi == signal, and not require any locking. Qemu will catch the
>> signal and call the appropriate apic function. The signal number
>> should be set from userspace.
>>
>
> Note that as long as the apic code is in userspace, the sending side is
> also in userspace, so all the IPI related stuff doesn't touch the kernel.
I see. So really the entire approach I took (against the kernel code) is wrong, and I should focus on the QEMU side?
-Greg
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45FE5346.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
@ 2007-03-19 14:21 ` Avi Kivity
0 siblings, 0 replies; 10+ messages in thread
From: Avi Kivity @ 2007-03-19 14:21 UTC (permalink / raw)
To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Gregory Haskins wrote:
>> Note that as long as the apic code is in userspace, the sending side is
>> also in userspace, so all the IPI related stuff doesn't touch the kernel.
>>
>
>
> I see. So really the entire approach I took (against the kernel code) is wrong, and I should focus on the QEMU side?
>
Well, eventually we do want to have the apic in the kernel, and we do
want inexpensive inter-vcpu (and driver->vcpu) interrupts. But it seems
that the simplest way to get things going is in qemu.
Note that it's not enough; there's the mmu to harden for smp and basic
support for creating > 1 vcpus.
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] interrupt preemption support
[not found] ` <45FE9827.5030200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-03-19 14:09 ` Gregory Haskins
@ 2007-03-28 22:05 ` Dor Laor
1 sibling, 0 replies; 10+ messages in thread
From: Dor Laor @ 2007-03-28 22:05 UTC (permalink / raw)
To: Avi Kivity, Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
>Avi Kivity wrote:
>> Gregory Haskins wrote:
>>> Hi Avi,
>>> You make good points. I will convert to a nest lock design and
>>> resubmit. Should I use two mutexes, or a mutex and spinlock?
>>>
>>> Also, do you have any suggestions on the signum I should use to IPI
>>> the running guest? Should I use one of the normal signals (SIGUSR)
>>> or should I start a block of defined signals in the RT range (>32)?
>>>
>>
>> For a short term solution, where the apic is in userspace, we can
just
>> say ipi == signal, and not require any locking. Qemu will catch the
>> signal and call the appropriate apic function. The signal number
>> should be set from userspace.
>>
>
>Note that as long as the apic code is in userspace, the sending side is
>also in userspace, so all the IPI related stuff doesn't touch the
kernel.
You can look at a working copy for the PV network code in my git tree
git://kvm.qumranet.com/home/dor/src/linux-2.6, pick the
pv-network-driver
And the matching paravirt-network svn branch.
It's working but far from perfect ;)
>
>--
>error compiling committee.c: too many arguments to function
>
>
>-----------------------------------------------------------------------
--
>Take Surveys. Earn Cash. Influence the Future of IT
>Join SourceForge.net's Techsay panel and you'll get the chance to share
>your
>opinions on IT & business topics through brief surveys-and earn cash
>http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVD
EV
>_______________________________________________
>kvm-devel mailing list
>kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
>https://lists.sourceforge.net/lists/listinfo/kvm-devel
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2007-03-28 22:05 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-15 21:10 [PATCH] interrupt preemption support Gregory Haskins
[not found] ` <45F97019.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-03-16 7:03 ` Avi Kivity
[not found] ` <45FA414C.1040703-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-03-16 12:17 ` Gregory Haskins
[not found] ` <45FA4489.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-03-18 5:46 ` Avi Kivity
[not found] ` <45FCD226.2010603-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-03-19 13:54 ` Gregory Haskins
[not found] ` <45FE4FB1.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-03-19 14:00 ` Avi Kivity
[not found] ` <45FE9793.3060204-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-03-19 14:03 ` Avi Kivity
[not found] ` <45FE9827.5030200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-03-19 14:09 ` Gregory Haskins
[not found] ` <45FE5346.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-03-19 14:21 ` Avi Kivity
2007-03-28 22:05 ` Dor Laor
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox