public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH][RFC] kvm-scheduler integration
@ 2007-07-08 12:58 Avi Kivity
  2007-07-08 13:09 ` Ingo Molnar
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Avi Kivity @ 2007-07-08 12:58 UTC (permalink / raw)
  To: kvm-devel; +Cc: Ingo Molnar, linux-kernel, shaohua.li, Avi Kivity

Intel VT essentially introduces a new set of registers into the processor;
this means we cannot preempt kvm in kernel mode lest a new VM run with
an old VM's registers.  In addition, kvm lazily switches some host registers
as well. (AMD does not introduce new registers, but we still want lazy
msr switching, and we want to know when we move to a different cpu in order
to be able to guarantee a monotonically increasing tsc).

Current kvm code simply disables preemption when guest context is in use.
This, however, has many drawbacks:

- some kvm mmu code is O(n), causing possibly unbounded latencies and causing
  -rt great unhappiness.
- the mmu code wants to sleep (especially with guest paging), but can't.
- some optimizations are not possible; for example, if we switch from one
  VM to another, we need not restore some host registers (as they will simply
  be overwritten with the new guest registers immediately).

This patch adds hooks to the scheduler that allow kvm to be notified about
scheduling decisions involving virtual machines.  When we schedule out a VM,
kvm is told to swap guest registers out; when we schedule the VM in, we swap
the registers back in.

Note that this patch does not include optimizations; it just makes most
kvm code preemptible.  A follow-on patch to convert the kvm spinlock to a
mutex should be trivial.

The only fly in the ointment is that it crashes quite soon.  Haven't figured
out why yet, but comments on the general direction would be welcome.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
index 33fa28a..b6aadc8 100644
--- a/drivers/kvm/Kconfig
+++ b/drivers/kvm/Kconfig
@@ -40,4 +40,9 @@ config KVM_AMD
 	  Provides support for KVM on AMD processors equipped with the AMD-V
 	  (SVM) extensions.
 
+config SCHED_KVM
+	bool
+	default y
+	depends on KVM
+
 endif # VIRTUALIZATION
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index a7c5e6b..bb97dcc 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -474,7 +474,7 @@ struct kvm_arch_ops {
 	int (*vcpu_create)(struct kvm_vcpu *vcpu);
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
 
-	void (*vcpu_load)(struct kvm_vcpu *vcpu);
+	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 	void (*vcpu_decache)(struct kvm_vcpu *vcpu);
 
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index ea02719..eb742d4 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -230,8 +230,13 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
  */
 static void vcpu_load(struct kvm_vcpu *vcpu)
 {
+	int cpu;
+
 	mutex_lock(&vcpu->mutex);
-	kvm_arch_ops->vcpu_load(vcpu);
+	cpu = get_cpu();
+	sched_load_kvm_state(vcpu);
+	kvm_arch_ops->vcpu_load(vcpu, cpu);
+	put_cpu();
 }
 
 /*
@@ -241,19 +246,26 @@ static void vcpu_load(struct kvm_vcpu *vcpu)
 static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot)
 {
 	struct kvm_vcpu *vcpu = &kvm->vcpus[slot];
+	int cpu;
 
 	mutex_lock(&vcpu->mutex);
 	if (!vcpu->vmcs) {
 		mutex_unlock(&vcpu->mutex);
 		return NULL;
 	}
-	kvm_arch_ops->vcpu_load(vcpu);
+	cpu = get_cpu();
+	sched_load_kvm_state(vcpu);
+	kvm_arch_ops->vcpu_load(vcpu, cpu);
+	put_cpu();
 	return vcpu;
 }
 
 static void vcpu_put(struct kvm_vcpu *vcpu)
 {
+	preempt_disable();
 	kvm_arch_ops->vcpu_put(vcpu);
+	sched_put_kvm_state();
+	preempt_enable();
 	mutex_unlock(&vcpu->mutex);
 }
 
@@ -1650,9 +1662,7 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 {
 	if (!need_resched())
 		return;
-	vcpu_put(vcpu);
 	cond_resched();
-	vcpu_load(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_resched);
 
@@ -1718,11 +1728,9 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
 	unsigned bytes;
 	int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
 
-	kvm_arch_ops->vcpu_put(vcpu);
 	q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
 		 PAGE_KERNEL);
 	if (!q) {
-		kvm_arch_ops->vcpu_load(vcpu);
 		free_pio_guest_pages(vcpu);
 		return -ENOMEM;
 	}
@@ -1734,7 +1742,6 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
 		memcpy(p, q, bytes);
 	q -= vcpu->pio.guest_page_offset;
 	vunmap(q);
-	kvm_arch_ops->vcpu_load(vcpu);
 	free_pio_guest_pages(vcpu);
 	return 0;
 }
@@ -2375,6 +2382,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 	int r;
 	struct kvm_vcpu *vcpu;
 	struct page *page;
+	int cpu;
 
 	r = -EINVAL;
 	if (!valid_vcpu(n))
@@ -2414,7 +2422,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 	if (r < 0)
 		goto out_free_vcpus;
 
-	kvm_arch_ops->vcpu_load(vcpu);
+	cpu = get_cpu();
+	sched_load_kvm_state(vcpu);
+	kvm_arch_ops->vcpu_load(vcpu, cpu);
+	put_cpu();
 	r = kvm_mmu_setup(vcpu);
 	if (r >= 0)
 		r = kvm_arch_ops->vcpu_setup(vcpu);
@@ -3115,6 +3126,7 @@ hpa_t bad_page_address;
 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
 {
 	int r;
+	struct sched_kvm_hooks sched_hooks;
 
 	if (kvm_arch_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -3158,6 +3170,10 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
 		goto out_free;
 	}
 
+	sched_hooks.vcpu_load = kvm_arch_ops->vcpu_load;
+	sched_hooks.vcpu_put = kvm_arch_ops->vcpu_put;
+	sched_set_kvm_hooks(&sched_hooks);
+
 	return r;
 
 out_free:
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index b297a6b..f60aa87 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -254,9 +254,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 	r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
 	if (r < 0) {
 		spin_unlock(&vcpu->kvm->lock);
-		kvm_arch_ops->vcpu_put(vcpu);
 		r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
-		kvm_arch_ops->vcpu_load(vcpu);
 		spin_lock(&vcpu->kvm->lock);
 	}
 	return r;
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index bc818cc..22ae4a1 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -615,7 +615,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu)
 {
 	int cpu, i;
 
-	cpu = get_cpu();
+	cpu = sched_load_kvm_state(vcpu);
 	if (unlikely(cpu != vcpu->cpu)) {
 		u64 tsc_this, delta;
 
@@ -641,7 +641,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 		wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]);
 
 	rdtscll(vcpu->host_tsc);
-	put_cpu();
+	sched_put_kvm_state();
 }
 
 static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 80628f6..c72d583 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -179,6 +179,7 @@ static unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
 
+	WARN_ON(raw_smp_processor_id() != current->kvm_vcpu->cpu);
 	asm volatile (ASM_VMX_VMREAD_RDX_RAX
 		      : "=a"(value) : "d"(field) : "cc");
 	return value;
@@ -214,6 +215,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
 {
 	u8 error;
 
+	WARN_ON(raw_smp_processor_id() != current->kvm_vcpu->cpu);
 	asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
 		       : "=q"(error) : "a"(value), "d"(field) : "cc" );
 	if (unlikely(error))
@@ -366,14 +369,12 @@ static void vmx_load_host_state(struct kvm_vcpu *vcpu)
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
  */
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	u64 phys_addr = __pa(vcpu->vmcs);
-	int cpu;
 	u64 tsc_this, delta;
 
-	cpu = get_cpu();
-
+	WARN_ON(!preempt_count());
 	if (vcpu->cpu != cpu)
 		vcpu_clear(vcpu);
 
@@ -426,9 +432,9 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
 
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	WARN_ON(!preempt_count());
 	vmx_load_host_state(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	put_cpu();
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -2000,6 +2008,8 @@ preempted:
 		kvm_guest_debug_pre(vcpu);
 
 again:
+	preempt_disable();
+
 	if (!vcpu->mmio_read_completed)
 		do_interrupt_requests(vcpu, kvm_run);
 
@@ -2146,6 +2156,9 @@ again:
 	vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
 
 	asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+	vcpu->launched = 1;
+
+	preempt_enable();
 
 	if (unlikely(fail)) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2160,7 +2174,6 @@ again:
 	if (unlikely(prof_on == KVM_PROFILING))
 		profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
 
-	vcpu->launched = 1;
 	r = kvm_handle_exit(kvm_run, vcpu);
 	if (r > 0) {
 		/* Give scheduler a change to reschedule. */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 693f0e6..b705876 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,10 @@ struct task_struct {
 	pid_t pid;
 	pid_t tgid;
 
+#ifdef CONFIG_SCHED_KVM
+	struct kvm_vcpu *kvm_vcpu;
+#endif
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 	/* Canary value for the -fstack-protector gcc feature */
 	unsigned long stack_canary;
@@ -1369,6 +1373,18 @@ extern int send_group_sigqueue(int, struct sigqueue *,  struct task_struct *);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
 
+#ifdef CONFIG_SCHED_KVM
+
+struct sched_kvm_hooks {
+	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+	void (*vcpu_put)(struct kvm_vcpu *vcpu);
+};
+
+int sched_load_kvm_state(struct kvm_vcpu *vcpu);
+void sched_put_kvm_state(void);
+void sched_set_kvm_hooks(struct sched_kvm_hooks *hooks);
+#endif
+
 static inline int kill_cad_pid(int sig, int priv)
 {
 	return kill_pid(cad_pid, sig, priv);
diff --git a/kernel/sched.c b/kernel/sched.c
index 13cdab3..beb3c45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -57,6 +57,10 @@
 #include <asm/tlb.h>
 #include <asm/unistd.h>
 
+#ifdef CONFIG_SCHED_KVM
+static __read_mostly struct sched_kvm_hooks kvm_hooks;
+#endif
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  * This is default implementation.
@@ -1805,6 +1809,51 @@ void fastcall sched_exit(struct task_struct *p)
 	task_rq_unlock(rq, &flags);
 }
 
+#ifdef CONFIG_SCHED_KVM
+
+int sched_load_kvm_state(struct kvm_vcpu *vcpu)
+{
+	current->kvm_vcpu = vcpu;
+	return raw_smp_processor_id();
+}
+EXPORT_SYMBOL_GPL(sched_load_kvm_state);
+
+void sched_put_kvm_state()
+{
+	current->kvm_vcpu = NULL;
+}
+EXPORT_SYMBOL_GPL(sched_put_kvm_state);
+
+void sched_set_kvm_hooks(struct sched_kvm_hooks *hooks)
+{
+	kvm_hooks = *hooks;
+}
+EXPORT_SYMBOL_GPL(sched_set_kvm_hooks);
+
+static void unload_kvm_vcpu(struct task_struct *tsk)
+{
+	if (tsk->kvm_vcpu)
+		kvm_hooks.vcpu_put(tsk->kvm_vcpu);
+}
+
+static void reload_kvm_vcpu(struct task_struct *tsk)
+{
+	if (tsk->kvm_vcpu)
+		kvm_hooks.vcpu_load(tsk->kvm_vcpu, raw_smp_processor_id());
+}
+
+#else
+
+static void unload_kvm_vcpu(struct task_struct *tsk)
+{
+}
+
+static void reload_kvm_vcpu(struct task_struct *tsk)
+{
+}
+
+#endif
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -1819,6 +1870,7 @@ void fastcall sched_exit(struct task_struct *p)
  */
 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
 {
+	unload_kvm_vcpu(current);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 }
@@ -1860,6 +1912,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
+	reload_kvm_vcpu(current);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 12:58 [PATCH][RFC] kvm-scheduler integration Avi Kivity
@ 2007-07-08 13:09 ` Ingo Molnar
  2007-07-08 13:16   ` Avi Kivity
  2007-07-08 13:35 ` Ingo Molnar
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-07-08 13:09 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-kernel, shaohua.li


* Avi Kivity <avi@qumranet.com> wrote:

> Intel VT essentially introduces a new set of registers into the 
> processor; this means we cannot preempt kvm in kernel mode lest a new 
> VM run with and old VM's registers.  In addition, kvm lazy switches 
> some host registers as well. (AMD does not introduce new registers, 
> but we still want lazy msr switching, and we want to know when we move 
> to a different cpu in order to be able to guarantee a monotonously 
> increasing tsc).
> 
> Current kvm code simply disables preemption when guest context is in 
> use. This, however, has many drawbacks:
> 
> - some kvm mmu code is O(n), causing possibly unbounded latencies and causing
>   -rt great unhappiness.
> - the mmu code wants to sleep (especially with guest paging), but can't.
> - some optimizations are not possible; for example, if we switch from one
>   VM to another, we need not restore some host registers (as they will simply
>   be overwritten with the new guest registers immediately).
> 
> This patch adds hooks to the scheduler that allow kvm to be notified 
> about scheduling decisions involving virtual machines.  When we 
> schedule out a VM, kvm is told to swap guest registers out; when we 
> schedule the VM in, we swap the registers back in.

hm, why not do what i have in -rt? See the patch below. Seems to work 
fine for me, although i might be missing something.

	Ingo

------------------------->
Subject: [patch] kvm: make vcpu_load/put preemptible
From: Ingo Molnar <mingo@elte.hu>

make vcpu_load/put preemptible.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/kvm/svm.c |   13 ++++++++++---
 drivers/kvm/vmx.c |   15 ++++++++++++---
 2 files changed, 22 insertions(+), 6 deletions(-)

Index: linux-rt-rebase.q/drivers/kvm/svm.c
===================================================================
--- linux-rt-rebase.q.orig/drivers/kvm/svm.c
+++ linux-rt-rebase.q/drivers/kvm/svm.c
@@ -610,9 +610,17 @@ static void svm_free_vcpu(struct kvm_vcp
 
 static void svm_vcpu_load(struct kvm_vcpu *vcpu)
 {
-	int cpu, i;
+	int cpu = raw_smp_processor_id(), i;
+	cpumask_t this_mask = cpumask_of_cpu(cpu);
+
+	/*
+	 * Keep the context preemptible, but do not migrate
+	 * away to another CPU. TODO: make sure this persists.
+	 * Save/restore original mask.
+	 */
+	if (unlikely(!cpus_equal(current->cpus_allowed, this_mask)))
+		set_cpus_allowed(current, cpumask_of_cpu(cpu));
 
-	cpu = get_cpu();
 	if (unlikely(cpu != vcpu->cpu)) {
 		u64 tsc_this, delta;
 
@@ -638,7 +646,6 @@ static void svm_vcpu_put(struct kvm_vcpu
 		wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]);
 
 	rdtscll(vcpu->host_tsc);
-	put_cpu();
 }
 
 static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
Index: linux-rt-rebase.q/drivers/kvm/vmx.c
===================================================================
--- linux-rt-rebase.q.orig/drivers/kvm/vmx.c
+++ linux-rt-rebase.q/drivers/kvm/vmx.c
@@ -241,9 +241,16 @@ static void vmcs_set_bits(unsigned long 
 static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
 {
 	u64 phys_addr = __pa(vcpu->vmcs);
-	int cpu;
+	int cpu = raw_smp_processor_id();
+	cpumask_t this_mask = cpumask_of_cpu(cpu);
 
-	cpu = get_cpu();
+	/*
+	 * Keep the context preemptible, but do not migrate
+	 * away to another CPU. TODO: make sure this persists.
+	 * Save/restore original mask.
+	 */
+	if (unlikely(!cpus_equal(current->cpus_allowed, this_mask)))
+		set_cpus_allowed(current, cpumask_of_cpu(cpu));
 
 	if (vcpu->cpu != cpu)
 		vcpu_clear(vcpu);
@@ -281,7 +288,6 @@ static void vmx_vcpu_load(struct kvm_vcp
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_put_guest_fpu(vcpu);
-	put_cpu();
 }
 
 static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -1862,6 +1868,7 @@ again:
 	}
 #endif
 
+	preempt_disable();
 	asm (
 		/* Store host registers */
 		"pushf \n\t"
@@ -2002,6 +2009,8 @@ again:
 
 		reload_tss();
 	}
+	preempt_enable();
+
 	++vcpu->stat.exits;
 
 #ifdef CONFIG_X86_64

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:09 ` Ingo Molnar
@ 2007-07-08 13:16   ` Avi Kivity
  2007-07-08 13:36     ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-07-08 13:16 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel, linux-kernel, shaohua.li

Ingo Molnar wrote:
> * Avi Kivity <avi@qumranet.com> wrote:
>
>   
>> Intel VT essentially introduces a new set of registers into the 
>> processor; this means we cannot preempt kvm in kernel mode lest a new 
>> VM run with and old VM's registers.  In addition, kvm lazy switches 
>> some host registers as well. (AMD does not introduce new registers, 
>> but we still want lazy msr switching, and we want to know when we move 
>> to a different cpu in order to be able to guarantee a monotonously 
>> increasing tsc).
>>
>> Current kvm code simply disables preemption when guest context is in 
>> use. This, however, has many drawbacks:
>>
>> - some kvm mmu code is O(n), causing possibly unbounded latencies and causing
>>   -rt great unhappiness.
>> - the mmu code wants to sleep (especially with guest paging), but can't.
>> - some optimizations are not possible; for example, if we switch from one
>>   VM to another, we need not restore some host registers (as they will simply
>>   be overwritten with the new guest registers immediately).
>>
>> This patch adds hooks to the scheduler that allow kvm to be notified 
>> about scheduling decisions involving virtual machines.  When we 
>> schedule out a VM, kvm is told to swap guest registers out; when we 
>> schedule the VM in, we swap the registers back in.
>>     
>
> hm, why not do what i have in -rt? See the patch below. Seems to work 
> fine for me, although i might be missing something.
>   


How can this work with >1 VM?  We need to execute vmptrld with the new 
VM's vmcs before touching any VT registers.

Recent kvm also keeps the guest syscall msrs on the host, so if you 
switch context to a process that then issues a syscall, you enter the 
host kernel at the guest syscall entry point... makes for nice stacktraces.

> 	Ingo
>
> ------------------------->
> Subject: [patch] kvm: make vcpu_load/put preemptible
> From: Ingo Molnar <mingo@elte.hu>
>
> make vcpu_load/put preemptible.
>
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
> ---
>  drivers/kvm/svm.c |   13 ++++++++++---
>  drivers/kvm/vmx.c |   15 ++++++++++++---
>  2 files changed, 22 insertions(+), 6 deletions(-)
>
> Index: linux-rt-rebase.q/drivers/kvm/svm.c
> ===================================================================
> --- linux-rt-rebase.q.orig/drivers/kvm/svm.c
> +++ linux-rt-rebase.q/drivers/kvm/svm.c
> @@ -610,9 +610,17 @@ static void svm_free_vcpu(struct kvm_vcp
>  
>  static void svm_vcpu_load(struct kvm_vcpu *vcpu)
>  {
> -	int cpu, i;
> +	int cpu = raw_smp_processor_id(), i;
> +	cpumask_t this_mask = cpumask_of_cpu(cpu);
> +
> +	/*
> +	 * Keep the context preemptible, but do not migrate
> +	 * away to another CPU. TODO: make sure this persists.
> +	 * Save/restore original mask.
> +	 */
> +	if (unlikely(!cpus_equal(current->cpus_allowed, this_mask)))
> +		set_cpus_allowed(current, cpumask_of_cpu(cpu));
>  
> -	cpu = get_cpu();
>  	if (unlikely(cpu != vcpu->cpu)) {
>  		u64 tsc_this, delta;
>  
> @@ -638,7 +646,6 @@ static void svm_vcpu_put(struct kvm_vcpu
>  		wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]);
>  
>  	rdtscll(vcpu->host_tsc);
> -	put_cpu();
>  }
>  
>  static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
> Index: linux-rt-rebase.q/drivers/kvm/vmx.c
> ===================================================================
> --- linux-rt-rebase.q.orig/drivers/kvm/vmx.c
> +++ linux-rt-rebase.q/drivers/kvm/vmx.c
> @@ -241,9 +241,16 @@ static void vmcs_set_bits(unsigned long 
>  static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
>  {
>  	u64 phys_addr = __pa(vcpu->vmcs);
> -	int cpu;
> +	int cpu = raw_smp_processor_id();
> +	cpumask_t this_mask = cpumask_of_cpu(cpu);
>  
> -	cpu = get_cpu();
> +	/*
> +	 * Keep the context preemptible, but do not migrate
> +	 * away to another CPU. TODO: make sure this persists.
> +	 * Save/restore original mask.
> +	 */
> +	if (unlikely(!cpus_equal(current->cpus_allowed, this_mask)))
> +		set_cpus_allowed(current, cpumask_of_cpu(cpu));
>  
>  	if (vcpu->cpu != cpu)
>  		vcpu_clear(vcpu);
> @@ -281,7 +288,6 @@ static void vmx_vcpu_load(struct kvm_vcp
>  static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
>  {
>  	kvm_put_guest_fpu(vcpu);
> -	put_cpu();
>  }
>  
>  static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
> @@ -1862,6 +1868,7 @@ again:
>  	}
>  #endif
>  
> +	preempt_disable();
>  	asm (
>  		/* Store host registers */
>  		"pushf \n\t"
> @@ -2002,6 +2009,8 @@ again:
>  
>  		reload_tss();
>  	}
> +	preempt_enable();
> +
>  	++vcpu->stat.exits;
>  
>  #ifdef CONFIG_X86_64
>   


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 12:58 [PATCH][RFC] kvm-scheduler integration Avi Kivity
  2007-07-08 13:09 ` Ingo Molnar
@ 2007-07-08 13:35 ` Ingo Molnar
  2007-07-08 13:41   ` Avi Kivity
  2007-07-08 19:07 ` Andi Kleen
  2007-07-09  8:50 ` Shaohua Li
  3 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-07-08 13:35 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-kernel, shaohua.li


* Avi Kivity <avi@qumranet.com> wrote:

> +#ifdef CONFIG_SCHED_KVM
> +static __read_mostly struct sched_kvm_hooks kvm_hooks;
> +#endif

please just add a current->put_vcpu() function pointer instead of this 
hooks thing.

>  static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
>  {
> +	unload_kvm_vcpu(current);
>  	prepare_lock_switch(rq, next);
>  	prepare_arch_switch(next);
>  }
> @@ -1860,6 +1912,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
>  	prev_state = prev->state;
>  	finish_arch_switch(prev);
>  	finish_lock_switch(rq, prev);
> +	reload_kvm_vcpu(current);

ok, this looks certainly cheap enough from a scheduler POV, and it 
cleans up the whole KVM/scheduling interaction quite nicely. (I'd not 
bother with tweaking the migration logic, there's enough incentive for 
the scheduler to keep tasks from migrating unnecessarily.)

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:16   ` Avi Kivity
@ 2007-07-08 13:36     ` Ingo Molnar
  0 siblings, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-07-08 13:36 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-kernel, shaohua.li


* Avi Kivity <avi@qumranet.com> wrote:

> > hm, why not do what i have in -rt? See the patch below. Seems to 
> > work fine for me, although i might be missing something.
> 
> How can this work with >1 VM?  We need to execute vmptrld with the new 
> VM's vmcs before touching any VT registers.

yeah, indeed...

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:35 ` Ingo Molnar
@ 2007-07-08 13:41   ` Avi Kivity
  2007-07-08 13:48     ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-07-08 13:41 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel, linux-kernel, shaohua.li

Ingo Molnar wrote:
> * Avi Kivity <avi@qumranet.com> wrote:
>
>   
>> +#ifdef CONFIG_SCHED_KVM
>> +static __read_mostly struct sched_kvm_hooks kvm_hooks;
>> +#endif
>>     
>
> please just add a current->put_vcpu() function pointer instead of this 
> hooks thing.
>
>   

Won't that increase task_struct (16 bytes on 64-bit) unnecessarily?  The 
function pointers are common to all virtual machines.

>>  static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
>>  {
>> +	unload_kvm_vcpu(current);
>>  	prepare_lock_switch(rq, next);
>>  	prepare_arch_switch(next);
>>  }
>> @@ -1860,6 +1912,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
>>  	prev_state = prev->state;
>>  	finish_arch_switch(prev);
>>  	finish_lock_switch(rq, prev);
>> +	reload_kvm_vcpu(current);
>>     
>
> ok, this looks certainly cheap enough from a scheduler POV, and it 
> cleans up the whole KVM/scheduling interaction quite nicely. (I'd not 
> bother with tweaking the migration logic, there's enough incentive for 
> the scheduler to keep tasks from migrating unnecessarily.)
>   

Okay.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:41   ` Avi Kivity
@ 2007-07-08 13:48     ` Ingo Molnar
  2007-07-08 13:53       ` Avi Kivity
  2007-07-08 23:32       ` [kvm-devel] " Rusty Russell
  0 siblings, 2 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-07-08 13:48 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-kernel, shaohua.li


* Avi Kivity <avi@qumranet.com> wrote:

> >>+#ifdef CONFIG_SCHED_KVM
> >>+static __read_mostly struct sched_kvm_hooks kvm_hooks;
> >>+#endif
> >
> >please just add a current->put_vcpu() function pointer instead of 
> >this hooks thing.
> 
> Won't that increase task_struct (16 bytes on 64-bit) unnecessarily?  
> The function pointers are common to all virtual machines.

well, this function pointer could then be reused by other virtual 
machines as well, couldnt it? If the task struct overhead is a problem 
(it really isnt, and it's dependent on CONFIG_KVM) then we could switch 
it around to a notifier-alike mechanism.

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:48     ` Ingo Molnar
@ 2007-07-08 13:53       ` Avi Kivity
  2007-07-08 13:59         ` Ingo Molnar
  2007-07-08 23:32       ` [kvm-devel] " Rusty Russell
  1 sibling, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-07-08 13:53 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel, linux-kernel, shaohua.li

Ingo Molnar wrote:
> * Avi Kivity <avi@qumranet.com> wrote:
>
>   
>>>> +#ifdef CONFIG_SCHED_KVM
>>>> +static __read_mostly struct sched_kvm_hooks kvm_hooks;
>>>> +#endif
>>>>         
>>> please just add a current->put_vcpu() function pointer instead of 
>>> this hooks thing.
>>>       
>> Won't that increase task_struct (16 bytes on 64-bit) unnecessarily?  
>> The function pointers are common to all virtual machines.
>>     
>
> well, this function pointer could then be reused by other virtual 
> machines as well, couldnt it? 

I don't get this.  If we add a couple of members to task_struct, it 
can't be reused.  The values will be the same across all tasks, but the 
memory will be gone (including tasks which aren't virtual machines).

> If the task struct overhead is a problem 
> (it really isnt, and it's dependent on CONFIG_KVM) then we could switch 
> it around to a notifier-alike mechanism.
>   

I'm hoping that CONFIG_KVM will be enabled on most distro kernels, so we 
need to optimize for that case as well.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:53       ` Avi Kivity
@ 2007-07-08 13:59         ` Ingo Molnar
  2007-07-08 15:13           ` Avi Kivity
  2007-07-10 11:18           ` Avi Kivity
  0 siblings, 2 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-07-08 13:59 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-kernel, shaohua.li


* Avi Kivity <avi@qumranet.com> wrote:

> >> Won't that increase task_struct (16 bytes on 64-bit) unnecessarily?  
> >> The function pointers are common to all virtual machines.
> >
> > well, this function pointer could then be reused by other virtual 
> > machines as well, couldnt it?
> 
> I don't get this.  If we add a couple of members to task_struct, it 
> can't be reused.  The values will be the same across all tasks, but 
> the memory will be gone (including tasks which aren't virtual 
> machines).

i mean, the function pointer is set by KVM, but it could be set to a 
different value by other hypervisors.

but ... no strong feelings either way, your patch is certainly fine.

Acked-by: Ingo Molnar <mingo@elte.hu>

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:59         ` Ingo Molnar
@ 2007-07-08 15:13           ` Avi Kivity
  2007-07-10 11:18           ` Avi Kivity
  1 sibling, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-07-08 15:13 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel, linux-kernel, shaohua.li

Ingo Molnar wrote:
> * Avi Kivity <avi@qumranet.com> wrote:
>
>   
>>>> Won't that increase task_struct (16 bytes on 64-bit) unnecessarily?  
>>>> The function pointers are common to all virtual machines.
>>>>         
>>> well, this function pointer could then be reused by other virtual 
>>> machines as well, couldnt it?
>>>       
>> I don't get this.  If we add a couple of members to task_struct, it 
>> can't be reused.  The values will be the same across all tasks, but 
>> the memory will be gone (including tasks which aren't virtual 
>> machines).
>>     
>
> i mean, the function pointer is set by KVM, but it could be set to a 
> different value by other hypervisors.
>
>   

What other hypervisors? ;-)

Oh, if Rusty wants it for lguest I can put a pointer or a chain into 
task_struct.  But I doubt it -- with paravirtualization there are no 
costly new registers to swap.

> but ... no strong feelings either way, your patch is certainly fine.
>
> Acked-by: Ingo Molnar <mingo@elte.hu>
>   

Thanks!

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 12:58 [PATCH][RFC] kvm-scheduler integration Avi Kivity
  2007-07-08 13:09 ` Ingo Molnar
  2007-07-08 13:35 ` Ingo Molnar
@ 2007-07-08 19:07 ` Andi Kleen
  2007-07-09  6:41   ` Avi Kivity
  2007-07-09  8:50 ` Shaohua Li
  3 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2007-07-08 19:07 UTC (permalink / raw)
  To: avi; +Cc: linux-kernel

Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> writes:
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 693f0e6..b705876 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -875,6 +875,10 @@ struct task_struct {
>  	pid_t pid;
>  	pid_t tgid;
>  
> +#ifdef CONFIG_SCHED_KVM
> +	struct kvm_vcpu *kvm_vcpu;
> +#endif

You should be careful to put this into a cache line that 
is already touched during context switch. Otherwise
if it needs an additional cache miss it might become
very costly

Also it's a bit worrying to expose hooks into the scheduler
to any modules. How would others be stopped from abusing this?

-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:48     ` Ingo Molnar
  2007-07-08 13:53       ` Avi Kivity
@ 2007-07-08 23:32       ` Rusty Russell
  2007-07-09  6:39         ` Avi Kivity
  1 sibling, 1 reply; 26+ messages in thread
From: Rusty Russell @ 2007-07-08 23:32 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Avi Kivity, kvm-devel, linux-kernel

On Sun, 2007-07-08 at 15:48 +0200, Ingo Molnar wrote:
> * Avi Kivity <avi@qumranet.com> wrote:
> 
> > >>+#ifdef CONFIG_SCHED_KVM
> > >>+static __read_mostly struct sched_kvm_hooks kvm_hooks;
> > >>+#endif
> > >
> > >please just add a current->put_vcpu() function pointer instead of 
> > >this hooks thing.
> > 
> > Won't that increase task_struct (16 bytes on 64-bit) unnecessarily?  
> > The function pointers are common to all virtual machines.
> 
> well, this function pointer could then be reused by other virtual 
> machines as well, couldnt it? If the task struct overhead is a problem 
> (it really isnt, and it's dependent on CONFIG_KVM) then we could switch 
> it around to a notifier-alike mechanism.

OK, this patch is *ugly*.  Not that there's anything wrong with a patch
which says "I'm going to preempt you", but making it kvm-specific is
ugly.  ISTR times past where I wanted such a hook, although none spring
immediately into my pre-coffee brain.

I think a "struct preempt_ops *" and a "void *preempt_ops_data" inside
every task struct is a better idea.  Call the config option
PREEMPT_SCHED_HOOKS and now there's nothing kvm-specific about it...

Cheers,
Rusty.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-08 23:32       ` [kvm-devel] " Rusty Russell
@ 2007-07-09  6:39         ` Avi Kivity
  2007-07-10  1:09           ` Rusty Russell
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-07-09  6:39 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, kvm-devel, linux-kernel

Rusty Russell wrote:
> On Sun, 2007-07-08 at 15:48 +0200, Ingo Molnar wrote:
>   
>> * Avi Kivity <avi@qumranet.com> wrote:
>>
>>     
>>>>> +#ifdef CONFIG_SCHED_KVM
>>>>> +static __read_mostly struct sched_kvm_hooks kvm_hooks;
>>>>> +#endif
>>>>>           
>>>> please just add a current->put_vcpu() function pointer instead of 
>>>> this hooks thing.
>>>>         
>>> Won't that increase task_struct (16 bytes on 64-bit) unnecessarily?  
>>> The function pointers are common to all virtual machines.
>>>       
>> well, this function pointer could then be reused by other virtual 
>> machines as well, couldnt it? If the task struct overhead is a problem 
>> (it really isnt, and it's dependent on CONFIG_KVM) then we could switch 
>> it around to a notifier-alike mechanism.
>>     
>
> OK, this patch is *ugly*.  Not that there's anything wrong with a patch
> which says "I'm going to preempt you", but making it kvm-specific is
> ugly.  ISTR times past where I wanted such a hook, although none spring
> immediately into my pre-coffee brain.
>
> I think a "struct preempt_ops *" and a "void *preempt_ops_data" inside
> every task struct is a better idea.  Call the config option
> PREEMPT_SCHED_HOOKS and now there's nothing kvm-specific about it...
>   

I considered that, but your proposal does not allow a single task to
have multiple preemption hooks installed (hookers?!).  Since in general
there's no reason to suppose that users would be mutually exclusive, we
need to have a struct hlist of these things.  All in all this seemed to
indicate that the second user should have the honor of figuring out that
stuff.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 19:07 ` Andi Kleen
@ 2007-07-09  6:41   ` Avi Kivity
  0 siblings, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-07-09  6:41 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, KVM

[cc list restored]

Andi Kleen wrote:
> Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> writes:
>   
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 693f0e6..b705876 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -875,6 +875,10 @@ struct task_struct {
>>  	pid_t pid;
>>  	pid_t tgid;
>>  
>> +#ifdef CONFIG_SCHED_KVM
>> +	struct kvm_vcpu *kvm_vcpu;
>> +#endif
>>     
>
> You should be careful to put this into a cache line that 
> is already touched during context switch. Otherwise
> if it needs an additional cache miss it might become
> very costly
>   

Yeah, I'll look into task_struct again to find a good place.

> Also it's a bit worrying to expose hooks into the scheduler
> to any modules. How would others be stopped from abusing this?
>   

The functions for installing a hook are out-of-line and
EXPORT_SYMBOL_GPL'ed, in case that helps.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 12:58 [PATCH][RFC] kvm-scheduler integration Avi Kivity
                   ` (2 preceding siblings ...)
  2007-07-08 19:07 ` Andi Kleen
@ 2007-07-09  8:50 ` Shaohua Li
  2007-07-09  9:46   ` Avi Kivity
  3 siblings, 1 reply; 26+ messages in thread
From: Shaohua Li @ 2007-07-09  8:50 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Ingo Molnar, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 991 bytes --]

On Sun, 2007-07-08 at 20:58 +0800, Avi Kivity wrote:
> The only fly in the ointment is that it crashes quite soon.  Haven't
> figured
> out why yet, but comments on the general direction would be welcome.
Attached patch seems help in my test. prepare_task_switch is called with
irq disabled.


> -static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
> +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  {
>         u64 phys_addr = __pa(vcpu->vmcs);
> -       int cpu;
>         u64 tsc_this, delta;
> 
> -       cpu = get_cpu();
> -
> +       WARN_ON(!preempt_count());
This and below change will break preempt disabled case. better remove
them.

>         if (vcpu->cpu != cpu)
>                 vcpu_clear(vcpu);
> 
> @@ -426,9 +432,9 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
> 
>  static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
>  {
> +       WARN_ON(!preempt_count());
>         vmx_load_host_state(vcpu);
>         kvm_put_guest_fpu(vcpu);
> -       put_cpu();
>  }


[-- Attachment #2: dbg.patch --]
[-- Type: text/x-patch, Size: 779 bytes --]

Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -347,6 +347,7 @@ static void vmx_save_host_state(struct k
 static void vmx_load_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vmx_host_state *hs = &vcpu->vmx_host_state;
+	unsigned long flags;
 
 	if (!hs->loaded)
 		return;
@@ -359,12 +360,12 @@ static void vmx_load_host_state(struct k
 		 * If we have to reload gs, we must take care to
 		 * preserve our gs base.
 		 */
-		local_irq_disable();
+		local_irq_save(flags);
 		load_gs(hs->gs_sel);
 #ifdef CONFIG_X86_64
 		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
 #endif
-		local_irq_enable();
+		local_irq_restore(flags);
 
 		reload_tss();
 	}

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-09  8:50 ` Shaohua Li
@ 2007-07-09  9:46   ` Avi Kivity
  2007-07-09 10:21     ` Avi Kivity
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-07-09  9:46 UTC (permalink / raw)
  To: Shaohua Li; +Cc: kvm-devel, Ingo Molnar, linux-kernel

Shaohua Li wrote:
> On Sun, 2007-07-08 at 20:58 +0800, Avi Kivity wrote:
>   
>> The only fly in the ointment is that it crashes quite soon.  Haven't
>> figured
>> out why yet, but comments on the general direction would be welcome.
>>     
> Attached patch seems help in my test. prepare_task_switch is called with
> irq disabled.
>
>
>   

Thanks!!  Will integrate that.


>> -static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
>> +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>  {
>>         u64 phys_addr = __pa(vcpu->vmcs);
>> -       int cpu;
>>         u64 tsc_this, delta;
>>
>> -       cpu = get_cpu();
>> -
>> +       WARN_ON(!preempt_count());
>>     
> This and below change will break preempt disabled case. better remove
> them.
>   


Right.  Will take them out.



-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-09  9:46   ` Avi Kivity
@ 2007-07-09 10:21     ` Avi Kivity
  0 siblings, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-07-09 10:21 UTC (permalink / raw)
  To: Shaohua Li; +Cc: kvm-devel, Ingo Molnar, linux-kernel

Avi Kivity wrote:
> Shaohua Li wrote:
>> On Sun, 2007-07-08 at 20:58 +0800, Avi Kivity wrote:
>>  
>>> The only fly in the ointment is that it crashes quite soon.  Haven't
>>> figured
>>> out why yet, but comments on the general direction would be welcome.
>>>     
>> Attached patch seems help in my test. prepare_task_switch is called with
>> irq disabled.
>>
>>
>>   
>
> Thanks!!  Will integrate that.
>


I've committed and pushed this into a new 'sched' branch on kvm.git.  
I'll continually rebase this against 'master'.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-09  6:39         ` Avi Kivity
@ 2007-07-10  1:09           ` Rusty Russell
  2007-07-10  5:53             ` Avi Kivity
  0 siblings, 1 reply; 26+ messages in thread
From: Rusty Russell @ 2007-07-10  1:09 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, kvm-devel, linux-kernel

On Mon, 2007-07-09 at 09:39 +0300, Avi Kivity wrote:
> Rusty Russell wrote:
> > I think a "struct preempt_ops *" and a "void *preempt_ops_data" inside
> > every task struct is a better idea.  Call the config option
> > PREEMPT_SCHED_HOOKS and now there's nothing kvm-specific about it...
> >   
> 
> I considered that, but your proposal does not allow a single task to
> have multiple preemption hooks installed (hookers?!).  Since in general
> there's no reason to suppose that users would be mutually exclusive, we
> need to have a struct hlist of these things.  All in all this seemed to
> indicate that the second user should have the honor of figuring out that
> stuff.

No; this is a "I'm doing something magic and need to know before someone
else takes the CPU".  Almost by definition, you cannot have two of them
at the same time.  Let someone else try that if and when...

But having different hooks for different tasks makes a great deal of
sense.  This hook makes a great deal of sense.

But KVM-specific code in the scheduler is just wrong, and I think we all
know that.

Cheers,
Rusty.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-10  1:09           ` Rusty Russell
@ 2007-07-10  5:53             ` Avi Kivity
  2007-07-10  6:47               ` Rusty Russell
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-07-10  5:53 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, kvm-devel, linux-kernel

Rusty Russell wrote:
> On Mon, 2007-07-09 at 09:39 +0300, Avi Kivity wrote:
>   
>> Rusty Russell wrote:
>>     
>>> I think a "struct preempt_ops *" and a "void *preempt_ops_data" inside
>>> every task struct is a better idea.  Call the config option
>>> PREEMPT_SCHED_HOOKS and now there's nothing kvm-specific about it...
>>>   
>>>       
>> I considered that, but your proposal does not allow a single task to
>> have multiple preemption hooks installed (hookers?!).  Since in general
>> there's no reason to suppose that users would be mutually exclusive, we
>> need to have a struct hlist of these things.  All in all this seemed to
>> indicate that the second user should have the honor of figuring out that
>> stuff.
>>     
>
> No; this is a "I'm doing something magic and need to know before someone
> else takes the CPU".  Almost by definition, you cannot have two of them
> at the same time.  Let someone else try that if and when...
>   

Why can't you have two of them?  Say I'm writing a module to utilize
branch recording to be able to debug a process in reverse (of course
that doesn't really need sched hooks; let's pretend it does).  Why can't
I debug a process that uses kvm?

More importantly, now the two subsystems have to know about each other
so they don't step on each other's toes.

> But having different hooks for different tasks makes a great deal of
> sense.  This hook makes a great deal of sense.
>   

If we make the hooks non-kvm-specific, I'd just add an hlist there.

> But KVM-specific code in the scheduler is just wrong, and I think we all
> know that.
>   

Even if I eradicate all mention of kvm from the patch, it's still kvm
specific.  kvm at least is sensitive to the exact point where we switch
in (it wants interrupts enabled) and it expects certain parameters to
the callbacks.  If $new_abuser needs other conditions or parameters,
which is quite likely IMO as it will most likely have to do with
hardware, then we will need to update the hooks anyway.

Removing 'kvm' from the patch is easy; but I don't like to pretend to
generality where there ain't.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-10  5:53             ` Avi Kivity
@ 2007-07-10  6:47               ` Rusty Russell
  2007-07-10  7:19                 ` Avi Kivity
  0 siblings, 1 reply; 26+ messages in thread
From: Rusty Russell @ 2007-07-10  6:47 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, kvm-devel, linux-kernel

On Tue, 2007-07-10 at 08:53 +0300, Avi Kivity wrote:
> Rusty Russell wrote:
> > No; this is a "I'm doing something magic and need to know before someone
> > else takes the CPU".  Almost by definition, you cannot have two of them
> > at the same time.  Let someone else try that if and when...
> 
> Why can't you have two of them?  Say I'm writing a module to utilize
> branch recording to be able to debug a process in reverse (of course
> that doesn't really need sched hooks; let's pretend it does).  Why can't
> I debug a process that uses kvm?
>
> More importantly, now the two subsystems have to know about each other
> so they don't step on each other's toes.

Exactly, if we have two at the same time, they need to know about each
other.  Providing infrastructure which lets them avoid thinking about it
is the wrong direction.

> > But KVM-specific code in the scheduler is just wrong, and I think we all
> > know that.
> 
> Even if I eradicate all mention of kvm from the patch, it's still kvm
> specific.  kvm at least is sensitive to the exact point where we switch
> in (it wants interrupts enabled) and it expects certain parameters to
> the callbacks.  If $new_abuser needs other conditions or parameters,
> which is quite likely IMO as it will most likely have to do with
> hardware, then we will need to update the hooks anyway.

If it's not general, then this whole approach is wrong: put it in
arch/*/kernel/process.c:__switch_to and finish_arch_switch.  The
congruent case which comes to mind is lazy FPU handling.

Which brings us to the question: why do you want interrupts enabled?

Cheers,
Rusty.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-10  6:47               ` Rusty Russell
@ 2007-07-10  7:19                 ` Avi Kivity
  2007-07-10  8:01                   ` Rusty Russell
  2007-07-11  5:50                   ` Avi Kivity
  0 siblings, 2 replies; 26+ messages in thread
From: Avi Kivity @ 2007-07-10  7:19 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, kvm-devel, linux-kernel

Rusty Russell wrote:
> On Tue, 2007-07-10 at 08:53 +0300, Avi Kivity wrote:
>   
>> Rusty Russell wrote:
>>     
>>> No; this is a "I'm doing something magic and need to know before someone
>>> else takes the CPU".  Almost by definition, you cannot have two of them
>>> at the same time.  Let someone else try that if and when...
>>>       
>> Why can't you have two of them?  Say I'm writing a module to utilize
>> branch recording to be able to debug a process in reverse (of course
>> that doesn't really need sched hooks; let's pretend it does).  Why can't
>> I debug a process that uses kvm?
>>
>> More importantly, now the two subsystems have to know about each other
>> so they don't step on each other's toes.
>>     
>
> Exactly, if we have two at the same time, they need to know about each
> other.  Providing infrastructure which lets them avoid thinking about it
> is the wrong direction.
>   

With a kvm-specific hook, they can't stomp on each other (there can only 
be one).
With a list, they don't stomp on each other.
With a struct preempt_ops but no list, as you propose, they can and will 
stomp on each other.

>   
>>> But KVM-specific code in the scheduler is just wrong, and I think we all
>>> know that.
>>>       
>> Even if I eradicate all mention of kvm from the patch, it's still kvm
>> specific.  kvm at least is sensitive to the exact point where we switch
>> in (it wants interrupts enabled) and it expects certain parameters to
>> the callbacks.  If $new_abuser needs other conditions or parameters,
>> which is quite likely IMO as it will most likely have to do with
>> hardware, then we will need to update the hooks anyway.
>>     
>
> If it's not general, then this whole approach is wrong: put it in
> arch/*/kernel/process.c:__switch_to and finish_arch_switch.  

I imagine other kvm ports will also need this.  It's not arch specific, 
just kvm specific (but that's not really fair: other archs might want 
the switch in another place, or they might not need it after all).

I guess I can put it in arch specific code, but that means both i386 and 
x86_64.

Once we have another user we can try to generalize it.

> The
> congruent case which comes to mind is lazy FPU handling.
>   

That one has preempt_ops in hardware: cr0.ts and #NM.

> Which brings us to the question: why do you want interrupts enabled?
>   

The sched in hook (vcpu_load) sometimes needs to issue an IPI in order 
to flush the VT registers from another cpu into memory.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-10  7:19                 ` Avi Kivity
@ 2007-07-10  8:01                   ` Rusty Russell
  2007-07-10  8:24                     ` Avi Kivity
  2007-07-11  5:50                   ` Avi Kivity
  1 sibling, 1 reply; 26+ messages in thread
From: Rusty Russell @ 2007-07-10  8:01 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, kvm-devel, linux-kernel

On Tue, 2007-07-10 at 10:19 +0300, Avi Kivity wrote:
> Rusty Russell wrote:
> > Exactly, if we have two at the same time, they need to know about each
> > other.  Providing infrastructure which lets them avoid thinking about it
> > is the wrong direction.
> >   
> 
> With a kvm-specific hook, they can't stop on each other (there can only 
> be one).
> With a list, they don't stomp on each other.
> With a struct preempt_ops but no list, as you propose, they can and will 
> stomp on each other.

I'm not talking about the actual overwriting of someone else's hook.
I'm talking about semantic conflicts involving the actual CPU state.

If I'm lazily restoring some CPU state because I know I don't use it,
and you're lazily restoring some CPU state because you don't use it, we
need to make sure that state doesn't intersect: ie. we need to be aware
of each other.  Only providing a single hook per task forces the second
user to think about it (maybe that lazy state saving needs to be
extracted into common code).

> I guess I can put it in arch specific code, but that means both i386 and 
> x86_64.
> 
> Once we have another user we can try to generalize it.

The problem is that the arch hooks are in the wrong place: 

> > Which brings us to the question: why do you want interrupts enabled?
> 
> The sched in hook (vcpu_load) sometimes needs to issue an IPI in order 
> to flush the VT registers from another cpu into memory.

OK, I'll have to go away and read the code for this.

BTW, I have no problem with #ifdef KVM-style code in arch-specifics.
It's kernel/sched.c which is jarring...

Thanks,
Rusty.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-10  8:01                   ` Rusty Russell
@ 2007-07-10  8:24                     ` Avi Kivity
  0 siblings, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-07-10  8:24 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, kvm-devel, linux-kernel

Rusty Russell wrote:
> On Tue, 2007-07-10 at 10:19 +0300, Avi Kivity wrote:
>   
>> Rusty Russell wrote:
>>     
>>> Exactly, if we have two at the same time, they need to know about each
>>> other.  Providing infrastructure which lets them avoid thinking about it
>>> is the wrong direction.
>>>   
>>>       
>> With a kvm-specific hook, they can't stomp on each other (there can only 
>> be one).
>> With a list, they don't stomp on each other.
>> With a struct preempt_ops but no list, as you propose, they can and will 
>> stomp on each other.
>>     
>
> I'm not talking about the actual overwriting of someone else's hook.
> I'm talking about semantic conflicts involving the actual CPU state.
>
> If I'm lazily restoring some CPU state because I know I don't use it,
> and you're lazily restoring some CPU state because you don't use it, we
> need to make sure that state doesn't intersect: ie. we need to be aware
> of each other.  Only providing a single hook per task forces the second
> user to think about it (maybe that lazy state saving needs to be
> extracted into common code).
>   

Well, if there's another user of VT instructions, then we certainly need 
to have something central to coordinate it.  No API can prevent this; at 
some point we'll be forced to use common sense.

>> I guess I can put it in arch specific code, but that means both i386 and 
>> x86_64.
>>
>> Once we have another user we can try to generalize it.
>>     
>
> The problem is that the arch hooks are in the wrong place: 
>
>   

Yes.

>>> Which brings us to the question: why do you want interrupts enabled?
>>>       
>> The sched in hook (vcpu_load) sometimes needs to issue an IPI in order 
>> to flush the VT registers from another cpu into memory.
>>     
>
> OK, I'll have to go away and read the code for this.
>
> BTW, I have no problem with #ifdef KVM-style code in arch-specifics.
> It's kernel/sched.c which is jarring...
>   

We don't want you jarred, do we? I'll prepare a non-kvm-specific patch 
for review later on.  But I can't bring myself to do a single generic 
hook (it's impossible to use correctly); it will be an hlist-based thing.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-08 13:59         ` Ingo Molnar
  2007-07-08 15:13           ` Avi Kivity
@ 2007-07-10 11:18           ` Avi Kivity
  2007-07-10 11:30             ` Ingo Molnar
  1 sibling, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-07-10 11:18 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel, linux-kernel, shaohua.li

Ingo Molnar wrote:
> * Avi Kivity <avi@qumranet.com> wrote:
>
>   
>>>> Won't that increase task_struct (16 bytes on 64-bit) unnecessarily?  
>>>> The function pointers are common to all virtual machines.
>>>>         
>>> well, this function pointer could then be reused by other virtual 
>>> machines as well, couldnt it?
>>>       
>> I don't get this.  If we add a couple of members to task_struct, it 
>> can't be reused.  The values will be the same across all tasks, but 
>> the memory will be gone (including tasks which aren't virtual 
>> machines).
>>     
>
> i mean, the function pointer is set by KVM, but it could be set to a 
> different value by other hypervisors.
>
> but ... no strong feelings either way, your patch is certainly fine.
>
> Acked-by: Ingo Molnar <mingo@elte.hu>
>
> 	Ingo
>   

How do you feel about some variant of this going into 2.6.23-rc1?  I 
initially thought of this as a 2.6.24 thing, but as it now looks solid, 
maybe we can hurry things along.

If Shaohua ports his spinlock->mutex conversion to the sched branch, we 
get some real benefits:

 - reduced latencies for desktop users
 - less kvm patches to carry in -rt (maybe none?)


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH][RFC] kvm-scheduler integration
  2007-07-10 11:18           ` Avi Kivity
@ 2007-07-10 11:30             ` Ingo Molnar
  0 siblings, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-07-10 11:30 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-kernel, shaohua.li


* Avi Kivity <avi@qumranet.com> wrote:

> How do you feel about some variant of this going into 2.6.23-rc1?  I 
> initially thought of this as a 2.6.24 thing, but as it now looks 
> solid, maybe we can hurry things along.

yeah, fine by me. You were working on a more generic patch, right? I 
think a good approach would be an atomic notifier chain.

> If Shaohua ports his spinlock->mutex conversion to the sched branch, 
> we get some real benefits:
> 
> - reduced latencies for desktop users
> - less kvm patches to carry in -rt (maybe none?)

yeah, and i think we can reach 'none'.

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [kvm-devel] [PATCH][RFC] kvm-scheduler integration
  2007-07-10  7:19                 ` Avi Kivity
  2007-07-10  8:01                   ` Rusty Russell
@ 2007-07-11  5:50                   ` Avi Kivity
  1 sibling, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-07-11  5:50 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ingo Molnar, kvm-devel, linux-kernel

Avi Kivity wrote:
>
>> The
>> congruent case which comes to mind is lazy FPU handling.
>>   
>
> That one has preempt_ops in hardware: cr0.ts and #NM.

However, that doesn't handle in-kernel use of the fpu.  
kernel_fpu_begin()/kernel_fpu_end() could easily be modified to take 
advantage of a generic preempt hook.  I'll add a patch for that, and we 
gain preemptible raid-5 parity calculation.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-07-11  6:18 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-07-08 12:58 [PATCH][RFC] kvm-scheduler integration Avi Kivity
2007-07-08 13:09 ` Ingo Molnar
2007-07-08 13:16   ` Avi Kivity
2007-07-08 13:36     ` Ingo Molnar
2007-07-08 13:35 ` Ingo Molnar
2007-07-08 13:41   ` Avi Kivity
2007-07-08 13:48     ` Ingo Molnar
2007-07-08 13:53       ` Avi Kivity
2007-07-08 13:59         ` Ingo Molnar
2007-07-08 15:13           ` Avi Kivity
2007-07-10 11:18           ` Avi Kivity
2007-07-10 11:30             ` Ingo Molnar
2007-07-08 23:32       ` [kvm-devel] " Rusty Russell
2007-07-09  6:39         ` Avi Kivity
2007-07-10  1:09           ` Rusty Russell
2007-07-10  5:53             ` Avi Kivity
2007-07-10  6:47               ` Rusty Russell
2007-07-10  7:19                 ` Avi Kivity
2007-07-10  8:01                   ` Rusty Russell
2007-07-10  8:24                     ` Avi Kivity
2007-07-11  5:50                   ` Avi Kivity
2007-07-08 19:07 ` Andi Kleen
2007-07-09  6:41   ` Avi Kivity
2007-07-09  8:50 ` Shaohua Li
2007-07-09  9:46   ` Avi Kivity
2007-07-09 10:21     ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox