Some Code for Performance Profiling

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

* Some Code for Performance Profiling
@ 2010-03-31 16:53 Jiaqing Du
  2010-04-05  8:34 ` Avi Kivity
  0 siblings, 1 reply; 4+ messages in thread
From: Jiaqing Du @ 2010-03-31 16:53 UTC (permalink / raw)
  To: kvm; +Cc: Nipun sehrawat

Hi,

We have some code for performance profiling in KVM. It is the output
of a school project. Previous discussions on the KVM, Perfmon2, and Xen
mailing lists helped us a lot. The code is NOT in good shape and
is only meant to demonstrate the feasibility of doing performance
profiling in KVM. Feel free to use it if you want.

We categorize performance profiling in a virtualized environment into
two types: *guest-wide profiling* and *system-wide profiling*. For
guest-wide profiling, only the guest is profiled. KVM virtualizes the
PMU and the user runs a profiler directly in the guest. It requires no
modifications to the guest OS and the profiler running in the guest.
For system-wide profiling, both KVM and the guest OS are profiled. The
results are similar to what XenOprof outputs. In this case, one
profiler runs in the host and another profiler runs in the guest.
Still it requires no modifications to the guest and the profiler
running in it.

For guest-wide profiling, there are two possible places to save and
restore the related MSRs. One is where the CPU switches between guest
mode and host mode. We call this *CPU-switch*. Profiling with this
enabled reflects how the guest behaves on the physical CPU, plus other
virtualized, not emulated, devices. The other place is where the CPU
switches between the KVM context and others. Here KVM context means
the CPU is executing guest code or KVM code, both kernel space and
user space. We call this *domain-switch*. Profiling with this enabled
discloses how the guest behaves on both the physical CPU and KVM.
(Some emulated operations are really expensive in a virtualized
environment.)

More details can be found at http://jiaqing.org/download/profiling_kvm.tgz


=============Guest-wide profiling with domain-switch, for Linux-2.6.32==================

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2..b749b5d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -96,6 +96,7 @@ struct thread_info {
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_VPMU_CTXSW      29  /* KVM thread tag */

 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -119,6 +120,7 @@ struct thread_info {
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_VPMU_CTXSW         (1 << TIF_VPMU_CTXSW)

 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -146,8 +148,9 @@ struct thread_info {

 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
-
+	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC|   \
+     _TIF_VPMU_CTXSW)
+
 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2..d5269d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -178,6 +178,53 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 }

+static const u32 vmx_pmu_msr_index[] = {	/* PMU MSRs handled by vpmu_save_msrs()/vpmu_load_msrs() */
+  MSR_P6_EVNTSEL0, MSR_P6_EVNTSEL1, MSR_P6_PERFCTR0, MSR_P6_PERFCTR1,
+};
+#define NR_VMX_PMU_MSR ARRAY_SIZE(vmx_pmu_msr_index)
+static u64 vpmu_msr_list[NR_VMX_PMU_MSR];	/* NOTE(review): one file-scope save area, shared by every tagged task and CPU */
+
+/* Write the values held in msr_list back into the PMU MSRs; msr_list[k]
+ * belongs to the MSR named by vmx_pmu_msr_index[k]. */
+static void vpmu_load_msrs(u64 *msr_list)
+{
+	int k;
+
+	for (k = 0; k < NR_VMX_PMU_MSR; k++) {
+		wrmsrl(vmx_pmu_msr_index[k], msr_list[k]);
+	}
+}
+
+/* Read the current PMU MSR values into msr_list; msr_list[k] receives the
+ * MSR named by vmx_pmu_msr_index[k]. */
+static void vpmu_save_msrs(u64 *msr_list)
+{
+	int k;
+
+	for (k = 0; k < NR_VMX_PMU_MSR; k++) {
+		rdmsrl(vmx_pmu_msr_index[k], msr_list[k]);
+	}
+}
+
+#define P6_EVENTSEL0_ENABLE     (1 << 22)
+/* Set the enable bit (bit 22) in MSR_P6_EVNTSEL0, leaving other bits as read. */
+static void enable_perf(void)
+{
+	u64 evntsel0;
+
+	rdmsrl(MSR_P6_EVNTSEL0, evntsel0);
+	wrmsrl(MSR_P6_EVNTSEL0, evntsel0 | P6_EVENTSEL0_ENABLE);
+}
+
+/* Clear the enable bit in MSR_P6_EVNTSEL0, leaving other bits as read. */
+static void disable_perf(void)
+{
+	u64 evntsel0;
+
+	rdmsrl(MSR_P6_EVNTSEL0, evntsel0);
+	wrmsrl(MSR_P6_EVNTSEL0, evntsel0 & ~P6_EVENTSEL0_ENABLE);
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
@@ -186,6 +233,21 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;

+    if (test_tsk_thread_flag(prev_p, TIF_VPMU_CTXSW) &&
+            test_tsk_thread_flag(next_p, TIF_VPMU_CTXSW)) {
+        /* do nothing, still in KVM context */
+    } else {
+        if (test_tsk_thread_flag(prev_p, TIF_VPMU_CTXSW)) {
+            disable_perf();
+            vpmu_save_msrs(vpmu_msr_list);
+        }
+
+        if (test_tsk_thread_flag(next_p, TIF_VPMU_CTXSW)) {
+            vpmu_load_msrs(vpmu_msr_list);
+            enable_perf();
+        }
+    }
+
 	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
 	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
 		ds_switch_to(prev_p, next_p);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42..4f4ff86 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -34,6 +34,7 @@
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
+#include <linux/kdebug.h>

 #include "trace.h"

@@ -127,6 +128,7 @@ static u64 construct_eptp(unsigned long root_hpa);
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct kvm_vcpu *, cur_exit_vcpu);

 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
@@ -3603,6 +3605,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+    int cpu = raw_smp_processor_id();

 	if (enable_ept && is_paging(vcpu)) {
 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -3639,6 +3642,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->arch.switch_db_regs)
 		set_debugreg(vcpu->arch.dr6, 6);

+    /* record the exited vcpu */
+    per_cpu(cur_exit_vcpu, cpu) = vcpu;
+
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -3985,6 +3991,43 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.gb_page_enable = vmx_gb_page_enable,
 };

+static void guest_set_apic(void *info)	/* program LVTPC for NMI delivery; info is unused */
+{
+    unsigned int v;
+
+    v = apic_read(APIC_LVTERR);	/* remember the current LVTERR value */
+    apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);	/* mask LVTERR while LVTPC is rewritten */
+    apic_write(APIC_LVTPC, APIC_DM_NMI);	/* value written carries only the NMI delivery mode */
+    apic_write(APIC_LVTERR, v);	/* put the saved LVTERR value back */
+}
+
+/*
+ * Die-notifier hook: treat an NMI as a guest counter overflow and flag the
+ * vcpu last entered on this CPU; vcpu_enter_guest() turns it into a guest NMI.
+ */
+static int vmx_vcpu_nmi_notify(struct notifier_block *self,
+                   unsigned long val, void *data)
+{
+	struct kvm_vcpu *vcpu = per_cpu(cur_exit_vcpu, raw_smp_processor_id());
+
+	if (val != DIE_NMI && val != DIE_NMI_IPI)
+		return NOTIFY_DONE;
+	/* No vcpu has run on this CPU yet: not ours, let other handlers see it. */
+	if (!vcpu)
+		return NOTIFY_DONE;
+
+	guest_set_apic(NULL);	/* program LVTPC again before returning */
+	vcpu->cntr_overflow = 1;
+	vcpu->nmi_nr++;
+	return NOTIFY_STOP;	/* consume the NMI */
+}
+
+static struct notifier_block vmx_vcpu_nb = {	/* passed to register_die_notifier() in vmx_init() */
+        .notifier_call = vmx_vcpu_nmi_notify,	/* callback invoked for die-chain events */
+        .next = NULL,
+        .priority = 3	/* NOTE(review): rationale for 3 is not shown here -- confirm */
+};
+
 static int __init vmx_init(void)
 {
 	int r;
@@ -4036,6 +4079,17 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);

+	vmx_disable_intercept_for_msr(MSR_P6_PERFCTR0, false);
+	vmx_disable_intercept_for_msr(MSR_P6_PERFCTR1, false);
+	vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL0, false);
+	vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL1, false);
+
+    if (register_die_notifier(&vmx_vcpu_nb)) {
+        printk(KERN_ALERT "[hw_vpmu]: Register NMI handler failed..\n");
+    } else {
+        printk(KERN_ALERT "[hw_vpmu]: Register NMI handler succeeded..\n");
+    }
+
 	if (enable_ept) {
 		bypass_guest_pf = 0;
 		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
@@ -4071,6 +4125,9 @@ static void __exit vmx_exit(void)
 	free_page((unsigned long)vmx_io_bitmap_b);
 	free_page((unsigned long)vmx_io_bitmap_a);

+    unregister_die_notifier(&vmx_vcpu_nb);
+	printk(KERN_ALERT "[hw_vpmu]: Remove NMI handler module..\n");
+
 	kvm_exit();
 }

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae07d26..1abedb4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3615,6 +3615,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto out;
 	}

+    if (vcpu->cntr_overflow) {
+            vcpu->arch.nmi_pending = 1;
+            vcpu->cntr_overflow = 0;
+    }
+
 	inject_pending_event(vcpu, kvm_run);

 	/* enable NMI/IRQ window open exits if needed */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7bbb5d..96d63d1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -99,6 +99,9 @@ struct kvm_vcpu {
 	gpa_t mmio_phys_addr;
 #endif

+    int cntr_overflow;
+    int nmi_nr;
+
 	struct kvm_vcpu_arch arch;
 };

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index cf24c20..b0942c1 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -225,6 +225,9 @@ extern int flush_work(struct work_struct *work);

 extern int cancel_work_sync(struct work_struct *work);

+extern struct task_struct * thread_of_workqueue(struct workqueue_struct *wq,
+        int cpu);
+
 /*
  * Kill off a pending schedule_delayed_work().  Note that the work callback
  * function may still be running on return from cancel_delayed_work(), unless
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b..5eb9503 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -150,6 +150,15 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }

+/* Look up the per-CPU workqueue state of @wq for @cpu via wq_per_cpu() and
+ * hand back its worker task (the ->thread field). */
+struct task_struct * thread_of_workqueue(struct workqueue_struct *wq,
+        int cpu)
+{
+	return wq_per_cpu(wq, cpu)->thread;
+}
+EXPORT_SYMBOL_GPL(thread_of_workqueue);
+
 /**
  * queue_work - queue work on a workqueue
  * @wq: workqueue to use
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index bb4ebd8..33b5da8 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -318,10 +318,18 @@ kvm_irqfd_release(struct kvm *kvm)
  */
 static int __init irqfd_module_init(void)
 {
+    int cpu = raw_smp_processor_id();	/* CPU passed to the worker lookup below */
+    struct task_struct *thread;
+
 	irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
 	if (!irqfd_cleanup_wq)
 		return -ENOMEM;

+    thread = thread_of_workqueue(irqfd_cleanup_wq, cpu);	/* worker task of the cleanup workqueue */
+    set_tsk_thread_flag(thread, TIF_VPMU_CTXSW);	/* tag it so the TIF_VPMU_CTXSW checks in __switch_to_xtra() apply */
+    printk(KERN_ALERT "[hw_vpmu]: monitored irqfd thread id = %d\n",
+            (int)thread->pid);
+
 	return 0;
 }

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7495ce3..355bff5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1809,6 +1809,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		kvm->bsp_vcpu = vcpu;
 #endif
 	mutex_unlock(&kvm->lock);
+
+    set_tsk_thread_flag(current, TIF_VPMU_CTXSW);
+    printk(KERN_ALERT "[hw_vpmu]: monitored vcpu thread id = %d\n",
+            (int)current->pid);
+
 	return r;

 vcpu_destroy:
@@ -2360,6 +2365,10 @@ static int kvm_dev_ioctl_create_vm(void)
 	if (fd < 0)
 		kvm_put_kvm(kvm);

+    set_tsk_thread_flag(current, TIF_VPMU_CTXSW);
+    printk(KERN_ALERT "[hw_vpmu]: monitored main thread id = %d\n",
+            (int)current->pid);
+
 	return fd;
 }



=============Guest-wide profiling with cpu-switch, for Linux-2.6.32==================

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42..970b5ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -34,6 +34,7 @@
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
+#include <linux/kdebug.h>

 #include "trace.h"

@@ -114,6 +115,9 @@ struct vcpu_vmx {
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	u32 exit_reason;
+
+    unsigned long *msr_host_load_store;
+    unsigned long *msr_guest_load_store;
 };

 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -127,12 +131,18 @@ static u64 construct_eptp(unsigned long root_hpa);
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct kvm_vcpu *, cur_exit_vcpu);

 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;

+static const u32 vmx_pmu_msr_index[] = {
+  MSR_P6_EVNTSEL0, MSR_P6_EVNTSEL1, MSR_P6_PERFCTR0, MSR_P6_PERFCTR1,
+};
+#define NR_VMX_PMU_MSR ARRAY_SIZE(vmx_pmu_msr_index)
+
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);

@@ -2272,6 +2282,14 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
 	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));

+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, NR_VMX_PMU_MSR);
+	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_guest_load_store));
+	
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, NR_VMX_PMU_MSR);
+	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_guest_load_store));
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, NR_VMX_PMU_MSR);
+	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_host_load_store));
+
 	if (cpu_has_vmx_msr_bitmap())
 		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));

@@ -2340,9 +2358,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)

 	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
 	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
-	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

 	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
 	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -3600,9 +3615,34 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 #define Q "l"
 #endif

+static void guest_set_apic(void *info)	/* program LVTPC for NMI delivery; info is unused */
+{
+    unsigned int v;
+
+    v = apic_read(APIC_LVTERR);	/* remember the current LVTERR value */
+    apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);	/* mask LVTERR while LVTPC is rewritten */
+    apic_write(APIC_LVTPC, APIC_DM_NMI);	/* value written carries only the NMI delivery mode */
+    apic_write(APIC_LVTERR, v);	/* put the saved LVTERR value back */
+}
+
+/* Refresh the VM-exit MSR-load area from the current host PMU MSRs.
+ * Every entry spans four u32 slots: the MSR index goes in slot 0 and
+ * the 64-bit value in slots 2-3; slot 1 is not written here. */
+static void save_host_msrs(struct vcpu_vmx *vmx)
+{
+	u32 *entry = (u32 *)vmx->msr_host_load_store;
+	int i;
+
+	for (i = 0; i < NR_VMX_PMU_MSR; ++i, entry += 4) {
+		entry[0] = vmx_pmu_msr_index[i];
+		rdmsrl(vmx_pmu_msr_index[i], *((u64 *)(entry + 2)));
+	}
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+    int cpu = raw_smp_processor_id();

 	if (enable_ept && is_paging(vcpu)) {
 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -3639,6 +3679,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->arch.switch_db_regs)
 		set_debugreg(vcpu->arch.dr6, 6);

+    /* record the exited vcpu */
+    per_cpu(cur_exit_vcpu, cpu) = vcpu;
+
+    /* The guest counters are reloaded by the hardware later. */
+    save_host_msrs(vmx);
+
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -3750,6 +3796,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vmx->launched = 1;

 	vmx_complete_interrupts(vmx);
+
+    /* always clear LVTPC bit */
+    guest_set_apic(NULL);
+
 }

 #undef R
@@ -3766,6 +3816,59 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 	}
 }

+/*
+ * Allocate the host and guest MSR load/store pages of @vcpu and seed both
+ * with the current host values of the PMU MSRs.
+ * Returns 0 on success, -ENOMEM if either page cannot be allocated (in
+ * which case no page is left allocated).
+ */
+static int vmx_create_vpmu_msrs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 *p;
+	int i;
+
+	vmx->msr_host_load_store = (unsigned long *)__get_free_page(GFP_KERNEL);
+	/* Stop here on failure; going on would memset() a NULL pointer. */
+	if (!vmx->msr_host_load_store)
+		return -ENOMEM;
+
+	vmx->msr_guest_load_store = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx->msr_guest_load_store) {
+		/* Undo the first allocation so nothing is left behind. */
+		free_page((unsigned long)vmx->msr_host_load_store);
+		vmx->msr_host_load_store = NULL;
+		return -ENOMEM;
+	}
+
+	memset(vmx->msr_host_load_store, 0x00, PAGE_SIZE);
+	memset(vmx->msr_guest_load_store, 0x00, PAGE_SIZE);
+
+	/* Entry layout: u32 MSR index, u32 left zero, then the u64 value. */
+	p = (u32 *)vmx->msr_host_load_store;
+	for (i = 0; i < NR_VMX_PMU_MSR; ++i, p += 4) {
+		p[0] = vmx_pmu_msr_index[i];
+		rdmsrl(vmx_pmu_msr_index[i], *((u64 *)(p + 2)));
+	}
+
+	/* NOTE(review): the guest area is seeded with host values, too. */
+	p = (u32 *)vmx->msr_guest_load_store;
+	for (i = 0; i < NR_VMX_PMU_MSR; ++i, p += 4) {
+		p[0] = vmx_pmu_msr_index[i];
+		rdmsrl(vmx_pmu_msr_index[i], *((u64 *)(p + 2)));
+	}
+
+	return 0;
+}
+
+/* Give back both MSR load/store pages owned by this vcpu
+ * (the guest page first, then the host page). */
+static void vmx_free_vpmu_msrs(struct kvm_vcpu *vcpu)
+{
+	free_page((unsigned long)to_vmx(vcpu)->msr_guest_load_store);
+	free_page((unsigned long)to_vmx(vcpu)->msr_host_load_store);
+}
+
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3777,6 +3880,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->host_msrs);
 	kfree(vmx->guest_msrs);
+    vmx_free_vpmu_msrs(vcpu);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
@@ -3812,6 +3916,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)

 	vmcs_clear(vmx->vmcs);

+    if (vmx_create_vpmu_msrs(&vmx->vcpu))
+        goto free_vmcs;
+
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	err = vmx_vcpu_setup(vmx);
@@ -3985,6 +4092,33 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.gb_page_enable = vmx_gb_page_enable,
 };

+/*
+ * Die-notifier hook: treat an NMI as a guest counter overflow and flag the
+ * vcpu last entered on this CPU; vcpu_enter_guest() turns it into a guest NMI.
+ */
+static int vmx_vcpu_nmi_notify(struct notifier_block *self,
+                   unsigned long val, void *data)
+{
+	struct kvm_vcpu *vcpu = per_cpu(cur_exit_vcpu, raw_smp_processor_id());
+
+	if (val != DIE_NMI && val != DIE_NMI_IPI)
+		return NOTIFY_DONE;
+	/* No vcpu has run on this CPU yet: not ours, let other handlers see it. */
+	if (!vcpu)
+		return NOTIFY_DONE;
+
+	guest_set_apic(NULL);	/* program LVTPC again before returning */
+	vcpu->cntr_overflow = 1;
+	vcpu->nmi_nr++;
+	return NOTIFY_STOP;	/* consume the NMI */
+}
+
+static struct notifier_block vmx_vcpu_nb = {	/* passed to register_die_notifier() in vmx_init() */
+        .notifier_call = vmx_vcpu_nmi_notify,	/* callback invoked for die-chain events */
+        .next = NULL,
+        .priority = 3	/* NOTE(review): rationale for 3 is not shown here -- confirm */
+};
+
 static int __init vmx_init(void)
 {
 	int r;
@@ -4036,6 +4170,17 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);

+	vmx_disable_intercept_for_msr(MSR_P6_PERFCTR0, false);
+	vmx_disable_intercept_for_msr(MSR_P6_PERFCTR1, false);
+	vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL0, false);
+	vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL1, false);
+
+    if (register_die_notifier(&vmx_vcpu_nb)) {
+        printk(KERN_ALERT "[hw_vpmu]: Register NMI handler failed..\n");
+    } else {
+        printk(KERN_ALERT "[hw_vpmu]: Register NMI handler succeeded..\n");
+    }
+
 	if (enable_ept) {
 		bypass_guest_pf = 0;
 		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
@@ -4071,6 +4216,9 @@ static void __exit vmx_exit(void)
 	free_page((unsigned long)vmx_io_bitmap_b);
 	free_page((unsigned long)vmx_io_bitmap_a);

+    unregister_die_notifier(&vmx_vcpu_nb);
+	printk(KERN_ALERT "[hw_vpmu]: Remove NMI handler module..\n");
+
 	kvm_exit();
 }

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae07d26..1abedb4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3615,6 +3615,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto out;
 	}

+    if (vcpu->cntr_overflow) {
+            vcpu->arch.nmi_pending = 1;
+            vcpu->cntr_overflow = 0;
+    }
+
 	inject_pending_event(vcpu, kvm_run);

 	/* enable NMI/IRQ window open exits if needed */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7bbb5d..96d63d1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -99,6 +99,9 @@ struct kvm_vcpu {
 	gpa_t mmio_phys_addr;
 #endif

+    int cntr_overflow;
+    int nmi_nr;
+
 	struct kvm_vcpu_arch arch;
 };

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: Some Code for Performance Profiling
  2010-03-31 16:53 Some Code for Performance Profiling Jiaqing Du
@ 2010-04-05  8:34 ` Avi Kivity
  2010-04-07 19:23   ` Jiaqing Du
  0 siblings, 1 reply; 4+ messages in thread
From: Avi Kivity @ 2010-04-05  8:34 UTC (permalink / raw)
  To: Jiaqing Du; +Cc: kvm, Nipun sehrawat

On 03/31/2010 07:53 PM, Jiaqing Du wrote:
> Hi,
>
> We have some code about performance profiling in KVM. They are outputs
> of a school project. Previous discussions in KVM, Perfmon2, and Xen
> mailing lists helped us a lot. The code are NOT in a good shape and
> are only used to demonstrated the feasibility of doing performance
> profiling in KVM. Feel free to use it if you want.
>    

Performance monitoring is an important feature for kvm.  Is there any 
chance you can work at getting it into good shape?

> We categorize performance profiling in a virtualized environment into
> two types: *guest-wide profiling* and *system-wide profiling*. For
> guest-wide profiling, only the guest is profiled. KVM virtualizes the
> PMU and the user runs a profiler directly in the guest. It requires no
> modifications to the guest OS and the profiler running in the guest.
> For system-wide profiling, both KVM and the guest OS are profiled. The
> results are similar to what XenOprof outputs. In this case, one
> profiler running in the host and one profiler running in the guest.
> Still it requires no modifications to the guest and the profiler
> running in it.
>    

Can your implementation support both simultaneously?

> For guest-wide profiling, there are two possible places to save and
> restore the related MSRs. One is where the CPU switches between guest
> mode and host mode. We call this *CPU-switch*. Profiling with this
> enabled reflects how the guest behaves on the physical CPU, plus other
> virtualized, not emulated, devices. The other place is where the CPU
> switches between the KVM context and others. Here KVM context means
> the CPU is executing guest code or KVM code, both kernel space and
> user space. We call this *domain-switch*. Profiling with this enabled
> discloses how the guest behaves on both the physical CPU and KVM.
> (Some emulated operations are really expensive in a virtualized
> environment.)
>    

Which method do you use?  Or do you support both?

Note disclosing host pmu data to the guest is sometimes a security issue.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Some Code for Performance Profiling
  2010-04-05  8:34 ` Avi Kivity
@ 2010-04-07 19:23   ` Jiaqing Du
  2010-04-07 19:30     ` Avi Kivity
  0 siblings, 1 reply; 4+ messages in thread
From: Jiaqing Du @ 2010-04-07 19:23 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, Nipun sehrawat

2010/4/5 Avi Kivity <avi@redhat.com>:
> On 03/31/2010 07:53 PM, Jiaqing Du wrote:
>>
>> Hi,
>>
>> We have some code about performance profiling in KVM. They are outputs
>> of a school project. Previous discussions in KVM, Perfmon2, and Xen
>> mailing lists helped us a lot. The code are NOT in a good shape and
>> are only used to demonstrated the feasibility of doing performance
>> profiling in KVM. Feel free to use it if you want.
>>
>
> Performance monitoring is an important feature for kvm.  Is there any chance
> you can work at getting it into good shape?

I have been following the discussions about PMU virtualization on the
list for a while. Exporting a proper interface, i.e., guest-visible
MSRs and supported events, to the guest across a large number of physical
CPUs from different vendors, families, and models is the major
problem. KVM also currently supports almost a dozen different
types of virtual CPUs. I will think about it and try to come up with
something more general.

>
>> We categorize performance profiling in a virtualized environment into
>> two types: *guest-wide profiling* and *system-wide profiling*. For
>> guest-wide profiling, only the guest is profiled. KVM virtualizes the
>> PMU and the user runs a profiler directly in the guest. It requires no
>> modifications to the guest OS and the profiler running in the guest.
>> For system-wide profiling, both KVM and the guest OS are profiled. The
>> results are similar to what XenOprof outputs. In this case, one
>> profiler running in the host and one profiler running in the guest.
>> Still it requires no modifications to the guest and the profiler
>> running in it.
>>
>
> Can your implementation support both simultaneously?

What do you mean by "simultaneously"? With my implementation, you do either
guest-wide profiling or system-wide profiling. They are achieved
through different patches. Actually, the result of guest-wide
profiling is a subset of that of system-wide profiling.

>
>> For guest-wide profiling, there are two possible places to save and
>> restore the related MSRs. One is where the CPU switches between guest
>> mode and host mode. We call this *CPU-switch*. Profiling with this
>> enabled reflects how the guest behaves on the physical CPU, plus other
>> virtualized, not emulated, devices. The other place is where the CPU
>> switches between the KVM context and others. Here KVM context means
>> the CPU is executing guest code or KVM code, both kernel space and
>> user space. We call this *domain-switch*. Profiling with this enabled
>> discloses how the guest behaves on both the physical CPU and KVM.
>> (Some emulated operations are really expensive in a virtualized
>> environment.)
>>
>
> Which method do you use?  Or do you support both?

I posted two patches in my previous email. One is for CPU-switch, and
the other is for domain-switch.

>
> Note disclosing host pmu data to the guest is sometimes a security issue.
>

For instance?

> --
> Do not meddle in the internals of kernels, for they are subtle and quick to
> panic.
>
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Some Code for Performance Profiling
  2010-04-07 19:23   ` Jiaqing Du
@ 2010-04-07 19:30     ` Avi Kivity
  0 siblings, 0 replies; 4+ messages in thread
From: Avi Kivity @ 2010-04-07 19:30 UTC (permalink / raw)
  To: Jiaqing Du; +Cc: kvm, Nipun sehrawat

On 04/07/2010 10:23 PM, Jiaqing Du wrote:
>
>> Can your implementation support both simultaneously?
>>      
> What do you mean "simultaneously"? With my implementation, you either
> do guest-wide profiling or system-wide profiling. They are achieved
> through different patches. Actually, the result of guest-wide
> profiling is a subset of system-wide profiling.
>
>    

A guest admin monitors the performance of their guest via a vpmu.  
Meanwhile the host admin monitors the performance of the host (including 
all guests) using the host pmu.  Given that the host pmu and the vpmu 
may select different counters, it is difficult to support both 
simultaneously.

>>> For guest-wide profiling, there are two possible places to save and
>>> restore the related MSRs. One is where the CPU switches between guest
>>> mode and host mode. We call this *CPU-switch*. Profiling with this
>>> enabled reflects how the guest behaves on the physical CPU, plus other
>>> virtualized, not emulated, devices. The other place is where the CPU
>>> switches between the KVM context and others. Here KVM context means
>>> the CPU is executing guest code or KVM code, both kernel space and
>>> user space. We call this *domain-switch*. Profiling with this enabled
>>> discloses how the guest behaves on both the physical CPU and KVM.
>>> (Some emulated operations are really expensive in a virtualized
>>> environment.)
>>>
>>>        
>> Which method do you use?  Or do you support both?
>>      
> I post two patches in my previous email. One is for CPU-switch, and
> the other is for domain-switch.
>
>    

I see.  I'm not sure I know which one is better!

>> Note disclosing host pmu data to the guest is sometimes a security issue.
>>
>>      
> For instance?
>    

The standard example is hyperthreading where the memory bus unit is 
shared among two logical processors.  A guest sampling a vcpu on one 
thread can gain information about what is happening on the other - the 
number of bus transactions the other thread has issued.  This can be 
used to establish a communication channel between two guests that 
shouldn't be communicating, or to eavesdrop on another guest.  A similar 
problem happens with multicores.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-04-07 19:30 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-31 16:53 Some Code for Performance Profiling Jiaqing Du
2010-04-05  8:34 ` Avi Kivity
2010-04-07 19:23   ` Jiaqing Du
2010-04-07 19:30     ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox