* [PATCH v5 0/3] KVM: x86: tracepoint updates
From: Maxim Levitsky @ 2024-09-10 20:03 UTC
  To: kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Paolo Bonzini,
	Ingo Molnar, Sean Christopherson, H. Peter Anvin, linux-kernel,
	Maxim Levitsky

This patch series adds selected information to the KVM tracepoints to
make it easier to gather insights about running nested guests.

This patch series was developed together with a new x86 performance
analysis tool that I wrote recently (https://gitlab.com/maximlevitsky/kvmon),
which aims to be a better kvm_stat and allows you to see at a glance
what is happening in a VM, including nesting.

V5: rebased on top of recent changes

Best regards,
	Maxim Levitsky

Maxim Levitsky (3):
  KVM: x86: add more information to the kvm_entry tracepoint
  KVM: x86: add information about pending requests to kvm_exit
    tracepoint
  KVM: x86: add new nested vmexit tracepoints

 arch/x86/include/asm/kvm-x86-ops.h |   1 +
 arch/x86/include/asm/kvm_host.h    |   5 +-
 arch/x86/kvm/svm/nested.c          |  22 ++++++
 arch/x86/kvm/svm/svm.c             |  17 +++++
 arch/x86/kvm/trace.h               | 107 ++++++++++++++++++++++++++---
 arch/x86/kvm/vmx/main.c            |   1 +
 arch/x86/kvm/vmx/nested.c          |  27 ++++++++
 arch/x86/kvm/vmx/vmx.c             |  11 +++
 arch/x86/kvm/vmx/x86_ops.h         |   4 ++
 arch/x86/kvm/x86.c                 |   3 +
 10 files changed, 189 insertions(+), 9 deletions(-)

-- 
2.26.3

* [PATCH v5 1/3] KVM: x86: add more information to the kvm_entry tracepoint
From: Maxim Levitsky @ 2024-09-10 20:03 UTC
  To: kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Paolo Bonzini,
	Ingo Molnar, Sean Christopherson, H. Peter Anvin, linux-kernel,
	Maxim Levitsky

Add VMX/SVM specific interrupt injection info to the VM entry tracepoint,
and a flag showing whether the vCPU is running in guest (nested) mode.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
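For reviewers: the inj_info value reported by this tracepoint uses the
event layout shared by SVM's EVENTINJ field and the VMX VM-entry
interruption-information field.  A minimal decoder is sketched below for
illustration only (the helper is hypothetical; bit positions per the
APM/SDM):

/*
 * Illustrative sketch, not part of this series: bits 7:0 = vector,
 * bits 10:8 = event type, bit 11 = "error code valid", bit 31 = "valid".
 */
static void decode_inj_info(u32 inj_info, u32 inj_info_err)
{
	if (!(inj_info & BIT(31)))	/* no event being injected */
		return;

	pr_info("vector %u type %u\n",
		inj_info & 0xff, (inj_info >> 8) & 0x7);

	if (inj_info & BIT(11))		/* error code valid */
		pr_info("error code 0x%x\n", inj_info_err);
}
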
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  5 ++++-
 arch/x86/kvm/svm/svm.c             | 17 +++++++++++++++++
 arch/x86/kvm/trace.h               | 17 ++++++++++++++---
 arch/x86/kvm/vmx/main.c            |  1 +
 arch/x86/kvm/vmx/vmx.c             | 11 +++++++++++
 arch/x86/kvm/vmx/x86_ops.h         |  4 ++++
 7 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 03b7e13f15bbd..af5c4d55d47bc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -99,6 +99,7 @@ KVM_X86_OP(get_l2_tsc_multiplier)
 KVM_X86_OP(write_tsc_offset)
 KVM_X86_OP(write_tsc_multiplier)
 KVM_X86_OP(get_exit_info)
+KVM_X86_OP(get_entry_info)
 KVM_X86_OP(check_intercept)
 KVM_X86_OP(handle_exit_irqoff)
 KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 52443ccda320f..8118f75a8a35d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1756,13 +1756,16 @@ struct kvm_x86_ops {
 	void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
 
 	/*
-	 * Retrieve somewhat arbitrary exit information.  Intended to
+	 * Retrieve somewhat arbitrary exit/entry information.  Intended to
 	 * be used only from within tracepoints or error paths.
 	 */
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
 			      u64 *info1, u64 *info2,
 			      u32 *exit_int_info, u32 *exit_int_info_err_code);
 
+	void (*get_entry_info)(struct kvm_vcpu *vcpu,
+				u32 *inj_info, u32 *inj_info_error_code);
+
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9a0506ef87dfb..485f3c2826312 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3525,6 +3525,22 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
 		*error_code = 0;
 }
 
+static void svm_get_entry_info(struct kvm_vcpu *vcpu,
+			u32 *inj_info,
+			u32 *inj_info_error_code)
+{
+	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+	*inj_info = control->event_inj;
+
+	if ((*inj_info & SVM_EXITINTINFO_VALID) &&
+	    (*inj_info & SVM_EXITINTINFO_VALID_ERR))
+		*inj_info_error_code = control->event_inj_err;
+	else
+		*inj_info_error_code = 0;
+
+}
+
 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -5057,6 +5073,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
 
 	.get_exit_info = svm_get_exit_info,
+	.get_entry_info = svm_get_entry_info,
 
 	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index d3aeffd6ae753..b4c014b1aa9a1 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -22,16 +22,27 @@ TRACE_EVENT(kvm_entry,
 		__field(	unsigned int,	vcpu_id		)
 		__field(	unsigned long,	rip		)
 		__field(	bool,		immediate_exit	)
+		__field(	u32,		inj_info	)
+		__field(	u32,		inj_info_err	)
+		__field(	bool,		guest_mode	)
 	),
 
 	TP_fast_assign(
 		__entry->vcpu_id        = vcpu->vcpu_id;
 		__entry->rip		= kvm_rip_read(vcpu);
-		__entry->immediate_exit	= force_immediate_exit;
+		__entry->immediate_exit = force_immediate_exit;
+		__entry->guest_mode     = is_guest_mode(vcpu);
+
+		static_call(kvm_x86_get_entry_info)(vcpu,
+					  &__entry->inj_info,
+					  &__entry->inj_info_err);
 	),
 
-	TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
-		  __entry->immediate_exit ? "[immediate exit]" : "")
+	TP_printk("vcpu %u, rip 0x%lx inj 0x%08x inj_error_code 0x%08x%s%s",
+		  __entry->vcpu_id, __entry->rip,
+		  __entry->inj_info, __entry->inj_info_err,
+		  __entry->immediate_exit ? "[immediate exit]" : "",
+		  __entry->guest_mode ? "[guest]" : "")
 );
 
 /*
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 7e2e78a142574..769a7ca06f59b 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -108,6 +108,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.get_mt_mask = vmx_get_mt_mask,
 
 	.get_exit_info = vmx_get_exit_info,
+	.get_entry_info = vmx_get_entry_info,
 
 	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 89682832dded7..af18174cc4a20 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6180,6 +6180,17 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
 	}
 }
 
+void vmx_get_entry_info(struct kvm_vcpu *vcpu,
+			      u32 *inj_info,
+			      u32 *inj_info_error_code)
+{
+	*inj_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	if (is_exception_with_error_code(*inj_info))
+		*inj_info_error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	else
+		*inj_info_error_code = 0;
+}
+
 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
 {
 	if (vmx->pml_pg) {
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index b6a7cfc6ae317..124425997ec15 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -104,8 +104,12 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+
 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
 		       u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *inj_info,
+			   u32 *inj_info_error_code);
+
 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
-- 
2.26.3

* [PATCH v5 2/3] KVM: x86: add information about pending requests to kvm_exit tracepoint
From: Maxim Levitsky @ 2024-09-10 20:03 UTC
  To: kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Paolo Bonzini,
	Ingo Molnar, Sean Christopherson, H. Peter Anvin, linux-kernel,
	Maxim Levitsky

This makes it possible to gather information on how often KVM interrupts
vCPUs due to specific pending requests.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
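For reviewers: a sketch of how a trace consumer could decode the new
requests field.  The bit numbers map to KVM's internal KVM_REQ_* constants,
which are version specific and must be resolved against the running
kernel's sources; the helper below is illustrative only:

/* Illustrative sketch: walk the traced 64-bit pending-requests mask. */
static void decode_requests(u64 requests)
{
	unsigned int bit;

	for (bit = 0; bit < 64; bit++)
		if (requests & BIT_ULL(bit))
			pr_info("request bit %u pending\n", bit);
}
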
 arch/x86/kvm/trace.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index b4c014b1aa9a1..5a5b7757e8456 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -319,12 +319,14 @@ TRACE_EVENT(name,							     \
 		__field(	u32,	        intr_info	)	     \
 		__field(	u32,	        error_code	)	     \
 		__field(	unsigned int,	vcpu_id         )	     \
+		__field(	u64,		requests        )	     \
 	),								     \
 									     \
 	TP_fast_assign(							     \
 		__entry->guest_rip	= kvm_rip_read(vcpu);		     \
 		__entry->isa            = isa;				     \
 		__entry->vcpu_id        = vcpu->vcpu_id;		     \
+		__entry->requests       = READ_ONCE(vcpu->requests);	     \
 		kvm_x86_call(get_exit_info)(vcpu,			     \
 					    &__entry->exit_reason,	     \
 					    &__entry->info1,		     \
@@ -334,11 +336,13 @@ TRACE_EVENT(name,							     \
 	),								     \
 									     \
 	TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx "	     \
-		  "info2 0x%016llx intr_info 0x%08x error_code 0x%08x",	     \
+		  "info2 0x%016llx intr_info 0x%08x error_code 0x%08x "      \
+		  "requests 0x%016llx",					     \
 		  __entry->vcpu_id,					     \
 		  kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \
 		  __entry->guest_rip, __entry->info1, __entry->info2,	     \
-		  __entry->intr_info, __entry->error_code)		     \
+		  __entry->intr_info, __entry->error_code, 		     \
+		  __entry->requests)					     \
 )
 
 /*
-- 
2.26.3

* [PATCH v5 3/3] KVM: x86: add new nested vmexit tracepoints
From: Maxim Levitsky @ 2024-09-10 20:03 UTC
  To: kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Paolo Bonzini,
	Ingo Molnar, Sean Christopherson, H. Peter Anvin, linux-kernel,
	Maxim Levitsky

Add 3 new tracepoints for nested VM exits, intended to capture extra
information and provide insight into nested guest behavior.

The new tracepoints are:

- kvm_nested_msr
- kvm_nested_hypercall

These tracepoints capture the extra register state needed to know which
MSR was accessed or which hypercall was made.

- kvm_nested_page_fault

This tracepoint captures the host page fault error code that caused the
nested page fault.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
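For reviewers: the traced MSR data value is reassembled from the
RDMSR/WRMSR register convention, where the 64-bit value is split across
EDX (high half) and EAX (low half).  A helper expressing the same
computation used in the SVM and VMX hunks below (illustrative only):

/*
 * Illustrative sketch: mirrors the
 * (rax & 0xFFFFFFFFull) | ((u64)rdx << 32) expressions in this patch.
 */
static inline u64 msr_data_from_regs(u64 rax, u64 rdx)
{
	return (rax & 0xFFFFFFFFull) | ((u64)rdx << 32);
}
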
 arch/x86/kvm/svm/nested.c | 22 +++++++++++
 arch/x86/kvm/trace.h      | 82 +++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/nested.c | 27 +++++++++++++
 arch/x86/kvm/x86.c        |  3 ++
 4 files changed, 131 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6f704c1037e51..2020307481553 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -38,6 +38,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
+	u64 host_error_code = vmcb->control.exit_info_1;
+
 
 	if (vmcb->control.exit_code != SVM_EXIT_NPF) {
 		/*
@@ -48,11 +50,15 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
 		vmcb->control.exit_code_hi = 0;
 		vmcb->control.exit_info_1 = (1ULL << 32);
 		vmcb->control.exit_info_2 = fault->address;
+		host_error_code = 0;
 	}
 
 	vmcb->control.exit_info_1 &= ~0xffffffffULL;
 	vmcb->control.exit_info_1 |= fault->error_code;
 
+	trace_kvm_nested_page_fault(fault->address, host_error_code,
+				    fault->error_code);
+
 	nested_svm_vmexit(svm);
 }
 
@@ -1126,6 +1132,22 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 				       vmcb12->control.exit_int_info_err,
 				       KVM_ISA_SVM);
 
+	/* Collect some info about nested VM exits */
+	switch (vmcb12->control.exit_code) {
+	case SVM_EXIT_MSR:
+		trace_kvm_nested_msr(vmcb12->control.exit_info_1 == 1,
+				     kvm_rcx_read(vcpu),
+				     (vmcb12->save.rax & 0xFFFFFFFFull) |
+				     (((u64)kvm_rdx_read(vcpu) << 32)));
+		break;
+	case SVM_EXIT_VMMCALL:
+		trace_kvm_nested_hypercall(vmcb12->save.rax,
+					   kvm_rbx_read(vcpu),
+					   kvm_rcx_read(vcpu),
+					   kvm_rdx_read(vcpu));
+		break;
+	}
+
 	kvm_vcpu_unmap(vcpu, &map, true);
 
 	nested_svm_transition_tlb_flush(vcpu);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 5a5b7757e8456..6074b4f85d5e2 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -613,7 +613,7 @@ TRACE_EVENT(kvm_pv_eoi,
 );
 
 /*
- * Tracepoint for nested VMRUN
+ * Tracepoint for nested VMRUN/VMENTER
  */
 TRACE_EVENT(kvm_nested_vmenter,
 	    TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
@@ -746,8 +746,84 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
 	TP_printk("rip: 0x%016llx", __entry->rip)
 );
 
+
 /*
- * Tracepoint for nested #vmexit because of interrupt pending
+ * Tracepoint for nested guest MSR access.
+ */
+TRACE_EVENT(kvm_nested_msr,
+	TP_PROTO(bool write, u32 ecx, u64 data),
+	TP_ARGS(write, ecx, data),
+
+	TP_STRUCT__entry(
+		__field(	bool,		write		)
+		__field(	u32,		ecx		)
+		__field(	u64,		data		)
+	),
+
+	TP_fast_assign(
+		__entry->write		= write;
+		__entry->ecx		= ecx;
+		__entry->data		= data;
+	),
+
+	TP_printk("msr_%s %x = 0x%llx",
+		  __entry->write ? "write" : "read",
+		  __entry->ecx, __entry->data)
+);
+
+/*
+ * Tracepoint for nested hypercalls, capturing generic info about the
+ * hypercall
+ */
+
+TRACE_EVENT(kvm_nested_hypercall,
+	TP_PROTO(u64 rax, u64 rbx, u64 rcx, u64 rdx),
+	TP_ARGS(rax, rbx, rcx, rdx),
+
+	TP_STRUCT__entry(
+		__field(	u64,	rax	)
+		__field(	u64,	rbx	)
+		__field(	u64,	rcx	)
+		__field(	u64,	rdx	)
+	),
+
+	TP_fast_assign(
+		__entry->rax		= rax;
+		__entry->rbx		= rbx;
+		__entry->rcx		= rcx;
+		__entry->rdx		= rdx;
+	),
+
+	TP_printk("rax 0x%llx rbx 0x%llx rcx 0x%llx rdx 0x%llx",
+		 __entry->rax, __entry->rbx, __entry->rcx,  __entry->rdx)
+);
+
+
+TRACE_EVENT(kvm_nested_page_fault,
+	TP_PROTO(u64 gpa, u64 host_error_code, u64 guest_error_code),
+	TP_ARGS(gpa, host_error_code, guest_error_code),
+
+	TP_STRUCT__entry(
+		__field(	u64,		gpa			)
+		__field(	u64,		host_error_code		)
+		__field(	u64,		guest_error_code	)
+	),
+
+	TP_fast_assign(
+		__entry->gpa			= gpa;
+		__entry->host_error_code	= host_error_code;
+		__entry->guest_error_code	= guest_error_code;
+	),
+
+	TP_printk("gpa 0x%llx host err 0x%llx guest err 0x%llx",
+		  __entry->gpa,
+		  __entry->host_error_code,
+		  __entry->guest_error_code)
+);
+
+
+/*
+ * Tracepoint for invlpga
  */
 TRACE_EVENT(kvm_invlpga,
 	    TP_PROTO(__u64 rip, unsigned int asid, u64 address),
@@ -770,7 +846,7 @@ TRACE_EVENT(kvm_invlpga,
 );
 
 /*
- * Tracepoint for nested #vmexit because of interrupt pending
+ * Tracepoint for skinit
  */
 TRACE_EVENT(kvm_skinit,
 	    TP_PROTO(__u64 rip, __u32 slb),
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 2392a7ef254df..3881a02694fc2 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -454,6 +454,16 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 		 */
 		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
 					   fault->address);
+
+		/*
+		 * vmx_get_exit_qual() returns the original exit qualification,
+		 * before it is overridden with the exit qualification that is
+		 * about to be injected into the guest.
+		 */
+
+		trace_kvm_nested_page_fault(fault->address,
+				vmx_get_exit_qual(vcpu),
+				exit_qualification);
 	}
 
 	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
@@ -4985,6 +4995,23 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 						       vmcs12->vm_exit_intr_error_code,
 						       KVM_ISA_VMX);
 
+		switch ((u16)vmcs12->vm_exit_reason) {
+		case EXIT_REASON_MSR_READ:
+		case EXIT_REASON_MSR_WRITE:
+			trace_kvm_nested_msr(vmcs12->vm_exit_reason == EXIT_REASON_MSR_WRITE,
+					     kvm_rcx_read(vcpu),
+					     (kvm_rax_read(vcpu) & 0xFFFFFFFFull) |
+					     (((u64)kvm_rdx_read(vcpu)) << 32));
+			break;
+		case EXIT_REASON_VMCALL:
+			trace_kvm_nested_hypercall(kvm_rax_read(vcpu),
+						   kvm_rbx_read(vcpu),
+						   kvm_rcx_read(vcpu),
+						   kvm_rdx_read(vcpu));
+			break;
+
+		}
+
 		load_vmcs12_host_state(vcpu, vmcs12);
 
 		return;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f72e5d89e942d..cb01cf2ad6ac9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -14032,6 +14032,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_hypercall);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_msr);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
-- 
2.26.3

* Re: [PATCH v5 0/3] KVM: x86: tracepoint updates
From: Maxim Levitsky @ 2024-10-30 21:21 UTC
  To: kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Paolo Bonzini,
	Ingo Molnar, Sean Christopherson, H. Peter Anvin, linux-kernel

On Tue, 2024-09-10 at 16:03 -0400, Maxim Levitsky wrote:
> This patch series adds selected information to the KVM tracepoints to
> make it easier to gather insights about running nested guests.
> 
> [...]

Hi,
A very gentle ping on this patch series.

Best regards,
	Maxim Levitsky

* Re: [PATCH v5 0/3] KVM: x86: tracepoint updates
From: Maxim Levitsky @ 2024-11-22  1:04 UTC
  To: kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Paolo Bonzini,
	Ingo Molnar, Sean Christopherson, H. Peter Anvin, linux-kernel

On Wed, 2024-10-30 at 17:21 -0400, Maxim Levitsky wrote:
> On Tue, 2024-09-10 at 16:03 -0400, Maxim Levitsky wrote:
> > This patch series adds selected information to the KVM tracepoints to
> > make it easier to gather insights about running nested guests.
> > 
> > [...]
> 
> Hi,
> A very gentle ping on this patch series.
> 
> Best regards,
> 	Maxim Levitsky
> 
Another kind ping on this patch series.

Best regards,
	Maxim Levitsky

* Re: [PATCH v5 1/3] KVM: x86: add more information to the kvm_entry tracepoint
From: Sean Christopherson @ 2024-12-18 20:53 UTC
  To: Maxim Levitsky
  Cc: kvm, x86, Dave Hansen, Thomas Gleixner, Borislav Petkov,
	Paolo Bonzini, Ingo Molnar, H. Peter Anvin, linux-kernel

On Tue, Sep 10, 2024, Maxim Levitsky wrote:
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 52443ccda320f..8118f75a8a35d 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1756,13 +1756,16 @@ struct kvm_x86_ops {
>  	void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
>  
>  	/*
> -	 * Retrieve somewhat arbitrary exit information.  Intended to
> +	 * Retrieve somewhat arbitrary exit/entry information.  Intended to
>  	 * be used only from within tracepoints or error paths.
>  	 */
>  	void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
>  			      u64 *info1, u64 *info2,
>  			      u32 *exit_int_info, u32 *exit_int_info_err_code);
>  
> +	void (*get_entry_info)(struct kvm_vcpu *vcpu,
> +				u32 *inj_info, u32 *inj_info_error_code);

I vote to use the same names as the kvm_exit tracepoint, i.e. intr_info and
error_code throughout.  While I agree that capturing the "injection" aspect is
nice to have, if a user doesn't know that the fields are related to event/intr
injection, I don't think "inj" is going to help them connect the dots.

On the other hand, for cases where an event is re-injected, using the same names as
kvm_exit provides a direct connection between the event that was being vectored
at the time of exit, and the subsequent re-injection of the same event.

>  	int (*check_intercept)(struct kvm_vcpu *vcpu,
>  			       struct x86_instruction_info *info,
>  			       enum x86_intercept_stage stage,

...

>  	TP_fast_assign(
>  		__entry->vcpu_id        = vcpu->vcpu_id;
>  		__entry->rip		= kvm_rip_read(vcpu);
> -		__entry->immediate_exit	= force_immediate_exit;
> +		__entry->immediate_exit = force_immediate_exit;
> +		__entry->guest_mode     = is_guest_mode(vcpu);
> +
> +		static_call(kvm_x86_get_entry_info)(vcpu,
> +					  &__entry->inj_info,
> +					  &__entry->inj_info_err);
>  	),
>  
> -	TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
> -		  __entry->immediate_exit ? "[immediate exit]" : "")
> +	TP_printk("vcpu %u, rip 0x%lx inj 0x%08x inj_error_code 0x%08x%s%s",
> +		  __entry->vcpu_id, __entry->rip,
> +		  __entry->inj_info, __entry->inj_info_err,
> +		  __entry->immediate_exit ? "[immediate exit]" : "",
> +		  __entry->guest_mode ? "[guest]" : "")

I 100% agree kvm_entry should capture L1 vs. L2, but looking more closely, I
think we should make that change to both the entry and exit tracepoints, and
then maybe rename trace_kvm_nested_vmexit_inject() => trace_kvm_nested_vmexit().

Currently, trace_kvm_nested_vmexit() traces all exits from L2=>L0, which is rather
silly since it's trivial to capture L1 vs. L2 in kvm_exit.  I also find it to be
quite annoying since the vast, vast majority of the time I don't want to trace
*just* L2=>L0 exits.  And it's especially annoying because if I want to see both
L1 and L2 exits, the trace contains a double dose of L2 exits.

Last thought, what about always capturing where the transition is occurring?  E.g.
instead of tagging on "[guest]" at the end, something like this:

	TP_printk("vcpu %u => L%u rip 0x%lx intr_info 0x%08x error_code 0x%08x%s",
		  __entry->vcpu_id, 1 + __entry->guest_mode,
		  ...

and then in kvm_exit:

	TP_printk("vcpu %u <= L%u reason %s%s%s rip 0x%lx info1 0x%016llx "  \
		  "info2 0x%016llx intr_info 0x%08x error_code 0x%08x "      \
		  "requests 0x%016llx",					     \
		  __entry->vcpu_id, 1 + __entry->guest_mode,		     \


Or use "to" and "from" if the "=>" / "<=" is too cute and confusing.

For now, I'm going to omit the is_guest_mode() change purely to avoid churn if
we end up squashing the current trace_kvm_nested_vmexit() into trace_kvm_exit().

As I'm about to disappear for two weeks, I'm going to speculatively apply the
below so I don't delay the meat of this patch any more than I already have.
Please holler if you disagree with the intr_info+error_code terminology; I'm
definitely open to other names, though I do feel quite strongly that entry
and exit need to be consistent.  These are sitting at the head of "misc", so I
can fixup without much fuss.

---
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Tue, 10 Sep 2024 16:03:48 -0400
Subject: [PATCH 1/2] KVM: x86: Add interrupt injection information to the
 kvm_entry tracepoint

Add VMX/SVM specific interrupt injection info to the kvm_entry tracepoint.
As is done with kvm_exit, gather the information via a kvm_x86_ops hook
to avoid the moderately costly VMREADs on VMX when the tracepoint isn't
enabled.

Opportunistically rename the parameters in the get_exit_info()
declaration to match the names used by both SVM and VMX.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Link: https://lore.kernel.org/r/20240910200350.264245-2-mlevitsk@redhat.com
[sean: drop is_guest_mode() change, use intr_info/error_code for names]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  7 +++++--
 arch/x86/kvm/svm/svm.c             | 16 ++++++++++++++++
 arch/x86/kvm/trace.h               |  9 ++++++++-
 arch/x86/kvm/vmx/main.c            |  1 +
 arch/x86/kvm/vmx/vmx.c             |  9 +++++++++
 arch/x86/kvm/vmx/x86_ops.h         |  3 +++
 7 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 5aff7222e40f..8c04472829a0 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -100,6 +100,7 @@ KVM_X86_OP(get_l2_tsc_multiplier)
 KVM_X86_OP(write_tsc_offset)
 KVM_X86_OP(write_tsc_multiplier)
 KVM_X86_OP(get_exit_info)
+KVM_X86_OP(get_entry_info)
 KVM_X86_OP(check_intercept)
 KVM_X86_OP(handle_exit_irqoff)
 KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1a09ac99132c..c07d8318e9d8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1770,12 +1770,15 @@ struct kvm_x86_ops {
 	void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
 
 	/*
-	 * Retrieve somewhat arbitrary exit information.  Intended to
+	 * Retrieve somewhat arbitrary exit/entry information.  Intended to
 	 * be used only from within tracepoints or error paths.
 	 */
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
 			      u64 *info1, u64 *info2,
-			      u32 *exit_int_info, u32 *exit_int_info_err_code);
+			      u32 *intr_info, u32 *error_code);
+
+	void (*get_entry_info)(struct kvm_vcpu *vcpu,
+			       u32 *intr_info, u32 *error_code);
 
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 8fc2f4a97495..d06fe41a2de0 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3542,6 +3542,21 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
 		*error_code = 0;
 }
 
+static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info,
+			       u32 *error_code)
+{
+	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+	*intr_info = control->event_inj;
+
+	if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+	    (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+		*error_code = control->event_inj_err;
+	else
+		*error_code = 0;
+
+}
+
 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -5082,6 +5097,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
 
 	.get_exit_info = svm_get_exit_info,
+	.get_entry_info = svm_get_entry_info,
 
 	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index d3aeffd6ae75..c2edf4a36fad 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -22,15 +22,22 @@ TRACE_EVENT(kvm_entry,
 		__field(	unsigned int,	vcpu_id		)
 		__field(	unsigned long,	rip		)
 		__field(	bool,		immediate_exit	)
+		__field(	u32,		intr_info	)
+		__field(	u32,		error_code	)
 	),
 
 	TP_fast_assign(
 		__entry->vcpu_id        = vcpu->vcpu_id;
 		__entry->rip		= kvm_rip_read(vcpu);
 		__entry->immediate_exit	= force_immediate_exit;
+
+		kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info,
+					     &__entry->error_code);
 	),
 
-	TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
+	TP_printk("vcpu %u, rip 0x%lx intr_info 0x%08x error_code 0x%08x%s",
+		  __entry->vcpu_id, __entry->rip,
+		  __entry->intr_info, __entry->error_code,
 		  __entry->immediate_exit ? "[immediate exit]" : "")
 );
 
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 92d35cc6cd15..697e135ba0f3 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -111,6 +111,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.get_mt_mask = vmx_get_mt_mask,
 
 	.get_exit_info = vmx_get_exit_info,
+	.get_entry_info = vmx_get_entry_info,
 
 	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e93c48ff61c5..3fd6df782492 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6194,6 +6194,15 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
 	}
 }
 
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+	*intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	if (is_exception_with_error_code(*intr_info))
+		*error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	else
+		*error_code = 0;
+}
+
 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
 {
 	if (vmx->pml_pg) {
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index a55981c5216e..f7f65e81920b 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -104,8 +104,11 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+
 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
 		       u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code);
+
 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);

base-commit: 43651b98dd23e3d2d11f14964e98801ba58feccb
-- 

* Re: [PATCH v5 3/3] KVM: x86: add new nested vmexit tracepoints
From: Sean Christopherson @ 2024-12-18 21:14 UTC
  To: Maxim Levitsky
  Cc: kvm, x86, Dave Hansen, Thomas Gleixner, Borislav Petkov,
	Paolo Bonzini, Ingo Molnar, H. Peter Anvin, linux-kernel

On Tue, Sep 10, 2024, Maxim Levitsky wrote:
> Add 3 new tracepoints for nested VM exits, intended to capture extra
> information and provide insight into nested guest behavior.
> 
> The new tracepoints are:
> 
> - kvm_nested_msr
> - kvm_nested_hypercall

I 100% agree that not having register state in the exit tracepoints is obnoxious,
but I don't think we should add one-off tracepoints for the most annoying cases.
I would much prefer to figure out a way to capture register state in kvm_entry
and kvm_exit.  E.g. I've lost track of the number of times I've observed an MSR
exit without having trace_kvm_msr enabled.

One idea would be to capture E{A,B,C,D}X, which would cover MSRs, CPUID, and
most hypercalls.  And then we might even be able to drop the dedicated MSR and
CPUID tracepoints (not sure if that's a good idea).

Side topic, arch/s390/kvm/trace.h has the concept of COMMON information that is
captured for multiple tracepoints.  I haven't looked closely, but I gotta imagine
we can/should use a similar approach for x86.
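
Roughly something like this (an untested sketch; the macro names are made
up and the actual s390 code is structured differently):

#define KVM_X86_TP_COMMON_FIELDS			\
	__field(unsigned int,	vcpu_id)		\
	__field(unsigned long,	guest_rip)

#define KVM_X86_TP_COMMON_ASSIGN(vcpu)			\
	do {						\
		__entry->vcpu_id   = (vcpu)->vcpu_id;	\
		__entry->guest_rip = kvm_rip_read(vcpu);\
	} while (0)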

> These tracepoints capture the extra register state needed to know which
> MSR was accessed or which hypercall was made.
> 
> - kvm_nested_page_fault
> 
> This tracepoint captures the host page fault error code that caused the
> nested page fault.

The host error code, a.k.a. qualification info, is readily available in the
kvm_exit (or nested variant) tracepoint.  I don't think letting userspace skip
a tracepoint that's probably already enabled is worth the extra code to support
this tracepoint.  The nested_svm_inject_npf_exit() code in particular is wonky,
and I think it's a good example of why userspace "needs" trace_kvm_exit, e.g. to
observe that a nested stage-2 page fault didn't originate from a hardware
stage-2 fault.

* Re: [PATCH v5 0/3] KVM: x86: tracepoint updates
From: Sean Christopherson @ 2024-12-19  2:40 UTC
  To: Sean Christopherson, kvm, Maxim Levitsky
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Paolo Bonzini,
	Ingo Molnar, H. Peter Anvin, linux-kernel

On Tue, 10 Sep 2024 16:03:47 -0400, Maxim Levitsky wrote:
> This patch series adds selected information to the KVM tracepoints to
> make it easier to gather insights about running nested guests.
> 
> This patch series was developed together with a new x86 performance
> analysis tool that I wrote recently (https://gitlab.com/maximlevitsky/kvmon),
> which aims to be a better kvm_stat and allows you to see at a glance
> what is happening in a VM, including nesting.
> 
> [...]

Applied 1 and 2 to kvm-x86 misc, with the massaging to patch 1 I detailed earlier.
Again, please holler if you disagree with anything.  Thanks!

[1/3] KVM: x86: add more information to the kvm_entry tracepoint
      https://github.com/kvm-x86/linux/commit/3e633e7e7d07
[2/3] KVM: x86: add information about pending requests to kvm_exit tracepoint
      https://github.com/kvm-x86/linux/commit/0e77b324110c
[3/3] KVM: x86: add new nested vmexit tracepoints
      (no commit info)

--
https://github.com/kvm-x86/linux/tree/next

* Re: [PATCH v5 3/3] KVM: x86: add new nested vmexit tracepoints
From: Paolo Bonzini @ 2024-12-19 17:33 UTC
  To: Maxim Levitsky, kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Sean Christopherson, H. Peter Anvin, linux-kernel

On 9/10/24 22:03, Maxim Levitsky wrote:
> Add 3 new tracepoints for nested VM exits, intended to capture extra
> information and provide insight into nested guest behavior.
> 
> The new tracepoints are:
> 
> - kvm_nested_msr
> - kvm_nested_hypercall
> 
> These tracepoints capture the extra register state needed to know which
> MSR was accessed or which hypercall was made.
> 
> - kvm_nested_page_fault
> 
> This tracepoint captures the host page fault error code that caused the
> nested page fault.
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>   arch/x86/kvm/svm/nested.c | 22 +++++++++++
>   arch/x86/kvm/trace.h      | 82 +++++++++++++++++++++++++++++++++++++--
>   arch/x86/kvm/vmx/nested.c | 27 +++++++++++++
>   arch/x86/kvm/x86.c        |  3 ++
>   4 files changed, 131 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 6f704c1037e51..2020307481553 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -38,6 +38,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
>   {
>   	struct vcpu_svm *svm = to_svm(vcpu);
>   	struct vmcb *vmcb = svm->vmcb;
> +	u64 host_error_code = vmcb->control.exit_info_1;
> +
>   
>   	if (vmcb->control.exit_code != SVM_EXIT_NPF) {
>   		/*
> @@ -48,11 +50,15 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
>   		vmcb->control.exit_code_hi = 0;
>   		vmcb->control.exit_info_1 = (1ULL << 32);
>   		vmcb->control.exit_info_2 = fault->address;
> +		host_error_code = 0;
>   	}
>   
>   	vmcb->control.exit_info_1 &= ~0xffffffffULL;
>   	vmcb->control.exit_info_1 |= fault->error_code;
>   
> +	trace_kvm_nested_page_fault(fault->address, host_error_code,
> +				    fault->error_code);
> +

I disagree with Sean about trace_kvm_nested_page_fault.  It's a useful 
addition and it is easier to understand what's happening with a 
dedicated tracepoint (especially on VMX).

Tracepoints are not an exact science and they aren't entirely kernel API.
At least they can just go away at any time (changing them is a lot more
tricky, but their presence is not guaranteed).  The one below has the
slight ugliness of having to do some computation in nested_svm_vmexit();
this one should go in.

>   	nested_svm_vmexit(svm);
>   }
>   
> @@ -1126,6 +1132,22 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>   				       vmcb12->control.exit_int_info_err,
>   				       KVM_ISA_SVM);
>   
> +	/* Collect some info about nested VM exits */
> +	switch (vmcb12->control.exit_code) {
> +	case SVM_EXIT_MSR:
> +		trace_kvm_nested_msr(vmcb12->control.exit_info_1 == 1,
> +				     kvm_rcx_read(vcpu),
> +				     (vmcb12->save.rax & 0xFFFFFFFFull) |
> +				     (((u64)kvm_rdx_read(vcpu) << 32)));
> +		break;
> +	case SVM_EXIT_VMMCALL:
> +		trace_kvm_nested_hypercall(vmcb12->save.rax,
> +					   kvm_rbx_read(vcpu),
> +					   kvm_rcx_read(vcpu),
> +					   kvm_rdx_read(vcpu));
> +		break;

Here I probably would have preferred an unconditional tracepoint giving 
RAX/RBX/RCX/RDX after a nested vmexit.  This is not exactly what Sean 
wanted but perhaps it strikes a middle ground?  I know you wrote this 
for a debugging tool, do you really need to have everything in a single 
tracepoint, or can you correlate the existing exit tracepoint with this 
hypothetical trace_kvm_nested_exit_regs, to pick RDMSR vs. WRMSR?

Paolo

* Re: [PATCH v5 3/3] KVM: x86: add new nested vmexit tracepoints
From: Maxim Levitsky @ 2024-12-19 17:49 UTC
  To: Paolo Bonzini, kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Sean Christopherson, H. Peter Anvin, linux-kernel

On Thu, 2024-12-19 at 18:33 +0100, Paolo Bonzini wrote:
> On 9/10/24 22:03, Maxim Levitsky wrote:
> > [...]
> 
> [...]
> 
> Here I probably would have preferred an unconditional tracepoint giving 
> RAX/RBX/RCX/RDX after a nested vmexit.  This is not exactly what Sean 
> wanted but perhaps it strikes a middle ground?  I know you wrote this 
> for a debugging tool, do you really need to have everything in a single 
> tracepoint, or can you correlate the existing exit tracepoint with this 
> hypothetical trace_kvm_nested_exit_regs, to pick RDMSR vs. WRMSR?


Hi!

If the new trace_kvm_nested_exit_regs tracepoint has a VM exit number argument,
then I can enable this new tracepoint twice with a different filter
(vm_exit_num == msr and vm_exit_num == vmcall), and each instance will count
the events that I need.

So this can work.

Thanks!
Best regards,
	Maxim Levitsky

* Re: [PATCH v5 3/3] KVM: x86: add new nested vmexit tracepoints
From: Paolo Bonzini @ 2024-12-19 18:02 UTC
  To: Maxim Levitsky, kvm
  Cc: x86, Dave Hansen, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Sean Christopherson, H. Peter Anvin, linux-kernel

On 12/19/24 18:49, Maxim Levitsky wrote:
>> Here I probably would have preferred an unconditional tracepoint giving
>> RAX/RBX/RCX/RDX after a nested vmexit.  This is not exactly what Sean
>> wanted but perhaps it strikes a middle ground?  I know you wrote this
>> for a debugging tool, do you really need to have everything in a single
>> tracepoint, or can you correlate the existing exit tracepoint with this
>> hypothetical trace_kvm_nested_exit_regs, to pick RDMSR vs. WRMSR?
> 
> Hi!
> 
> If the new trace_kvm_nested_exit_regs tracepoint has a VM exit number
> argument, then I can enable this new tracepoint twice with a different
> filter (vm_exit_num == msr and vm_exit_num == vmcall), and each instance
> will count the events that I need.
> 
> So this can work.

Ok, thanks.  On one hand it may make sense to have trace_kvm_exit_regs
and trace_kvm_nested_exit_regs (you can even extend the 
TRACE_EVENT_KVM_EXIT macro to generate both the exit and the exit_regs 
tracepoint).  On the other hand it seems to me that this new tracepoint 
is kinda reinventing the wheel; your patch adding nested equivalents of 
trace_kvm_hypercall and trace_kvm_msr seems more obvious to me.
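
For concreteness, the hypothetical trace_kvm_nested_exit_regs could look
roughly like this (an untested sketch modeled on the existing tracepoints;
the name and field set are placeholders):

TRACE_EVENT(kvm_nested_exit_regs,
	TP_PROTO(u32 exit_reason, u64 rax, u64 rbx, u64 rcx, u64 rdx),
	TP_ARGS(exit_reason, rax, rbx, rcx, rdx),

	TP_STRUCT__entry(
		__field(	u32,	exit_reason	)
		__field(	u64,	rax		)
		__field(	u64,	rbx		)
		__field(	u64,	rcx		)
		__field(	u64,	rdx		)
	),

	TP_fast_assign(
		__entry->exit_reason	= exit_reason;
		__entry->rax		= rax;
		__entry->rbx		= rbx;
		__entry->rcx		= rcx;
		__entry->rdx		= rdx;
	),

	TP_printk("reason %u rax 0x%llx rbx 0x%llx rcx 0x%llx rdx 0x%llx",
		  __entry->exit_reason, __entry->rax, __entry->rbx,
		  __entry->rcx, __entry->rdx)
);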

I see Sean's point in not wanting one-off tracepoints; on the other hand
there is value in having similar tracepoints for the L1->L0 and L2->L0 
cases.  I'll let him choose between the two possibilities (a new 
*_exit_regs pair, or just apply this patch) but I think there should be 
one of these two.

Paolo

* Re: [PATCH v5 3/3] KVM: x86: add new nested vmexit tracepoints
From: Sean Christopherson @ 2025-01-16 22:53 UTC
  To: Paolo Bonzini
  Cc: Maxim Levitsky, kvm, x86, Dave Hansen, Thomas Gleixner,
	Borislav Petkov, Ingo Molnar, H. Peter Anvin, linux-kernel

On Thu, Dec 19, 2024, Paolo Bonzini wrote:
> On 12/19/24 18:49, Maxim Levitsky wrote:
> > > Here I probably would have preferred an unconditional tracepoint giving
> > > RAX/RBX/RCX/RDX after a nested vmexit.  This is not exactly what Sean
> > > wanted but perhaps it strikes a middle ground?  I know you wrote this
> > > for a debugging tool, do you really need to have everything in a single
> > > tracepoint, or can you correlate the existing exit tracepoint with this
> > > hypothetical trace_kvm_nested_exit_regs, to pick RDMSR vs. WRMSR?
> > 
> > Hi!
> > 
> > If the new trace_kvm_nested_exit_regs tracepoint has a VM exit number
> > argument, then I can enable this new tracepoint twice with a different
> > filter (vm_exit_num == msr and vm_exit_num == vmcall), and each
> > instance will count the events that I need.
> > 
> > So this can work.
> Ok, thanks.  On one hand it may make sense to have trace_kvm_exit_regs and
> trace_kvm_nested_exit_regs (you can even extend the TRACE_EVENT_KVM_EXIT
> macro to generate both the exit and the exit_regs tracepoint).  On the other
> hand it seems to me that this new tracepoint is kinda reinventing the wheel;
> your patch adding nested equivalents of trace_kvm_hypercall and
> trace_kvm_msr seems more obvious to me.
> 
> I see Sean's point in not wanting one-off tracepoints; on the other hand
> there is value in having similar tracepoints for the L1->L0 and L2->L0
> cases.

I don't understand why we want two (or three, or five) tracepoints for the same
thing.  I want to go the opposite direction and (a) delete kvm_nested_vmexit
and then (b) rename kvm_nested_vmexit_inject => kvm_nested_vmexit so that it
pairs with kvm_nested_vmenter.

Similarly, having kvm_nested_intr_vmexit is asinine when kvm_nested_vmexit_inject
captures *more* information about the IRQ itself.

I don't see the point of trace_kvm_nested_exit_regs.  Except for L1 vs. L2, it's
redundant.  kvm_nested_vmexit_inject and kvm_nested_vmenter are useful because
they capture novel information.

> I'll let him choose between the two possibilities (a new *_exit_regs
> pair, or just apply this patch) but I think there should be one of these
> two.

Anything but a pair.  Why can't we capture L1 vs. L2 in the tracepoints and call
it a day?
