* [PATCH 0/2] trace-cmd: Fix mappings when kvm vcpus do not match the guest's CPU numbers
@ 2022-07-08  1:42 Steven Rostedt
From: Steven Rostedt @ 2022-07-08  1:42 UTC
  To: linux-trace-devel; +Cc: Vineeth Pillai, Steven Rostedt (Google)

From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

The guest's CPU numbers may not match the vcpu numbers that the kvm
enter/exit events show. When that happens, trace-cmd cannot save which
host threads map to which guest vcpus.

Luckily, it seems that the mappings are ordered. That is, the X of the kvm
vcpuX directories is in the same order as the CPU numbers of the guest,
and currently the guests see consecutive numbers (not sparse ones).

This series adds a mapping between the vcpuX numbers and the consecutive
guest CPU numbers, which allows trace-cmd to find the host threads that
map to the guest's CPUs.
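
For example, with a hypothetical sparse set of vcpu directories (an
illustration, not taken from a real trace), sorting by the vcpu number
gives the assumed guest CPU:

  sorted kvm vcpuX:  vcpu0  vcpu1  vcpu2  vcpu8
  guest CPU:             0      1      2      3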

Steven Rostedt (Google) (2):
  trace-cmd: Do not use KVM debug vcpu directories as the CPU mapping
  trace-cmd: Have the pid to vcpu mappings know about sparse maps

 lib/trace-cmd/trace-timesync-kvm.c | 104 ++++++++++++++++-------------
 tracecmd/trace-tsync.c             | 100 ++++++++++++++++++++++++++-
 2 files changed, 157 insertions(+), 47 deletions(-)

-- 
2.35.1



* [PATCH 1/2] trace-cmd: Do not use KVM debug vcpu directories as the CPU mapping
@ 2022-07-08  1:42 Steven Rostedt
From: Steven Rostedt @ 2022-07-08  1:42 UTC
  To: linux-trace-devel; +Cc: Vineeth Pillai, Steven Rostedt (Google)

From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

There are some setups where the vcpu directories in:

  /sys/kernel/debug/kvm/<pid>/vcpuX

do not match the CPU numbers of the guest. That is, guest CPU 3 may be
vcpu8.

This breaks the algorithm used when setting up the kvm time
synchronization, as it expects these files to map to the guest's CPU
numbers; if one of the vcpuX numbers is equal to or greater than the
number of guest CPUs, the setup will fail.

Currently, the setups that exhibit this at least have the guest CPUs
consecutive (non-sparse), and they map to the kvm vcpuX entries in order.
That is, if the vcpuX entries are sorted by X, they map to the CPUs of
the guest.

This may not hold true for all setups, but for now just solve this one.
It appears that the vcpuX in kvm maps to the guest's apicid in
/proc/cpuinfo. More can be done later to verify that the two are in
order, and if not, to add metadata that allows the kvm mappings to still
work.

For now, restructure the code by creating a single kvm_clock_files
structure that holds the offset, scaling and frac paths together, and
also record the vcpu number when adding new information from the vcpuX
directories. Then sort the array by vcpu number so that index i maps to
CPU i (0 .. nr_cpus - 1) of the guest.
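
As a standalone illustration of that assumption (a minimal sketch, not
part of the patch; the guest directory passed in, e.g.
/sys/kernel/debug/kvm/408-7, is a placeholder):

  #include <dirent.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  static int cmp_int(const void *A, const void *B)
  {
  	const int *a = A, *b = B;

  	if (*a < *b)
  		return -1;
  	return *a > *b;
  }

  int main(int argc, char **argv)
  {
  	struct dirent *entry;
  	int vcpus[1024];
  	int i, count = 0;
  	DIR *dir;

  	if (argc < 2 || !(dir = opendir(argv[1])))
  		return 1;
  	/* Collect the X of every vcpuX directory of the guest */
  	while ((entry = readdir(dir)) && count < 1024) {
  		if (entry->d_type != DT_DIR ||
  		    strncmp(entry->d_name, "vcpu", 4) != 0)
  			continue;
  		vcpus[count++] = strtol(entry->d_name + 4, NULL, 10);
  	}
  	closedir(dir);

  	/* Sorted position == assumed guest CPU number */
  	qsort(vcpus, count, sizeof(int), cmp_int);
  	for (i = 0; i < count; i++)
  		printf("guest CPU %d -> vcpu%d\n", i, vcpus[i]);
  	return 0;
  }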

Link: https://lore.kernel.org/all/20220504010242.1388192-1-vineethrp@google.com/

Reported-by: Vineeth Pillai <vineethrp@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 lib/trace-cmd/trace-timesync-kvm.c | 104 ++++++++++++++++-------------
 1 file changed, 58 insertions(+), 46 deletions(-)

diff --git a/lib/trace-cmd/trace-timesync-kvm.c b/lib/trace-cmd/trace-timesync-kvm.c
index 1db63d94f545..aa1799b1eefb 100644
--- a/lib/trace-cmd/trace-timesync-kvm.c
+++ b/lib/trace-cmd/trace-timesync-kvm.c
@@ -33,15 +33,20 @@ typedef __s64 s64;
 #define KVM_ACCURACY	0
 #define KVM_NAME	"kvm"
 
+struct kvm_clock_files {
+	int		vcpu;
+	char		*offsets;
+	char		*scalings;
+	char		*frac;
+};
+
 struct kvm_clock_sync {
-	int vcpu_count;
-	char **vcpu_offsets;
-	char **vcpu_scalings;
-	char **vcpu_frac;
-	int marker_fd;
-	struct tep_handle *tep;
-	int raw_id;
-	unsigned long long ts;
+	int			vcpu_count;
+	int			marker_fd;
+	struct kvm_clock_files	*clock_files;
+	struct tep_handle	*tep;
+	int			raw_id;
+	unsigned long long	ts;
 };
 
 struct kvm_clock_offset_msg {
@@ -210,7 +215,7 @@ static bool kvm_support_check(bool guest)
 	return kvm_scaling_check();
 }
 
-static int kvm_open_vcpu_dir(struct kvm_clock_sync *kvm, int cpu, char *dir_str)
+static int kvm_open_vcpu_dir(struct kvm_clock_sync *kvm, int i, char *dir_str)
 {
 	struct dirent *entry;
 	char path[PATH_MAX];
@@ -224,21 +229,21 @@ static int kvm_open_vcpu_dir(struct kvm_clock_sync *kvm, int cpu, char *dir_str)
 			if (!strcmp(entry->d_name, KVM_DEBUG_OFFSET_FILE)) {
 				snprintf(path, sizeof(path), "%s/%s",
 					 dir_str, entry->d_name);
-				kvm->vcpu_offsets[cpu] = strdup(path);
+				kvm->clock_files[i].offsets = strdup(path);
 			}
 			if (!strcmp(entry->d_name, KVM_DEBUG_SCALING_FILE)) {
 				snprintf(path, sizeof(path), "%s/%s",
 					 dir_str, entry->d_name);
-				kvm->vcpu_scalings[cpu] = strdup(path);
+				kvm->clock_files[i].scalings = strdup(path);
 			}
 			if (!strcmp(entry->d_name, KVM_DEBUG_FRACTION_FILE)) {
 				snprintf(path, sizeof(path), "%s/%s",
 					 dir_str, entry->d_name);
-				kvm->vcpu_frac[cpu] = strdup(path);
+				kvm->clock_files[i].frac = strdup(path);
 			}
 		}
 	}
-	if (!kvm->vcpu_offsets[cpu])
+	if (!kvm->clock_files[i].offsets)
 		goto error;
 	closedir(dir);
 	return 0;
@@ -246,15 +251,25 @@ static int kvm_open_vcpu_dir(struct kvm_clock_sync *kvm, int cpu, char *dir_str)
 error:
 	if (dir)
 		closedir(dir);
-	free(kvm->vcpu_offsets[cpu]);
-	kvm->vcpu_offsets[cpu] = NULL;
-	free(kvm->vcpu_scalings[cpu]);
-	kvm->vcpu_scalings[cpu] = NULL;
-	free(kvm->vcpu_frac[cpu]);
-	kvm->vcpu_frac[cpu] = NULL;
+	free(kvm->clock_files[i].offsets);
+	kvm->clock_files[i].offsets = NULL;
+	free(kvm->clock_files[i].scalings);
+	kvm->clock_files[i].scalings = NULL;
+	free(kvm->clock_files[i].frac);
+	kvm->clock_files[i].frac = NULL;
 	return -1;
 }
 
+static int cmp_clock(const void *A, const void *B)
+{
+	const struct kvm_clock_files *a = A;
+	const struct kvm_clock_files *b = B;
+
+	if (a->vcpu < b->vcpu)
+		return -1;
+	return a->vcpu > b->vcpu;
+}
+
 static int kvm_open_debug_files(struct kvm_clock_sync *kvm, int pid)
 {
 	char *vm_dir_str = NULL;
@@ -284,21 +299,25 @@ static int kvm_open_debug_files(struct kvm_clock_sync *kvm, int pid)
 	dir = opendir(vm_dir_str);
 	if (!dir)
 		goto error;
+	i = 0;
 	while ((entry = readdir(dir))) {
 		if (!(entry->d_type == DT_DIR &&
 		    !strncmp(entry->d_name, KVM_DEBUG_VCPU_DIR, strlen(KVM_DEBUG_VCPU_DIR))))
 			continue;
-		vcpu =  strtol(entry->d_name + strlen(KVM_DEBUG_VCPU_DIR), NULL, 10);
-		if (vcpu < 0 || vcpu >= kvm->vcpu_count)
-			continue;
-		snprintf(path, sizeof(path), "%s/%s", vm_dir_str, entry->d_name);
-		if (kvm_open_vcpu_dir(kvm, vcpu, path) < 0)
+		if (i == kvm->vcpu_count)
 			goto error;
-	}
-	for (i = 0; i < kvm->vcpu_count; i++) {
-		if (!kvm->vcpu_offsets[i])
+		vcpu = strtol(entry->d_name + strlen(KVM_DEBUG_VCPU_DIR), NULL, 10);
+		kvm->clock_files[i].vcpu = vcpu;
+		snprintf(path, sizeof(path), "%s/%s", vm_dir_str, entry->d_name);
+		if (kvm_open_vcpu_dir(kvm, i, path) < 0)
 			goto error;
+		i++;
 	}
+	if (i < kvm->vcpu_count)
+		goto error;
+
+	qsort(kvm->clock_files, kvm->vcpu_count, sizeof(*kvm->clock_files), cmp_clock);
+
 	closedir(dir);
 	free(pid_str);
 	free(vm_dir_str);
@@ -315,19 +334,15 @@ static int kvm_clock_sync_init_host(struct tracecmd_time_sync *tsync,
 				    struct kvm_clock_sync *kvm)
 {
 	kvm->vcpu_count = tsync->vcpu_count;
-	kvm->vcpu_offsets = calloc(kvm->vcpu_count, sizeof(char *));
-	kvm->vcpu_scalings = calloc(kvm->vcpu_count, sizeof(char *));
-	kvm->vcpu_frac = calloc(kvm->vcpu_count, sizeof(char *));
-	if (!kvm->vcpu_offsets || !kvm->vcpu_scalings || !kvm->vcpu_frac)
+	kvm->clock_files = calloc(kvm->vcpu_count, sizeof(*kvm->clock_files));
+	if (!kvm->clock_files)
 		goto error;
 	if (kvm_open_debug_files(kvm, tsync->guest_pid) < 0)
 		goto error;
 	return 0;
 
 error:
-	free(kvm->vcpu_offsets);
-	free(kvm->vcpu_scalings);
-	free(kvm->vcpu_frac);
+	free(kvm->clock_files);
 	return -1;
 }
 
@@ -414,12 +429,9 @@ static int kvm_clock_sync_free(struct tracecmd_time_sync *tsync)
 		kvm = (struct kvm_clock_sync *)clock_context->proto_data;
 	if (kvm) {
 		for (i = 0; i < kvm->vcpu_count; i++) {
-			free(kvm->vcpu_offsets[i]);
-			kvm->vcpu_offsets[i] = NULL;
-			free(kvm->vcpu_scalings[i]);
-			kvm->vcpu_scalings[i] = NULL;
-			free(kvm->vcpu_frac[i]);
-			kvm->vcpu_frac[i] = NULL;
+			free(kvm->clock_files[i].offsets);
+			free(kvm->clock_files[i].scalings);
+			free(kvm->clock_files[i].frac);
 		}
 		if (kvm->tep)
 			tep_free(kvm->tep);
@@ -449,23 +461,23 @@ static int kvm_clock_host(struct tracecmd_time_sync *tsync,
 	clock_context = (struct clock_sync_context *)tsync->context;
 	if (clock_context)
 		kvm = (struct kvm_clock_sync *)clock_context->proto_data;
-	if (!kvm || !kvm->vcpu_offsets || !kvm->vcpu_offsets[0])
+	if (!kvm || !kvm->clock_files || !kvm->clock_files[0].offsets)
 		return -1;
 	if (cpu >= kvm->vcpu_count)
 		return -1;
-	ret = read_ll_from_file(kvm->vcpu_offsets[cpu], &kvm_offset);
+	ret = read_ll_from_file(kvm->clock_files[cpu].offsets, &kvm_offset);
 	if (ret < 0)
 		return -1;
 
-	if (kvm->vcpu_scalings && kvm->vcpu_scalings[cpu]) {
-		read_ll_from_file(kvm->vcpu_scalings[cpu], &kvm_scaling);
+	if (kvm->clock_files[cpu].scalings) {
+		read_ll_from_file(kvm->clock_files[cpu].scalings, &kvm_scaling);
 		if (kvm_scaling == KVM_SCALING_AMD_DEFAULT ||
 		    kvm_scaling == KVM_SCALING_INTEL_DEFAULT)
 			kvm_scaling = 1;
 	}
 
-	if (kvm->vcpu_frac && kvm->vcpu_frac[cpu] && kvm_scaling != 1)
-		ret = read_ll_from_file(kvm->vcpu_frac[cpu], &kvm_frac);
+	if (kvm->clock_files[cpu].frac && kvm_scaling != 1)
+		ret = read_ll_from_file(kvm->clock_files[cpu].frac, &kvm_frac);
 	msg = (char *)&packet;
 	size = sizeof(packet);
 	ret = tracecmd_msg_recv_time_sync(tsync->msg_handle,
-- 
2.35.1



* [PATCH 2/2] trace-cmd: Have the pid to vcpu mappings know about sparse maps
@ 2022-07-08  1:42 Steven Rostedt
From: Steven Rostedt @ 2022-07-08  1:42 UTC
  To: linux-trace-devel; +Cc: Vineeth Pillai, Steven Rostedt (Google)

From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

If the kvm vcpuX numbers do not map directly to the guest's CPU numbers,
then the trace mappings will not match, and the host will not have a
mapping to the guest.

That is, if the hypervisor has:

 # ls -d /sys/kernel/debug/kvm/[0-9]*/vcpu*
/sys/kernel/debug/kvm/408-7/vcpu0   /sys/kernel/debug/kvm/408-7/vcpu24
/sys/kernel/debug/kvm/408-7/vcpu1   /sys/kernel/debug/kvm/408-7/vcpu26
/sys/kernel/debug/kvm/408-7/vcpu16  /sys/kernel/debug/kvm/408-7/vcpu28
/sys/kernel/debug/kvm/408-7/vcpu18  /sys/kernel/debug/kvm/408-7/vcpu30
/sys/kernel/debug/kvm/408-7/vcpu20  /sys/kernel/debug/kvm/408-7/vcpu8
/sys/kernel/debug/kvm/408-7/vcpu22  /sys/kernel/debug/kvm/408-7/vcpu9

But the guest sees CPUs 0-11, and the kvm exit/enter events show the vcpu
numbers above (e.g., vcpu8 for what the guest sees as CPU 2).

Have the mapping logic read these vcpuX directory names, sort the
numbers, and then map the host threads to guest CPUs via the sorted
positions.
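
With the listing above, the sorted positions work out as follows (the
index in the sorted array is taken as the guest CPU number):

  sorted vcpu:  0  1  8  9 16 18 20 22 24 26 28 30
  guest CPU:    0  1  2  3  4  5  6  7  8  9 10 11

A kvm exit event reporting vcpu_id 8 is then found with bsearch(), and
its index in the sorted array (2) is used as the guest CPU.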

Link: https://lore.kernel.org/all/20220504010242.1388192-1-vineethrp@google.com/

Reported-by: Vineeth Pillai <vineethrp@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tracecmd/trace-tsync.c | 100 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 99 insertions(+), 1 deletion(-)

diff --git a/tracecmd/trace-tsync.c b/tracecmd/trace-tsync.c
index a96b4e43e0d9..57baff399bd3 100644
--- a/tracecmd/trace-tsync.c
+++ b/tracecmd/trace-tsync.c
@@ -16,14 +16,105 @@ struct trace_mapping {
 	struct tep_format_field		*common_pid;
 	int				*pids;
 	int				*map;
+	int				*vcpu;
 	int				max_cpus;
 };
 
+static int cmp_tmap_vcpu(const void *A, const void *B)
+{
+	const int *a = A;
+	const int *b = B;
+
+	if (*a < *b)
+		return -1;
+	return *a > *b;
+}
+
+static int map_kvm_vcpus(int guest_pid, struct trace_mapping *tmap)
+{
+	struct dirent *entry;
+	const char *debugfs;
+	char *vm_dir_str = NULL;
+	char *pid_file = NULL;
+	char *kvm_dir;
+	int pid_file_len;
+	bool found = false;
+	DIR *dir;
+	int ret = -1;
+	int i;
+
+	tmap->vcpu = malloc(sizeof(*tmap->vcpu) * tmap->max_cpus);
+	if (!tmap->vcpu)
+		return -1;
+
+	memset(tmap->vcpu, -1, sizeof(*tmap->vcpu) * tmap->max_cpus);
+
+	debugfs = tracefs_debug_dir();
+	if (!debugfs)
+		return -1;
+
+	if (asprintf(&kvm_dir, "%s/kvm", debugfs) < 0)
+		return -1;
+
+	dir = opendir(kvm_dir);
+	if (!dir)
+		goto out;
+
+	if (asprintf(&pid_file, "%d-", guest_pid) <= 0)
+		goto out;
+
+	pid_file_len = strlen(pid_file);
+
+	while ((entry = readdir(dir))) {
+		if (entry->d_type != DT_DIR ||
+		    strncmp(entry->d_name, pid_file, pid_file_len) != 0)
+			continue;
+		if (asprintf(&vm_dir_str, "%s/%s", kvm_dir, entry->d_name) < 0)
+			goto out;
+		found = true;
+		break;
+	}
+	if (!found)
+		goto out;
+
+	closedir(dir);
+	dir = opendir(vm_dir_str);
+	if (!dir)
+		goto out;
+	i = 0;
+	while ((entry = readdir(dir))) {
+		if (entry->d_type != DT_DIR ||
+		    strncmp(entry->d_name, "vcpu", 4))
+			continue;
+		if (i == tmap->max_cpus)
+			goto out;
+		tmap->vcpu[i] = strtol(entry->d_name + 4, NULL, 10);
+		i++;
+	}
+
+	if (i < tmap->max_cpus)
+		goto out;
+
+	qsort(tmap->vcpu, tmap->max_cpus, sizeof(*tmap->vcpu), cmp_tmap_vcpu);
+
+	ret = 0;
+
+ out:
+	if (dir)
+		closedir(dir);
+	free(vm_dir_str);
+	free(pid_file);
+	free(kvm_dir);
+
+	return ret;
+}
+
 static int map_vcpus(struct tep_event *event, struct tep_record *record,
 		     int cpu, void *context)
 {
 	struct trace_mapping *tmap = context;
 	unsigned long long val;
+	int *vcpu;
 	int type;
 	int pid;
 	int ret;
@@ -53,10 +144,13 @@ static int map_vcpus(struct tep_event *event, struct tep_record *record,
 
 	cpu = (int)val;
 
+	vcpu = bsearch(&cpu, tmap->vcpu, tmap->max_cpus, sizeof(cpu), cmp_tmap_vcpu);
 	/* Sanity check, warn? */
-	if (cpu >= tmap->max_cpus)
+	if (!vcpu)
 		return 0;
 
+	cpu = vcpu - tmap->vcpu;
+
 	/* Already have this one? Should we check if it is the same? */
 	if (tmap->map[cpu] >= 0)
 		return 0;
@@ -123,6 +217,10 @@ static void stop_mapping_vcpus(int cpu_count, struct trace_guest *guest)
 	if (!tmap.map)
 		return;
 
+	/* Check if the kvm vcpu mappings are the same */
+	if (map_kvm_vcpus(guest->pid, &tmap) < 0)
+		goto out;
+
 	for (i = 0; i < tmap.max_cpus; i++)
 		tmap.map[i] = -1;
 
-- 
2.35.1

