[PATCH] KVM: selftests: Run dirty_log_perf

kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus
@ 2022-08-17 15:29 Vipin Sharma
  2022-08-17 17:24 ` Sean Christopherson
  0 siblings, 1 reply; 7+ messages in thread
From: Vipin Sharma @ 2022-08-17 15:29 UTC (permalink / raw)
  To: seanjc, dmatlack, pbonzini; +Cc: kvm, linux-kernel, Vipin Sharma

Add command line options to run the vcpus and the main process on the
specific cpus on a host machine. This is useful as it provides
options to analyze performance based on the vcpus and dirty log worker
locations, like on the different numa nodes or on the same numa nodes.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
Suggested-by: David Matlack <dmatlack@google.com>
Suggested-by: Sean Christopherson <seanjc@google.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
---

This is based on the discussion at
https://lore.kernel.org/lkml/20220801151928.270380-1-vipinsh@google.com/

 .../selftests/kvm/access_tracking_perf_test.c |   2 +-
 .../selftests/kvm/demand_paging_test.c        |   2 +-
 .../selftests/kvm/dirty_log_perf_test.c       | 108 +++++++++++++++++-
 .../selftests/kvm/include/perf_test_util.h    |   3 +-
 .../selftests/kvm/lib/perf_test_util.c        |  32 +++++-
 .../kvm/memslot_modification_stress_test.c    |   2 +-
 6 files changed, 139 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index 1c2749b1481ac..9659462f47478 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -299,7 +299,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	vm = perf_test_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
 				 params->backing_src, !overlap_memory_access);
 
-	perf_test_start_vcpu_threads(nr_vcpus, vcpu_thread_main);
+	perf_test_start_vcpu_threads(nr_vcpus, NULL, vcpu_thread_main);
 
 	pr_info("\n");
 	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 779ae54f89c40..b9848174d6e7c 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -336,7 +336,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	pr_info("Finished creating vCPUs and starting uffd threads\n");
 
 	clock_gettime(CLOCK_MONOTONIC, &start);
-	perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker);
+	perf_test_start_vcpu_threads(nr_vcpus, NULL, vcpu_worker);
 	pr_info("Started all vCPUs\n");
 
 	perf_test_join_vcpu_threads(nr_vcpus);
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index f99e39a672d3f..68a1d7262f21b 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -8,10 +8,12 @@
  * Copyright (C) 2020, Google, Inc.
  */
 
+#define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <pthread.h>
+#include <sched.h>
 #include <linux/bitmap.h>
 
 #include "kvm_util.h"
@@ -66,6 +68,8 @@ static u64 dirty_log_manual_caps;
 static bool host_quit;
 static int iteration;
 static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
+/* Map vcpus to logical cpus on host. */
+static int vcpu_to_lcpu_map[KVM_MAX_VCPUS];
 
 static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
 {
@@ -132,6 +136,7 @@ struct test_params {
 	bool partition_vcpu_memory_access;
 	enum vm_mem_backing_src_type backing_src;
 	int slots;
+	int *vcpu_to_lcpu;
 };
 
 static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable)
@@ -248,7 +253,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	for (i = 0; i < nr_vcpus; i++)
 		vcpu_last_completed_iteration[i] = -1;
 
-	perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker);
+	perf_test_start_vcpu_threads(nr_vcpus, p->vcpu_to_lcpu, vcpu_worker);
 
 	/* Allow the vCPUs to populate memory */
 	pr_debug("Starting iteration %d - Populating\n", iteration);
@@ -348,12 +353,74 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	perf_test_destroy_vm(vm);
 }
 
+static int parse_num(const char *num_str)
+{
+	int num;
+	char *end_ptr;
+
+	errno = 0;
+	num = (int)strtol(num_str, &end_ptr, 10);
+	TEST_ASSERT(num_str != end_ptr && *end_ptr == '\0',
+		    "Invalid number string.\n");
+	TEST_ASSERT(errno == 0, "Conversion error: %d\n", errno);
+
+	return num;
+}
+
+static int parse_cpu_list(const char *arg)
+{
+	char delim[2] = ",";
+	char *cpu, *cpu_list;
+	int i = 0, cpu_num;
+
+	cpu_list = strdup(arg);
+	TEST_ASSERT(cpu_list, "Low memory\n");
+
+	cpu = strtok(cpu_list, delim);
+	while (cpu) {
+		cpu_num = parse_num(cpu);
+		TEST_ASSERT(cpu_num >= 0, "Invalid cpu number: %d\n", cpu_num);
+		vcpu_to_lcpu_map[i++] = cpu_num;
+		cpu = strtok(NULL, delim);
+	}
+
+	free(cpu_list);
+
+	return i;
+}
+
+static void assign_dirty_log_perf_test_cpu(const char *arg)
+{
+	char delim[2] = ",";
+	char *cpu, *cpu_list;
+	cpu_set_t cpuset;
+	int cpu_num, err;
+
+	cpu_list = strdup(arg);
+	TEST_ASSERT(cpu_list, "Low memory\n");
+
+	CPU_ZERO(&cpuset);
+	cpu = strtok(cpu_list, delim);
+	while (cpu) {
+		cpu_num = parse_num(cpu);
+		TEST_ASSERT(cpu_num >= 0, "Invalid cpu number: %d\n", cpu_num);
+		CPU_SET(cpu_num, &cpuset);
+		cpu = strtok(NULL, delim);
+	}
+
+	err = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
+	TEST_ASSERT(err == 0, "Error in setting dirty log perf test cpu\n");
+
+	free(cpu_list);
+}
+
 static void help(char *name)
 {
 	puts("");
 	printf("usage: %s [-h] [-i iterations] [-p offset] [-g] "
 	       "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]"
-	       "[-x memslots]\n", name);
+	       "[-x memslots] [-c logical cpus for vcpus]"
+	       "[-d cpus to run dirty_log_perf_test]\n", name);
 	puts("");
 	printf(" -i: specify iteration counts (default: %"PRIu64")\n",
 	       TEST_HOST_LOOP_N);
@@ -383,6 +450,26 @@ static void help(char *name)
 	backing_src_help("-s");
 	printf(" -x: Split the memory region into this number of memslots.\n"
 	       "     (default: 1)\n");
+	printf(" -c: Comma separated values of the logical CPUs which will run\n"
+	       "     the vCPUs. Number of values should be equal to the number\n"
+	       "     of vCPUs.\n\n"
+	       "     Example: ./dirty_log_perf_test -v 3 -c 22,43,1\n"
+	       "     This means that the vcpu 0 will run on the logical cpu 22,\n"
+	       "     vcpu 1 on the logical cpu 43 and vcpu 2 on the logical cpu 1.\n"
+	       "     (default: No cpu mapping)\n\n");
+	printf(" -d: Comma separated values of the logical CPUs on which\n"
+	       "     dirty_log_perf_test will run. Without -c option, all of\n"
+	       "     the vcpus and main process will run on the cpus provided here.\n"
+	       "     This option also accepts a single cpu. (default: No cpu mapping)\n\n"
+	       "     Example 1: ./dirty_log_perf_test -v 3 -c 22,43,1 -d 101\n"
+	       "     Main application thread will run on logical cpu 101 and\n"
+	       "     vcpus will run on the logical cpus 22, 43 and 1\n\n"
+	       "     Example 2: ./dirty_log_perf_test -v 3 -d 101\n"
+	       "     Main application thread and vcpus will run on the logical\n"
+	       "     cpu 101\n\n"
+	       "     Example 3: ./dirty_log_perf_test -v 3 -d 101,23,53\n"
+	       "     Main application thread and vcpus will run on logical cpus\n"
+	       "     101, 23 and 53.\n");
 	puts("");
 	exit(0);
 }
@@ -396,8 +483,10 @@ int main(int argc, char *argv[])
 		.partition_vcpu_memory_access = true,
 		.backing_src = DEFAULT_VM_MEM_SRC,
 		.slots = 1,
+		.vcpu_to_lcpu = NULL,
 	};
 	int opt;
+	int nr_lcpus = -1;
 
 	dirty_log_manual_caps =
 		kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
@@ -406,8 +495,14 @@ int main(int argc, char *argv[])
 
 	guest_modes_append_default();
 
-	while ((opt = getopt(argc, argv, "eghi:p:m:nb:f:v:os:x:")) != -1) {
+	while ((opt = getopt(argc, argv, "c:d:eghi:p:m:nb:f:v:os:x:")) != -1) {
 		switch (opt) {
+		case 'c':
+			nr_lcpus = parse_cpu_list(optarg);
+			break;
+		case 'd':
+			assign_dirty_log_perf_test_cpu(optarg);
+			break;
 		case 'e':
 			/* 'e' is for evil. */
 			run_vcpus_while_disabling_dirty_logging = true;
@@ -455,6 +550,13 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (nr_lcpus != -1) {
+		TEST_ASSERT(nr_lcpus == nr_vcpus,
+			    "Number of vCPUs (%d) are not equal to number of logical cpus provided (%d).",
+			    nr_vcpus, nr_lcpus);
+		p.vcpu_to_lcpu = vcpu_to_lcpu_map;
+	}
+
 	TEST_ASSERT(p.iterations >= 2, "The test should have at least two iterations");
 
 	pr_info("Test iterations: %"PRIu64"\n",	p.iterations);
diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
index eaa88df0555a8..bd6c566cfc92e 100644
--- a/tools/testing/selftests/kvm/include/perf_test_util.h
+++ b/tools/testing/selftests/kvm/include/perf_test_util.h
@@ -53,7 +53,8 @@ void perf_test_destroy_vm(struct kvm_vm *vm);
 
 void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract);
 
-void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *));
+void perf_test_start_vcpu_threads(int vcpus, int *vcpus_to_lcpu,
+				  void (*vcpu_fn)(struct perf_test_vcpu_args *));
 void perf_test_join_vcpu_threads(int vcpus);
 void perf_test_guest_code(uint32_t vcpu_id);
 
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index 9618b37c66f7e..771fbdf3d2c22 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -2,11 +2,14 @@
 /*
  * Copyright (C) 2020, Google LLC.
  */
+#define _GNU_SOURCE
 #include <inttypes.h>
 
 #include "kvm_util.h"
 #include "perf_test_util.h"
 #include "processor.h"
+#include <pthread.h>
+#include <sched.h>
 
 struct perf_test_args perf_test_args;
 
@@ -260,10 +263,15 @@ static void *vcpu_thread_main(void *data)
 	return NULL;
 }
 
-void perf_test_start_vcpu_threads(int nr_vcpus,
+void perf_test_start_vcpu_threads(int nr_vcpus, int *vcpu_to_lcpu,
 				  void (*vcpu_fn)(struct perf_test_vcpu_args *))
 {
-	int i;
+	int i, err = 0;
+	pthread_attr_t attr;
+	cpu_set_t cpuset;
+
+	pthread_attr_init(&attr);
+	CPU_ZERO(&cpuset);
 
 	vcpu_thread_fn = vcpu_fn;
 	WRITE_ONCE(all_vcpu_threads_running, false);
@@ -274,7 +282,24 @@ void perf_test_start_vcpu_threads(int nr_vcpus,
 		vcpu->vcpu_idx = i;
 		WRITE_ONCE(vcpu->running, false);
 
-		pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);
+		if (vcpu_to_lcpu) {
+			CPU_SET(vcpu_to_lcpu[i], &cpuset);
+
+			err = pthread_attr_setaffinity_np(&attr,
+							  sizeof(cpu_set_t),
+							  &cpuset);
+			TEST_ASSERT(err == 0,
+				    "vCPU %d could not be mapped to logical cpu %d, error returned: %d\n",
+				    i, vcpu_to_lcpu[i], err);
+
+			CPU_CLR(vcpu_to_lcpu[i], &cpuset);
+		}
+
+		err = pthread_create(&vcpu->thread, &attr, vcpu_thread_main,
+				     vcpu);
+		TEST_ASSERT(err == 0,
+			    "error in creating vcpu %d thread, error returned: %d\n",
+			    i, err);
 	}
 
 	for (i = 0; i < nr_vcpus; i++) {
@@ -283,6 +308,7 @@ void perf_test_start_vcpu_threads(int nr_vcpus,
 	}
 
 	WRITE_ONCE(all_vcpu_threads_running, true);
+	pthread_attr_destroy(&attr);
 }
 
 void perf_test_join_vcpu_threads(int nr_vcpus)
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index 6ee7e1dde4043..246f8cc7bb2b1 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -103,7 +103,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
 	pr_info("Finished creating vCPUs\n");
 
-	perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker);
+	perf_test_start_vcpu_threads(nr_vcpus, NULL, vcpu_worker);
 
 	pr_info("Started all vCPUs\n");
 
-- 
2.37.1.595.g718a3a8f04-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus
  2022-08-17 15:29 [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus Vipin Sharma
@ 2022-08-17 17:24 ` Sean Christopherson
  2022-08-17 18:16   ` Vipin Sharma
  0 siblings, 1 reply; 7+ messages in thread
From: Sean Christopherson @ 2022-08-17 17:24 UTC (permalink / raw)
  To: Vipin Sharma; +Cc: dmatlack, pbonzini, kvm, linux-kernel

On Wed, Aug 17, 2022, Vipin Sharma wrote:
> Add command line options to run the vcpus and the main process on the
> specific cpus on a host machine. This is useful as it provides
> options to analyze performance based on the vcpus and dirty log worker
> locations, like on the different numa nodes or on the same numa nodes.

The two options should probably be separate patches, they are related but still
two very distinct changes.

> Signed-off-by: Vipin Sharma <vipinsh@google.com>
> Suggested-by: David Matlack <dmatlack@google.com>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> 
> This is based on the discussion at
> https://lore.kernel.org/lkml/20220801151928.270380-1-vipinsh@google.com/

This can and should be captured in the changelog proper:

  Link: https://lore.kernel.org/lkml/20220801151928.270380-1-vipinsh@google.com

> @@ -348,12 +353,74 @@ static void run_test(enum vm_guest_mode mode, void *arg)
>  	perf_test_destroy_vm(vm);
>  }
>  
> +static int parse_num(const char *num_str)
> +{
> +	int num;
> +	char *end_ptr;
> +
> +	errno = 0;
> +	num = (int)strtol(num_str, &end_ptr, 10);
> +	TEST_ASSERT(num_str != end_ptr && *end_ptr == '\0',
> +		    "Invalid number string.\n");
> +	TEST_ASSERT(errno == 0, "Conversion error: %d\n", errno);

Is the paranoia truly necessary?  What happens if parse_cpu_list() simply uses
atoi() and is passed garbage?

> +
> +	return num;
> +}
> +
> +static int parse_cpu_list(const char *arg)
> +{
> +	char delim[2] = ",";
> +	char *cpu, *cpu_list;
> +	int i = 0, cpu_num;
> +
> +	cpu_list = strdup(arg);
> +	TEST_ASSERT(cpu_list, "Low memory\n");

Heh, probably a little less than "low".  Just be literal and let the user figure
out why the allocation failed instead.

        TEST_ASSERT(cpu_list, "strdup() allocation failed\n");

> +
> +	cpu = strtok(cpu_list, delim);
> +	while (cpu) {
> +		cpu_num = parse_num(cpu);
> +		TEST_ASSERT(cpu_num >= 0, "Invalid cpu number: %d\n", cpu_num);
> +		vcpu_to_lcpu_map[i++] = cpu_num;
> +		cpu = strtok(NULL, delim);
> +	}
> +
> +	free(cpu_list);

The tokenization and parsing is nearly identical between parse_cpu_list() and
assign_dirty_log_perf_test_cpu().  The code can be made into a common helper by
passing in the destination, e.g.

static int parse_cpu_list(const char *arg, cpu_set_t *cpuset, int *vcpu_map)
{
	const char delim[] = ",";
	char *cpustr, *cpu_list;
	int i = 0, cpu;

	TEST_ASSERT(!!cpuset ^ !!vcpu_map);

	cpu_list = strdup(arg);
	TEST_ASSERT(cpu_list, "Low memory\n");

	cpustr = strtok(cpu_list, delim);
	while (cpustr) {
		cpu = atoi(cpustr);
		TEST_ASSERT(cpu >= 0, "Invalid cpu number: %d\n", cpu);
		if (vcpu_map)
			vcpu_to_lcpu_map[i++] = cpu_num;
		else
			CPU_SET(cpu_num, cpuset);
		cpu = strtok(NULL, delim);
	}

	free(cpu_list);

	return i;
}

> @@ -383,6 +450,26 @@ static void help(char *name)
>  	backing_src_help("-s");
>  	printf(" -x: Split the memory region into this number of memslots.\n"
>  	       "     (default: 1)\n");
> +	printf(" -c: Comma separated values of the logical CPUs which will run\n"
> +	       "     the vCPUs. Number of values should be equal to the number\n"
> +	       "     of vCPUs.\n\n"
> +	       "     Example: ./dirty_log_perf_test -v 3 -c 22,43,1\n"
> +	       "     This means that the vcpu 0 will run on the logical cpu 22,\n"
> +	       "     vcpu 1 on the logical cpu 43 and vcpu 2 on the logical cpu 1.\n"
> +	       "     (default: No cpu mapping)\n\n");
> +	printf(" -d: Comma separated values of the logical CPUs on which\n"
> +	       "     dirty_log_perf_test will run. Without -c option, all of\n"
> +	       "     the vcpus and main process will run on the cpus provided here.\n"
> +	       "     This option also accepts a single cpu. (default: No cpu mapping)\n\n"
> +	       "     Example 1: ./dirty_log_perf_test -v 3 -c 22,43,1 -d 101\n"
> +	       "     Main application thread will run on logical cpu 101 and\n"
> +	       "     vcpus will run on the logical cpus 22, 43 and 1\n\n"
> +	       "     Example 2: ./dirty_log_perf_test -v 3 -d 101\n"
> +	       "     Main application thread and vcpus will run on the logical\n"
> +	       "     cpu 101\n\n"
> +	       "     Example 3: ./dirty_log_perf_test -v 3 -d 101,23,53\n"
> +	       "     Main application thread and vcpus will run on logical cpus\n"
> +	       "     101, 23 and 53.\n");
>  	puts("");
>  	exit(0);
>  }

> @@ -455,6 +550,13 @@ int main(int argc, char *argv[])
>  		}
>  	}
>  

I wonder if we should make -c and -d mutually exclusive.  Tweak -c to include the
application thread, i.e. TEST_ASSERT(nr_lcpus == nr_vcpus+1) and require 1:1 pinning
for all tasks.  E.g. allowing "-c ..., -d 0,1,22" seems unnecessary.

> +	if (nr_lcpus != -1) {
> +		TEST_ASSERT(nr_lcpus == nr_vcpus,
> +			    "Number of vCPUs (%d) are not equal to number of logical cpus provided (%d).",
> +			    nr_vcpus, nr_lcpus);
> +		p.vcpu_to_lcpu = vcpu_to_lcpu_map;
> +	}
> +
>  	TEST_ASSERT(p.iterations >= 2, "The test should have at least two iterations");
>  
>  	pr_info("Test iterations: %"PRIu64"\n",	p.iterations);

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus
  2022-08-17 17:24 ` Sean Christopherson
@ 2022-08-17 18:16   ` Vipin Sharma
  2022-08-17 21:29     ` Sean Christopherson
  0 siblings, 1 reply; 7+ messages in thread
From: Vipin Sharma @ 2022-08-17 18:16 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: dmatlack, pbonzini, kvm, linux-kernel

On Wed, Aug 17, 2022 at 10:25 AM Sean Christopherson <seanjc@google.com> wrote:
>
> > +static int parse_num(const char *num_str)
> > +{
> > +     int num;
> > +     char *end_ptr;
> > +
> > +     errno = 0;
> > +     num = (int)strtol(num_str, &end_ptr, 10);
> > +     TEST_ASSERT(num_str != end_ptr && *end_ptr == '\0',
> > +                 "Invalid number string.\n");
> > +     TEST_ASSERT(errno == 0, "Conversion error: %d\n", errno);
>
> Is the paranoia truly necessary?  What happens if parse_cpu_list() simply uses
> atoi() and is passed garbage?

On error atoi() returns 0, which is also a valid logical cpu number.
We need error checking here to make sure that the user really wants
cpu 0 and it was not a mistake in typing.
I was thinking of using parse_num API for other places as well instead
of atoi() in dirty_log_perf_test.

> > +
> > +     cpu = strtok(cpu_list, delim);
> > +     while (cpu) {
> > +             cpu_num = parse_num(cpu);
> > +             TEST_ASSERT(cpu_num >= 0, "Invalid cpu number: %d\n", cpu_num);
> > +             vcpu_to_lcpu_map[i++] = cpu_num;
> > +             cpu = strtok(NULL, delim);
> > +     }
> > +
> > +     free(cpu_list);
>
> The tokenization and parsing is nearly identical between parse_cpu_list() and
> assign_dirty_log_perf_test_cpu().  The code can be made into a common helper by
> passing in the destination, e.g.
>
> static int parse_cpu_list(const char *arg, cpu_set_t *cpuset, int *vcpu_map)
> {
>         const char delim[] = ",";
>         char *cpustr, *cpu_list;
>         int i = 0, cpu;
>
>         TEST_ASSERT(!!cpuset ^ !!vcpu_map);
>
>         cpu_list = strdup(arg);
>         TEST_ASSERT(cpu_list, "Low memory\n");
>
>         cpustr = strtok(cpu_list, delim);
>         while (cpustr) {
>                 cpu = atoi(cpustr);
>                 TEST_ASSERT(cpu >= 0, "Invalid cpu number: %d\n", cpu);
>                 if (vcpu_map)
>                         vcpu_to_lcpu_map[i++] = cpu_num;
>                 else
>                         CPU_SET(cpu_num, cpuset);
>                 cpu = strtok(NULL, delim);
>         }
>
>         free(cpu_list);
>
>         return i;
> }
>

Yeah, it was either my almost duplicate functions or have the one
function do two things via if-else.  I am not happy with both
approaches.

I think I will pass an integer array which this parsing function will
fill up and return an int denoting how many elements were filled. The
caller then can use the array as they wish, to copy it in
vcpu_to_lcpu_map or cpuset.

> > @@ -383,6 +450,26 @@ static void help(char *name)
> >       backing_src_help("-s");
> >       printf(" -x: Split the memory region into this number of memslots.\n"
> >              "     (default: 1)\n");
> > +     printf(" -c: Comma separated values of the logical CPUs which will run\n"
> > +            "     the vCPUs. Number of values should be equal to the number\n"
> > +            "     of vCPUs.\n\n"
> > +            "     Example: ./dirty_log_perf_test -v 3 -c 22,43,1\n"
> > +            "     This means that the vcpu 0 will run on the logical cpu 22,\n"
> > +            "     vcpu 1 on the logical cpu 43 and vcpu 2 on the logical cpu 1.\n"
> > +            "     (default: No cpu mapping)\n\n");
> > +     printf(" -d: Comma separated values of the logical CPUs on which\n"
> > +            "     dirty_log_perf_test will run. Without -c option, all of\n"
> > +            "     the vcpus and main process will run on the cpus provided here.\n"
> > +            "     This option also accepts a single cpu. (default: No cpu mapping)\n\n"
> > +            "     Example 1: ./dirty_log_perf_test -v 3 -c 22,43,1 -d 101\n"
> > +            "     Main application thread will run on logical cpu 101 and\n"
> > +            "     vcpus will run on the logical cpus 22, 43 and 1\n\n"
> > +            "     Example 2: ./dirty_log_perf_test -v 3 -d 101\n"
> > +            "     Main application thread and vcpus will run on the logical\n"
> > +            "     cpu 101\n\n"
> > +            "     Example 3: ./dirty_log_perf_test -v 3 -d 101,23,53\n"
> > +            "     Main application thread and vcpus will run on logical cpus\n"
> > +            "     101, 23 and 53.\n");
> >       puts("");
> >       exit(0);
> >  }
>
> > @@ -455,6 +550,13 @@ int main(int argc, char *argv[])
> >               }
> >       }
> >
>
> I wonder if we should make -c and -d mutually exclusive.  Tweak -c to include the
> application thread, i.e. TEST_ASSERT(nr_lcpus == nr_vcpus+1) and require 1:1 pinning
> for all tasks.  E.g. allowing "-c ..., -d 0,1,22" seems unnecessary.
>

One downside I can think of will be if we add some worker threads
which are not vcpus then all of those threads will end up running on a
single cpu unless we edit this parsing logic again.

Current implementation gives vcpus special treatment via -c and for
the whole application via -d. This gives good separation of concerns
via flags.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus
  2022-08-17 18:16   ` Vipin Sharma
@ 2022-08-17 21:29     ` Sean Christopherson
  2022-08-18 17:58       ` Vipin Sharma
  0 siblings, 1 reply; 7+ messages in thread
From: Sean Christopherson @ 2022-08-17 21:29 UTC (permalink / raw)
  To: Vipin Sharma; +Cc: dmatlack, pbonzini, kvm, linux-kernel

On Wed, Aug 17, 2022, Vipin Sharma wrote:
> On Wed, Aug 17, 2022 at 10:25 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > > +static int parse_num(const char *num_str)
> > > +{
> > > +     int num;
> > > +     char *end_ptr;
> > > +
> > > +     errno = 0;
> > > +     num = (int)strtol(num_str, &end_ptr, 10);
> > > +     TEST_ASSERT(num_str != end_ptr && *end_ptr == '\0',
> > > +                 "Invalid number string.\n");
> > > +     TEST_ASSERT(errno == 0, "Conversion error: %d\n", errno);
> >
> > Is the paranoia truly necessary?  What happens if parse_cpu_list() simply uses
> > atoi() and is passed garbage?
> 
> On error atoi() returns 0, which is also a valid logical cpu number.

Lame.

> We need error checking here to make sure that the user really wants
> cpu 0 and it was not a mistake in typing.
> I was thinking of using parse_num API for other places as well instead
> of atoi() in dirty_log_perf_test.

Yes, definitely.  And maybe give it a name like atoi_paranoid()?

> Yeah, it was either my almost duplicate functions or have the one
> function do two things via if-else.  I am not happy with both
> approaches.
> 
> I think I will pass an integer array which this parsing function will
> fill up and return an int denoting how many elements were filled. The
> caller then can use the array as they wish, to copy it in
> vcpu_to_lcpu_map or cpuset.

Eh, I doubt that'll be a net improvement, e.g. the CPUSET case will then need to
re-loop, which seems silly.  If the exclusive cpuset vs. array is undesirable, we
could have the API require at least one instead of exactly one, i.e.

	TEST_ASSERT(cpuset || vcpu_map);

	...

                cpu = atoi(cpustr);
                TEST_ASSERT(cpu >= 0, "Invalid cpu number: %d\n", cpu);
                if (vcpu_map)
                        vcpu_map[i++] = cpu;
                if (cpuset)
                        CPU_SET(cpu, cpuset);
 
If we somehow end up with a third type of destination, then we can revisit this,
but that seems unlikely at this point.

> > > @@ -383,6 +450,26 @@ static void help(char *name)
> > >       backing_src_help("-s");
> > >       printf(" -x: Split the memory region into this number of memslots.\n"
> > >              "     (default: 1)\n");
> > > +     printf(" -c: Comma separated values of the logical CPUs which will run\n"
> > > +            "     the vCPUs. Number of values should be equal to the number\n"
> > > +            "     of vCPUs.\n\n"
> > > +            "     Example: ./dirty_log_perf_test -v 3 -c 22,43,1\n"
> > > +            "     This means that the vcpu 0 will run on the logical cpu 22,\n"
> > > +            "     vcpu 1 on the logical cpu 43 and vcpu 2 on the logical cpu 1.\n"
> > > +            "     (default: No cpu mapping)\n\n");
> > > +     printf(" -d: Comma separated values of the logical CPUs on which\n"
> > > +            "     dirty_log_perf_test will run. Without -c option, all of\n"
> > > +            "     the vcpus and main process will run on the cpus provided here.\n"
> > > +            "     This option also accepts a single cpu. (default: No cpu mapping)\n\n"
> > > +            "     Example 1: ./dirty_log_perf_test -v 3 -c 22,43,1 -d 101\n"
> > > +            "     Main application thread will run on logical cpu 101 and\n"
> > > +            "     vcpus will run on the logical cpus 22, 43 and 1\n\n"
> > > +            "     Example 2: ./dirty_log_perf_test -v 3 -d 101\n"
> > > +            "     Main application thread and vcpus will run on the logical\n"
> > > +            "     cpu 101\n\n"
> > > +            "     Example 3: ./dirty_log_perf_test -v 3 -d 101,23,53\n"
> > > +            "     Main application thread and vcpus will run on logical cpus\n"
> > > +            "     101, 23 and 53.\n");
> > >       puts("");
> > >       exit(0);
> > >  }
> >
> > > @@ -455,6 +550,13 @@ int main(int argc, char *argv[])
> > >               }
> > >       }
> > >
> >
> > I wonder if we should make -c and -d mutually exclusive.  Tweak -c to include the
> > application thread, i.e. TEST_ASSERT(nr_lcpus == nr_vcpus+1) and require 1:1 pinning
> > for all tasks.  E.g. allowing "-c ..., -d 0,1,22" seems unnecessary.
> >
> 
> One downside I can think of will be if we add some worker threads
> which are not vcpus then all of those threads will end up running on a
> single cpu unless we edit this parsing logic again.

But adding worker threads also requires a code change, i.e. it won't require a
separate commit/churn.  And if we get to the point where we want multiple workers,
it should be relatively straightforward to support pinning an arbitrary number of
workers, e.g.

	enum memtest_worker_type {
		MAIN_WORKER,
		MINION_1,
		MINION_2,
		NR_MEMTEST_WORKERS,
	}


	TEST_ASSERT(nr_lcpus == nr_vcpus + NR_MEMTEST_WORKERS);

void spawn_worker(enum memtest_worker_type type, <function pointer>)
{
	cpu_set_t cpuset;

	CPU_ZERO(&cpuset);
	CPU_SET(task_map[nr_vcpus + type], &cpuset);

	<set affinity and spawn>
}

> Current implementation gives vcpus special treatment via -c and for
> the whole application via -d. This gives good separation of concerns
> via flags.

But they aren't separated, e.g. using -d without -c means vCPUs are thrown into
the same pool as worker threads.  And if we ever do add more workers, -d doesn't
allow the user to pin workers 1:1 with logical CPUs.

Actually, if -c is extended to pin workers, then adding -d is unnecessary.  If the
user wants to affine tasks to CPUs but not pin 1:1, it can do that with e.g. taskset.
What the user can't do is pin 1:1.

If we don't want to _require_ the caller to pin the main worker, then we could do

	TEST_ASSERT(nr_lcpus >= nr_vcpus &&
		    nr_lcpus <= nr_vcpus + NR_MEMTEST_WORKERS);

to _require_ pinning all vCPUs, and allow but not require pinning non-vCPU tasks.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus
  2022-08-17 21:29     ` Sean Christopherson
@ 2022-08-18 17:58       ` Vipin Sharma
  2022-08-18 18:10         ` Sean Christopherson
  0 siblings, 1 reply; 7+ messages in thread
From: Vipin Sharma @ 2022-08-18 17:58 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: dmatlack, pbonzini, kvm, linux-kernel

On Wed, Aug 17, 2022 at 2:29 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Aug 17, 2022, Vipin Sharma wrote:
> > On Wed, Aug 17, 2022 at 10:25 AM Sean Christopherson <seanjc@google.com> wrote:

> > We need error checking here to make sure that the user really wants
> > cpu 0 and it was not a mistake in typing.
> > I was thinking of using parse_num API for other places as well instead
> > of atoi() in dirty_log_perf_test.
>
> Yes, definitely.  And maybe give it a name like atoi_paranoid()?

Lol. Absolutely, if that's what you want!

>
> > Yeah, it was either my almost duplicate functions or have the one
> > function do two things via if-else.  I am not happy with both
> > approaches.
> >
> > I think I will pass an integer array which this parsing function will
> > fill up and return an int denoting how many elements were filled. The
> > caller then can use the array as they wish, to copy it in
> > vcpu_to_lcpu_map or cpuset.
>
> Eh, I doubt that'll be a net improvement, e.g. the CPUSET case will then need to
> re-loop, which seems silly.  If the exclusive cpuset vs. array is undesirable, we
> could have the API require at least one instead of exactly one, i.e.
>
>         TEST_ASSERT(cpuset || vcpu_map);
>
>         ...
>
>                 cpu = atoi(cpustr);
>                 TEST_ASSERT(cpu >= 0, "Invalid cpu number: %d\n", cpu);
>                 if (vcpu_map)
>                         vcpu_map[i++] = cpu;
>                 if (cpuset)
>                         CPU_SET(cpu, cpuset);
>
> If we somehow end up with a third type of destination, then we can revisit this,
> but that seems unlikely at this point.
>

I am removing the -d option, so this is not needed anymore.


> > > I wonder if we should make -c and -d mutually exclusive.  Tweak -c to include the
> > > application thread, i.e. TEST_ASSERT(nr_lcpus == nr_vcpus+1) and require 1:1 pinning
> > > for all tasks.  E.g. allowing "-c ..., -d 0,1,22" seems unnecessary.
> > >
> >
> > One downside I can think of will be if we add some worker threads
> > which are not vcpus then all of those threads will end up running on a
> > single cpu unless we edit this parsing logic again.
>
> But adding worker threads also requires a code change, i.e. it won't require a
> separate commit/churn.  And if we get to the point where we want multiple workers,
> it should be relatively straightforward to support pinning an arbitrary number of
> workers, e.g.
>
>         enum memtest_worker_type {
>                 MAIN_WORKER,
>                 MINION_1,
>                 MINION_2,
>                 NR_MEMTEST_WORKERS,
>         }
>
>
>         TEST_ASSERT(nr_lcpus == nr_vcpus + NR_MEMTEST_WORKERS);
>
> void spawn_worker(enum memtest_worker_type type, <function pointer>)
> {
>         cpu_set_t cpuset;
>
>         CPU_ZERO(&cpuset);
>         CPU_SET(task_map[nr_vcpus + type], &cpuset);
>
>         <set affinity and spawn>
> }
>
> > Current implementation gives vcpus special treatment via -c and for
> > the whole application via -d. This gives good separation of concerns
> > via flags.
>
> But they aren't separated, e.g. using -d without -c means vCPUs are thrown into
> the same pool as worker threads.  And if we ever do add more workers, -d doesn't
> allow the user to pin workers 1:1 with logical CPUs.
>
> Actually, if -c is extended to pin workers, then adding -d is unnecessary.  If the
> user wants to affine tasks to CPUs but not pin 1:1, it can do that with e.g. taskset.
> What the user can't do is pin 1:1.
>
> If we don't want to _require_ the caller to pin the main worker, then we could do
>
>         TEST_ASSERT(nr_lcpus >= nr_vcpus &&
>                     nr_lcpus <= nr_vcpus + NR_MEMTEST_WORKERS);
>
> to _require_ pinning all vCPUs, and allow but not require pinning non-vCPU tasks.

Okay, I will remove -d and only keep -c. I will extend it to support
pinning the main worker and vcpus. Arguments to -c will be like:
<main woker lcpu>, <vcpu0's lcpu>, <vcpu1's lcpu>, <vcpu2's lcpu>,...
Example:
./dirty_log_perf_test -v 3 -c 1,20,21,22

Main worker will run on 1 and 3 vcpus  will run on logical cpus 20, 21 and 22.

Sounds good?

Thanks
Vipin

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus
  2022-08-18 17:58       ` Vipin Sharma
@ 2022-08-18 18:10         ` Sean Christopherson
  2022-08-18 18:28           ` Vipin Sharma
  0 siblings, 1 reply; 7+ messages in thread
From: Sean Christopherson @ 2022-08-18 18:10 UTC (permalink / raw)
  To: Vipin Sharma; +Cc: dmatlack, pbonzini, kvm, linux-kernel

On Thu, Aug 18, 2022, Vipin Sharma wrote:
> On Wed, Aug 17, 2022 at 2:29 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Wed, Aug 17, 2022, Vipin Sharma wrote:
> > > On Wed, Aug 17, 2022 at 10:25 AM Sean Christopherson <seanjc@google.com> wrote:
> 
> > > We need error checking here to make sure that the user really wants
> > > cpu 0 and it was not a mistake in typing.
> > > I was thinking of using parse_num API for other places as well instead
> > > of atoi() in dirty_log_perf_test.
> >
> > Yes, definitely.  And maybe give it a name like atoi_paranoid()?
> 
> Lol. Absolutely, if that's what you want!

The goal is to capture that it's effectively atoi(), but with checking.  E.g. most
developers will be familiar with atoi() and won't have to think too hard if they
see e.g. atoi_paranoid().  On the other hand, parse_num() leaves the reader
wondering if it's parsing hex, decimal, octal, etc..., why selftests just doesn't
use atoi(), etc...

> Okay, I will remove -d and only keep -c. I will extend it to support
> pinning the main worker and vcpus. Arguments to -c will be like:
> <main woker lcpu>, <vcpu0's lcpu>, <vcpu1's lcpu>, <vcpu2's lcpu>,...
> Example:
> ./dirty_log_perf_test -v 3 -c 1,20,21,22
> 
> Main worker will run on 1 and 3 vcpus  will run on logical cpus 20, 21 and 22.

I think it makes sense to have the vCPUs be first.  That way vcpu0 => task_map[0],
and we can also extend the option in the future without breaking existing command
lines, e.g. to add more workers and/or redefine the behavior of the "trailing"
numbers to say that all workers are affined to those CPUs (to allow sequestering
the main worker from vCPUs without pinning it to a single CPU).

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus
  2022-08-18 18:10         ` Sean Christopherson
@ 2022-08-18 18:28           ` Vipin Sharma
  0 siblings, 0 replies; 7+ messages in thread
From: Vipin Sharma @ 2022-08-18 18:28 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: dmatlack, pbonzini, kvm, linux-kernel

On Thu, Aug 18, 2022 at 11:10 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Aug 18, 2022, Vipin Sharma wrote:
> > On Wed, Aug 17, 2022 at 2:29 PM Sean Christopherson <seanjc@google.com> wrote:
>
> > Okay, I will remove -d and only keep -c. I will extend it to support
> > pinning the main worker and vcpus. Arguments to -c will be like:
> > <main woker lcpu>, <vcpu0's lcpu>, <vcpu1's lcpu>, <vcpu2's lcpu>,...
> > Example:
> > ./dirty_log_perf_test -v 3 -c 1,20,21,22
> >
> > Main worker will run on 1 and 3 vcpus  will run on logical cpus 20, 21 and 22.
>
> I think it makes sense to have the vCPUs be first.  That way vcpu0 => task_map[0],
> and we can also extend the option in the future without breaking existing command
> lines, e.g. to add more workers and/or redefine the behavior of the "trailing"
> numbers to say that all workers are affined to those CPUs (to allow sequestering
> the main worker from vCPUs without pinning it to a single CPU).

Make sense. vcpus first followed by worker cpus.

Thanks

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-08-18 18:29 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-08-17 15:29 [PATCH] KVM: selftests: Run dirty_log_perf_test on specific cpus Vipin Sharma
2022-08-17 17:24 ` Sean Christopherson
2022-08-17 18:16   ` Vipin Sharma
2022-08-17 21:29     ` Sean Christopherson
2022-08-18 17:58       ` Vipin Sharma
2022-08-18 18:10         ` Sean Christopherson
2022-08-18 18:28           ` Vipin Sharma

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).