From mboxrd@z Thu Jan  1 00:00:00 1970
From: Alex =?utf-8?Q?Benn=C3=A9e?= <alex.bennee@linaro.org>
Subject: Re: [kvm-unit-tests RFC PATCH] arm/tlbflush.c: TLB flushing torture test [DEV]
Date: Mon, 27 Jul 2015 10:07:57 +0100
Message-ID: <87zj2im0ia.fsf@linaro.org>
References: <1437744306-7911-1-git-send-email-alex.bennee@linaro.org> <20150727075411.GA3758@hawk.localdomain>
Mime-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: QUOTED-PRINTABLE
Cc: mttcg@greensocs.com, mark.burton@greensocs.com,
	fred.konrad@greensocs.com, a.spyridakis@virtualopensystems.com,
	kvm@vger.kernel.org
To: Andrew Jones <drjones@redhat.com>
Return-path: <kvm-owner@vger.kernel.org>
Received: from mail-wi0-f182.google.com ([209.85.212.182]:34091 "EHLO
	mail-wi0-f182.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751217AbbG0JIB (ORCPT <rfc822;kvm@vger.kernel.org>);
	Mon, 27 Jul 2015 05:08:01 -0400
Received: by wibud3 with SMTP id ud3so130909309wib.1
        for <kvm@vger.kernel.org>; Mon, 27 Jul 2015 02:08:00 -0700 (PDT)
In-reply-to: <20150727075411.GA3758@hawk.localdomain>
Sender: kvm-owner@vger.kernel.org
List-ID: <kvm.vger.kernel.org>


Andrew Jones <drjones@redhat.com> writes:

> On Fri, Jul 24, 2015 at 02:25:06PM +0100, Alex Benn=C3=A9e wrote:
>> This adds a fairly brain dead torture test for TLB flushes intended =
for
>> stressing the MTTCG QEMU build. It takes the usual -smp option for
>> multiple CPUs.
>>=20
>> By default it will do a TLBIALL flush after each cycle. If you pass
>> -append "page" to the kernel it will take it in turns to flush each =
of
>> the computation functions. At the moment it doesn't do any re-mappin=
g of
>> pages but maybe that is something that could be done in the future.
>>=20
>> [DEV VERSION FOR COMMENT]
>> Signed-off-by: Alex Benn=C3=A9e <alex.bennee@linaro.org>
>> ---
>>  arm/tlbflush.c               | 163 ++++++++++++++++++++++++++++++++=
+++++++++++
>>  config/config-arm-common.mak |   4 +-
>>  lib/arm/asm/mmu.h            |  11 +++
>>  3 files changed, 177 insertions(+), 1 deletion(-)
>>  create mode 100644 arm/tlbflush.c
>>=20
>> diff --git a/arm/tlbflush.c b/arm/tlbflush.c
>> new file mode 100644
>> index 0000000..6eeff18
>> --- /dev/null
>> +++ b/arm/tlbflush.c
>> @@ -0,0 +1,163 @@
>> +#include <libcflat.h>
>> +#include <asm/smp.h>
>> +#include <asm/cpumask.h>
>> +#include <asm/barrier.h>
>> +#include <asm/mmu.h>
>> +
>> +#define SEQ_LENGTH 10
>> +
>> +static cpumask_t smp_test_complete;
>> +static int flush_count =3D 100000;
>> +static int flush_self =3D 1;
>> +static int flush_page =3D 0;
>> +
>> +__attribute__((aligned(0x1000))) unsigned int hash_array(int length=
, unsigned int *array)
>
> You should use PAGE_SIZE instead of 0x1000 in these attributes, allow=
ing
> the test to also work for aarch64, as we're using 64k pages on
> aarch64.

Good point.

>
>> +{
>> +	int i;
>> +	unsigned int sum=3D0;
>> +	for (i=3D0; i<length; i++)
>> +	{
>> +		unsigned int val =3D *array++;
>> +		sum ^=3D val;
>> +		sum ^=3D (val >> (val % 16));
>> +		sum ^=3D (val << (val % 32));
>> +	}
>> +
>> +	return sum;
>> +}
>> +
>> +__attribute__((aligned(0x1000))) void create_fib_sequence(int lengt=
h, unsigned int *array)
>> +{
>> +	int i;
>> +
>> +	/* first two values */
>> +	array[0] =3D 0;
>> +	array[1] =3D 1;
>> +	for (i=3D2; i<length; i++)
>> +	{
>> +		array[i] =3D array[i-2] + array[i-1];
>> +	}
>> +}
>> +
>> +__attribute__((aligned(0x1000))) unsigned long long factorial(unsig=
ned int n)
>> +{
>> +	unsigned int i;
>> +	unsigned long long fac =3D 1;
>> +	for (i=3D1; i<=3Dn; i++)
>> +	{
>> +		fac =3D fac * i;
>> +	}
>> +	return fac;
>> +}
>> +
>> +/* do some computationally expensive stuff, return a checksum of th=
e
>> + * results */
>> +__attribute__((aligned(0x1000))) unsigned int do_computation(void)
>> +{
>> +	unsigned int fib_array[SEQ_LENGTH];
>> +	unsigned long long facfib_array[SEQ_LENGTH];
>> +	unsigned int fib_hash, facfib_hash;
>> +	int cpu =3D smp_processor_id();
>> +	int i, j;
>> +=09
>> +	create_fib_sequence(SEQ_LENGTH, &fib_array[0]);
>> +	fib_hash =3D hash_array(SEQ_LENGTH, &fib_array[0]);
>> +	for (i=3D0; i<SEQ_LENGTH; i++) {
>> +		for (j=3D0; j<fib_array[i]; j++) {
>> +			facfib_array[i] =3D factorial(fib_array[i]+j);
>> +		}
>> +	}
>> +	facfib_hash =3D 0;
>> +	for (i=3D0; i<SEQ_LENGTH; i++) {
>> +		for (j=3D0; j<fib_array[i]; j++) {
>> +			facfib_hash ^=3D hash_array(sizeof(facfib_array)/sizeof(unsigned=
 int), (unsigned int *)&facfib_array[0]);
>> +		}
>> +	}
>> +
>> +#if 0
>> +	printf("CPU:%d FIBSEQ ", cpu);
>> +	for (i=3D0; i<SEQ_LENGTH; i++)
>> +		printf("%u,", fib_array[i]);
>> +	printf("\n");
>> +
>> +	printf("CPU:%d FACFIB ", cpu);
>> +	for (i=3D0; i<SEQ_LENGTH; i++)
>> +		printf("%llu,", facfib_array[i]);
>> +	printf("\n");
>> +#endif
>> +=09
>> +	return (fib_hash ^ facfib_hash);
>> +}
>> +
>> +static void * pages[] =3D {&hash_array, &create_fib_sequence, &fact=
orial, &do_computation};
>
> I can't comment on whether or not the complexity of do_computation is
> necessary for your test, but it seems like overkill. Comments explain=
ing
> why it's necessary would be good.

OK. From QEMU's TCG point of view I just want to ensure I have more than=
 two
basic blocks per-page region so I can check the block-chaining in-page
and jump caching intra-page which are both affected on flushes. A
computationally complex routine with a known answer would be nicer
though I guess.

>
>> +
>> +static void test_flush(void)
>> +{
>> +	int i, errors =3D 0;
>> +	int cpu =3D smp_processor_id();
>> +
>> +	unsigned int ref;
>> +
>> +	printf("CPU%d online\n", cpu);
>> +
>> +	ref =3D do_computation();
>
> What makes you sure that the first time you do the computation
> per cpu is correct? I think computing it externally, and saving
> the result, i.e.=20
>
> #define EXPECTED_RESULT 0x12345678
>
> would be more reliable.

OK.

>
>> +
>> +	for (i=3D0; i < flush_count; i++) {
>> +		unsigned int this_ref =3D do_computation();
>> +
>> +		if (this_ref !=3D ref) {
>> +			errors++;
>> +			printf("CPU%d: seq%d 0x%x!=3D0x%x\n",
>> +				cpu, i, ref, this_ref);
>> +		}
>> +
>> +		if ((i % 1000) =3D=3D 0) {
>> +			printf("CPU%d: seq%d\n", cpu, i);
>> +		}
>> +	=09
>> +		if (flush_self) {
>> +			if (flush_page) {
>> +				int j =3D (i % (sizeof(pages)/sizeof(void *)));
> libcflat.h has the ARRAY_SIZE macro

OK

>> +				flush_tlb_page((unsigned long)pages[j]);
>> +			} else {
>> +				flush_tlb_all();
>> +			}
>> +		}
>> +	}
>> +
>> +	report("CPU%d: Done - Errors: %d\n", errors =3D=3D 0, cpu, errors)=
;
>> +
>> +	cpumask_set_cpu(cpu, &smp_test_complete);
>> +	if (cpu !=3D 0)
>> +		halt();
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> +	int cpu, i;
>> +=09
>> +	report_prefix_push("tlbflush");
>> +
>> +	for (i=3D0; i<argc; i++) {
>> +		char *arg =3D argv[i];
>> +/* 		printf("arg:%d:%s\n", i, arg); */
>> +
>> +		if (strcmp(arg, "page") =3D=3D 0) {
>> +			report_prefix_push("page");
>> +			flush_page =3D 1;
>> +		}
>> +	}
>> +
>> +	for_each_present_cpu(cpu) {
>> +		if (cpu =3D=3D 0)
>> +			continue;
>> +		smp_boot_secondary(cpu, test_flush);
>> +	}
>> +
>> +	test_flush();
>> +
>> +	while (!cpumask_full(&smp_test_complete))
>> +		cpu_relax();
>> +
>> +	return report_summary();
>
> As we use the kernel coding style you should run
>
> $KERNEL_SRC/scripts/checkpatch.pl -f arm/tlbflush.c
>
> Also, please rename to tlbflush-test.c to differentiate it
> from an implementation of tlbflush support, and to make
> the standalone test name (if we commit those patches) more
> descriptive.

I'll have another poke at my editor config. It should have been setting
the coding style automatically, although of course explicit local
variables are better ;-)

>
>> +}
>> diff --git a/config/config-arm-common.mak b/config/config-arm-common=
=2Emak
>> index 0674daa..5b14db4 100644
>> --- a/config/config-arm-common.mak
>> +++ b/config/config-arm-common.mak
>> @@ -11,7 +11,8 @@ endif
>> =20
>>  tests-common =3D \
>>  	$(TEST_DIR)/selftest.flat \
>> -	$(TEST_DIR)/spinlock-test.flat
>> +	$(TEST_DIR)/spinlock-test.flat \
>> +        $(TEST_DIR)/tlbflush.flat
>
> As we're adding tests faster now it's becoming clear that the '\' lis=
t
> isn't so great. To add a new test at the bottom we always have to mod=
ify
> the last line too. We should either add the new one at the top (right
> below the 'test-common =3D' line), or change this to a '+=3D' sequenc=
e like
> some other lists are done.
>
>> =20
>>  all: test_cases
>> =20
>> @@ -72,3 +73,4 @@ test_cases: $(generated_files) $(tests-common) $(t=
ests)
>> =20
>>  $(TEST_DIR)/selftest.elf: $(cstart.o) $(TEST_DIR)/selftest.o
>>  $(TEST_DIR)/spinlock-test.elf: $(cstart.o) $(TEST_DIR)/spinlock-tes=
t.o
>> +$(TEST_DIR)/tlbflush.elf: $(cstart.o) $(TEST_DIR)/tlbflush.o
>> diff --git a/lib/arm/asm/mmu.h b/lib/arm/asm/mmu.h
>> index c1bd01c..2bb0cde 100644
>> --- a/lib/arm/asm/mmu.h
>> +++ b/lib/arm/asm/mmu.h
>> @@ -14,8 +14,11 @@
>>  #define PTE_AF			PTE_EXT_AF
>>  #define PTE_WBWA		L_PTE_MT_WRITEALLOC
>> =20
>> +/* See B3.18.7 TLB maintenance operations */
>> +
>>  static inline void local_flush_tlb_all(void)
>>  {
>> +	/* TLBIALL */
>>  	asm volatile("mcr p15, 0, %0, c8, c7, 0" :: "r" (0));
>>  	dsb();
>>  	isb();
>> @@ -27,6 +30,14 @@ static inline void flush_tlb_all(void)
>>  	local_flush_tlb_all();
>>  }
>> =20
>> +static inline void flush_tlb_page(unsigned long vaddr)
>> +{
>> +	/* TLBIMVAA */
>> +	asm volatile("mcr p15, 0, %0, c8, c7, 3" :: "r" (vaddr));
>> +	dsb();
>> +	isb();
>> +}
>> +
>>  #include <asm/mmu-api.h>
>> =20
>>  #endif /* __ASMARM_MMU_H_ */
>
> This mmu.h change looks good, but please add the arm64
> flush_tlb_page at the same time. And anyway, I guess you'll
> want your test to work for both arm and aarch64?

Yes, I will. Currently MTTCG is arm32-only, but this will be expanded=
=2E

>
> Thanks,
> drew

--=20
Alex Benn=C3=A9e