From: Yonghong Song <yhs@fb.com>
To: Mykola Lysenko <mykolal@fb.com>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>,
bpf <bpf@vger.kernel.org>, Alexei Starovoitov <ast@kernel.org>,
Andrii Nakryiko <andrii@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>
Subject: Re: [PATCH bpf-next] Improve BPF test stability (related to perf events and scheduling)
Date: Tue, 1 Mar 2022 20:53:21 -0800 [thread overview]
Message-ID: <ab6970fc-eaa0-9462-9614-893085c42331@fb.com> (raw)
In-Reply-To: <8A284AAF-84BA-48A4-BF4D-7BDB2426DAB8@fb.com>
On 2/28/22 7:45 PM, Mykola Lysenko wrote:
>
>
>> On Feb 22, 2022, at 8:32 PM, Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 2/22/22 7:13 PM, Andrii Nakryiko wrote:
>>> On Tue, Feb 22, 2022 at 12:35 PM Mykola Lysenko <mykolal@fb.com> wrote:
>>>>
>>>> Thanks for the review Andrii!
>>>>
>>>>> On Feb 19, 2022, at 8:39 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>>>>>
>>>>> On Fri, Feb 18, 2022 at 4:30 PM Mykola Lysenko <mykolal@fb.com> wrote:
>>>>>>
>>>>>> In send_signal, replace sleep with a dummy cpu-intensive computation
>>>>>> to increase the probability of the child process being scheduled. Add
>>>>>> a few more asserts.
>>>>>>
>>>>>> In find_vma, reduce sample_freq as higher values may be rejected in
>>>>>> some qemu setups, remove usleep and increase length of cpu intensive
>>>>>> computation.
>>>>>>
>>>>>> In bpf_cookie, perf_link and perf_branches, reduce sample_freq as
>>>>>> higher values may be rejected in some qemu setups.
>>>>>>
>>>>>> Signed-off-by: Mykola Lysenko <mykolal@fb.com>
>>>>>> ---
>>>>>> .../testing/selftests/bpf/prog_tests/bpf_cookie.c | 2 +-
>>>>>> tools/testing/selftests/bpf/prog_tests/find_vma.c | 5 ++---
>>>>>> .../selftests/bpf/prog_tests/perf_branches.c | 4 ++--
>>>>>> tools/testing/selftests/bpf/prog_tests/perf_link.c | 2 +-
>>>>>> .../testing/selftests/bpf/prog_tests/send_signal.c | 14 ++++++++++----
>>>>>> 5 files changed, 16 insertions(+), 11 deletions(-)
>>>>>>
>>>>>> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
>>>>>> index cd10df6cd0fc..0612e79a9281 100644
>>>>>> --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
>>>>>> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
>>>>>> @@ -199,7 +199,7 @@ static void pe_subtest(struct test_bpf_cookie *skel)
>>>>>> attr.type = PERF_TYPE_SOFTWARE;
>>>>>> attr.config = PERF_COUNT_SW_CPU_CLOCK;
>>>>>> attr.freq = 1;
>>>>>> - attr.sample_freq = 4000;
>>>>>> + attr.sample_freq = 1000;
>>>>>> pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
>>>>>> if (!ASSERT_GE(pfd, 0, "perf_fd"))
>>>>>> goto cleanup;
>>>>>> diff --git a/tools/testing/selftests/bpf/prog_tests/find_vma.c b/tools/testing/selftests/bpf/prog_tests/find_vma.c
>>>>>> index b74b3c0c555a..acc41223a112 100644
>>>>>> --- a/tools/testing/selftests/bpf/prog_tests/find_vma.c
>>>>>> +++ b/tools/testing/selftests/bpf/prog_tests/find_vma.c
>>>>>> @@ -30,7 +30,7 @@ static int open_pe(void)
>>>>>> attr.type = PERF_TYPE_HARDWARE;
>>>>>> attr.config = PERF_COUNT_HW_CPU_CYCLES;
>>>>>> attr.freq = 1;
>>>>>> - attr.sample_freq = 4000;
>>>>>> + attr.sample_freq = 1000;
>>>>>> pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC);
>>>>>>
>>>>>> return pfd >= 0 ? pfd : -errno;
>>>>>> @@ -57,7 +57,7 @@ static void test_find_vma_pe(struct find_vma *skel)
>>>>>> if (!ASSERT_OK_PTR(link, "attach_perf_event"))
>>>>>> goto cleanup;
>>>>>>
>>>>>> - for (i = 0; i < 1000000; ++i)
>>>>>> + for (i = 0; i < 1000000000; ++i)
>>>>>
>>>>> 1bln seems excessive... maybe 10mln would be enough?
>>>>
>>>> See explanation for send_signal test case below
>>>>
>>>>>
>>>>>> ++j;
>>>>>>
>>>>>> test_and_reset_skel(skel, -EBUSY /* in nmi, irq_work is busy */);
>>>>>
>>>>> [...]
>>>>>
>>>>>> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
>>>>>> index 776916b61c40..841217bd1df6 100644
>>>>>> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
>>>>>> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
>>>>>> @@ -4,11 +4,12 @@
>>>>>> #include <sys/resource.h>
>>>>>> #include "test_send_signal_kern.skel.h"
>>>>>>
>>>>>> -int sigusr1_received = 0;
>>>>>> +int sigusr1_received;
>>>>>> +volatile int volatile_variable;
>>>>>
>>>>> please make them static
>>>>
>>>> sure
>>>>
>>>>>
>>>>>>
>>>>>> static void sigusr1_handler(int signum)
>>>>>> {
>>>>>> - sigusr1_received++;
>>>>>> + sigusr1_received = 1;
>>>>>> }
>>>>>>
>>>>>> static void test_send_signal_common(struct perf_event_attr *attr,
>>>>>> @@ -42,7 +43,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>>>>> int old_prio;
>>>>>>
>>>>>> /* install signal handler and notify parent */
>>>>>> + errno = 0;
>>>>>> signal(SIGUSR1, sigusr1_handler);
>>>>>> + ASSERT_OK(errno, "signal");
>>>>>
>>>>> just ASSERT_OK(signal(...), "signal");
>>>>
>>>> I am fine with merging the signal and ASSERT lines, but will substitute the condition "signal(SIGUSR1, sigusr1_handler) != SIG_ERR". Sounds good?
>>>>
>>> Ah, signal is a bit special with return values. Yeah,
>>> ASSERT_NEQ(signal(...), SIG_ERR, "signal") sounds good.
>>>>>
>>>>>>
>>>>>> close(pipe_c2p[0]); /* close read */
>>>>>> close(pipe_p2c[1]); /* close write */
>>>>>> @@ -63,9 +66,12 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>>>>> ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
>>>>>>
>>>>>> /* wait a little for signal handler */
>>>>>> - sleep(1);
>>>>>> + for (int i = 0; i < 1000000000; i++)
>>>>>
>>>>> same about 1bln
>>>>
>>>> With 10mln and 100 test runs I got 86 failures
>>>> 100mln - 63 failures
>>>> 1bln - 0 failures on 100 runs
>>>>
>>>> Now, there is performance concern for this test. Running
>>>>
>>>> time sudo ./test_progs -t send_signal/send_signal_nmi_thread
>>>>
>>>> With 1bln takes ~4s
>>>> 100mln - 1s.
>>>> Unchanged test with sleep(1); takes ~2s.
>>>>
>>>> On the other hand, 300mln runs in ~2s and only fails 1 time per 100 runs. As 300mln does not regress performance compared to the current “sleep(1)” implementation, I propose to go with it. What do you think?
>>> I think if we need to burn multiple seconds of CPU to make the test
>>> reliable, then we should either rework or disable/remove the test. In
>>> CI those billions of iterations will be much slower. And even waiting
>>> for 4 seconds for just one test is painful.
>>> Yonghong, WDYT? Should we just drop this test? It has caused us a bunch
>>> of flakiness and maintenance burden without actually catching any
>>> issues. Maybe it's better to just get rid of it?
>>
>> Could we try to set affinity for the child process here?
>> See perf_branches.c:
>>
>> ...
>> /* generate some branches on cpu 0 */
>> CPU_ZERO(&cpu_set);
>> CPU_SET(0, &cpu_set);
>> err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
>> if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err))
>> goto out_destroy;
>> /* spin the loop for a while (random high number) */
>> for (i = 0; i < 1000000; ++i)
>> ++j;
>> ...
>>
>> Binding the process (single thread) to a particular cpu can
>> prevent other non-bound processes from migrating to this
>> cpu and boost the chance of the NMI triggering on this cpu.
>> This could be why perf_branches.c (and a few other tests)
>> does this.
>>
>> In the send_signal case, the cpu affinity should probably be
>> set to cpu 1, as cpu 0 has been pinned by previous tests for
>> the main process and I didn't see it 'unpinned'
>> (by setaffinity to ALL cpus).
>> This is inconvenient.
>>
>> So the following is my suggestion:
>> 1. abstract the above 'pthread_setaffinity_np to
>> a helper to set affinity to a particular cpu as
>> this function has been used in several cases.
>> 2. create a new helper to undo setaffinity (set cpu
>> mask to all available cpus) so we can pair it
>> with pthread_setaffinity_np helper in prog_tests/...
>> files.
>> 3. clean up prog_tests/... files which have pthread_setaffinity_np.
>> 4. use helpers 1/2 with loop bound 1000000 for the send_signal test.
>> The implementation here will be consistent with
>> other NMI tests. Hopefully the test can then pass
>> consistently, similar to the other NMI tests.
>>
>> WDYT?
>
> Hi Yonghong,
>
> I have tried this approach in the send_signal test without much success, unfortunately (with different CPUs and configuration options). It is required for the perf_branches test, though; I have yet to understand why.
Thanks for the experiments. I looked at the code again. Indeed,
pthread_setaffinity_np is not needed for the send_signal test. This is
because we use a perf_event_open pid/cpu config like below:
  pid > 0 and cpu == -1
This measures the specified process/thread on any CPU.
For perf_branches, pthread_setaffinity_np is needed since it uses
the perf_event_open pid/cpu config like below:
  pid == -1 and cpu >= 0
This measures all processes/threads on the specified CPU. This
requires the CAP_SYS_ADMIN capability or a
/proc/sys/kernel/perf_event_paranoid value of less than 1.
>
> In the V2 of this patch, I used a modified approach in which we stop crunching the volatile variable once the needed condition becomes true. I hope this will be an acceptable middle ground in this case.
My current setup uses qemu on a physical server and cannot reproduce
the issue. So I created another setup which runs qemu inside a VM and
can actually reproduce the issue. Replacing the sleep(1) with
  for (int i = 0; i < 1000000000; i++) /* 1 billion */
    j++; /* volatile int j */
seems to fix the issue. But your change
  for (int i = 0; i < 100000000 && !sigusr1_received; i++)
    volatile_variable /= i + 1;
works too; I tested it, and in most cases the time for the subtest
is 0.8x or 0.9x seconds. Sometimes it can be < 0.5 seconds, and
occasionally it may be 1.0x seconds. Overall, this is definitely
an improvement in both flakiness and runtime.
>
> Thanks!
>
>>
>>>>
>>>>>
>>>>>> + volatile_variable++;
>>>>>>
>>>>>> buf[0] = sigusr1_received ? '2' : '0';
>>>>>> + ASSERT_EQ(sigusr1_received, 1, "sigusr1_received");
>>>>>> +
>>>>>> ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
>>>>>>
>>>>>> /* wait for parent notification and exit */
>>>>>> @@ -110,9 +116,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>>>>> ASSERT_EQ(read(pipe_c2p[0], buf, 1), 1, "pipe_read");
>>>>>>
>>>>>> /* trigger the bpf send_signal */
>>>>>> + skel->bss->signal_thread = signal_thread;
>>>>>> skel->bss->pid = pid;
>>>>>> skel->bss->sig = SIGUSR1;
>>>>>> - skel->bss->signal_thread = signal_thread;
>>>>>>
>>>>>> /* notify child that bpf program can send_signal now */
>>>>>> ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
>>>>>> --
>>>>>> 2.30.2
>
Thread overview: 10+ messages
2022-02-19 0:30 [PATCH bpf-next] Improve BPF test stability (related to perf events and scheduling) Mykola Lysenko
2022-02-19 1:47 ` Song Liu
2022-02-22 20:00 ` Mykola Lysenko
2022-02-20 4:39 ` Andrii Nakryiko
2022-02-22 20:35 ` Mykola Lysenko
2022-02-23 3:13 ` Andrii Nakryiko
2022-02-23 4:32 ` Yonghong Song
2022-02-24 0:52 ` Mykola Lysenko
2022-03-01 3:45 ` Mykola Lysenko
2022-03-02 4:53 ` Yonghong Song [this message]