From: Geliang Tang <geliang.tang@suse.com>
To: Paolo Abeni <pabeni@redhat.com>
Cc: mptcp@lists.linux.dev
Subject: Re: [PATCH mptcp-next v8 13/17] selftests/bpf: Add bpf_burst scheduler
Date: Fri, 9 Jun 2023 21:32:36 +0800 [thread overview]
Message-ID: <20230609133236.GA30403@localhost> (raw)
In-Reply-To: <666004bda6ff9e3e6d65c6903c5b18f18f0e31ed.camel@redhat.com>
Hi Paolo,
On Fri, Jun 09, 2023 at 11:57:53AM +0200, Paolo Abeni wrote:
> On Thu, 2023-06-08 at 13:46 +0800, Geliang Tang wrote:
> > This patch implements the burst BPF MPTCP scheduler, named bpf_burst,
> > which is the default scheduler in protocol.c. bpf_burst_get_send() uses
> > the same logic as mptcp_subflow_get_send() and bpf_burst_get_retrans
> > uses the same logic as mptcp_subflow_get_retrans().
> >
> > Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> > ---
> > tools/testing/selftests/bpf/bpf_tcp_helpers.h | 5 +
> > .../selftests/bpf/progs/mptcp_bpf_burst.c | 208 ++++++++++++++++++
> > 2 files changed, 213 insertions(+)
> > create mode 100644 tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> >
> > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > index 44c356772798..927a00f663f2 100644
> > --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > @@ -36,6 +36,7 @@ enum sk_pacing {
> > struct sock {
> > struct sock_common __sk_common;
> > #define sk_state __sk_common.skc_state
> > + int sk_wmem_queued;
> > unsigned long sk_pacing_rate;
> > __u32 sk_pacing_status; /* see enum sk_pacing */
> > } __attribute__((preserve_access_index));
> > @@ -234,12 +235,15 @@ extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
> > #define MPTCP_SUBFLOWS_MAX 8
> >
> > struct mptcp_subflow_context {
> > + unsigned long avg_pacing_rate;
> > __u32 backup : 1;
> > + __u8 stale_count;
> > struct sock *tcp_sock; /* tcp sk backpointer */
> > } __attribute__((preserve_access_index));
> >
> > struct mptcp_sched_data {
> > struct sock *last_snd;
> > + int snd_burst;
> > bool reinject;
> > struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
> > } __attribute__((preserve_access_index));
> > @@ -260,6 +264,7 @@ struct mptcp_sched_ops {
> > struct mptcp_sock {
> > struct inet_connection_sock sk;
> >
> > + __u64 snd_nxt;
> > __u32 token;
> > struct sock *first;
> > char ca_name[TCP_CA_NAME_MAX];
> > diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> > new file mode 100644
> > index 000000000000..b860fd517787
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> > @@ -0,0 +1,208 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2023, SUSE. */
> > +
> > +#include <linux/bpf.h>
> > +#include <limits.h>
> > +#include "bpf_tcp_helpers.h"
> > +
> > +char _license[] SEC("license") = "GPL";
> > +
> > +#define MPTCP_SEND_BURST_SIZE 65428
> > +
> > +struct subflow_send_info {
> > + struct sock *ssk;
> > + __u64 linger_time;
> > +};
> > +
> > +static struct mptcp_subflow_context *
> > +bpf_mptcp_subflow_ctx(struct sock *ssk, struct mptcp_sched_data *data)
> > +{
> > + int i, nr = 0;
> > +
> > + for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
> > + if (!ssk || !data->contexts[i])
> > + break;
> > +
> > + if (mptcp_subflow_tcp_sock(data->contexts[i]) == ssk) {
> > + nr = i;
> > + break;
> > + }
> > + }
> > +
> > + return data->contexts[nr];
> > +}
> > +
> > +static inline __u64 div_u64_rem(__u64 dividend, __u32 divisor, __u32 *remainder)
> > +{
> > + *remainder = dividend % divisor;
> > + return dividend / divisor;
> > +}
> > +
> > +static inline __u64 div_u64(__u64 dividend, __u32 divisor)
> > +{
> > + __u32 remainder;
> > +
> > + return div_u64_rem(dividend, divisor, &remainder);
> > +}
> > +
> > +extern bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) __ksym;
> > +extern void mptcp_set_timeout(struct sock *sk) __ksym;
> > +extern __u64 mptcp_wnd_end(const struct mptcp_sock *msk) __ksym;
> > +extern bool bpf_sk_stream_memory_free(const struct sock *sk) __ksym;
> > +extern bool bpf_tcp_rtx_and_write_queues_empty(const struct sock *sk) __ksym;
> > +extern void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) __ksym;
> > +
> > +#define SSK_MODE_ACTIVE 0
> > +#define SSK_MODE_BACKUP 1
> > +#define SSK_MODE_MAX 2
> > +
> > +SEC("struct_ops/mptcp_sched_burst_init")
> > +void BPF_PROG(mptcp_sched_burst_init, const struct mptcp_sock *msk)
> > +{
> > +}
> > +
> > +SEC("struct_ops/mptcp_sched_burst_release")
> > +void BPF_PROG(mptcp_sched_burst_release, const struct mptcp_sock *msk)
> > +{
> > +}
> > +
> > +void BPF_STRUCT_OPS(bpf_burst_data_init, const struct mptcp_sock *msk,
> > + struct mptcp_sched_data *data)
> > +{
> > + mptcp_sched_data_set_contexts(msk, data);
> > +}
> > +
> > +static int bpf_burst_get_send(const struct mptcp_sock *msk,
> > + struct mptcp_sched_data *data)
> > +{
> > + struct subflow_send_info send_info[SSK_MODE_MAX];
> > + struct mptcp_subflow_context *subflow;
> > + struct sock *sk = (struct sock *)msk;
> > + __u32 pace, burst, wmem;
> > + int i, nr_active = 0;
> > + __u64 linger_time;
> > + struct sock *ssk;
> > +
> > + /* pick the subflow with the lower wmem/wspace ratio */
> > + for (i = 0; i < SSK_MODE_MAX; ++i) {
> > + send_info[i].ssk = NULL;
> > + send_info[i].linger_time = -1;
> > + }
> > +
> > + for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
> > + if (!data->contexts[i])
> > + break;
> > +
> > + subflow = data->contexts[i];
> > + ssk = mptcp_subflow_tcp_sock(subflow);
> > + if (!mptcp_subflow_active(subflow))
> > + continue;
> > +
> > + nr_active += !subflow->backup;
> > + pace = subflow->avg_pacing_rate;
> > + if (!pace) {
> > + /* init pacing rate from socket */
> > + subflow->avg_pacing_rate = ssk->sk_pacing_rate;
> > + pace = subflow->avg_pacing_rate;
> > + if (!pace)
> > + continue;
> > + }
> > +
> > + linger_time = div_u64((__u64)ssk->sk_wmem_queued << 32, pace);
> > + if (linger_time < send_info[subflow->backup].linger_time) {
> > + send_info[subflow->backup].ssk = ssk;
> > + send_info[subflow->backup].linger_time = linger_time;
> > + }
> > + }
> > + mptcp_set_timeout(sk);
> > +
> > + /* pick the best backup if no other subflow is active */
> > + if (!nr_active)
> > + send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
> > +
> > + /* Get the ssk directly like this "ssk = send_info[SSK_MODE_ACTIVE].ssk;"
> > + * will get an error:
> > + * arg#0 pointer type STRUCT sock must point to scalar, or struct with scalar
> > + * So here use bpf_mptcp_subflow_ctx() to get the subflow,
This comment needs to be updated:
/* Pass "send_info[SSK_MODE_ACTIVE].ssk" directly to bpf_sk_stream_memory_free()
* will get an error:
* arg#0 pointer type STRUCT sock must point to scalar, or struct with scalar
* So here pass it to bpf_mptcp_subflow_ctx() to get the subflow,
* then use mptcp_subflow_tcp_sock() to get the ssk,
* and pass the ssk to bpf_sk_stream_memory_free().
*/
>
> May I guess you get a similar error if you do:
>
> subflow = mptcp_subflow_ctx(ssk)
>
> ? (just out of sheer ignorance and curiosity)
Yes. It seems that accessing 'send_info[SSK_MODE_ACTIVE].ssk' is
considered unsafe in BPF context. So here we pass 'send_info[SSK_MODE_ACTIVE].ssk'
into bpf_mptcp_subflow_ctx() to find the related subflow. Then we access this
subflow instead of 'send_info[SSK_MODE_ACTIVE].ssk' below. This can make BPF happy.
>
> > + * then use mptcp_subflow_tcp_sock() to get the ssk.
> > + */
> > + subflow = mptcp_subflow_tcp_sock(send_info[SSK_MODE_ACTIVE].ssk, data);
> > + ssk = mptcp_subflow_tcp_sock(subflow);
>
> What if you store the 'subflow' pointer in 'send_info'? Will the
> verifier splat with that? and what if we store the corresponding
> context index 'i' instead?
Storing the 'subflow' or index 'i' don't work too.
> that we avoid the 'mptcp_subflow_tcp_sock()'
> call entirely.
No need to avoid "mptcp_subflow_tcp_sock", we need to access both
'subflow' and 'ssk' below.
Thanks,
-Geliang
>
> Please, don't send directly a new version, I think it would be better
> clarify this point before updating the code
>
> thanks!
>
> Paolo
>
next prev parent reply other threads:[~2023-06-09 13:32 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-06-08 5:46 [PATCH mptcp-next v8 00/17] BPF packet scheduler updates Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 01/17] Squash to "mptcp: drop last_snd and MPTCP_RESET_SCHEDULER" Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 02/17] Squash to "mptcp: add struct mptcp_sched_ops" Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 03/17] Squash to "mptcp: add scheduler wrappers" Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 04/17] mptcp: add last_snd in sched_data Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 05/17] mptcp: add snd_burst " Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 06/17] mptcp: register default scheduler Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 07/17] Squash to "bpf: Add bpf_mptcp_sched_ops" Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 08/17] Squash to "selftests/bpf: Add mptcp sched structs" Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 09/17] Squash to "selftests/bpf: Add bpf_rr scheduler" Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 10/17] mptcp: add two wrappers needed by bpf_burst Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 11/17] bpf: Add bpf_burst write accesses Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 12/17] bpf: Export more bpf_burst related functions Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 13/17] selftests/bpf: Add bpf_burst scheduler Geliang Tang
2023-06-09 9:57 ` Paolo Abeni
2023-06-09 13:32 ` Geliang Tang [this message]
2023-06-09 14:40 ` Paolo Abeni
2023-06-10 1:45 ` Geliang Tang
2023-06-12 11:05 ` Paolo Abeni
2023-06-12 13:29 ` Geliang Tang
2023-06-12 14:22 ` Paolo Abeni
2023-06-13 5:36 ` Geliang Tang
2023-06-13 9:35 ` Paolo Abeni
2023-06-13 12:32 ` Geliang Tang
2023-06-13 13:36 ` Paolo Abeni
2023-06-13 14:03 ` Geliang Tang
2023-06-13 14:18 ` Paolo Abeni
2023-06-08 5:46 ` [PATCH mptcp-next v8 14/17] selftests/bpf: Add bpf_burst test Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 15/17] bpf: Add subflow bit flags write accesses Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 16/17] selftests/bpf: Add bpf_stale scheduler Geliang Tang
2023-06-08 5:46 ` [PATCH mptcp-next v8 17/17] selftests/bpf: Add bpf_stale test Geliang Tang
2023-06-08 6:29 ` selftests/bpf: Add bpf_stale test: Build Failure MPTCP CI
2023-06-08 6:56 ` selftests/bpf: Add bpf_stale test: Tests Results MPTCP CI
2023-06-13 10:08 ` selftests/bpf: Add bpf_stale test: Build Failure MPTCP CI
2023-06-13 10:57 ` selftests/bpf: Add bpf_stale test: Tests Results MPTCP CI
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230609133236.GA30403@localhost \
--to=geliang.tang@suse.com \
--cc=mptcp@lists.linux.dev \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox