MPTCP Linux Development
 help / color / mirror / Atom feed
From: Geliang Tang <geliang.tang@suse.com>
To: Paolo Abeni <pabeni@redhat.com>
Cc: mptcp@lists.linux.dev
Subject: Re: [PATCH mptcp-next v8 13/17] selftests/bpf: Add bpf_burst scheduler
Date: Fri, 9 Jun 2023 21:32:36 +0800	[thread overview]
Message-ID: <20230609133236.GA30403@localhost> (raw)
In-Reply-To: <666004bda6ff9e3e6d65c6903c5b18f18f0e31ed.camel@redhat.com>

Hi Paolo,

On Fri, Jun 09, 2023 at 11:57:53AM +0200, Paolo Abeni wrote:
> On Thu, 2023-06-08 at 13:46 +0800, Geliang Tang wrote:
> > This patch implements the burst BPF MPTCP scheduler, named bpf_burst,
> > which is the default scheduler in protocol.c. bpf_burst_get_send() uses
> > the same logic as mptcp_subflow_get_send() and bpf_burst_get_retrans
> > uses the same logic as mptcp_subflow_get_retrans().
> > 
> > Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> > ---
> >  tools/testing/selftests/bpf/bpf_tcp_helpers.h |   5 +
> >  .../selftests/bpf/progs/mptcp_bpf_burst.c     | 208 ++++++++++++++++++
> >  2 files changed, 213 insertions(+)
> >  create mode 100644 tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> > 
> > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > index 44c356772798..927a00f663f2 100644
> > --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > @@ -36,6 +36,7 @@ enum sk_pacing {
> >  struct sock {
> >  	struct sock_common	__sk_common;
> >  #define sk_state		__sk_common.skc_state
> > +	int			sk_wmem_queued;
> >  	unsigned long		sk_pacing_rate;
> >  	__u32			sk_pacing_status; /* see enum sk_pacing */
> >  } __attribute__((preserve_access_index));
> > @@ -234,12 +235,15 @@ extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
> >  #define MPTCP_SUBFLOWS_MAX	8
> >  
> >  struct mptcp_subflow_context {
> > +	unsigned long avg_pacing_rate;
> >  	__u32	backup : 1;
> > +	__u8	stale_count;
> >  	struct	sock *tcp_sock;	    /* tcp sk backpointer */
> >  } __attribute__((preserve_access_index));
> >  
> >  struct mptcp_sched_data {
> >  	struct sock	*last_snd;
> > +	int		snd_burst;
> >  	bool		reinject;
> >  	struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
> >  } __attribute__((preserve_access_index));
> > @@ -260,6 +264,7 @@ struct mptcp_sched_ops {
> >  struct mptcp_sock {
> >  	struct inet_connection_sock	sk;
> >  
> > +	__u64		snd_nxt;
> >  	__u32		token;
> >  	struct sock	*first;
> >  	char		ca_name[TCP_CA_NAME_MAX];
> > diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> > new file mode 100644
> > index 000000000000..b860fd517787
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> > @@ -0,0 +1,208 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2023, SUSE. */
> > +
> > +#include <linux/bpf.h>
> > +#include <limits.h>
> > +#include "bpf_tcp_helpers.h"
> > +
> > +char _license[] SEC("license") = "GPL";
> > +
> > +#define MPTCP_SEND_BURST_SIZE	65428
> > +
> > +struct subflow_send_info {
> > +	struct sock *ssk;
> > +	__u64 linger_time;
> > +};
> > +
> > +static struct mptcp_subflow_context *
> > +bpf_mptcp_subflow_ctx(struct sock *ssk, struct mptcp_sched_data *data)
> > +{
> > +	int i, nr = 0;
> > +
> > +	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
> > +		if (!ssk || !data->contexts[i])
> > +			break;
> > +
> > +		if (mptcp_subflow_tcp_sock(data->contexts[i]) == ssk) {
> > +			nr = i;
> > +			break;
> > +		}
> > +	}
> > +
> > +	return data->contexts[nr];
> > +}
> > +
> > +static inline __u64 div_u64_rem(__u64 dividend, __u32 divisor, __u32 *remainder)
> > +{
> > +	*remainder = dividend % divisor;
> > +	return dividend / divisor;
> > +}
> > +
> > +static inline __u64 div_u64(__u64 dividend, __u32 divisor)
> > +{
> > +	__u32 remainder;
> > +
> > +	return div_u64_rem(dividend, divisor, &remainder);
> > +}
> > +
> > +extern bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) __ksym;
> > +extern void mptcp_set_timeout(struct sock *sk) __ksym;
> > +extern __u64 mptcp_wnd_end(const struct mptcp_sock *msk) __ksym;
> > +extern bool bpf_sk_stream_memory_free(const struct sock *sk) __ksym;
> > +extern bool bpf_tcp_rtx_and_write_queues_empty(const struct sock *sk) __ksym;
> > +extern void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) __ksym;
> > +
> > +#define SSK_MODE_ACTIVE	0
> > +#define SSK_MODE_BACKUP	1
> > +#define SSK_MODE_MAX	2
> > +
> > +SEC("struct_ops/mptcp_sched_burst_init")
> > +void BPF_PROG(mptcp_sched_burst_init, const struct mptcp_sock *msk)
> > +{
> > +}
> > +
> > +SEC("struct_ops/mptcp_sched_burst_release")
> > +void BPF_PROG(mptcp_sched_burst_release, const struct mptcp_sock *msk)
> > +{
> > +}
> > +
> > +void BPF_STRUCT_OPS(bpf_burst_data_init, const struct mptcp_sock *msk,
> > +		    struct mptcp_sched_data *data)
> > +{
> > +	mptcp_sched_data_set_contexts(msk, data);
> > +}
> > +
> > +static int bpf_burst_get_send(const struct mptcp_sock *msk,
> > +			      struct mptcp_sched_data *data)
> > +{
> > +	struct subflow_send_info send_info[SSK_MODE_MAX];
> > +	struct mptcp_subflow_context *subflow;
> > +	struct sock *sk = (struct sock *)msk;
> > +	__u32 pace, burst, wmem;
> > +	int i, nr_active = 0;
> > +	__u64 linger_time;
> > +	struct sock *ssk;
> > +
> > +	/* pick the subflow with the lower wmem/wspace ratio */
> > +	for (i = 0; i < SSK_MODE_MAX; ++i) {
> > +		send_info[i].ssk = NULL;
> > +		send_info[i].linger_time = -1;
> > +	}
> > +
> > +	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
> > +		if (!data->contexts[i])
> > +			break;
> > +
> > +		subflow = data->contexts[i];
> > +		ssk = mptcp_subflow_tcp_sock(subflow);
> > +		if (!mptcp_subflow_active(subflow))
> > +			continue;
> > +
> > +		nr_active += !subflow->backup;
> > +		pace = subflow->avg_pacing_rate;
> > +		if (!pace) {
> > +			/* init pacing rate from socket */
> > +			subflow->avg_pacing_rate = ssk->sk_pacing_rate;
> > +			pace = subflow->avg_pacing_rate;
> > +			if (!pace)
> > +				continue;
> > +		}
> > +
> > +		linger_time = div_u64((__u64)ssk->sk_wmem_queued << 32, pace);
> > +		if (linger_time < send_info[subflow->backup].linger_time) {
> > +			send_info[subflow->backup].ssk = ssk;
> > +			send_info[subflow->backup].linger_time = linger_time;
> > +		}
> > +	}
> > +	mptcp_set_timeout(sk);
> > +
> > +	/* pick the best backup if no other subflow is active */
> > +	if (!nr_active)
> > +		send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
> > +
> > +	/* Get the ssk directly like this "ssk = send_info[SSK_MODE_ACTIVE].ssk;"
> > +	 * will get an error:
> > +	 *   arg#0 pointer type STRUCT sock must point to scalar, or struct with scalar
> > +	 * So here use bpf_mptcp_subflow_ctx() to get the subflow,

This comment needs to be updated:

        /* Pass "send_info[SSK_MODE_ACTIVE].ssk" directly to bpf_sk_stream_memory_free()
         * will get an error:
         *   arg#0 pointer type STRUCT sock must point to scalar, or struct with scalar
         * So here pass it to bpf_mptcp_subflow_ctx() to get the subflow,
         * then use mptcp_subflow_tcp_sock() to get the ssk,
         * and pass the ssk to bpf_sk_stream_memory_free().
         */

> 
> May I guess you get a similar error if you do:
> 
> 	subflow = mptcp_subflow_ctx(ssk)
> 
> ? (just out of sheer ignorance and curiosity)

Yes. It seems that accessing 'send_info[SSK_MODE_ACTIVE].ssk' is
considered unsafe in BPF context. So here we pass 'send_info[SSK_MODE_ACTIVE].ssk'
into bpf_mptcp_subflow_ctx() to find the related subflow. Then we access this
subflow instead of 'send_info[SSK_MODE_ACTIVE].ssk' below. This can make BPF happy.

> 
> > +	 * then use mptcp_subflow_tcp_sock() to get the ssk.
> > +	 */
> > +	subflow = mptcp_subflow_tcp_sock(send_info[SSK_MODE_ACTIVE].ssk, data);
> > +	ssk = mptcp_subflow_tcp_sock(subflow);
> 
> What if you store the 'subflow' pointer in 'send_info'? Will the
> verifier splat with that? and what if we store the corresponding
> context index 'i' instead?

Storing the 'subflow' or index 'i' don't work too.

> that we avoid the 'mptcp_subflow_tcp_sock()'
> call entirely.

No need to avoid "mptcp_subflow_tcp_sock", we need to access both
'subflow' and 'ssk' below.

Thanks,
-Geliang

> 
> Please, don't send directly a new version, I think it would be better
> clarify this point before updating the code
> 
> thanks!
> 
> Paolo
> 

  reply	other threads:[~2023-06-09 13:32 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-08  5:46 [PATCH mptcp-next v8 00/17] BPF packet scheduler updates Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 01/17] Squash to "mptcp: drop last_snd and MPTCP_RESET_SCHEDULER" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 02/17] Squash to "mptcp: add struct mptcp_sched_ops" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 03/17] Squash to "mptcp: add scheduler wrappers" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 04/17] mptcp: add last_snd in sched_data Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 05/17] mptcp: add snd_burst " Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 06/17] mptcp: register default scheduler Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 07/17] Squash to "bpf: Add bpf_mptcp_sched_ops" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 08/17] Squash to "selftests/bpf: Add mptcp sched structs" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 09/17] Squash to "selftests/bpf: Add bpf_rr scheduler" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 10/17] mptcp: add two wrappers needed by bpf_burst Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 11/17] bpf: Add bpf_burst write accesses Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 12/17] bpf: Export more bpf_burst related functions Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 13/17] selftests/bpf: Add bpf_burst scheduler Geliang Tang
2023-06-09  9:57   ` Paolo Abeni
2023-06-09 13:32     ` Geliang Tang [this message]
2023-06-09 14:40       ` Paolo Abeni
2023-06-10  1:45         ` Geliang Tang
2023-06-12 11:05           ` Paolo Abeni
2023-06-12 13:29             ` Geliang Tang
2023-06-12 14:22               ` Paolo Abeni
2023-06-13  5:36                 ` Geliang Tang
2023-06-13  9:35                   ` Paolo Abeni
2023-06-13 12:32                 ` Geliang Tang
2023-06-13 13:36                   ` Paolo Abeni
2023-06-13 14:03                     ` Geliang Tang
2023-06-13 14:18                       ` Paolo Abeni
2023-06-08  5:46 ` [PATCH mptcp-next v8 14/17] selftests/bpf: Add bpf_burst test Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 15/17] bpf: Add subflow bit flags write accesses Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 16/17] selftests/bpf: Add bpf_stale scheduler Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 17/17] selftests/bpf: Add bpf_stale test Geliang Tang
2023-06-08  6:29   ` selftests/bpf: Add bpf_stale test: Build Failure MPTCP CI
2023-06-08  6:56   ` selftests/bpf: Add bpf_stale test: Tests Results MPTCP CI
2023-06-13 10:08   ` selftests/bpf: Add bpf_stale test: Build Failure MPTCP CI
2023-06-13 10:57   ` selftests/bpf: Add bpf_stale test: Tests Results MPTCP CI

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230609133236.GA30403@localhost \
    --to=geliang.tang@suse.com \
    --cc=mptcp@lists.linux.dev \
    --cc=pabeni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox