All of lore.kernel.org
 help / color / mirror / Atom feed
From: Geliang Tang <geliang.tang@suse.com>
To: Paolo Abeni <pabeni@redhat.com>
Cc: mptcp@lists.linux.dev
Subject: Re: [PATCH mptcp-next v8 13/17] selftests/bpf: Add bpf_burst scheduler
Date: Fri, 9 Jun 2023 21:32:36 +0800	[thread overview]
Message-ID: <20230609133236.GA30403@localhost> (raw)
In-Reply-To: <666004bda6ff9e3e6d65c6903c5b18f18f0e31ed.camel@redhat.com>

Hi Paolo,

On Fri, Jun 09, 2023 at 11:57:53AM +0200, Paolo Abeni wrote:
> On Thu, 2023-06-08 at 13:46 +0800, Geliang Tang wrote:
> > This patch implements the burst BPF MPTCP scheduler, named bpf_burst,
> > which is the default scheduler in protocol.c. bpf_burst_get_send() uses
> > the same logic as mptcp_subflow_get_send() and bpf_burst_get_retrans
> > uses the same logic as mptcp_subflow_get_retrans().
> > 
> > Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> > ---
> >  tools/testing/selftests/bpf/bpf_tcp_helpers.h |   5 +
> >  .../selftests/bpf/progs/mptcp_bpf_burst.c     | 208 ++++++++++++++++++
> >  2 files changed, 213 insertions(+)
> >  create mode 100644 tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> > 
> > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > index 44c356772798..927a00f663f2 100644
> > --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > @@ -36,6 +36,7 @@ enum sk_pacing {
> >  struct sock {
> >  	struct sock_common	__sk_common;
> >  #define sk_state		__sk_common.skc_state
> > +	int			sk_wmem_queued;
> >  	unsigned long		sk_pacing_rate;
> >  	__u32			sk_pacing_status; /* see enum sk_pacing */
> >  } __attribute__((preserve_access_index));
> > @@ -234,12 +235,15 @@ extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
> >  #define MPTCP_SUBFLOWS_MAX	8
> >  
> >  struct mptcp_subflow_context {
> > +	unsigned long avg_pacing_rate;
> >  	__u32	backup : 1;
> > +	__u8	stale_count;
> >  	struct	sock *tcp_sock;	    /* tcp sk backpointer */
> >  } __attribute__((preserve_access_index));
> >  
> >  struct mptcp_sched_data {
> >  	struct sock	*last_snd;
> > +	int		snd_burst;
> >  	bool		reinject;
> >  	struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
> >  } __attribute__((preserve_access_index));
> > @@ -260,6 +264,7 @@ struct mptcp_sched_ops {
> >  struct mptcp_sock {
> >  	struct inet_connection_sock	sk;
> >  
> > +	__u64		snd_nxt;
> >  	__u32		token;
> >  	struct sock	*first;
> >  	char		ca_name[TCP_CA_NAME_MAX];
> > diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> > new file mode 100644
> > index 000000000000..b860fd517787
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
> > @@ -0,0 +1,208 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2023, SUSE. */
> > +
> > +#include <linux/bpf.h>
> > +#include <limits.h>
> > +#include "bpf_tcp_helpers.h"
> > +
> > +char _license[] SEC("license") = "GPL";
> > +
> > +#define MPTCP_SEND_BURST_SIZE	65428
> > +
> > +struct subflow_send_info {
> > +	struct sock *ssk;
> > +	__u64 linger_time;
> > +};
> > +
> > +static struct mptcp_subflow_context *
> > +bpf_mptcp_subflow_ctx(struct sock *ssk, struct mptcp_sched_data *data)
> > +{
> > +	int i, nr = 0;
> > +
> > +	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
> > +		if (!ssk || !data->contexts[i])
> > +			break;
> > +
> > +		if (mptcp_subflow_tcp_sock(data->contexts[i]) == ssk) {
> > +			nr = i;
> > +			break;
> > +		}
> > +	}
> > +
> > +	return data->contexts[nr];
> > +}
> > +
> > +static inline __u64 div_u64_rem(__u64 dividend, __u32 divisor, __u32 *remainder)
> > +{
> > +	*remainder = dividend % divisor;
> > +	return dividend / divisor;
> > +}
> > +
> > +static inline __u64 div_u64(__u64 dividend, __u32 divisor)
> > +{
> > +	__u32 remainder;
> > +
> > +	return div_u64_rem(dividend, divisor, &remainder);
> > +}
> > +
> > +extern bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) __ksym;
> > +extern void mptcp_set_timeout(struct sock *sk) __ksym;
> > +extern __u64 mptcp_wnd_end(const struct mptcp_sock *msk) __ksym;
> > +extern bool bpf_sk_stream_memory_free(const struct sock *sk) __ksym;
> > +extern bool bpf_tcp_rtx_and_write_queues_empty(const struct sock *sk) __ksym;
> > +extern void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) __ksym;
> > +
> > +#define SSK_MODE_ACTIVE	0
> > +#define SSK_MODE_BACKUP	1
> > +#define SSK_MODE_MAX	2
> > +
> > +SEC("struct_ops/mptcp_sched_burst_init")
> > +void BPF_PROG(mptcp_sched_burst_init, const struct mptcp_sock *msk)
> > +{
> > +}
> > +
> > +SEC("struct_ops/mptcp_sched_burst_release")
> > +void BPF_PROG(mptcp_sched_burst_release, const struct mptcp_sock *msk)
> > +{
> > +}
> > +
> > +void BPF_STRUCT_OPS(bpf_burst_data_init, const struct mptcp_sock *msk,
> > +		    struct mptcp_sched_data *data)
> > +{
> > +	mptcp_sched_data_set_contexts(msk, data);
> > +}
> > +
> > +static int bpf_burst_get_send(const struct mptcp_sock *msk,
> > +			      struct mptcp_sched_data *data)
> > +{
> > +	struct subflow_send_info send_info[SSK_MODE_MAX];
> > +	struct mptcp_subflow_context *subflow;
> > +	struct sock *sk = (struct sock *)msk;
> > +	__u32 pace, burst, wmem;
> > +	int i, nr_active = 0;
> > +	__u64 linger_time;
> > +	struct sock *ssk;
> > +
> > +	/* pick the subflow with the lower wmem/wspace ratio */
> > +	for (i = 0; i < SSK_MODE_MAX; ++i) {
> > +		send_info[i].ssk = NULL;
> > +		send_info[i].linger_time = -1;
> > +	}
> > +
> > +	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
> > +		if (!data->contexts[i])
> > +			break;
> > +
> > +		subflow = data->contexts[i];
> > +		ssk = mptcp_subflow_tcp_sock(subflow);
> > +		if (!mptcp_subflow_active(subflow))
> > +			continue;
> > +
> > +		nr_active += !subflow->backup;
> > +		pace = subflow->avg_pacing_rate;
> > +		if (!pace) {
> > +			/* init pacing rate from socket */
> > +			subflow->avg_pacing_rate = ssk->sk_pacing_rate;
> > +			pace = subflow->avg_pacing_rate;
> > +			if (!pace)
> > +				continue;
> > +		}
> > +
> > +		linger_time = div_u64((__u64)ssk->sk_wmem_queued << 32, pace);
> > +		if (linger_time < send_info[subflow->backup].linger_time) {
> > +			send_info[subflow->backup].ssk = ssk;
> > +			send_info[subflow->backup].linger_time = linger_time;
> > +		}
> > +	}
> > +	mptcp_set_timeout(sk);
> > +
> > +	/* pick the best backup if no other subflow is active */
> > +	if (!nr_active)
> > +		send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
> > +
> > +	/* Get the ssk directly like this "ssk = send_info[SSK_MODE_ACTIVE].ssk;"
> > +	 * will get an error:
> > +	 *   arg#0 pointer type STRUCT sock must point to scalar, or struct with scalar
> > +	 * So here use bpf_mptcp_subflow_ctx() to get the subflow,

This comment needs to be updated:

        /* Pass "send_info[SSK_MODE_ACTIVE].ssk" directly to bpf_sk_stream_memory_free()
         * will get an error:
         *   arg#0 pointer type STRUCT sock must point to scalar, or struct with scalar
         * So here pass it to bpf_mptcp_subflow_ctx() to get the subflow,
         * then use mptcp_subflow_tcp_sock() to get the ssk,
         * and pass the ssk to bpf_sk_stream_memory_free().
         */

> 
> May I guess you get a similar error if you do:
> 
> 	subflow = mptcp_subflow_ctx(ssk)
> 
> ? (just out of sheer ignorance and curiosity)

Yes. It seems that accessing 'send_info[SSK_MODE_ACTIVE].ssk' is
considered unsafe in BPF context. So here we pass 'send_info[SSK_MODE_ACTIVE].ssk'
into bpf_mptcp_subflow_ctx() to find the related subflow. Then we access this
subflow instead of 'send_info[SSK_MODE_ACTIVE].ssk' below. This can make BPF happy.

> 
> > +	 * then use mptcp_subflow_tcp_sock() to get the ssk.
> > +	 */
> > +	subflow = mptcp_subflow_tcp_sock(send_info[SSK_MODE_ACTIVE].ssk, data);
> > +	ssk = mptcp_subflow_tcp_sock(subflow);
> 
> What if you store the 'subflow' pointer in 'send_info'? Will the
> verifier splat with that? and what if we store the corresponding
> context index 'i' instead?

Storing the 'subflow' or index 'i' don't work too.

> that we avoid the 'mptcp_subflow_tcp_sock()'
> call entirely.

No need to avoid "mptcp_subflow_tcp_sock", we need to access both
'subflow' and 'ssk' below.

Thanks,
-Geliang

> 
> Please, don't send directly a new version, I think it would be better
> clarify this point before updating the code
> 
> thanks!
> 
> Paolo
> 

  reply	other threads:[~2023-06-09 13:32 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-08  5:46 [PATCH mptcp-next v8 00/17] BPF packet scheduler updates Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 01/17] Squash to "mptcp: drop last_snd and MPTCP_RESET_SCHEDULER" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 02/17] Squash to "mptcp: add struct mptcp_sched_ops" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 03/17] Squash to "mptcp: add scheduler wrappers" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 04/17] mptcp: add last_snd in sched_data Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 05/17] mptcp: add snd_burst " Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 06/17] mptcp: register default scheduler Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 07/17] Squash to "bpf: Add bpf_mptcp_sched_ops" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 08/17] Squash to "selftests/bpf: Add mptcp sched structs" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 09/17] Squash to "selftests/bpf: Add bpf_rr scheduler" Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 10/17] mptcp: add two wrappers needed by bpf_burst Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 11/17] bpf: Add bpf_burst write accesses Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 12/17] bpf: Export more bpf_burst related functions Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 13/17] selftests/bpf: Add bpf_burst scheduler Geliang Tang
2023-06-09  9:57   ` Paolo Abeni
2023-06-09 13:32     ` Geliang Tang [this message]
2023-06-09 14:40       ` Paolo Abeni
2023-06-10  1:45         ` Geliang Tang
2023-06-12 11:05           ` Paolo Abeni
2023-06-12 13:29             ` Geliang Tang
2023-06-12 14:22               ` Paolo Abeni
2023-06-13  5:36                 ` Geliang Tang
2023-06-13  9:35                   ` Paolo Abeni
2023-06-13 12:32                 ` Geliang Tang
2023-06-13 13:36                   ` Paolo Abeni
2023-06-13 14:03                     ` Geliang Tang
2023-06-13 14:18                       ` Paolo Abeni
2023-06-08  5:46 ` [PATCH mptcp-next v8 14/17] selftests/bpf: Add bpf_burst test Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 15/17] bpf: Add subflow bit flags write accesses Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 16/17] selftests/bpf: Add bpf_stale scheduler Geliang Tang
2023-06-08  5:46 ` [PATCH mptcp-next v8 17/17] selftests/bpf: Add bpf_stale test Geliang Tang
2023-06-08  6:29   ` selftests/bpf: Add bpf_stale test: Build Failure MPTCP CI
2023-06-08  6:56   ` selftests/bpf: Add bpf_stale test: Tests Results MPTCP CI
2023-06-13 10:08   ` selftests/bpf: Add bpf_stale test: Build Failure MPTCP CI
2023-06-13 10:57   ` selftests/bpf: Add bpf_stale test: Tests Results MPTCP CI

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230609133236.GA30403@localhost \
    --to=geliang.tang@suse.com \
    --cc=mptcp@lists.linux.dev \
    --cc=pabeni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.