* [PATCH 1/2] [PATCH net-next,1/2] Add new args for cong_control in tcp_congestion_ops @ 2024-04-24 20:37 Miao Xu 2024-04-24 20:37 ` [PATCH 2/2] [PATCH net-next,2/2] Add test for the use of new args in cong_control Miao Xu 2024-04-27 13:06 ` [PATCH 1/2] [PATCH net-next,1/2] Add new args for cong_control in tcp_congestion_ops Eric Dumazet 0 siblings, 2 replies; 4+ messages in thread From: Miao Xu @ 2024-04-24 20:37 UTC (permalink / raw) To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni, David Ahern, Martin Lau Cc: netdev, bpf, Miao Xu This patch adds two new arguments for cong_control of struct tcp_congestion_ops: - ack - flag These two arguments are inherited from the caller tcp_cong_control in tcp_input.c. One use case of them is to update cwnd and pacing rate inside cong_control based on the info they provide. For example, the flag can be used to decide if it is the right time to raise or reduce a sender's cwnd. Another change in this patch is to allow the write of tp->snd_cwnd_stamp for a bpf tcp ca program. A use case of writing this field is to keep track of the time whenever tp->snd_cwnd is raised or reduced inside the cong_control callback. Signed-off-by: Miao Xu <miaxu@meta.com> --- include/net/tcp.h | 2 +- net/ipv4/bpf_tcp_ca.c | 6 +++++- net/ipv4/tcp_bbr.c | 2 +- net/ipv4/tcp_input.c | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b935e1ae4caf..b37b8219060a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1167,7 +1167,7 @@ struct tcp_congestion_ops { /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. 
(optional) */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 7f518ea5f4ac..18227757ec0c 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -107,6 +107,9 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, case offsetof(struct tcp_sock, snd_cwnd_cnt): end = offsetofend(struct tcp_sock, snd_cwnd_cnt); break; + case offsetof(struct tcp_sock, snd_cwnd_stamp): + end = offsetofend(struct tcp_sock, snd_cwnd_stamp); + break; case offsetof(struct tcp_sock, snd_ssthresh): end = offsetofend(struct tcp_sock, snd_ssthresh); break; @@ -307,7 +310,8 @@ static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) return 0; } -static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) +static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) { } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 05dc2d05bc7c..c13d263dae06 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1024,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_update_gains(sk); } -__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) +__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { struct bbr *bbr = inet_csk_ca(sk); u32 bw; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 384fa5e2f065..661dca9e3895 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3541,7 +3541,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cong_control) { - icsk->icsk_ca_ops->cong_control(sk, rs); + icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs); return; 
} -- 2.43.0 ^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/2] [PATCH net-next,2/2] Add test for the use of new args in cong_control 2024-04-24 20:37 [PATCH 1/2] [PATCH net-next,1/2] Add new args for cong_control in tcp_congestion_ops Miao Xu @ 2024-04-24 20:37 ` Miao Xu 2024-04-25 19:09 ` Martin KaFai Lau 2024-04-27 13:06 ` [PATCH 1/2] [PATCH net-next,1/2] Add new args for cong_control in tcp_congestion_ops Eric Dumazet 1 sibling, 1 reply; 4+ messages in thread From: Miao Xu @ 2024-04-24 20:37 UTC (permalink / raw) To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni, David Ahern, Martin Lau Cc: netdev, bpf, Miao Xu This patch adds a selftest to show the usage of the new arguments in cong_control. For simplicity's sake, the testing example reuses cubic's kernel functions. Signed-off-by: Miao Xu <miaxu@meta.com> --- tools/testing/selftests/bpf/bpf_tcp_helpers.h | 23 +++ .../bpf/progs/bpf_cubic_cong_control.c | 176 ++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_cubic_cong_control.c diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 82a7c9de95f9..3115bc80280e 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -21,6 +21,15 @@ BPF_PROG(name, args) #endif #define tcp_jiffies32 ((__u32)bpf_jiffies64()) +#define TCP_INFINITE_SSTHRESH 0x7fffffff + +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ +#define FLAG_SND_UNA_ADVANCED \ + 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED) struct sock_common { unsigned char skc_state; @@ -37,6 +46,7 @@ struct sock { struct sock_common __sk_common; #define sk_state __sk_common.skc_state unsigned long sk_pacing_rate; + unsigned long sk_max_pacing_rate; __u32 sk_pacing_status; /* see enum sk_pacing */ } __attribute__((preserve_access_index)); @@ -86,6 +96,19 @@ struct tcp_sock { __u32 prior_cwnd; __u64 tcp_mstamp; /* most recent packet received/sent */ bool is_mptcp; + __u32 snd_cwnd_stamp; + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 high_seq; /* snd_nxt at onset of congestion */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u32 prr_delivered; /* Number of newly delivered packets to + * receiver in Recovery. + */ + __u32 prr_out; /* Total number of pkts sent during Recovery. */ + __u32 reordering; /* Packet reordering metric. */ } __attribute__((preserve_access_index)); static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk) diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic_cong_control.c b/tools/testing/selftests/bpf/progs/bpf_cubic_cong_control.c new file mode 100644 index 000000000000..698964df1f33 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_cubic_cong_control.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* WARNING: This implementation is NOT the same as the tcp_cubic.c. + * The purpose is mainly to show use cases of the new arguments in + * cong_control. 
+ */ + +#include <linux/bpf.h> +#include <linux/stddef.h> +#include <linux/tcp.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; + +extern void cubictcp_init(struct sock *sk) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) + __ksym; + extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; + extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym; + extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) + __ksym; + extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; + + +void BPF_STRUCT_OPS(bpf_cubic_init, struct sock *sk) +{ + cubictcp_init(sk); +} + +void BPF_STRUCT_OPS(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + cubictcp_cwnd_event(sk, event); +} + +#define USEC_PER_SEC 1000000UL +#define TCP_PACING_SS_RATIO (200) +#define TCP_PACING_CA_RATIO (120) +#define TCP_REORDERING (12) +#define likely(x) (__builtin_expect(!!(x), 1)) + +static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) +{ + return dividend / divisor; +} + +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. 
+ */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= TCP_PACING_SS_RATIO; + else + rate *= TCP_PACING_CA_RATIO; + + rate *= max(tp->snd_cwnd, tp->packets_out); + + if (likely(tp->srtt_us)) + rate = div64_u64(rate, (__u64)tp->srtt_us); + + sk->sk_pacing_rate = min(rate, (__u64)sk->sk_max_pacing_rate); +} + +static __always_inline void tcp_cwnd_reduction( + struct sock *sk, + int newly_acked_sacked, + int newly_lost, + int flag) { + struct tcp_sock *tp = tcp_sk(sk); + int sndcnt = 0; + __u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out; + int delta = tp->snd_ssthresh - pkts_in_flight; + + if (newly_acked_sacked <= 0 || !tp->prior_cwnd) + return; + + __u32 prr_delivered = tp->prr_delivered + newly_acked_sacked; + + if (delta < 0) { + __u64 dividend = + (__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1; + sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out; + } else { + sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked); + if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) + sndcnt++; + sndcnt = min(delta, sndcnt); + } + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); + tp->snd_cwnd = pkts_in_flight + sndcnt; +} + +/* Decide wheather to run the increase function of congestion control. 
*/ +static __always_inline bool tcp_may_raise_cwnd( + const struct sock *sk, + const int flag) { + if (tcp_sk(sk)->reordering > TCP_REORDERING) + return flag & FLAG_FORWARD_PROGRESS; + + return flag & FLAG_DATA_ACKED; +} + +void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) & + (1 << inet_csk(sk)->icsk_ca_state)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag); + + if (!before(tp->snd_una, tp->high_seq)) { + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_jiffies32; + } + // __cwnd_event(sk, CA_EVENT_COMPLETE_CWR); + } + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + cubictcp_cong_avoid(sk, ack, rs->acked_sacked); + tp->snd_cwnd_stamp = tcp_jiffies32; + } + + tcp_update_pacing_rate(sk); +} + +__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +{ + return cubictcp_recalc_ssthresh(sk); +} + +void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +{ + cubictcp_state(sk, new_state); +} + +void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, + const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +{ + return tcp_reno_undo_cwnd(sk); +} + + +SEC(".struct_ops") +struct tcp_congestion_ops cubic = { + .init = (void *)bpf_cubic_init, + .ssthresh = (void *)bpf_cubic_recalc_ssthresh, + .cong_control = (void *)bpf_cubic_cong_control, + .set_state = (void *)bpf_cubic_state, + .undo_cwnd = (void *)bpf_cubic_undo_cwnd, + .cwnd_event = (void *)bpf_cubic_cwnd_event, + .pkts_acked = (void *)bpf_cubic_acked, + .name = "bpf_cubic", +}; -- 2.43.0 ^ permalink raw reply 
related [flat|nested] 4+ messages in thread
* Re: [PATCH 2/2] [PATCH net-next,2/2] Add test for the use of new args in cong_control 2024-04-24 20:37 ` [PATCH 2/2] [PATCH net-next,2/2] Add test for the use of new args in cong_control Miao Xu @ 2024-04-25 19:09 ` Martin KaFai Lau 0 siblings, 0 replies; 4+ messages in thread From: Martin KaFai Lau @ 2024-04-25 19:09 UTC (permalink / raw) To: Miao Xu Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni, David Ahern, Martin Lau, netdev, bpf On 4/24/24 1:37 PM, Miao Xu wrote: > +#define USEC_PER_SEC 1000000UL > +#define TCP_PACING_SS_RATIO (200) > +#define TCP_PACING_CA_RATIO (120) > +#define TCP_REORDERING (12) > +#define likely(x) (__builtin_expect(!!(x), 1)) > + > +static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) > +{ > + return dividend / divisor; > +} > + > +static void tcp_update_pacing_rate(struct sock *sk) > +{ > + const struct tcp_sock *tp = tcp_sk(sk); > + __u64 rate; > + > + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ > + rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); > + > + /* current rate is (cwnd * mss) / srtt > + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. > + * In Congestion Avoidance phase, set it to 120 % the current rate. > + * > + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) > + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching > + * end of slow start and should slow down. 
> + */ > + if (tp->snd_cwnd < tp->snd_ssthresh / 2) > + rate *= TCP_PACING_SS_RATIO; > + else > + rate *= TCP_PACING_CA_RATIO; > + > + rate *= max(tp->snd_cwnd, tp->packets_out); > + > + if (likely(tp->srtt_us)) > + rate = div64_u64(rate, (__u64)tp->srtt_us); > + > + sk->sk_pacing_rate = min(rate, (__u64)sk->sk_max_pacing_rate); > +} > + > +static __always_inline void tcp_cwnd_reduction( > + struct sock *sk, > + int newly_acked_sacked, > + int newly_lost, > + int flag) { > + struct tcp_sock *tp = tcp_sk(sk); > + int sndcnt = 0; > + __u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out; > + int delta = tp->snd_ssthresh - pkts_in_flight; > + > + if (newly_acked_sacked <= 0 || !tp->prior_cwnd) > + return; > + > + __u32 prr_delivered = tp->prr_delivered + newly_acked_sacked; > + > + if (delta < 0) { > + __u64 dividend = > + (__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1; > + sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out; > + } else { > + sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked); > + if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) > + sndcnt++; > + sndcnt = min(delta, sndcnt); > + } > + /* Force a fast retransmit upon entering fast recovery */ > + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); > + tp->snd_cwnd = pkts_in_flight + sndcnt; > +} > + > +/* Decide wheather to run the increase function of congestion control. 
*/ > +static __always_inline bool tcp_may_raise_cwnd( > + const struct sock *sk, > + const int flag) { > + if (tcp_sk(sk)->reordering > TCP_REORDERING) > + return flag & FLAG_FORWARD_PROGRESS; > + > + return flag & FLAG_DATA_ACKED; > +} > + > +void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, > + const struct rate_sample *rs) > +{ > + struct tcp_sock *tp = tcp_sk(sk); > + > + if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) & > + (1 << inet_csk(sk)->icsk_ca_state)) { > + /* Reduce cwnd if state mandates */ > + tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag); > + > + if (!before(tp->snd_una, tp->high_seq)) { > + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ > + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && > + inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { > + tp->snd_cwnd = tp->snd_ssthresh; > + tp->snd_cwnd_stamp = tcp_jiffies32; > + } > + // __cwnd_event(sk, CA_EVENT_COMPLETE_CWR); > + } > + } else if (tcp_may_raise_cwnd(sk, flag)) { > + /* Advance cwnd if state allows */ > + cubictcp_cong_avoid(sk, ack, rs->acked_sacked); > + tp->snd_cwnd_stamp = tcp_jiffies32; > + } > + > + tcp_update_pacing_rate(sk); It will be useful to highlight what you want to do differently from the kernel's tcp_cong_control()+tcp_cong_avoid() here. Or is it something that I missed from the above example? ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 1/2] [PATCH net-next,1/2] Add new args for cong_control in tcp_congestion_ops 2024-04-24 20:37 [PATCH 1/2] [PATCH net-next,1/2] Add new args for cong_control in tcp_congestion_ops Miao Xu 2024-04-24 20:37 ` [PATCH 2/2] [PATCH net-next,2/2] Add test for the use of new args in cong_control Miao Xu @ 2024-04-27 13:06 ` Eric Dumazet 1 sibling, 0 replies; 4+ messages in thread From: Eric Dumazet @ 2024-04-27 13:06 UTC (permalink / raw) To: Miao Xu Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, David Ahern, Martin Lau, netdev, bpf On Wed, Apr 24, 2024 at 10:38 PM Miao Xu <miaxu@meta.com> wrote: > > This patch adds two new arguments for cong_control of struct > tcp_congestion_ops: > - ack > - flag > These two arguments are inherited from the caller tcp_cong_control in > tcp_input.c. One use case of them is to update cwnd and pacing rate > inside cong_control based on the info they provide. For example, the > flag can be used to decide if it is the right time to raise or reduce a > sender's cwnd. > > Another change in this patch is to allow the write of tp->snd_cwnd_stamp > for a bpf tcp ca program. A use case of writing this field is to keep > track of the time whenever tp->snd_cwnd is raised or reduced inside the > cong_control callback. > > Signed-off-by: Miao Xu <miaxu@meta.com> Reviewed-by: Eric Dumazet <edumazet@google.com> ^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2024-04-27 13:06 UTC | newest] Thread overview: 4+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2024-04-24 20:37 [PATCH 1/2] [PATCH net-next,1/2] Add new args for cong_control in tcp_congestion_ops Miao Xu 2024-04-24 20:37 ` [PATCH 2/2] [PATCH net-next,2/2] Add test for the use of new args in cong_control Miao Xu 2024-04-25 19:09 ` Martin KaFai Lau 2024-04-27 13:06 ` [PATCH 1/2] [PATCH net-next,1/2] Add new args for cong_control in tcp_congestion_ops Eric Dumazet
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox.