From: Kui-Feng Lee <sinquersw@gmail.com>
To: Daniel Borkmann <daniel@iogearbox.net>,
Kui-Feng Lee <kuifeng@meta.com>,
bpf@vger.kernel.org, ast@kernel.org, martin.lau@linux.dev,
song@kernel.org, kernel-team@meta.com, andrii@kernel.org,
sdf@google.com
Cc: netdev@vger.kernel.org, Eric Dumazet <edumazet@google.com>
Subject: Re: [PATCH bpf-next v7 2/8] net: Update an existing TCP congestion control algorithm.
Date: Fri, 17 Mar 2023 16:07:22 -0700 [thread overview]
Message-ID: <4271f277-ec08-ea15-4c7b-8c5147db6308@gmail.com> (raw)
In-Reply-To: <f72b77c3-15ac-3de3-5bce-c263564c1487@iogearbox.net>
On 3/17/23 08:23, Daniel Borkmann wrote:
> On 3/16/23 3:36 AM, Kui-Feng Lee wrote:
>> This feature lets you immediately transition to another congestion
>> control algorithm or implementation with the same name. Once a name
>> is updated, new connections will apply this new algorithm.
>>
>> The purpose is to update a customized algorithm implemented in BPF
>> struct_ops with a new version on the flight. The following is an
>> example of using the userspace API implemented in later BPF patches.
>>
>> link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
>> .......
>> err = bpf_link__update_map(link, skel->maps.ca_update_2);
>>
>> We first load and register an algorithm implemented in BPF struct_ops,
>> then swap it out with a new one using the same name. After that, newly
>> created connections will apply the updated algorithm, while older ones
>> retain the previous version already applied.
>>
>> This patch also takes this chance to refactor the ca validation into
>> the new tcp_validate_congestion_control() function.
>>
>> Cc: netdev@vger.kernel.org, Eric Dumazet <edumazet@google.com>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> include/net/tcp.h | 3 +++
>> net/ipv4/tcp_cong.c | 60 +++++++++++++++++++++++++++++++++++++++------
>> 2 files changed, 56 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index db9f828e9d1e..2abb755e6a3a 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -1117,6 +1117,9 @@ struct tcp_congestion_ops {
>> int tcp_register_congestion_control(struct tcp_congestion_ops *type);
>> void tcp_unregister_congestion_control(struct tcp_congestion_ops
>> *type);
>> +int tcp_update_congestion_control(struct tcp_congestion_ops *type,
>> + struct tcp_congestion_ops *old_type);
>> +int tcp_validate_congestion_control(struct tcp_congestion_ops *ca);
>> void tcp_assign_congestion_control(struct sock *sk);
>> void tcp_init_congestion_control(struct sock *sk);
>> diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
>> index db8b4b488c31..c90791ae8389 100644
>> --- a/net/ipv4/tcp_cong.c
>> +++ b/net/ipv4/tcp_cong.c
>> @@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
>> return NULL;
>> }
>> -/*
>> - * Attach new congestion control algorithm to the list
>> - * of available options.
>> - */
>> -int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
>> +int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
>> {
>> - int ret = 0;
>> -
>> /* all algorithms must implement these */
>> if (!ca->ssthresh || !ca->undo_cwnd ||
>> !(ca->cong_avoid || ca->cong_control)) {
>> @@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct
>> tcp_congestion_ops *ca)
>> return -EINVAL;
>> }
>> + return 0;
>> +}
>> +
>> +/* Attach new congestion control algorithm to the list
>> + * of available options.
>> + */
>> +int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
>> +{
>> + int ret;
>> +
>> + ret = tcp_validate_congestion_control(ca);
>> + if (ret)
>> + return ret;
>> +
>> ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
>> spin_lock(&tcp_cong_list_lock);
>> @@ -130,6 +138,44 @@ void tcp_unregister_congestion_control(struct
>> tcp_congestion_ops *ca)
>> }
>> EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
>> +/* Replace a registered old ca with a new one.
>> + *
>> + * The new ca must have the same name as the old one, that has been
>> + * registered.
>> + */
>> +int tcp_update_congestion_control(struct tcp_congestion_ops *ca,
>> struct tcp_congestion_ops *old_ca)
>> +{
>> + struct tcp_congestion_ops *existing;
>> + int ret;
>> +
>> + ret = tcp_validate_congestion_control(ca);
>> + if (ret)
>> + return ret;
>> +
>> + ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
>> +
>> + spin_lock(&tcp_cong_list_lock);
>> + existing = tcp_ca_find_key(old_ca->key);
>> + if (ca->key == TCP_CA_UNSPEC || !existing ||
>> strcmp(existing->name, ca->name)) {
>> + pr_notice("%s not registered or non-unique key\n",
>> + ca->name);
>> + ret = -EINVAL;
>> + } else if (existing != old_ca) {
>> + pr_notice("invalid old congestion control algorithm to
>> replace\n");
>> + ret = -EINVAL;
>> + } else {
>> + /* Add the new one before removing the old one to keep
>> + * one implementation available all the time.
>> + */
>> + list_add_tail_rcu(&ca->list, &tcp_cong_list);
>> + list_del_rcu(&existing->list);
>> + pr_debug("%s updated\n", ca->name);
>> + }
>> + spin_unlock(&tcp_cong_list_lock);
>> +
>> + return ret;
>> +}
>
> Was wondering if we could have tcp_register_congestion_control and
> tcp_update_congestion_control
> could be refactored for reuse. Maybe like below. From the function
> itself what is not clear whether
> callers that replace an existing one should do the synchronize_rcu()
> themselves or if this should
> be part of tcp_update_congestion_control?
>
> int tcp_check_congestion_control(struct tcp_congestion_ops *ca)
> {
> /* All algorithms must implement these. */
> if (!ca->ssthresh || !ca->undo_cwnd ||
> !(ca->cong_avoid || ca->cong_control)) {
> pr_err("%s does not implement required ops\n", ca->name);
> return -EINVAL;
> }
> if (ca->key == TCP_CA_UNSPEC)
> ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
> if (ca->key == TCP_CA_UNSPEC) {
> pr_notice("%s results in zero key\n", ca->name);
> return -EEXIST;
> }
> return 0;
> }
>
> /* Attach new congestion control algorithm to the list of available
> * options or replace an existing one if old is non-NULL.
> */
> int tcp_update_congestion_control(struct tcp_congestion_ops *new,
> struct tcp_congestion_ops *old)
> {
> struct tcp_congestion_ops *found;
> int ret;
>
> ret = tcp_check_congestion_control(new);
> if (ret)
> return ret;
> if (old &&
> (old->key != new->key ||
> strcmp(old->name, new->name))) {
> pr_notice("%s & %s have non-matching congestion control names\n",
> old->name, new->name);
> return -EINVAL;
> }
> spin_lock(&tcp_cong_list_lock);
> found = tcp_ca_find_key(new->key); > if (old) {
> if (found == old) {
> list_add_tail_rcu(&new->list, &tcp_cong_list);
> list_del_rcu(&old->list);
> } else {
> pr_notice("%s not registered\n", old->name);
> ret = -EINVAL;
> }
> } else {
> if (found) {
> pr_notice("%s already registered\n", new->name);
> ret = -EEXIST;
> } else {
> list_add_tail_rcu(&new->list, &tcp_cong_list);
> }
> }
> spin_unlock(&tcp_cong_list_lock);
> return ret;
> }
After trying to do this refactoring, I found it just shares a few lines
of code, but make thing complicated by adding more checks. So, I prefer
to keep it as it is. How do you think?
>
> int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
> {
> return tcp_update_congestion_control(ca, NULL);
> }
> EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
next prev parent reply other threads:[~2023-03-17 23:07 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-03-16 2:36 [PATCH bpf-next v7 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
2023-03-16 2:36 ` [PATCH bpf-next v7 1/8] bpf: Retire the struct_ops map kvalue->refcnt Kui-Feng Lee
2023-03-17 16:47 ` Martin KaFai Lau
2023-03-17 20:41 ` Kui-Feng Lee
2023-03-16 2:36 ` [PATCH bpf-next v7 2/8] net: Update an existing TCP congestion control algorithm Kui-Feng Lee
2023-03-17 15:23 ` Daniel Borkmann
2023-03-17 17:18 ` Martin KaFai Lau
2023-03-17 17:23 ` Daniel Borkmann
2023-03-17 21:46 ` Kui-Feng Lee
2023-03-17 23:07 ` Kui-Feng Lee [this message]
2023-03-16 2:36 ` [PATCH bpf-next v7 3/8] bpf: Create links for BPF struct_ops maps Kui-Feng Lee
2023-03-17 18:10 ` Martin KaFai Lau
2023-03-17 20:52 ` Kui-Feng Lee
2023-03-16 2:36 ` [PATCH bpf-next v7 4/8] libbpf: Create a bpf_link in bpf_map__attach_struct_ops() Kui-Feng Lee
2023-03-17 18:44 ` Martin KaFai Lau
2023-03-17 21:00 ` Kui-Feng Lee
2023-03-17 22:23 ` Andrii Nakryiko
2023-03-17 23:48 ` Kui-Feng Lee
2023-03-16 2:36 ` [PATCH bpf-next v7 5/8] bpf: Update the struct_ops of a bpf_link Kui-Feng Lee
2023-03-17 19:23 ` Martin KaFai Lau
2023-03-17 21:39 ` Kui-Feng Lee
2023-03-18 1:11 ` Kui-Feng Lee
2023-03-18 5:38 ` Martin KaFai Lau
2023-03-17 22:27 ` Andrii Nakryiko
2023-03-18 0:41 ` Kui-Feng Lee
2023-03-16 2:36 ` [PATCH bpf-next v7 6/8] libbpf: Update a bpf_link with another struct_ops Kui-Feng Lee
2023-03-17 19:42 ` Martin KaFai Lau
2023-03-17 21:40 ` Kui-Feng Lee
2023-03-17 22:33 ` Andrii Nakryiko
2023-03-18 1:17 ` Kui-Feng Lee
2023-03-16 2:36 ` [PATCH bpf-next v7 7/8] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link Kui-Feng Lee
2023-03-17 22:35 ` Andrii Nakryiko
2023-03-16 2:36 ` [PATCH bpf-next v7 8/8] selftests/bpf: Test switching TCP Congestion Control algorithms Kui-Feng Lee
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4271f277-ec08-ea15-4c7b-8c5147db6308@gmail.com \
--to=sinquersw@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=edumazet@google.com \
--cc=kernel-team@meta.com \
--cc=kuifeng@meta.com \
--cc=martin.lau@linux.dev \
--cc=netdev@vger.kernel.org \
--cc=sdf@google.com \
--cc=song@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox