From: Paolo Abeni <pabeni@redhat.com>
To: chia-yu.chang@nokia-bell-labs.com, horms@kernel.org,
dsahern@kernel.org, kuniyu@amazon.com, bpf@vger.kernel.org,
netdev@vger.kernel.org, dave.taht@gmail.com, jhs@mojatatu.com,
kuba@kernel.org, stephen@networkplumber.org,
xiyou.wangcong@gmail.com, jiri@resnulli.us, davem@davemloft.net,
edumazet@google.com, andrew+netdev@lunn.ch,
donald.hunter@gmail.com, ast@fiberby.net, liuhangbin@gmail.com,
shuah@kernel.org, linux-kselftest@vger.kernel.org, ij@kernel.org,
ncardwell@google.com, koen.de_schepper@nokia-bell-labs.com,
g.white@cablelabs.com, ingemar.s.johansson@ericsson.com,
mirja.kuehlewind@ericsson.com, cheshire@apple.com,
rs.ietf@gmx.at, Jason_Livingood@comcast.com,
vidhi_goel@apple.com
Subject: Re: [PATCH v5 net-next 09/15] tcp: accecn: AccECN option
Date: Tue, 29 Apr 2025 13:56:11 +0200 [thread overview]
Message-ID: <d0969c3d-e33c-472e-815d-70b333990b39@redhat.com> (raw)
In-Reply-To: <20250422153602.54787-10-chia-yu.chang@nokia-bell-labs.com>
On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote:
> @@ -302,10 +303,13 @@ struct tcp_sock {
> u32 snd_up; /* Urgent pointer */
> u32 delivered; /* Total data packets delivered incl. rexmits */
> u32 delivered_ce; /* Like the above but only ECE marked packets */
> + u32 delivered_ecn_bytes[3];
This new fields do not belong to this cacheline group. I'm unsure they
belong to fast-path at all. Also u32 will wrap-around very soon.
[...]
> diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
> index dc8fdc80e16b..74ac8a5d2e00 100644
> --- a/include/uapi/linux/tcp.h
> +++ b/include/uapi/linux/tcp.h
> @@ -298,6 +298,13 @@ struct tcp_info {
> __u32 tcpi_snd_wnd; /* peer's advertised receive window after
> * scaling (bytes)
> */
> + __u32 tcpi_received_ce; /* # of CE marks received */
> + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */
> + __u32 tcpi_delivered_e0_bytes;
> + __u32 tcpi_delivered_ce_bytes;
> + __u32 tcpi_received_e1_bytes;
> + __u32 tcpi_received_e0_bytes;
> + __u32 tcpi_received_ce_bytes;
This will break uAPI: new fields must be addded at the end, or must fill
existing holes. Also u32 set in stone in uAPI for a byte counter looks
way too small.
> @@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void)
> /* 32bit arches with 8byte alignment on u64 fields might need padding
> * before tcp_clock_cache.
> */
> - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7);
> + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 122 + 6);
The above means an additional cacheline in fast-path WRT the current
status. IMHO should be avoided.
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 5bd7fc9bcf66..41e45b9aff3f 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -70,6 +70,7 @@
> #include <linux/sysctl.h>
> #include <linux/kernel.h>
> #include <linux/prefetch.h>
> +#include <linux/bitops.h>
> #include <net/dst.h>
> #include <net/tcp.h>
> #include <net/proto_memory.h>
> @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
> return false;
> }
>
> +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given
> + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
> + */
> +static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
> +{
> + switch (ecnfield) {
> + case INET_ECN_NOT_ECT:
> + return 0; /* AccECN does not send counts of NOT_ECT */
> + case INET_ECN_ECT_1:
> + return 1;
> + case INET_ECN_CE:
> + return 2;
> + case INET_ECN_ECT_0:
> + return 3;
> + default:
> + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield);
No WARN_ONCE() above please: either the 'ecnfield' data is masked vs
INET_ECN_MASK and the WARN_ONCE should not be possible or a remote
sender can deterministically trigger a WARN() which nowadays will in
turn raise a CVE...
[...]
> +static u32 tcp_accecn_field_init_offset(u8 ecnfield)
> +{
> + switch (ecnfield) {
> + case INET_ECN_NOT_ECT:
> + return 0; /* AccECN does not send counts of NOT_ECT */
> + case INET_ECN_ECT_1:
> + return TCP_ACCECN_E1B_INIT_OFFSET;
> + case INET_ECN_CE:
> + return TCP_ACCECN_CEB_INIT_OFFSET;
> + case INET_ECN_ECT_0:
> + return TCP_ACCECN_E0B_INIT_OFFSET;
> + default:
> + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield);
Same as above.
> + }
> + return 0;
> +}
> +
> +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */
> +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield,
> + bool order)
> +{
> + u8 tmp;
> +
> + optfield = order ? 2 - optfield : optfield;
> + tmp = optfield + 2;
> +
> + return (tmp + (tmp >> 2)) & INET_ECN_MASK;
> +}
> +
> +/* Handles AccECN option ECT and CE 24-bit byte counters update into
> + * the u32 value in tcp_sock. As we're processing TCP options, it is
> + * safe to access from - 1.
> + */
> +static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 init_offset)
> +{
> + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
> + 0xFFFFFFU;
> + u32 delta = (truncated - *cnt) & 0xFFFFFFU;
> +
> + /* If delta has the highest bit set (24th bit) indicating
> + * negative, sign extend to correct an estimation using
> + * sign_extend32(delta, 24 - 1)
> + */
> + delta = sign_extend32(delta, 23);
> + *cnt += delta;
> + return (s32)delta;
> +}
> +
> +/* Returns true if the byte counters can be used */
> +static bool tcp_accecn_process_option(struct tcp_sock *tp,
> + const struct sk_buff *skb,
> + u32 delivered_bytes, int flag)
> +{
> + u8 estimate_ecnfield = tp->est_ecnfield;
> + bool ambiguous_ecn_bytes_incr = false;
> + bool first_changed = false;
> + unsigned int optlen;
> + unsigned char *ptr;
> + bool order1, res;
> + unsigned int i;
> +
> + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
> + if (estimate_ecnfield) {
> + u8 ecnfield = estimate_ecnfield - 1;
> +
> + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
> + return true;
> + }
> + return false;
> + }
> +
> + ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
> + optlen = ptr[1] - 2;
This assumes optlen is greater then 2, but I don't see the relevant
check. Are tcp options present at all?
> + WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1);
Please, don't warn for arbitrary wrong data sent from the peer.
> + order1 = (ptr[0] == TCPOPT_ACCECN1);
> + ptr += 2;
> +
> + res = !!estimate_ecnfield;
> + for (i = 0; i < 3; i++) {
> + if (optlen >= TCPOLEN_ACCECN_PERFIELD) {
> + u32 init_offset;
> + u8 ecnfield;
> + s32 delta;
> + u32 *cnt;
> +
> + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
> + init_offset = tcp_accecn_field_init_offset(ecnfield);
> + cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
> + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
> + if (delta) {
> + if (delta < 0) {
> + res = false;
> + ambiguous_ecn_bytes_incr = true;
> + }
> + if (ecnfield != estimate_ecnfield) {
> + if (!first_changed) {
> + tp->est_ecnfield = ecnfield;
> + first_changed = true;
> + } else {
> + res = false;
> + ambiguous_ecn_bytes_incr = true;
> + }
At least 2 indentation levels above the maximum readable.
[...]
> @@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net,
>
> ptr = (const unsigned char *)(th + 1);
> opt_rx->saw_tstamp = 0;
> + opt_rx->accecn = 0;
> opt_rx->saw_unknown = 0;
It would be good to be able to zero both 'accecn' and 'saw_unknown' with
a single statement.
[...]
> @@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
> *ptr++ = htonl(opts->tsecr);
> }
>
> + if (OPTION_ACCECN & options) {
> + const u8 ect0_idx = INET_ECN_ECT_0 - 1;
> + const u8 ect1_idx = INET_ECN_ECT_1 - 1;
> + const u8 ce_idx = INET_ECN_CE - 1;
> + u32 e0b;
> + u32 e1b;
> + u32 ceb;
> + u8 len;
> +
> + e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
> + e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
> + ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
> + len = TCPOLEN_ACCECN_BASE +
> + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
> +
> + if (opts->num_accecn_fields == 2) {
> + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
> + ((e1b >> 8) & 0xffff));
> + *ptr++ = htonl(((e1b & 0xff) << 24) |
> + (ceb & 0xffffff));
> + } else if (opts->num_accecn_fields == 1) {
> + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
> + ((e1b >> 8) & 0xffff));
> + leftover_bytes = ((e1b & 0xff) << 8) |
> + TCPOPT_NOP;
> + leftover_size = 1;
> + } else if (opts->num_accecn_fields == 0) {
> + leftover_bytes = (TCPOPT_ACCECN1 << 8) | len;
> + leftover_size = 2;
> + } else if (opts->num_accecn_fields == 3) {
> + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
> + ((e1b >> 8) & 0xffff));
> + *ptr++ = htonl(((e1b & 0xff) << 24) |
> + (ceb & 0xffffff));
> + *ptr++ = htonl(((e0b & 0xffffff) << 8) |
> + TCPOPT_NOP);
The above chunck and the contents of patch 7 must be in the same patch.
This split makes the review even harder.
[...]
> @@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
> opts->num_sack_blocks = 0;
> }
>
> + if (tcp_ecn_mode_accecn(tp) &&
> + sock_net(sk)->ipv4.sysctl_tcp_ecn_option) {
> + int saving = opts->num_sack_blocks > 0 ? 2 : 0;
> + int remaining = MAX_TCP_OPTION_SPACE - size;
AFACS the above means tcp_options_fit_accecn() must clear any already
set options, but apparently it does not do so. Have you tested with
something adding largish options like mptcp?
/P
next prev parent reply other threads:[~2025-04-29 11:56 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-22 15:35 [PATCH v5 net-next 00/15] AccECN protocol patch series chia-yu.chang
2025-04-22 15:35 ` [PATCH v5 net-next 01/15] tcp: reorganize SYN ECN code chia-yu.chang
2025-04-22 15:35 ` [PATCH v5 net-next 02/15] tcp: fast path functions later chia-yu.chang
2025-04-22 15:35 ` [PATCH v5 net-next 03/15] tcp: AccECN core chia-yu.chang
2025-04-29 10:14 ` Paolo Abeni
2025-05-05 15:24 ` Chia-Yu Chang (Nokia)
2025-05-06 13:56 ` Chia-Yu Chang (Nokia)
2025-04-22 15:35 ` [PATCH v5 net-next 04/15] tcp: accecn: AccECN negotiation chia-yu.chang
2025-04-29 10:36 ` Paolo Abeni
2025-05-05 15:59 ` Chia-Yu Chang (Nokia)
2025-04-22 15:35 ` [PATCH v5 net-next 05/15] tcp: accecn: add AccECN rx byte counters chia-yu.chang
2025-04-29 10:45 ` Paolo Abeni
2025-04-22 15:35 ` [PATCH v5 net-next 06/15] tcp: accecn: AccECN needs to know delivered bytes chia-yu.chang
2025-04-22 15:35 ` [PATCH v5 net-next 07/15] tcp: allow embedding leftover into option padding chia-yu.chang
2025-04-29 10:57 ` Paolo Abeni
2025-05-05 23:09 ` Ilpo Järvinen
2025-05-06 8:50 ` Chia-Yu Chang (Nokia)
2025-04-22 15:35 ` [PATCH v5 net-next 08/15] tcp: sack option handling improvements chia-yu.chang
2025-04-22 15:35 ` [PATCH v5 net-next 09/15] tcp: accecn: AccECN option chia-yu.chang
2025-04-29 11:56 ` Paolo Abeni [this message]
2025-05-05 21:47 ` Chia-Yu Chang (Nokia)
2025-05-05 22:54 ` Ilpo Järvinen
2025-05-06 8:48 ` Chia-Yu Chang (Nokia)
2025-05-06 17:40 ` Ilpo Järvinen
2025-05-06 17:49 ` Ilpo Järvinen
2025-04-22 15:35 ` [PATCH v5 net-next 10/15] tcp: accecn: AccECN option send control chia-yu.chang
2025-04-29 12:10 ` Paolo Abeni
2025-05-05 21:53 ` Chia-Yu Chang (Nokia)
2025-05-05 23:26 ` Ilpo Järvinen
2025-05-06 8:53 ` Chia-Yu Chang (Nokia)
2025-04-22 15:35 ` [PATCH v5 net-next 11/15] tcp: accecn: AccECN option failure handling chia-yu.chang
2025-04-29 14:08 ` Paolo Abeni
2025-04-22 15:35 ` [PATCH v5 net-next 12/15] tcp: accecn: AccECN option ceb/cep heuristic chia-yu.chang
2025-04-22 15:36 ` [PATCH v5 net-next 13/15] tcp: accecn: AccECN ACE field multi-wrap heuristic chia-yu.chang
2025-04-22 15:36 ` [PATCH v5 net-next 14/15] tcp: accecn: try to fit AccECN option with SACK chia-yu.chang
2025-04-22 15:36 ` [PATCH v5 net-next 15/15] tcp: try to avoid safer when ACKs are thinned chia-yu.chang
2025-04-29 14:11 ` Paolo Abeni
2025-04-24 17:36 ` [PATCH v5 net-next 00/15] AccECN protocol patch series Simon Horman
2025-04-26 0:32 ` Jakub Kicinski
2025-04-29 2:33 ` Neal Cardwell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d0969c3d-e33c-472e-815d-70b333990b39@redhat.com \
--to=pabeni@redhat.com \
--cc=Jason_Livingood@comcast.com \
--cc=andrew+netdev@lunn.ch \
--cc=ast@fiberby.net \
--cc=bpf@vger.kernel.org \
--cc=cheshire@apple.com \
--cc=chia-yu.chang@nokia-bell-labs.com \
--cc=dave.taht@gmail.com \
--cc=davem@davemloft.net \
--cc=donald.hunter@gmail.com \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=g.white@cablelabs.com \
--cc=horms@kernel.org \
--cc=ij@kernel.org \
--cc=ingemar.s.johansson@ericsson.com \
--cc=jhs@mojatatu.com \
--cc=jiri@resnulli.us \
--cc=koen.de_schepper@nokia-bell-labs.com \
--cc=kuba@kernel.org \
--cc=kuniyu@amazon.com \
--cc=linux-kselftest@vger.kernel.org \
--cc=liuhangbin@gmail.com \
--cc=mirja.kuehlewind@ericsson.com \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=rs.ietf@gmx.at \
--cc=shuah@kernel.org \
--cc=stephen@networkplumber.org \
--cc=vidhi_goel@apple.com \
--cc=xiyou.wangcong@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).