From: Martin KaFai Lau <martin.lau@linux.dev>
To: Abhishek Chauhan <quic_abchauha@quicinc.com>
Cc: "David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
Andrew Halaney <ahalaney@redhat.com>,
Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
Martin KaFai Lau <martin.lau@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>, bpf <bpf@vger.kernel.org>,
kernel@quicinc.com
Subject: Re: [RFC PATCH bpf-next v5 2/2] net: Add additional bit to support clockid_t timestamp type
Date: Thu, 25 Apr 2024 21:37:36 -0700 [thread overview]
Message-ID: <2b2c3eb1-df87-40fe-b871-b52812c8ecd0@linux.dev> (raw)
In-Reply-To: <20240424222028.1080134-3-quic_abchauha@quicinc.com>
On 4/24/24 3:20 PM, Abhishek Chauhan wrote:
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index e464d0ebc9c1..3ad0de07d261 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -711,6 +711,8 @@ typedef unsigned char *sk_buff_data_t;
> enum skb_tstamp_type {
> SKB_CLOCK_REALTIME,
> SKB_CLOCK_MONOTONIC,
> + SKB_CLOCK_TAI,
> + __SKB_CLOCK_MAX = SKB_CLOCK_TAI,
> };
>
> /**
> @@ -831,8 +833,8 @@ enum skb_tstamp_type {
> * @decrypted: Decrypted SKB
> * @slow_gro: state present at GRO time, slower prepare step required
> * @tstamp_type: When set, skb->tstamp has the
> - * delivery_time in mono clock base Otherwise, the
> - * timestamp is considered real clock base.
> + * delivery_time in mono clock base or clock base of skb->tstamp.
> + * Otherwise, the timestamp is considered real clock base
> * @napi_id: id of the NAPI struct this skb came from
> * @sender_cpu: (aka @napi_id) source CPU in XPS
> * @alloc_cpu: CPU which did the skb allocation.
> @@ -960,7 +962,7 @@ struct sk_buff {
> /* private: */
> __u8 __mono_tc_offset[0];
> /* public: */
> - __u8 tstamp_type:1; /* See skb_tstamp_type */
> + __u8 tstamp_type:2; /* See skb_tstamp_type */
> #ifdef CONFIG_NET_XGRESS
> __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */
> __u8 tc_skip_classify:1;
> @@ -1090,15 +1092,17 @@ struct sk_buff {
> #endif
> #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset)
>
> -/* if you move tc_at_ingress or mono_delivery_time
> +/* if you move tc_at_ingress or tstamp_type:2
> * around, you also must adapt these constants.
> */
> #ifdef __BIG_ENDIAN_BITFIELD
> -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7)
> -#define TC_AT_INGRESS_MASK (1 << 6)
> +#define SKB_TSTAMP_TYPE_MASK (3 << 6)
> +#define SKB_TSTAMP_TYPE_RSH (6)
> +#define TC_AT_INGRESS_RSH (5)
TC_AT_INGRESS_RSH is not used.
> +#define TC_AT_INGRESS_MASK (1 << 5)
> #else
> -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0)
> -#define TC_AT_INGRESS_MASK (1 << 1)
> +#define SKB_TSTAMP_TYPE_MASK (3)
> +#define TC_AT_INGRESS_MASK (1 << 2)
> #endif
> #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset)
>
> @@ -4204,6 +4208,12 @@ static inline void skb_set_tstamp_type_frm_clkid(struct sk_buff *skb,
> case CLOCK_MONOTONIC:
> skb->tstamp_type = SKB_CLOCK_MONOTONIC;
> break;
> + case CLOCK_TAI:
> + skb->tstamp_type = SKB_CLOCK_TAI;
> + break;
> + default:
> + WARN_ONCE(true, "clockid %d not supported", clockid);
> + skb->tstamp_type = SKB_CLOCK_REALTIME;
> }
> }
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index cee0a7915c08..1376ed5ece10 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
The bpf.h needs to be sync to tools/include/uapi/linux/bpf.h.
Otherwise, the bpf CI cannot compile the tests:
https://patchwork.kernel.org/project/netdevbpf/patch/20240424222028.1080134-2-quic_abchauha@quicinc.com/
Please monitor the bpf CI test result after submitting the patches.
> @@ -6209,6 +6209,7 @@ union { \
> enum {
> BPF_SKB_TSTAMP_UNSPEC,
> BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */
> + BPF_SKB_TSTAMP_DELIVERY_TAI, /* tstamp has tai delivery time */
> /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle,
> * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC
> * and try to deduce it by ingress, egress or skb->sk->sk_clockid.
SKB_CLOCK_TAI is properly defined as an enum now and there is a
WARN for clock other than REAL, MONO, and TAI. I think it is
time to remove UNSPEC and give it back the proper name REALTIME.
I want to take this chance to do some renaming:
/* The enum used in skb->tstamp_type. It specifies the clock type
* of the time stored in the skb->tstamp.
*/
enum {
BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */
BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */
BPF_SKB_CLOCK_REALTIME = 0, /* Realtime clock */
BPF_SKB_CLOCK_MONOTONIC = 1, /* Monotonic clock */
BPF_SKB_CLOCK_TAI = 2, /* TAI clock */
/* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle,
* the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid.
*/
};
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 957c2fc724eb..c67622f4fe98 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -7733,6 +7733,12 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
> skb->tstamp = tstamp;
> skb->tstamp_type = SKB_CLOCK_MONOTONIC;
> break;
> + case BPF_SKB_TSTAMP_DELIVERY_TAI:
> + if (!tstamp)
> + return -EINVAL;
> + skb->tstamp = tstamp;
> + skb->tstamp_type = SKB_CLOCK_TAI;
> + break;
> case BPF_SKB_TSTAMP_UNSPEC:
> if (tstamp)
Allow to store any realtime tstamp here since BPF_SKB_TSTAMP_UNSPEC
becomes BPF_SKB_CLOCK_REALTIME.
Like:
BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
u64, tstamp, u32, tstamp_type)
{
/* ... */
case BPF_SKB_CLOCK_TAI:
if (!tstamp)
return -EINVAL;
skb->tstamp = tstamp;
skb->tstamp_type = SKB_CLOCK_TAI;
break;
case BPF_SKB_CLOCK_REALTIME:
skb->tstamp = tstamp;
skb->tstamp_type = SKB_CLOCK_REALTIME;
break;
/* ... */
}
> return -EINVAL;
> @@ -9388,17 +9394,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
> {
> __u8 value_reg = si->dst_reg;
> __u8 skb_reg = si->src_reg;
> - /* AX is needed because src_reg and dst_reg could be the same */
> - __u8 tmp_reg = BPF_REG_AX;
> -
> - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
> - SKB_BF_MONO_TC_OFFSET);
> - *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
> - SKB_MONO_DELIVERY_TIME_MASK, 2);
> - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
> - *insn++ = BPF_JMP_A(1);
> - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
> -
> + BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI);
Add these also:
BUILD_BUG_ON(SKB_CLOCK_REALTIME != BPF_SKB_CLOCK_REALTIME);
BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != BPF_SKB_CLOCK_MONOTONIC);
BUILD_BUG_ON(SKB_CLOCK_TAI != BPF_SKB_CLOCK_TAI);
> + *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
> + *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
> +#ifdef __BIG_ENDIAN_BITFIELD
> + *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSH);
> +#else
> + BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
> +#endif
> + *insn++ = BPF_JMP32_IMM(BPF_JNE, value_reg, SKB_TSTAMP_TYPE_MASK, 1);
> + /* Both the bits set then mark it BPF_SKB_TSTAMP_UNSPEC */
> + *insn++ = BPF_MOV64_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
The kernel should not have both bits set in skb->tstamp_type. No need to
add two extra bpf insns to check this. If there is a bug in the kernel,
it is better to be uncovered instead of hiding it under BPF_SKB_TSTAMP_UNSPEC (which
is renamed to BPF_SKB_CLOCK_REALTIME anyway).
Hence, the last two bpf insns should be removed.
> return insn;
> }
>
> @@ -9430,6 +9436,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
> __u8 value_reg = si->dst_reg;
> __u8 skb_reg = si->src_reg;
>
> +BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI);
It is a dup of the one in bpf_convert_tstamp_type_read and can be removed.
> #ifdef CONFIG_NET_XGRESS
> /* If the tstamp_type is read,
> * the bpf prog is aware the tstamp could have delivery time.
> @@ -9440,11 +9447,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
> __u8 tmp_reg = BPF_REG_AX;
>
> *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
> - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
> - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
> - *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
> - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2);
> - /* skb->tc_at_ingress && skb->tstamp_type:1,
> + /* check if ingress mask bits is set */
> + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
> + *insn++ = BPF_JMP_A(4);
> + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
> + *insn++ = BPF_JMP_A(2);
> + /* skb->tc_at_ingress && skb->tstamp_type:2,
> * read 0 as the (rcv) timestamp.
> */
> *insn++ = BPF_MOV64_IMM(value_reg, 0);
> @@ -9469,7 +9477,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
> * the bpf prog is aware the tstamp could have delivery time.
> * Thus, write skb->tstamp as is if tstamp_type_access is true.
> * Otherwise, writing at ingress will have to clear the
> - * mono_delivery_time (skb->tstamp_type:1)bit also.
> + * mono_delivery_time (skb->tstamp_type:2)bit also.
> */
> if (!prog->tstamp_type_access) {
> __u8 tmp_reg = BPF_REG_AX;
> @@ -9479,8 +9487,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
> *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
> /* goto <store> */
> *insn++ = BPF_JMP_A(2);
> - /* <clear>: mono_delivery_time or (skb->tstamp_type:1) */
> - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
> + /* <clear>: skb->tstamp_type:2 */
> + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
> *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
> }
> #endif
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 591226dcde26..f195b31d6e75 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -1457,7 +1457,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
>
> skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
> skb->mark = cork->mark;
> - skb->tstamp = cork->transmit_time;
> + skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);
hmm... I think this will break for tcp. This sequence in particular:
tcp_v4_timewait_ack()
tcp_v4_send_ack()
ip_send_unicast_reply()
ip_push_pending_frames()
ip_finish_skb()
__ip_make_skb()
/* sk_clockid is REAL but cork->transmit_time should be in mono */
skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);;
I think I hit it from time to time when running the test in this patch set.
[ ... ]
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
> index 74ec09f040b7..19dba6d88265 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
Please separate the selftests/bpf changes into another patch.
> @@ -227,6 +227,12 @@ int egress_host(struct __sk_buff *skb)
> inc_dtimes(EGRESS_ENDHOST);
> else
> inc_errs(EGRESS_ENDHOST);
> + } else if (skb_proto(skb_type) == IPPROTO_UDP) {
> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI &&
> + skb->tstamp)
> + inc_dtimes(EGRESS_ENDHOST);
> + else
> + inc_errs(EGRESS_ENDHOST);
> } else {
> if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC &&
> skb->tstamp)
> @@ -255,6 +261,9 @@ int ingress_host(struct __sk_buff *skb)
> if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO &&
> skb->tstamp == EGRESS_FWDNS_MAGIC)
> inc_dtimes(INGRESS_ENDHOST);
> + else if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI &&
> + skb->tstamp == EGRESS_FWDNS_MAGIC)
> + inc_dtimes(INGRESS_ENDHOST);
> else
> inc_errs(INGRESS_ENDHOST);
>
> @@ -323,12 +332,14 @@ int ingress_fwdns_prio101(struct __sk_buff *skb)
> /* Should have handled in prio100 */
> return TC_ACT_SHOT;
>
> - if (skb_proto(skb_type) == IPPROTO_UDP)
> + if (skb_proto(skb_type) == IPPROTO_UDP &&
> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI)
> expected_dtime = 0;
The IPPROTO_UDP check and expected_dtime can be removed. The UDP test
can expect the same EGRESS_ENDHOST_MAGIC in the skb->tstamp since
the TAI tstamp is also forwarded from egress to ingress now.
>
> if (skb->tstamp_type) {
> if (fwdns_clear_dtime() ||
> - skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
> + (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO &&
> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) ||
> skb->tstamp != expected_dtime)
> inc_errs(INGRESS_FWDNS_P101);
> else
> @@ -338,7 +349,8 @@ int ingress_fwdns_prio101(struct __sk_buff *skb)
> inc_errs(INGRESS_FWDNS_P101);
> }
>
> - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO ||
> + skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) {
No need to check BPF_SKB_TSTAMP_DELIVERY_TAI such that the
bpf_skb_set_tstamp() helper can still be tested.
There are some other minor changes needed for the test_tc_dtime.c and
the tc_redirect.c. I quickly made the changes and put them here (first patch):
https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=skb.tstamp_type
> skb->tstamp = INGRESS_FWDNS_MAGIC;
> } else {
> if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
> @@ -370,7 +382,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb)
>
> if (skb->tstamp_type) {
> if (fwdns_clear_dtime() ||
> - skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
> + (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO &&
> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) ||
> skb->tstamp != INGRESS_FWDNS_MAGIC)
> inc_errs(EGRESS_FWDNS_P101);
> else
> @@ -380,7 +393,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb)
> inc_errs(EGRESS_FWDNS_P101);
> }
>
> - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO ||
> + skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) {
> skb->tstamp = EGRESS_FWDNS_MAGIC;
> } else {
> if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC,
next prev parent reply other threads:[~2024-04-26 4:37 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-24 22:20 [RFC PATCH bpf-next v5 0/2] Replace mono_delivery_time with tstamp_type Abhishek Chauhan
2024-04-24 22:20 ` [RFC PATCH bpf-next v5 1/2] net: Rename mono_delivery_time to tstamp_type for scalabilty Abhishek Chauhan
2024-04-25 14:31 ` Willem de Bruijn
2024-04-25 19:02 ` Abhishek Chauhan (ABC)
2024-04-25 23:50 ` Martin KaFai Lau
2024-04-25 23:55 ` Abhishek Chauhan (ABC)
2024-04-24 22:20 ` [RFC PATCH bpf-next v5 2/2] net: Add additional bit to support clockid_t timestamp type Abhishek Chauhan
2024-04-25 14:42 ` Willem de Bruijn
2024-04-25 19:12 ` Abhishek Chauhan (ABC)
2024-04-26 4:37 ` Martin KaFai Lau [this message]
2024-04-26 18:46 ` Abhishek Chauhan (ABC)
2024-04-26 23:50 ` Martin KaFai Lau
2024-04-30 19:16 ` Abhishek Chauhan (ABC)
2024-04-30 20:26 ` Willem de Bruijn
2024-04-30 20:40 ` Abhishek Chauhan (ABC)
2024-05-03 21:33 ` Abhishek Chauhan (ABC)
2024-05-03 21:41 ` Martin KaFai Lau
2024-05-03 22:14 ` Abhishek Chauhan (ABC)
2024-04-26 23:55 ` Martin KaFai Lau
2024-04-30 19:59 ` Abhishek Chauhan (ABC)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=2b2c3eb1-df87-40fe-b871-b52812c8ecd0@linux.dev \
--to=martin.lau@linux.dev \
--cc=ahalaney@redhat.com \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kernel@quicinc.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=martin.lau@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=quic_abchauha@quicinc.com \
--cc=willemdebruijn.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.