* [PATCH net-next v2 01/12] net-timestamp: introduce socket tsflag requestors
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-15 1:30 ` Willem de Bruijn
2024-10-12 4:06 ` [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt Jason Xing
` (11 subsequent siblings)
12 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
We need a separate tsflag to control bpf extension feature so that
we will not affect the behaviors of existing applications.
The idea of introducing requestors for better extension (not only
serving bpf extension) comes from Vadim Fedorenko.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/ip.h | 2 +-
include/net/sock.h | 15 +++++++++++----
net/can/j1939/socket.c | 2 +-
net/core/skbuff.c | 5 +++--
net/core/sock.c | 8 ++++----
net/ipv4/ip_output.c | 2 +-
net/ipv4/ip_sockglue.c | 2 +-
net/ipv4/tcp.c | 2 +-
net/ipv6/ip6_output.c | 2 +-
net/ipv6/ping.c | 2 +-
net/ipv6/raw.c | 2 +-
net/ipv6/udp.c | 2 +-
net/sctp/socket.c | 2 +-
net/socket.c | 4 ++--
14 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index bab084df1567..b0a836aebc33 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -96,7 +96,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
ipcm_init(ipcm);
ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark);
- ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags);
+ ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
ipcm->addr = inet->inet_saddr;
ipcm->protocol = inet->inet_num;
diff --git a/include/net/sock.h b/include/net/sock.h
index b32f1424ecc5..8cf278c957b3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -234,6 +234,13 @@ struct sock_common {
struct bpf_local_storage;
struct sk_filter;
+enum {
+ SOCKETOPT_TS_REQUESTOR = 0,
+ BPFPROG_TS_REQUESTOR,
+
+ __MAX_TS_REQUESTOR,
+};
+
/**
* struct sock - network layer representation of sockets
* @__sk_common: shared layout with inet_timewait_sock
@@ -444,7 +451,7 @@ struct sock {
socket_lock_t sk_lock;
u32 sk_reserved_mem;
int sk_forward_alloc;
- u32 sk_tsflags;
+ u32 sk_tsflags[__MAX_TS_REQUESTOR];
__cacheline_group_end(sock_write_rxtx);
__cacheline_group_begin(sock_write_tx);
@@ -1809,7 +1816,7 @@ static inline void sockcm_init(struct sockcm_cookie *sockc,
const struct sock *sk)
{
*sockc = (struct sockcm_cookie) {
- .tsflags = READ_ONCE(sk->sk_tsflags)
+ .tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR])
};
}
@@ -2617,7 +2624,7 @@ static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
- u32 tsflags = READ_ONCE(sk->sk_tsflags);
+ u32 tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
ktime_t kt = skb->tstamp;
/*
* generate control messages if
@@ -2652,7 +2659,7 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
SOF_TIMESTAMPING_RAW_HARDWARE)
if (sk->sk_flags & FLAGS_RECV_CMSGS ||
- READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY)
+ READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]) & TSFLAGS_ANY)
__sock_recv_cmsgs(msg, sk, skb);
else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
sock_write_timestamp(sk, skb->tstamp);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 305dd72c844c..8f5799930a93 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -996,7 +996,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk,
if (!(jsk->state & J1939_SOCK_ERRQUEUE))
return;
- tsflags = READ_ONCE(sk->sk_tsflags);
+ tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
switch (type) {
case J1939_ERRQUEUE_TX_ACK:
if (!(tsflags & SOF_TIMESTAMPING_TX_ACK))
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 00afeb90c23a..ab0a59f1e14d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5490,7 +5490,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
serr->ee.ee_info = tstype;
serr->opt_stats = opt_stats;
serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
- if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ if (READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]) &
+ SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
if (sk_is_tcp(sk))
serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
@@ -5551,7 +5552,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
- tsflags = READ_ONCE(sk->sk_tsflags);
+ tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
return;
diff --git a/net/core/sock.c b/net/core/sock.c
index 083d438d8b6f..52c8c5a5ba27 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -908,7 +908,7 @@ int sock_set_timestamping(struct sock *sk, int optname,
return -EINVAL;
if (val & SOF_TIMESTAMPING_OPT_ID &&
- !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ !(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] & SOF_TIMESTAMPING_OPT_ID)) {
if (sk_is_tcp(sk)) {
if ((1 << sk->sk_state) &
(TCPF_CLOSE | TCPF_LISTEN))
@@ -932,7 +932,7 @@ int sock_set_timestamping(struct sock *sk, int optname,
return ret;
}
- WRITE_ONCE(sk->sk_tsflags, val);
+ WRITE_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR], val);
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
@@ -1797,7 +1797,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
* Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
*/
if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
- v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
+ v.timestamping.flags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
}
break;
@@ -2930,7 +2930,7 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
case SCM_TS_OPT_ID:
if (sk_is_tcp(sk))
return -EINVAL;
- tsflags = READ_ONCE(sk->sk_tsflags);
+ tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
return -EINVAL;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e5c55a95063d..ded504458d5d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1050,7 +1050,7 @@ static int __ip_append_data(struct sock *sk,
cork->length += length;
if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
- READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]) & SOF_TIMESTAMPING_OPT_ID) {
if (cork->flags & IPCORK_TS_OPT_ID) {
tskey = cork->ts_opt_id;
} else {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cf377377b52d..fac8f593c43a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -509,7 +509,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
* or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
*/
info = PKTINFO_SKB_CB(skb);
- if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) ||
+ if (!(READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]) & SOF_TIMESTAMPING_OPT_CMSG) ||
!info->ipi_ifindex)
return false;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 82cc4a5633ce..6c8968eb4427 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2245,7 +2245,7 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
struct scm_timestamping_internal *tss)
{
int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
- u32 tsflags = READ_ONCE(sk->sk_tsflags);
+ u32 tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
bool has_timestamping = false;
if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 205673179b3c..c983e0ca6f72 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1547,7 +1547,7 @@ static int __ip6_append_data(struct sock *sk,
}
if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
- READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]) & SOF_TIMESTAMPING_OPT_ID) {
if (cork->flags & IPCORK_TS_OPT_ID) {
tskey = cork->ts_opt_id;
} else {
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 88b3fcacd4f9..0080b7c3a475 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -119,7 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EINVAL;
ipcm6_init_sk(&ipc6, sk);
- ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
+ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
fl6.flowi6_oif = oif;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8476a3944a88..cd02aa02d813 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -778,7 +778,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_uid = sk->sk_uid;
ipcm6_init(&ipc6);
- ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
+ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
ipc6.sockc.mark = fl6.flowi6_mark;
if (sin6) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 52dfbb2ff1a8..008cc0282338 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1349,7 +1349,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipcm6_init(&ipc6);
ipc6.gso_size = READ_ONCE(up->gso_size);
- ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
+ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
/* destination address check */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 078bcb3858c7..f66f21d6363e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -9463,7 +9463,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newsk->sk_type = sk->sk_type;
newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
newsk->sk_flags = sk->sk_flags;
- newsk->sk_tsflags = sk->sk_tsflags;
+ memcpy(newsk->sk_tsflags, sk->sk_tsflags, sizeof(u32) * __MAX_TS_REQUESTOR);
newsk->sk_no_check_tx = sk->sk_no_check_tx;
newsk->sk_no_check_rx = sk->sk_no_check_rx;
newsk->sk_reuse = sk->sk_reuse;
diff --git a/net/socket.c b/net/socket.c
index 3b1b65b9f471..24619a27909a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -845,7 +845,7 @@ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp)
static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index)
{
- bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC;
+ bool cycles = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]) & SOF_TIMESTAMPING_BIND_PHC;
struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
struct net_device *orig_dev;
ktime_t hwtstamp;
@@ -944,7 +944,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
}
memset(&tss, 0, sizeof(tss));
- tsflags = READ_ONCE(sk->sk_tsflags);
+ tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
if ((tsflags & SOF_TIMESTAMPING_SOFTWARE &&
(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
skb_is_err_queue(skb) ||
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 01/12] net-timestamp: introduce socket tsflag requestors
2024-10-12 4:06 ` [PATCH net-next v2 01/12] net-timestamp: introduce socket tsflag requestors Jason Xing
@ 2024-10-15 1:30 ` Willem de Bruijn
2024-10-15 1:50 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 1:30 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> We need a separate tsflag to control bpf extension feature so that
> we will not affect the behaviors of existing applications.
>
> The idea of introducing requestors for better extension (not only
> serving bpf extension) comes from Vadim Fedorenko.
As also said in the cover letter: I prefer sk_tstflags_bpf.
This array approach adds code churn, may have cacheline effects by
moving other fields and anticipates I don't see a third requestor
happening. And if it does, we'll deal with it then.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 01/12] net-timestamp: introduce socket tsflag requestors
2024-10-15 1:30 ` Willem de Bruijn
@ 2024-10-15 1:50 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-15 1:50 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 9:30 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > We need a separate tsflag to control bpf extension feature so that
> > we will not affect the behaviors of existing applications.
> >
> > The idea of introducing requestors for better extension (not only
> > serving bpf extension) comes from Vadim Fedorenko.
>
> As also said in the cover letter: I prefer sk_tstflags_bpf.
>
> This array approach adds code churn, may have cacheline effects by
> moving other fields and anticipates I don't see a third requestor
> happening. And if it does, we'll deal with it then.
Got it. Thanks. I will adjust it accordingly.
^ permalink raw reply [flat|nested] 73+ messages in thread
* [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
2024-10-12 4:06 ` [PATCH net-next v2 01/12] net-timestamp: introduce socket tsflag requestors Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-15 1:34 ` Willem de Bruijn
` (2 more replies)
2024-10-12 4:06 ` [PATCH net-next v2 03/12] net-timestamp: reorganize in skb_tstamp_tx_output() Jason Xing
` (10 subsequent siblings)
12 siblings, 3 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
For now, we support bpf_setsockopt only TX timestamps flags. Users
can use something like this in bpf program to turn on the feature:
flags = SOF_TIMESTAMPING_TX_SCHED;
bpf_setsockopt(skops, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
Later, I will support each Tx flags one by one based on this.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/sock.h | 2 ++
net/core/filter.c | 27 +++++++++++++++++++++++++++
net/core/sock.c | 35 ++++++++++++++++++++++++-----------
3 files changed, 53 insertions(+), 11 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 8cf278c957b3..66ecd78f1dfe 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2890,6 +2890,8 @@ void sock_def_readable(struct sock *sk);
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
+int sock_get_timestamping(struct so_timestamping *timestamping,
+ sockptr_t optval, unsigned int optlen);
int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
diff --git a/net/core/filter.c b/net/core/filter.c
index bd0d08bf76bb..996426095bd9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5204,10 +5204,30 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static int bpf_sock_set_timestamping(struct sock *sk,
+ struct so_timestamping *timestamping)
+{
+ u32 flags = timestamping->flags;
+
+ if (flags & ~SOF_TIMESTAMPING_MASK)
+ return -EINVAL;
+
+ if (!(flags & (SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_TX_ACK)))
+ return -EINVAL;
+
+ WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
+
+ return 0;
+}
+
static int sol_socket_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
{
+ struct so_timestamping ts;
+ int ret = 0;
+
switch (optname) {
case SO_REUSEADDR:
case SO_SNDBUF:
@@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
break;
case SO_BINDTODEVICE:
break;
+ case SO_TIMESTAMPING_NEW:
+ case SO_TIMESTAMPING_OLD:
+ ret = sock_get_timestamping(&ts, KERNEL_SOCKPTR(optval),
+ *optlen);
+ if (!ret)
+ ret = bpf_sock_set_timestamping(sk, &ts);
+ return ret;
default:
return -EINVAL;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 52c8c5a5ba27..a6e0d51a5f72 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -894,6 +894,27 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
return 0;
}
+int sock_get_timestamping(struct so_timestamping *timestamping,
+ sockptr_t optval, unsigned int optlen)
+{
+ int val;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ if (optlen == sizeof(*timestamping)) {
+ if (copy_from_sockptr(timestamping, optval,
+ sizeof(*timestamping))) {
+ return -EFAULT;
+ }
+ } else {
+ memset(timestamping, 0, sizeof(*timestamping));
+ timestamping->flags = val;
+ }
+
+ return 0;
+}
+
int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping)
{
@@ -1402,17 +1423,9 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
case SO_TIMESTAMPING_NEW:
case SO_TIMESTAMPING_OLD:
- if (optlen == sizeof(timestamping)) {
- if (copy_from_sockptr(&timestamping, optval,
- sizeof(timestamping))) {
- ret = -EFAULT;
- break;
- }
- } else {
- memset(&timestamping, 0, sizeof(timestamping));
- timestamping.flags = val;
- }
- ret = sock_set_timestamping(sk, optname, timestamping);
+ ret = sock_get_timestamping(&timestamping, optval, optlen);
+ if (!ret)
+ ret = sock_set_timestamping(sk, optname, timestamping);
break;
case SO_RCVLOWAT:
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-12 4:06 ` [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt Jason Xing
@ 2024-10-15 1:34 ` Willem de Bruijn
2024-10-15 2:05 ` Jason Xing
2024-10-15 21:32 ` Martin KaFai Lau
2024-10-15 23:54 ` Martin KaFai Lau
2 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 1:34 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> For now, we support bpf_setsockopt only TX timestamps flags. Users
> can use something like this in bpf program to turn on the feature:
>
> flags = SOF_TIMESTAMPING_TX_SCHED;
> bpf_setsockopt(skops, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
>
> Later, I will support each Tx flags one by one based on this.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> include/net/sock.h | 2 ++
> net/core/filter.c | 27 +++++++++++++++++++++++++++
> net/core/sock.c | 35 ++++++++++++++++++++++++-----------
> 3 files changed, 53 insertions(+), 11 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 8cf278c957b3..66ecd78f1dfe 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2890,6 +2890,8 @@ void sock_def_readable(struct sock *sk);
>
> int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
> void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
> +int sock_get_timestamping(struct so_timestamping *timestamping,
> + sockptr_t optval, unsigned int optlen);
> int sock_set_timestamping(struct sock *sk, int optname,
> struct so_timestamping timestamping);
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index bd0d08bf76bb..996426095bd9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5204,10 +5204,30 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> .arg1_type = ARG_PTR_TO_CTX,
> };
>
> +static int bpf_sock_set_timestamping(struct sock *sk,
> + struct so_timestamping *timestamping)
> +{
> + u32 flags = timestamping->flags;
> +
> + if (flags & ~SOF_TIMESTAMPING_MASK)
> + return -EINVAL;
> +
> + if (!(flags & (SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
> + SOF_TIMESTAMPING_TX_ACK)))
> + return -EINVAL;
> +
> + WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> +
> + return 0;
> +}
> +
> static int sol_socket_sockopt(struct sock *sk, int optname,
> char *optval, int *optlen,
> bool getopt)
> {
> + struct so_timestamping ts;
> + int ret = 0;
> +
> switch (optname) {
> case SO_REUSEADDR:
> case SO_SNDBUF:
> @@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> break;
> case SO_BINDTODEVICE:
> break;
> + case SO_TIMESTAMPING_NEW:
> + case SO_TIMESTAMPING_OLD:
> + ret = sock_get_timestamping(&ts, KERNEL_SOCKPTR(optval),
> + *optlen);
> + if (!ret)
> + ret = bpf_sock_set_timestamping(sk, &ts);
> + return ret;
> default:
> return -EINVAL;
> }
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 52c8c5a5ba27..a6e0d51a5f72 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -894,6 +894,27 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
> return 0;
> }
>
> +int sock_get_timestamping(struct so_timestamping *timestamping,
> + sockptr_t optval, unsigned int optlen)
> +{
> + int val;
> +
> + if (copy_from_sockptr(&val, optval, sizeof(val)))
> + return -EFAULT;
Ideally don't read this again.
If you do, then move it in the else clause.
> +
> + if (optlen == sizeof(*timestamping)) {
> + if (copy_from_sockptr(timestamping, optval,
> + sizeof(*timestamping))) {
> + return -EFAULT;
> + }
> + } else {
> + memset(timestamping, 0, sizeof(*timestamping));
> + timestamping->flags = val;
> + }
> +
> + return 0;
> +}
> +
> int sock_set_timestamping(struct sock *sk, int optname,
> struct so_timestamping timestamping)
> {
> @@ -1402,17 +1423,9 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
>
> case SO_TIMESTAMPING_NEW:
> case SO_TIMESTAMPING_OLD:
> - if (optlen == sizeof(timestamping)) {
> - if (copy_from_sockptr(&timestamping, optval,
> - sizeof(timestamping))) {
> - ret = -EFAULT;
> - break;
> - }
> - } else {
> - memset(&timestamping, 0, sizeof(timestamping));
> - timestamping.flags = val;
> - }
> - ret = sock_set_timestamping(sk, optname, timestamping);
> + ret = sock_get_timestamping(&timestamping, optval, optlen);
> + if (!ret)
> + ret = sock_set_timestamping(sk, optname, timestamping);
> break;
>
> case SO_RCVLOWAT:
> --
> 2.37.3
>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-15 1:34 ` Willem de Bruijn
@ 2024-10-15 2:05 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-15 2:05 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 9:34 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > For now, we support bpf_setsockopt only TX timestamps flags. Users
> > can use something like this in bpf program to turn on the feature:
> >
> > flags = SOF_TIMESTAMPING_TX_SCHED;
> > bpf_setsockopt(skops, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
> >
> > Later, I will support each Tx flags one by one based on this.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > include/net/sock.h | 2 ++
> > net/core/filter.c | 27 +++++++++++++++++++++++++++
> > net/core/sock.c | 35 ++++++++++++++++++++++++-----------
> > 3 files changed, 53 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 8cf278c957b3..66ecd78f1dfe 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -2890,6 +2890,8 @@ void sock_def_readable(struct sock *sk);
> >
> > int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
> > void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
> > +int sock_get_timestamping(struct so_timestamping *timestamping,
> > + sockptr_t optval, unsigned int optlen);
> > int sock_set_timestamping(struct sock *sk, int optname,
> > struct so_timestamping timestamping);
> >
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index bd0d08bf76bb..996426095bd9 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -5204,10 +5204,30 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> > .arg1_type = ARG_PTR_TO_CTX,
> > };
> >
> > +static int bpf_sock_set_timestamping(struct sock *sk,
> > + struct so_timestamping *timestamping)
> > +{
> > + u32 flags = timestamping->flags;
> > +
> > + if (flags & ~SOF_TIMESTAMPING_MASK)
> > + return -EINVAL;
> > +
> > + if (!(flags & (SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
> > + SOF_TIMESTAMPING_TX_ACK)))
> > + return -EINVAL;
> > +
> > + WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> > +
> > + return 0;
> > +}
> > +
> > static int sol_socket_sockopt(struct sock *sk, int optname,
> > char *optval, int *optlen,
> > bool getopt)
> > {
> > + struct so_timestamping ts;
> > + int ret = 0;
> > +
> > switch (optname) {
> > case SO_REUSEADDR:
> > case SO_SNDBUF:
> > @@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> > break;
> > case SO_BINDTODEVICE:
> > break;
> > + case SO_TIMESTAMPING_NEW:
> > + case SO_TIMESTAMPING_OLD:
> > + ret = sock_get_timestamping(&ts, KERNEL_SOCKPTR(optval),
> > + *optlen);
> > + if (!ret)
> > + ret = bpf_sock_set_timestamping(sk, &ts);
> > + return ret;
> > default:
> > return -EINVAL;
> > }
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 52c8c5a5ba27..a6e0d51a5f72 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -894,6 +894,27 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
> > return 0;
> > }
> >
> > +int sock_get_timestamping(struct so_timestamping *timestamping,
> > + sockptr_t optval, unsigned int optlen)
> > +{
> > + int val;
> > +
> > + if (copy_from_sockptr(&val, optval, sizeof(val)))
> > + return -EFAULT;
>
> Ideally don't read this again.
>
> If you do, then move it in the else clause.
Thanks, I will do that.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-12 4:06 ` [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt Jason Xing
2024-10-15 1:34 ` Willem de Bruijn
@ 2024-10-15 21:32 ` Martin KaFai Lau
2024-10-15 21:55 ` Willem de Bruijn
2024-10-16 0:45 ` Jason Xing
2024-10-15 23:54 ` Martin KaFai Lau
2 siblings, 2 replies; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-15 21:32 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 10/11/24 9:06 PM, Jason Xing wrote:
> static int sol_socket_sockopt(struct sock *sk, int optname,
> char *optval, int *optlen,
> bool getopt)
> {
> + struct so_timestamping ts;
> + int ret = 0;
> +
> switch (optname) {
> case SO_REUSEADDR:
> case SO_SNDBUF:
> @@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> break;
> case SO_BINDTODEVICE:
> break;
> + case SO_TIMESTAMPING_NEW:
> + case SO_TIMESTAMPING_OLD:
How about remove the "_OLD" support ?
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-15 21:32 ` Martin KaFai Lau
@ 2024-10-15 21:55 ` Willem de Bruijn
2024-10-22 13:22 ` Jason Xing
2024-10-16 0:45 ` Jason Xing
1 sibling, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 21:55 UTC (permalink / raw)
To: Martin KaFai Lau, Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
Martin KaFai Lau wrote:
> On 10/11/24 9:06 PM, Jason Xing wrote:
> > static int sol_socket_sockopt(struct sock *sk, int optname,
> > char *optval, int *optlen,
> > bool getopt)
> > {
> > + struct so_timestamping ts;
> > + int ret = 0;
> > +
> > switch (optname) {
> > case SO_REUSEADDR:
> > case SO_SNDBUF:
> > @@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> > break;
> > case SO_BINDTODEVICE:
> > break;
> > + case SO_TIMESTAMPING_NEW:
> > + case SO_TIMESTAMPING_OLD:
>
> How about remove the "_OLD" support ?
+1 I forgot to mention that yesterday.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-15 21:55 ` Willem de Bruijn
@ 2024-10-22 13:22 ` Jason Xing
2024-10-23 0:06 ` Willem de Bruijn
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-22 13:22 UTC (permalink / raw)
To: Willem de Bruijn
Cc: Martin KaFai Lau, davem, edumazet, kuba, pabeni, dsahern, willemb,
ast, daniel, andrii, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Wed, Oct 16, 2024 at 5:56 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Martin KaFai Lau wrote:
> > On 10/11/24 9:06 PM, Jason Xing wrote:
> > > static int sol_socket_sockopt(struct sock *sk, int optname,
> > > char *optval, int *optlen,
> > > bool getopt)
> > > {
> > > + struct so_timestamping ts;
> > > + int ret = 0;
> > > +
> > > switch (optname) {
> > > case SO_REUSEADDR:
> > > case SO_SNDBUF:
> > > @@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> > > break;
> > > case SO_BINDTODEVICE:
> > > break;
> > > + case SO_TIMESTAMPING_NEW:
> > > + case SO_TIMESTAMPING_OLD:
> >
> > How about remove the "_OLD" support ?
>
> +1 I forgot to mention that yesterday.
Hello Willem, Martin,
I did a test on this and found that if we only use
SO_TIMESTAMPING_NEW, we will never enter the real set sk_tsflags_bpf
logic, unless there is "case SO_TIMESTAMPING_OLD".
And I checked SO_TIMESTAMPING in include/uapi/asm-generic/socket.h:
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
/* on 64-bit and x32, avoid the ?: operator */
...
#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD
...
#else
...
#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ?
SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
...
#endif
The SO_TIMESTAMPING is defined as SO_TIMESTAMPING_OLD. I wonder if I
missed something? Thanks in advance.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-22 13:22 ` Jason Xing
@ 2024-10-23 0:06 ` Willem de Bruijn
2024-10-23 3:49 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-23 0:06 UTC (permalink / raw)
To: Jason Xing, Willem de Bruijn
Cc: Martin KaFai Lau, davem, edumazet, kuba, pabeni, dsahern, willemb,
ast, daniel, andrii, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
Jason Xing wrote:
> On Wed, Oct 16, 2024 at 5:56 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Martin KaFai Lau wrote:
> > > On 10/11/24 9:06 PM, Jason Xing wrote:
> > > > static int sol_socket_sockopt(struct sock *sk, int optname,
> > > > char *optval, int *optlen,
> > > > bool getopt)
> > > > {
> > > > + struct so_timestamping ts;
> > > > + int ret = 0;
> > > > +
> > > > switch (optname) {
> > > > case SO_REUSEADDR:
> > > > case SO_SNDBUF:
> > > > @@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> > > > break;
> > > > case SO_BINDTODEVICE:
> > > > break;
> > > > + case SO_TIMESTAMPING_NEW:
> > > > + case SO_TIMESTAMPING_OLD:
> > >
> > > How about remove the "_OLD" support ?
> >
> > +1 I forgot to mention that yesterday.
>
> Hello Willem, Martin,
>
> I did a test on this and found that if we only use
> SO_TIMESTAMPING_NEW, we will never enter the real set sk_tsflags_bpf
> logic, unless there is "case SO_TIMESTAMPING_OLD".
>
> And I checked SO_TIMESTAMPING in include/uapi/asm-generic/socket.h:
> #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
> /* on 64-bit and x32, avoid the ?: operator */
> ...
> #define SO_TIMESTAMPING SO_TIMESTAMPING_OLD
> ...
> #else
> ...
> #define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ?
> SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
> ...
> #endif
>
> The SO_TIMESTAMPING is defined as SO_TIMESTAMPING_OLD. I wonder if I
> missed something? Thanks in advance.
The _NEW vs _OLD variants exist to deal with y2038 issues on 32-bit platforms.
For new APIs, like BPF timestamping, we should always use the safe
structs, such as timespec64.
Then we can just use SO_TIMESTAMPING without the NEW or OLD suffix.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-23 0:06 ` Willem de Bruijn
@ 2024-10-23 3:49 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-23 3:49 UTC (permalink / raw)
To: Willem de Bruijn
Cc: Martin KaFai Lau, davem, edumazet, kuba, pabeni, dsahern, willemb,
ast, daniel, andrii, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Wed, Oct 23, 2024 at 8:06 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Wed, Oct 16, 2024 at 5:56 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Martin KaFai Lau wrote:
> > > > On 10/11/24 9:06 PM, Jason Xing wrote:
> > > > > static int sol_socket_sockopt(struct sock *sk, int optname,
> > > > > char *optval, int *optlen,
> > > > > bool getopt)
> > > > > {
> > > > > + struct so_timestamping ts;
> > > > > + int ret = 0;
> > > > > +
> > > > > switch (optname) {
> > > > > case SO_REUSEADDR:
> > > > > case SO_SNDBUF:
> > > > > @@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> > > > > break;
> > > > > case SO_BINDTODEVICE:
> > > > > break;
> > > > > + case SO_TIMESTAMPING_NEW:
> > > > > + case SO_TIMESTAMPING_OLD:
> > > >
> > > > How about remove the "_OLD" support ?
> > >
> > > +1 I forgot to mention that yesterday.
> >
> > Hello Willem, Martin,
> >
> > I did a test on this and found that if we only use
> > SO_TIMESTAMPING_NEW, we will never enter the real set sk_tsflags_bpf
> > logic, unless there is "case SO_TIMESTAMPING_OLD".
> >
> > And I checked SO_TIMESTAMPING in include/uapi/asm-generic/socket.h:
> > #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
> > /* on 64-bit and x32, avoid the ?: operator */
> > ...
> > #define SO_TIMESTAMPING SO_TIMESTAMPING_OLD
> > ...
> > #else
> > ...
> > #define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ?
> > SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
> > ...
> > #endif
> >
> > The SO_TIMESTAMPING is defined as SO_TIMESTAMPING_OLD. I wonder if I
> > missed something? Thanks in advance.
>
> The _NEW vs _OLD aim to deal with y2038 issues on 32-bit platforms.
>
> For new APIs, like BPF timestamping, we should always use the safe
> structs, such as timespec64.
Thanks, I learned a lot.
>
> Then we can just use SO_TIMESTAMPING without the NEW or OLD suffix.
The weird thing is that SO_TIMESTAMPING is converted to
SO_TIMESTAMPING_OLD in the kernel if I use this:
bpf_setsockopt(skops, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
As I mentioned before, SO_TIMESTAMPING exists in
include/uapi/asm-generic/socket.h:
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
/* on 64-bit and x32, avoid the ?: operator */
...
#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD
...
#else
...
#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ?
SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
...
#endif
So I wonder if there is something unexpected?
BTW, I conducted the test on a VM with x86_64 cpu.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-15 21:32 ` Martin KaFai Lau
2024-10-15 21:55 ` Willem de Bruijn
@ 2024-10-16 0:45 ` Jason Xing
1 sibling, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-16 0:45 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Oct 16, 2024 at 5:33 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/11/24 9:06 PM, Jason Xing wrote:
> > static int sol_socket_sockopt(struct sock *sk, int optname,
> > char *optval, int *optlen,
> > bool getopt)
> > {
> > + struct so_timestamping ts;
> > + int ret = 0;
> > +
> > switch (optname) {
> > case SO_REUSEADDR:
> > case SO_SNDBUF:
> > @@ -5225,6 +5245,13 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> > break;
> > case SO_BINDTODEVICE:
> > break;
> > + case SO_TIMESTAMPING_NEW:
> > + case SO_TIMESTAMPING_OLD:
>
> How about remove the "_OLD" support ?
Will do that. Thanks!
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-12 4:06 ` [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt Jason Xing
2024-10-15 1:34 ` Willem de Bruijn
2024-10-15 21:32 ` Martin KaFai Lau
@ 2024-10-15 23:54 ` Martin KaFai Lau
2024-10-16 0:49 ` Jason Xing
2 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-15 23:54 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 10/11/24 9:06 PM, Jason Xing wrote:
> +static int bpf_sock_set_timestamping(struct sock *sk,
> + struct so_timestamping *timestamping)
> +{
> + u32 flags = timestamping->flags;
> +
> + if (flags & ~SOF_TIMESTAMPING_MASK)
> + return -EINVAL;
> +
> + if (!(flags & (SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
> + SOF_TIMESTAMPING_TX_ACK)))
Hmm... Does this mean that at least one of these bits must be set, and that
the flags cannot be completely cleared once they have been set?
> + return -EINVAL;
> +
> + WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> +
> + return 0;
> +}
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt
2024-10-15 23:54 ` Martin KaFai Lau
@ 2024-10-16 0:49 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-16 0:49 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Oct 16, 2024 at 7:54 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/11/24 9:06 PM, Jason Xing wrote:
> > +static int bpf_sock_set_timestamping(struct sock *sk,
> > + struct so_timestamping *timestamping)
> > +{
> > + u32 flags = timestamping->flags;
> > +
> > + if (flags & ~SOF_TIMESTAMPING_MASK)
> > + return -EINVAL;
> > +
> > + if (!(flags & (SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
> > + SOF_TIMESTAMPING_TX_ACK)))
>
> hmm... Does it mean at least one of the bit must be set and cannot be completely
> cleared once it has been set before?
Yes, because the current BPF extension feature doesn't support all of
the original SO_TIMESTAMPING flags (SOF_TIMESTAMPING_*). As for
clearing flags, I could not find a proper point at which to clear
them. That's the reason why I haven't implemented it.
^ permalink raw reply [flat|nested] 73+ messages in thread
* [PATCH net-next v2 03/12] net-timestamp: reorganize in skb_tstamp_tx_output()
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
2024-10-12 4:06 ` [PATCH net-next v2 01/12] net-timestamp: introduce socket tsflag requestors Jason Xing
2024-10-12 4:06 ` [PATCH net-next v2 02/12] net-timestamp: open gate for bpf_setsockopt Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-12 4:06 ` [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension Jason Xing
` (9 subsequent siblings)
12 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
This is preparation for the bpf print function added later. This patch
only moves the original generating logic into one function. No functional changes.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
My thought is that keeping each patch small makes it easier to review.
---
net/core/skbuff.c | 22 +++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ab0a59f1e14d..f36eb9daa31a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5540,18 +5540,15 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
-void __skb_tstamp_tx(struct sk_buff *orig_skb,
- const struct sk_buff *ack_skb,
- struct skb_shared_hwtstamps *hwtstamps,
- struct sock *sk, int tstype)
+static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
{
struct sk_buff *skb;
bool tsonly, opt_stats = false;
u32 tsflags;
- if (!sk)
- return;
-
tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
@@ -5595,6 +5592,17 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
+
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
+{
+ if (!sk)
+ return;
+
+ skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
+}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (2 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 03/12] net-timestamp: reorganize in skb_tstamp_tx_output() Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-15 1:36 ` Willem de Bruijn
` (2 more replies)
2024-10-12 4:06 ` [PATCH net-next v2 05/12] net-timestamp: add bpf infrastructure to allow exposing timestamp later Jason Xing
` (8 subsequent siblings)
12 siblings, 3 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Willem suggested that we use a static key to control. The advantage
is that we will not affect the existing applications at all if we
don't load BPF program.
In this patch, besides the static key, I also add logic that tests
whether the socket has enabled the relevant tsflags, so that the bpf
path and the application path can both be active at the same time.
Otherwise, an skb carrying the related timestamp flag cannot tell which
way of printing is desired.
One important point is that this patch allows printing from both
applications and the bpf program at the same time. Now we have three kinds of print:
1) only BPF program prints
2) only application program prints
3) both can print without side effect
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/sock.h | 1 +
net/core/filter.c | 3 +++
net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
3 files changed, 42 insertions(+)
diff --git a/include/net/sock.h b/include/net/sock.h
index 66ecd78f1dfe..b7c51b95c92d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
void sock_def_readable(struct sock *sk);
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
+DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
int sock_get_timestamping(struct so_timestamping *timestamping,
sockptr_t optval, unsigned int optlen);
diff --git a/net/core/filter.c b/net/core/filter.c
index 996426095bd9..08135f538c99 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
+
static int bpf_sock_set_timestamping(struct sock *sk,
struct so_timestamping *timestamping)
{
@@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
return -EINVAL;
WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
+ static_branch_enable(&bpf_tstamp_control);
return 0;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f36eb9daa31a..d0f912f1ff7b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
+{
+ u32 testflag;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ testflag = SOF_TIMESTAMPING_TX_SCHED;
+ break;
+ case SCM_TSTAMP_SND:
+ testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
+ break;
+ case SCM_TSTAMP_ACK:
+ testflag = SOF_TIMESTAMPING_TX_ACK;
+ break;
+ default:
+ return false;
+ }
+ if (tsflags & testflag)
+ return true;
+
+ return false;
+}
+
static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
@@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
if (!skb_may_tx_timestamp(sk, tsonly))
return;
+ if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
+ return;
+
if (tsonly) {
#ifdef CONFIG_INET
if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
@@ -5593,6 +5619,15 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
+static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
+{
+ u32 tsflags;
+
+ tsflags = READ_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
+ if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
+ return;
+}
+
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
@@ -5601,6 +5636,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (static_branch_unlikely(&bpf_tstamp_control))
+ bpf_skb_tstamp_tx_output(sk, tstype);
+
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-12 4:06 ` [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension Jason Xing
@ 2024-10-15 1:36 ` Willem de Bruijn
2024-10-15 2:25 ` Jason Xing
2024-10-16 0:09 ` Martin KaFai Lau
2024-10-20 21:51 ` Willem de Bruijn
2 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 1:36 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Willem suggested that we use a static key to control. The advantage
> is that we will not affect the existing applications at all if we
> don't load BPF program.
>
> In this patch, except the static key, I also add one logic that is
> used to test if the socket has enabled its tsflags in order to
> support bpf logic to allow both cases to happen at the same time.
These two features are unrelated, should probably be separate patches.
> Or else, the skb carring related timestamp flag doesn't know which
> way of printing is desirable.
>
> One thing important is this patch allows print from both applications
> and bpf program at the same time. Now we have three kinds of print:
> 1) only BPF program prints
> 2) only application program prints
> 3) both can print without side effect
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-15 1:36 ` Willem de Bruijn
@ 2024-10-15 2:25 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-15 2:25 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 9:36 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Willem suggested that we use a static key to control. The advantage
> > is that we will not affect the existing applications at all if we
> > don't load BPF program.
> >
> > In this patch, except the static key, I also add one logic that is
> > used to test if the socket has enabled its tsflags in order to
> > support bpf logic to allow both cases to happen at the same time.
>
> These two features are unrelated, should probably be separate patches.
Will do it, thanks.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-12 4:06 ` [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension Jason Xing
2024-10-15 1:36 ` Willem de Bruijn
@ 2024-10-16 0:09 ` Martin KaFai Lau
2024-10-16 1:04 ` Jason Xing
2024-10-20 21:51 ` Willem de Bruijn
2 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-16 0:09 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 10/11/24 9:06 PM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Willem suggested that we use a static key to control. The advantage
> is that we will not affect the existing applications at all if we
> don't load BPF program.
>
> In this patch, except the static key, I also add one logic that is
> used to test if the socket has enabled its tsflags in order to
> support bpf logic to allow both cases to happen at the same time.
> Or else, the skb carring related timestamp flag doesn't know which
> way of printing is desirable.
>
> One thing important is this patch allows print from both applications
> and bpf program at the same time. Now we have three kinds of print:
> 1) only BPF program prints
> 2) only application program prints
> 3) both can print without side effect
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> include/net/sock.h | 1 +
> net/core/filter.c | 3 +++
> net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
> 3 files changed, 42 insertions(+)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 66ecd78f1dfe..b7c51b95c92d 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
> void sock_def_readable(struct sock *sk);
>
> int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
> +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
> void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
> int sock_get_timestamping(struct so_timestamping *timestamping,
> sockptr_t optval, unsigned int optlen);
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 996426095bd9..08135f538c99 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> .arg1_type = ARG_PTR_TO_CTX,
> };
>
> +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
> +
> static int bpf_sock_set_timestamping(struct sock *sk,
> struct so_timestamping *timestamping)
> {
> @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
> return -EINVAL;
>
> WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> + static_branch_enable(&bpf_tstamp_control);
Not sure when would be a good time to do static_branch_disable().
The bpf prog may also be detached. (IF) it ends up staying with the
cgroup/sockops interface, it should depend on the existing static key in
cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
>
> return 0;
> }
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f36eb9daa31a..d0f912f1ff7b 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> }
> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
>
> +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
sk is unused.
> +{
> + u32 testflag;
> +
> + switch (tstype) {
> + case SCM_TSTAMP_SCHED:
Instead of doing this translation,
would it be easier to directly store the ts "type" desired by the bpf prog
(i.e. the SCM_TSTAMP_*) in sk->sk_tsflags_bpf?
Or is there a specific need to keep the SOF_TIMESTAMPING_* value in
sk->sk_tsflags_bpf?
> + testflag = SOF_TIMESTAMPING_TX_SCHED;
> + break;
> + case SCM_TSTAMP_SND:
> + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> + break;
> + case SCM_TSTAMP_ACK:
> + testflag = SOF_TIMESTAMPING_TX_ACK;
> + break;
> + default:
> + return false;
> + }
> + if (tsflags & testflag)
> + return true;
> +
> + return false;
> +}
> +
> static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> const struct sk_buff *ack_skb,
> struct skb_shared_hwtstamps *hwtstamps,
> @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> if (!skb_may_tx_timestamp(sk, tsonly))
> return;
>
> + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
My understanding is that sendmsg can provide SOF_TIMESTAMPING_* for an individual
skb. Would that break? Is this similar to the skb tx_flags case that Willem
mentioned in the patch 0 thread?
> + return;
> +
> if (tsonly) {
> #ifdef CONFIG_INET
> if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
> @@ -5593,6 +5619,15 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
> }
>
> +static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
> +{
> + u32 tsflags;
> +
> + tsflags = READ_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
> + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> + return;
> +}
> +
> void __skb_tstamp_tx(struct sk_buff *orig_skb,
> const struct sk_buff *ack_skb,
> struct skb_shared_hwtstamps *hwtstamps,
> @@ -5601,6 +5636,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
> if (!sk)
> return;
>
> + if (static_branch_unlikely(&bpf_tstamp_control))
> + bpf_skb_tstamp_tx_output(sk, tstype);
> +
> skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
> }
> EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 0:09 ` Martin KaFai Lau
@ 2024-10-16 1:04 ` Jason Xing
2024-10-16 1:32 ` Jason Xing
2024-10-16 6:31 ` Martin KaFai Lau
0 siblings, 2 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-16 1:04 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Oct 16, 2024 at 8:10 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/11/24 9:06 PM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Willem suggested that we use a static key to control. The advantage
> > is that we will not affect the existing applications at all if we
> > don't load BPF program.
> >
> > In this patch, except the static key, I also add one logic that is
> > used to test if the socket has enabled its tsflags in order to
> > support bpf logic to allow both cases to happen at the same time.
> > Or else, the skb carring related timestamp flag doesn't know which
> > way of printing is desirable.
> >
> > One thing important is this patch allows print from both applications
> > and bpf program at the same time. Now we have three kinds of print:
> > 1) only BPF program prints
> > 2) only application program prints
> > 3) both can print without side effect
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > include/net/sock.h | 1 +
> > net/core/filter.c | 3 +++
> > net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
> > 3 files changed, 42 insertions(+)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 66ecd78f1dfe..b7c51b95c92d 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
> > void sock_def_readable(struct sock *sk);
> >
> > int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
> > +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
> > void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
> > int sock_get_timestamping(struct so_timestamping *timestamping,
> > sockptr_t optval, unsigned int optlen);
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 996426095bd9..08135f538c99 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> > .arg1_type = ARG_PTR_TO_CTX,
> > };
> >
> > +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
> > +
> > static int bpf_sock_set_timestamping(struct sock *sk,
> > struct so_timestamping *timestamping)
> > {
> > @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
> > return -EINVAL;
> >
> > WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> > + static_branch_enable(&bpf_tstamp_control);
>
> Not sure when is a good time to do static_branch_disable().
Thanks for the review.
To be honest, I did consider how to disable the static key. Like you
said, I failed to find a good point at which it can be safely disabled.
>
> The bpf prog may be detached also. (IF) it ends up staying with the
> cgroup/sockops interface, it should depend on the existing static key in
> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
Are you suggesting that we need to remove the current static key? In
the previous thread, the reason why Willem came up with this idea was,
I think, to avoid affecting the non-bpf timestamping feature.
>
> >
> > return 0;
> > }
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index f36eb9daa31a..d0f912f1ff7b 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> > }
> > EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> >
> > +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
>
> sk is unused.
Thanks for the careful check.
>
> > +{
> > + u32 testflag;
> > +
> > + switch (tstype) {
> > + case SCM_TSTAMP_SCHED:
>
> Instead of doing this translation,
> is it easier to directly store the bpf prog desired ts"type" (i.e. the
> SCM_TSTAMP_*) in the sk->sk_tsflags_bpf?
> or there is a specific need to keep the SOF_TIMESTAMPING_* value in
> sk->sk_tsflags_bpf?
We have to reuse SOF_TIMESTAMPING_* because there are more flags, say,
SOF_TIMESTAMPING_OPT_ID, that we need to support.
>
> > + testflag = SOF_TIMESTAMPING_TX_SCHED;
> > + break;
> > + case SCM_TSTAMP_SND:
> > + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> > + break;
> > + case SCM_TSTAMP_ACK:
> > + testflag = SOF_TIMESTAMPING_TX_ACK;
> > + break;
> > + default:
> > + return false;
> > + }
> > + if (tsflags & testflag)
> > + return true;
> > +
> > + return false;
> > +}
> > +
> > static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > const struct sk_buff *ack_skb,
> > struct skb_shared_hwtstamps *hwtstamps,
> > @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > if (!skb_may_tx_timestamp(sk, tsonly))
> > return;
> >
> > + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
>
> This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
This test is used in both the bpf and non-bpf cases, which is what lets
us support the BPF extension. In this function, if the skb has tsflags
but we don't know which approach the user expects, sk_tstamp_tx_flags()
tells us.
>
> My understanding is the sendmsg can provide SOF_TIMESTAMPING_* for individual
> skb. Would it break?
Oh, you're right. I didn't support cmsg mode...
> Is it the similar case on the skb tx_flags that Willem has
> mentioned in the patch 0's thread?
Yes, but am I supposed to add a new bpf tx_flags field in struct sk_buff?
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 1:04 ` Jason Xing
@ 2024-10-16 1:32 ` Jason Xing
2024-10-16 6:13 ` Martin KaFai Lau
2024-10-16 6:31 ` Martin KaFai Lau
1 sibling, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-16 1:32 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Oct 16, 2024 at 9:04 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Wed, Oct 16, 2024 at 8:10 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 10/11/24 9:06 PM, Jason Xing wrote:
> > > From: Jason Xing <kernelxing@tencent.com>
> > >
> > > Willem suggested that we use a static key to control. The advantage
> > > is that we will not affect the existing applications at all if we
> > > don't load BPF program.
> > >
> > > In this patch, except the static key, I also add one logic that is
> > > used to test if the socket has enabled its tsflags in order to
> > > support bpf logic to allow both cases to happen at the same time.
> > > Or else, the skb carring related timestamp flag doesn't know which
> > > way of printing is desirable.
> > >
> > > One thing important is this patch allows print from both applications
> > > and bpf program at the same time. Now we have three kinds of print:
> > > 1) only BPF program prints
> > > 2) only application program prints
> > > 3) both can print without side effect
> > >
> > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > ---
> > > include/net/sock.h | 1 +
> > > net/core/filter.c | 3 +++
> > > net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
> > > 3 files changed, 42 insertions(+)
> > >
> > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > index 66ecd78f1dfe..b7c51b95c92d 100644
> > > --- a/include/net/sock.h
> > > +++ b/include/net/sock.h
> > > @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
> > > void sock_def_readable(struct sock *sk);
> > >
> > > int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
> > > +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
> > > void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
> > > int sock_get_timestamping(struct so_timestamping *timestamping,
> > > sockptr_t optval, unsigned int optlen);
> > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > index 996426095bd9..08135f538c99 100644
> > > --- a/net/core/filter.c
> > > +++ b/net/core/filter.c
> > > @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> > > .arg1_type = ARG_PTR_TO_CTX,
> > > };
> > >
> > > +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
> > > +
> > > static int bpf_sock_set_timestamping(struct sock *sk,
> > > struct so_timestamping *timestamping)
> > > {
> > > @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
> > > return -EINVAL;
> > >
> > > WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> > > + static_branch_enable(&bpf_tstamp_control);
> >
> > Not sure when is a good time to do static_branch_disable().
>
> Thanks for the review.
>
> To be honest, I considered how to disable the static key. Like you
> said, I failed to find a good chance that I can accurately disable it.
>
> >
> > The bpf prog may be detached also. (IF) it ends up staying with the
> > cgroup/sockops interface, it should depend on the existing static key in
> > cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
>
> Are you suggesting that we need to remove the current static key? In
> the previous thread, the reason why Willem came up with this idea is,
> I think, to avoid affect the non-bpf timestamping feature.
>
> >
> > >
> > > return 0;
> > > }
> > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > index f36eb9daa31a..d0f912f1ff7b 100644
> > > --- a/net/core/skbuff.c
> > > +++ b/net/core/skbuff.c
> > > @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> > > }
> > > EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> > >
> > > +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> >
> > sk is unused.
>
> Thanks for the careful check.
>
> >
> > > +{
> > > + u32 testflag;
> > > +
> > > + switch (tstype) {
> > > + case SCM_TSTAMP_SCHED:
> >
> > Instead of doing this translation,
> > is it easier to directly store the bpf prog desired ts"type" (i.e. the
> > SCM_TSTAMP_*) in the sk->sk_tsflags_bpf?
> > or there is a specific need to keep the SOF_TIMESTAMPING_* value in
> > sk->sk_tsflags_bpf?
>
> We have to reuse SOF_TIMESTAMPING_* because there are more flags, say,
> SOF_TIMESTAMPING_OPT_ID, that we need to support.
>
> >
> > > + testflag = SOF_TIMESTAMPING_TX_SCHED;
> > > + break;
> > > + case SCM_TSTAMP_SND:
> > > + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> > > + break;
> > > + case SCM_TSTAMP_ACK:
> > > + testflag = SOF_TIMESTAMPING_TX_ACK;
> > > + break;
> > > + default:
> > > + return false;
> > > + }
> > > + if (tsflags & testflag)
> > > + return true;
> > > +
> > > + return false;
> > > +}
> > > +
> > > static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > const struct sk_buff *ack_skb,
> > > struct skb_shared_hwtstamps *hwtstamps,
> > > @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > if (!skb_may_tx_timestamp(sk, tsonly))
> > > return;
> > >
> > > + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> >
> > This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
>
> This test will be used in bpf and non-bpf cases. Because of this, we
> can support BPF extension. In this function, if skb has tsflags but we
> don't know which approach the user expects, sk_tstamp_tx_flags() can
> help us.
>
> >
> > My understanding is the sendmsg can provide SOF_TIMESTAMPING_* for individual
> > skb. Would it break?
>
> Oh, you're right. I didn't support cmsg mode...
I think I only need to apply this test when it's in the bpf mode; otherwise,
the original path prints the timestamp as before, which can solve the issue.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 1:32 ` Jason Xing
@ 2024-10-16 6:13 ` Martin KaFai Lau
2024-10-16 6:30 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-16 6:13 UTC (permalink / raw)
To: Jason Xing
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On 10/15/24 6:32 PM, Jason Xing wrote:
> On Wed, Oct 16, 2024 at 9:04 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>>
>> On Wed, Oct 16, 2024 at 8:10 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>
>>> On 10/11/24 9:06 PM, Jason Xing wrote:
>>>> From: Jason Xing <kernelxing@tencent.com>
>>>>
>>>> Willem suggested that we use a static key to control. The advantage
>>>> is that we will not affect the existing applications at all if we
>>>> don't load BPF program.
>>>>
>>>> In this patch, except the static key, I also add one logic that is
>>>> used to test if the socket has enabled its tsflags in order to
>>>> support bpf logic to allow both cases to happen at the same time.
>>>> Or else, the skb carring related timestamp flag doesn't know which
>>>> way of printing is desirable.
>>>>
>>>> One thing important is this patch allows print from both applications
>>>> and bpf program at the same time. Now we have three kinds of print:
>>>> 1) only BPF program prints
>>>> 2) only application program prints
>>>> 3) both can print without side effect
>>>>
>>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
>>>> ---
>>>> include/net/sock.h | 1 +
>>>> net/core/filter.c | 3 +++
>>>> net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
>>>> 3 files changed, 42 insertions(+)
>>>>
>>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>>> index 66ecd78f1dfe..b7c51b95c92d 100644
>>>> --- a/include/net/sock.h
>>>> +++ b/include/net/sock.h
>>>> @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
>>>> void sock_def_readable(struct sock *sk);
>>>>
>>>> int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
>>>> +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
>>>> void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
>>>> int sock_get_timestamping(struct so_timestamping *timestamping,
>>>> sockptr_t optval, unsigned int optlen);
>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>> index 996426095bd9..08135f538c99 100644
>>>> --- a/net/core/filter.c
>>>> +++ b/net/core/filter.c
>>>> @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
>>>> .arg1_type = ARG_PTR_TO_CTX,
>>>> };
>>>>
>>>> +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
>>>> +
>>>> static int bpf_sock_set_timestamping(struct sock *sk,
>>>> struct so_timestamping *timestamping)
>>>> {
>>>> @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
>>>> return -EINVAL;
>>>>
>>>> WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
>>>> + static_branch_enable(&bpf_tstamp_control);
>>>
>>> Not sure when is a good time to do static_branch_disable().
>>
>> Thanks for the review.
>>
>> To be honest, I considered how to disable the static key. Like you
>> said, I failed to find a good chance that I can accurately disable it.
>>
>>>
>>> The bpf prog may be detached also. (IF) it ends up staying with the
>>> cgroup/sockops interface, it should depend on the existing static key in
>>> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
>>
>> Are you suggesting that we need to remove the current static key? In
>> the previous thread, the reason why Willem came up with this idea is,
>> I think, to avoid affect the non-bpf timestamping feature.
>>
>>>
>>>>
>>>> return 0;
>>>> }
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index f36eb9daa31a..d0f912f1ff7b 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
>>>> }
>>>> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
>>>>
>>>> +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
>>>
>>> sk is unused.
>>
>> Thanks for the careful check.
>>
>>>
>>>> +{
>>>> + u32 testflag;
>>>> +
>>>> + switch (tstype) {
>>>> + case SCM_TSTAMP_SCHED:
>>>
>>> Instead of doing this translation,
>>> is it easier to directly store the bpf prog desired ts"type" (i.e. the
>>> SCM_TSTAMP_*) in the sk->sk_tsflags_bpf?
>>> or there is a specific need to keep the SOF_TIMESTAMPING_* value in
>>> sk->sk_tsflags_bpf?
>>
>> We have to reuse SOF_TIMESTAMPING_* because there are more flags, say,
>> SOF_TIMESTAMPING_OPT_ID, that we need to support.
>>
>>>
>>>> + testflag = SOF_TIMESTAMPING_TX_SCHED;
>>>> + break;
>>>> + case SCM_TSTAMP_SND:
>>>> + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
>>>> + break;
>>>> + case SCM_TSTAMP_ACK:
>>>> + testflag = SOF_TIMESTAMPING_TX_ACK;
>>>> + break;
>>>> + default:
>>>> + return false;
>>>> + }
>>>> + if (tsflags & testflag)
>>>> + return true;
>>>> +
>>>> + return false;
>>>> +}
>>>> +
>>>> static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>>>> const struct sk_buff *ack_skb,
>>>> struct skb_shared_hwtstamps *hwtstamps,
>>>> @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>>>> if (!skb_may_tx_timestamp(sk, tsonly))
>>>> return;
>>>>
>>>> + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
>>>
>>> This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
>>
>> This test will be used in bpf and non-bpf cases. Because of this, we
>> can support BPF extension. In this function, if skb has tsflags but we
>> don't know which approach the user expects, sk_tstamp_tx_flags() can
>> help us.
>>
>>>
>>> My understanding is the sendmsg can provide SOF_TIMESTAMPING_* for individual
>>> skb. Would it break?
>>
>> Oh, you're right. I didn't support cmsg mode...
>
> I think I only need to test if it's in the bpf mode, or else let the
> original way print the timestamp, which can solve the issue.
From looking at the existing "__skb_tstamp_tx(skb, NULL, NULL, skb->sk,
SCM_TSTAMP_SCHED);":
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
/* ... */
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* ... */
}
I am still puzzled about how __skb_tstamp_tx() will be called if only bpf has
enabled the timestamping. I may have missed the place in the patch set where the
skb's tx_flags is changed by sk->sk_tsflags_bpf alone?
I think a skb tskey is still desired (?), so eventually we want some space in
the skb for bpf. Jakub Sitnicki (cc-ed) has presented in LPC about extending
skb->data_meta usage outside of xdp and tc. I think here we want to have it
available at the tx side to store the tx_flags and tskey but probably want them
at a specific place/offset at the data_meta.
For now, is there anything we can explore sharing in the skb_shared_info? Can the
"struct skb_shared_hwtstamps hwtstamps;" be used for the bpf tx_flags and tskey
only at the "tx" side? There is already another union member. The hwtstamps
should only be needed when the NIC is done sending?
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 6:13 ` Martin KaFai Lau
@ 2024-10-16 6:30 ` Jason Xing
2024-10-16 7:01 ` Martin KaFai Lau
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-16 6:30 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On Wed, Oct 16, 2024 at 2:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/15/24 6:32 PM, Jason Xing wrote:
> > On Wed, Oct 16, 2024 at 9:04 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >>
> >> On Wed, Oct 16, 2024 at 8:10 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>
> >>> On 10/11/24 9:06 PM, Jason Xing wrote:
> >>>> From: Jason Xing <kernelxing@tencent.com>
> >>>>
> >>>> Willem suggested that we use a static key to control. The advantage
> >>>> is that we will not affect the existing applications at all if we
> >>>> don't load BPF program.
> >>>>
> >>>> In this patch, except the static key, I also add one logic that is
> >>>> used to test if the socket has enabled its tsflags in order to
> >>>> support bpf logic to allow both cases to happen at the same time.
> >>>> Or else, the skb carring related timestamp flag doesn't know which
> >>>> way of printing is desirable.
> >>>>
> >>>> One thing important is this patch allows print from both applications
> >>>> and bpf program at the same time. Now we have three kinds of print:
> >>>> 1) only BPF program prints
> >>>> 2) only application program prints
> >>>> 3) both can print without side effect
> >>>>
> >>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >>>> ---
> >>>> include/net/sock.h | 1 +
> >>>> net/core/filter.c | 3 +++
> >>>> net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
> >>>> 3 files changed, 42 insertions(+)
> >>>>
> >>>> diff --git a/include/net/sock.h b/include/net/sock.h
> >>>> index 66ecd78f1dfe..b7c51b95c92d 100644
> >>>> --- a/include/net/sock.h
> >>>> +++ b/include/net/sock.h
> >>>> @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
> >>>> void sock_def_readable(struct sock *sk);
> >>>>
> >>>> int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
> >>>> +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
> >>>> void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
> >>>> int sock_get_timestamping(struct so_timestamping *timestamping,
> >>>> sockptr_t optval, unsigned int optlen);
> >>>> diff --git a/net/core/filter.c b/net/core/filter.c
> >>>> index 996426095bd9..08135f538c99 100644
> >>>> --- a/net/core/filter.c
> >>>> +++ b/net/core/filter.c
> >>>> @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> >>>> .arg1_type = ARG_PTR_TO_CTX,
> >>>> };
> >>>>
> >>>> +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
> >>>> +
> >>>> static int bpf_sock_set_timestamping(struct sock *sk,
> >>>> struct so_timestamping *timestamping)
> >>>> {
> >>>> @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
> >>>> return -EINVAL;
> >>>>
> >>>> WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> >>>> + static_branch_enable(&bpf_tstamp_control);
> >>>
> >>> Not sure when is a good time to do static_branch_disable().
> >>
> >> Thanks for the review.
> >>
> >> To be honest, I considered how to disable the static key. Like you
> >> said, I failed to find a good chance that I can accurately disable it.
> >>
> >>>
> >>> The bpf prog may be detached also. (IF) it ends up staying with the
> >>> cgroup/sockops interface, it should depend on the existing static key in
> >>> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
> >>
> >> Are you suggesting that we need to remove the current static key? In
> >> the previous thread, the reason why Willem came up with this idea is,
> >> I think, to avoid affect the non-bpf timestamping feature.
> >>
> >>>
> >>>>
> >>>> return 0;
> >>>> }
> >>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>> index f36eb9daa31a..d0f912f1ff7b 100644
> >>>> --- a/net/core/skbuff.c
> >>>> +++ b/net/core/skbuff.c
> >>>> @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> >>>> }
> >>>> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> >>>>
> >>>> +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> >>>
> >>> sk is unused.
> >>
> >> Thanks for the careful check.
> >>
> >>>
> >>>> +{
> >>>> + u32 testflag;
> >>>> +
> >>>> + switch (tstype) {
> >>>> + case SCM_TSTAMP_SCHED:
> >>>
> >>> Instead of doing this translation,
> >>> is it easier to directly store the bpf prog desired ts"type" (i.e. the
> >>> SCM_TSTAMP_*) in the sk->sk_tsflags_bpf?
> >>> or there is a specific need to keep the SOF_TIMESTAMPING_* value in
> >>> sk->sk_tsflags_bpf?
> >>
> >> We have to reuse SOF_TIMESTAMPING_* because there are more flags, say,
> >> SOF_TIMESTAMPING_OPT_ID, that we need to support.
> >>
> >>>
> >>>> + testflag = SOF_TIMESTAMPING_TX_SCHED;
> >>>> + break;
> >>>> + case SCM_TSTAMP_SND:
> >>>> + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> >>>> + break;
> >>>> + case SCM_TSTAMP_ACK:
> >>>> + testflag = SOF_TIMESTAMPING_TX_ACK;
> >>>> + break;
> >>>> + default:
> >>>> + return false;
> >>>> + }
> >>>> + if (tsflags & testflag)
> >>>> + return true;
> >>>> +
> >>>> + return false;
> >>>> +}
> >>>> +
> >>>> static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >>>> const struct sk_buff *ack_skb,
> >>>> struct skb_shared_hwtstamps *hwtstamps,
> >>>> @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >>>> if (!skb_may_tx_timestamp(sk, tsonly))
> >>>> return;
> >>>>
> >>>> + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> >>>
> >>> This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
> >>
> >> This test will be used in bpf and non-bpf cases. Because of this, we
> >> can support BPF extension. In this function, if skb has tsflags but we
> >> don't know which approach the user expects, sk_tstamp_tx_flags() can
> >> help us.
> >>
> >>>
> >>> My understanding is the sendmsg can provide SOF_TIMESTAMPING_* for individual
> >>> skb. Would it break?
> >>
> >> Oh, you're right. I didn't support cmsg mode...
> >
> > I think I only need to test if it's in the bpf mode, or else let the
> > original way print the timestamp, which can solve the issue.
>
> From looking at the existing "__skb_tstamp_tx(skb, NULL, NULL, skb->sk,
> SCM_TSTAMP_SCHED);":
>
> int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
> {
> /* ... */
>
> if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
> __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
>
> /* ... */
> }
>
> I am still puzzling how __skb_tstamp_tx() will be called if only bpf has enabled
> the timestamping. I may have missed somewhere in the patch set that the skb's
> tx_flags is changed by sk->sk_tsflags_bpf alone?
If sk_tsflags_bpf is set, tcp_sendmsg() -> tcp_tx_timestamp() will be
helpful, which initializes every last skb, please see patch [10/12].
>
> I think a skb tskey is still desired (?), so eventually we want some spaces in
The tskey function is optional, I think. It depends on whether users want to
use it or not. It can be controlled by the SOF_TIMESTAMPING_OPT_ID flag.
> the skb for bpf. Jakub Sitnicki (cc-ed) has presented in LPC about extending
> skb->data_meta usage outside of xdp and tc. I think here we want to have it
> available at the tx side to store the tx_flags and tskey but probably want them
> at a specific place/offset at the data_meta.
If we have the plan to store extra information in data_meta, I can
give it a try:)
>
> For now, is there thing we can explore to share in the skb_shared_info?
My initial thought is just to reuse these fields in skb. It can work
without interfering with one another.
> Can the "struct skb_shared_hwtstamps hwtstamps;" be used for the bpf tx_flags and tskey
> only at the "tx" side? There is already another union member.
tskey is always used in the tx path.
hwtstamps can be used in both rx and tx cases (please see
tcp_update_recv_tstamps() and skb_tstamp_tx()).
> The hwtstamps should only be needed when the NIC is done sending?
In this patch, yes, hwtstamps are the records in tx path.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 6:30 ` Jason Xing
@ 2024-10-16 7:01 ` Martin KaFai Lau
2024-10-16 7:54 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-16 7:01 UTC (permalink / raw)
To: Jason Xing
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On 10/15/24 11:30 PM, Jason Xing wrote:
> On Wed, Oct 16, 2024 at 2:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 10/15/24 6:32 PM, Jason Xing wrote:
>>> On Wed, Oct 16, 2024 at 9:04 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>>>>
>>>> On Wed, Oct 16, 2024 at 8:10 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>>>
>>>>> On 10/11/24 9:06 PM, Jason Xing wrote:
>>>>>> From: Jason Xing <kernelxing@tencent.com>
>>>>>>
>>>>>> Willem suggested that we use a static key to control. The advantage
>>>>>> is that we will not affect the existing applications at all if we
>>>>>> don't load BPF program.
>>>>>>
>>>>>> In this patch, except the static key, I also add one logic that is
>>>>>> used to test if the socket has enabled its tsflags in order to
>>>>>> support bpf logic to allow both cases to happen at the same time.
>>>>>> Or else, the skb carring related timestamp flag doesn't know which
>>>>>> way of printing is desirable.
>>>>>>
>>>>>> One thing important is this patch allows print from both applications
>>>>>> and bpf program at the same time. Now we have three kinds of print:
>>>>>> 1) only BPF program prints
>>>>>> 2) only application program prints
>>>>>> 3) both can print without side effect
>>>>>>
>>>>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
>>>>>> ---
>>>>>> include/net/sock.h | 1 +
>>>>>> net/core/filter.c | 3 +++
>>>>>> net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
>>>>>> 3 files changed, 42 insertions(+)
>>>>>>
>>>>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>>>>> index 66ecd78f1dfe..b7c51b95c92d 100644
>>>>>> --- a/include/net/sock.h
>>>>>> +++ b/include/net/sock.h
>>>>>> @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
>>>>>> void sock_def_readable(struct sock *sk);
>>>>>>
>>>>>> int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
>>>>>> +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
>>>>>> void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
>>>>>> int sock_get_timestamping(struct so_timestamping *timestamping,
>>>>>> sockptr_t optval, unsigned int optlen);
>>>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>>>> index 996426095bd9..08135f538c99 100644
>>>>>> --- a/net/core/filter.c
>>>>>> +++ b/net/core/filter.c
>>>>>> @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
>>>>>> .arg1_type = ARG_PTR_TO_CTX,
>>>>>> };
>>>>>>
>>>>>> +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
>>>>>> +
>>>>>> static int bpf_sock_set_timestamping(struct sock *sk,
>>>>>> struct so_timestamping *timestamping)
>>>>>> {
>>>>>> @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
>>>>>> return -EINVAL;
>>>>>>
>>>>>> WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
>>>>>> + static_branch_enable(&bpf_tstamp_control);
>>>>>
>>>>> Not sure when is a good time to do static_branch_disable().
>>>>
>>>> Thanks for the review.
>>>>
>>>> To be honest, I considered how to disable the static key. Like you
>>>> said, I failed to find a good chance that I can accurately disable it.
>>>>
>>>>>
>>>>> The bpf prog may be detached also. (IF) it ends up staying with the
>>>>> cgroup/sockops interface, it should depend on the existing static key in
>>>>> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
>>>>
>>>> Are you suggesting that we need to remove the current static key? In
>>>> the previous thread, the reason why Willem came up with this idea is,
>>>> I think, to avoid affect the non-bpf timestamping feature.
>>>>
>>>>>
>>>>>>
>>>>>> return 0;
>>>>>> }
>>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>>> index f36eb9daa31a..d0f912f1ff7b 100644
>>>>>> --- a/net/core/skbuff.c
>>>>>> +++ b/net/core/skbuff.c
>>>>>> @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
>>>>>> }
>>>>>> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
>>>>>>
>>>>>> +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
>>>>>
>>>>> sk is unused.
>>>>
>>>> Thanks for the careful check.
>>>>
>>>>>
>>>>>> +{
>>>>>> + u32 testflag;
>>>>>> +
>>>>>> + switch (tstype) {
>>>>>> + case SCM_TSTAMP_SCHED:
>>>>>
>>>>> Instead of doing this translation,
>>>>> is it easier to directly store the bpf prog desired ts"type" (i.e. the
>>>>> SCM_TSTAMP_*) in the sk->sk_tsflags_bpf?
>>>>> or there is a specific need to keep the SOF_TIMESTAMPING_* value in
>>>>> sk->sk_tsflags_bpf?
>>>>
>>>> We have to reuse SOF_TIMESTAMPING_* because there are more flags, say,
>>>> SOF_TIMESTAMPING_OPT_ID, that we need to support.
>>>>
>>>>>
>>>>>> + testflag = SOF_TIMESTAMPING_TX_SCHED;
>>>>>> + break;
>>>>>> + case SCM_TSTAMP_SND:
>>>>>> + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
>>>>>> + break;
>>>>>> + case SCM_TSTAMP_ACK:
>>>>>> + testflag = SOF_TIMESTAMPING_TX_ACK;
>>>>>> + break;
>>>>>> + default:
>>>>>> + return false;
>>>>>> + }
>>>>>> + if (tsflags & testflag)
>>>>>> + return true;
>>>>>> +
>>>>>> + return false;
>>>>>> +}
>>>>>> +
>>>>>> static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>>>>>> const struct sk_buff *ack_skb,
>>>>>> struct skb_shared_hwtstamps *hwtstamps,
>>>>>> @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>>>>>> if (!skb_may_tx_timestamp(sk, tsonly))
>>>>>> return;
>>>>>>
>>>>>> + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
>>>>>
>>>>> This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
>>>>
>>>> This test will be used in bpf and non-bpf cases. Because of this, we
>>>> can support BPF extension. In this function, if skb has tsflags but we
>>>> don't know which approach the user expects, sk_tstamp_tx_flags() can
>>>> help us.
>>>>
>>>>>
>>>>> My understanding is the sendmsg can provide SOF_TIMESTAMPING_* for individual
>>>>> skb. Would it break?
>>>>
>>>> Oh, you're right. I didn't support cmsg mode...
>>>
>>> I think I only need to test if it's in the bpf mode, or else let the
>>> original way print the timestamp, which can solve the issue.
>>
>> From looking at the existing "__skb_tstamp_tx(skb, NULL, NULL, skb->sk,
>> SCM_TSTAMP_SCHED);":
>>
>> int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
>> {
>> /* ... */
>>
>> if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
>> __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
>>
>> /* ... */
>> }
>>
>> I am still puzzling how __skb_tstamp_tx() will be called if only bpf has enabled
>> the timestamping. I may have missed somewhere in the patch set that the skb's
>> tx_flags is changed by sk->sk_tsflags_bpf alone?
>
> If sk_tsflags_bpf is set, tcp_sendmsg() -> tcp_tx_timestamp() will be
> helpful, which initializes every last skb, please see patch [10/12].
Ah. ok. It is the thing I missed. Thanks for the pointer.
>>
>> I think a skb tskey is still desired (?), so eventually we want some spaces in
>
> tskey function is optional I think. It depends whether users want to
> use it or not. It can controlled by SOF_TIMESTAMPING_OPT_ID flag.
>
>> the skb for bpf. Jakub Sitnicki (cc-ed) has presented in LPC about extending
>> skb->data_meta usage outside of xdp and tc. I think here we want to have it
>> available at the tx side to store the tx_flags and tskey but probably want them
>> at a specific place/offset at the data_meta.
>
> If we have the plan to store extra information in data_meta, I can
> give it a try:)
>
>>
>> For now, is there thing we can explore to share in the skb_shared_info?
>
> My initial thought is just to reuse these fields in skb. It can work
> without interfering one another.
After reading patch 10 more closely, I am likely still missing something. How can
it tell if the tx_flags is set by the bpf or by the user space cmsg?
>
>> Can the "struct skb_shared_hwtstamps hwtstamps;" be used for the bpf tx_flags and tskey
>> only at the "tx" side? There is already another union member.
>
> tskey is always used in the tx path.
>
> hwtstamps can be used in both rx and tx cases (please see
> tcp_update_recv_tstamps() and skb_tstamp_tx()).
hmm... we only need somewhere to store the bpf tx_flags and bpf tskey in the
TX-ing skb. Do you mean the hwtstamps of a TX-ing skb is not empty?
At skb_tstamp_tx (TX side only?), the orig_skb's hwtstamps has not been written yet?
>
>> The hwtstamps should only be needed when the NIC is done sending?
>
> In this patch, yes, hwtstamps are the records in tx path.
>
> Thanks,
> Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 7:01 ` Martin KaFai Lau
@ 2024-10-16 7:54 ` Jason Xing
2024-10-16 8:31 ` Martin KaFai Lau
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-16 7:54 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On Wed, Oct 16, 2024 at 3:01 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/15/24 11:30 PM, Jason Xing wrote:
> > On Wed, Oct 16, 2024 at 2:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 10/15/24 6:32 PM, Jason Xing wrote:
> >>> On Wed, Oct 16, 2024 at 9:04 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >>>>
> >>>> On Wed, Oct 16, 2024 at 8:10 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>>
> >>>>> On 10/11/24 9:06 PM, Jason Xing wrote:
> >>>>>> From: Jason Xing <kernelxing@tencent.com>
> >>>>>>
> >>>>>> Willem suggested that we use a static key to control. The advantage
> >>>>>> is that we will not affect the existing applications at all if we
> >>>>>> don't load BPF program.
> >>>>>>
> >>>>>> In this patch, except the static key, I also add one logic that is
> >>>>>> used to test if the socket has enabled its tsflags in order to
> >>>>>> support bpf logic to allow both cases to happen at the same time.
> >>>>>> Or else, the skb carring related timestamp flag doesn't know which
> >>>>>> way of printing is desirable.
> >>>>>>
> >>>>>> One thing important is this patch allows print from both applications
> >>>>>> and bpf program at the same time. Now we have three kinds of print:
> >>>>>> 1) only BPF program prints
> >>>>>> 2) only application program prints
> >>>>>> 3) both can print without side effect
> >>>>>>
> >>>>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >>>>>> ---
> >>>>>> include/net/sock.h | 1 +
> >>>>>> net/core/filter.c | 3 +++
> >>>>>> net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
> >>>>>> 3 files changed, 42 insertions(+)
> >>>>>>
> >>>>>> diff --git a/include/net/sock.h b/include/net/sock.h
> >>>>>> index 66ecd78f1dfe..b7c51b95c92d 100644
> >>>>>> --- a/include/net/sock.h
> >>>>>> +++ b/include/net/sock.h
> >>>>>> @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
> >>>>>> void sock_def_readable(struct sock *sk);
> >>>>>>
> >>>>>> int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
> >>>>>> +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
> >>>>>> void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
> >>>>>> int sock_get_timestamping(struct so_timestamping *timestamping,
> >>>>>> sockptr_t optval, unsigned int optlen);
> >>>>>> diff --git a/net/core/filter.c b/net/core/filter.c
> >>>>>> index 996426095bd9..08135f538c99 100644
> >>>>>> --- a/net/core/filter.c
> >>>>>> +++ b/net/core/filter.c
> >>>>>> @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> >>>>>> .arg1_type = ARG_PTR_TO_CTX,
> >>>>>> };
> >>>>>>
> >>>>>> +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
> >>>>>> +
> >>>>>> static int bpf_sock_set_timestamping(struct sock *sk,
> >>>>>> struct so_timestamping *timestamping)
> >>>>>> {
> >>>>>> @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
> >>>>>> return -EINVAL;
> >>>>>>
> >>>>>> WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> >>>>>> + static_branch_enable(&bpf_tstamp_control);
> >>>>>
> >>>>> Not sure when is a good time to do static_branch_disable().
> >>>>
> >>>> Thanks for the review.
> >>>>
> >>>> To be honest, I considered how to disable the static key. Like you
> >>>> said, I failed to find a good chance that I can accurately disable it.
> >>>>
> >>>>>
> >>>>> The bpf prog may be detached also. (IF) it ends up staying with the
> >>>>> cgroup/sockops interface, it should depend on the existing static key in
> >>>>> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
> >>>>
> >>>> Are you suggesting that we need to remove the current static key? In
> >>>> the previous thread, the reason why Willem came up with this idea is,
> >>>> I think, to avoid affect the non-bpf timestamping feature.
> >>>>
> >>>>>
> >>>>>>
> >>>>>> return 0;
> >>>>>> }
> >>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>>>> index f36eb9daa31a..d0f912f1ff7b 100644
> >>>>>> --- a/net/core/skbuff.c
> >>>>>> +++ b/net/core/skbuff.c
> >>>>>> @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> >>>>>> }
> >>>>>> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> >>>>>>
> >>>>>> +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> >>>>>
> >>>>> sk is unused.
> >>>>
> >>>> Thanks for the careful check.
> >>>>
> >>>>>
> >>>>>> +{
> >>>>>> + u32 testflag;
> >>>>>> +
> >>>>>> + switch (tstype) {
> >>>>>> + case SCM_TSTAMP_SCHED:
> >>>>>
> >>>>> Instead of doing this translation,
> >>>>> is it easier to directly store the bpf prog desired ts"type" (i.e. the
> >>>>> SCM_TSTAMP_*) in the sk->sk_tsflags_bpf?
> >>>>> or there is a specific need to keep the SOF_TIMESTAMPING_* value in
> >>>>> sk->sk_tsflags_bpf?
> >>>>
> >>>> We have to reuse SOF_TIMESTAMPING_* because there are more flags, say,
> >>>> SOF_TIMESTAMPING_OPT_ID, that we need to support.
> >>>>
> >>>>>
> >>>>>> + testflag = SOF_TIMESTAMPING_TX_SCHED;
> >>>>>> + break;
> >>>>>> + case SCM_TSTAMP_SND:
> >>>>>> + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> >>>>>> + break;
> >>>>>> + case SCM_TSTAMP_ACK:
> >>>>>> + testflag = SOF_TIMESTAMPING_TX_ACK;
> >>>>>> + break;
> >>>>>> + default:
> >>>>>> + return false;
> >>>>>> + }
> >>>>>> + if (tsflags & testflag)
> >>>>>> + return true;
> >>>>>> +
> >>>>>> + return false;
> >>>>>> +}
> >>>>>> +
> >>>>>> static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >>>>>> const struct sk_buff *ack_skb,
> >>>>>> struct skb_shared_hwtstamps *hwtstamps,
> >>>>>> @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >>>>>> if (!skb_may_tx_timestamp(sk, tsonly))
> >>>>>> return;
> >>>>>>
> >>>>>> + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> >>>>>
> >>>>> This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
> >>>>
> >>>> This test will be used in bpf and non-bpf cases. Because of this, we
> >>>> can support BPF extension. In this function, if skb has tsflags but we
> >>>> don't know which approach the user expects, sk_tstamp_tx_flags() can
> >>>> help us.
> >>>>
> >>>>>
> >>>>> My understanding is the sendmsg can provide SOF_TIMESTAMPING_* for individual
> >>>>> skb. Would it break?
> >>>>
> >>>> Oh, you're right. I didn't support cmsg mode...
> >>>
> >>> I think I only need to test if it's in the bpf mode, or else let the
> >>> original way print the timestamp, which can solve the issue.
> >>
> >> From looking at the existing "__skb_tstamp_tx(skb, NULL, NULL, skb->sk,
> >> SCM_TSTAMP_SCHED);":
> >>
> >> int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
> >> {
> >> /* ... */
> >>
> >> if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
> >> __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
> >>
> >> /* ... */
> >> }
> >>
> >> I am still puzzling how __skb_tstamp_tx() will be called if only bpf has enabled
> >> the timestamping. I may have missed somewhere in the patch set that the skb's
> >> tx_flags is changed by sk->sk_tsflags_bpf alone?
> >
> > If sk_tsflags_bpf is set, tcp_sendmsg() -> tcp_tx_timestamp() will be
> > helpful, which initializes every last skb, please see patch [10/12].
>
> Ah. ok. It is the thing I missed. Thanks for the pointer.
>
> >>
> >> I think a skb tskey is still desired (?), so eventually we want some spaces in
> >
> > tskey function is optional I think. It depends whether users want to
> > use it or not. It can controlled by SOF_TIMESTAMPING_OPT_ID flag.
> >
> >> the skb for bpf. Jakub Sitnicki (cc-ed) has presented in LPC about extending
> >> skb->data_meta usage outside of xdp and tc. I think here we want to have it
> >> available at the tx side to store the tx_flags and tskey but probably want them
> >> at a specific place/offset at the data_meta.
> >
> > If we have the plan to store extra information in data_meta, I can
> > give it a try:)
> >
> >>
> >> For now, is there thing we can explore to share in the skb_shared_info?
> >
> > My initial thought is just to reuse these fields in skb. It can work
> > without interfering one another.
>
> After reading closer to patch 10, I am likely still missing something. How can
> it tell if the tx_flags is set by the bpf or by the user space cmsg?
If the skb carries the timestamp, there are three cases:
1) non-bpf case where users use setsockopt()
2) cmsg case
3) bpf case
#1 and #2 are already handled well before this patch. I only need to
test if sk_tsflags_bpf has those flags. If so, it means we hit #3, or
else it could be #1 or #2, then we will let the old way print
timestamps in __skb_tstamp_tx().
>
> >
> >> Can the "struct skb_shared_hwtstamps hwtstamps;" be used for the bpf tx_flags and tskey
> >> only at the "tx" side? There is already another union member.
> >
> > tskey is always used in the tx path.
> >
> > hwtstamps can be used in both rx and tx cases (please see
> > tcp_update_recv_tstamps() and skb_tstamp_tx()).
>
> hmm... we only need some where to store the bpf tx_flags and bpf tskey in the
> TX-ing skb.
And there is one more field we have to take care of: txstamp_ack which
indicates whether we print timestamp when the last skb is acked.
Please see tcp_tx_timestamp().
> You meant the hwtstamps of a Tx-ing skb is not empty?
Sometimes, it's not empty if the hardware supports the timestamp
feature and the user wants to see it (by enabling the
SOF_TIMESTAMPING_TX_HARDWARE flag). As we can see, there are many
callers calling skb_tstamp_tx().
>
> At skb_tstamp_tx (TX side only?), the orig_skb's hwtstamps has not been written yet?
I'm not that sure about the orig_skb. It seems not. I can see some
callers reading the ptp timestamp from the nic and passing the timestamp to
skb_tstamp_tx().
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 7:54 ` Jason Xing
@ 2024-10-16 8:31 ` Martin KaFai Lau
2024-10-16 10:36 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-16 8:31 UTC (permalink / raw)
To: Jason Xing
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On 10/16/24 12:54 AM, Jason Xing wrote:
> On Wed, Oct 16, 2024 at 3:01 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 10/15/24 11:30 PM, Jason Xing wrote:
>>> On Wed, Oct 16, 2024 at 2:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>>
>>>> On 10/15/24 6:32 PM, Jason Xing wrote:
>>>>> On Wed, Oct 16, 2024 at 9:04 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>>>>>>
>>>>>> On Wed, Oct 16, 2024 at 8:10 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>>>>>
>>>>>>> On 10/11/24 9:06 PM, Jason Xing wrote:
>>>>>>>> From: Jason Xing <kernelxing@tencent.com>
>>>>>>>>
>>>>>>>> Willem suggested that we use a static key to control. The advantage
>>>>>>>> is that we will not affect the existing applications at all if we
>>>>>>>> don't load BPF program.
>>>>>>>>
>>>>>>>> In this patch, except the static key, I also add one logic that is
>>>>>>>> used to test if the socket has enabled its tsflags in order to
>>>>>>>> support bpf logic to allow both cases to happen at the same time.
>>>>>>>> Or else, the skb carring related timestamp flag doesn't know which
>>>>>>>> way of printing is desirable.
>>>>>>>>
>>>>>>>> One thing important is this patch allows print from both applications
>>>>>>>> and bpf program at the same time. Now we have three kinds of print:
>>>>>>>> 1) only BPF program prints
>>>>>>>> 2) only application program prints
>>>>>>>> 3) both can print without side effect
>>>>>>>>
>>>>>>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
>>>>>>>> ---
>>>>>>>> include/net/sock.h | 1 +
>>>>>>>> net/core/filter.c | 3 +++
>>>>>>>> net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
>>>>>>>> 3 files changed, 42 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>>>>>>> index 66ecd78f1dfe..b7c51b95c92d 100644
>>>>>>>> --- a/include/net/sock.h
>>>>>>>> +++ b/include/net/sock.h
>>>>>>>> @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
>>>>>>>> void sock_def_readable(struct sock *sk);
>>>>>>>>
>>>>>>>> int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
>>>>>>>> +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
>>>>>>>> void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
>>>>>>>> int sock_get_timestamping(struct so_timestamping *timestamping,
>>>>>>>> sockptr_t optval, unsigned int optlen);
>>>>>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>>>>>> index 996426095bd9..08135f538c99 100644
>>>>>>>> --- a/net/core/filter.c
>>>>>>>> +++ b/net/core/filter.c
>>>>>>>> @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
>>>>>>>> .arg1_type = ARG_PTR_TO_CTX,
>>>>>>>> };
>>>>>>>>
>>>>>>>> +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
>>>>>>>> +
>>>>>>>> static int bpf_sock_set_timestamping(struct sock *sk,
>>>>>>>> struct so_timestamping *timestamping)
>>>>>>>> {
>>>>>>>> @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
>>>>>>>> return -EINVAL;
>>>>>>>>
>>>>>>>> WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
>>>>>>>> + static_branch_enable(&bpf_tstamp_control);
>>>>>>>
>>>>>>> Not sure when is a good time to do static_branch_disable().
>>>>>>
>>>>>> Thanks for the review.
>>>>>>
>>>>>> To be honest, I considered how to disable the static key. Like you
>>>>>> said, I failed to find a good chance that I can accurately disable it.
>>>>>>
>>>>>>>
>>>>>>> The bpf prog may be detached also. (IF) it ends up staying with the
>>>>>>> cgroup/sockops interface, it should depend on the existing static key in
>>>>>>> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
>>>>>>
>>>>>> Are you suggesting that we need to remove the current static key? In
>>>>>> the previous thread, the reason why Willem came up with this idea is,
>>>>>> I think, to avoid affect the non-bpf timestamping feature.
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> return 0;
>>>>>>>> }
>>>>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>>>>> index f36eb9daa31a..d0f912f1ff7b 100644
>>>>>>>> --- a/net/core/skbuff.c
>>>>>>>> +++ b/net/core/skbuff.c
>>>>>>>> @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
>>>>>>>> }
>>>>>>>> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
>>>>>>>>
>>>>>>>> +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
>>>>>>>
>>>>>>> sk is unused.
>>>>>>
>>>>>> Thanks for the careful check.
>>>>>>
>>>>>>>
>>>>>>>> +{
>>>>>>>> + u32 testflag;
>>>>>>>> +
>>>>>>>> + switch (tstype) {
>>>>>>>> + case SCM_TSTAMP_SCHED:
>>>>>>>
>>>>>>> Instead of doing this translation,
>>>>>>> is it easier to directly store the bpf prog desired ts"type" (i.e. the
>>>>>>> SCM_TSTAMP_*) in the sk->sk_tsflags_bpf?
>>>>>>> or there is a specific need to keep the SOF_TIMESTAMPING_* value in
>>>>>>> sk->sk_tsflags_bpf?
>>>>>>
>>>>>> We have to reuse SOF_TIMESTAMPING_* because there are more flags, say,
>>>>>> SOF_TIMESTAMPING_OPT_ID, that we need to support.
>>>>>>
>>>>>>>
>>>>>>>> + testflag = SOF_TIMESTAMPING_TX_SCHED;
>>>>>>>> + break;
>>>>>>>> + case SCM_TSTAMP_SND:
>>>>>>>> + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
>>>>>>>> + break;
>>>>>>>> + case SCM_TSTAMP_ACK:
>>>>>>>> + testflag = SOF_TIMESTAMPING_TX_ACK;
>>>>>>>> + break;
>>>>>>>> + default:
>>>>>>>> + return false;
>>>>>>>> + }
>>>>>>>> + if (tsflags & testflag)
>>>>>>>> + return true;
>>>>>>>> +
>>>>>>>> + return false;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>>>>>>>> const struct sk_buff *ack_skb,
>>>>>>>> struct skb_shared_hwtstamps *hwtstamps,
>>>>>>>> @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>>>>>>>> if (!skb_may_tx_timestamp(sk, tsonly))
>>>>>>>> return;
>>>>>>>>
>>>>>>>> + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
>>>>>>>
>>>>>>> This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
>>>>>>
>>>>>> This test will be used in bpf and non-bpf cases. Because of this, we
>>>>>> can support BPF extension. In this function, if skb has tsflags but we
>>>>>> don't know which approach the user expects, sk_tstamp_tx_flags() can
>>>>>> help us.
>>>>>>
>>>>>>>
>>>>>>> My understanding is the sendmsg can provide SOF_TIMESTAMPING_* for individual
>>>>>>> skb. Would it break?
>>>>>>
>>>>>> Oh, you're right. I didn't support cmsg mode...
>>>>>
>>>>> I think I only need to test if it's in the bpf mode, or else let the
>>>>> original way print the timestamp, which can solve the issue.
>>>>
>>>> From looking at the existing "__skb_tstamp_tx(skb, NULL, NULL, skb->sk,
>>>> SCM_TSTAMP_SCHED);":
>>>>
>>>> int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
>>>> {
>>>> /* ... */
>>>>
>>>> if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
>>>> __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
>>>>
>>>> /* ... */
>>>> }
>>>>
>>>> I am still puzzling how __skb_tstamp_tx() will be called if only bpf has enabled
>>>> the timestamping. I may have missed somewhere in the patch set that the skb's
>>>> tx_flags is changed by sk->sk_tsflags_bpf alone?
>>>
>>> If sk_tsflags_bpf is set, tcp_sendmsg() -> tcp_tx_timestamp() will be
>>> helpful, which initializes every last skb, please see patch [10/12].
>>
>> Ah. ok. It is the thing I missed. Thanks for the pointer.
>>
>>>>
>>>> I think a skb tskey is still desired (?), so eventually we want some spaces in
>>>
>>> tskey function is optional I think. It depends whether users want to
>>> use it or not. It can controlled by SOF_TIMESTAMPING_OPT_ID flag.
>>>
>>>> the skb for bpf. Jakub Sitnicki (cc-ed) has presented in LPC about extending
>>>> skb->data_meta usage outside of xdp and tc. I think here we want to have it
>>>> available at the tx side to store the tx_flags and tskey but probably want them
>>>> at a specific place/offset at the data_meta.
>>>
>>> If we have the plan to store extra information in data_meta, I can
>>> give it a try:)
>>>
>>>>
>>>> For now, is there thing we can explore to share in the skb_shared_info?
>>>
>>> My initial thought is just to reuse these fields in skb. It can work
>>> without interfering one another.
>>
>> After reading closer to patch 10, I am likely still missing something. How can
>> it tell if the tx_flags is set by the bpf or by the user space cmsg?
>
> If the skb carries the timestamp, there are three cases:
> 1) non-bpf case and users uses setsockopt()
> 2) cmsg case
> 3) bpf case
>
> #1 and #2 are already handled well before this patch. I only need to
> test if sk_tsflags_bpf has those flags. If so, it means we hit #3, or
> else it could be #1 or #2, then we will let the old way print
> timestamps in __skb_tstamp_tx().
hmm... I am still not sure I fully understand...but I think I may start getting it.
Is it the reason that the bpf_setsockopt() cannot clear the sk_tsflags_bpf once
it is set in patch 2? It is not a usable api tbh. It will be a surprise to many.
It has to be able to set and clear.
Does it also mean either the bpf or the user space can enable the timestamping
but not both? I don't think we can assume this also. It will be hard to deploy
the bpf prog in production to collect continuous data. The user space may have
some timestamping enabled but the bpf may want to do its parallel investigation
also. The user space may roll out timestamping in the future and suddenly break
the bpf prog.
[ getting late here. will continue later. ]
>
>>
>>>
>>>> Can the "struct skb_shared_hwtstamps hwtstamps;" be used for the bpf tx_flags and tskey
>>>> only at the "tx" side? There is already another union member.
>>>
>>> tskey is always used in the tx path.
>>>
>>> hwtstamps can be used in both rx and tx cases (please see
>>> tcp_update_recv_tstamps() and skb_tstamp_tx()).
>>
>> hmm... we only need some where to store the bpf tx_flags and bpf tskey in the
>> TX-ing skb.
>
> And there is one more field we have to take care of: txstamp_ack which
> indicates whether we print timestamp when the last skb is acked.
> Please see tcp_tx_timestamp().
>
>> You meant the hwtstamps of a Tx-ing skb is not empty?
>
> Sometimes, it's not empty if the hardware supports the timestamp
> feature and the user wants to see it (by enabling the
> SOF_TIMESTAMPING_TX_HARDWARE flag). As we can see, there are many
> callers calling skb_tstamp_tx().
>
>>
>> At skb_tstamp_tx (TX side only?), the orig_skb's hwtstamps has not been written yet?
>
> I'm not that sure about the orig_skb. It seems no. I can see some
> callers reading ptp timestamp from the nic and pass the timestamp to
> skb_tstamp_tx().
>
> Thanks,
> Jason
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 8:31 ` Martin KaFai Lau
@ 2024-10-16 10:36 ` Jason Xing
2024-10-17 0:48 ` Martin KaFai Lau
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-16 10:36 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On Wed, Oct 16, 2024 at 4:31 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/16/24 12:54 AM, Jason Xing wrote:
> > On Wed, Oct 16, 2024 at 3:01 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 10/15/24 11:30 PM, Jason Xing wrote:
> >>> On Wed, Oct 16, 2024 at 2:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>
> >>>> On 10/15/24 6:32 PM, Jason Xing wrote:
> >>>>> On Wed, Oct 16, 2024 at 9:04 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >>>>>>
> >>>>>> On Wed, Oct 16, 2024 at 8:10 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>>>>
> >>>>>>> On 10/11/24 9:06 PM, Jason Xing wrote:
> >>>>>>>> From: Jason Xing <kernelxing@tencent.com>
> >>>>>>>>
> >>>>>>>> Willem suggested that we use a static key to control. The advantage
> >>>>>>>> is that we will not affect the existing applications at all if we
> >>>>>>>> don't load BPF program.
> >>>>>>>>
> >>>>>>>> In this patch, except the static key, I also add one logic that is
> >>>>>>>> used to test if the socket has enabled its tsflags in order to
> >>>>>>>> support bpf logic to allow both cases to happen at the same time.
> >>>>>>>> Or else, the skb carring related timestamp flag doesn't know which
> >>>>>>>> way of printing is desirable.
> >>>>>>>>
> >>>>>>>> One thing important is this patch allows print from both applications
> >>>>>>>> and bpf program at the same time. Now we have three kinds of print:
> >>>>>>>> 1) only BPF program prints
> >>>>>>>> 2) only application program prints
> >>>>>>>> 3) both can print without side effect
> >>>>>>>>
> >>>>>>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >>>>>>>> ---
> >>>>>>>> include/net/sock.h | 1 +
> >>>>>>>> net/core/filter.c | 3 +++
> >>>>>>>> net/core/skbuff.c | 38 ++++++++++++++++++++++++++++++++++++++
> >>>>>>>> 3 files changed, 42 insertions(+)
> >>>>>>>>
> >>>>>>>> diff --git a/include/net/sock.h b/include/net/sock.h
> >>>>>>>> index 66ecd78f1dfe..b7c51b95c92d 100644
> >>>>>>>> --- a/include/net/sock.h
> >>>>>>>> +++ b/include/net/sock.h
> >>>>>>>> @@ -2889,6 +2889,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
> >>>>>>>> void sock_def_readable(struct sock *sk);
> >>>>>>>>
> >>>>>>>> int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
> >>>>>>>> +DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
> >>>>>>>> void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
> >>>>>>>> int sock_get_timestamping(struct so_timestamping *timestamping,
> >>>>>>>> sockptr_t optval, unsigned int optlen);
> >>>>>>>> diff --git a/net/core/filter.c b/net/core/filter.c
> >>>>>>>> index 996426095bd9..08135f538c99 100644
> >>>>>>>> --- a/net/core/filter.c
> >>>>>>>> +++ b/net/core/filter.c
> >>>>>>>> @@ -5204,6 +5204,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> >>>>>>>> .arg1_type = ARG_PTR_TO_CTX,
> >>>>>>>> };
> >>>>>>>>
> >>>>>>>> +DEFINE_STATIC_KEY_FALSE(bpf_tstamp_control);
> >>>>>>>> +
> >>>>>>>> static int bpf_sock_set_timestamping(struct sock *sk,
> >>>>>>>> struct so_timestamping *timestamping)
> >>>>>>>> {
> >>>>>>>> @@ -5217,6 +5219,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
> >>>>>>>> return -EINVAL;
> >>>>>>>>
> >>>>>>>> WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> >>>>>>>> + static_branch_enable(&bpf_tstamp_control);
> >>>>>>>
> >>>>>>> Not sure when is a good time to do static_branch_disable().
> >>>>>>
> >>>>>> Thanks for the review.
> >>>>>>
> >>>>>> To be honest, I considered how to disable the static key. Like you
> >>>>>> said, I failed to find a good chance that I can accurately disable it.
> >>>>>>
> >>>>>>>
> >>>>>>> The bpf prog may be detached also. (IF) it ends up staying with the
> >>>>>>> cgroup/sockops interface, it should depend on the existing static key in
> >>>>>>> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
> >>>>>>
> >>>>>> Are you suggesting that we need to remove the current static key? In
> >>>>>> the previous thread, the reason why Willem came up with this idea is,
> >>>>>> I think, to avoid affect the non-bpf timestamping feature.
> >>>>>>
> >>>>>>>
> >>>>>>>>
> >>>>>>>> return 0;
> >>>>>>>> }
> >>>>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>>>>>> index f36eb9daa31a..d0f912f1ff7b 100644
> >>>>>>>> --- a/net/core/skbuff.c
> >>>>>>>> +++ b/net/core/skbuff.c
> >>>>>>>> @@ -5540,6 +5540,29 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> >>>>>>>> }
> >>>>>>>> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> >>>>>>>>
> >>>>>>>> +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> >>>>>>>
> >>>>>>> sk is unused.
> >>>>>>
> >>>>>> Thanks for the careful check.
> >>>>>>
> >>>>>>>
> >>>>>>>> +{
> >>>>>>>> + u32 testflag;
> >>>>>>>> +
> >>>>>>>> + switch (tstype) {
> >>>>>>>> + case SCM_TSTAMP_SCHED:
> >>>>>>>
> >>>>>>> Instead of doing this translation,
> >>>>>>> is it easier to directly store the bpf prog desired ts"type" (i.e. the
> >>>>>>> SCM_TSTAMP_*) in the sk->sk_tsflags_bpf?
> >>>>>>> or there is a specific need to keep the SOF_TIMESTAMPING_* value in
> >>>>>>> sk->sk_tsflags_bpf?
> >>>>>>
> >>>>>> We have to reuse SOF_TIMESTAMPING_* because there are more flags, say,
> >>>>>> SOF_TIMESTAMPING_OPT_ID, that we need to support.
> >>>>>>
> >>>>>>>
> >>>>>>>> + testflag = SOF_TIMESTAMPING_TX_SCHED;
> >>>>>>>> + break;
> >>>>>>>> + case SCM_TSTAMP_SND:
> >>>>>>>> + testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> >>>>>>>> + break;
> >>>>>>>> + case SCM_TSTAMP_ACK:
> >>>>>>>> + testflag = SOF_TIMESTAMPING_TX_ACK;
> >>>>>>>> + break;
> >>>>>>>> + default:
> >>>>>>>> + return false;
> >>>>>>>> + }
> >>>>>>>> + if (tsflags & testflag)
> >>>>>>>> + return true;
> >>>>>>>> +
> >>>>>>>> + return false;
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >>>>>>>> const struct sk_buff *ack_skb,
> >>>>>>>> struct skb_shared_hwtstamps *hwtstamps,
> >>>>>>>> @@ -5558,6 +5581,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >>>>>>>> if (!skb_may_tx_timestamp(sk, tsonly))
> >>>>>>>> return;
> >>>>>>>>
> >>>>>>>> + if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> >>>>>>>
> >>>>>>> This is a new test. tsflags is the sk->sk_tsflags here if I read it correctly.
> >>>>>>
> >>>>>> This test will be used in bpf and non-bpf cases. Because of this, we
> >>>>>> can support BPF extension. In this function, if skb has tsflags but we
> >>>>>> don't know which approach the user expects, sk_tstamp_tx_flags() can
> >>>>>> help us.
> >>>>>>
> >>>>>>>
> >>>>>>> My understanding is the sendmsg can provide SOF_TIMESTAMPING_* for individual
> >>>>>>> skb. Would it break?
> >>>>>>
> >>>>>> Oh, you're right. I didn't support cmsg mode...
> >>>>>
> >>>>> I think I only need to test if it's in the bpf mode, or else let the
> >>>>> original way print the timestamp, which can solve the issue.
> >>>>
> >>>> From looking at the existing "__skb_tstamp_tx(skb, NULL, NULL, skb->sk,
> >>>> SCM_TSTAMP_SCHED);":
> >>>>
> >>>> int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
> >>>> {
> >>>> /* ... */
> >>>>
> >>>> if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
> >>>> __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
> >>>>
> >>>> /* ... */
> >>>> }
> >>>>
> >>>> I am still puzzling how __skb_tstamp_tx() will be called if only bpf has enabled
> >>>> the timestamping. I may have missed somewhere in the patch set that the skb's
> >>>> tx_flags is changed by sk->sk_tsflags_bpf alone?
> >>>
> >>> If sk_tsflags_bpf is set, tcp_sendmsg() -> tcp_tx_timestamp() will be
> >>> helpful, which initializes every last skb, please see patch [10/12].
> >>
> >> Ah. ok. It is the thing I missed. Thanks for the pointer.
> >>
> >>>>
> >>>> I think a skb tskey is still desired (?), so eventually we want some spaces in
> >>>
> >>> tskey function is optional I think. It depends whether users want to
> >>> use it or not. It can controlled by SOF_TIMESTAMPING_OPT_ID flag.
> >>>
> >>>> the skb for bpf. Jakub Sitnicki (cc-ed) has presented in LPC about extending
> >>>> skb->data_meta usage outside of xdp and tc. I think here we want to have it
> >>>> available at the tx side to store the tx_flags and tskey but probably want them
> >>>> at a specific place/offset at the data_meta.
> >>>
> >>> If we have the plan to store extra information in data_meta, I can
> >>> give it a try:)
> >>>
> >>>>
> >>>> For now, is there thing we can explore to share in the skb_shared_info?
> >>>
> >>> My initial thought is just to reuse these fields in skb. It can work
> >>> without interfering one another.
> >>
> >> After reading closer to patch 10, I am likely still missing something. How can
> >> it tell if the tx_flags is set by the bpf or by the user space cmsg?
> >
> > If the skb carries the timestamp, there are three cases:
> > 1) non-bpf case and users uses setsockopt()
> > 2) cmsg case
> > 3) bpf case
> >
> > #1 and #2 are already handled well before this patch. I only need to
> > test if sk_tsflags_bpf has those flags. If so, it means we hit #3, or
> > else it could be #1 or #2, then we will let the old way print
> > timestamps in __skb_tstamp_tx().
>
> hmm... I am still not sure I fully understand...but I think I may start getting it.
Sorry, my bad. I gave the wrong answer...
It should be:
Testing if sk_tsflags has SOF_TIMESTAMPING_SOFTWARE flag should
work fine. If it has the flag, we could use skb_tstamp_tx_output() to
print based on patch [4/12]; if not, we will use
bpf_skb_tstamp_tx_output() to print.
If users use traditional ways of deploying SO_TIMESTAMPING, sk_tsflags
always has SOF_TIMESTAMPING_SOFTWARE which is a software report flag
(please see Documentation/networking/timestamping.rst). We can see a
good example on how to use in
tools/testing/selftests/net/txtimestamp.c:
do_test()
{
sock_opt = SOF_TIMESTAMPING_SOFTWARE |
...
if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
(char *) &sock_opt, sizeof(sock_opt)))
}
>
> Is it the reason that the bpf_setsockopt() cannot clear the sk_tsflags_bpf once
> it is set in patch 2? It is not a usable api tbh. It will be a surprise to many.
> It has to be able to set and clear.
I cannot find a good time to clear all the sockets which are set
through the BPF program. If we detach the BPF program, it will not
print of course. Does it really matter if we don't clear the
sk_tsflags_bpf?
>
> Does it also mean either the bpf or the user space can enable the timetstamping
> but not both? I don't think we can assume this also. It will be hard to deploy
> the bpf prog in production to collect continuous data. The user space may have
> some timestamping enabled but the bpf may want to do its parallel investigation
> also. The user space may rollout timestamping in the future and suddenly break
> the bpf prog.
Well, IIUC, it's also the basic idea from the current series which
allows both happening at the same time. Let us put it in a simple way,
I hope that if the app uses the SO_TIMESTAMPING feature already, then
one admin deploys the BPF program, that app should be traced both in
bpf and non-bpf ways.
But Willem doesn't agree with this approach[1] because it is hard to debug.
[1]: https://lore.kernel.org/all/670dda9437147_2e6c4029461@willemb.c.googlers.com.notmuch/
Regarding this link, I have a few more words to say: the socket
could be set through bpf_setsockopt() in different phases, unlike
setsockopt(), so in some cases we cannot make setsockopt() fail hard.
After rethinking this point more, I still prefer letting the BPF
program trace timestamping in parallel, without caring whether the
socket has enabled the SO_TIMESTAMPING feature through the setsockopt()
method. It means I would like to keep this part in patch [04/12]:
@@ -5601,6 +5636,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (static_branch_unlikely(&bpf_tstamp_control))
+ bpf_skb_tstamp_tx_output(sk, tstype);
+
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk,
tstype);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
>
> [ getting late here. will continue later. ]
Thanks for your effort :)
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 10:36 ` Jason Xing
@ 2024-10-17 0:48 ` Martin KaFai Lau
2024-10-17 2:28 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-17 0:48 UTC (permalink / raw)
To: Jason Xing
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On 10/16/24 3:36 AM, Jason Xing wrote:
>>> If the skb carries the timestamp, there are three cases:
>>> 1) non-bpf case and users uses setsockopt()
>>> 2) cmsg case
>>> 3) bpf case
These should have tests in the selftests/bpf/ sooner rather than later. (More below).
>>>
>>> #1 and #2 are already handled well before this patch. I only need to
>>> test if sk_tsflags_bpf has those flags. If so, it means we hit #3, or
>>> else it could be #1 or #2, then we will let the old way print
>>> timestamps in __skb_tstamp_tx().
>>
>> hmm... I am still not sure I fully understand...but I think I may start getting it.
>
> Sorry, my bad. I gave the wrong answer...
>
> It should be:
> Testing if if sk_tsflags has SOF_TIMESTAMPING_SOFTWARE flag should
You meant adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags()?
Before any bpf changes, if I read __skb_tstamp_tx() correctly, the current
behavior is to just queue to the sk_error_queue as long as there is
"SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it is regardless of the
sk_tsflags. This will eventually get ignored when user read it from the error
queue because the SOF_TIMESTAMPING_SOFTWARE is not set in sk_tsflags? I suspect
the user space will still read something from the error queue unless there is
SOF_TIMESTAMPING_OPT_TSONLY but it won't have the tstamp cmsg.
Adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags() will stop it
from even queuing to the error queue? I think it is ok but I am not sure if
anyone is depending on the above behavior.
> work fine. If it has the flag, we could use skb_tstamp_tx_output() to
> print based on patch [4/12]; if not, we will use
> bpf_skb_tstamp_tx_output() to print.
>
> If users use traditional ways of deploying SO_TIMESTAMPING, sk_tsflags
> always has SOF_TIMESTAMPING_SOFTWARE which is a software report flag
> (please see Documentation/networking/timestamping.rst). We can see a
> good example on how to use in
> tools/testing/selftests/net/txtimestamp.c:
> do_test()
> {
> sock_opt = SOF_TIMESTAMPING_SOFTWARE |
> ...
> if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
> (char *) &sock_opt, sizeof(sock_opt)))
> }
>
>>
>> Is it the reason that the bpf_setsockopt() cannot clear the sk_tsflags_bpf once
>> it is set in patch 2? It is not a usable api tbh. It will be a surprise to many.
>> It has to be able to set and clear.
>
> I cannot find a good time to clear all the sockets which are set
> through the BPF program. If we detach the BPF program, it will not
> print of course. Does it really matter if we don't clear the
> sk_tsflags_bpf?
Yes, it matters. It is the same reason why the existing bpf prog can clear
the tp->bpf_sock_ops_cb_flags. Yes, detaching will automatically stop taking the
timestamp. For a sockops program that stays forever, not all usages expect to do
timestamping for the whole lifetime of the connection. If there is a way for the
prog to turn it on, it should have a way for the prog to turn it off.
What is the concern of allowing the bpf prog to disable something that it has
enabled before?
While we are on bpf_sock_ops_cb_flags, the
BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG addition is mostly a dup of whatever is in
the new sk_tsflags_bpf. It is something we need to clean up later when we decide
what interface to use for bpf timestamping.
>
>>
>> Does it also mean either the bpf or the user space can enable the timetstamping
>> but not both? I don't think we can assume this also. It will be hard to deploy
>> the bpf prog in production to collect continuous data. The user space may have
>> some timestamping enabled but the bpf may want to do its parallel investigation
>> also. The user space may rollout timestamping in the future and suddenly break
>> the bpf prog.
>
> Well, IIUC, it's also the basic idea from the current series which
> allows both happening at the same time. Let us put it in a simple way,
> I hope that if the app uses the SO_TIMESTAMPING feature already, then
> one admin deploys the BPF program, that app should be traced both in
> bpf and non-bpf ways.
>
> But Willem doesn't agree about this approach[1] because of hard to debug.
>
> [1]: https://lore.kernel.org/all/670dda9437147_2e6c4029461@willemb.c.googlers.com.notmuch/
> Regarding to this link, I have a few more words to say: the socket
> could be set through bpf_setsockopt() in different phases not like
> setsockopt(), so in some cases we cannot make setsockopt hard failed.
>
> After rethinking this point more, I still reckon that letting BPF
> program trace timestamping parallelly without caring whether the
> socket is set to the SO_TIMESTAMPING feature through setsockopt()
I am afraid having both work in parallel is needed. Otherwise, it will be very
hard to deploy a bpf prog to run continuously in scale. Being able to collect
timestamp without worrying about application changes/updates/downgrades is
important, e.g. when an app changes from no timestamping to timestamping.
Please help to add selftests to show how the above cases (1), (2), (3), and
other tsflags/txflags sharing cases will work. This should not be delayed until
the discussion is done. It is needed sooner or later to prove both bpf and
non-bpf ways can work at the same time. It will help the reviewer and also help
to think about the design with a real use case in bpf prog.
The example in patch 0 only prints the reported tstamp, can you share how it
will be used to investigate issue? Is it also useful to know when the skb is
written to the kernel during sendmsg()?
Regarding bpf_setsockopt() being called in different phases,
bpf_setsockopt() is not limited to sockops program. e.g. it can also be called
from a bpf-tcp-cc (congestion control). Not a tcp-cc expert but I won't be
surprised people will try to trigger some on-and-off timestamping from
bpf-tcp-cc to measure some delay.
More about bpf_setsockopt() in different phases: I understand that UDP is not your
priority. However, it needs to have some clarity on how UDP will work and how to
enable it. UDP usually has no connect/established phase.
Regarding the SOF_TIMESTAMPING_* support, can you list out what else you are
planning to support in the future. You mentioned the SOF_TIMESTAMPING_TX_ACK in
another thread. What else?
> method. It means I would like to keep this part in patch [04/12]:
> @@ -5601,6 +5636,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
> if (!sk)
> return;
>
> + if (static_branch_unlikely(&bpf_tstamp_control))
> + bpf_skb_tstamp_tx_output(sk, tstype);
> +
> skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk,
> tstype);
> }
> EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
>
>>
>> [ getting late here. will continue later. ]
>
> Thanks for your effort :)
>
> Thanks,
> Jason
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-17 0:48 ` Martin KaFai Lau
@ 2024-10-17 2:28 ` Jason Xing
2024-10-17 20:43 ` Martin KaFai Lau
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-17 2:28 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On Thu, Oct 17, 2024 at 8:48 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/16/24 3:36 AM, Jason Xing wrote:
> >>> If the skb carries the timestamp, there are three cases:
> >>> 1) non-bpf case and users uses setsockopt()
> >>> 2) cmsg case
> >>> 3) bpf case
>
> These should have tests in the selftests/bpf/ sooner than later. (More below).
>
> >>>
> >>> #1 and #2 are already handled well before this patch. I only need to
> >>> test if sk_tsflags_bpf has those flags. If so, it means we hit #3, or
> >>> else it could be #1 or #2, then we will let the old way print
> >>> timestamps in __skb_tstamp_tx().
> >>
> >> hmm... I am still not sure I fully understand...but I think I may start getting it.
> >
> > Sorry, my bad. I gave the wrong answer...
> >
> > It should be:
> > Testing if if sk_tsflags has SOF_TIMESTAMPING_SOFTWARE flag should
>
> You meant adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags()?
Yep.
>
> Before any bpf changes, if I read __skb_tstamp_tx() correctly, the current
> behavior is to just queue to the sk_error_queue as long as there is
> "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it is regardless of the
> sk_tsflags. This will eventually get ignored when user read it from the error
> queue because the SOF_TIMESTAMPING_SOFTWARE is not set in sk_tsflags?
Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
SOF_TIMESTAMPING_TX_* are generation flags. Without the former, users can
read the skb from the errqueue but are not able to parse the
timestamps. Please see
tcp_recvmsg()->inet_recv_error()->ip_recv_error()->sock_recv_timestamp()->__sock_recv_timestamp():
if ((tsflags & SOF_TIMESTAMPING_SOFTWARE...
ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
> I suspect
> the user space will still read something from the error queue unless there is
> SOF_TIMESTAMPING_OPT_TSONLY but it won't have the tstamp cmsg.
No, please see above.
>
> Adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags() will stop it
> from even queuing to the error queue? I think it is ok but I am not sure if
> anyone is depending on the above behavior.
SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
features including cmsg mode. But it will not be used in bpf mode. So
the test statement is enough to divide those three cases into two
groups.
>
> > work fine. If it has the flag, we could use skb_tstamp_tx_output() to
> > print based on patch [4/12]; if not, we will use
> > bpf_skb_tstamp_tx_output() to print.
> >
> > If users use traditional ways of deploying SO_TIMESTAMPING, sk_tsflags
> > always has SOF_TIMESTAMPING_SOFTWARE which is a software report flag
> > (please see Documentation/networking/timestamping.rst). We can see a
> > good example on how to use in
> > tools/testing/selftests/net/txtimestamp.c:
> > do_test()
> > {
> > sock_opt = SOF_TIMESTAMPING_SOFTWARE |
> > ...
> > if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
> > (char *) &sock_opt, sizeof(sock_opt)))
> > }
> >
> >>
> >> Is it the reason that the bpf_setsockopt() cannot clear the sk_tsflags_bpf once
> >> it is set in patch 2? It is not a usable api tbh. It will be a surprise to many.
> >> It has to be able to set and clear.
> >
> > I cannot find a good time to clear all the sockets which are set
> > through the BPF program. If we detach the BPF program, it will not
> > print of course. Does it really matter if we don't clear the
> > sk_tsflags_bpf?
>
> Yes, it matters. The same reason goes for why the existing bpf prog can clear
> the tp->bpf_sock_ops_cb_flags. Yes, detach will automatically not taking the
> timestamp. For sockops program that stays forever, not all usages expect to do
> timestamping for the whole lifetime of the connection. If there is a way for the
> prog to turn it on, it should have a way for the prog to turn it off.
I see what you meant here. If we don't clear sk_tsflags_bpf, after we
detach the bpf program, it will do nothing in __skb_tstamp_tx() and
return early. It is almost equivalent to turning it off. That is
why I don't handle clearing the flag.
>
> What is the concern of allowing the bpf prog to disable something that it has
> enabled before?
Let me give one instance:
If one socket is established and stays idle, how can the bpf program
clear the tsflags from that socket? I have no idea.
>
> While we are on bpf_sock_ops_cb_flags, the
> BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG addition is mostly a dup of whatever in
> the new sk_tsflags_bpf. It is something we need to clean up later when we decide
> what interface to use for bpf timestamping.
I'm not sure if I understand correctly. I mimicked the use of
BPF_SOCK_OPS_RTO_CB_FLAG. Do you mean we can remove the use of
bpf_sock_ops_cb_flags_set() in BPF program?
>
> >
> >>
> >> Does it also mean either the bpf or the user space can enable the timetstamping
> >> but not both? I don't think we can assume this also. It will be hard to deploy
> >> the bpf prog in production to collect continuous data. The user space may have
> >> some timestamping enabled but the bpf may want to do its parallel investigation
> >> also. The user space may rollout timestamping in the future and suddenly break
> >> the bpf prog.
> >
> > Well, IIUC, it's also the basic idea from the current series which
> > allows both happening at the same time. Let us put it in a simple way,
> > I hope that if the app uses the SO_TIMESTAMPING feature already, then
> > one admin deploys the BPF program, that app should be traced both in
> > bpf and non-bpf ways.
> >
> > But Willem doesn't agree about this approach[1] because of hard to debug.
> >
> > [1]: https://lore.kernel.org/all/670dda9437147_2e6c4029461@willemb.c.googlers.com.notmuch/
> > Regarding to this link, I have a few more words to say: the socket
> > could be set through bpf_setsockopt() in different phases not like
> > setsockopt(), so in some cases we cannot make setsockopt hard failed.
> >
> > After rethinking this point more, I still reckon that letting BPF
> > program trace timestamping parallelly without caring whether the
> > socket is set to the SO_TIMESTAMPING feature through setsockopt()
>
> I am afraid having both work in parallel is needed. Otherwise, it will be very
> hard to deploy a bpf prog to run continuously in scale. Being able to collect
> timestamp without worrying about application changes/updates/downgrades is
> important. e.g. App changes from no time stamping to time stamping
Sorry, I didn't make myself clear. Yesterday, I said I agreed with you
:) So let me keep the current logic of printing (see the
__skb_tstamp_tx() function in patch [04/12]) in the next version. Then
I don't need to add some test statements to distinguish which way of
printing.
>
> Please help to add selftests to show how the above cases (1), (2), (3), and
> other tsflags/txflags sharing cases will work. This should not be delayed until
> the discussion is done. It is needed sooner or later to prove both bpf and
> non-bpf ways can work at the same time. It will help the reviewer and also help
> to think about the design with a real use case in bpf prog.
Got it. But I'm not sure where I should put those test cases? Could
you help me point out a good example that I can follow?
>
> The example in patch 0 only prints the reported tstamp, can you share how it
> will be used to investigate issue?
No problem. Please see chapter 3 about "goal" in
https://netdev.bots.linux.dev/netconf/2024/jason_xing.pdf.
> Is it also useful to know when the skb is
> written to the kernel during sendmsg()?
You are right. Before this patch, normally applications will record an
accurate timestamp before calling sendmsg().
After you remind me of this, I feel that we can add the timestamp
print in the future for bpf use.
>
> Regarding the bpf_setsockopt() can be called in different phase,
> bpf_setsockopt() is not limited to sockops program. e.g. it can also be called
> from a bpf-tcp-cc (congestion control). Not a tcp-cc expert but I won't be
> surprised people will try to trigger some on-and-off timestamping from
> bpf-tcp-cc to measure some delay.
>
>
> More about bpf_setsockopt() in different phase, understand that UDP is not your
> priority. However, it needs to have some clarity on how UDP will work and how to
> enable it. UDP usually has no connect/established phase.
For now, I don't expect an extension for UDP because it will bring too
much extra work. Could we discuss this later? I mean, after we finish
the basic bpf extension :)
>
> Regarding the SOF_TIMESTAMPING_* support, can you list out what else you are
> planning to support in the future. You mentioned the SOF_TIMESTAMPING_TX_ACK in
> another thread. What else?
In this patch series, I support
SOF_TIMESTAMPING_TX_SCHED|SOF_TIMESTAMPING_TX_ACK|SOF_TIMESTAMPING_TX_SOFTWARE,
which you've already noticed from the BPF example in patch [0/12].
They all come from the original design of SO_TIMESTAMPING feature.
The question you proposed is what I am willing to implement in the
future, like adding one hook in tcp_write_xmit()? It's part of my
plans to extend in the future, not to be included in this series.
Thanks for your review.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-17 2:28 ` Jason Xing
@ 2024-10-17 20:43 ` Martin KaFai Lau
2024-10-18 2:52 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-17 20:43 UTC (permalink / raw)
To: Jason Xing
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On 10/16/24 7:28 PM, Jason Xing wrote:
> On Thu, Oct 17, 2024 at 8:48 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 10/16/24 3:36 AM, Jason Xing wrote:
>>>>> If the skb carries the timestamp, there are three cases:
>>>>> 1) non-bpf case and users uses setsockopt()
>>>>> 2) cmsg case
>>>>> 3) bpf case
>>
>> These should have tests in the selftests/bpf/ sooner than later. (More below).
>>
>>>>>
>>>>> #1 and #2 are already handled well before this patch. I only need to
>>>>> test if sk_tsflags_bpf has those flags. If so, it means we hit #3, or
>>>>> else it could be #1 or #2, then we will let the old way print
>>>>> timestamps in __skb_tstamp_tx().
>>>>
>>>> hmm... I am still not sure I fully understand...but I think I may start getting it.
>>>
>>> Sorry, my bad. I gave the wrong answer...
>>>
>>> It should be:
>>> Testing if if sk_tsflags has SOF_TIMESTAMPING_SOFTWARE flag should
>>
>> You meant adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags()?
>
> Yep.
>
>>
>> Before any bpf changes, if I read __skb_tstamp_tx() correctly, the current
>> behavior is to just queue to the sk_error_queue as long as there is
>> "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it is regardless of the
>> sk_tsflags. This will eventually get ignored when user read it from the error
>> queue because the SOF_TIMESTAMPING_SOFTWARE is not set in sk_tsflags?
>
> Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> read the skb from the errqueue but are not able to parse the
> timestamps. Please see
> tcp_recvmsg()->inet_recv_error()->ip_recv_error()->sock_recv_timestamp()->__sock_recv_timestamp():
> if ((tsflags & SOF_TIMESTAMPING_SOFTWARE...
> ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
afaict, __sock_recv_timestamp does not put the timestamp cmsg but ip_recv_error
still returns the skb to the user.
I suspect we are talking the same thing. When SOF_TIMESTAMPING_SOFTWARE is not
set in sk and SOF_TIMESTAMPING_TX_* is set in cmsg, the existing (aka
traditional) way is that the generated skb will still be queued in the error
queue. The user space can still read it but just won't have the timestamp cmsg.
If I understand how the v3 may look like, the skb will not be queued in the
error queue at all because the sk has no SOF_TIMESTAMPING_SOFTWARE. The user
space won't get it from the error queue which is a change of behavior. I was
saying I am fine but not sure if someone depends on this behavior.
I think we have started talking past each other on this. I will wait for the code on
this part and the selftest first.
>
>> I suspect
>> the user space will still read something from the error queue unless there is
>> SOF_TIMESTAMPING_OPT_TSONLY but it won't have the tstamp cmsg.
>
> No, please see above.
>
>>
>> Adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags() will stop it
>> from even queuing to the error queue? I think it is ok but I am not sure if
>> anyone is depending on the above behavior.
>
> SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
Got it. This part is now understood.
It is one of the reasons for my earlier question on which SOF_* that bpf needs
to support. I want to simplify the naming part of the SOF_* in bpf_setsockopt but
let's leave these nits for a little later.
However, it will be very useful to highlight which SOF_* will never be used in
bpf in v3.
> features including cmsg mode. But it will not be used in bpf mode. So
> the test statement is enough to divided those three cases into two
> groups.
>
>>
>>> work fine. If it has the flag, we could use skb_tstamp_tx_output() to
>>> print based on patch [4/12]; if not, we will use
>>> bpf_skb_tstamp_tx_output() to print.
>>>
>>> If users use traditional ways of deploying SO_TIMESTAMPING, sk_tsflags
>>> always has SOF_TIMESTAMPING_SOFTWARE which is a software report flag
>>> (please see Documentation/networking/timestamping.rst). We can see a
>>> good example on how to use in
>>> tools/testing/selftests/net/txtimestamp.c:
>>> do_test()
>>> {
>>> sock_opt = SOF_TIMESTAMPING_SOFTWARE |
>>> ...
>>> if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
>>> (char *) &sock_opt, sizeof(sock_opt)))
>>> }
>>>
>>>>
>>>> Is it the reason that the bpf_setsockopt() cannot clear the sk_tsflags_bpf once
>>>> it is set in patch 2? It is not a usable api tbh. It will be a surprise to many.
>>>> It has to be able to set and clear.
>>>
>>> I cannot find a good time to clear all the sockets which are set
>>> through the BPF program. If we detach the BPF program, it will not
>>> print of course. Does it really matter if we don't clear the
>>> sk_tsflags_bpf?
>>
>> Yes, it matters. The same reason goes for why the existing bpf prog can clear
>> the tp->bpf_sock_ops_cb_flags. Yes, detach will automatically not taking the
>> timestamp. For sockops program that stays forever, not all usages expect to do
>> timestamping for the whole lifetime of the connection. If there is a way for the
>> prog to turn it on, it should have a way for the prog to turn it off.
>
> I see what you meant here. If we don't clear sk_tsflags_bpf, after we
> detach the bpf program, it will do nothing in __skb_tstamp_tx() and
> return earlier. It is almost equal to the effect of turning off. It is
> why I don't handle clearing the flag.
>
>>
>> What is the concern of allowing the bpf prog to disable something that it has
>> enabled before?
>
> Let me give one instance:
> If one socket is established and stays idle, how can the bpf program
> clear the tsflags from that socket? I have no idea.
bpf_tcp_iter prog can. That said, the idle connection example is getting carried away
as an excuse for bpf_setsockopt not needing to support turning it off. Sure,
an idle connection is as good as off, and yes, detach is as good as off also.
I am now acting as a broken clock repeating myself that not all use cases run
for 5 mins and then detach, so I need to be specific here that bpf_setsockopt
not supporting off is a nack. There are many use cases in production that the
bpf prog runs forever and wants to turn it on-and-off.
Again, the bpf sockops prog is not the only one that can call bpf_setsockopt(). bpf-tcp-cc
that runs forever can also bpf_setsockopt to ask the sockops bpf prog to do
periodic timestamping when needed. bpf_tcp_iter can also bpf_setsockopt to turn
it off if needed.
I am not asking to clear the sk_tsflags_bpf when the bpf prog is detached. I am
asking to support clearing the sk_tsflags_bpf in bpf_setsockopt().
I have yet to hear a technical reason why bpf_setsockopt cannot clear the bits.
>
>>
>> While we are on bpf_sock_ops_cb_flags, the
>> BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG addition is mostly a dup of whatever in
>> the new sk_tsflags_bpf. It is something we need to clean up later when we decide
>> what interface to use for bpf timestamping.
>
> I'm not sure if I understand correctly. I mimicked the use of
> BPF_SOCK_OPS_RTO_CB_FLAG. Do you mean we can remove the use of
> bpf_sock_ops_cb_flags_set() in BPF program?
In patch 5, I meant the BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG is redundant.
It is as good as testing and setting sk_tsflags_bpf alone.
This could be some cleanup for the later stage of the set.
> >>
>>>
>>>>
>>>> Does it also mean either the bpf or the user space can enable the timetstamping
>>>> but not both? I don't think we can assume this also. It will be hard to deploy
>>>> the bpf prog in production to collect continuous data. The user space may have
>>>> some timestamping enabled but the bpf may want to do its parallel investigation
>>>> also. The user space may rollout timestamping in the future and suddenly break
>>>> the bpf prog.
>>>
>>> Well, IIUC, it's also the basic idea from the current series which
>>> allows both happening at the same time. Let us put it in a simple way,
>>> I hope that if the app uses the SO_TIMESTAMPING feature already, then
>>> one admin deploys the BPF program, that app should be traced both in
>>> bpf and non-bpf ways.
>>>
>>> But Willem doesn't agree about this approach[1] because of hard to debug.
>>>
>>> [1]: https://lore.kernel.org/all/670dda9437147_2e6c4029461@willemb.c.googlers.com.notmuch/
>>> Regarding to this link, I have a few more words to say: the socket
>>> could be set through bpf_setsockopt() in different phases not like
>>> setsockopt(), so in some cases we cannot make setsockopt hard failed.
>>>
>>> After rethinking this point more, I still reckon that letting BPF
>>> program trace timestamping parallelly without caring whether the
>>> socket is set to the SO_TIMESTAMPING feature through setsockopt()
>>
>> I am afraid having both work in parallel is needed. Otherwise, it will be very
>> hard to deploy a bpf prog to run continuously in scale. Being able to collect
>> timestamp without worrying about application changes/updates/downgrades is
>> important. e.g. App changes from no time stamping to time stamping
>
> Sorry, I didn't make myself clear. Yesterday, I said I agreed with you
> :) So let me keep the current logic of printing (see the
> __skb_tstamp_tx() function in patch [04/12]) in the next version. Then
> I don't need to add some test statements to distinguish which way of
> printing.
>
>>
>> Please help to add selftests to show how the above cases (1), (2), (3), and
>> other tsflags/txflags sharing cases will work. This should not be delayed until
>> the discussion is done. It is needed sooner or later to prove both bpf and
>> non-bpf ways can work at the same time. It will help the reviewer and also help
>> to think about the design with a real use case in bpf prog.
>
> Got it. But I'm not sure where I should put those test cases? Could
> you help me point out a good example that I can follow?
Have you looked at the selftests/bpf directory?
prog_tests/setget_sockopt.c may be something closer to what you need.
There is a recent one in the mailing list also:
https://lore.kernel.org/all/20241016-syncookie-v1-0-3b7a0de12153@bootlin.com/
The expectation is to be able to run the test like this: ./test_progs -t
setget_sockopt
>
>>
>> The example in patch 0 only prints the reported tstamp, can you share how it
>> will be used to investigate issue?
>
> No problem. Please see chapter 3 about "goal" in
> https://netdev.bots.linux.dev/netconf/2024/jason_xing.pdf.
Thanks.
>
>> Is it also useful to know when the skb is
>> written to the kernel during sendmsg()?
>
> You are right. Before this patch, normally applications will record an
> accurate timestamp before do sendmsg().
>
> After you remind me of this, I feel that we can add the timestamp
> print in the future for bpf use.
Yes, please add the sendmsg timestamp capturing in the selftest. It is useful.
>
>>
>> Regarding the bpf_setsockopt() can be called in different phase,
>> bpf_setsockopt() is not limited to sockops program. e.g. it can also be called
>> from a bpf-tcp-cc (congestion control). Not a tcp-cc expert but I won't be
>> surprised people will try to trigger some on-and-off timestamping from
>> bpf-tcp-cc to measure some delay.
>>
>>
>> More about bpf_setsockopt() in different phase, understand that UDP is not your
>> priority. However, it needs to have some clarity on how UDP will work and how to
>> enable it. UDP usually has no connect/established phase.
>
> For now, I don't expect an extension for UDP because it will bring too
> much extra work. Could we discuss this later? I mean, after we finish
> the basic bpf extension :)
Later is fine but before this set lands. I am not asking for a full UDP
implementation but need ideas on what that may look like. We need some clarity on
how UDP will work and also how much new sockops API extension will be needed to
decide if sockops is the correct one going forward. I don't want to end up with tcp
in sockops and UDP (and others) outside sockops.
That said, the current priority is to get bpf prog and user space work without
stepping on each other first.
>
>>
>> Regarding the SOF_TIMESTAMPING_* support, can you list out what else you are
>> planning to support in the future. You mentioned the SOF_TIMESTAMPING_TX_ACK in
>> another thread. What else?
>
> In this patch series, I support
> SOF_TIMESTAMPING_TX_SCHED|SOF_TIMESTAMPING_TX_ACK|SOF_TIMESTAMPING_TX_SOFTWARE,
> which you've already noticed from the BPF example in patch [0/12].
> They all come from the original design of SO_TIMESTAMPING feature.
>
> The question you proposed is what I am willing to implement in the
> future, like adding one hook in tcp_write_xmit()? It's part of my
> plans to extend in the future, not be included in this series.
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-17 20:43 ` Martin KaFai Lau
@ 2024-10-18 2:52 ` Jason Xing
2024-10-18 3:05 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-18 2:52 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On Fri, Oct 18, 2024 at 4:43 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/16/24 7:28 PM, Jason Xing wrote:
> > On Thu, Oct 17, 2024 at 8:48 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 10/16/24 3:36 AM, Jason Xing wrote:
> >>>>> If the skb carries the timestamp, there are three cases:
> >>>>> 1) non-bpf case and users uses setsockopt()
> >>>>> 2) cmsg case
> >>>>> 3) bpf case
> >>
> >> These should have tests in the selftests/bpf/ sooner than later. (More below).
> >>
> >>>>>
> >>>>> #1 and #2 are already handled well before this patch. I only need to
> >>>>> test if sk_tsflags_bpf has those flags. If so, it means we hit #3, or
> >>>>> else it could be #1 or #2, then we will let the old way print
> >>>>> timestamps in __skb_tstamp_tx().
> >>>>
> >>>> hmm... I am still not sure I fully understand...but I think I may start getting it.
> >>>
> >>> Sorry, my bad. I gave the wrong answer...
> >>>
> >>> It should be:
> >>> Testing if if sk_tsflags has SOF_TIMESTAMPING_SOFTWARE flag should
> >>
> >> You meant adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags()?
> >
> > Yep.
> >
> >>
> >> Before any bpf changes, if I read __skb_tstamp_tx() correctly, the current
> >> behavior is to just queue to the sk_error_queue as long as there is
> >> "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it is regardless of the
> >> sk_tsflags. This will eventually get ignored when user read it from the error
> >> queue because the SOF_TIMESTAMPING_SOFTWARE is not set in sk_tsflags?
> >
> > Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> > SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> > read the skb from the errqueue but are not able to parse the
> > timestamps. Please see
> > tcp_recvmsg()->inet_recv_error()->ip_recv_error()->sock_recv_timestamp()->__sock_recv_timestamp():
> > if ((tsflags & SOF_TIMESTAMPING_SOFTWARE...
> > ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
>
> afaict, __sock_recv_timestamp does not put the timestamp cmsg but ip_recv_error
> still returns the skb to the user.
>
> I suspect we are talking the same thing. When SOF_TIMESTAMPING_SOFTWARE is not
> set in sk and SOF_TIMESTAMPING_TX_* is set in cmsg, the existing (aka
> traditional) way is that the generated skb will still be queued in the error
> queue. The user space can still read it but just won't have the timestamp cmsg.
Apparently, we're on the same page. What you were saying is right, of course :)
>
> If I understand how the v3 may look like, the skb will not be queued in the
> error queue at all because the sk has no SOF_TIMESTAMPING_SOFTWARE. The user
Right, the skb will not even be cloned or generated, let alone queued in
the error queue. For the bpf extension, preventing the skb from being
queued in the error queue is vital.
> space won't get it from the error queue which is a change of behavior. I was
> saying I am fine but not sure if someone depends on this behavior.
For the bpf part, it's okay to bypass the logic that queues the skb into
the errqueue, which was also discussed with Willem at netconf before this
series.
For the non-bpf part, I will not touch or modify the prior behaviour.
>
> I think we start talking pass each other on this. I will wait for the code on
> this part and the selftest first.
I will keep this code snippet in V3 so that the three kinds of printing
can work in parallel:
@@ -5601,6 +5636,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (static_branch_unlikely(&bpf_tstamp_control))
+ bpf_skb_tstamp_tx_output(sk, tstype);
+
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
}
So please forget what we've talked about testing the
SOF_TIMESTAMPING_SOFTWARE flag thing.
>
> >
> >> I suspect
> >> the user space will still read something from the error queue unless there is
> >> SOF_TIMESTAMPING_OPT_TSONLY but it won't have the tstamp cmsg.
> >
> > No, please see above.
> >
> >>
> >> Adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags() will stop it
> >> from even queuing to the error queue? I think it is ok but I am not sure if
> >> anyone is depending on the above behavior.
> >
> > SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
>
> Got it. This part is now understood.
>
> It is one of the reasons for my earlier question on which SOF_* that bpf needs
> to support. I want to simplify the naming part of the SOF_* in bpf_sesockopt but
> lets leave these nits for a little later.
>
> However, it will be very useful to highlight which SOF_* will never be used in
> bpf in v3.
Got it. Will do that.
>
> > features including cmsg mode. But it will not be used in bpf mode. So
> > the test statement is enough to divided those three cases into two
> > groups.
>
> >
> >>
> >>> work fine. If it has the flag, we could use skb_tstamp_tx_output() to
> >>> print based on patch [4/12]; if not, we will use
> >>> bpf_skb_tstamp_tx_output() to print.
> >>>
> >>> If users use traditional ways of deploying SO_TIMESTAMPING, sk_tsflags
> >>> always has SOF_TIMESTAMPING_SOFTWARE which is a software report flag
> >>> (please see Documentation/networking/timestamping.rst). We can see a
> >>> good example on how to use in
> >>> tools/testing/selftests/net/txtimestamp.c:
> >>> do_test()
> >>> {
> >>> sock_opt = SOF_TIMESTAMPING_SOFTWARE |
> >>> ...
> >>> if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
> >>> (char *) &sock_opt, sizeof(sock_opt)))
> >>> }
> >>>
> >>>>
> >>>> Is it the reason that the bpf_setsockopt() cannot clear the sk_tsflags_bpf once
> >>>> it is set in patch 2? It is not a usable api tbh. It will be a surprise to many.
> >>>> It has to be able to set and clear.
> >>>
> >>> I cannot find a good time to clear all the sockets which are set
> >>> through the BPF program. If we detach the BPF program, it will not
> >>> print of course. Does it really matter if we don't clear the
> >>> sk_tsflags_bpf?
> >>
> >> Yes, it matters. The same reason goes for why the existing bpf prog can clear
> >> the tp->bpf_sock_ops_cb_flags. Yes, detach will automatically not taking the
> >> timestamp. For sockops program that stays forever, not all usages expect to do
> >> timestamping for the whole lifetime of the connection. If there is a way for the
> >> prog to turn it on, it should have a way for the prog to turn it off.
> >
> > I see what you meant here. If we don't clear sk_tsflags_bpf, after we
> > detach the bpf program, it will do nothing in __skb_tstamp_tx() and
> > return earlier. It is almost equal to the effect of turning off. It is
> > why I don't handle clearing the flag.
> >
> >>
> >> What is the concern of allowing the bpf prog to disable something that it has
> >> enabled before?
> >
> > Let me give one instance:
> > If one socket is established and stays idle, how can the bpf program
> > clear the tsflags from that socket? I have no idea.
>
> bpf_tcp_iter prog can. That said, the idle connection example is too carry away
> as an excuse that bpf_setsockopt does not need to support turning-off. Sure,
> idle connection is as good as off. and yes, detach is as good as off also.
Thanks, I see.
>
> I am now acting as a broken clock repeating myself that not all use cases run
> for 5 mins and then detach, so I need to be specific here that bpf_setsockopt
> not supporting off is a nack. There are many use cases in production that the
> bpf prog runs forever and wants to turn it on-and-off.
>
> Again, bpf sockops prog is not the only one can bpf_setsockopt(). bpf-tcp-cc
> that runs forever can also bpf_setsockopt to ask the sockops bpf prog to do
> periodic timestamping when needed. bpf_tcp_iter can also bpf_setsockopt to turn
> it off if needed.
I'm not insisting on not supporting this. I was just curious why. Now I get it.
>
> I am not asking to clear the sk_tsflags_bpf when the bpf prog is detached. I am
> asking to support clearing the sk_tsflags_bpf in bpf_setsockopt().
Yeah, I know that.
>
> I have still yet heard a technical reason why bpf_setsockopt cannot clear the bits.
It's easy for me to support the function clearing the sk_tsflags_bpf
in the bpf_setsockopt() function. Will do that :)
>
> >
> >>
> >> While we are on bpf_sock_ops_cb_flags, the
> >> BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG addition is mostly a dup of whatever in
> >> the new sk_tsflags_bpf. It is something we need to clean up later when we decide
> >> what interface to use for bpf timestamping.
> >
> > I'm not sure if I understand correctly. I mimicked the use of
> > BPF_SOCK_OPS_RTO_CB_FLAG. Do you mean we can remove the use of
> > bpf_sock_ops_cb_flags_set() in BPF program?
>
> In patch 5, I meant the BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG is redundant.
> It is as good as testing and setting sk_tsflags_bpf alone.
I will remove it, then the code will be simplified.
>
> This could be some cleanup for the later stage of the set.
>
> > >>
> >>>
> >>>>
> >>>> Does it also mean either the bpf or the user space can enable the timetstamping
> >>>> but not both? I don't think we can assume this also. It will be hard to deploy
> >>>> the bpf prog in production to collect continuous data. The user space may have
> >>>> some timestamping enabled but the bpf may want to do its parallel investigation
> >>>> also. The user space may rollout timestamping in the future and suddenly break
> >>>> the bpf prog.
> >>>
> >>> Well, IIUC, it's also the basic idea from the current series which
> >>> allows both happening at the same time. Let us put it in a simple way,
> >>> I hope that if the app uses the SO_TIMESTAMPING feature already, then
> >>> one admin deploys the BPF program, that app should be traced both in
> >>> bpf and non-bpf ways.
> >>>
> >>> But Willem doesn't agree about this approach[1] because of hard to debug.
> >>>
> >>> [1]: https://lore.kernel.org/all/670dda9437147_2e6c4029461@willemb.c.googlers.com.notmuch/
> >>> Regarding to this link, I have a few more words to say: the socket
> >>> could be set through bpf_setsockopt() in different phases not like
> >>> setsockopt(), so in some cases we cannot make setsockopt hard failed.
> >>>
> >>> After rethinking this point more, I still reckon that letting BPF
> >>> program trace timestamping parallelly without caring whether the
> >>> socket is set to the SO_TIMESTAMPING feature through setsockopt()
> >>
> >> I am afraid having both work in parallel is needed. Otherwise, it will be very
> >> hard to deploy a bpf prog to run continuously in scale. Being able to collect
> >> timestamp without worrying about application changes/updates/downgrades is
> >> important. e.g. App changes from no time stamping to time stamping
> >
> > Sorry, I didn't make myself clear. Yesterday, I said I agreed with you
> > :) So let me keep the current logic of printing (see the
> > __skb_tstamp_tx() function in patch [04/12]) in the next version. Then
> > I don't need to add some test statements to distinguish which way of
> > printing.
> >
> >>
> >> Please help to add selftests to show how the above cases (1), (2), (3), and
> >> other tsflags/txflags sharing cases will work. This should not be delayed until
> >> the discussion is done. It is needed sooner or later to prove both bpf and
> >> non-bpf ways can work at the same time. It will help the reviewer and also help
> >> to think about the design with a real use case in bpf prog.
> >
> > Got it. But I'm not sure where I should put those test cases? Could
> > you help me point out a good example that I can follow?
>
> Have you looked at the selftests/bpf directory?
Sure, the full bpf program was written based on the examples in
selftests/bpf. But I remembered that selftests/bpf is already
deprecated?
>
> prog_tests/setget_sockopt.c may be something closer to what you need.
>
> There is a recent one in the mailing list also:
>
> https://lore.kernel.org/all/20241016-syncookie-v1-0-3b7a0de12153@bootlin.com/
>
> The expectation is to be able to run the test like this: ./test_progs -t
> setget_sockopt
Thanks for the pointer. I will spend some time studying it.
>
> >
> >>
> >> The example in patch 0 only prints the reported tstamp, can you share how it
> >> will be used to investigate issue?
> >
> > No problem. Please see chapter 3 about "goal" in
> > https://netdev.bots.linux.dev/netconf/2024/jason_xing.pdf.
>
> Thanks.
>
> >
> >> Is it also useful to know when the skb is
> >> written to the kernel during sendmsg()?
> >
> > You are right. Before this patch, normally applications will record an
> > accurate timestamp before do sendmsg().
> >
> > After you remind me of this, I feel that we can add the timestamp
> > print in the future for bpf use.
>
> Yes, please add the sendmsg timestamp capturing in the selftest. It is useful.
>
> >
> >>
> >> Regarding the bpf_setsockopt() can be called in different phase,
> >> bpf_setsockopt() is not limited to sockops program. e.g. it can also be called
> >> from a bpf-tcp-cc (congestion control). Not a tcp-cc expert but I won't be
> >> surprised people will try to trigger some on-and-off timestamping from
> >> bpf-tcp-cc to measure some delay.
> >>
> >>
> >> More about bpf_setsockopt() in different phase, understand that UDP is not your
> >> priority. However, it needs to have some clarity on how UDP will work and how to
> >> enable it. UDP usually has no connect/established phase.
> >
> > For now, I don't expect an extension for UDP because it will bring too
> > much extra work. Could we discuss this later? I mean, after we finish
> > the basic bpf extension :)
>
> Later is fine but before this set lands. I am not asking a full UDP
> implementation but need ideas on how that may look like. We need some clarity on
> how UDP will work and also how much new sockops API extension will be needed to
> decide if sockops is the correct one going forward. I don't want to end up tcp
> is in sockops and UDP (and others) is non sockops.
I see.
After removing the use of "BPF_SOCK_OPS_TEST_FLAG(tp,
BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG)", the bpf extension will be
limited only in TCP, I think.
Using sk->sk_tsflags_bpf as an indicator could work for both protocols.
Thanks for your great help :)
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-18 2:52 ` Jason Xing
@ 2024-10-18 3:05 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-18 3:05 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Jakub Sitnicki, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
bpf, netdev, Jason Xing
On Fri, Oct 18, 2024 at 10:52 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Fri, Oct 18, 2024 at 4:43 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 10/16/24 7:28 PM, Jason Xing wrote:
> > > On Thu, Oct 17, 2024 at 8:48 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >>
> > >> On 10/16/24 3:36 AM, Jason Xing wrote:
> > >>>>> If the skb carries the timestamp, there are three cases:
> > >>>>> 1) non-bpf case and users uses setsockopt()
> > >>>>> 2) cmsg case
> > >>>>> 3) bpf case
> > >>
> > >> These should have tests in the selftests/bpf/ sooner than later. (More below).
> > >>
> > >>>>>
> > >>>>> #1 and #2 are already handled well before this patch. I only need to
> > >>>>> test if sk_tsflags_bpf has those flags. If so, it means we hit #3, or
> > >>>>> else it could be #1 or #2, then we will let the old way print
> > >>>>> timestamps in __skb_tstamp_tx().
> > >>>>
> > >>>> hmm... I am still not sure I fully understand...but I think I may start getting it.
> > >>>
> > >>> Sorry, my bad. I gave the wrong answer...
> > >>>
> > >>> It should be:
> > >>> Testing if if sk_tsflags has SOF_TIMESTAMPING_SOFTWARE flag should
> > >>
> > >> You meant adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags()?
> > >
> > > Yep.
> > >
> > >>
> > >> Before any bpf changes, if I read __skb_tstamp_tx() correctly, the current
> > >> behavior is to just queue to the sk_error_queue as long as there is
> > >> "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it is regardless of the
> > >> sk_tsflags. This will eventually get ignored when user read it from the error
> > >> queue because the SOF_TIMESTAMPING_SOFTWARE is not set in sk_tsflags?
> > >
> > > Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> > > SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> > > read the skb from the errqueue but are not able to parse the
> > > timestamps. Please see
> > > tcp_recvmsg()->inet_recv_error()->ip_recv_error()->sock_recv_timestamp()->__sock_recv_timestamp():
> > > if ((tsflags & SOF_TIMESTAMPING_SOFTWARE...
> > > ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
> >
> > afaict, __sock_recv_timestamp does not put the timestamp cmsg but ip_recv_error
> > still returns the skb to the user.
> >
> > I suspect we are talking the same thing. When SOF_TIMESTAMPING_SOFTWARE is not
> > set in sk and SOF_TIMESTAMPING_TX_* is set in cmsg, the existing (aka
> > traditional) way is that the generated skb will still be queued in the error
> > queue. The user space can still read it but just won't have the timestamp cmsg.
>
> Apparently, we're on the same page. What you were saying is right, of course :)
>
> >
> > If I understand how the v3 may look like, the skb will not be queued in the
> > error queue at all because the sk has no SOF_TIMESTAMPING_SOFTWARE. The user
>
> Right, skb will not even be cloned or generated, let alone queue it in
> the error queue. For bpf extension, preventing skb to be queued in the
> error queue is a very vital thing.
>
> > space won't get it from the error queue which is a change of behavior. I was
> > saying I am fine but not sure if someone depends on this behavior.
>
> For bpf part, it's okay to bypass the queuing skb into the errqueue
> logic, which has been discussed at netconf before this series with
> Willem also.
> For non-bpf part, I will not touch and modify their prior behaviour.
>
> >
> > I think we start talking pass each other on this. I will wait for the code on
> > this part and the selftest first.
>
> I will keep this code snippets in V3 so that three kinds of printing
> could work parallelly:
> @@ -5601,6 +5636,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
> if (!sk)
> return;
>
> + if (static_branch_unlikely(&bpf_tstamp_control))
> + bpf_skb_tstamp_tx_output(sk, tstype);
> +
> skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
> }
>
> So please forget what we've talked about testing the
> SOF_TIMESTAMPING_SOFTWARE flag thing.
>
> >
> > >
> > >> I suspect
> > >> the user space will still read something from the error queue unless there is
> > >> SOF_TIMESTAMPING_OPT_TSONLY but it won't have the tstamp cmsg.
> > >
> > > No, please see above.
> > >
> > >>
> > >> Adding SOF_TIMESTAMPING_SOFTWARE test to the sk_tstamp_tx_flags() will stop it
> > >> from even queuing to the error queue? I think it is ok but I am not sure if
> > >> anyone is depending on the above behavior.
> > >
> > > SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
> >
> > Got it. This part is now understood.
> >
> > It is one of the reasons for my earlier question on which SOF_* that bpf needs
> > to support. I want to simplify the naming part of the SOF_* in bpf_sesockopt but
> > lets leave these nits for a little later.
> >
> > However, it will be very useful to highlight which SOF_* will never be used in
> > bpf in v3.
>
> Got it. Will do that.
>
> >
> > > features including cmsg mode. But it will not be used in bpf mode. So
> > > the test statement is enough to divided those three cases into two
> > > groups.
> >
> > >
> > >>
> > >>> work fine. If it has the flag, we could use skb_tstamp_tx_output() to
> > >>> print based on patch [4/12]; if not, we will use
> > >>> bpf_skb_tstamp_tx_output() to print.
> > >>>
> > >>> If users use traditional ways of deploying SO_TIMESTAMPING, sk_tsflags
> > >>> always has SOF_TIMESTAMPING_SOFTWARE which is a software report flag
> > >>> (please see Documentation/networking/timestamping.rst). We can see a
> > >>> good example on how to use in
> > >>> tools/testing/selftests/net/txtimestamp.c:
> > >>> do_test()
> > >>> {
> > >>> sock_opt = SOF_TIMESTAMPING_SOFTWARE |
> > >>> ...
> > >>> if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
> > >>> (char *) &sock_opt, sizeof(sock_opt)))
> > >>> }
> > >>>
> > >>>>
> > >>>> Is it the reason that the bpf_setsockopt() cannot clear the sk_tsflags_bpf once
> > >>>> it is set in patch 2? It is not a usable api tbh. It will be a surprise to many.
> > >>>> It has to be able to set and clear.
> > >>>
> > >>> I cannot find a good time to clear all the sockets which are set
> > >>> through the BPF program. If we detach the BPF program, it will not
> > >>> print of course. Does it really matter if we don't clear the
> > >>> sk_tsflags_bpf?
> > >>
> > >> Yes, it matters. The same reason goes for why the existing bpf prog can clear
> > >> the tp->bpf_sock_ops_cb_flags. Yes, detach will automatically not taking the
> > >> timestamp. For sockops program that stays forever, not all usages expect to do
> > >> timestamping for the whole lifetime of the connection. If there is a way for the
> > >> prog to turn it on, it should have a way for the prog to turn it off.
> > >
> > > I see what you meant here. If we don't clear sk_tsflags_bpf, after we
> > > detach the bpf program, it will do nothing in __skb_tstamp_tx() and
> > > return earlier. It is almost equal to the effect of turning off. It is
> > > why I don't handle clearing the flag.
> > >
> > >>
> > >> What is the concern of allowing the bpf prog to disable something that it has
> > >> enabled before?
> > >
> > > Let me give one instance:
> > > If one socket is established and stays idle, how can the bpf program
> > > clear the tsflags from that socket? I have no idea.
> >
> > bpf_tcp_iter prog can. That said, the idle connection example is too carry away
> > as an excuse that bpf_setsockopt does not need to support turning-off. Sure,
> > idle connection is as good as off. and yes, detach is as good as off also.
>
> Thanks, I see.
>
> >
> > I am now acting as a broken clock repeating myself that not all use cases run
> > for 5 mins and then detach, so I need to be specific here that bpf_setsockopt
> > not supporting off is a nack. There are many use cases in production that the
> > bpf prog runs forever and wants to turn it on-and-off.
> >
> > Again, bpf sockops prog is not the only one can bpf_setsockopt(). bpf-tcp-cc
> > that runs forever can also bpf_setsockopt to ask the sockops bpf prog to do
> > periodic timestamping when needed. bpf_tcp_iter can also bpf_setsockopt to turn
> > it off if needed.
>
> I'm not insisting not to support this. Just curious why. Now I get it.
>
> >
> > I am not asking to clear the sk_tsflags_bpf when the bpf prog is detached. I am
> > asking to support clearing the sk_tsflags_bpf in bpf_setsockopt().
>
> Yeah, I know that.
>
> >
> > I have still yet heard a technical reason why bpf_setsockopt cannot clear the bits.
>
> It's easy for me to support the function clearing the sk_tsflags_bpf
> in the bpf_setsockopt() function. Will do that :)
>
> >
> > >
> > >>
> > >> While we are on bpf_sock_ops_cb_flags, the
> > >> BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG addition is mostly a dup of whatever in
> > >> the new sk_tsflags_bpf. It is something we need to clean up later when we decide
> > >> what interface to use for bpf timestamping.
> > >
> > > I'm not sure if I understand correctly. I mimicked the use of
> > > BPF_SOCK_OPS_RTO_CB_FLAG. Do you mean we can remove the use of
> > > bpf_sock_ops_cb_flags_set() in BPF program?
> >
> > In patch 5, I meant the BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG is redundant.
> > It is as good as testing and setting sk_tsflags_bpf alone.
>
> I will remove it, then the code will be simplified.
>
> >
> > This could be some cleanup for the later stage of the set.
> >
> > > >>
> > >>>
> > >>>>
> > >>>> Does it also mean either the bpf or the user space can enable the timetstamping
> > >>>> but not both? I don't think we can assume this also. It will be hard to deploy
> > >>>> the bpf prog in production to collect continuous data. The user space may have
> > >>>> some timestamping enabled but the bpf may want to do its parallel investigation
> > >>>> also. The user space may rollout timestamping in the future and suddenly break
> > >>>> the bpf prog.
> > >>>
> > >>> Well, IIUC, it's also the basic idea from the current series which
> > >>> allows both happening at the same time. Let us put it in a simple way,
> > >>> I hope that if the app uses the SO_TIMESTAMPING feature already, then
> > >>> one admin deploys the BPF program, that app should be traced both in
> > >>> bpf and non-bpf ways.
> > >>>
> > >>> But Willem doesn't agree about this approach[1] because of hard to debug.
> > >>>
> > >>> [1]: https://lore.kernel.org/all/670dda9437147_2e6c4029461@willemb.c.googlers.com.notmuch/
> > >>> Regarding to this link, I have a few more words to say: the socket
> > >>> could be set through bpf_setsockopt() in different phases not like
> > >>> setsockopt(), so in some cases we cannot make setsockopt hard failed.
> > >>>
> > >>> After rethinking this point more, I still reckon that letting BPF
> > >>> program trace timestamping parallelly without caring whether the
> > >>> socket is set to the SO_TIMESTAMPING feature through setsockopt()
> > >>
> > >> I am afraid having both work in parallel is needed. Otherwise, it will be very
> > >> hard to deploy a bpf prog to run continuously in scale. Being able to collect
> > >> timestamp without worrying about application changes/updates/downgrades is
> > >> important. e.g. App changes from no time stamping to time stamping
> > >
> > > Sorry, I didn't make myself clear. Yesterday, I said I agreed with you
> > > :) So let me keep the current logic of printing (see the
> > > __skb_tstamp_tx() function in patch [04/12]) in the next version. Then
> > > I don't need to add some test statements to distinguish which way of
> > > printing.
> > >
> > >>
> > >> Please help to add selftests to show how the above cases (1), (2), (3), and
> > >> other tsflags/txflags sharing cases will work. This should not be delayed until
> > >> the discussion is done. It is needed sooner or later to prove both bpf and
> > >> non-bpf ways can work at the same time. It will help the reviewer and also help
> > >> to think about the design with a real use case in bpf prog.
> > >
> > > Got it. But I'm not sure where I should put those test cases? Could
> > > you help me point out a good example that I can follow?
> >
> > Have you looked at the selftests/bpf directory?
>
> Sure, the full bpf program was written based on the examples in
> selftests/bpf. But I remembered that selftests/bpf is already
> deprecated?
>
> >
> > prog_tests/setget_sockopt.c may be something closer to what you need.
> >
> > There is a recent one in the mailing list also:
> >
> > https://lore.kernel.org/all/20241016-syncookie-v1-0-3b7a0de12153@bootlin.com/
> >
> > The expectation is to be able to run the test like this: ./test_progs -t
> > setget_sockopt
>
> Thanks for the pointer. I will spend some time studying it.
>
> >
> > >
> > >>
> > >> The example in patch 0 only prints the reported tstamp, can you share how it
> > >> will be used to investigate issue?
> > >
> > > No problem. Please see chapter 3 about "goal" in
> > > https://netdev.bots.linux.dev/netconf/2024/jason_xing.pdf.
> >
> > Thanks.
> >
> > >
> > >> Is it also useful to know when the skb is
> > >> written to the kernel during sendmsg()?
> > >
> > > You are right. Before this patch, normally applications will record an
> > > accurate timestamp before do sendmsg().
> > >
> > > After you remind me of this, I feel that we can add the timestamp
> > > print in the future for bpf use.
> >
> > Yes, please add the sendmsg timestamp capturing in the selftest. It is useful.
> >
> > >
> > >>
> > >> Regarding the bpf_setsockopt() can be called in different phase,
> > >> bpf_setsockopt() is not limited to sockops program. e.g. it can also be called
> > >> from a bpf-tcp-cc (congestion control). Not a tcp-cc expert but I won't be
> > >> surprised people will try to trigger some on-and-off timestamping from
> > >> bpf-tcp-cc to measure some delay.
> > >>
> > >>
> > >> More about bpf_setsockopt() in different phase, understand that UDP is not your
> > >> priority. However, it needs to have some clarity on how UDP will work and how to
> > >> enable it. UDP usually has no connect/established phase.
> > >
> > > For now, I don't expect an extension for UDP because it will bring too
> > > much extra work. Could we discuss this later? I mean, after we finish
> > > the basic bpf extension :)
> >
> > Later is fine but before this set lands. I am not asking a full UDP
> > implementation but need ideas on how that may look like. We need some clarity on
> > how UDP will work and also how much new sockops API extension will be needed to
> > decide if sockops is the correct one going forward. I don't want to end up tcp
> > is in sockops and UDP (and others) is non sockops.
The differences between TCP and UDP are:
1) TCP supports SOF_TIMESTAMPING_TX_ACK, SOF_TIMESTAMPING_OPT_ID_TCP,
while UDP does not.
2) the tskey works in different ways.
We have a good example to run and test:
tools/testing/selftests/net/txtimestamp.c
If that interests you, you could try:
1) for UDP, ./txtimestamp -4 -L 127.0.0.1 -l 1000 -c 2 -u
2) for TCP, ./txtimestamp -4 -L 127.0.0.1 -l 1000 -c 2
I think the V3 series could easily support the UDP protocol. Let me
try, then we could discuss more based on V3.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 1:04 ` Jason Xing
2024-10-16 1:32 ` Jason Xing
@ 2024-10-16 6:31 ` Martin KaFai Lau
2024-10-16 6:45 ` Jason Xing
1 sibling, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-16 6:31 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 10/15/24 6:04 PM, Jason Xing wrote:
> To be honest, I considered how to disable the static key. Like you
> said, I failed to find a good chance that I can accurately disable it.
It at least needs to be disabled whenever that bpf prog got detached.
>
>> The bpf prog may be detached also. (IF) it ends up staying with the
>> cgroup/sockops interface, it should depend on the existing static key in
>> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
> Are you suggesting that we need to remove the current static key? In
> the previous thread, the reason why Willem came up with this idea is,
> I think, to avoid affect the non-bpf timestamping feature.
Take a look at cgroup_bpf_enabled(CGROUP_SOCK_OPS). There is a static key. I am
saying to use that existing key. afaict, the newly added bpf_tstamp_control key
is mainly an optimization. Yes, cgroup_bpf_enabled(CGROUP_SOCK_OPS) is less
granular but it has the needed accounting to disable whenever the bpf prog got
detached, so better just reuse the cgroup_bpf_enabled(CGROUP_SOCK_OPS).
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 6:31 ` Martin KaFai Lau
@ 2024-10-16 6:45 ` Jason Xing
2024-10-16 13:13 ` Willem de Bruijn
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-16 6:45 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Oct 16, 2024 at 2:31 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/15/24 6:04 PM, Jason Xing wrote:
> > To be honest, I considered how to disable the static key. Like you
> > said, I failed to find a good chance that I can accurately disable it.
>
> It at least needs to be disabled whenever that bpf prog got detached.
>
> >
> >> The bpf prog may be detached also. (IF) it ends up staying with the
> >> cgroup/sockops interface, it should depend on the existing static key in
> >> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
>
> > Are you suggesting that we need to remove the current static key? In
> > the previous thread, the reason why Willem came up with this idea is,
> > I think, to avoid affect the non-bpf timestamping feature.
>
> Take a look at cgroup_bpf_enabled(CGROUP_SOCK_OPS). There is a static key. I am
> saying to use that existing key. afaict, the newly added bpf_tstamp_control key
> is mainly an optimization. Yes, cgroup_bpf_enabled(CGROUP_SOCK_OPS) is less
> granular but it has the needed accounting to disable whenever the bpf prog got
> detached, so better just reuse the cgroup_bpf_enabled(CGROUP_SOCK_OPS).
Good suggestion. Good thing is that I don't need to figure out a
proper place to disable it any more. I can directly use
cgroup_bpf_enabled(CGROUP_SOCK_OPS) to test if the timestamp should be
printed with BPF program loaded.
BTW, I found that we don't implement how to disable the ip4_min_ttl
static key. Sometimes, I'm confused whether we have to disable it at a
certain time.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 6:45 ` Jason Xing
@ 2024-10-16 13:13 ` Willem de Bruijn
2024-10-16 13:22 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-16 13:13 UTC (permalink / raw)
To: Jason Xing, Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
Jason Xing wrote:
> On Wed, Oct 16, 2024 at 2:31 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 10/15/24 6:04 PM, Jason Xing wrote:
> > > To be honest, I considered how to disable the static key. Like you
> > > said, I failed to find a good chance that I can accurately disable it.
> >
> > It at least needs to be disabled whenever that bpf prog got detached.
> >
> > >
> > >> The bpf prog may be detached also. (IF) it ends up staying with the
> > >> cgroup/sockops interface, it should depend on the existing static key in
> > >> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
> >
> > > Are you suggesting that we need to remove the current static key? In
> > > the previous thread, the reason why Willem came up with this idea is,
> > > I think, to avoid affect the non-bpf timestamping feature.
> >
> > Take a look at cgroup_bpf_enabled(CGROUP_SOCK_OPS). There is a static key. I am
> > saying to use that existing key. afaict, the newly added bpf_tstamp_control key
> > is mainly an optimization. Yes, cgroup_bpf_enabled(CGROUP_SOCK_OPS) is less
> > granular but it has the needed accounting to disable whenever the bpf prog got
> > detached, so better just reuse the cgroup_bpf_enabled(CGROUP_SOCK_OPS).
>
> Good suggestion. Good thing is that I don't need to figure out a
> proper place to disable it any more. I can directly use
> cgroup_bpf_enabled(CGROUP_SOCK_OPS) to test if the timestamp should be
> printed with BPF program loaded.
>
> BTW, I found that we don't implement how to disable the ip4_min_ttl
> static key. Sometimes, I'm confused whether we have to disable it at a
> certain time.
In this case it would be fine to not disable it at all.
The crux is that it is disabled on the vast majority of machines not
using the feature. If a socket uses the feature, adding the small cost
of the branches on the rest of the system is fine.
Disabling requires refcounting usage. Sometimes the complexity and
cost of that outweighs the benefit.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-16 13:13 ` Willem de Bruijn
@ 2024-10-16 13:22 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-16 13:22 UTC (permalink / raw)
To: Willem de Bruijn
Cc: Martin KaFai Lau, davem, edumazet, kuba, pabeni, dsahern, willemb,
ast, daniel, andrii, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Wed, Oct 16, 2024 at 9:13 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Wed, Oct 16, 2024 at 2:31 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >
> > > On 10/15/24 6:04 PM, Jason Xing wrote:
> > > > To be honest, I considered how to disable the static key. Like you
> > > > said, I failed to find a good chance that I can accurately disable it.
> > >
> > > It at least needs to be disabled whenever that bpf prog got detached.
> > >
> > > >
> > > >> The bpf prog may be detached also. (IF) it ends up staying with the
> > > >> cgroup/sockops interface, it should depend on the existing static key in
> > > >> cgroup_bpf_enabled(CGROUP_SOCK_OPS) instead of adding another one.
> > >
> > > > Are you suggesting that we need to remove the current static key? In
> > > > the previous thread, the reason why Willem came up with this idea is,
> > > > I think, to avoid affect the non-bpf timestamping feature.
> > >
> > > Take a look at cgroup_bpf_enabled(CGROUP_SOCK_OPS). There is a static key. I am
> > > saying to use that existing key. afaict, the newly added bpf_tstamp_control key
> > > is mainly an optimization. Yes, cgroup_bpf_enabled(CGROUP_SOCK_OPS) is less
> > > granular but it has the needed accounting to disable whenever the bpf prog got
> > > detached, so better just reuse the cgroup_bpf_enabled(CGROUP_SOCK_OPS).
> >
> > Good suggestion. Good thing is that I don't need to figure out a
> > proper place to disable it any more. I can directly use
> > cgroup_bpf_enabled(CGROUP_SOCK_OPS) to test if the timestamp should be
> > printed with BPF program loaded.
> >
> > BTW, I found that we don't implement how to disable the ip4_min_ttl
> > static key. Sometimes, I'm confused whether we have to disable it at a
> > certain time.
>
> In this case it would be fine to not disable it at all.
>
> The crux is that it is disabled on the vast majority of machines not
> using the feature. If a socket uses the feature, adding the small cost
> of the branches on the rest of the system is fine.
>
> Disabling requires refcounting usage. Sometimes the complexity and
> cost of that outweights the benefit.
Thanks for the explanation. I will take Martin's advice and use the
CGROUP_SOCK_OPS static key, so I don't have to put effort into
implementing inc/dec/enable/disable for the static key.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-12 4:06 ` [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension Jason Xing
2024-10-15 1:36 ` Willem de Bruijn
2024-10-16 0:09 ` Martin KaFai Lau
@ 2024-10-20 21:51 ` Willem de Bruijn
2024-10-21 3:21 ` Jason Xing
2024-10-22 0:53 ` Martin KaFai Lau
2 siblings, 2 replies; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-20 21:51 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Willem suggested that we use a static key to control. The advantage
> is that we will not affect the existing applications at all if we
> don't load BPF program.
>
> In this patch, except the static key, I also add one logic that is
> used to test if the socket has enabled its tsflags in order to
> support bpf logic to allow both cases to happen at the same time.
> Or else, the skb carring related timestamp flag doesn't know which
> way of printing is desirable.
>
> One thing important is this patch allows print from both applications
> and bpf program at the same time. Now we have three kinds of print:
> 1) only BPF program prints
> 2) only application program prints
> 3) both can print without side effect
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
Getting back to this thread. It is long, instead of responding to
multiple messages, let me combine them in a single response.
* On future extensions:
+1 that the UDP case, and datagrams more broadly, must have a clear
development path, before we can merge TCP.
Similarly, hardware timestamps need not be supported from the start,
but must clearly be supportable.
* On queueing packets to userspace:
> > the current behavior is to just queue to the sk_error_queue as long
> > as there is "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it
> > is regardless of the sk_tsflags. "
> Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> read the skb from the errqueue but are not able to parse the
> timestamps
Before queuing a packet to userspace on the error queue, the relevant
reporting flag is always tested. sock_recv_timestamp has:
/*
* generate control messages if
* - receive time stamping in software requested
* - software time stamp available and wanted
* - hardware time stamps available and wanted
*/
if (sock_flag(sk, SOCK_RCVTSTAMP) ||
(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
(kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
(hwtstamps->hwtstamp &&
(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
__sock_recv_timestamp(msg, sk, skb);
Otherwise applications could get error messages queued, and
epoll/poll/select would unexpectedly behave differently.
> SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
> features including cmsg mode. But it will not be used in bpf mode.
For simplicity, the two uses of the API are best kept identical. If
there is a technical reason why BPF has to diverge from established
behavior, this needs to be explicitly called out in the commit
message.
Also, if you want to extend the API for BPF in the future, good to
call this out now and ideally extensions will apply to both, to
maintain a uniform API.
* On extra measurement points, at sendmsg or tcp_write_xmit:
The first is interesting. For application timestamping, this was
never needed, as the application can just call clock_gettime before
sendmsg.
In general, additional measurement points are only useful if the
interval between them is not constant. So far, we have seen no need for
any additional points.
* On skb state:
> > For now, is there thing we can explore to share in the skb_shared_info?
skb_shinfo space is at a premium. I don't think we can justify two
extra fields just for this use case.
> My initial thought is just to reuse these fields in skb. It can work
> without interfering one another.
I'm skeptical that two methods can work at the same time. If they are
started at different times, their sk_tskey will be different, for one.
There may be workarounds. Maybe BPF can store its state in some BPF
specific field, indeed. Or perhaps it can store per-sk shadow state
that resolves the conflict. For instance, the offset between sk_tskey
and bpf_tskey.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-20 21:51 ` Willem de Bruijn
@ 2024-10-21 3:21 ` Jason Xing
2024-10-21 14:49 ` Willem de Bruijn
2024-10-22 0:53 ` Martin KaFai Lau
1 sibling, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-21 3:21 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Mon, Oct 21, 2024 at 5:52 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Willem suggested that we use a static key to control. The advantage
> > is that we will not affect the existing applications at all if we
> > don't load BPF program.
> >
> > In this patch, except the static key, I also add one logic that is
> > used to test if the socket has enabled its tsflags in order to
> > support bpf logic to allow both cases to happen at the same time.
> > Or else, the skb carring related timestamp flag doesn't know which
> > way of printing is desirable.
> >
> > One thing important is this patch allows print from both applications
> > and bpf program at the same time. Now we have three kinds of print:
> > 1) only BPF program prints
> > 2) only application program prints
> > 3) both can print without side effect
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
>
> Getting back to this thread. It is long, instead of responding to
> multiple messages, let me combine them in a single response.
Thank you so much!
>
>
> * On future extensions:
>
> +1 that the UDP case, and datagrams more broadly, must have a clear
> development path, before we can merge TCP.
>
> Similarly, hardware timestamps need not be supported from the start,
> but must clearly be supportable.
Agreed. Using the standalone sk_tsflags_bpf and tskey_bpf and removing
the TCP bpf test logic(say, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)
could work well for both protos. Let me give it a try first.
>
>
> * On queueing packets to userspace:
>
> > > the current behavior is to just queue to the sk_error_queue as long
> > > as there is "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it
> > > is regardless of the sk_tsflags. "
>
> > Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> > SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> > read the skb from the errqueue but are not able to parse the
> > timestamps
Above, I was trying to explain how the application timestamping
feature works, not what I am trying to implement for the BPF extension.
>
> Before queuing a packet to userspace on the error queue, the relevant
> reporting flag is always tested. sock_recv_timestamp has:
>
> /*
> * generate control messages if
> * - receive time stamping in software requested
> * - software time stamp available and wanted
> * - hardware time stamps available and wanted
> */
> if (sock_flag(sk, SOCK_RCVTSTAMP) ||
> (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
> (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
> (hwtstamps->hwtstamp &&
> (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
> __sock_recv_timestamp(msg, sk, skb);
>
> Otherwise applications could get error messages queued, and
> epoll/poll/select would unexpectedly behave differently.
Right. And I have no intention to use the SOF_TIMESTAMPING_SOFTWARE
flag for BPF.
>
> > SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
> > features including cmsg mode. But it will not be used in bpf mode.
>
> For simplicity, the two uses of the API are best kept identical. If
> there is a technical reason why BPF has to diverge from established
> behavior, this needs to be explicitly called out in the commit
> message.
>
> Also, if you want to extend the API for BPF in the future, good to
> call this out now and ideally extensions will apply to both, to
> maintain a uniform API.
As you said, I also agree on "two uses of the API are best kept identical".
>
>
> * On extra measurement points, at sendmsg or tcp_write_xmit:
>
> The first is interesting. For application timestamping, this was
> never needed, as the application can just call clock_gettime before
> sendmsg.
Yes, we could add it after we finish the current series. I'm going to
write it down on my todo list.
>
> In general, additional measurement points are not only useful if the
> interval between is not constant. So far, we have seen no need for
> any additional points.
Taking a snapshot of tcp_write_xmit() could be useful especially when
the skb is not transmitted due to nagle algorithm.
>
>
> * On skb state:
>
> > > For now, is there thing we can explore to share in the skb_shared_info?
>
> skb_shinfo space is at a premium. I don't think we can justify two
> extra fields just for this use case.
>
> > My initial thought is just to reuse these fields in skb. It can work
> > without interfering one another.
>
> I'm skeptical that two methods can work at the same time. If they are
> started at different times, their sk_tskey will be different, for one.
Right, sk_tskey is the only special one that I will take care of.
Others like tx_flags or txstamp_ack from struct tcp_skb_cb can be
reused.
>
> There may be workarounds. Maybe BPF can store its state in some BPF
> specific field, indeed. Or perhaps it can store per-sk shadow state
> that resolves the conflict. For instance, the offset between sk_tskey
> and bpf_tskey.
Things could get complicated in the future if we want to unify the
final tskey value for all the cases. Since 1) the value of
shinfo->tskey depends on skb seq and len, 2) the final tskey output is
the diff between sk_tskey and shinfo->tskey, can I add a bpf_tskey in
struct sock and related output logic for bpf without caring if it's
the same as sk_tskey?
That said, the outputs from two methods differ. Do you think it is
acceptable? It could be simpler and easier if we keep them identical.
Again, thanks for your long conclusion and every review.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-21 3:21 ` Jason Xing
@ 2024-10-21 14:49 ` Willem de Bruijn
2024-10-21 15:05 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-21 14:49 UTC (permalink / raw)
To: Jason Xing, Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
Jason Xing wrote:
> On Mon, Oct 21, 2024 at 5:52 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Xing wrote:
> > > From: Jason Xing <kernelxing@tencent.com>
> > >
> > > Willem suggested that we use a static key to control. The advantage
> > > is that we will not affect the existing applications at all if we
> > > don't load BPF program.
> > >
> > > In this patch, except the static key, I also add one logic that is
> > > used to test if the socket has enabled its tsflags in order to
> > > support bpf logic to allow both cases to happen at the same time.
> > > Or else, the skb carring related timestamp flag doesn't know which
> > > way of printing is desirable.
> > >
> > > One thing important is this patch allows print from both applications
> > > and bpf program at the same time. Now we have three kinds of print:
> > > 1) only BPF program prints
> > > 2) only application program prints
> > > 3) both can print without side effect
> > >
> > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >
> > Getting back to this thread. It is long, instead of responding to
> > multiple messages, let me combine them in a single response.
>
> Thank you so much!
>
> >
> >
> > * On future extensions:
> >
> > +1 that the UDP case, and datagrams more broadly, must have a clear
> > development path, before we can merge TCP.
> >
> > Similarly, hardware timestamps need not be supported from the start,
> > but must clearly be supportable.
>
> Agreed. Using the standalone sk_tsflags_bpf and tskey_bpf and removing
> the TCP bpf test logic(say, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)
> could work well for both protos. Let me give it a try first.
Great, thanks.
> >
> >
> > * On queueing packets to userspace:
> >
> > > > the current behavior is to just queue to the sk_error_queue as long
> > > > as there is "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it
> > > > is regardless of the sk_tsflags. "
> >
> > > Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> > > SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> > > read the skb from the errqueue but are not able to parse the
> > > timestamps
>
> Above is what I tried to explain how the application timestamping
> feature works, not what I tried to implement for the BPF extension.
>
> >
> > Before queuing a packet to userspace on the error queue, the relevant
> > reporting flag is always tested. sock_recv_timestamp has:
> >
> > /*
> > * generate control messages if
> > * - receive time stamping in software requested
> > * - software time stamp available and wanted
> > * - hardware time stamps available and wanted
> > */
> > if (sock_flag(sk, SOCK_RCVTSTAMP) ||
> > (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
> > (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
> > (hwtstamps->hwtstamp &&
> > (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
> > __sock_recv_timestamp(msg, sk, skb);
> >
> > Otherwise applications could get error messages queued, and
> > epoll/poll/select would unexpectedly behave differently.
>
> Right. And I have no intention to use the SOF_TIMESTAMPING_SOFTWARE
> flag for BPF.
Can you elaborate on this? This sounds like it would go against the
intent to have the two versions of the API (application and BPF) be
equivalent.
> >
> > > SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
> > > features including cmsg mode. But it will not be used in bpf mode.
> >
> > For simplicity, the two uses of the API are best kept identical. If
> > there is a technical reason why BPF has to diverge from established
> > behavior, this needs to be explicitly called out in the commit
> > message.
> >
> > Also, if you want to extend the API for BPF in the future, good to
> > call this out now and ideally extensions will apply to both, to
> > maintain a uniform API.
>
> As you said, I also agree on "two uses of the API are best kept identical".
>
> >
> >
> > * On extra measurement points, at sendmsg or tcp_write_xmit:
> >
> > The first is interesting. For application timestamping, this was
> > never needed, as the application can just call clock_gettime before
> > sendmsg.
>
> Yes, we could add it after we finish the current series. I'm going to
> write it down on my todo list.
>
> >
> > In general, additional measurement points are not only useful if the
> > interval between is not constant. So far, we have seen no need for
> > any additional points.
>
> Taking a snapshot of tcp_write_xmit() could be useful especially when
> the skb is not transmitted due to nagle algorithm.
>
> >
> >
> > * On skb state:
> >
> > > > For now, is there thing we can explore to share in the skb_shared_info?
> >
> > skb_shinfo space is at a premium. I don't think we can justify two
> > extra fields just for this use case.
> >
> > > My initial thought is just to reuse these fields in skb. It can work
> > > without interfering one another.
> >
> > I'm skeptical that two methods can work at the same time. If they are
> > started at different times, their sk_tskey will be different, for one.
>
> Right, sk_tskey is the only special one that I will take care of.
> Others like tx_flags or txstamp_ack from struct tcp_skb_cb can be
> reused.
>
> >
> > There may be workarounds. Maybe BPF can store its state in some BPF
> > specific field, indeed. Or perhaps it can store per-sk shadow state
> > that resolves the conflict. For instance, the offset between sk_tskey
> > and bpf_tskey.
>
> Things could get complicated in the future if we want to unified the
> final tskey value for all the cases. Since 1) the value of
> shinfo->tskey depends on skb seq and len, 2) the final tskey output is
> the diff between sk_tskey and shinfo->tskey, can I add a bpf_tskey in
> struct sock and related output logic for bpf without caring if it's
> the same as sk_tskey.
I think we can add fields to struct sock without too much concern.
Adding fields to sk_buff or skb_shared_info would be more difficult.
> That said, the outputs from two methods differ. Do you think it is
> acceptable? It could be simpler and easier if we keep them identical.
Since we can only have one skb_shared_info.tskey, if both user and bpf
request OPT_ID, starting at different times, then we will have two
bases against which to compute the difference. Having two fields in
struct sock should suffice.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-21 14:49 ` Willem de Bruijn
@ 2024-10-21 15:05 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-21 15:05 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Mon, Oct 21, 2024 at 10:49 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Mon, Oct 21, 2024 at 5:52 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Jason Xing wrote:
> > > > From: Jason Xing <kernelxing@tencent.com>
> > > >
> > > > Willem suggested that we use a static key to control. The advantage
> > > > is that we will not affect the existing applications at all if we
> > > > don't load BPF program.
> > > >
> > > > In this patch, except the static key, I also add one logic that is
> > > > used to test if the socket has enabled its tsflags in order to
> > > > support bpf logic to allow both cases to happen at the same time.
> > > > Or else, the skb carring related timestamp flag doesn't know which
> > > > way of printing is desirable.
> > > >
> > > > One thing important is this patch allows print from both applications
> > > > and bpf program at the same time. Now we have three kinds of print:
> > > > 1) only BPF program prints
> > > > 2) only application program prints
> > > > 3) both can print without side effect
> > > >
> > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > >
> > > Getting back to this thread. It is long, instead of responding to
> > > multiple messages, let me combine them in a single response.
> >
> > Thank you so much!
> >
> > >
> > >
> > > * On future extensions:
> > >
> > > +1 that the UDP case, and datagrams more broadly, must have a clear
> > > development path, before we can merge TCP.
> > >
> > > Similarly, hardware timestamps need not be supported from the start,
> > > but must clearly be supportable.
> >
> > Agreed. Using the standalone sk_tsflags_bpf and tskey_bpf and removing
> > the TCP bpf test logic(say, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)
> > could work well for both protos. Let me give it a try first.
>
> Great, thanks.
>
> > >
> > >
> > > * On queueing packets to userspace:
> > >
> > > > > the current behavior is to just queue to the sk_error_queue as long
> > > > > as there is "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it
> > > > > is regardless of the sk_tsflags. "
> > >
> > > > Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> > > > SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> > > > read the skb from the errqueue but are not able to parse the
> > > > timestamps
> >
> > Above is what I tried to explain how the application timestamping
> > feature works, not what I tried to implement for the BPF extension.
> >
> > >
> > > Before queuing a packet to userspace on the error queue, the relevant
> > > reporting flag is always tested. sock_recv_timestamp has:
> > >
> > > /*
> > > * generate control messages if
> > > * - receive time stamping in software requested
> > > * - software time stamp available and wanted
> > > * - hardware time stamps available and wanted
> > > */
> > > if (sock_flag(sk, SOCK_RCVTSTAMP) ||
> > > (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
> > > (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
> > > (hwtstamps->hwtstamp &&
> > > (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
> > > __sock_recv_timestamp(msg, sk, skb);
> > >
> > > Otherwise applications could get error messages queued, and
> > > epoll/poll/select would unexpectedly behave differently.
> >
> > Right. And I have no intention to use the SOF_TIMESTAMPING_SOFTWARE
> > flag for BPF.
>
> Can you elaborate on this? This sounds like it would go against the
> intent to have the two versions of the API (application and BPF) be
> equivalent.
Oh, I see what you mean here. I have no preference. Well, I can add
this report flag into the BPF extension like how application
timestamping works.
>
> > >
> > > > SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
> > > > features including cmsg mode. But it will not be used in bpf mode.
> > >
> > > For simplicity, the two uses of the API are best kept identical. If
> > > there is a technical reason why BPF has to diverge from established
> > > behavior, this needs to be explicitly called out in the commit
> > > message.
> > >
> > > Also, if you want to extend the API for BPF in the future, good to
> > > call this out now and ideally extensions will apply to both, to
> > > maintain a uniform API.
> >
> > As you said, I also agree on "two uses of the API are best kept identical".
> >
> > >
> > >
> > > * On extra measurement points, at sendmsg or tcp_write_xmit:
> > >
> > > The first is interesting. For application timestamping, this was
> > > never needed, as the application can just call clock_gettime before
> > > sendmsg.
> >
> > Yes, we could add it after we finish the current series. I'm going to
> > write it down on my todo list.
> >
> > >
> > > In general, additional measurement points are not only useful if the
> > > interval between is not constant. So far, we have seen no need for
> > > any additional points.
> >
> > Taking a snapshot of tcp_write_xmit() could be useful especially when
> > the skb is not transmitted due to nagle algorithm.
> >
> > >
> > >
> > > * On skb state:
> > >
> > > > > For now, is there thing we can explore to share in the skb_shared_info?
> > >
> > > skb_shinfo space is at a premium. I don't think we can justify two
> > > extra fields just for this use case.
> > >
> > > > My initial thought is just to reuse these fields in skb. It can work
> > > > without interfering one another.
> > >
> > > I'm skeptical that two methods can work at the same time. If they are
> > > started at different times, their sk_tskey will be different, for one.
> >
> > Right, sk_tskey is the only special one that I will take care of.
> > Others like tx_flags or txstamp_ack from struct tcp_skb_cb can be
> > reused.
> >
> > >
> > > There may be workarounds. Maybe BPF can store its state in some BPF
> > > specific field, indeed. Or perhaps it can store per-sk shadow state
> > > that resolves the conflict. For instance, the offset between sk_tskey
> > > and bpf_tskey.
> >
> > Things could get complicated in the future if we want to unified the
> > final tskey value for all the cases. Since 1) the value of
> > shinfo->tskey depends on skb seq and len, 2) the final tskey output is
> > the diff between sk_tskey and shinfo->tskey, can I add a bpf_tskey in
> > struct sock and related output logic for bpf without caring if it's
> > the same as sk_tskey.
>
> I think we can add fields to struct sock without too much concern.
> Adding fields to sk_buff or skb_shared_info would be more difficult.
Got it:)
>
> > That said, the outputs from two methods differ. Do you think it is
> > acceptable? It could be simpler and easier if we keep them identical.
>
> Since we can only have one skb_shared_info.tskey, if both user and bpf
> request OPT_ID, starting at different times, then we will have two
> bases against which to compute the difference. Having two fields in
> struct sock should suffice.
Exactly! I will do it.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-20 21:51 ` Willem de Bruijn
2024-10-21 3:21 ` Jason Xing
@ 2024-10-22 0:53 ` Martin KaFai Lau
2024-10-22 2:30 ` Jason Xing
2024-10-23 0:17 ` Willem de Bruijn
1 sibling, 2 replies; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-22 0:53 UTC (permalink / raw)
To: Willem de Bruijn, Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, eddyz87, song, yonghong.song, john.fastabend, kpsingh,
sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On 10/20/24 2:51 PM, Willem de Bruijn wrote:
> Jason Xing wrote:
>> From: Jason Xing <kernelxing@tencent.com>
>>
>> Willem suggested that we use a static key to control. The advantage
>> is that we will not affect the existing applications at all if we
>> don't load BPF program.
>>
>> In this patch, except the static key, I also add one logic that is
>> used to test if the socket has enabled its tsflags in order to
>> support bpf logic to allow both cases to happen at the same time.
>> Or else, the skb carring related timestamp flag doesn't know which
>> way of printing is desirable.
>>
>> One thing important is this patch allows print from both applications
>> and bpf program at the same time. Now we have three kinds of print:
>> 1) only BPF program prints
>> 2) only application program prints
>> 3) both can print without side effect
>>
>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
>
> Getting back to this thread. It is long, instead of responding to
> multiple messages, let me combine them in a single response.
>
>
> * On future extensions:
>
> +1 that the UDP case, and datagrams more broadly, must have a clear
> development path, before we can merge TCP.
>
> Similarly, hardware timestamps need not be supported from the start,
> but must clearly be supportable.
>
>
> * On queueing packets to userspace:
>
>>> the current behavior is to just queue to the sk_error_queue as long
>>> as there is "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it
>>> is regardless of the sk_tsflags. "
>
>> Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
>> SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
>> read the skb from the errqueue but are not able to parse the
>> timestamps
>
> Before queuing a packet to userspace on the error queue, the relevant
> reporting flag is always tested. sock_recv_timestamp has:
>
> /*
> * generate control messages if
> * - receive time stamping in software requested
> * - software time stamp available and wanted
> * - hardware time stamps available and wanted
> */
> if (sock_flag(sk, SOCK_RCVTSTAMP) ||
> (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
> (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
> (hwtstamps->hwtstamp &&
> (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
> __sock_recv_timestamp(msg, sk, skb);
>
> Otherwise applications could get error messages queued, and
> epoll/poll/select would unexpectedly behave differently.
I just tried the following diff to remove setsockopt from txtimestamp.c and run
"./txtimestamp -6 -c 1 -C -N -L ::1". It is getting the skb from the error queue
with only cmsg flag. I did a printk in __skb_tstamp_tx to ensure the
sk->sk_tsflags is empty also.
diff --git i/tools/testing/selftests/net/txtimestamp.c
w/tools/testing/selftests/net/txtimestamp.c
index dae91eb97d69..5d9d2773b076 100644
--- i/tools/testing/selftests/net/txtimestamp.c
+++ w/tools/testing/selftests/net/txtimestamp.c
@@ -319,6 +319,8 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int
payload_len)
for (cm = CMSG_FIRSTHDR(msg);
cm && cm->cmsg_len;
cm = CMSG_NXTHDR(msg, cm)) {
+ printf("cm->cmsg_level %d cm->cmsg_type %d\n",
+ cm->cmsg_level, cm->cmsg_type);
if (cm->cmsg_level == SOL_SOCKET &&
cm->cmsg_type == SCM_TIMESTAMPING) {
tss = (void *) CMSG_DATA(cm);
@@ -362,7 +364,7 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int
payload_len)
if (batch > 1) {
fprintf(stderr, "batched %d timestamps\n", batch);
} else if (!batch) {
- fprintf(stderr, "Failed to report timestamps\n");
+ fprintf(stderr, "Failed to report timestamps. payload_len %d\n", payload_len);
test_failed = true;
}
}
@@ -578,9 +580,12 @@ static void do_test(int family, unsigned int report_opt)
if (cfg_loop_nodata)
sock_opt |= SOF_TIMESTAMPING_OPT_TSONLY;
+ (void)sock_opt;
+/*
if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
(char *) &sock_opt, sizeof(sock_opt)))
error(1, 0, "setsockopt timestamping");
+*/
for (i = 0; i < cfg_num_pkts; i++) {
memset(&msg, 0, sizeof(msg));
>
>> SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
>> features including cmsg mode. But it will not be used in bpf mode.
>
> For simplicity, the two uses of the API are best kept identical. If
> there is a technical reason why BPF has to diverge from established
> behavior, this needs to be explicitly called out in the commit
> message.
SOF_TIMESTAMPING_OPT_TSONLY will not be supported. The orig_skb can always be
passed directly to the bpf if needed without extra cost. The same probably goes
for SOF_TIMESTAMPING_OPT_PKTINFO. SOF_TIMESTAMPING_SOFTWARE does not seem to be
useful either. I think only a subset of SOF_* will be supported, probably only
the TX_* and RX_* ones.
>
> Also, if you want to extend the API for BPF in the future, good to
> call this out now and ideally extensions will apply to both, to
> maintain a uniform API.
>
>
> * On extra measurement points, at sendmsg or tcp_write_xmit:
>
> The first is interesting. For application timestamping, this was
> never needed, as the application can just call clock_gettime before
> sendmsg.
>
> In general, additional measurement points are not only useful if the
> interval between is not constant. So far, we have seen no need for
> any additional points.
>
>
> * On skb state:
>
>>> For now, is there thing we can explore to share in the skb_shared_info?
>
> skb_shinfo space is at a premium. I don't think we can justify two
> extra fields just for this use case.
>
>> My initial thought is just to reuse these fields in skb. It can work
>> without interfering one another.
>
> I'm skeptical that two methods can work at the same time. If they are
> started at different times, their sk_tskey will be different, for one.
For the skb's tx_flags, Jason seems to be able to figure out by only using the
new sk_tsflags_bpf. In the worst case, it seems there is still one bit left in
tx_flags.
I am also not very positive on the skb's tskey for now.
Willem, I recalled I had tried to reuse the tx_flags and hwtstamp when keeping
the delivery time in skb->tstamp for a skb redirecting from egress to ingress. I
think that approach was stalled because the tx_flags could be changed by the
netdevice like "skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS". How about the
skb_shinfo(skb)->hwtstamps? At least for the TX path, it should not be changed
until the netdevice calling skb_tstamp_tx() to report the hwtstamp? or the clone
in the tcp stack will still break things if the hwtstamps is reused for other
purpose?
>
> There may be workarounds. Maybe BPF can store its state in some BPF
> specific field, indeed. Or perhaps it can store per-sk shadow state
> that resolves the conflict. For instance, the offset between sk_tskey
> and bpf_tskey.
I have also been proposing to explore other way for the key since bpf has direct
access to the skb (also the sk, bpf prog can store data in the sk).
The bpf prog can learn what is the seq_no of the egress-ing skb. When the ack
comes back, it can also learn the ack seq no. Does it help? It will be harder to
use because it probably needs to store this info in the bpf map (or in the bpf
sk storage). However, if it needs to learn the timestamp at the
tcp_sendmsg/tcp_transmit_skb/tcp_write_xmit, this timestamp has to be stored
somewhere also. Either in a bpf map or in a bpf sk storage.
SEC("cgroup/setsockopt") prog can also enforce the user space setsockopt. e.g.
it can add SOF_TIMESTAMPING_OPT_ID_TCP when user space only use
SOF_TIMESTAMPING_OPT_ID.
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-22 0:53 ` Martin KaFai Lau
@ 2024-10-22 2:30 ` Jason Xing
2024-10-23 0:17 ` Willem de Bruijn
1 sibling, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-22 2:30 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Willem de Bruijn, davem, edumazet, kuba, pabeni, dsahern, willemb,
ast, daniel, andrii, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 22, 2024 at 8:53 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/20/24 2:51 PM, Willem de Bruijn wrote:
> > Jason Xing wrote:
> >> From: Jason Xing <kernelxing@tencent.com>
> >>
> >> Willem suggested that we use a static key to control. The advantage
> >> is that we will not affect the existing applications at all if we
> >> don't load BPF program.
> >>
> >> In this patch, except the static key, I also add one logic that is
> >> used to test if the socket has enabled its tsflags in order to
> >> support bpf logic to allow both cases to happen at the same time.
> >> Or else, the skb carring related timestamp flag doesn't know which
> >> way of printing is desirable.
> >>
> >> One thing important is this patch allows print from both applications
> >> and bpf program at the same time. Now we have three kinds of print:
> >> 1) only BPF program prints
> >> 2) only application program prints
> >> 3) both can print without side effect
> >>
> >> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >
> > Getting back to this thread. It is long, instead of responding to
> > multiple messages, let me combine them in a single response.
> >
> >
> > * On future extensions:
> >
> > +1 that the UDP case, and datagrams more broadly, must have a clear
> > development path, before we can merge TCP.
> >
> > Similarly, hardware timestamps need not be supported from the start,
> > but must clearly be supportable.
> >
> >
> > * On queueing packets to userspace:
> >
> >>> the current behavior is to just queue to the sk_error_queue as long
> >>> as there is "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it
> >>> is regardless of the sk_tsflags. "
> >
> >> Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> >> SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> >> read the skb from the errqueue but are not able to parse the
> >> timestamps
> >
> > Before queuing a packet to userspace on the error queue, the relevant
> > reporting flag is always tested. sock_recv_timestamp has:
> >
> > /*
> > * generate control messages if
> > * - receive time stamping in software requested
> > * - software time stamp available and wanted
> > * - hardware time stamps available and wanted
> > */
> > if (sock_flag(sk, SOCK_RCVTSTAMP) ||
> > (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
> > (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
> > (hwtstamps->hwtstamp &&
> > (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
> > __sock_recv_timestamp(msg, sk, skb);
> >
> > Otherwise applications could get error messages queued, and
> > epoll/poll/select would unexpectedly behave differently.
>
> I just tried the following diff to remove setsockopt from txtimestamp.c and run
> "./txtimestamp -6 -c 1 -C -N -L ::1". It is getting the skb from the error queue
> with only cmsg flag. I did a printk in __skb_tstamp_tx to ensure the
> sk->sk_tsflags is empty also.
>
> diff --git i/tools/testing/selftests/net/txtimestamp.c
> w/tools/testing/selftests/net/txtimestamp.c
> index dae91eb97d69..5d9d2773b076 100644
> --- i/tools/testing/selftests/net/txtimestamp.c
> +++ w/tools/testing/selftests/net/txtimestamp.c
> @@ -319,6 +319,8 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int
> payload_len)
> for (cm = CMSG_FIRSTHDR(msg);
> cm && cm->cmsg_len;
> cm = CMSG_NXTHDR(msg, cm)) {
> + printf("cm->cmsg_level %d cm->cmsg_type %d\n",
> + cm->cmsg_level, cm->cmsg_type);
> if (cm->cmsg_level == SOL_SOCKET &&
> cm->cmsg_type == SCM_TIMESTAMPING) {
> tss = (void *) CMSG_DATA(cm);
> @@ -362,7 +364,7 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int
> payload_len)
> if (batch > 1) {
> fprintf(stderr, "batched %d timestamps\n", batch);
> } else if (!batch) {
> - fprintf(stderr, "Failed to report timestamps\n");
> + fprintf(stderr, "Failed to report timestamps. payload_len %d\n", payload_len);
> test_failed = true;
> }
> }
> @@ -578,9 +580,12 @@ static void do_test(int family, unsigned int report_opt)
> if (cfg_loop_nodata)
> sock_opt |= SOF_TIMESTAMPING_OPT_TSONLY;
>
> + (void)sock_opt;
> +/*
> if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
> (char *) &sock_opt, sizeof(sock_opt)))
> error(1, 0, "setsockopt timestamping");
> +*/
>
> for (i = 0; i < cfg_num_pkts; i++) {
> memset(&msg, 0, sizeof(msg));
> >
> >> SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
> >> features including cmsg mode. But it will not be used in bpf mode.
> >
> > For simplicity, the two uses of the API are best kept identical. If
> > there is a technical reason why BPF has to diverge from established
> > behavior, this needs to be explicitly called out in the commit
> > message.
>
> SOF_TIMESTAMPING_OPT_TSONLY will not be supported. The orig_skb can always be
> passed directly to the bpf if needed without extra cost. The same probably goes
> for SOF_TIMESTAMPING_OPT_PKTINFO.
Right, they will not be supported.
> SOF_TIMESTAMPING_SOFTWARE does not seem to be
> useful either. I think only a subset of SOF_* will be supported, probably only
I had a discussion with Willem on this point yesterday. If I
understand what Willem was thinking correctly, he doesn't expect
users' behaviors to change too much.
As I said previously, I have no strong preference. Whether keeping
this report flag or not doesn't affect the core logic for BPF
extension.
> the TX_* and RX_* ones.
>
> >
> > Also, if you want to extend the API for BPF in the future, good to
> > call this out now and ideally extensions will apply to both, to
> > maintain a uniform API.
> >
> >
> > * On extra measurement points, at sendmsg or tcp_write_xmit:
> >
> > The first is interesting. For application timestamping, this was
> > never needed, as the application can just call clock_gettime before
> > sendmsg.
> >
> > In general, additional measurement points are not only useful if the
> > interval between is not constant. So far, we have seen no need for
> > any additional points.
> >
> >
> > * On skb state:
> >
> >>> For now, is there thing we can explore to share in the skb_shared_info?
> >
> > skb_shinfo space is at a premium. I don't think we can justify two
> > extra fields just for this use case.
> >
> >> My initial thought is just to reuse these fields in skb. It can work
> >> without interfering one another.
> >
> > I'm skeptical that two methods can work at the same time. If they are
> > started at different times, their sk_tskey will be different, for one.
>
> For the skb's tx_flags, Jason seems to be able to figure out by only using the
> new sk_tsflags_bpf. In the worst case, it seems there is still one bit left in
> tx_flags.
Let me try, then we'll see if it works.
>
> I am also not very positive on the skb's tskey for now.
For TCP, the final output of tskey that is reflected to users is the
result of this calculation "shinfo->tskey - $KEY". $KEY is the base
which could be either sk->sk_tskey or sk->sk_tskey_bpf. They are
initialized at different points.
You can see the calculation in __skb_complete_tx_timestamp():
serr->ee.ee_data = skb_shinfo(skb)->tskey;
serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
With that said, we will keep two different $KEY to let each feature
(bpf SO_TIMESTAMPING or application SO_TIMESTAMPING) work
respectively, which also means we will probably see two different
tskeys when the two methods work in parallel. That's fine as long as
we can make sure the final tskeys are consistent within each feature.
tskey is used to identify which sendmsg() the skb should belong to.
It also works for UDP proto.
>
> Willem, I recalled I had tried to reuse the tx_flags and hwtstamp when keeping
> the delivery time in skb->tstamp for a skb redirecting from egress to ingress. I
> think that approach was stalled because the tx_flags could be changed by the
> netdevice like "skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS". How about the
> skb_shinfo(skb)->hwtstamps? At least for the TX path, it should not be changed
> until the netdevice calling skb_tstamp_tx() to report the hwtstamp? or the clone
> in the tcp stack will still break things if the hwtstamps is reused for other
> purpose?
>
> >
> > There may be workarounds. Maybe BPF can store its state in some BPF
> > specific field, indeed. Or perhaps it can store per-sk shadow state
> > that resolves the conflict. For instance, the offset between sk_tskey
> > and bpf_tskey.
>
> I have also been proposing to explore other way for the key since bpf has direct
> access to the skb (also the sk, bpf prog can store data in the sk).
>
> The bpf prog can learn what is the seq_no of the egress-ing skb. When the ack
> comes back, it can also learn the ack seq no. Does it help? It will be harder to
> use because it probably needs to store this info in the bpf map (or in the bpf
> sk storage). However, if it needs to learn the timestamp at the
> tcp_sendmsg/tcp_transmit_skb/tcp_write_xmit, this timestamp has to be stored
> somewhere also. Either in a bpf map or in a bpf sk storage.
Thanks for the idea. But please see the above comment, we could keep
the logic as simple as it is :)
>
> SEC("cgroup/setsockopt") prog can also enforce the user space setsockopt. e.g.
> it can add SOF_TIMESTAMPING_OPT_ID_TCP when user space only use
> SOF_TIMESTAMPING_OPT_ID.
Interesting.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-22 0:53 ` Martin KaFai Lau
2024-10-22 2:30 ` Jason Xing
@ 2024-10-23 0:17 ` Willem de Bruijn
2024-10-23 2:31 ` Willem de Bruijn
1 sibling, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-23 0:17 UTC (permalink / raw)
To: Martin KaFai Lau, Willem de Bruijn, Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, eddyz87, song, yonghong.song, john.fastabend, kpsingh,
sdf, haoluo, jolsa, bpf, netdev, Jason Xing
Martin KaFai Lau wrote:
> On 10/20/24 2:51 PM, Willem de Bruijn wrote:
> > Jason Xing wrote:
> >> From: Jason Xing <kernelxing@tencent.com>
> >>
> >> Willem suggested that we use a static key to control. The advantage
> >> is that we will not affect the existing applications at all if we
> >> don't load BPF program.
> >>
> >> In this patch, except the static key, I also add one logic that is
> >> used to test if the socket has enabled its tsflags in order to
> >> support bpf logic to allow both cases to happen at the same time.
> >> Or else, the skb carring related timestamp flag doesn't know which
> >> way of printing is desirable.
> >>
> >> One thing important is this patch allows print from both applications
> >> and bpf program at the same time. Now we have three kinds of print:
> >> 1) only BPF program prints
> >> 2) only application program prints
> >> 3) both can print without side effect
> >>
> >> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >
> > Getting back to this thread. It is long, instead of responding to
> > multiple messages, let me combine them in a single response.
> >
> >
> > * On future extensions:
> >
> > +1 that the UDP case, and datagrams more broadly, must have a clear
> > development path, before we can merge TCP.
> >
> > Similarly, hardware timestamps need not be supported from the start,
> > but must clearly be supportable.
> >
> >
> > * On queueing packets to userspace:
> >
> >>> the current behavior is to just queue to the sk_error_queue as long
> >>> as there is "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it
> >>> is regardless of the sk_tsflags. "
> >
> >> Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> >> SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> >> read the skb from the errqueue but are not able to parse the
> >> timestamps
> >
> > Before queuing a packet to userspace on the error queue, the relevant
> > reporting flag is always tested. sock_recv_timestamp has:
> >
> > /*
> > * generate control messages if
> > * - receive time stamping in software requested
> > * - software time stamp available and wanted
> > * - hardware time stamps available and wanted
> > */
> > if (sock_flag(sk, SOCK_RCVTSTAMP) ||
> > (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
> > (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
> > (hwtstamps->hwtstamp &&
> > (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
> > __sock_recv_timestamp(msg, sk, skb);
> >
> > Otherwise applications could get error messages queued, and
> > epoll/poll/select would unexpectedly behave differently.
>
> I just tried the following diff to remove setsockopt from txtimestamp.c and run
> "./txtimestamp -6 -c 1 -C -N -L ::1". It is getting the skb from the error queue
> with only cmsg flag.
That is surprising and against the API intent as I understand it.
Let me reproduce and take a closer look.
> I did a printk in __skb_tstamp_tx to ensure the
> sk->sk_tsflags is empty also.
>
> diff --git i/tools/testing/selftests/net/txtimestamp.c
> w/tools/testing/selftests/net/txtimestamp.c
> index dae91eb97d69..5d9d2773b076 100644
> --- i/tools/testing/selftests/net/txtimestamp.c
> +++ w/tools/testing/selftests/net/txtimestamp.c
> @@ -319,6 +319,8 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int
> payload_len)
> for (cm = CMSG_FIRSTHDR(msg);
> cm && cm->cmsg_len;
> cm = CMSG_NXTHDR(msg, cm)) {
> + printf("cm->cmsg_level %d cm->cmsg_type %d\n",
> + cm->cmsg_level, cm->cmsg_type);
> if (cm->cmsg_level == SOL_SOCKET &&
> cm->cmsg_type == SCM_TIMESTAMPING) {
> tss = (void *) CMSG_DATA(cm);
> @@ -362,7 +364,7 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int
> payload_len)
> if (batch > 1) {
> fprintf(stderr, "batched %d timestamps\n", batch);
> } else if (!batch) {
> - fprintf(stderr, "Failed to report timestamps\n");
> + fprintf(stderr, "Failed to report timestamps. payload_len %d\n", payload_len);
> test_failed = true;
> }
> }
> @@ -578,9 +580,12 @@ static void do_test(int family, unsigned int report_opt)
> if (cfg_loop_nodata)
> sock_opt |= SOF_TIMESTAMPING_OPT_TSONLY;
>
> + (void)sock_opt;
> +/*
> if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
> (char *) &sock_opt, sizeof(sock_opt)))
> error(1, 0, "setsockopt timestamping");
> +*/
>
> for (i = 0; i < cfg_num_pkts; i++) {
> memset(&msg, 0, sizeof(msg));
> >
> >> SOF_TIMESTAMPING_SOFTWARE is only used in traditional SO_TIMESTAMPING
> >> features including cmsg mode. But it will not be used in bpf mode.
> >
> > For simplicity, the two uses of the API are best kept identical. If
> > there is a technical reason why BPF has to diverge from established
> > behavior, this needs to be explicitly called out in the commit
> > message.
>
> SOF_TIMESTAMPING_OPT_TSONLY will not be supported. The orig_skb can always be
> passed directly to the bpf if needed without extra cost. The same probably goes
> for SOF_TIMESTAMPING_OPT_PKTINFO. SOF_TIMESTAMPING_SOFTWARE does not seem to be
> useful either. I think only a subset of SOF_* will be supported, probably only
> the TX_* and RX_* ones.
>
> >
> > Also, if you want to extend the API for BPF in the future, good to
> > call this out now and ideally extensions will apply to both, to
> > maintain a uniform API.
> >
> >
> > * On extra measurement points, at sendmsg or tcp_write_xmit:
> >
> > The first is interesting. For application timestamping, this was
> > never needed, as the application can just call clock_gettime before
> > sendmsg.
> >
> > In general, additional measurement points are not only useful if the
> > interval between is not constant. So far, we have seen no need for
> > any additional points.
> >
> >
> > * On skb state:
> >
> >>> For now, is there thing we can explore to share in the skb_shared_info?
> >
> > skb_shinfo space is at a premium. I don't think we can justify two
> > extra fields just for this use case.
> >
> >> My initial thought is just to reuse these fields in skb. It can work
> >> without interfering one another.
> >
> > I'm skeptical that two methods can work at the same time. If they are
> > started at different times, their sk_tskey will be different, for one.
>
> For the skb's tx_flags, Jason seems to be able to figure out by only using the
> new sk_tsflags_bpf. In the worst case, it seems there is still one bit left in
> tx_flags.
>
> I am also not very positive on the skb's tskey for now.
>
> Willem, I recalled I had tried to reuse the tx_flags and hwtstamp when keeping
> the delivery time in skb->tstamp for a skb redirecting from egress to ingress. I
> think that approach was stalled because the tx_flags could be changed by the
> netdevice like "skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS". How about the
> skb_shinfo(skb)->hwtstamps? At least for the TX path, it should not be changed
> until the netdevice calling skb_tstamp_tx() to report the hwtstamp? or the clone
> in the tcp stack will still break things if the hwtstamps is reused for other
> purpose?
True. I think on Tx hwtstamps is only used on the path from the driver
tx completion handler to when it calls skb_tstamp_tx.
It does not even really have to be an skb field. The first driver
cscope happens to point me to indeed just allocates it on the stack:
tsnep_tx_poll.
> >
> > There may be workarounds. Maybe BPF can store its state in some BPF
> > specific field, indeed. Or perhaps it can store per-sk shadow state
> > that resolves the conflict. For instance, the offset between sk_tskey
> > and bpf_tskey.
>
> I have also been proposing to explore other way for the key since bpf has direct
> access to the skb (also the sk, bpf prog can store data in the sk).
>
> The bpf prog can learn what is the seq_no of the egress-ing skb. When the ack
> comes back, it can also learn the ack seq no. Does it help? It will be harder to
> use because it probably needs to store this info in the bpf map (or in the bpf
> sk storage). However, if it needs to learn the timestamp at the
> tcp_sendmsg/tcp_transmit_skb/tcp_write_xmit, this timestamp has to be stored
> somewhere also. Either in a bpf map or in a bpf sk storage.
>
> SEC("cgroup/setsockopt") prog can also enforce the user space setsockopt. e.g.
> it can add SOF_TIMESTAMPING_OPT_ID_TCP when user space only use
> SOF_TIMESTAMPING_OPT_ID.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension
2024-10-23 0:17 ` Willem de Bruijn
@ 2024-10-23 2:31 ` Willem de Bruijn
0 siblings, 0 replies; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-23 2:31 UTC (permalink / raw)
To: Willem de Bruijn, Martin KaFai Lau, Willem de Bruijn, Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, eddyz87, song, yonghong.song, john.fastabend, kpsingh,
sdf, haoluo, jolsa, bpf, netdev, Jason Xing
Willem de Bruijn wrote:
> Martin KaFai Lau wrote:
> > On 10/20/24 2:51 PM, Willem de Bruijn wrote:
> > > Jason Xing wrote:
> > >> From: Jason Xing <kernelxing@tencent.com>
> > >>
> > >> Willem suggested that we use a static key to control. The advantage
> > >> is that we will not affect the existing applications at all if we
> > >> don't load BPF program.
> > >>
> > >> In this patch, except the static key, I also add one logic that is
> > >> used to test if the socket has enabled its tsflags in order to
> > >> support bpf logic to allow both cases to happen at the same time.
> > >> Or else, the skb carring related timestamp flag doesn't know which
> > >> way of printing is desirable.
> > >>
> > >> One thing important is this patch allows print from both applications
> > >> and bpf program at the same time. Now we have three kinds of print:
> > >> 1) only BPF program prints
> > >> 2) only application program prints
> > >> 3) both can print without side effect
> > >>
> > >> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > >
> > > Getting back to this thread. It is long, instead of responding to
> > > multiple messages, let me combine them in a single response.
> > >
> > >
> > > * On future extensions:
> > >
> > > +1 that the UDP case, and datagrams more broadly, must have a clear
> > > development path, before we can merge TCP.
> > >
> > > Similarly, hardware timestamps need not be supported from the start,
> > > but must clearly be supportable.
> > >
> > >
> > > * On queueing packets to userspace:
> > >
> > >>> the current behavior is to just queue to the sk_error_queue as long
> > >>> as there is "SOF_TIMESTAMPING_TX_*" set in the skb's tx_flags and it
> > >>> is regardless of the sk_tsflags. "
> > >
> > >> Totally correct. SOF_TIMESTAMPING_SOFTWARE is a report flag while
> > >> SOF_TIMESTAMPING_TX_* are generation flags. Without former, users can
> > >> read the skb from the errqueue but are not able to parse the
> > >> timestamps
> > >
> > > Before queuing a packet to userspace on the error queue, the relevant
> > > reporting flag is always tested. sock_recv_timestamp has:
> > >
> > > /*
> > > * generate control messages if
> > > * - receive time stamping in software requested
> > > * - software time stamp available and wanted
> > > * - hardware time stamps available and wanted
> > > */
> > > if (sock_flag(sk, SOCK_RCVTSTAMP) ||
> > > (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
> > > (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
> > > (hwtstamps->hwtstamp &&
> > > (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
> > > __sock_recv_timestamp(msg, sk, skb);
> > >
> > > Otherwise applications could get error messages queued, and
> > > epoll/poll/select would unexpectedly behave differently.
> >
> > I just tried the following diff to remove setsockopt from txtimestamp.c and run
> > "./txtimestamp -6 -c 1 -C -N -L ::1". It is getting the skb from the error queue
> > with only cmsg flag.
>
> That it surprising and against the API intent as I understand it.
> Let me reproduce and take a closer look.
Interesting. I guess my interpretation was wrong.
The reporting flags prevent reporting of the timestamp, but not
queuing of the skb on the error queue. Even if the only purpose is to
report a timestamp.
It goes back until well before all the API extensions. At least v3.6.
It still does suppress the timestamp itself if the relevant reporting
flag, SOF_TIMESTAMPING_SOFTWARE or SOF_TIMESTAMPING_RAW_HARDWARE, is
not set. So BPF should really still match that, I guess.
^ permalink raw reply [flat|nested] 73+ messages in thread
* [PATCH net-next v2 05/12] net-timestamp: add bpf infrastructure to allow exposing timestamp later
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (3 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 04/12] net-timestamp: add static key to control the whole bpf extension Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-12 4:06 ` [PATCH net-next v2 06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp Jason Xing
` (7 subsequent siblings)
12 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Implement the basic code so that we can easily add each tx point later.
Introducing BPF_SOCK_OPS_ALL_CB_FLAGS, used as a test statement, can help
us control whether to output or not.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/uapi/linux/bpf.h | 5 ++++-
net/core/skbuff.c | 8 ++++++++
tools/include/uapi/linux/bpf.h | 5 ++++-
3 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c6cd7c7aeeee..157e139ed6fc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6900,8 +6900,11 @@ enum {
* options first before the BPF program does.
*/
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
+ /* Call bpf when the kernel is generating tx timestamps.
+ */
+ BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG = (1<<7),
/* Mask of all currently supported cb flags */
- BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0xFF,
};
/* List of known BPF sock_ops operators.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d0f912f1ff7b..3a4110d0f983 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5621,11 +5621,19 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
{
+ struct tcp_sock *tp;
u32 tsflags;
+ if (!sk_is_tcp(sk))
+ return;
+
tsflags = READ_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
return;
+
+ tp = tcp_sk(sk);
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
+ return;
}
void __skb_tstamp_tx(struct sk_buff *orig_skb,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1fb3cb2636e6..93853d9d4922 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6899,8 +6899,11 @@ enum {
* options first before the BPF program does.
*/
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
+ /* Call bpf when the kernel is generating tx timestamps.
+ */
+ BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG = (1<<7),
/* Mask of all currently supported cb flags */
- BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0xFF,
};
/* List of known BPF sock_ops operators.
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread* [PATCH net-next v2 06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (4 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 05/12] net-timestamp: add bpf infrastructure to allow exposing timestamp later Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-16 1:01 ` Martin KaFai Lau
2024-10-12 4:06 ` [PATCH net-next v2 07/12] net-timestamp: introduce TS_SW_OPT_CB to generate driver timestamp Jason Xing
` (6 subsequent siblings)
12 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
print timestamps when the skb just passes the dev layer.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/uapi/linux/bpf.h | 5 +++++
net/core/skbuff.c | 17 +++++++++++++++--
tools/include/uapi/linux/bpf.h | 5 +++++
3 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 157e139ed6fc..3cf3c9c896c7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7019,6 +7019,11 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
+ BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
+ * dev layer when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3a4110d0f983..16e7bdc1eacb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
return;
tp = tcp_sk(sk);
- if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
- return;
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
+ struct timespec64 tstamp;
+ u32 cb_flag;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
+ break;
+ default:
+ return;
+ }
+
+ tstamp = ktime_to_timespec64(ktime_get_real());
+ tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
+ }
}
void __skb_tstamp_tx(struct sk_buff *orig_skb,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 93853d9d4922..d60675e1a5a0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7018,6 +7018,11 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
+ BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
+ * dev layer when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp
2024-10-12 4:06 ` [PATCH net-next v2 06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp Jason Xing
@ 2024-10-16 1:01 ` Martin KaFai Lau
2024-10-16 1:24 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-16 1:01 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 10/11/24 9:06 PM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
> print timestamps when the skb just passes the dev layer.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> include/uapi/linux/bpf.h | 5 +++++
> net/core/skbuff.c | 17 +++++++++++++++--
> tools/include/uapi/linux/bpf.h | 5 +++++
> 3 files changed, 25 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 157e139ed6fc..3cf3c9c896c7 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7019,6 +7019,11 @@ enum {
> * by the kernel or the
> * earlier bpf-progs.
> */
> + BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
> + * dev layer when SO_TIMESTAMPING
> + * feature is on. It indicates the
> + * recorded timestamp.
> + */
> };
>
> /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 3a4110d0f983..16e7bdc1eacb 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
> return;
>
> tp = tcp_sk(sk);
> - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
> - return;
> + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
> + struct timespec64 tstamp;
> + u32 cb_flag;
> +
> + switch (tstype) {
> + case SCM_TSTAMP_SCHED:
> + cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
> + break;
> + default:
> + return;
> + }
> +
> + tstamp = ktime_to_timespec64(ktime_get_real());
> + tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
There is bpf_ktime_get_*() helper. The bpf prog can directly call the
bpf_ktime_get_* helper and use whatever clock it sees fit instead of enforcing
real clock here and doing an extra ktime_to_timespec64. Right now the
bpf_ktime_get_*() does not have real clock which I think it can be added.
I think overall the tstamp reporting interface does not necessarily have to
follow the socket API. The bpf prog is running in the kernel. It could pass
other information to the bpf prog if it sees fit. e.g. the bpf prog could also
get the original transmitted tcp skb if it is useful.
> + }
> }
>
> void __skb_tstamp_tx(struct sk_buff *orig_skb,
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 93853d9d4922..d60675e1a5a0 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -7018,6 +7018,11 @@ enum {
> * by the kernel or the
> * earlier bpf-progs.
> */
> + BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
> + * dev layer when SO_TIMESTAMPING
> + * feature is on. It indicates the
> + * recorded timestamp.
> + */
> };
>
> /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp
2024-10-16 1:01 ` Martin KaFai Lau
@ 2024-10-16 1:24 ` Jason Xing
2024-10-16 5:35 ` Martin KaFai Lau
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-16 1:24 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Oct 16, 2024 at 9:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/11/24 9:06 PM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
> > print timestamps when the skb just passes the dev layer.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > include/uapi/linux/bpf.h | 5 +++++
> > net/core/skbuff.c | 17 +++++++++++++++--
> > tools/include/uapi/linux/bpf.h | 5 +++++
> > 3 files changed, 25 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 157e139ed6fc..3cf3c9c896c7 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -7019,6 +7019,11 @@ enum {
> > * by the kernel or the
> > * earlier bpf-progs.
> > */
> > + BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
> > + * dev layer when SO_TIMESTAMPING
> > + * feature is on. It indicates the
> > + * recorded timestamp.
> > + */
> > };
> >
> > /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 3a4110d0f983..16e7bdc1eacb 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
> > return;
> >
> > tp = tcp_sk(sk);
> > - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
> > - return;
> > + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
> > + struct timespec64 tstamp;
> > + u32 cb_flag;
> > +
> > + switch (tstype) {
> > + case SCM_TSTAMP_SCHED:
> > + cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
> > + break;
> > + default:
> > + return;
> > + }
> > +
> > + tstamp = ktime_to_timespec64(ktime_get_real());
> > + tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
>
> There is bpf_ktime_get_*() helper. The bpf prog can directly call the
> bpf_ktime_get_* helper and use whatever clock it sees fit instead of enforcing
> real clock here and doing an extra ktime_to_timespec64. Right now the
> bpf_ktime_get_*() does not have real clock which I think it can be added.
In this way, there is no need to add tcp_call_bpf_*arg() to pass
timestamp to userspace, right? Let the bpf program implement it.
Now I wonder what information I should pass? Sorry for the lack of BPF
related knowledge :(
>
> I think overall the tstamp reporting interface does not necessarily have to
> follow the socket API. The bpf prog is running in the kernel. It could pass
> other information to the bpf prog if it sees fit. e.g. the bpf prog could also
> get the original transmitted tcp skb if it is useful.
Good to know that! But how can the BPF program parse the skb when
tcp_call_bpf_2arg() only passes u32 parameters?
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp
2024-10-16 1:24 ` Jason Xing
@ 2024-10-16 5:35 ` Martin KaFai Lau
2024-10-16 6:08 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Martin KaFai Lau @ 2024-10-16 5:35 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 10/15/24 6:24 PM, Jason Xing wrote:
> On Wed, Oct 16, 2024 at 9:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 10/11/24 9:06 PM, Jason Xing wrote:
>>> From: Jason Xing <kernelxing@tencent.com>
>>>
>>> Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
>>> print timestamps when the skb just passes the dev layer.
>>>
>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
>>> ---
>>> include/uapi/linux/bpf.h | 5 +++++
>>> net/core/skbuff.c | 17 +++++++++++++++--
>>> tools/include/uapi/linux/bpf.h | 5 +++++
>>> 3 files changed, 25 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 157e139ed6fc..3cf3c9c896c7 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -7019,6 +7019,11 @@ enum {
>>> * by the kernel or the
>>> * earlier bpf-progs.
>>> */
>>> + BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
>>> + * dev layer when SO_TIMESTAMPING
>>> + * feature is on. It indicates the
>>> + * recorded timestamp.
>>> + */
>>> };
>>>
>>> /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index 3a4110d0f983..16e7bdc1eacb 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
>>> return;
>>>
>>> tp = tcp_sk(sk);
>>> - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
>>> - return;
>>> + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
>>> + struct timespec64 tstamp;
>>> + u32 cb_flag;
>>> +
>>> + switch (tstype) {
>>> + case SCM_TSTAMP_SCHED:
>>> + cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
>>> + break;
>>> + default:
>>> + return;
>>> + }
>>> +
>>> + tstamp = ktime_to_timespec64(ktime_get_real());
>>> + tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
>>
>> There is bpf_ktime_get_*() helper. The bpf prog can directly call the
>> bpf_ktime_get_* helper and use whatever clock it sees fit instead of enforcing
>> real clock here and doing an extra ktime_to_timespec64. Right now the
>> bpf_ktime_get_*() does not have real clock which I think it can be added.
>
> In this way, there is no need to add tcp_call_bpf_*arg() to pass
> timestamp to userspace, right? Let the bpf program implement it.
>
> Now I wonder what information I should pass? Sorry for the lack of BPF
> related knowledge :(
Just pass the cb_flag op in this case.
A bpf selftest is missing in this series to show how it is going to be used.
Yes, there are existing socket API tests on time stamping but I believe this
discussion has already shown some subtle differences that warrant a closer to
real world bpf prog example first.
>
>>
>> I think overall the tstamp reporting interface does not necessarily have to
>> follow the socket API. The bpf prog is running in the kernel. It could pass
>> other information to the bpf prog if it sees fit. e.g. the bpf prog could also
>> get the original transmitted tcp skb if it is useful.
>
> Good to know that! But how the BPF program parses the skb by using
> > tcp_call_bpf_2arg() which only passes u32 parameters.
"struct sk_buff *skb" has already been added to "struct bpf_sock_ops_kern". It is
only assigned during the "BPF_SOCK_OPS_PARSE_*HDR_CB". It is not exposed
directly to bpf prog but it could be. However, it may need to change some
convert_ctx code in filter.c which I am not excited about. We haven't added
convert_ctx changes for a while since it is the old way.
Together with the "u32 bpf_sock_ops_cb_flags;" change in patch 9 which is only
for tcp_sock and other _CB flags are also tcp specific only. For now, I am not
sure carrying this sockops to the future UDP support is desired.
Take a look at tcp_call_bpf(). It needs to initialize the whole "struct
bpf_sock_ops_kern" regardless of what the bpf prog needs before calling the
bpf prog. The "u32 args[4]" is one of them. This is the older way of using bpf to
extend the kernel.
bpf has struct_ops support now which can pass only what is needed and without
the need of doing the convert_ctx in filter.c. The "struct tcp_congestion_ops"
can already be implemented in bpf. Take a look at
selftests/bpf/progs/bpf_cubic.c. All the BPF_SOCK_OPS_*_CB (e.g.
BPF_SOCK_OPS_TS_SCHED_OPT_CB here) could just be an "ops" in the struct_ops.
That said, I think the first thing that needs to be figured out is how to enable bpf time
stamping without having side effects on the user space. Continue the sockops
approach first and use it to create a selftest bpf prog example. Then we can decide.
^ permalink raw reply [flat|nested] 73+ messages in thread* Re: [PATCH net-next v2 06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp
2024-10-16 5:35 ` Martin KaFai Lau
@ 2024-10-16 6:08 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-16 6:08 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Oct 16, 2024 at 1:35 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/15/24 6:24 PM, Jason Xing wrote:
> > On Wed, Oct 16, 2024 at 9:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 10/11/24 9:06 PM, Jason Xing wrote:
> >>> From: Jason Xing <kernelxing@tencent.com>
> >>>
> >>> Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
> >>> print timestamps when the skb just passes the dev layer.
> >>>
> >>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >>> ---
> >>> include/uapi/linux/bpf.h | 5 +++++
> >>> net/core/skbuff.c | 17 +++++++++++++++--
> >>> tools/include/uapi/linux/bpf.h | 5 +++++
> >>> 3 files changed, 25 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >>> index 157e139ed6fc..3cf3c9c896c7 100644
> >>> --- a/include/uapi/linux/bpf.h
> >>> +++ b/include/uapi/linux/bpf.h
> >>> @@ -7019,6 +7019,11 @@ enum {
> >>> * by the kernel or the
> >>> * earlier bpf-progs.
> >>> */
> >>> + BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
> >>> + * dev layer when SO_TIMESTAMPING
> >>> + * feature is on. It indicates the
> >>> + * recorded timestamp.
> >>> + */
> >>> };
> >>>
> >>> /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> >>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>> index 3a4110d0f983..16e7bdc1eacb 100644
> >>> --- a/net/core/skbuff.c
> >>> +++ b/net/core/skbuff.c
> >>> @@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
> >>> return;
> >>>
> >>> tp = tcp_sk(sk);
> >>> - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
> >>> - return;
> >>> + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
> >>> + struct timespec64 tstamp;
> >>> + u32 cb_flag;
> >>> +
> >>> + switch (tstype) {
> >>> + case SCM_TSTAMP_SCHED:
> >>> + cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
> >>> + break;
> >>> + default:
> >>> + return;
> >>> + }
> >>> +
> >>> + tstamp = ktime_to_timespec64(ktime_get_real());
> >>> + tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
> >>
> >> There is bpf_ktime_get_*() helper. The bpf prog can directly call the
> >> bpf_ktime_get_* helper and use whatever clock it sees fit instead of enforcing
> >> real clock here and doing an extra ktime_to_timespec64. Right now the
> >> bpf_ktime_get_*() does not have real clock which I think it can be added.
> >
> > In this way, there is no need to add tcp_call_bpf_*arg() to pass
> > timestamp to userspace, right? Let the bpf program implement it.
> >
> > Now I wonder what information I should pass? Sorry for the lack of BPF
> > related knowledge :(
>
> Just pass the cb_flag op in this case.
I see. I saw one example just passing a NULL parameter:
tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);.
>
> A bpf selftest is missing in this series to show how it is going to be used.
Sorry, I didn't implement a standard selftest, but I wrote a full BPF
program in patch[0/12]. I plan to write a selftest once every
expert agrees with the current approach.
> Yes, there are existing socket API tests on time stamping but I believe this
> discussion has already shown some subtle differences that warrant a closer to
> real world bpf prog example first.
>
> >
> >>
> >> I think overall the tstamp reporting interface does not necessarily have to
> >> follow the socket API. The bpf prog is running in the kernel. It could pass
> >> other information to the bpf prog if it sees fit. e.g. the bpf prog could also
> >> get the original transmitted tcp skb if it is useful.
> >
> > Good to know that! But how the BPF program parses the skb by using
> > tcp_call_bpf_2arg() which only passes u32 parameters.
>
> "struct skbuff *skb" has already been added to "struct bpf_sock_ops_kern". It is
> only assigned during the "BPF_SOCK_OPS_PARSE_*HDR_CB". It is not exposed
> directly to bpf prog but it could be. However, it may need to change some
> convert_ctx code in filter.c which I am not excited about. We haven't added
> convert_ctx changes for a while since it is the old way.
>
> Together with the "u32 bpf_sock_ops_cb_flags;" change in patch 9 which is only
> for tcp_sock and other _CB flags are also tcp specific only. For now, I am not
Right, the first move I made is to make TCP work.
> sure carrying this sockops to the future UDP support is desired.
I hope so. But it's not an urgent thing that needs to be done recently.
>
> Take a look at tcp_call_bpf(). It needs to initialize the whole "struct
> bpf_sock_ops_kern" regardless of what the bpf prog is needed before calling the
> bpf prog. The "u32 args[4]" is one of them. The is the older way of using bpf to
> extend kernel.
I see.
>
> bpf has struct_ops support now which can pass only what is needed and without
> the need of doing the convert_ctx in filter.c. The "struct tcp_congestion_ops"
> can already be implemented in bpf. Take a look at
> selftests/bpf/progs/bpf_cubic.c. All the BPF_SOCK_OPS_*_CB (e.g.
> BPF_SOCK_OPS_TS_SCHED_OPT_CB here) could just a "ops" in the struct_ops.
Interesting, but it seems this way is much more complex than the
current approach.
>
> That said, I think the first thing needs to figure out is how to enable bpf time
> stamping without having side effect on the user space.
In the next version, I will avoid affecting the cmsg case, so no more
side effects I think.
> Continue the sockops approach first
I'm a little hesitant to do so because it looks like we will introduce
more code. Please let me investigate more :)
> and use it to create a selftest bpf prog example. Then we can decide.
I copy the BPF program from patch [0/12], please take a look and help
me review this:
---
Here is the test output:
1) receive path
iperf3-987305 [008] ...11 179955.200990: bpf_trace_printk: rx: port:
5201:55192, swtimestamp: 1728167973,670426346, hwtimestamp: 0,0
2) xmit path
iperf3-19765 [013] ...11 2021.329602: bpf_trace_printk: tx: port:
47528:5201, key: 1036, timestamp: 1728357067,436678584
iperf3-19765 [013] b..11 2021.329611: bpf_trace_printk: tx: port:
47528:5201, key: 1036, timestamp: 1728357067,436689976
iperf3-19765 [013] ...11 2021.329622: bpf_trace_printk: tx: port:
47528:5201, key: 1036, timestamp: 1728357067,436700739
Here is the full bpf program:
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include <uapi/linux/net_tstamp.h>
int _version SEC("version") = 1;
char _license[] SEC("license") = "GPL";
# define SO_TIMESTAMPING 37
/*
 * sockops program demonstrating the BPF timestamping extension.
 *
 * On the connect and passive-established callbacks it sets SO_TIMESTAMPING
 * flags via bpf_setsockopt() and turns on the tx/rx timestamping callback
 * flags via bpf_sock_ops_cb_flags_set(). On each timestamping callback it
 * prints the local/remote ports together with the values found in
 * skops->args[].
 *
 * Return values of bpf_setsockopt() and bpf_sock_ops_cb_flags_set() are not
 * checked. The program always returns 1.
 */
SEC("sockops")
int set_initial_rto(struct bpf_sock_ops *skops)
{
	int op = (int) skops->op;
	__u32 sport = 0, dport = 0;
	int flags;

	switch (op) {
	//case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_TCP_CONNECT_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Request rx software timestamps plus the three tx points
		 * (sched, software, ack), with OPT_ID/OPT_ID_TCP keys.
		 */
		flags = SOF_TIMESTAMPING_RX_SOFTWARE |
			SOF_TIMESTAMPING_TX_SCHED |
			SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_TX_SOFTWARE |
			SOF_TIMESTAMPING_OPT_ID | SOF_TIMESTAMPING_OPT_ID_TCP;
		bpf_setsockopt(skops, SOL_SOCKET, SO_TIMESTAMPING,
			       &flags, sizeof(flags));
		bpf_sock_ops_cb_flags_set(skops,
			BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG |
			BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
	case BPF_SOCK_OPS_TS_SW_OPT_CB:
	case BPF_SOCK_OPS_TS_ACK_OPT_CB:
		/* remote_port is converted with bpf_ntohl(), local_port is
		 * used as-is (presumably network vs. host byte order; verify
		 * against the bpf_sock_ops documentation).
		 */
		dport = bpf_ntohl(skops->remote_port);
		sport = skops->local_port;
		/* args[0] is printed as the key, args[1]/args[2] as the
		 * two halves of the timestamp.
		 */
		bpf_printk("tx: port: %u:%u, key: %u, timestamp: %u,%u\n",
			   sport, dport, skops->args[0],
			   skops->args[1], skops->args[2]);
		break;
	case BPF_SOCK_OPS_TS_RX_OPT_CB:
		dport = bpf_ntohl(skops->remote_port);
		sport = skops->local_port;
		/* args[0]/args[1] are printed as the software timestamp,
		 * args[2]/args[3] as the hardware timestamp. The format
		 * string must stay on one line: a string literal cannot span
		 * a raw newline.
		 */
		bpf_printk("rx: port: %u:%u, swtimestamp: %u,%u, hwtimestamp: %u,%u\n",
			   sport, dport, skops->args[0],
			   skops->args[1], skops->args[2], skops->args[3]);
		break;
	}
	return 1;
}
---
What is your opinion on the above?
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* [PATCH net-next v2 07/12] net-timestamp: introduce TS_SW_OPT_CB to generate driver timestamp
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (5 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-12 4:06 ` [PATCH net-next v2 08/12] net-timestamp: introduce TS_ACK_OPT_CB to generate tcp acked timestamp Jason Xing
` (5 subsequent siblings)
12 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
When the skb is about to be sent from the driver to the NIC, we can print the timestamp
by setting BPF_SOCK_OPS_TS_SW_OPT_CB in the bpf program.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/uapi/linux/bpf.h | 5 +++++
net/core/skbuff.c | 13 ++++++++++---
tools/include/uapi/linux/bpf.h | 5 +++++
3 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3cf3c9c896c7..0d00539f247a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7024,6 +7024,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_SW_OPT_CB, /* Called when skb is about to send
+ * to the nic when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 16e7bdc1eacb..832d53de9874 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5619,7 +5619,8 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
-static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
+static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype,
+ struct skb_shared_hwtstamps *hwtstamps)
{
struct tcp_sock *tp;
u32 tsflags;
@@ -5640,11 +5641,17 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
case SCM_TSTAMP_SCHED:
cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
break;
+ case SCM_TSTAMP_SND:
+ cb_flag = BPF_SOCK_OPS_TS_SW_OPT_CB;
+ break;
default:
return;
}
- tstamp = ktime_to_timespec64(ktime_get_real());
+ if (hwtstamps)
+ tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
+ else
+ tstamp = ktime_to_timespec64(ktime_get_real());
tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
}
}
@@ -5658,7 +5665,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
return;
if (static_branch_unlikely(&bpf_tstamp_control))
- bpf_skb_tstamp_tx_output(sk, tstype);
+ bpf_skb_tstamp_tx_output(sk, tstype, hwtstamps);
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d60675e1a5a0..020ec14ffae6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7023,6 +7023,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_SW_OPT_CB, /* Called when skb is about to send
+ * to the nic when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread* [PATCH net-next v2 08/12] net-timestamp: introduce TS_ACK_OPT_CB to generate tcp acked timestamp
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (6 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 07/12] net-timestamp: introduce TS_SW_OPT_CB to generate driver timestamp Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-12 4:06 ` [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case Jason Xing
` (4 subsequent siblings)
12 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
When the last sent skb in each sendmsg() is acknowledged in TCP layer,
we can print timestamp by setting BPF_SOCK_OPS_TS_ACK_OPT_CB in
bpf program.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/uapi/linux/bpf.h | 5 +++++
net/core/skbuff.c | 3 +++
tools/include/uapi/linux/bpf.h | 5 +++++
3 files changed, 13 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0d00539f247a..1b478ec18ac2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7029,6 +7029,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_ACK_OPT_CB, /* Called when all the skbs are
+ * acknowledged when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 832d53de9874..e18305b03a01 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5644,6 +5644,9 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype,
case SCM_TSTAMP_SND:
cb_flag = BPF_SOCK_OPS_TS_SW_OPT_CB;
break;
+ case SCM_TSTAMP_ACK:
+ cb_flag = BPF_SOCK_OPS_TS_ACK_OPT_CB;
+ break;
default:
return;
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 020ec14ffae6..fc9b94de19f2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7028,6 +7028,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_ACK_OPT_CB, /* Called when all the skbs are
+ * acknowledged when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread* [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (7 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 08/12] net-timestamp: introduce TS_ACK_OPT_CB to generate tcp acked timestamp Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-15 1:38 ` Willem de Bruijn
2024-10-15 8:40 ` kernel test robot
2024-10-12 4:06 ` [PATCH net-next v2 10/12] net-timestamp: make bpf for tx timestamp work Jason Xing
` (3 subsequent siblings)
12 siblings, 2 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
We can set OPT_ID|OPT_ID_TCP before we initialize the last skb
from each sendmsg. We only set the socket once like how we use
setsockopt() with OPT_ID|OPT_ID_TCP flags.
Note: we will check if non-bpf _and_ bpf sk_tsflags have OPT_ID
flag. If either of them has been set before, we will not initialize
the key any more, or else it will affect the existing printing
from applications or BPF program behaviour.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/sock.h | 1 +
net/core/filter.c | 5 +++++
net/core/skbuff.c | 14 ++++++++++----
net/core/sock.c | 29 +++++++++++++++++++++--------
4 files changed, 37 insertions(+), 12 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index b7c51b95c92d..2b4ac289c8fa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2893,6 +2893,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_tstamp_control);
void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
int sock_get_timestamping(struct so_timestamping *timestamping,
sockptr_t optval, unsigned int optlen);
+int sock_set_tskey(struct sock *sk, int val, int type);
int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
diff --git a/net/core/filter.c b/net/core/filter.c
index 08135f538c99..3b4afaa273d9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5210,6 +5210,7 @@ static int bpf_sock_set_timestamping(struct sock *sk,
struct so_timestamping *timestamping)
{
u32 flags = timestamping->flags;
+ int ret;
if (flags & ~SOF_TIMESTAMPING_MASK)
return -EINVAL;
@@ -5218,6 +5219,10 @@ static int bpf_sock_set_timestamping(struct sock *sk,
SOF_TIMESTAMPING_TX_ACK)))
return -EINVAL;
+ ret = sock_set_tskey(sk, flags, BPFPROG_TS_REQUESTOR);
+ if (ret)
+ return ret;
+
WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
static_branch_enable(&bpf_tstamp_control);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e18305b03a01..1ef379a87f88 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5619,7 +5619,7 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
-static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype,
+static void bpf_skb_tstamp_tx_output(struct sock *sk, struct sk_buff *skb, int tstype,
struct skb_shared_hwtstamps *hwtstamps)
{
struct tcp_sock *tp;
@@ -5635,7 +5635,7 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype,
tp = tcp_sk(sk);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
struct timespec64 tstamp;
- u32 cb_flag;
+ u32 cb_flag, key = 0;
switch (tstype) {
case SCM_TSTAMP_SCHED:
@@ -5651,11 +5651,17 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype,
return;
}
+ if (sk_is_tcp(sk)) {
+ key = skb_shinfo(skb)->tskey;
+ key -= atomic_read(&sk->sk_tskey);
+ }
+
if (hwtstamps)
tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
else
tstamp = ktime_to_timespec64(ktime_get_real());
- tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
+
+ tcp_call_bpf_3arg(sk, cb_flag, key, tstamp.tv_sec, tstamp.tv_nsec);
}
}
@@ -5668,7 +5674,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
return;
if (static_branch_unlikely(&bpf_tstamp_control))
- bpf_skb_tstamp_tx_output(sk, tstype, hwtstamps);
+ bpf_skb_tstamp_tx_output(sk, orig_skb, tstype, hwtstamps);
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
}
diff --git a/net/core/sock.c b/net/core/sock.c
index a6e0d51a5f72..c15edbd382d5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -915,21 +915,18 @@ int sock_get_timestamping(struct so_timestamping *timestamping,
return 0;
}
-int sock_set_timestamping(struct sock *sk, int optname,
- struct so_timestamping timestamping)
+int sock_set_tskey(struct sock *sk, int val, int type)
{
- int val = timestamping.flags;
- int ret;
-
- if (val & ~SOF_TIMESTAMPING_MASK)
- return -EINVAL;
+ u32 tsflags;
if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
!(val & SOF_TIMESTAMPING_OPT_ID))
return -EINVAL;
+ tsflags |= (sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] |
+ sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
if (val & SOF_TIMESTAMPING_OPT_ID &&
- !(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] & SOF_TIMESTAMPING_OPT_ID)) {
+ !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
if (sk_is_tcp(sk)) {
if ((1 << sk->sk_state) &
(TCPF_CLOSE | TCPF_LISTEN))
@@ -943,6 +940,22 @@ int sock_set_timestamping(struct sock *sk, int optname,
}
}
+ return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+ struct so_timestamping timestamping)
+{
+ int val = timestamping.flags;
+ int ret;
+
+ if (val & ~SOF_TIMESTAMPING_MASK)
+ return -EINVAL;
+
+ ret = sock_set_tskey(sk, val, SOCKETOPT_TS_REQUESTOR);
+ if (ret)
+ return ret;
+
if (val & SOF_TIMESTAMPING_OPT_STATS &&
!(val & SOF_TIMESTAMPING_OPT_TSONLY))
return -EINVAL;
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
2024-10-12 4:06 ` [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case Jason Xing
@ 2024-10-15 1:38 ` Willem de Bruijn
2024-10-15 2:25 ` Jason Xing
2024-10-15 8:40 ` kernel test robot
1 sibling, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 1:38 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> We can set OPT_ID|OPT_ID_TCP before we initialize the last skb
> from each sendmsg. We only set the socket once like how we use
> setsockopt() with OPT_ID|OPT_ID_TCP flags.
>
> Note: we will check if non-bpf _and_ bpf sk_tsflags have OPT_ID
> flag. If either of them has been set before, we will not initialize
> the key any more,
Where and how is this achieved?
Also be aware of the subtle distinction between passing OPT_ID_TCP
along with OPT_ID or not.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
2024-10-15 1:38 ` Willem de Bruijn
@ 2024-10-15 2:25 ` Jason Xing
2024-10-15 2:38 ` Willem de Bruijn
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-15 2:25 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 9:38 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > We can set OPT_ID|OPT_ID_TCP before we initialize the last skb
> > from each sendmsg. We only set the socket once like how we use
> > setsockopt() with OPT_ID|OPT_ID_TCP flags.
> >
> > Note: we will check if non-bpf _and_ bpf sk_tsflags have OPT_ID
> > flag. If either of them has been set before, we will not initialize
> > the key any more,
>
> Where and how is this achieved?
Please see this patch, where you will find the following code:
+ tsflags |= (sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] |
+ sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
But the difference/problem is that the non-bpf feature only init it
when connect() is done, but the bpf feature could do it at the
beginning of connect(). If running txtimestamp -l 1000, the former
will generate 999 for tskey while the latter will generate 1000.
>
> Also be aware of the subtle distinction between passing OPT_ID_TCP
> along with OPT_ID or not.
>
>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
2024-10-15 2:25 ` Jason Xing
@ 2024-10-15 2:38 ` Willem de Bruijn
2024-10-15 2:59 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 2:38 UTC (permalink / raw)
To: Jason Xing, Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
Jason Xing wrote:
> On Tue, Oct 15, 2024 at 9:38 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Xing wrote:
> > > From: Jason Xing <kernelxing@tencent.com>
> > >
> > > We can set OPT_ID|OPT_ID_TCP before we initialize the last skb
> > > from each sendmsg. We only set the socket once like how we use
> > > setsockopt() with OPT_ID|OPT_ID_TCP flags.
> > >
> > > Note: we will check if non-bpf _and_ bpf sk_tsflags have OPT_ID
> > > flag. If either of them has been set before, we will not initialize
> > > the key any more,
> >
> > Where and how is this achieved?
>
> Please see this patch and you will find the following codes.
> + tsflags |= (sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] |
> + sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
I saw that, but it's not a condition that stops reinitializing. Which
I think is the intent, based on "If either of them has been set
before, we will not initialize the key anymore"?
Reinitialization is actually supported behavior.
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
But the sk_tsflags bit may be repeatedly set and cleared.
Anyway, the current patch sets it if either requests it?
+ tsflags |= (sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] |
+ sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
if (val & SOF_TIMESTAMPING_OPT_ID &&
- !(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] & SOF_TIMESTAMPING_OPT_ID)) {
+ !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> But the difference/problem is that the non-bpf feature only init it
> when connect() is done, but the bpf feature could do it at the
> beginning of connect(). If running txtimestamp -l 1000, the former
> will generate 999 for tskey while the latter 1000.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
2024-10-15 2:38 ` Willem de Bruijn
@ 2024-10-15 2:59 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-15 2:59 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 10:38 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Tue, Oct 15, 2024 at 9:38 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Jason Xing wrote:
> > > > From: Jason Xing <kernelxing@tencent.com>
> > > >
> > > > We can set OPT_ID|OPT_ID_TCP before we initialize the last skb
> > > > from each sendmsg. We only set the socket once like how we use
> > > > setsockopt() with OPT_ID|OPT_ID_TCP flags.
> > > >
> > > > Note: we will check if non-bpf _and_ bpf sk_tsflags have OPT_ID
> > > > flag. If either of them has been set before, we will not initialize
> > > > the key any more,
> > >
> > > Where and how is this achieved?
> >
> > Please see this patch and you will find the following codes.
> > + tsflags |= (sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] |
> > + sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
>
> I saw that, but it's not a condition that stops reinitializing. Which
> I think is the intent, based on "If either of them has been set
> before, we will not initialize the key anymore"?
Yep, based on that sentence. If we find sk_tsflags is initialized,
then we will not do the same thing to sk_tskey again when we use bpf
method.
>
> Reinitialization is actually supported behavior.
>
> if (val & SOF_TIMESTAMPING_OPT_ID &&
> !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
>
> But the sk_tsflags bit may be repeatedly set and cleared.
This line "!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {" was removed
and replaced in the new function sock_set_tskey(). So it could avoid
re-initialization.
>
> Anyway, the current patch sets it if either requests it?
Yep, either of the ways (bpf and non-bpf) can init it.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
2024-10-12 4:06 ` [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case Jason Xing
2024-10-15 1:38 ` Willem de Bruijn
@ 2024-10-15 8:40 ` kernel test robot
2024-10-15 9:36 ` Jason Xing
1 sibling, 1 reply; 73+ messages in thread
From: kernel test robot @ 2024-10-15 8:40 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: llvm, oe-kbuild-all, bpf, netdev, Jason Xing
Hi Jason,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Jason-Xing/net-timestamp-introduce-socket-tsflag-requestors/20241012-121010
base: net-next/main
patch link: https://lore.kernel.org/r/20241012040651.95616-10-kerneljasonxing%40gmail.com
patch subject: [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20241015/202410151628.hcAdeahi-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241015/202410151628.hcAdeahi-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410151628.hcAdeahi-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> net/core/sock.c:926:2: warning: variable 'tsflags' is uninitialized when used here [-Wuninitialized]
926 | tsflags |= (sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] |
| ^~~~~~~
net/core/sock.c:920:13: note: initialize the variable 'tsflags' to silence this warning
920 | u32 tsflags;
| ^
| = 0
1 warning generated.
vim +/tsflags +926 net/core/sock.c
917
918 int sock_set_tskey(struct sock *sk, int val, int type)
919 {
920 u32 tsflags;
921
922 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
923 !(val & SOF_TIMESTAMPING_OPT_ID))
924 return -EINVAL;
925
> 926 tsflags |= (sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] |
927 sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
928 if (val & SOF_TIMESTAMPING_OPT_ID &&
929 !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
930 if (sk_is_tcp(sk)) {
931 if ((1 << sk->sk_state) &
932 (TCPF_CLOSE | TCPF_LISTEN))
933 return -EINVAL;
934 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
935 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
936 else
937 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
938 } else {
939 atomic_set(&sk->sk_tskey, 0);
940 }
941 }
942
943 return 0;
944 }
945
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
2024-10-15 8:40 ` kernel test robot
@ 2024-10-15 9:36 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-15 9:36 UTC (permalink / raw)
To: kernel test robot
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, llvm,
oe-kbuild-all, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 4:41 PM kernel test robot <lkp@intel.com> wrote:
>
> Hi Jason,
>
> kernel test robot noticed the following build warnings:
>
> [auto build test WARNING on net-next/main]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Jason-Xing/net-timestamp-introduce-socket-tsflag-requestors/20241012-121010
> base: net-next/main
> patch link: https://lore.kernel.org/r/20241012040651.95616-10-kerneljasonxing%40gmail.com
> patch subject: [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case
> config: x86_64-kexec (https://download.01.org/0day-ci/archive/20241015/202410151628.hcAdeahi-lkp@intel.com/config)
> compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241015/202410151628.hcAdeahi-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202410151628.hcAdeahi-lkp@intel.com/
>
> All warnings (new ones prefixed by >>):
>
> >> net/core/sock.c:926:2: warning: variable 'tsflags' is uninitialized when used here [-Wuninitialized]
> 926 | tsflags |= (sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR] |
> | ^~~~~~~
> net/core/sock.c:920:13: note: initialize the variable 'tsflags' to silence this warning
> 920 | u32 tsflags;
> | ^
> | = 0
> 1 warning generated.
Thanks! I will fix it!
^ permalink raw reply [flat|nested] 73+ messages in thread
* [PATCH net-next v2 10/12] net-timestamp: make bpf for tx timestamp work
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (8 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 09/12] net-timestamp: add tx OPT_ID_TCP support for bpf case Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-12 4:06 ` [PATCH net-next v2 11/12] net-timestamp: add bpf framework for rx timestamps Jason Xing
` (2 subsequent siblings)
12 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
It's time to let the bpf extension feature work. I extracted part
of the logic from tcp_tx_timestamp() for use by the bpf extension,
such as setting the TX timestamp flags.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
net/ipv4/tcp.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c8968eb4427..d37e231b2737 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -477,11 +477,31 @@ void tcp_init_sock(struct sock *sk)
}
EXPORT_SYMBOL(tcp_init_sock);
+static void bpf_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+ u32 tsflags = READ_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
+
+ if (tsflags && skb) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ __sock_tx_timestamp(tsflags, &shinfo->tx_flags);
+
+ if (tsflags & SOF_TIMESTAMPING_TX_ACK)
+ tcb->txstamp_ack = 1;
+ if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
+ shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+ }
+}
+
static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
{
struct sk_buff *skb = tcp_write_queue_tail(sk);
u32 tsflags = sockc->tsflags;
+ if (static_branch_unlikely(&bpf_tstamp_control))
+ bpf_tx_timestamp(sk, skb);
+
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH net-next v2 11/12] net-timestamp: add bpf framework for rx timestamps
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (9 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 10/12] net-timestamp: make bpf for tx timestamp work Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-15 1:44 ` Willem de Bruijn
2024-10-12 4:06 ` [PATCH net-next v2 12/12] net-timestamp: add bpf support for rx software/hardware timestamp Jason Xing
2024-10-12 17:48 ` [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Willem de Bruijn
12 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Prepare for later changes in this series. Here I use u32 for
bpf_sock_ops_cb_flags for better extension and introduce a new
rx bpf flag to control separately.
The main change is to let the user side set the SO_TIMESTAMPING
feature through bpf_setsockopt().
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/linux/tcp.h | 2 +-
include/net/tcp.h | 2 +-
include/uapi/linux/bpf.h | 5 ++++-
net/core/filter.c | 6 +++++-
net/ipv4/tcp.c | 13 ++++++++++++-
tools/include/uapi/linux/bpf.h | 5 ++++-
6 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6a5e08b937b3..e21fd3035962 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -446,7 +446,7 @@ struct tcp_sock {
/* Sock_ops bpf program related variables */
#ifdef CONFIG_BPF
- u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
+ u32 bpf_sock_ops_cb_flags; /* Control calling BPF programs
* values defined in uapi/linux/tcp.h
*/
u8 bpf_chg_cc_inprogress:1; /* In the middle of
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 739a9fb83d0c..728db7107074 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -423,7 +423,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val);
int tcp_set_window_clamp(struct sock *sk, int val);
void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping_internal *tss);
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+void tcp_recv_timestamp(struct msghdr *msg, struct sock *sk,
struct scm_timestamping_internal *tss);
void tcp_data_ready(struct sock *sk);
#ifdef CONFIG_MMU
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1b478ec18ac2..d2754f155cf7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6903,8 +6903,11 @@ enum {
/* Call bpf when the kernel is generating tx timestamps.
*/
BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG = (1<<7),
+ /* Call bpf when the kernel is generating rx timestamps.
+ */
+ BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG = (1<<8),
/* Mask of all currently supported cb flags */
- BPF_SOCK_OPS_ALL_CB_FLAGS = 0xFF,
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x1FF,
};
/* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 3b4afaa273d9..36b357b76f4a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5216,14 +5216,18 @@ static int bpf_sock_set_timestamping(struct sock *sk,
return -EINVAL;
if (!(flags & (SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
- SOF_TIMESTAMPING_TX_ACK)))
+ SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_RX_SOFTWARE)))
return -EINVAL;
ret = sock_set_tskey(sk, flags, BPFPROG_TS_REQUESTOR);
if (ret)
return ret;
+ if (flags & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
+
WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
+
static_branch_enable(&bpf_tstamp_control);
return 0;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d37e231b2737..0891b41bc745 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2260,14 +2260,25 @@ static int tcp_zerocopy_receive(struct sock *sk,
}
#endif
+static void tcp_bpf_recv_timestamp(struct sock *sk, struct scm_timestamping_internal *tss)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG))
+ return;
+}
+
/* Similar to __sock_recv_timestamp, but does not require an skb */
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+void tcp_recv_timestamp(struct msghdr *msg, struct sock *sk,
struct scm_timestamping_internal *tss)
{
int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
u32 tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
bool has_timestamping = false;
+ if (static_branch_unlikely(&bpf_tstamp_control))
+ tcp_bpf_recv_timestamp(sk, tss);
+
if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
if (sock_flag(sk, SOCK_RCVTSTAMP)) {
if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fc9b94de19f2..331e3e6f1ed5 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6902,8 +6902,11 @@ enum {
/* Call bpf when the kernel is generating tx timestamps.
*/
BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG = (1<<7),
+ /* Call bpf when the kernel is generating rx timestamps.
+ */
+ BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG = (1<<8),
/* Mask of all currently supported cb flags */
- BPF_SOCK_OPS_ALL_CB_FLAGS = 0xFF,
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x1FF,
};
/* List of known BPF sock_ops operators.
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 11/12] net-timestamp: add bpf framework for rx timestamps
2024-10-12 4:06 ` [PATCH net-next v2 11/12] net-timestamp: add bpf framework for rx timestamps Jason Xing
@ 2024-10-15 1:44 ` Willem de Bruijn
2024-10-15 2:18 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 1:44 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Prepare for later changes in this series. Here I use u32 for
> bpf_sock_ops_cb_flags for better extension and introduce a new
> rx bpf flag to control separately.
>
> Main change is let userside set through bpf_setsockopt() for
> SO_TIMESTAMPING feature.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> include/linux/tcp.h | 2 +-
> include/net/tcp.h | 2 +-
> include/uapi/linux/bpf.h | 5 ++++-
> net/core/filter.c | 6 +++++-
> net/ipv4/tcp.c | 13 ++++++++++++-
> tools/include/uapi/linux/bpf.h | 5 ++++-
> 6 files changed, 27 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 6a5e08b937b3..e21fd3035962 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -446,7 +446,7 @@ struct tcp_sock {
>
> /* Sock_ops bpf program related variables */
> #ifdef CONFIG_BPF
> - u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
> + u32 bpf_sock_ops_cb_flags; /* Control calling BPF programs
> * values defined in uapi/linux/tcp.h
> */
> u8 bpf_chg_cc_inprogress:1; /* In the middle of
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 739a9fb83d0c..728db7107074 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -423,7 +423,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val);
> int tcp_set_window_clamp(struct sock *sk, int val);
> void tcp_update_recv_tstamps(struct sk_buff *skb,
> struct scm_timestamping_internal *tss);
> -void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
> +void tcp_recv_timestamp(struct msghdr *msg, struct sock *sk,
> struct scm_timestamping_internal *tss);
> void tcp_data_ready(struct sock *sk);
> #ifdef CONFIG_MMU
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 1b478ec18ac2..d2754f155cf7 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6903,8 +6903,11 @@ enum {
> /* Call bpf when the kernel is generating tx timestamps.
> */
> BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG = (1<<7),
> + /* Call bpf when the kernel is generating rx timestamps.
> + */
> + BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG = (1<<8),
> /* Mask of all currently supported cb flags */
> - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xFF,
> + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x1FF,
> };
>
> /* List of known BPF sock_ops operators.
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 3b4afaa273d9..36b357b76f4a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5216,14 +5216,18 @@ static int bpf_sock_set_timestamping(struct sock *sk,
> return -EINVAL;
>
> if (!(flags & (SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
> - SOF_TIMESTAMPING_TX_ACK)))
> + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_RX_SOFTWARE)))
> return -EINVAL;
>
> ret = sock_set_tskey(sk, flags, BPFPROG_TS_REQUESTOR);
> if (ret)
> return ret;
>
> + if (flags & SOF_TIMESTAMPING_RX_SOFTWARE)
> + sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
> +
> WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> +
> static_branch_enable(&bpf_tstamp_control);
>
> return 0;
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index d37e231b2737..0891b41bc745 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -2260,14 +2260,25 @@ static int tcp_zerocopy_receive(struct sock *sk,
> }
> #endif
>
> +static void tcp_bpf_recv_timestamp(struct sock *sk, struct scm_timestamping_internal *tss)
> +{
> + struct tcp_sock *tp = tcp_sk(sk);
> +
> + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG))
> + return;
> +}
> +
> /* Similar to __sock_recv_timestamp, but does not require an skb */
> -void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
> +void tcp_recv_timestamp(struct msghdr *msg, struct sock *sk,
> struct scm_timestamping_internal *tss)
> {
> int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
> u32 tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
> bool has_timestamping = false;
>
> + if (static_branch_unlikely(&bpf_tstamp_control))
> + tcp_bpf_recv_timestamp(sk, tss);
> +
> if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
> if (sock_flag(sk, SOCK_RCVTSTAMP)) {
> if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
tcp_recv_timestamp is called from tcp_recvmsg only conditionally:
if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
if (cmsg_flags & TCP_CMSG_TS)
tcp_recv_timestamp(msg, sk, &tss);
How do you get this triggered for your BPF program?
And also check the other caller, tcp_zc_finalize_rx_tstamp.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 11/12] net-timestamp: add bpf framework for rx timestamps
2024-10-15 1:44 ` Willem de Bruijn
@ 2024-10-15 2:18 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-15 2:18 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 9:44 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Prepare for later changes in this series. Here I use u32 for
> > bpf_sock_ops_cb_flags for better extension and introduce a new
> > rx bpf flag to control separately.
> >
> > Main change is let userside set through bpf_setsockopt() for
> > SO_TIMESTAMPING feature.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > include/linux/tcp.h | 2 +-
> > include/net/tcp.h | 2 +-
> > include/uapi/linux/bpf.h | 5 ++++-
> > net/core/filter.c | 6 +++++-
> > net/ipv4/tcp.c | 13 ++++++++++++-
> > tools/include/uapi/linux/bpf.h | 5 ++++-
> > 6 files changed, 27 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> > index 6a5e08b937b3..e21fd3035962 100644
> > --- a/include/linux/tcp.h
> > +++ b/include/linux/tcp.h
> > @@ -446,7 +446,7 @@ struct tcp_sock {
> >
> > /* Sock_ops bpf program related variables */
> > #ifdef CONFIG_BPF
> > - u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
> > + u32 bpf_sock_ops_cb_flags; /* Control calling BPF programs
> > * values defined in uapi/linux/tcp.h
> > */
> > u8 bpf_chg_cc_inprogress:1; /* In the middle of
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 739a9fb83d0c..728db7107074 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -423,7 +423,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val);
> > int tcp_set_window_clamp(struct sock *sk, int val);
> > void tcp_update_recv_tstamps(struct sk_buff *skb,
> > struct scm_timestamping_internal *tss);
> > -void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
> > +void tcp_recv_timestamp(struct msghdr *msg, struct sock *sk,
> > struct scm_timestamping_internal *tss);
> > void tcp_data_ready(struct sock *sk);
> > #ifdef CONFIG_MMU
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 1b478ec18ac2..d2754f155cf7 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -6903,8 +6903,11 @@ enum {
> > /* Call bpf when the kernel is generating tx timestamps.
> > */
> > BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG = (1<<7),
> > + /* Call bpf when the kernel is generating rx timestamps.
> > + */
> > + BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG = (1<<8),
> > /* Mask of all currently supported cb flags */
> > - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xFF,
> > + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x1FF,
> > };
> >
> > /* List of known BPF sock_ops operators.
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 3b4afaa273d9..36b357b76f4a 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -5216,14 +5216,18 @@ static int bpf_sock_set_timestamping(struct sock *sk,
> > return -EINVAL;
> >
> > if (!(flags & (SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
> > - SOF_TIMESTAMPING_TX_ACK)))
> > + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_RX_SOFTWARE)))
> > return -EINVAL;
> >
> > ret = sock_set_tskey(sk, flags, BPFPROG_TS_REQUESTOR);
> > if (ret)
> > return ret;
> >
> > + if (flags & SOF_TIMESTAMPING_RX_SOFTWARE)
> > + sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
> > +
> > WRITE_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR], flags);
> > +
> > static_branch_enable(&bpf_tstamp_control);
> >
> > return 0;
> > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> > index d37e231b2737..0891b41bc745 100644
> > --- a/net/ipv4/tcp.c
> > +++ b/net/ipv4/tcp.c
> > @@ -2260,14 +2260,25 @@ static int tcp_zerocopy_receive(struct sock *sk,
> > }
> > #endif
> >
> > +static void tcp_bpf_recv_timestamp(struct sock *sk, struct scm_timestamping_internal *tss)
> > +{
> > + struct tcp_sock *tp = tcp_sk(sk);
> > +
> > + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG))
> > + return;
> > +}
> > +
> > /* Similar to __sock_recv_timestamp, but does not require an skb */
> > -void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
> > +void tcp_recv_timestamp(struct msghdr *msg, struct sock *sk,
> > struct scm_timestamping_internal *tss)
> > {
> > int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
> > u32 tsflags = READ_ONCE(sk->sk_tsflags[SOCKETOPT_TS_REQUESTOR]);
> > bool has_timestamping = false;
> >
> > + if (static_branch_unlikely(&bpf_tstamp_control))
> > + tcp_bpf_recv_timestamp(sk, tss);
> > +
> > if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
> > if (sock_flag(sk, SOCK_RCVTSTAMP)) {
> > if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
>
> tcp_recv_timestamp is called from tcp_recvmsg only conditionally:
>
> if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
> if (cmsg_flags & TCP_CMSG_TS)
> tcp_recv_timestamp(msg, sk, &tss);
>
> How do you get this triggered for your BPF program?
When users use BPF SO_TIMESTAMPING to print rx timestamp, it will use
bpf_setsockopt() to call sock_enable_timestamp() (see this patch), so
the skb will carry a timestamp. In tcp_recvmsg_locked(), cmsg_flags
will be initialized, so tcp_recv_timestamp() will get called.
>
> And also check the other caller, tcp_zc_finalize_rx_tstamp.
Got it, thanks for pointing it out.
^ permalink raw reply [flat|nested] 73+ messages in thread
* [PATCH net-next v2 12/12] net-timestamp: add bpf support for rx software/hardware timestamp
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (10 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 11/12] net-timestamp: add bpf framework for rx timestamps Jason Xing
@ 2024-10-12 4:06 ` Jason Xing
2024-10-12 17:48 ` [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Willem de Bruijn
12 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-12 4:06 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Now it's time to let the bpf for rx timestamp take effect.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/tcp.h | 14 ++++++++++++++
include/uapi/linux/bpf.h | 5 +++++
net/ipv4/tcp.c | 29 +++++++++++++++++++++++++++--
tools/include/uapi/linux/bpf.h | 5 +++++
4 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 728db7107074..5a7893379ef7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2676,6 +2676,14 @@ static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
return tcp_call_bpf(sk, op, 3, args);
}
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+ u32 arg3, u32 arg4)
+{
+ u32 args[4] = {arg1, arg2, arg3, arg4};
+
+ return tcp_call_bpf(sk, op, 4, args);
+}
+
#else
static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
@@ -2693,6 +2701,12 @@ static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
return -EPERM;
}
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+ u32 arg3, u32 arg4)
+{
+ return -EPERM;
+}
+
#endif
static inline u32 tcp_timeout_init(struct sock *sk)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d2754f155cf7..3527c20c8396 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7037,6 +7037,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_RX_OPT_CB, /* Called when tcp layer tries to
+ * receive skbs with timestamps when
+ * SO_TIMESTAMPING feature is on
+ * It indicates the recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0891b41bc745..14bc7283f574 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2262,10 +2262,35 @@ static int tcp_zerocopy_receive(struct sock *sk,
static void tcp_bpf_recv_timestamp(struct sock *sk, struct scm_timestamping_internal *tss)
{
+ u32 tsflags = READ_ONCE(sk->sk_tsflags[BPFPROG_TS_REQUESTOR]);
struct tcp_sock *tp = tcp_sk(sk);
- if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG))
- return;
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG)) {
+ u32 hw_sec, hw_nsec, sw_sec, sw_nsec;
+
+ if (!(tsflags & (SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_HARDWARE)))
+ return;
+
+ if (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) {
+ sw_sec = tss->ts[0].tv_sec;
+ sw_nsec = tss->ts[0].tv_nsec;
+ } else {
+ sw_sec = 0;
+ sw_nsec = 0;
+ }
+
+ if (tsflags & SOF_TIMESTAMPING_RX_HARDWARE) {
+ hw_sec = tss->ts[2].tv_sec;
+ hw_nsec = tss->ts[2].tv_nsec;
+ } else {
+ hw_sec = 0;
+ hw_nsec = 0;
+ }
+
+ tcp_call_bpf_4arg(sk, BPF_SOCK_OPS_TS_RX_OPT_CB,
+ sw_sec, sw_nsec, hw_sec, hw_nsec);
+ }
}
/* Similar to __sock_recv_timestamp, but does not require an skb */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 331e3e6f1ed5..fad942abc36a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7036,6 +7036,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_RX_OPT_CB, /* Called when tcp layer tries to
+ * receive skbs with timestamps when
+ * SO_TIMESTAMPING feature is on
+ * It indicates the recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
--
2.37.3
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently
2024-10-12 4:06 [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (11 preceding siblings ...)
2024-10-12 4:06 ` [PATCH net-next v2 12/12] net-timestamp: add bpf support for rx software/hardware timestamp Jason Xing
@ 2024-10-12 17:48 ` Willem de Bruijn
2024-10-13 3:28 ` Jason Xing
12 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-12 17:48 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> A few weeks ago, I planned to extend SO_TIMESTMAMPING feature by using
> tracepoint to print information (say, tstamp) so that we can
> transparently equip applications with this feature and require no
> modification in user side.
>
> Later, we discussed at netconf and agreed that we can use bpf for better
> extension, which is mainly suggested by John Fastabend and Willem de
> Bruijn. Many thanks here! So I post this series to see if we have a
> better solution to extend. My feeling is BPF is a good place to provide
> a way to add timestamping by administrators, without having to rebuild
> applications.
>
> This approach mostly relies on existing SO_TIMESTAMPING feature, users
> only needs to pass certain flags through bpf_setsocktop() to a separate
> tsflags. For TX timestamps, they will be printed during generation
> phase. For RX timestamps, we will wait for the moment when recvmsg() is
> called.
>
> After this series, we could step by step implement more advanced
> functions/flags already in SO_TIMESTAMPING feature for bpf extension.
>
> In this series, I only support TCP protocol which is widely used in
> SO_TIMESTAMPING feature.
>
> ---
> V2
> Link: https://lore.kernel.org/all/20241008095109.99918-1-kerneljasonxing@gmail.com/
> 1. Introduce tsflag requestors so that we are able to extend more in the
> future. Besides, it enables TX flags for bpf extension feature separately
> without breaking users. It is suggested by Vadim Fedorenko.
> 2. introduce a static key to control the whole feature. (Willem)
> 3. Open the gate of bpf_setsockopt for the SO_TIMESTAMPING feature in
> some TX/RX cases, not all the cases.
>
> Note:
> The main concern we've discussion in V1 thread is how to deal with the
> applications using SO_TIMESTAMPING feature? In this series, I allow both
> cases to happen at the same time, which indicates that even one
> applications setting SO_TIMESTAMPING can still be traced through BPF
> program. Please see patch [04/12].
This revision does not address the main concern.
An administrator installed BPF program can affect results of a process
using SO_TIMESTAMPING in ways that break it.
My halfway suggestion was to only enable this if the process has not
enabled timestamping on a socket, and to hard fail the application if
it does enable it while BPF timestamping is active. You pushed back,
entirely reasonably. But if anything we need a stronger method of
isolation, not just ignore the issue.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently
2024-10-12 17:48 ` [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently Willem de Bruijn
@ 2024-10-13 3:28 ` Jason Xing
2024-10-13 3:43 ` Jason Xing
2024-10-15 1:28 ` Willem de Bruijn
0 siblings, 2 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-13 3:28 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Sun, Oct 13, 2024 at 1:48 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > A few weeks ago, I planned to extend SO_TIMESTMAMPING feature by using
> > tracepoint to print information (say, tstamp) so that we can
> > transparently equip applications with this feature and require no
> > modification in user side.
> >
> > Later, we discussed at netconf and agreed that we can use bpf for better
> > extension, which is mainly suggested by John Fastabend and Willem de
> > Bruijn. Many thanks here! So I post this series to see if we have a
> > better solution to extend. My feeling is BPF is a good place to provide
> > a way to add timestamping by administrators, without having to rebuild
> > applications.
> >
> > This approach mostly relies on existing SO_TIMESTAMPING feature, users
> > only needs to pass certain flags through bpf_setsocktop() to a separate
> > tsflags. For TX timestamps, they will be printed during generation
> > phase. For RX timestamps, we will wait for the moment when recvmsg() is
> > called.
> >
> > After this series, we could step by step implement more advanced
> > functions/flags already in SO_TIMESTAMPING feature for bpf extension.
> >
> > In this series, I only support TCP protocol which is widely used in
> > SO_TIMESTAMPING feature.
> >
> > ---
> > V2
> > Link: https://lore.kernel.org/all/20241008095109.99918-1-kerneljasonxing@gmail.com/
> > 1. Introduce tsflag requestors so that we are able to extend more in the
> > future. Besides, it enables TX flags for bpf extension feature separately
> > without breaking users. It is suggested by Vadim Fedorenko.
> > 2. introduce a static key to control the whole feature. (Willem)
> > 3. Open the gate of bpf_setsockopt for the SO_TIMESTAMPING feature in
> > some TX/RX cases, not all the cases.
> >
> > Note:
> > The main concern we've discussion in V1 thread is how to deal with the
> > applications using SO_TIMESTAMPING feature? In this series, I allow both
> > cases to happen at the same time, which indicates that even one
> > applications setting SO_TIMESTAMPING can still be traced through BPF
> > program. Please see patch [04/12].
>
> This revision does not address the main concern.
>
> An administrator installed BPF program can affect results of a process
> using SO_TIMESTAMPING in ways that break it.
Sorry, I didn't get it. How would the following code snippet break users?
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
if (!sk)
return;
if (static_branch_unlikely(&bpf_tstamp_control))
bpf_skb_tstamp_tx_output(sk, orig_skb, tstype, hwtstamps);
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk,
tstype);
}
You can see, the application shipped with SO_TIMESTAMPING still prints
timestamps even when the application stays in the attached cgroup
directory.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently
2024-10-13 3:28 ` Jason Xing
@ 2024-10-13 3:43 ` Jason Xing
2024-10-13 6:05 ` Jason Xing
2024-10-15 1:28 ` Willem de Bruijn
1 sibling, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-13 3:43 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Sun, Oct 13, 2024 at 11:28 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Sun, Oct 13, 2024 at 1:48 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Xing wrote:
> > > From: Jason Xing <kernelxing@tencent.com>
> > >
> > > A few weeks ago, I planned to extend SO_TIMESTMAMPING feature by using
> > > tracepoint to print information (say, tstamp) so that we can
> > > transparently equip applications with this feature and require no
> > > modification in user side.
> > >
> > > Later, we discussed at netconf and agreed that we can use bpf for better
> > > extension, which is mainly suggested by John Fastabend and Willem de
> > > Bruijn. Many thanks here! So I post this series to see if we have a
> > > better solution to extend. My feeling is BPF is a good place to provide
> > > a way to add timestamping by administrators, without having to rebuild
> > > applications.
> > >
> > > This approach mostly relies on existing SO_TIMESTAMPING feature, users
> > > only needs to pass certain flags through bpf_setsocktop() to a separate
> > > tsflags. For TX timestamps, they will be printed during generation
> > > phase. For RX timestamps, we will wait for the moment when recvmsg() is
> > > called.
> > >
> > > After this series, we could step by step implement more advanced
> > > functions/flags already in SO_TIMESTAMPING feature for bpf extension.
> > >
> > > In this series, I only support TCP protocol which is widely used in
> > > SO_TIMESTAMPING feature.
> > >
> > > ---
> > > V2
> > > Link: https://lore.kernel.org/all/20241008095109.99918-1-kerneljasonxing@gmail.com/
> > > 1. Introduce tsflag requestors so that we are able to extend more in the
> > > future. Besides, it enables TX flags for bpf extension feature separately
> > > without breaking users. It is suggested by Vadim Fedorenko.
> > > 2. introduce a static key to control the whole feature. (Willem)
> > > 3. Open the gate of bpf_setsockopt for the SO_TIMESTAMPING feature in
> > > some TX/RX cases, not all the cases.
> > >
> > > Note:
> > > The main concern we've discussion in V1 thread is how to deal with the
> > > applications using SO_TIMESTAMPING feature? In this series, I allow both
> > > cases to happen at the same time, which indicates that even one
> > > applications setting SO_TIMESTAMPING can still be traced through BPF
> > > program. Please see patch [04/12].
> >
> > This revision does not address the main concern.
> >
> > An administrator installed BPF program can affect results of a process
> > using SO_TIMESTAMPING in ways that break it.
>
> Sorry, I didn't get it. How the following code snippet would break users?
>
> void __skb_tstamp_tx(struct sk_buff *orig_skb,
> const struct sk_buff *ack_skb,
> struct skb_shared_hwtstamps *hwtstamps,
> struct sock *sk, int tstype)
> {
> if (!sk)
> return;
>
> if (static_branch_unlikely(&bpf_tstamp_control))
> bpf_skb_tstamp_tx_output(sk, orig_skb, tstype, hwtstamps);
>
> skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk,
> tstype);
> }
>
> You can see, the application shipped with SO_TIMESTAMPING still prints
> timestamps even when the application stays in the attached cgroup
> directory.
I tested this by running "./txtimestamp -4 -L 127.0.0.1 -l 1000 -c 5"
in the bpf attached directory and it can correctly print the
timestamp. So it would not break users.
And surprisingly, I found that the key is not quite right (ERROR: key 1000,
expected 999). I will investigate and fix it.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently
2024-10-13 3:43 ` Jason Xing
@ 2024-10-13 6:05 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-13 6:05 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
> I tested this by running "./txtimestamp -4 -L 127.0.0.1 -l 1000 -c 5"
> in the bpf attached directory and it can correctly print the
> timestamp. So it would not break users.
>
> And surprisingly I found the key is not that right (ERROR: key 1000,
> expected 999). I will investigate and fix it.
Ah, I think I know the reason. In this series, the BPF extension
allows setting before sending SYN packet in the beginning of
tcp_connect(), which is different from the original design that allows
setting after sending the SYN packet. It causes the unexpected key.
They are different. The reason why the failure is triggered is because
I reuse the tskey logic in the BPF extension...
====
Back to the question on how to solve the conflicts, if we finally
reckon that the original feature has the first priority, I can change
the order in the next version.
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
if (!sk)
return;
ret = skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
if (ret)
/* Apps does set the SO_TIMESTAMPING flag, return directly */
return;
if (static_branch_unlikely(&bpf_tstamp_control))
bpf_skb_tstamp_tx_output(sk, orig_skb, tstype, hwtstamps);
}
In this way, it will allow either of two features to work. Willem, do
you think it is fine with you?
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently
2024-10-13 3:28 ` Jason Xing
2024-10-13 3:43 ` Jason Xing
@ 2024-10-15 1:28 ` Willem de Bruijn
2024-10-15 2:52 ` Jason Xing
1 sibling, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 1:28 UTC (permalink / raw)
To: Jason Xing, Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
Jason Xing wrote:
> On Sun, Oct 13, 2024 at 1:48 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Xing wrote:
> > > From: Jason Xing <kernelxing@tencent.com>
> > >
> > > A few weeks ago, I planned to extend SO_TIMESTMAMPING feature by using
> > > tracepoint to print information (say, tstamp) so that we can
> > > transparently equip applications with this feature and require no
> > > modification in user side.
> > >
> > > Later, we discussed at netconf and agreed that we can use bpf for better
> > > extension, which is mainly suggested by John Fastabend and Willem de
> > > Bruijn. Many thanks here! So I post this series to see if we have a
> > > better solution to extend. My feeling is BPF is a good place to provide
> > > a way to add timestamping by administrators, without having to rebuild
> > > applications.
> > >
> > > This approach mostly relies on existing SO_TIMESTAMPING feature, users
> > > only needs to pass certain flags through bpf_setsocktop() to a separate
> > > tsflags. For TX timestamps, they will be printed during generation
> > > phase. For RX timestamps, we will wait for the moment when recvmsg() is
> > > called.
> > >
> > > After this series, we could step by step implement more advanced
> > > functions/flags already in SO_TIMESTAMPING feature for bpf extension.
> > >
> > > In this series, I only support TCP protocol which is widely used in
> > > SO_TIMESTAMPING feature.
> > >
> > > ---
> > > V2
> > > Link: https://lore.kernel.org/all/20241008095109.99918-1-kerneljasonxing@gmail.com/
> > > 1. Introduce tsflag requestors so that we are able to extend more in the
> > > future. Besides, it enables TX flags for bpf extension feature separately
> > > without breaking users. It is suggested by Vadim Fedorenko.
> > > 2. introduce a static key to control the whole feature. (Willem)
> > > 3. Open the gate of bpf_setsockopt for the SO_TIMESTAMPING feature in
> > > some TX/RX cases, not all the cases.
> > >
> > > Note:
> > > The main concern we've discussion in V1 thread is how to deal with the
> > > applications using SO_TIMESTAMPING feature? In this series, I allow both
> > > cases to happen at the same time, which indicates that even one
> > > applications setting SO_TIMESTAMPING can still be traced through BPF
> > > program. Please see patch [04/12].
> >
> > This revision does not address the main concern.
> >
> > An administrator installed BPF program can affect results of a process
> > using SO_TIMESTAMPING in ways that break it.
>
> Sorry, I didn't get it. How the following code snippet would break users?
The state between user and bpf timestamping needs to be separate to
avoid interference.
Introducing a new sk_tsflags for bpf goes a long way. Though I prefer
a separate sk_tsflags_bpf and not touching existing sk_tsflags over
the array approach of patch 1. Also need to check pahole and maybe
move sk_tsflags_bpf elsewhere in the struct.
Other state is sk_tskey. The current approach can initialize the key
in bpf before the user attempts it for the same socket. Admittedly
unlikely. But hard-to-reach states create hard-to-debug issues.
This field cannot easily be duplicated, because the key is tracked
in skb_shinfo. Where there is not sufficient room for two keys.
The same goes for txflags.
The current approach is to set those flags if either user or bpf
requests them, then on __skb_tstamp_tx detect if the user did not set
them, and if so skip output to the user. Need to take a closer look,
but seems to work.
So getting closer.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently
2024-10-15 1:28 ` Willem de Bruijn
@ 2024-10-15 2:52 ` Jason Xing
2024-10-15 2:59 ` Willem de Bruijn
0 siblings, 1 reply; 73+ messages in thread
From: Jason Xing @ 2024-10-15 2:52 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 9:28 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Sun, Oct 13, 2024 at 1:48 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Jason Xing wrote:
> > > > From: Jason Xing <kernelxing@tencent.com>
> > > >
> > > > A few weeks ago, I planned to extend SO_TIMESTMAMPING feature by using
> > > > tracepoint to print information (say, tstamp) so that we can
> > > > transparently equip applications with this feature and require no
> > > > modification in user side.
> > > >
> > > > Later, we discussed at netconf and agreed that we can use bpf for better
> > > > extension, which is mainly suggested by John Fastabend and Willem de
> > > > Bruijn. Many thanks here! So I post this series to see if we have a
> > > > better solution to extend. My feeling is BPF is a good place to provide
> > > > a way to add timestamping by administrators, without having to rebuild
> > > > applications.
> > > >
> > > > This approach mostly relies on existing SO_TIMESTAMPING feature, users
> > > > only needs to pass certain flags through bpf_setsocktop() to a separate
> > > > tsflags. For TX timestamps, they will be printed during generation
> > > > phase. For RX timestamps, we will wait for the moment when recvmsg() is
> > > > called.
> > > >
> > > > After this series, we could step by step implement more advanced
> > > > functions/flags already in SO_TIMESTAMPING feature for bpf extension.
> > > >
> > > > In this series, I only support TCP protocol which is widely used in
> > > > SO_TIMESTAMPING feature.
> > > >
> > > > ---
> > > > V2
> > > > Link: https://lore.kernel.org/all/20241008095109.99918-1-kerneljasonxing@gmail.com/
> > > > 1. Introduce tsflag requestors so that we are able to extend more in the
> > > > future. Besides, it enables TX flags for bpf extension feature separately
> > > > without breaking users. It is suggested by Vadim Fedorenko.
> > > > 2. introduce a static key to control the whole feature. (Willem)
> > > > 3. Open the gate of bpf_setsockopt for the SO_TIMESTAMPING feature in
> > > > some TX/RX cases, not all the cases.
> > > >
> > > > Note:
> > > > The main concern we've discussion in V1 thread is how to deal with the
> > > > applications using SO_TIMESTAMPING feature? In this series, I allow both
> > > > cases to happen at the same time, which indicates that even one
> > > > applications setting SO_TIMESTAMPING can still be traced through BPF
> > > > program. Please see patch [04/12].
> > >
> > > This revision does not address the main concern.
> > >
> > > An administrator installed BPF program can affect results of a process
> > > using SO_TIMESTAMPING in ways that break it.
> >
> > Sorry, I didn't get it. How the following code snippet would break users?
>
> The state between user and bpf timestamping needs to be separate to
> avoid interference.
Do you agree that we will use this method as following, only allow
either of them to work?
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
if (!sk)
return;
ret = skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
if (ret)
/* Apps does set the SO_TIMESTAMPING flag, return directly */
return;
if (static_branch_unlikely(&bpf_tstamp_control))
bpf_skb_tstamp_tx_output(sk, orig_skb, tstype, hwtstamps);
}
which means that if the apps use the non-bpf method, we will not see the
output even if we load the bpf program.
>
> Introducing a new sk_tsflags for bpf goes a long way. Though I prefer
> a separate sk_tsflags_bpf and not touching existing sk_tsflags over
> the array approach of patch 1. Also need to check pahole and maybe
> move sk_tsflags_bpf elsewhere in the struct.
Yes, I will use this instead.
>
> Other state is sk_tskey. The current approach can initialize the key
> in bpf before the user attempts it for the same socket. Admittedly
> unlikely. But hard to reach states creates hard to debug issues.
>
> This field cannot easily be duplicated, because the key is tracked
> in skb_shinfo. Where there is not sufficient room for two keys.
>
> The same goes for txflags.
They are not that easy to handle in a proper way. That's the reason
why I chose to use the same logic, so that there is no side effect.
If we expect to separate them as well, it seems a little bit weird to
introduce another similar flags in struct sk_buff.
>
> The current approach is to set those flags if either user or bpf
> requestss them, then on __skb_tstamp_tx detect if the user did not set
> them, and if so skip output to the user. Need to take a closer look,
> but seems to work.
Let me keep this current approach, it will not affect each other.
>
> So getting closer.
Thanks for the careful review.
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently
2024-10-15 2:52 ` Jason Xing
@ 2024-10-15 2:59 ` Willem de Bruijn
2024-10-15 3:02 ` Jason Xing
0 siblings, 1 reply; 73+ messages in thread
From: Willem de Bruijn @ 2024-10-15 2:59 UTC (permalink / raw)
To: Jason Xing, Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
Jason Xing wrote:
> On Tue, Oct 15, 2024 at 9:28 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Xing wrote:
> > > On Sun, Oct 13, 2024 at 1:48 AM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > Jason Xing wrote:
> > > > > From: Jason Xing <kernelxing@tencent.com>
> > > > >
> > > > > > A few weeks ago, I planned to extend SO_TIMESTAMPING feature by using
> > > > > tracepoint to print information (say, tstamp) so that we can
> > > > > transparently equip applications with this feature and require no
> > > > > modification in user side.
> > > > >
> > > > > Later, we discussed at netconf and agreed that we can use bpf for better
> > > > > extension, which is mainly suggested by John Fastabend and Willem de
> > > > > Bruijn. Many thanks here! So I post this series to see if we have a
> > > > > better solution to extend. My feeling is BPF is a good place to provide
> > > > > a way to add timestamping by administrators, without having to rebuild
> > > > > applications.
> > > > >
> > > > > > This approach mostly relies on the existing SO_TIMESTAMPING feature; users
> > > > > > only need to pass certain flags through bpf_setsockopt() to a separate
> > > > > tsflags. For TX timestamps, they will be printed during generation
> > > > > phase. For RX timestamps, we will wait for the moment when recvmsg() is
> > > > > called.
> > > > >
> > > > > After this series, we could step by step implement more advanced
> > > > > functions/flags already in SO_TIMESTAMPING feature for bpf extension.
> > > > >
> > > > > In this series, I only support TCP protocol which is widely used in
> > > > > SO_TIMESTAMPING feature.
> > > > >
> > > > > ---
> > > > > V2
> > > > > Link: https://lore.kernel.org/all/20241008095109.99918-1-kerneljasonxing@gmail.com/
> > > > > 1. Introduce tsflag requestors so that we are able to extend more in the
> > > > > future. Besides, it enables TX flags for bpf extension feature separately
> > > > > without breaking users. It is suggested by Vadim Fedorenko.
> > > > > 2. introduce a static key to control the whole feature. (Willem)
> > > > > 3. Open the gate of bpf_setsockopt for the SO_TIMESTAMPING feature in
> > > > > some TX/RX cases, not all the cases.
> > > > >
> > > > > Note:
> > > > > > The main concern we discussed in the V1 thread is how to deal with
> > > > > > applications using the SO_TIMESTAMPING feature. In this series, I allow both
> > > > > > cases to happen at the same time, which indicates that even an
> > > > > > application setting SO_TIMESTAMPING can still be traced through a BPF
> > > > > > program. Please see patch [04/12].
> > > >
> > > > This revision does not address the main concern.
> > > >
> > > > An administrator installed BPF program can affect results of a process
> > > > using SO_TIMESTAMPING in ways that break it.
> > >
> > > Sorry, I didn't get it. How would the following code snippet break users?
> >
> > The state between user and bpf timestamping needs to be separate to
> > avoid interference.
>
> Do you agree that we will use this method as following, only allow
> either of them to work?
>
> void __skb_tstamp_tx(struct sk_buff *orig_skb,
> const struct sk_buff *ack_skb,
> struct skb_shared_hwtstamps *hwtstamps,
> struct sock *sk, int tstype)
> {
> if (!sk)
> return;
>
> ret = skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
> if (ret)
> /* Apps does set the SO_TIMESTAMPING flag, return directly */
> return;
>
> if (static_branch_unlikely(&bpf_tstamp_control))
> bpf_skb_tstamp_tx_output(sk, orig_skb, tstype, hwtstamps);
> }
>
> which means that if the app uses the non-bpf method, we will not see the
> output even if we load a bpf program.
Could the bpf setsockopt fail hard in that case?
Your current patch tries to make them work at the same time. It mostly
does work. There are just a few concerning edge cases that may result
in hard to understand bugs.
Making only one method work per socket and fail hard if both try it is
crude, but at least the failure will be clear: the setsockopt fails.
I think that's safer. And in practice, the use cases for BPF
timestamping probably are exactly when application timestamping is
missing?
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH net-next v2 00/12] net-timestamp: bpf extension to equip applications transparently
2024-10-15 2:59 ` Willem de Bruijn
@ 2024-10-15 3:02 ` Jason Xing
0 siblings, 0 replies; 73+ messages in thread
From: Jason Xing @ 2024-10-15 3:02 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Tue, Oct 15, 2024 at 10:59 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Tue, Oct 15, 2024 at 9:28 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Jason Xing wrote:
> > > > On Sun, Oct 13, 2024 at 1:48 AM Willem de Bruijn
> > > > <willemdebruijn.kernel@gmail.com> wrote:
> > > > >
> > > > > Jason Xing wrote:
> > > > > > From: Jason Xing <kernelxing@tencent.com>
> > > > > >
> > > > > > > A few weeks ago, I planned to extend SO_TIMESTAMPING feature by using
> > > > > > tracepoint to print information (say, tstamp) so that we can
> > > > > > transparently equip applications with this feature and require no
> > > > > > modification in user side.
> > > > > >
> > > > > > Later, we discussed at netconf and agreed that we can use bpf for better
> > > > > > extension, which is mainly suggested by John Fastabend and Willem de
> > > > > > Bruijn. Many thanks here! So I post this series to see if we have a
> > > > > > better solution to extend. My feeling is BPF is a good place to provide
> > > > > > a way to add timestamping by administrators, without having to rebuild
> > > > > > applications.
> > > > > >
> > > > > > > This approach mostly relies on the existing SO_TIMESTAMPING feature; users
> > > > > > > only need to pass certain flags through bpf_setsockopt() to a separate
> > > > > > tsflags. For TX timestamps, they will be printed during generation
> > > > > > phase. For RX timestamps, we will wait for the moment when recvmsg() is
> > > > > > called.
> > > > > >
> > > > > > After this series, we could step by step implement more advanced
> > > > > > functions/flags already in SO_TIMESTAMPING feature for bpf extension.
> > > > > >
> > > > > > In this series, I only support TCP protocol which is widely used in
> > > > > > SO_TIMESTAMPING feature.
> > > > > >
> > > > > > ---
> > > > > > V2
> > > > > > Link: https://lore.kernel.org/all/20241008095109.99918-1-kerneljasonxing@gmail.com/
> > > > > > 1. Introduce tsflag requestors so that we are able to extend more in the
> > > > > > future. Besides, it enables TX flags for bpf extension feature separately
> > > > > > without breaking users. It is suggested by Vadim Fedorenko.
> > > > > > 2. introduce a static key to control the whole feature. (Willem)
> > > > > > 3. Open the gate of bpf_setsockopt for the SO_TIMESTAMPING feature in
> > > > > > some TX/RX cases, not all the cases.
> > > > > >
> > > > > > Note:
> > > > > > > The main concern we discussed in the V1 thread is how to deal with
> > > > > > > applications using the SO_TIMESTAMPING feature. In this series, I allow both
> > > > > > > cases to happen at the same time, which indicates that even an
> > > > > > > application setting SO_TIMESTAMPING can still be traced through a BPF
> > > > > > > program. Please see patch [04/12].
> > > > >
> > > > > This revision does not address the main concern.
> > > > >
> > > > > An administrator installed BPF program can affect results of a process
> > > > > using SO_TIMESTAMPING in ways that break it.
> > > >
> > > > Sorry, I didn't get it. How would the following code snippet break users?
> > >
> > > The state between user and bpf timestamping needs to be separate to
> > > avoid interference.
> >
> > Do you agree that we will use this method as following, only allow
> > either of them to work?
> >
> > void __skb_tstamp_tx(struct sk_buff *orig_skb,
> > const struct sk_buff *ack_skb,
> > struct skb_shared_hwtstamps *hwtstamps,
> > struct sock *sk, int tstype)
> > {
> > if (!sk)
> > return;
> >
> > ret = skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
> > if (ret)
> > /* Apps does set the SO_TIMESTAMPING flag, return directly */
> > return;
> >
> > if (static_branch_unlikely(&bpf_tstamp_control))
> > bpf_skb_tstamp_tx_output(sk, orig_skb, tstype, hwtstamps);
> > }
> >
> > which means that if the app uses the non-bpf method, we will not see the
> > output even if we load a bpf program.
>
> Could the bpf setsockopt fail hard in that case?
We can do this. I think I will add some checks to see whether
sk_tsflags has already been set.
>
> Your current patch tries to make them work at the same time. It mostly
> does work. There are just a few concerning edge cases that may result
> in hard to understand bugs.
Agree.
>
> Making only one method work per socket and fail hard if both try it is
> crude, but at least the failure will be clear: the setsockopt fails.
>
> I think that's safer. And in practice, the use cases for BPF
> timestamping probably are exactly when application timestamping is
> missing?
Fair enough. Let me try this way:)
Thanks,
Jason
^ permalink raw reply [flat|nested] 73+ messages in thread