All of lore.kernel.org
 help / color / mirror / Atom feed
From: Martin KaFai Lau <martin.lau@linux.dev>
To: Jason Xing <kerneljasonxing@gmail.com>
Cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
	willemb@google.com, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, dsahern@kernel.org,
	ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
	eddyz87@gmail.com, song@kernel.org, yonghong.song@linux.dev,
	john.fastabend@gmail.com, kpsingh@kernel.org, sdf@fomichev.me,
	haoluo@google.com, jolsa@kernel.org, shuah@kernel.org,
	ykolal@fb.com, bpf@vger.kernel.org, netdev@vger.kernel.org,
	Jason Xing <kernelxing@tencent.com>
Subject: Re: [PATCH net-next v3 02/14] net-timestamp: allow two features to work parallelly
Date: Mon, 4 Nov 2024 17:50:58 -0800	[thread overview]
Message-ID: <651bd5a4-adb8-4f18-8eb7-cc781495fcb3@linux.dev> (raw)
In-Reply-To: <CAL+tcoD6fqrDhYDCFkuSuy-HgORo-qxLLwm+=WQqdQA1=C_S3w@mail.gmail.com>

On 11/1/24 12:47 AM, Jason Xing wrote:

>> If it prefers to stay with bpf_setsockopt (which is fine), it needs a bpf
>> specific optname like the current TCP_BPF_SOCK_OPS_CB_FLAGS which currently sets
>> the tp->bpf_sock_ops_cb_flags. May be a new SK_BPF_CB_FLAGS optname for setting
>> the sk->sk_bpf_cb_flags, like bpf_setsockopt(skops_ctx, SOL_SOCKET,
> 
>> SK_BPF_CB_FLAGS, &val, sizeof(val)) and handle it in the sol_socket_sockopt()
>> alone without calling into sk_{set,get}sockopt. Add a new enum for the optval
>> for the sk_bpf_cb_flags:
>>
>> enum {
>>          SK_BPF_CB_TX_TIMESTAMPING = (1 << 0),
>>          SK_BPF_CB_RX_TIEMSTAMPING = (1 << 1),
>> };
> 
> Then it will involve more strange modification in sol_socket_sockopt()
> to retrieve the opt value like what I did in V2 (see
> https://lore.kernel.org/all/20241012040651.95616-3-kerneljasonxing@gmail.com/).
> It's the reason why I did set and get operation in
> sk_{set,get}sockopt() in this series to keep align with other flags.
> Handling it in sk_{set,get}sockopt() is not a bad idea and easy to
> implement, I feel.

This will look very different now. It is handling bpf specific
optname and accessing the bpf specific field in sk->sk_bpf_cb_flags.

I really don't see why it needs to spill over to sk_{set,get}sockopt()
to handle sk->sk_bpf_cb_flags.

I have quickly typed out a small part of discussion so far.
It is likely buggy and not compiler tested. Pieces are still missing.
The bpf_tstamp_ack will need a few more changes in the
tcp_{input,output}.c. May be merging with the tstamp_ack to become
2 bits will be cleaner, not sure.

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 39f1d16f3628..0b4913315854 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -488,6 +488,7 @@ enum {
  
  	/* generate software time stamp when entering packet scheduling */
  	SKBTX_SCHED_TSTAMP = 1 << 6,
+	SKBTX_BPF = 1 << 7,
  };
  
  #define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
diff --git a/include/net/sock.h b/include/net/sock.h
index f29c14448938..4ec27c524f49 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -234,6 +234,20 @@ struct sock_common {
  struct bpf_local_storage;
  struct sk_filter;
  
+enum {
+	SK_BPF_CB_TX_TIMESTAMPING = BIT(0),
+	SK_BPF_CB_RX_TIEMSTAMPING = BIT(1),
+	SK_BPF_CB_MASK		= BIT(2) - 1,
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb);
+#else
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG)
+static inline void bpf_skops_timestamping(struct sock *sk, struct sk_buff *skb) {}
+#endif
+
  /**
    *	struct sock - network layer representation of sockets
    *	@__sk_common: shared layout with inet_timewait_sock
@@ -444,7 +458,10 @@ struct sock {
  	socket_lock_t		sk_lock;
  	u32			sk_reserved_mem;
  	int			sk_forward_alloc;
-	u32			sk_tsflags;
+	u16			sk_tsflags;
+#ifdef CONFIG_BPF_SYSCALL
+	u16			sk_bpf_cb_flags;
+#endif
  	__cacheline_group_end(sock_write_rxtx);
  
  	__cacheline_group_begin(sock_write_tx);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d1948d357dad..224b697bae9d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -961,7 +961,8 @@ struct tcp_skb_cb {
  	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
  			eor:1,		/* Is skb MSG_EOR marked? */
  			has_rxtstamp:1,	/* SKB has a RX timestamp	*/
-			unused:5;
+			bpf_txstamp_ack:1,
+			unused:4;
  	__u32		ack_seq;	/* Sequence number ACK'd	*/
  	union {
  		struct {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f28b6527e815..2ff7ff0ebdab 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7014,6 +7014,7 @@ enum {
  					 * by the kernel or the
  					 * earlier bpf-progs.
  					 */
+	BPF_SOCK_OPS_TX_TIMESTAMPING_CB,
  };
  
  /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -7080,6 +7081,7 @@ enum {
  	TCP_BPF_SYN_IP		= 1006, /* Copy the IP[46] and TCP header */
  	TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
  	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+	SK_BPF_CB_FLAGS		= 1009,
  };
  
  enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index e31ee8be2de0..81a36e50047b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5206,6 +5206,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
  	.arg1_type      = ARG_PTR_TO_CTX,
  };
  
+static int sk_bpf_cb_flags(struct sock *sk, int sk_bpf_cb_flags, bool getopt)
+{
+	if (getopt)
+		return -EINVAL;
+
+	if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
+		return -EINVAL;
+
+	sk->sk_bpf_cb_flags = sk->sk_bpf_cb_flags;
+
+	return 0;
+}
+
  static int sol_socket_sockopt(struct sock *sk, int optname,
  			      char *optval, int *optlen,
  			      bool getopt)
@@ -5222,6 +5235,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
  	case SO_MAX_PACING_RATE:
  	case SO_BINDTOIFINDEX:
  	case SO_TXREHASH:
+	case SK_BPF_CB_FLAGS:
  		if (*optlen != sizeof(int))
  			return -EINVAL;
  		break;
@@ -5231,6 +5245,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
  		return -EINVAL;
  	}
  
+	if (optname == SK_BPF_CB_FLAGS)
+		return sk_bpf_cb_flags(sk, *(int *)optval, getopt);
+
  	if (getopt) {
  		if (optname == SO_BINDTODEVICE)
  			return -EINVAL;
diff --git a/net/core/sock.c b/net/core/sock.c
index 039be95c40cf..d0406639cee9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -137,6 +137,7 @@
  #include <linux/sock_diag.h>
  
  #include <linux/filter.h>
+#include <linux/bpf-cgroup.h>
  #include <net/sock_reuseport.h>
  #include <net/bpf_sk_storage.h>
  
@@ -946,6 +947,20 @@ int sock_set_timestamping(struct sock *sk, int optname,
  	return 0;
  }
  
+#ifdef CONFIG_BPF_SYSCALL
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb)
+{
+	struct bpf_sock_ops_kern sock_ops;
+
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+	sock_ops.op = BPF_SOCK_OPS_TX_TIMESTAMPING_CB;
+	sock_ops.is_fullsock = 1;
+	sock_ops.sk = sk;
+	sock_ops.skb = skb;
+	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
+}
+#endif
+
  void sock_set_keepalive(struct sock *sk)
  {
  	lock_sock(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4f77bd862e95..1e7f2d5fd792 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -491,6 +491,15 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
  		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
  			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
  	}
+
+	if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+	    SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING))
+		/* The bpf prog can do:
+		 * shinfo->tx_flags |= SKBTX_BPF,
+		 * tcb->bpf_txstamp_ack = 1,
+		 * shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1 (if tskey not set)
+		 */
+		bpf_skops_tx_timestamping(sk, skb);
  }


> 
> Overall the suggestion looks good to me. I can give it a try :)
> 
> I'm thinking of another approach to using bpf_sock_ops_cb_flags_set()
> instead of bpf_setsockopt() when sockops like
> BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB is triggered. I can modify the
> bpf_sock_ops_cb_flags_set like this:
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 58761263176c..001140067c1a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5770,14 +5770,25 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct
> bpf_sock_ops_kern *, bpf_sock,
>             int, argval)
>   {
>          struct sock *sk = bpf_sock->sk;
> -       int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
> +       int val = argval;
> 
> -       if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
> +       if (!IS_ENABLED(CONFIG_INET))
>                  return -EINVAL;
> 
> -       tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
> +       if (sk_is_tcp(sk)) {
> +               val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
> +               if (!sk_fullsock(sk))
> +                       return -EINVAL;
> +
> +               tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
> +
> +               val = argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
> +       } else {
> +               sk->bpf_sock_ops_cb_flags = val;

Why separate tcp vs non-tcp case? The tcp_sk(sk)->bpf_sock_ops_cb_flags
is running out of bits anyway for tcp specific callback.

just keep the SK_BPF_CB_{TX,RX}_TIEMSTAMPING in sk->sk_bpf_cb_flags
for all tcp/udp/raw/...

> +               val = argval &
> (~(SK_BPF_CB_TX_TIEMSTAMPING|SK_BPF_CB_RX_TIEMSTAMPING));

imo, we also don't need to return val to tell the caller what
is not supported in the running kernel. The BPF CO-RE can
handle this also, so less reason to keep extending the
bpf_sock_ops_cb_flags_set API for non tcp.

>>>> For datagrams (UDP as well as RAW and many non IP protocols), an
>>>> alternative still needs to be found.
>>
>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set. If it is
>> unlikely, may be we can just disallow bpf prog from directly setting
>> skb_shinfo(skb)->tskey for this particular skb.
>>
>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
>> pass the kernel decided tskey to the bpf prog.
> 
> I'm a bit confused here. IIUC, we need to support the tskey like what
> we did in this series to handle non TCP cases?

Like tcp, I don't think it really needs to use the sk->sk_tskey to mark the
ID of a skb for the non tcp cases also. will comment on another thread.


  reply	other threads:[~2024-11-05  1:51 UTC|newest]

Thread overview: 88+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-28 11:05 [PATCH net-next v3 00/14] net-timestamp: bpf extension to equip applications transparently Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 01/14] net-timestamp: reorganize in skb_tstamp_tx_output() Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 02/14] net-timestamp: allow two features to work parallelly Jason Xing
2024-10-29 23:00   ` Martin KaFai Lau
2024-10-30  1:23     ` Jason Xing
2024-10-30  1:45       ` Willem de Bruijn
2024-10-30  2:32         ` Jason Xing
2024-10-30  2:47           ` Willem de Bruijn
2024-10-30  3:04             ` Jason Xing
2024-10-30  5:37               ` Martin KaFai Lau
2024-10-30  6:42                 ` Jason Xing
2024-10-30 17:15                   ` Willem de Bruijn
2024-10-30 23:54                     ` Jason Xing
2024-10-31  0:13                       ` Jason Xing
2024-10-31  6:27                         ` Martin KaFai Lau
2024-10-31  7:04                           ` Jason Xing
2024-10-31 12:30                             ` Willem de Bruijn
2024-10-31 13:50                               ` Jason Xing
2024-10-31 23:26                                 ` Martin KaFai Lau
2024-11-01  7:47                                   ` Jason Xing
2024-11-05  1:50                                     ` Martin KaFai Lau [this message]
2024-11-05  3:13                                       ` Jason Xing
2024-11-01 13:32                                   ` Willem de Bruijn
2024-11-01 16:08                                     ` Jason Xing
2024-11-01 16:39                                       ` Willem de Bruijn
2024-11-05  2:09                                     ` Martin KaFai Lau
2024-11-05  6:22                                       ` Jason Xing
2024-11-05 19:22                                         ` Martin KaFai Lau
2024-11-06  0:17                                           ` Jason Xing
2024-11-06  1:09                                             ` Martin KaFai Lau
2024-11-06  2:51                                               ` Jason Xing
2024-11-07  1:19                                                 ` Martin KaFai Lau
2024-11-07  3:31                                                   ` Jason Xing
2024-11-07 19:05                                                     ` Martin KaFai Lau
2024-11-06  1:11                                             ` Willem de Bruijn
2024-11-06  2:37                                               ` Jason Xing
2024-11-05 14:29                                       ` Willem de Bruijn
2024-11-02 13:43   ` Simon Horman
2024-11-03  0:42     ` Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 03/14] net-timestamp: open gate for bpf_setsockopt/_getsockopt Jason Xing
2024-10-29  0:59   ` Willem de Bruijn
2024-10-29  1:18     ` Jason Xing
2024-10-30  0:32   ` Martin KaFai Lau
2024-10-30  1:15     ` Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 04/14] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp Jason Xing
2024-10-29  0:23   ` kernel test robot
2024-10-29  1:02   ` Willem de Bruijn
2024-10-29  1:30     ` Jason Xing
2024-10-29  1:04   ` kernel test robot
2024-10-28 11:05 ` [PATCH net-next v3 05/14] net-timestamp: introduce TS_SW_OPT_CB to generate driver timestamp Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 06/14] net-timestamp: introduce TS_ACK_OPT_CB to generate tcp acked timestamp Jason Xing
2024-10-29  1:03   ` Willem de Bruijn
2024-10-29  1:19     ` Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 07/14] net-timestamp: add a new triggered point to set sk_tsflags_bpf in UDP layer Jason Xing
2024-10-29  1:07   ` Willem de Bruijn
2024-10-29  1:23     ` Jason Xing
2024-10-29  1:33       ` Willem de Bruijn
2024-10-29  3:12         ` Jason Xing
2024-10-29 15:04           ` Willem de Bruijn
2024-10-29 15:44             ` Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 08/14] net-timestamp: make bpf for tx timestamp work Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 09/14] net-timestamp: add a common helper to set tskey Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 10/14] net-timestamp: add basic support with tskey offset Jason Xing
2024-10-29  1:24   ` Willem de Bruijn
2024-10-29  2:41     ` Jason Xing
2024-10-29 15:03       ` Willem de Bruijn
2024-10-29 15:50         ` Jason Xing
2024-10-29 19:45           ` Willem de Bruijn
2024-10-30  3:27             ` Jason Xing
2024-10-30  5:42   ` Martin KaFai Lau
2024-10-30  6:50     ` Jason Xing
2024-10-31  1:17       ` Martin KaFai Lau
2024-10-31  2:41         ` Jason Xing
2024-10-31  3:27           ` Jason Xing
2024-10-31  5:52           ` Martin KaFai Lau
2024-10-31  6:16             ` Jason Xing
2024-10-31 23:50           ` Martin KaFai Lau
2024-11-01  6:33             ` Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 11/14] net-timestamp: support OPT_ID for TCP proto Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 12/14] net-timestamp: add OPT_ID for UDP proto Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 13/14] net-timestamp: use static key to control bpf extension Jason Xing
2024-10-28 11:05 ` [PATCH net-next v3 14/14] bpf: add simple bpf tests in the tx path for so_timstamping feature Jason Xing
2024-10-29  1:26   ` Willem de Bruijn
2024-10-29  1:33     ` Jason Xing
2024-10-29  1:40       ` Willem de Bruijn
2024-10-29  3:13         ` Jason Xing
2024-10-30  5:57   ` Martin KaFai Lau
2024-10-30  6:54     ` Jason Xing

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=651bd5a4-adb8-4f18-8eb7-cc781495fcb3@linux.dev \
    --to=martin.lau@linux.dev \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=dsahern@kernel.org \
    --cc=eddyz87@gmail.com \
    --cc=edumazet@google.com \
    --cc=haoluo@google.com \
    --cc=john.fastabend@gmail.com \
    --cc=jolsa@kernel.org \
    --cc=kerneljasonxing@gmail.com \
    --cc=kernelxing@tencent.com \
    --cc=kpsingh@kernel.org \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=sdf@fomichev.me \
    --cc=shuah@kernel.org \
    --cc=song@kernel.org \
    --cc=willemb@google.com \
    --cc=willemdebruijn.kernel@gmail.com \
    --cc=ykolal@fb.com \
    --cc=yonghong.song@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.