* [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt()
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
@ 2024-12-07 17:37 ` Jason Xing
2024-12-09 4:28 ` kernel test robot
` (2 more replies)
2024-12-07 17:37 ` [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use Jason Xing
` (9 subsequent siblings)
10 siblings, 3 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:37 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Users can write the following code to enable the bpf extension:
bpf_setsockopt(skops, SOL_SOCKET, SK_BPF_CB_FLAGS, &flags, sizeof(flags));
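For a fuller picture, a minimal sockops program using it could look like the
sketch below (illustrative only, not part of the patch; the SK_BPF_CB_* values
are the ones this series adds to the uapi header and are defined locally here
in case the headers being built against do not carry them yet):

// SPDX-License-Identifier: GPL-2.0
/* Sketch: enable tx timestamping reports for established TCP connections. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1
#endif
#ifndef SK_BPF_CB_FLAGS
#define SK_BPF_CB_FLAGS 1009		/* optname added by this patch */
#endif
#ifndef SK_BPF_CB_TX_TIMESTAMPING
#define SK_BPF_CB_TX_TIMESTAMPING (1 << 0)
#endif

SEC("sockops")
int enable_tx_tstamp(struct bpf_sock_ops *skops)
{
	int flags = SK_BPF_CB_TX_TIMESTAMPING;

	/* Turn the extension on once the connection is established. */
	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
	    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_SOCKET, SK_BPF_CB_FLAGS,
			       &flags, sizeof(flags));

	return 1;
}

char _license[] SEC("license") = "GPL";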
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/sock.h | 7 +++++++
include/uapi/linux/bpf.h | 8 ++++++++
net/core/filter.c | 22 ++++++++++++++++++++++
tools/include/uapi/linux/bpf.h | 1 +
4 files changed, 38 insertions(+)
diff --git a/include/net/sock.h b/include/net/sock.h
index 7464e9f9f47c..0dd464ba9e46 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -303,6 +303,7 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
+ * @sk_bpf_cb_flags: used for bpf_setsockopt
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
@@ -445,6 +446,12 @@ struct sock {
u32 sk_reserved_mem;
int sk_forward_alloc;
u32 sk_tsflags;
+#ifdef CONFIG_BPF_SYSCALL
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+ u32 sk_bpf_cb_flags;
+#else
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) 0
+#endif
__cacheline_group_end(sock_write_rxtx);
__cacheline_group_begin(sock_write_tx);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4162afc6b5d0..e629e09b0b31 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6903,6 +6903,13 @@ enum {
BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
};
+/* Definitions for bpf_sk_cb_flags */
+enum {
+ SK_BPF_CB_TX_TIMESTAMPING = 1<<0,
+ SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
+ SK_BPF_CB_TX_TIMESTAMPING
+};
+
/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
@@ -7081,6 +7088,7 @@ enum {
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+ SK_BPF_CB_FLAGS = 1009, /* Used to set socket bpf flags */
};
enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index 6625b3f563a4..f7e9f88e09b1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5214,6 +5214,24 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static int sk_bpf_set_cb_flags(struct sock *sk, sockptr_t optval, bool getopt)
+{
+ int sk_bpf_cb_flags;
+
+ if (getopt)
+ return -EINVAL;
+
+ if (copy_from_sockptr(&sk_bpf_cb_flags, optval, sizeof(sk_bpf_cb_flags)))
+ return -EFAULT;
+
+ if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
+ return -EINVAL;
+
+ sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
+
+ return 0;
+}
+
static int sol_socket_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
@@ -5230,6 +5248,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
case SO_MAX_PACING_RATE:
case SO_BINDTOIFINDEX:
case SO_TXREHASH:
+ case SK_BPF_CB_FLAGS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
@@ -5239,6 +5258,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
return -EINVAL;
}
+ if (optname == SK_BPF_CB_FLAGS)
+ return sk_bpf_set_cb_flags(sk, KERNEL_SOCKPTR(optval), getopt);
+
if (getopt) {
if (optname == SO_BINDTODEVICE)
return -EINVAL;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4162afc6b5d0..6b0a5b787b12 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7081,6 +7081,7 @@ enum {
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+ SK_BPF_CB_FLAGS = 1009, /* Used to set socket bpf flags */
};
enum {
--
2.37.3
* Re: [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt()
2024-12-07 17:37 ` [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt() Jason Xing
@ 2024-12-09 4:28 ` kernel test robot
2024-12-09 4:31 ` kernel test robot
2024-12-12 19:34 ` Martin KaFai Lau
2 siblings, 0 replies; 48+ messages in thread
From: kernel test robot @ 2024-12-09 4:28 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: oe-kbuild-all, bpf, netdev, Jason Xing
Hi Jason,
kernel test robot noticed the following build errors:
[auto build test ERROR on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Jason-Xing/net-timestamp-add-support-for-bpf_setsockopt/20241208-014111
base: net-next/main
patch link: https://lore.kernel.org/r/20241207173803.90744-2-kerneljasonxing%40gmail.com
patch subject: [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt()
config: i386-buildonly-randconfig-001-20241208 (https://download.01.org/0day-ci/archive/20241208/202412080315.koZqiF0Y-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241208/202412080315.koZqiF0Y-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412080315.koZqiF0Y-lkp@intel.com/
All errors (new ones prefixed by >>):
net/core/filter.c: In function 'sk_bpf_set_cb_flags':
>> net/core/filter.c:5232:11: error: 'struct sock' has no member named 'sk_bpf_cb_flags'
5232 | sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
| ^~
vim +5232 net/core/filter.c
5218
5219 static int sk_bpf_set_cb_flags(struct sock *sk, sockptr_t optval, bool getopt)
5220 {
5221 int sk_bpf_cb_flags;
5222
5223 if (getopt)
5224 return -EINVAL;
5225
5226 if (copy_from_sockptr(&sk_bpf_cb_flags, optval, sizeof(sk_bpf_cb_flags)))
5227 return -EFAULT;
5228
5229 if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
5230 return -EINVAL;
5231
> 5232 sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
5233
5234 return 0;
5235 }
5236
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt()
2024-12-07 17:37 ` [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt() Jason Xing
2024-12-09 4:28 ` kernel test robot
@ 2024-12-09 4:31 ` kernel test robot
2024-12-12 19:34 ` Martin KaFai Lau
2 siblings, 0 replies; 48+ messages in thread
From: kernel test robot @ 2024-12-09 4:31 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: llvm, oe-kbuild-all, bpf, netdev, Jason Xing
Hi Jason,
kernel test robot noticed the following build errors:
[auto build test ERROR on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Jason-Xing/net-timestamp-add-support-for-bpf_setsockopt/20241208-014111
base: net-next/main
patch link: https://lore.kernel.org/r/20241207173803.90744-2-kerneljasonxing%40gmail.com
patch subject: [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt()
config: x86_64-buildonly-randconfig-004-20241208 (https://download.01.org/0day-ci/archive/20241208/202412080629.IyOW2oUA-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241208/202412080629.IyOW2oUA-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412080629.IyOW2oUA-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from net/core/filter.c:21:
In file included from include/linux/bpf_verifier.h:7:
In file included from include/linux/bpf.h:21:
In file included from include/linux/kallsyms.h:13:
In file included from include/linux/mm.h:2223:
include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
net/core/filter.c:1726:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
1726 | .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:2041:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
2041 | .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~
net/core/filter.c:2043:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
2043 | .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~
net/core/filter.c:2580:35: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
2580 | .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~
net/core/filter.c:4643:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
4643 | .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:4657:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
4657 | .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:4857:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
4857 | .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:4885:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
4885 | .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:5057:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5057 | .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:5071:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5071 | .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:5120:45: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5120 | .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~
>> net/core/filter.c:5232:6: error: no member named 'sk_bpf_cb_flags' in 'struct sock'
5232 | sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
| ~~ ^
net/core/filter.c:5577:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5577 | .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:5611:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5611 | .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:5645:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5645 | .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:5679:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5679 | .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:5854:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5854 | .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:6391:46: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
6391 | .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
| ~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~
net/core/filter.c:6403:46: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
6403 | .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
| ~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~
net/core/filter.c:6489:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
6489 | .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
net/core/filter.c:6499:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
6499 | .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
21 warnings and 1 error generated.
vim +5232 net/core/filter.c
5049
5050 static const struct bpf_func_proto bpf_xdp_event_output_proto = {
5051 .func = bpf_xdp_event_output,
5052 .gpl_only = true,
5053 .ret_type = RET_INTEGER,
5054 .arg1_type = ARG_PTR_TO_CTX,
5055 .arg2_type = ARG_CONST_MAP_PTR,
5056 .arg3_type = ARG_ANYTHING,
> 5057 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5058 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
5059 };
5060
5061 BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)
5062
5063 const struct bpf_func_proto bpf_xdp_output_proto = {
5064 .func = bpf_xdp_event_output,
5065 .gpl_only = true,
5066 .ret_type = RET_INTEGER,
5067 .arg1_type = ARG_PTR_TO_BTF_ID,
5068 .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
5069 .arg2_type = ARG_CONST_MAP_PTR,
5070 .arg3_type = ARG_ANYTHING,
5071 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5072 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
5073 };
5074
5075 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
5076 {
5077 return skb->sk ? __sock_gen_cookie(skb->sk) : 0;
5078 }
5079
5080 static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
5081 .func = bpf_get_socket_cookie,
5082 .gpl_only = false,
5083 .ret_type = RET_INTEGER,
5084 .arg1_type = ARG_PTR_TO_CTX,
5085 };
5086
5087 BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
5088 {
5089 return __sock_gen_cookie(ctx->sk);
5090 }
5091
5092 static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
5093 .func = bpf_get_socket_cookie_sock_addr,
5094 .gpl_only = false,
5095 .ret_type = RET_INTEGER,
5096 .arg1_type = ARG_PTR_TO_CTX,
5097 };
5098
5099 BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
5100 {
5101 return __sock_gen_cookie(ctx);
5102 }
5103
5104 static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
5105 .func = bpf_get_socket_cookie_sock,
5106 .gpl_only = false,
5107 .ret_type = RET_INTEGER,
5108 .arg1_type = ARG_PTR_TO_CTX,
5109 };
5110
5111 BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
5112 {
5113 return sk ? sock_gen_cookie(sk) : 0;
5114 }
5115
5116 const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
5117 .func = bpf_get_socket_ptr_cookie,
5118 .gpl_only = false,
5119 .ret_type = RET_INTEGER,
5120 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
5121 };
5122
5123 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
5124 {
5125 return __sock_gen_cookie(ctx->sk);
5126 }
5127
5128 static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
5129 .func = bpf_get_socket_cookie_sock_ops,
5130 .gpl_only = false,
5131 .ret_type = RET_INTEGER,
5132 .arg1_type = ARG_PTR_TO_CTX,
5133 };
5134
5135 static u64 __bpf_get_netns_cookie(struct sock *sk)
5136 {
5137 const struct net *net = sk ? sock_net(sk) : &init_net;
5138
5139 return net->net_cookie;
5140 }
5141
5142 BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb)
5143 {
5144 return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL);
5145 }
5146
5147 static const struct bpf_func_proto bpf_get_netns_cookie_proto = {
5148 .func = bpf_get_netns_cookie,
5149 .ret_type = RET_INTEGER,
5150 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5151 };
5152
5153 BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
5154 {
5155 return __bpf_get_netns_cookie(ctx);
5156 }
5157
5158 static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
5159 .func = bpf_get_netns_cookie_sock,
5160 .gpl_only = false,
5161 .ret_type = RET_INTEGER,
5162 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5163 };
5164
5165 BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
5166 {
5167 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5168 }
5169
5170 static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
5171 .func = bpf_get_netns_cookie_sock_addr,
5172 .gpl_only = false,
5173 .ret_type = RET_INTEGER,
5174 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5175 };
5176
5177 BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
5178 {
5179 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5180 }
5181
5182 static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
5183 .func = bpf_get_netns_cookie_sock_ops,
5184 .gpl_only = false,
5185 .ret_type = RET_INTEGER,
5186 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5187 };
5188
5189 BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
5190 {
5191 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5192 }
5193
5194 static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
5195 .func = bpf_get_netns_cookie_sk_msg,
5196 .gpl_only = false,
5197 .ret_type = RET_INTEGER,
5198 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5199 };
5200
5201 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
5202 {
5203 struct sock *sk = sk_to_full_sk(skb->sk);
5204 kuid_t kuid;
5205
5206 if (!sk || !sk_fullsock(sk))
5207 return overflowuid;
5208 kuid = sock_net_uid(sock_net(sk), sk);
5209 return from_kuid_munged(sock_net(sk)->user_ns, kuid);
5210 }
5211
5212 static const struct bpf_func_proto bpf_get_socket_uid_proto = {
5213 .func = bpf_get_socket_uid,
5214 .gpl_only = false,
5215 .ret_type = RET_INTEGER,
5216 .arg1_type = ARG_PTR_TO_CTX,
5217 };
5218
5219 static int sk_bpf_set_cb_flags(struct sock *sk, sockptr_t optval, bool getopt)
5220 {
5221 int sk_bpf_cb_flags;
5222
5223 if (getopt)
5224 return -EINVAL;
5225
5226 if (copy_from_sockptr(&sk_bpf_cb_flags, optval, sizeof(sk_bpf_cb_flags)))
5227 return -EFAULT;
5228
5229 if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
5230 return -EINVAL;
5231
> 5232 sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
5233
5234 return 0;
5235 }
5236
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt()
2024-12-07 17:37 ` [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt() Jason Xing
2024-12-09 4:28 ` kernel test robot
2024-12-09 4:31 ` kernel test robot
@ 2024-12-12 19:34 ` Martin KaFai Lau
2024-12-13 14:12 ` Jason Xing
2 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-12 19:34 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/7/24 9:37 AM, Jason Xing wrote:
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 6625b3f563a4..f7e9f88e09b1 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5214,6 +5214,24 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> .arg1_type = ARG_PTR_TO_CTX,
> };
>
> +static int sk_bpf_set_cb_flags(struct sock *sk, sockptr_t optval, bool getopt)
It is confusing to take a sockptr_t argument. It is called by the kernel bpf
prog only. It must be from the kernel memory. Directly pass the "int
sk_bpf_cb_flags" as the argument.
> +{
> + int sk_bpf_cb_flags;
> +
> + if (getopt)
> + return -EINVAL;
> +
> + if (copy_from_sockptr(&sk_bpf_cb_flags, optval, sizeof(sk_bpf_cb_flags)))
It is an unnecessary copy. Directly use the "int sk_bpf_cb_flags" arg instead.
> + return -EFAULT;
This should never happen.
> +
> + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
> + return -EINVAL;
> +
> + sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
> +
> + return 0;
> +}
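Putting the comments above together, the shape being suggested is roughly the
following (a sketch only, not the actual next revision; the caller would then
pass *(int *)optval instead of wrapping optval in KERNEL_SOCKPTR()):

static int sk_bpf_set_cb_flags(struct sock *sk, int sk_bpf_cb_flags, bool getopt)
{
	if (getopt)
		return -EINVAL;

	if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
		return -EINVAL;

	sk->sk_bpf_cb_flags = sk_bpf_cb_flags;

	return 0;
}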
> +
> static int sol_socket_sockopt(struct sock *sk, int optname,
> char *optval, int *optlen,
> bool getopt)
> @@ -5230,6 +5248,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> case SO_MAX_PACING_RATE:
> case SO_BINDTOIFINDEX:
> case SO_TXREHASH:
> + case SK_BPF_CB_FLAGS:
> if (*optlen != sizeof(int))
> return -EINVAL;
> break;
> @@ -5239,6 +5258,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> return -EINVAL;
> }
>
> + if (optname == SK_BPF_CB_FLAGS)
> + return sk_bpf_set_cb_flags(sk, KERNEL_SOCKPTR(optval), getopt);
> +
> if (getopt) {
> if (optname == SO_BINDTODEVICE)
> return -EINVAL;
* Re: [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt()
2024-12-12 19:34 ` Martin KaFai Lau
@ 2024-12-13 14:12 ` Jason Xing
0 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-13 14:12 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Fri, Dec 13, 2024 at 3:35 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/7/24 9:37 AM, Jason Xing wrote:
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 6625b3f563a4..f7e9f88e09b1 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -5214,6 +5214,24 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
> > .arg1_type = ARG_PTR_TO_CTX,
> > };
> >
> > +static int sk_bpf_set_cb_flags(struct sock *sk, sockptr_t optval, bool getopt)
>
> It is confusing to take a sockptr_t argument. It is called by the kernel bpf
> prog only. It must be from the kernel memory. Directly pass the "int
> sk_bpf_cb_flags" as the argument.
Thanks. I will fix this.
>
> > +{
> > + int sk_bpf_cb_flags;
> > +
> > + if (getopt)
> > + return -EINVAL;
> > +
> > + if (copy_from_sockptr(&sk_bpf_cb_flags, optval, sizeof(sk_bpf_cb_flags)))
>
> It is an unnecessary copy. Directly use the "int sk_bpf_cb_flags" arg instead.
>
> > + return -EFAULT;
>
> This should never happen.
>
> > +
> > + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
> > + return -EINVAL;
> > +
> > + sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
> > +
> > + return 0;
> > +}
> > +
> > static int sol_socket_sockopt(struct sock *sk, int optname,
> > char *optval, int *optlen,
> > bool getopt)
> > @@ -5230,6 +5248,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> > case SO_MAX_PACING_RATE:
> > case SO_BINDTOIFINDEX:
> > case SO_TXREHASH:
> > + case SK_BPF_CB_FLAGS:
> > if (*optlen != sizeof(int))
> > return -EINVAL;
> > break;
> > @@ -5239,6 +5258,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
> > return -EINVAL;
> > }
> >
> > + if (optname == SK_BPF_CB_FLAGS)
> > + return sk_bpf_set_cb_flags(sk, KERNEL_SOCKPTR(optval), getopt);
> > +
> > if (getopt) {
> > if (optname == SO_BINDTODEVICE)
> > return -EINVAL;
* [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
2024-12-07 17:37 ` [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt() Jason Xing
@ 2024-12-07 17:37 ` Jason Xing
2024-12-11 2:02 ` Martin KaFai Lau
2024-12-07 17:37 ` [PATCH net-next v4 03/11] net-timestamp: reorganize in skb_tstamp_tx_output() Jason Xing
` (8 subsequent siblings)
10 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:37 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Later, I would introduce three points to report some information
to user space based on this.
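As a rough illustration, such a point could invoke the new helper like the
sketch below (not part of this patch; BPF_SOCK_OPS_TS_SND_CB is a placeholder
name here, the real op values arrive with the later patches in the series):

	/* e.g. somewhere in the tx timestamping path */
	if (SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && sk_fullsock(sk))
		bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TS_SND_CB);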
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/sock.h | 7 +++++++
net/core/sock.c | 15 +++++++++++++++
2 files changed, 22 insertions(+)
diff --git a/include/net/sock.h b/include/net/sock.h
index 0dd464ba9e46..f88a00108a2f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2920,6 +2920,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
void sock_enable_timestamps(struct sock *sk);
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
+#else
+static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+}
+#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
diff --git a/net/core/sock.c b/net/core/sock.c
index 74729d20cd00..79cb5c74c76c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -941,6 +941,21 @@ int sock_set_timestamping(struct sock *sk, int optname,
return 0;
}
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
+}
+#endif
+
void sock_set_keepalive(struct sock *sk)
{
lock_sock(sk);
--
2.37.3
* Re: [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use
2024-12-07 17:37 ` [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use Jason Xing
@ 2024-12-11 2:02 ` Martin KaFai Lau
2024-12-11 9:17 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-11 2:02 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/7/24 9:37 AM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Later, I would introduce three points to report some information
> to user space based on this.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> include/net/sock.h | 7 +++++++
> net/core/sock.c | 15 +++++++++++++++
> 2 files changed, 22 insertions(+)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 0dd464ba9e46..f88a00108a2f 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2920,6 +2920,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
> struct so_timestamping timestamping);
>
> void sock_enable_timestamps(struct sock *sk);
> +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
> +#else
> +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> +{
> +}
> +#endif
> void sock_no_linger(struct sock *sk);
> void sock_set_keepalive(struct sock *sk);
> void sock_set_priority(struct sock *sk, u32 priority);
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 74729d20cd00..79cb5c74c76c 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -941,6 +941,21 @@ int sock_set_timestamping(struct sock *sk, int optname,
> return 0;
> }
>
> +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> +{
> + struct bpf_sock_ops_kern sock_ops;
> +
> + sock_owned_by_me(sk);
I don't think this can be assumed in the time stamping callback.
To remove this assumption for sockops, I believe it needs to stop the bpf prog
from calling a few bpf helpers. In particular, the bpf_sock_ops_cb_flags_set and
bpf_sock_ops_setsockopt. This should be easy by asking the helpers to check the
"u8 op" in "struct bpf_sock_ops_kern *".
I just noticed a trickier one, sockops bpf prog can write to sk->sk_txhash. The
same should go for reading from sk. Also, sockops prog assumes a fullsock sk is
a tcp_sock which also won't work for the udp case. A quick thought is to do
something similar to is_fullsock. May be repurpose the is_fullsock somehow or a
new u8 is needed. Take a look at SOCK_OPS_{GET,SET}_FIELD. It avoids
writing/reading the sk when is_fullsock is false.
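For reference, the guard those macros emit looks conceptually like the plain-C
sketch below (illustrative only, not the literal macro, which expands to BPF
instructions in sock_ops_convert_ctx_access()):

static u32 sock_ops_get_snd_cwnd(const struct bpf_sock_ops_kern *sock_ops)
{
	/* Only dereference the tcp_sock when the ctx carries a full (TCP)
	 * socket; otherwise the prog just reads 0.  A similar gate could
	 * cover the new timestamping callbacks for the udp case.
	 */
	if (!sock_ops->is_fullsock)
		return 0;

	return tcp_sk(sock_ops->sk)->snd_cwnd;
}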
This is a signal that the existing sockops interface has already seen better
days. I hope not too many fixes like these are needed to get tcp/udp
timestamping to work.
> +
> + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
> + sock_ops.op = op;
> + sock_ops.is_fullsock = 1;
I don't think we can assume it is always is_fullsock either.
> + sock_ops.sk = sk;
> + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
Same here. sk may not be fullsock. BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops) is
needed.
[ I will continue the rest of the set later. ]
> +}
> +#endif
> +
> void sock_set_keepalive(struct sock *sk)
> {
> lock_sock(sk);
* Re: [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use
2024-12-11 2:02 ` Martin KaFai Lau
@ 2024-12-11 9:17 ` Jason Xing
2024-12-13 1:41 ` Martin KaFai Lau
0 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-11 9:17 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Dec 11, 2024 at 10:02 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/7/24 9:37 AM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Later, I would introduce three points to report some information
> > to user space based on this.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > include/net/sock.h | 7 +++++++
> > net/core/sock.c | 15 +++++++++++++++
> > 2 files changed, 22 insertions(+)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 0dd464ba9e46..f88a00108a2f 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -2920,6 +2920,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
> > struct so_timestamping timestamping);
> >
> > void sock_enable_timestamps(struct sock *sk);
> > +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> > +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
> > +#else
> > +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> > +{
> > +}
> > +#endif
> > void sock_no_linger(struct sock *sk);
> > void sock_set_keepalive(struct sock *sk);
> > void sock_set_priority(struct sock *sk, u32 priority);
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 74729d20cd00..79cb5c74c76c 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -941,6 +941,21 @@ int sock_set_timestamping(struct sock *sk, int optname,
> > return 0;
> > }
> >
> > +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> > +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> > +{
> > + struct bpf_sock_ops_kern sock_ops;
> > +
> > + sock_owned_by_me(sk);
>
> I don't think this can be assumed in the time stamping callback.
I'll remove this.
>
> To remove this assumption for sockops, I believe it needs to stop the bpf prog
> from calling a few bpf helpers. In particular, the bpf_sock_ops_cb_flags_set and
> bpf_sock_ops_setsockopt. This should be easy by asking the helpers to check the
> "u8 op" in "struct bpf_sock_ops_kern *".
Sorry, I don't follow. Could you rephrase your thoughts? Thanks.
>
> I just noticed a trickier one, sockops bpf prog can write to sk->sk_txhash. The
> same should go for reading from sk. Also, sockops prog assumes a fullsock sk is
> a tcp_sock which also won't work for the udp case. A quick thought is to do
> something similar to is_fullsock. May be repurpose the is_fullsock somehow or a
> new u8 is needed. Take a look at SOCK_OPS_{GET,SET}_FIELD. It avoids
> writing/reading the sk when is_fullsock is false.
Do you mean that if we introduce a new field, then bpf prog can
read/write the socket?
Reading the socket could be very helpful in the long run.
>
> This is a signal that the existing sockops interface has already seen better
> days. I hope not too many fixes like these are needed to get tcp/udp
> timestamping to work.
>
> > +
> > + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
> > + sock_ops.op = op;
> > + sock_ops.is_fullsock = 1;
>
> I don't think we can assume it is always is_fullsock either.
Right, but for now, TCP seems to need this. I can remove this also.
>
> > + sock_ops.sk = sk;
> > + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
>
> Same here. sk may not be fullsock. BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops) is
> needed.
If we use this helper, we will have to change it again when the udp bpf
extension needs to be supported.
>
> [ I will continue the rest of the set later. ]
Thanks a lot :)
>
> > +}
> > +#endif
> > +
> > void sock_set_keepalive(struct sock *sk)
> > {
> > lock_sock(sk);
>
* Re: [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use
2024-12-11 9:17 ` Jason Xing
@ 2024-12-13 1:41 ` Martin KaFai Lau
2024-12-13 14:42 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-13 1:41 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/11/24 1:17 AM, Jason Xing wrote:
> On Wed, Dec 11, 2024 at 10:02 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 12/7/24 9:37 AM, Jason Xing wrote:
>>> From: Jason Xing <kernelxing@tencent.com>
>>>
>>> Later, I would introduce three points to report some information
>>> to user space based on this.
>>>
>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
>>> ---
>>> include/net/sock.h | 7 +++++++
>>> net/core/sock.c | 15 +++++++++++++++
>>> 2 files changed, 22 insertions(+)
>>>
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index 0dd464ba9e46..f88a00108a2f 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -2920,6 +2920,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
>>> struct so_timestamping timestamping);
>>>
>>> void sock_enable_timestamps(struct sock *sk);
>>> +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
>>> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
>>> +#else
>>> +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
>>> +{
>>> +}
>>> +#endif
>>> void sock_no_linger(struct sock *sk);
>>> void sock_set_keepalive(struct sock *sk);
>>> void sock_set_priority(struct sock *sk, u32 priority);
>>> diff --git a/net/core/sock.c b/net/core/sock.c
>>> index 74729d20cd00..79cb5c74c76c 100644
>>> --- a/net/core/sock.c
>>> +++ b/net/core/sock.c
>>> @@ -941,6 +941,21 @@ int sock_set_timestamping(struct sock *sk, int optname,
>>> return 0;
>>> }
>>>
>>> +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
>>> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
>>> +{
>>> + struct bpf_sock_ops_kern sock_ops;
>>> +
>>> + sock_owned_by_me(sk);
>>
>> I don't think this can be assumed in the time stamping callback.
>
> I'll remove this.
>
>>
>> To remove this assumption for sockops, I believe it needs to stop the bpf prog
>> from calling a few bpf helpers. In particular, the bpf_sock_ops_cb_flags_set and
>> bpf_sock_ops_setsockopt. This should be easy by asking the helpers to check the
>> "u8 op" in "struct bpf_sock_ops_kern *".
>
> Sorry, I don't follow. Could you rephrase your thoughts? Thanks.
Take a look at bpf_sock_ops_setsockopt in filter.c. To change a sk, it needs to
hold the sk_lock. If you drill down bpf_sock_ops_setsockopt,
sock_owned_by_me(sk) is checked somewhere.
The sk_lock held assumption is true so far for the existing sockops callbacks.
The new timestamping sockops callback does not necessarily have the sk_lock held,
so it will break the bpf_sock_ops_setsockopt() assumption on the sk_lock.
>
>>
>> I just noticed a trickier one, sockops bpf prog can write to sk->sk_txhash. The
>> same should go for reading from sk. Also, sockops prog assumes a fullsock sk is
>> a tcp_sock which also won't work for the udp case. A quick thought is to do
>> something similar to is_fullsock. May be repurpose the is_fullsock somehow or a
>> new u8 is needed. Take a look at SOCK_OPS_{GET,SET}_FIELD. It avoids
>> writing/reading the sk when is_fullsock is false.
>
> Do you mean that if we introduce a new field, then bpf prog can
> read/write the socket?
The same goes for writing the sk, e.g. writing the sk->sk_txhash. It needs the
sk_lock held. Reading may be ok-ish. The bpf prog can read it anyway by
bpf_probe_read...etc.
When adding udp timestamp callback later, it needs to stop reading the tcp_sock
through skops from the udp callback for sure. Do take a look at
SOCK_OPS_GET_TCP_SOCK_FIELD. I think we need to ensure the udp timestamp
callback won't break here before moving forward.
>
> Reading the socket could be very helpful in the long run.
>
>>
>> This is a signal that the existing sockops interface has already seen better
>> days. I hope not too many fixes like these are needed to get tcp/udp
>> timestamping to work.
>>
>>> +
>>> + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
>>> + sock_ops.op = op;
>>> + sock_ops.is_fullsock = 1;
>>
>> I don't think we can assume it is always is_fullsock either.
>
> Right, but for now, TCP seems to need this. I can remove this also.
I take this back. After reading the existing __skb_tstamp_tx, I think sk is
always fullsock here.
>
>>
>>> + sock_ops.sk = sk;
>>> + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
>>
>> Same here. sk may not be fullsock. BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops) is
>> needed.
>
> If we use this helper, we will change when the udp bpf extension needs
> to be supported.
>
>>
>> [ I will continue the rest of the set later. ]
>
> Thanks a lot :)
>
>>
>>> +}
>>> +#endif
>>> +
>>> void sock_set_keepalive(struct sock *sk)
>>> {
>>> lock_sock(sk);
>>
* Re: [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use
2024-12-13 1:41 ` Martin KaFai Lau
@ 2024-12-13 14:42 ` Jason Xing
2024-12-13 22:26 ` Martin KaFai Lau
0 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-13 14:42 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Fri, Dec 13, 2024 at 9:41 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/11/24 1:17 AM, Jason Xing wrote:
> > On Wed, Dec 11, 2024 at 10:02 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 12/7/24 9:37 AM, Jason Xing wrote:
> >>> From: Jason Xing <kernelxing@tencent.com>
> >>>
> >>> Later, I would introduce three points to report some information
> >>> to user space based on this.
> >>>
> >>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >>> ---
> >>> include/net/sock.h | 7 +++++++
> >>> net/core/sock.c | 15 +++++++++++++++
> >>> 2 files changed, 22 insertions(+)
> >>>
> >>> diff --git a/include/net/sock.h b/include/net/sock.h
> >>> index 0dd464ba9e46..f88a00108a2f 100644
> >>> --- a/include/net/sock.h
> >>> +++ b/include/net/sock.h
> >>> @@ -2920,6 +2920,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
> >>> struct so_timestamping timestamping);
> >>>
> >>> void sock_enable_timestamps(struct sock *sk);
> >>> +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> >>> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
> >>> +#else
> >>> +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> >>> +{
> >>> +}
> >>> +#endif
> >>> void sock_no_linger(struct sock *sk);
> >>> void sock_set_keepalive(struct sock *sk);
> >>> void sock_set_priority(struct sock *sk, u32 priority);
> >>> diff --git a/net/core/sock.c b/net/core/sock.c
> >>> index 74729d20cd00..79cb5c74c76c 100644
> >>> --- a/net/core/sock.c
> >>> +++ b/net/core/sock.c
> >>> @@ -941,6 +941,21 @@ int sock_set_timestamping(struct sock *sk, int optname,
> >>> return 0;
> >>> }
> >>>
> >>> +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> >>> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> >>> +{
> >>> + struct bpf_sock_ops_kern sock_ops;
> >>> +
> >>> + sock_owned_by_me(sk);
> >>
> >> I don't think this can be assumed in the time stamping callback.
> >
> > I'll remove this.
> >
> >>
> >> To remove this assumption for sockops, I believe it needs to stop the bpf prog
> >> from calling a few bpf helpers. In particular, the bpf_sock_ops_cb_flags_set and
> >> bpf_sock_ops_setsockopt. This should be easy by asking the helpers to check the
> >> "u8 op" in "struct bpf_sock_ops_kern *".
> >
> > Sorry, I don't follow. Could you rephrase your thoughts? Thanks.
>
> Take a look at bpf_sock_ops_setsockopt in filter.c. To change a sk, it needs to
> hold the sk_lock. If you drill down bpf_sock_ops_setsockopt,
> sock_owned_by_me(sk) is checked somewhere.
Thanks, now I totally follow.
>
> The sk_lock held assumption is true so far for the existing sockops callbacks.
> The new timestamping sockops callback does not necessarily have the sk_lock held,
Well, for TCP only, there are three new callbacks that I think already hold
the sk_lock; say, tcp_sendmsg() will call lock_sock(). For other types, like
you said, maybe not.
> so it will break the bpf_sock_ops_setsockopt() assumption on the sk_lock.
Agreed; at least write access to the sk is actually not allowed there.
>
> >
> >>
> >> I just noticed a trickier one, sockops bpf prog can write to sk->sk_txhash. The
> >> same should go for reading from sk. Also, sockops prog assumes a fullsock sk is
> >> a tcp_sock which also won't work for the udp case. A quick thought is to do
> >> something similar to is_fullsock. May be repurpose the is_fullsock somehow or a
> >> new u8 is needed. Take a look at SOCK_OPS_{GET,SET}_FIELD. It avoids
> >> writing/reading the sk when is_fullsock is false.
> >
> > Do you mean that if we introduce a new field, then bpf prog can
> > read/write the socket?
>
> The same goes for writing the sk, e.g. writing the sk->sk_txhash. It needs the
> sk_lock held. Reading may be ok-ish. The bpf prog can read it anyway by
> bpf_probe_read...etc.
>
> When adding udp timestamp callback later, it needs to stop reading the tcp_sock
> through skops from the udp callback for sure. Do take a look at
> SOCK_OPS_GET_TCP_SOCK_FIELD. I think we need to ensure the udp timestamp
> callback won't break here before moving forward.
Agreed. Removing the "sock_ops.sk = sk;" is simple, but I still want
the bpf prog to be able to read some fields from the socket under
those new callbacks.
Let me figure out a feasible solution this weekend if everything goes well.
>
> >
> > Reading the socket could be very helpful in the long run.
> >
> >>
> >> This is a signal that the existing sockops interface has already seen better
> >> days. I hope not too many fixes like these are needed to get tcp/udp
> >> timestamping to work.
> >>
> >>> +
> >>> + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
> >>> + sock_ops.op = op;
> >>> + sock_ops.is_fullsock = 1;
> >>
> >> I don't think we can assume it is always is_fullsock either.
> >
> > Right, but for now, TCP seems to need this. I can remove this also.
>
> I take this back. After reading the existing __skb_tstamp_tx, I think sk is
> always fullsock here.
Got it.
>
> >
> >>
> >>> + sock_ops.sk = sk;
> >>> + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
> >>
> >> Same here. sk may not be fullsock. BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops) is
> >> needed.
> >
> > If we use this helper, we will have to change it again when the udp bpf
> > extension needs to be supported.
> >
> >>
> >> [ I will continue the rest of the set later. ]
> >
> > Thanks a lot :)
> >
> >>
> >>> +}
> >>> +#endif
> >>> +
> >>> void sock_set_keepalive(struct sock *sk)
> >>> {
> >>> lock_sock(sk);
> >>
>
* Re: [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use
2024-12-13 14:42 ` Jason Xing
@ 2024-12-13 22:26 ` Martin KaFai Lau
2024-12-13 23:56 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-13 22:26 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/13/24 6:42 AM, Jason Xing wrote:
>>>> I just noticed a trickier one, sockops bpf prog can write to sk->sk_txhash. The
>>>> same should go for reading from sk. Also, sockops prog assumes a fullsock sk is
>>>> a tcp_sock which also won't work for the udp case. A quick thought is to do
>>>> something similar to is_fullsock. May be repurpose the is_fullsock somehow or a
>>>> new u8 is needed. Take a look at SOCK_OPS_{GET,SET}_FIELD. It avoids
>>>> writing/reading the sk when is_fullsock is false.
Maybe this message was buried in the earlier reply or some piece was not clear,
so it is worth highlighting here.
Take a look at how is_fullsock is used in SOCK_OPS_{GET,SET}_FIELD. I think a
similar idea can be borrowed here.
>>>
>>> Do you mean that if we introduce a new field, then bpf prog can
>>> read/write the socket?
>>
>> The same goes for writing the sk, e.g. writing the sk->sk_txhash. It needs the
>> sk_lock held. Reading may be ok-ish. The bpf prog can read it anyway by
>> bpf_probe_read...etc.
>>
>> When adding udp timestamp callback later, it needs to stop reading the tcp_sock
>> through skops from the udp callback for sure. Do take a look at
>> SOCK_OPS_GET_TCP_SOCK_FIELD. I think we need to ensure the udp timestamp
>> callback won't break here before moving forward.
>
> Agreed. Removing the "sock_ops.sk = sk;" is simple, but I still want
> the bpf prog to be able to read some fields from the socket under
> those new callbacks.
No need to remove "sock_ops.sk = sk;". Try to borrow the is_fullsock idea.
Overall, the new timestamp callback breaks assumptions like, sk_lock is held and
is_fullsock must be a tcp_sock. This needs to be audited. In particular, please
check sock_ops_func_proto() for all accessible bpf helpers. Also check the
sock_ops_is_valid_access() and sock_ops_convert_ctx_access() for directly
accessible fields without the helpers. In particular, the BPF_WRITE (able)
fields and the tcp_sock fields.
* Re: [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use
2024-12-13 22:26 ` Martin KaFai Lau
@ 2024-12-13 23:56 ` Jason Xing
0 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-13 23:56 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Sat, Dec 14, 2024 at 6:26 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/13/24 6:42 AM, Jason Xing wrote:
> >>>> I just noticed a trickier one, sockops bpf prog can write to sk->sk_txhash. The
> >>>> same should go for reading from sk. Also, sockops prog assumes a fullsock sk is
> >>>> a tcp_sock which also won't work for the udp case. A quick thought is to do
> >>>> something similar to is_fullsock. May be repurpose the is_fullsock somehow or a
> >>>> new u8 is needed. Take a look at SOCK_OPS_{GET,SET}_FIELD. It avoids
> >>>> writing/reading the sk when is_fullsock is false.
>
> Maybe this message was buried in the earlier reply or some piece was not clear,
> so it is worth highlighting here.
>
> Take a look at how is_fullsock is used in SOCK_OPS_{GET,SET}_FIELD. I think a
> similar idea can be borrowed here.
>
> >>>
> >>> Do you mean that if we introduce a new field, then bpf prog can
> >>> read/write the socket?
> >>
> >> The same goes for writing the sk, e.g. writing the sk->sk_txhash. It needs the
> >> sk_lock held. Reading may be ok-ish. The bpf prog can read it anyway by
> >> bpf_probe_read...etc.
> >>
> >> When adding udp timestamp callback later, it needs to stop reading the tcp_sock
> >> through skops from the udp callback for sure. Do take a look at
> >> SOCK_OPS_GET_TCP_SOCK_FIELD. I think we need to ensure the udp timestamp
> >> callback won't break here before moving forward.
> >
> > Agreed. Removing the "sock_ops.sk = sk;" is simple, but I still want
> > the bpf prog to be able to read some fields from the socket under
> > those new callbacks.
>
> No need to remove "sock_ops.sk = sk;". Try to borrow the is_fullsock idea.
>
> Overall, the new timestamp callback breaks assumptions like, sk_lock is held and
> is_fullsock must be a tcp_sock. This needs to be audited. In particular, please
> check sock_ops_func_proto() for all accessible bpf helpers. Also check the
> sock_ops_is_valid_access() and sock_ops_convert_ctx_access() for directly
> accessible fields without the helpers. In particular, the BPF_WRITE (able)
> fields and the tcp_sock fields.
Thanks for the valuable information. I will dig into them.
* [PATCH net-next v4 03/11] net-timestamp: reorganize in skb_tstamp_tx_output()
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
2024-12-07 17:37 ` [PATCH net-next v4 01/11] net-timestamp: add support for bpf_setsockopt() Jason Xing
2024-12-07 17:37 ` [PATCH net-next v4 02/11] net-timestamp: prepare for bpf prog use Jason Xing
@ 2024-12-07 17:37 ` Jason Xing
2024-12-09 14:37 ` Willem de Bruijn
2024-12-07 17:37 ` [PATCH net-next v4 04/11] net-timestamp: support SCM_TSTAMP_SCHED for bpf extension Jason Xing
` (7 subsequent siblings)
10 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:37 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
It's a prep for the bpf reporting function added later. This patch only puts
the original timestamp-generating logic into one function, so that the bpf
reporting can be integrated easily. No functional changes here.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/linux/skbuff.h | 4 ++--
net/core/dev.c | 3 +--
net/core/skbuff.c | 41 +++++++++++++++++++++++++++++++++++------
3 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 58009fa66102..53c6913560e4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -39,6 +39,7 @@
#include <net/net_debug.h>
#include <net/dropreason-core.h>
#include <net/netmem.h>
+#include <uapi/linux/errqueue.h>
/**
* DOC: skb checksums
@@ -4535,8 +4536,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
skb_clone_tx_timestamp(skb);
- if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
- skb_tstamp_tx(skb, NULL);
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SND);
}
/**
diff --git a/net/core/dev.c b/net/core/dev.c
index 45a8c3dd4a64..5d584950564b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4350,8 +4350,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
skb_assert_len(skb);
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
- __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6841e61a6bd0..74b840ffaf94 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5539,10 +5539,10 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
-void __skb_tstamp_tx(struct sk_buff *orig_skb,
- const struct sk_buff *ack_skb,
- struct skb_shared_hwtstamps *hwtstamps,
- struct sock *sk, int tstype)
+static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
{
struct sk_buff *skb;
bool tsonly, opt_stats = false;
@@ -5594,13 +5594,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
+
+static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype)
+{
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ return true;
+ return false;
+ case SCM_TSTAMP_SND:
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP))
+ return true;
+ return false;
+ case SCM_TSTAMP_ACK:
+ if (TCP_SKB_CB(skb)->txstamp_ack)
+ return true;
+ return false;
+ }
+
+ return false;
+}
+
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
+{
+ if (unlikely(skb_tstamp_is_set(orig_skb, tstype)))
+ skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
+}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps)
{
- return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
- SCM_TSTAMP_SND);
+ return skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk,
+ SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
--
2.37.3
* Re: [PATCH net-next v4 03/11] net-timestamp: reorganize in skb_tstamp_tx_output()
2024-12-07 17:37 ` [PATCH net-next v4 03/11] net-timestamp: reorganize in skb_tstamp_tx_output() Jason Xing
@ 2024-12-09 14:37 ` Willem de Bruijn
2024-12-09 14:57 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Willem de Bruijn @ 2024-12-09 14:37 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> It's a prep for the bpf reporting function added later. This patch only puts
> the original timestamp-generating logic into one function, so that the bpf
> reporting can be integrated easily. No functional changes here.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> include/linux/skbuff.h | 4 ++--
> net/core/dev.c | 3 +--
> net/core/skbuff.c | 41 +++++++++++++++++++++++++++++++++++------
> 3 files changed, 38 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 58009fa66102..53c6913560e4 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -39,6 +39,7 @@
> #include <net/net_debug.h>
> #include <net/dropreason-core.h>
> #include <net/netmem.h>
> +#include <uapi/linux/errqueue.h>
>
> /**
> * DOC: skb checksums
> @@ -4535,8 +4536,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
> static inline void skb_tx_timestamp(struct sk_buff *skb)
> {
> skb_clone_tx_timestamp(skb);
> - if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
> - skb_tstamp_tx(skb, NULL);
> + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SND);
> }
>
> /**
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 45a8c3dd4a64..5d584950564b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4350,8 +4350,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
> skb_reset_mac_header(skb);
> skb_assert_len(skb);
>
> - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
> - __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
> + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
This adds a function call in the hot path, as __skb_tstamp_tx is
defined in a .c file.
Currently this is only a well-predicted branch on a likely cache-hot
variable.
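For example, keeping a cheap flags test at the call site preserves that
predicted branch and only pays the function call when a timestamping bit
is actually set. An untested sketch (SKBTX_BPF refers to the new bit this
series introduces in a later patch):

	/* untested sketch: keep the common case a branch at the call site */
	if (unlikely(skb_shinfo(skb)->tx_flags &
		     (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);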
>
> /* Disable soft irqs for various locks below. Also
> * stops preemption for RCU.
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 6841e61a6bd0..74b840ffaf94 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5539,10 +5539,10 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> }
> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
>
> -void __skb_tstamp_tx(struct sk_buff *orig_skb,
> - const struct sk_buff *ack_skb,
> - struct skb_shared_hwtstamps *hwtstamps,
> - struct sock *sk, int tstype)
> +static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> + const struct sk_buff *ack_skb,
> + struct skb_shared_hwtstamps *hwtstamps,
> + struct sock *sk, int tstype)
> {
> struct sk_buff *skb;
> bool tsonly, opt_stats = false;
> @@ -5594,13 +5594,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
>
> __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
> }
> +
> +static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype)
> +{
> + switch (tstype) {
> + case SCM_TSTAMP_SCHED:
> + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
> + return true;
> + return false;
> + case SCM_TSTAMP_SND:
> + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP))
> + return true;
Also true for SKBTX_HW_TSTAMP
> + return false;
> + case SCM_TSTAMP_ACK:
> + if (TCP_SKB_CB(skb)->txstamp_ack)
> + return true;
> + return false;
> + }
> +
> + return false;
> +}
> +
> +void __skb_tstamp_tx(struct sk_buff *orig_skb,
> + const struct sk_buff *ack_skb,
> + struct skb_shared_hwtstamps *hwtstamps,
> + struct sock *sk, int tstype)
> +{
> + if (unlikely(skb_tstamp_is_set(orig_skb, tstype)))
> + skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
> +}
> EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
>
> void skb_tstamp_tx(struct sk_buff *orig_skb,
> struct skb_shared_hwtstamps *hwtstamps)
> {
> - return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
> - SCM_TSTAMP_SND);
> + return skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk,
> + SCM_TSTAMP_SND);
> }
> EXPORT_SYMBOL_GPL(skb_tstamp_tx);
>
> --
> 2.37.3
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 03/11] net-timestamp: reorganize in skb_tstamp_tx_output()
2024-12-09 14:37 ` Willem de Bruijn
@ 2024-12-09 14:57 ` Jason Xing
0 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-09 14:57 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Mon, Dec 9, 2024 at 10:37 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > It's a prep for bpf print function later. This patch only puts the
> > original generating logic into one function, so that we integrate
> > bpf print easily. No functional changes here.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > include/linux/skbuff.h | 4 ++--
> > net/core/dev.c | 3 +--
> > net/core/skbuff.c | 41 +++++++++++++++++++++++++++++++++++------
> > 3 files changed, 38 insertions(+), 10 deletions(-)
> >
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 58009fa66102..53c6913560e4 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -39,6 +39,7 @@
> > #include <net/net_debug.h>
> > #include <net/dropreason-core.h>
> > #include <net/netmem.h>
> > +#include <uapi/linux/errqueue.h>
> >
> > /**
> > * DOC: skb checksums
> > @@ -4535,8 +4536,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
> > static inline void skb_tx_timestamp(struct sk_buff *skb)
> > {
> > skb_clone_tx_timestamp(skb);
> > - if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
> > - skb_tstamp_tx(skb, NULL);
> > + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SND);
> > }
> >
> > /**
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 45a8c3dd4a64..5d584950564b 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -4350,8 +4350,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
> > skb_reset_mac_header(skb);
> > skb_assert_len(skb);
> >
> > - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
> > - __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
> > + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
>
> This adds a function call in the hot path, as __skb_tstamp_tx is
> defined in a .c file.
>
> Currently this is only well predicted branch on a likely cache hot
> variable.
Oh, right, thanks for reminding me. I will figure out a better solution :)
> >
> > /* Disable soft irqs for various locks below. Also
> > * stops preemption for RCU.
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 6841e61a6bd0..74b840ffaf94 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5539,10 +5539,10 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> > }
> > EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> >
> > -void __skb_tstamp_tx(struct sk_buff *orig_skb,
> > - const struct sk_buff *ack_skb,
> > - struct skb_shared_hwtstamps *hwtstamps,
> > - struct sock *sk, int tstype)
> > +static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > + const struct sk_buff *ack_skb,
> > + struct skb_shared_hwtstamps *hwtstamps,
> > + struct sock *sk, int tstype)
> > {
> > struct sk_buff *skb;
> > bool tsonly, opt_stats = false;
> > @@ -5594,13 +5594,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
> >
> > __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
> > }
> > +
> > +static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype)
> > +{
> > + switch (tstype) {
> > + case SCM_TSTAMP_SCHED:
> > + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
> > + return true;
> > + return false;
> > + case SCM_TSTAMP_SND:
> > + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP))
> > + return true;
>
> Also true for SKBTX_HW_TSTAMP
>
> > + return false;
> > + case SCM_TSTAMP_ACK:
> > + if (TCP_SKB_CB(skb)->txstamp_ack)
> > + return true;
> > + return false;
> > + }
> > +
> > + return false;
> > +}
> > +
> > +void __skb_tstamp_tx(struct sk_buff *orig_skb,
> > + const struct sk_buff *ack_skb,
> > + struct skb_shared_hwtstamps *hwtstamps,
> > + struct sock *sk, int tstype)
> > +{
> > + if (unlikely(skb_tstamp_is_set(orig_skb, tstype)))
> > + skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
> > +}
> > EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
> >
> > void skb_tstamp_tx(struct sk_buff *orig_skb,
> > struct skb_shared_hwtstamps *hwtstamps)
> > {
> > - return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
> > - SCM_TSTAMP_SND);
> > + return skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk,
> > + SCM_TSTAMP_SND);
> > }
> > EXPORT_SYMBOL_GPL(skb_tstamp_tx);
> >
> > --
> > 2.37.3
> >
>
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH net-next v4 04/11] net-timestamp: support SCM_TSTAMP_SCHED for bpf extension
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (2 preceding siblings ...)
2024-12-07 17:37 ` [PATCH net-next v4 03/11] net-timestamp: reorganize in skb_tstamp_tx_output() Jason Xing
@ 2024-12-07 17:37 ` Jason Xing
2024-12-07 17:37 ` [PATCH net-next v4 05/11] net-timestamp: support SCM_TSTAMP_SND " Jason Xing
` (6 subsequent siblings)
10 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:37 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Introduce SKBTX_BPF as an indicator telling us whether the skb should
be traced by the bpf prog.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/linux/skbuff.h | 6 +++++-
include/uapi/linux/bpf.h | 5 +++++
net/core/skbuff.c | 29 ++++++++++++++++++++++++++---
tools/include/uapi/linux/bpf.h | 5 +++++
4 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 53c6913560e4..af22c8326ca5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -490,10 +490,14 @@ enum {
/* generate software time stamp when entering packet scheduling */
SKBTX_SCHED_TSTAMP = 1 << 6,
+
+ /* used for bpf extension when a bpf program is loaded */
+ SKBTX_BPF = 1 << 7,
};
#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
- SKBTX_SCHED_TSTAMP)
+ SKBTX_SCHED_TSTAMP | \
+ SKBTX_BPF)
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \
SKBTX_HW_TSTAMP_USE_CYCLES | \
SKBTX_ANY_SW_TSTAMP)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e629e09b0b31..72f93c6e45c1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7022,6 +7022,11 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
+ BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
+ * dev layer when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 74b840ffaf94..fd4f06b88a2e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5539,6 +5539,24 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype)
+{
+ int op;
+
+ if (!sk)
+ return;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ op = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
+ break;
+ default:
+ return;
+ }
+
+ bpf_skops_tx_timestamping(sk, skb, op);
+}
+
static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
@@ -5595,11 +5613,14 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
-static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype)
+static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype, bool bpf_mode)
{
+ int flag;
+
switch (tstype) {
case SCM_TSTAMP_SCHED:
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ flag = bpf_mode ? SKBTX_BPF : SKBTX_SCHED_TSTAMP;
+ if (unlikely(skb_shinfo(skb)->tx_flags & flag))
return true;
return false;
case SCM_TSTAMP_SND:
@@ -5620,8 +5641,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
- if (unlikely(skb_tstamp_is_set(orig_skb, tstype)))
+ if (unlikely(skb_tstamp_is_set(orig_skb, tstype, false)))
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
+ if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
+ __skb_tstamp_tx_bpf(sk, orig_skb, tstype);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6b0a5b787b12..891217ab6d2d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7015,6 +7015,11 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
+ BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
+ * dev layer when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
--
2.37.3
^ permalink raw reply related [flat|nested] 48+ messages in thread
* [PATCH net-next v4 05/11] net-timestamp: support SCM_TSTAMP_SND for bpf extension
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (3 preceding siblings ...)
2024-12-07 17:37 ` [PATCH net-next v4 04/11] net-timestamp: support SCM_TSTAMP_SCHED for bpf extension Jason Xing
@ 2024-12-07 17:37 ` Jason Xing
2024-12-07 17:37 ` [PATCH net-next v4 06/11] net-timestamp: support SCM_TSTAMP_ACK " Jason Xing
` (5 subsequent siblings)
10 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:37 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Support the SCM_TSTAMP_SND case. Then we will get the timestamp when
the driver is about to send the skb.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/uapi/linux/bpf.h | 5 +++++
net/core/skbuff.c | 13 ++++++++++---
tools/include/uapi/linux/bpf.h | 5 +++++
3 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 72f93c6e45c1..a6d761f07f67 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7027,6 +7027,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_SW_OPT_CB, /* Called when skb is about to send
+ * to the nic when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fd4f06b88a2e..73b15d6277f7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5550,6 +5550,9 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
case SCM_TSTAMP_SCHED:
op = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
break;
+ case SCM_TSTAMP_SND:
+ op = BPF_SOCK_OPS_TS_SW_OPT_CB;
+ break;
default:
return;
}
@@ -5624,7 +5627,8 @@ static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype, bool bpf_mo
return true;
return false;
case SCM_TSTAMP_SND:
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP))
+ flag = bpf_mode ? SKBTX_BPF : SKBTX_SW_TSTAMP;
+ if (unlikely(skb_shinfo(skb)->tx_flags & flag))
return true;
return false;
case SCM_TSTAMP_ACK:
@@ -5651,8 +5655,11 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps)
{
- return skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk,
- SCM_TSTAMP_SND);
+ int tstype = SCM_TSTAMP_SND;
+
+ skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk, tstype);
+ if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
+ __skb_tstamp_tx_bpf(orig_skb->sk, orig_skb, tstype);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 891217ab6d2d..73fc0a95c9ca 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7020,6 +7020,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_SW_OPT_CB, /* Called when skb is about to send
+ * to the nic when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
--
2.37.3
^ permalink raw reply related [flat|nested] 48+ messages in thread
* [PATCH net-next v4 06/11] net-timestamp: support SCM_TSTAMP_ACK for bpf extension
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (4 preceding siblings ...)
2024-12-07 17:37 ` [PATCH net-next v4 05/11] net-timestamp: support SCM_TSTAMP_SND " Jason Xing
@ 2024-12-07 17:37 ` Jason Xing
2024-12-12 22:36 ` Martin KaFai Lau
2024-12-07 17:37 ` [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print " Jason Xing
` (4 subsequent siblings)
10 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:37 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Handle the ACK timestamp case. Testing the SKBTX_BPF flag would also
work, but we introduce a new txstamp_ack_bpf bit to avoid cache line
misses in tcp_ack_tstamp().
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/tcp.h | 3 ++-
include/uapi/linux/bpf.h | 5 +++++
net/core/skbuff.c | 9 ++++++---
net/ipv4/tcp_input.c | 3 ++-
net/ipv4/tcp_output.c | 5 +++++
tools/include/uapi/linux/bpf.h | 5 +++++
6 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e9b37b76e894..8e5103d3c6b9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -959,9 +959,10 @@ struct tcp_skb_cb {
__u8 sacked; /* State flags for SACK. */
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
__u8 txstamp_ack:1, /* Record TX timestamp for ack? */
+ txstamp_ack_bpf:1, /* ack timestamp for bpf use */
eor:1, /* Is skb MSG_EOR marked? */
has_rxtstamp:1, /* SKB has a RX timestamp */
- unused:5;
+ unused:4;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a6d761f07f67..a0aff1b4eb61 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7032,6 +7032,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_ACK_OPT_CB, /* Called when all the skbs are
+ * acknowledged when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 73b15d6277f7..48b0c71e9522 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5553,6 +5553,9 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
case SCM_TSTAMP_SND:
op = BPF_SOCK_OPS_TS_SW_OPT_CB;
break;
+ case SCM_TSTAMP_ACK:
+ op = BPF_SOCK_OPS_TS_ACK_OPT_CB;
+ break;
default:
return;
}
@@ -5632,9 +5635,9 @@ static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype, bool bpf_mo
return true;
return false;
case SCM_TSTAMP_ACK:
- if (TCP_SKB_CB(skb)->txstamp_ack)
- return true;
- return false;
+ flag = bpf_mode ? TCP_SKB_CB(skb)->txstamp_ack_bpf :
+ TCP_SKB_CB(skb)->txstamp_ack;
+ return !!flag;
}
return false;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bdf13ac26ef..82bb26f5b214 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3321,7 +3321,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
const struct skb_shared_info *shinfo;
/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
- if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
+ if (likely(!TCP_SKB_CB(skb)->txstamp_ack &&
+ !TCP_SKB_CB(skb)->txstamp_ack_bpf))
return;
shinfo = skb_shinfo(skb);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5485a70b5fe5..c8927143d3e1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1552,6 +1552,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->txstamp_ack ||
+ TCP_SKB_CB(skb)->txstamp_ack_bpf ||
(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
}
@@ -1568,7 +1569,9 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
shinfo2->tx_flags |= tsflags;
swap(shinfo->tskey, shinfo2->tskey);
TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+ TCP_SKB_CB(skb2)->txstamp_ack_bpf = TCP_SKB_CB(skb)->txstamp_ack_bpf;
TCP_SKB_CB(skb)->txstamp_ack = 0;
+ TCP_SKB_CB(skb)->txstamp_ack_bpf = 0;
}
}
@@ -3209,6 +3212,8 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
shinfo->tskey = next_shinfo->tskey;
TCP_SKB_CB(skb)->txstamp_ack |=
TCP_SKB_CB(next_skb)->txstamp_ack;
+ TCP_SKB_CB(skb)->txstamp_ack_bpf |=
+ TCP_SKB_CB(next_skb)->txstamp_ack_bpf;
}
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 73fc0a95c9ca..0fe7d663a244 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7025,6 +7025,11 @@ enum {
* feature is on. It indicates the
* recorded timestamp.
*/
+ BPF_SOCK_OPS_TS_ACK_OPT_CB, /* Called when all the skbs are
+ * acknowledged when SO_TIMESTAMPING
+ * feature is on. It indicates the
+ * recorded timestamp.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
--
2.37.3
^ permalink raw reply related [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 06/11] net-timestamp: support SCM_TSTAMP_ACK for bpf extension
2024-12-07 17:37 ` [PATCH net-next v4 06/11] net-timestamp: support SCM_TSTAMP_ACK " Jason Xing
@ 2024-12-12 22:36 ` Martin KaFai Lau
2024-12-13 14:49 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-12 22:36 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/7/24 9:37 AM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Handle the ACK timestamp case. Actually testing SKBTX_BPF flag
> can work, but we need to Introduce a new txstamp_ack_bpf to avoid
> cache line misses in tcp_ack_tstamp().
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> include/net/tcp.h | 3 ++-
> include/uapi/linux/bpf.h | 5 +++++
> net/core/skbuff.c | 9 ++++++---
> net/ipv4/tcp_input.c | 3 ++-
> net/ipv4/tcp_output.c | 5 +++++
> tools/include/uapi/linux/bpf.h | 5 +++++
> 6 files changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index e9b37b76e894..8e5103d3c6b9 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -959,9 +959,10 @@ struct tcp_skb_cb {
> __u8 sacked; /* State flags for SACK. */
> __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
> __u8 txstamp_ack:1, /* Record TX timestamp for ack? */
> + txstamp_ack_bpf:1, /* ack timestamp for bpf use */
After quickly peeking at patch 8, I realize that the new txstamp_ack_bpf bit is
not needed. The SKBTX_BPF bit (in skb_shinfo(skb)->tx_flags) and txstamp_ack_bpf
are always set together, so using only the SKBTX_BPF bit should be as good.
> eor:1, /* Is skb MSG_EOR marked? */
> has_rxtstamp:1, /* SKB has a RX timestamp */
> - unused:5;
> + unused:4;
> __u32 ack_seq; /* Sequence number ACK'd */
> union {
> struct {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index a6d761f07f67..a0aff1b4eb61 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7032,6 +7032,11 @@ enum {
> * feature is on. It indicates the
> * recorded timestamp.
> */
> + BPF_SOCK_OPS_TS_ACK_OPT_CB, /* Called when all the skbs are
> + * acknowledged when SO_TIMESTAMPING
> + * feature is on. It indicates the
> + * recorded timestamp.
> + */
> };
>
> /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 73b15d6277f7..48b0c71e9522 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5553,6 +5553,9 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
> case SCM_TSTAMP_SND:
> op = BPF_SOCK_OPS_TS_SW_OPT_CB;
> break;
> + case SCM_TSTAMP_ACK:
> + op = BPF_SOCK_OPS_TS_ACK_OPT_CB;
> + break;
> default:
> return;
> }
> @@ -5632,9 +5635,9 @@ static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype, bool bpf_mo
> return true;
> return false;
> case SCM_TSTAMP_ACK:
> - if (TCP_SKB_CB(skb)->txstamp_ack)
> - return true;
> - return false;
> + flag = bpf_mode ? TCP_SKB_CB(skb)->txstamp_ack_bpf :
> + TCP_SKB_CB(skb)->txstamp_ack;
> + return !!flag;
> }
>
> return false;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 5bdf13ac26ef..82bb26f5b214 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3321,7 +3321,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
> const struct skb_shared_info *shinfo;
>
> /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
> - if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
> + if (likely(!TCP_SKB_CB(skb)->txstamp_ack &&
> + !TCP_SKB_CB(skb)->txstamp_ack_bpf))
Change the test here to:
if (likely(!TCP_SKB_CB(skb)->txstamp_ack &&
!(skb_shinfo(skb)->tx_flags & SKBTX_BPF)))
Does it make sense?
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 06/11] net-timestamp: support SCM_TSTAMP_ACK for bpf extension
2024-12-12 22:36 ` Martin KaFai Lau
@ 2024-12-13 14:49 ` Jason Xing
2024-12-13 22:46 ` Martin KaFai Lau
0 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-13 14:49 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Fri, Dec 13, 2024 at 6:36 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/7/24 9:37 AM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Handle the ACK timestamp case. Actually testing SKBTX_BPF flag
> > can work, but we need to Introduce a new txstamp_ack_bpf to avoid
> > cache line misses in tcp_ack_tstamp().
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > include/net/tcp.h | 3 ++-
> > include/uapi/linux/bpf.h | 5 +++++
> > net/core/skbuff.c | 9 ++++++---
> > net/ipv4/tcp_input.c | 3 ++-
> > net/ipv4/tcp_output.c | 5 +++++
> > tools/include/uapi/linux/bpf.h | 5 +++++
> > 6 files changed, 25 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index e9b37b76e894..8e5103d3c6b9 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -959,9 +959,10 @@ struct tcp_skb_cb {
> > __u8 sacked; /* State flags for SACK. */
> > __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
> > __u8 txstamp_ack:1, /* Record TX timestamp for ack? */
> > + txstamp_ack_bpf:1, /* ack timestamp for bpf use */
>
> After quickly peeking at patch 8, I realize that the new txstamp_ack_bpf bit is
> not needed. SKBTX_BPF bit (in skb_shinfo(skb)->tx_flags) and the txstamp_ack_bpf
> are always set together. Then only use the SKBTX_BPF bit should be as good.
Please see the comments below :)
>
> > eor:1, /* Is skb MSG_EOR marked? */
> > has_rxtstamp:1, /* SKB has a RX timestamp */
> > - unused:5;
> > + unused:4;
> > __u32 ack_seq; /* Sequence number ACK'd */
> > union {
> > struct {
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index a6d761f07f67..a0aff1b4eb61 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -7032,6 +7032,11 @@ enum {
> > * feature is on. It indicates the
> > * recorded timestamp.
> > */
> > + BPF_SOCK_OPS_TS_ACK_OPT_CB, /* Called when all the skbs are
> > + * acknowledged when SO_TIMESTAMPING
> > + * feature is on. It indicates the
> > + * recorded timestamp.
> > + */
> > };
> >
> > /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 73b15d6277f7..48b0c71e9522 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5553,6 +5553,9 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
> > case SCM_TSTAMP_SND:
> > op = BPF_SOCK_OPS_TS_SW_OPT_CB;
> > break;
> > + case SCM_TSTAMP_ACK:
> > + op = BPF_SOCK_OPS_TS_ACK_OPT_CB;
> > + break;
> > default:
> > return;
> > }
> > @@ -5632,9 +5635,9 @@ static bool skb_tstamp_is_set(const struct sk_buff *skb, int tstype, bool bpf_mo
> > return true;
> > return false;
> > case SCM_TSTAMP_ACK:
> > - if (TCP_SKB_CB(skb)->txstamp_ack)
> > - return true;
> > - return false;
> > + flag = bpf_mode ? TCP_SKB_CB(skb)->txstamp_ack_bpf :
> > + TCP_SKB_CB(skb)->txstamp_ack;
> > + return !!flag;
> > }
> >
> > return false;
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 5bdf13ac26ef..82bb26f5b214 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -3321,7 +3321,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
> > const struct skb_shared_info *shinfo;
> >
> > /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
Please take a look at the above comment.
> > - if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
> > + if (likely(!TCP_SKB_CB(skb)->txstamp_ack &&
> > + !TCP_SKB_CB(skb)->txstamp_ack_bpf))
>
> Change the test here to:
> if (likely(!TCP_SKB_CB(skb)->txstamp_ack &&
> !(skb_shinfo(skb)->tx_flags & SKBTX_BPF)))
>
> Does it make sense?
It surely works. Result-wise, testing only SKBTX_BPF can cover all the
cases. However, in the ACK case, the above code snippet would access
shinfo->tx_flags, which triggers cache line misses. I also mentioned
this on purpose in patch [06/11].
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 06/11] net-timestamp: support SCM_TSTAMP_ACK for bpf extension
2024-12-13 14:49 ` Jason Xing
@ 2024-12-13 22:46 ` Martin KaFai Lau
0 siblings, 0 replies; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-13 22:46 UTC (permalink / raw)
To: Jason Xing, willemdebruijn.kernel
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, eddyz87, song, yonghong.song, john.fastabend, kpsingh,
sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On 12/13/24 6:49 AM, Jason Xing wrote:
>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>> index 5bdf13ac26ef..82bb26f5b214 100644
>>> --- a/net/ipv4/tcp_input.c
>>> +++ b/net/ipv4/tcp_input.c
>>> @@ -3321,7 +3321,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
>>> const struct skb_shared_info *shinfo;
>>>
>>> /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
> Please take a look at the above comment.
>
>>> - if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
>>> + if (likely(!TCP_SKB_CB(skb)->txstamp_ack &&
>>> + !TCP_SKB_CB(skb)->txstamp_ack_bpf))
>> Change the test here to:
>> if (likely(!TCP_SKB_CB(skb)->txstamp_ack &&
>> !(skb_shinfo(skb)->tx_flags & SKBTX_BPF)))
>>
>> Does it make sense?
> It surely works. Talking about the result only, introducing SKBTX_BPF
> can work for all the cases. However, in the ACK case, the above code
> snippet will access the shinfo->tx_flags, which triggers cache line
> misses. I also mentioned this on purpose in the patch [06/11].
ah, my bad. I somehow totally skipped the comment and message in this patch when
jumping between patch 6 and 7.
Not an expert, so I am curious whether testing skb_shinfo(skb)->tx_flags here
actually matters. e.g. in tcp_v4_fill_cb() in the earlier rx path,
skb_hwtstamps(skb)->hwtstamp is also read. Willem?
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print for bpf extension
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (5 preceding siblings ...)
2024-12-07 17:37 ` [PATCH net-next v4 06/11] net-timestamp: support SCM_TSTAMP_ACK " Jason Xing
@ 2024-12-07 17:37 ` Jason Xing
2024-12-12 23:25 ` Martin KaFai Lau
2024-12-07 17:38 ` [PATCH net-next v4 08/11] net-timestamp: make TCP tx timestamp bpf extension work Jason Xing
` (3 subsequent siblings)
10 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:37 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Pass the hwtstamp to the bpf prog so that it can be printed.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/sock.h | 6 ++++--
net/core/skbuff.c | 17 +++++++++++++----
net/core/sock.c | 4 +++-
3 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index f88a00108a2f..9bc883573208 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2921,9 +2921,11 @@ int sock_set_timestamping(struct sock *sk, int optname,
void sock_enable_timestamps(struct sock *sk);
#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
-void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
+ u32 nargs, u32 *args);
#else
-static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
+ u32 nargs, u32 *args)
{
}
#endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 48b0c71e9522..182a44815630 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5539,8 +5539,12 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
-static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype)
+static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ int tstype)
{
+ struct timespec64 tstamp;
+ u32 args[2] = {0, 0};
int op;
if (!sk)
@@ -5552,6 +5556,11 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
break;
case SCM_TSTAMP_SND:
op = BPF_SOCK_OPS_TS_SW_OPT_CB;
+ if (hwtstamps) {
+ tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
+ args[0] = tstamp.tv_sec;
+ args[1] = tstamp.tv_nsec;
+ }
break;
case SCM_TSTAMP_ACK:
op = BPF_SOCK_OPS_TS_ACK_OPT_CB;
@@ -5560,7 +5569,7 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
return;
}
- bpf_skops_tx_timestamping(sk, skb, op);
+ bpf_skops_tx_timestamping(sk, skb, op, 2, args);
}
static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
@@ -5651,7 +5660,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (unlikely(skb_tstamp_is_set(orig_skb, tstype, false)))
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
- __skb_tstamp_tx_bpf(sk, orig_skb, tstype);
+ __skb_tstamp_tx_bpf(sk, orig_skb, hwtstamps, tstype);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
@@ -5662,7 +5671,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk, tstype);
if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
- __skb_tstamp_tx_bpf(orig_skb->sk, orig_skb, tstype);
+ __skb_tstamp_tx_bpf(orig_skb->sk, orig_skb, hwtstamps, tstype);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
diff --git a/net/core/sock.c b/net/core/sock.c
index 79cb5c74c76c..504939bafe0c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -942,7 +942,8 @@ int sock_set_timestamping(struct sock *sk, int optname,
}
#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
-void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
+ u32 nargs, u32 *args)
{
struct bpf_sock_ops_kern sock_ops;
@@ -952,6 +953,7 @@ void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
sock_ops.op = op;
sock_ops.is_fullsock = 1;
sock_ops.sk = sk;
+ memcpy(sock_ops.args, args, nargs * sizeof(*args));
__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
}
#endif
--
2.37.3
^ permalink raw reply related [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print for bpf extension
2024-12-07 17:37 ` [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print " Jason Xing
@ 2024-12-12 23:25 ` Martin KaFai Lau
2024-12-13 6:28 ` Martin KaFai Lau
2024-12-13 15:13 ` Jason Xing
0 siblings, 2 replies; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-12 23:25 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/7/24 9:37 AM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Passing the hwtstamp to bpf prog which can print.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> include/net/sock.h | 6 ++++--
> net/core/skbuff.c | 17 +++++++++++++----
> net/core/sock.c | 4 +++-
> 3 files changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index f88a00108a2f..9bc883573208 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2921,9 +2921,11 @@ int sock_set_timestamping(struct sock *sk, int optname,
>
> void sock_enable_timestamps(struct sock *sk);
> #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> -void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
> + u32 nargs, u32 *args);
> #else
> -static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
> + u32 nargs, u32 *args)
> {
> }
> #endif
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 48b0c71e9522..182a44815630 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5539,8 +5539,12 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> }
> EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
>
> -static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype)
> +static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> + struct skb_shared_hwtstamps *hwtstamps,
> + int tstype)
> {
> + struct timespec64 tstamp;
> + u32 args[2] = {0, 0};
> int op;
>
> if (!sk)
> @@ -5552,6 +5556,11 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
> break;
> case SCM_TSTAMP_SND:
> op = BPF_SOCK_OPS_TS_SW_OPT_CB;
> + if (hwtstamps) {
> + tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
Avoid this conversion, which is likely not useful to the bpf prog. Directly pass
hwtstamps->hwtstamp (in ns?) to the bpf prog: put the lower 32 bits in args[0]
and the higher 32 bits in args[1].
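An untested sketch of how that could look on both sides (the args layout
here is only illustrative):

	/* kernel side, in __skb_tstamp_tx_bpf(): pass the raw ns value */
	u64 ns = ktime_to_ns(hwtstamps->hwtstamp);

	args[0] = lower_32_bits(ns);
	args[1] = upper_32_bits(ns);

	/* bpf prog side: reassemble from the struct bpf_sock_ops args[] */
	u64 hwts_ns = ((u64)skops->args[1] << 32) | skops->args[0];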
Also, how about adding a BPF_SOCK_OPS_TS_"HW"_OPT_CB for the "hwtstamps != NULL"
case instead of reusing the BPF_SOCK_OPS_TS_"SW"_OPT_CB?
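i.e. something along these lines in the uapi enum (untested sketch, name and
comment are only illustrative):

	BPF_SOCK_OPS_TS_HW_OPT_CB,	/* Called when the driver reports a
					 * hardware timestamp and the bpf
					 * timestamping flag is on.
					 */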
A more subtle thing for the hwtstamps case is, afaik the bpf prog will not be
called. All drivers are still only testing SKBTX_HW_TSTAMP instead of testing
(SKBTX_HW_TSTAMP | SKBTX_BPF).
There are a lot of drivers to change though. A quick thought is to rename the
existing SKBTX_HW_TSTAMP (e.g. __SKBTX_HW_TSTAMP = 1 << 0) and define
SKBTX_HW_TSTAMP like:
#define SKBTX_HW_TSTAMP (__SKBTX_HW_TSTAMP | SKBTX_BPF)
Then change some of the existing skb_shinfo(skb)->tx_flags "setting" sites to use
__SKBTX_HW_TSTAMP instead, e.g. in __sock_tx_timestamp(). Not very pretty but
maybe still better than changing many drivers. Maybe there is a better way...
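Roughly like this (untested, only to illustrate the idea; the other bits stay
as they are today):

	enum {
		/* generate hardware time stamp */
		__SKBTX_HW_TSTAMP = 1 << 0,
		/* ... existing bits unchanged ... */
		/* used for bpf extension when a bpf program is loaded */
		SKBTX_BPF = 1 << 7,
	};

	/* drivers keep testing SKBTX_HW_TSTAMP and now also match SKBTX_BPF,
	 * while the "setting" sites switch to __SKBTX_HW_TSTAMP
	 */
	#define SKBTX_HW_TSTAMP	(__SKBTX_HW_TSTAMP | SKBTX_BPF)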
While talking about where to test the SKBTX_BPF bit, I wonder if the new
skb_tstamp_is_set() is needed. For the non SKBTX_HW_TSTAMP case, the number of
tx_flags testing sites should be limited, so it should be easy to add the SKBTX_BPF
bit test, e.g. at __dev_queue_xmit, test "if
(unlikely(skb_shinfo(skb)->tx_flags & (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))". Patch
6 has also tested the bpf specific bit at tcp_ack_tstamp() before calling the
__skb_tstamp_tx().
At the beginning of __skb_tstamp_tx(), do something like this:
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
unlikely(skb_shinfo(skb)->tx_flags & SKBTX_BPF))
__skb_tstamp_tx_bpf(sk, orig_skb, hwtstamps, tstype);
if (unlikely(!(skb_shinfo(skb)->tx_flags & ~SKBTX_BPF)))
return;
Then the new skb_tstamp_tx_output() shuffle is probably not needed either.
> + args[0] = tstamp.tv_sec;
> + args[1] = tstamp.tv_nsec;
> + }
> break;
> case SCM_TSTAMP_ACK:
> op = BPF_SOCK_OPS_TS_ACK_OPT_CB;
> @@ -5560,7 +5569,7 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
> return;
> }
>
> - bpf_skops_tx_timestamping(sk, skb, op);
> + bpf_skops_tx_timestamping(sk, skb, op, 2, args);
> }
>
> static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> @@ -5651,7 +5660,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
> if (unlikely(skb_tstamp_is_set(orig_skb, tstype, false)))
> skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
> if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
> - __skb_tstamp_tx_bpf(sk, orig_skb, tstype);
> + __skb_tstamp_tx_bpf(sk, orig_skb, hwtstamps, tstype);
> }
> EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
>
> @@ -5662,7 +5671,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>
> skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk, tstype);
> if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
> - __skb_tstamp_tx_bpf(orig_skb->sk, orig_skb, tstype);
> + __skb_tstamp_tx_bpf(orig_skb->sk, orig_skb, hwtstamps, tstype);
> }
> EXPORT_SYMBOL_GPL(skb_tstamp_tx);
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 79cb5c74c76c..504939bafe0c 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -942,7 +942,8 @@ int sock_set_timestamping(struct sock *sk, int optname,
> }
>
> #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> -void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
> + u32 nargs, u32 *args)
Directly pass hwtstamps->hwtstamp instead of args and nargs. Save a memcpy below.
> {
> struct bpf_sock_ops_kern sock_ops;
>
> @@ -952,6 +953,7 @@ void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> sock_ops.op = op;
> sock_ops.is_fullsock = 1;
> sock_ops.sk = sk;
> + memcpy(sock_ops.args, args, nargs * sizeof(*args));
> __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
> }
> #endif
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print for bpf extension
2024-12-12 23:25 ` Martin KaFai Lau
@ 2024-12-13 6:28 ` Martin KaFai Lau
2024-12-13 15:13 ` Jason Xing
1 sibling, 0 replies; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-13 6:28 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/12/24 3:25 PM, Martin KaFai Lau wrote:
> A more subtle thing for the hwtstamps case is, afaik the bpf prog will not be
> called. All drivers are still only testing SKBTX_HW_TSTAMP instead of testing
> (SKBTX_HW_TSTAMP | SKBTX_BPF).
>
> There are a lot of drivers to change though. A quick thought is to rename the
> existing SKBTX_HW_TSTAMP (e.g. __SKBTX_HW_TSTAMP = 1 << 0) and define
> SKBTX_HW_TSTAMP like:
>
> #define SKBTX_HW_TSTAMP (__SKBTX_HW_TSTAMP | SKBTX_BPF)
>
> Then change some of the existing skb_shinfo(skb)->tx_flags "setting" site to use
> __SKBTX_HW_TSTAMP instead. e.g. in __sock_tx_timestamp(). Not very pretty but
> may be still better than changing many drivers. May be there is a better way...
>
> While talking about where to test the SKBTX_BPF bit, I wonder if the new
> skb_tstamp_is_set() is needed. For the non SKBTX_HW_TSTAMP case, the number of
> tx_flags testing sites should be limited, so should be easy to add the SKBTX_BPF
> bit test. e.g. at the __dev_queue_xmit, test "if (unlikely(skb_shinfo(skb)-
> >tx_flags & (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))". Patch 6 has also tested the
> bpf specific bit at tcp_ack_tstamp() before calling the __skb_tstamp_tx().
>
> At the beginning of __skb_tstamp_tx(), do something like this:
>
> void __skb_tstamp_tx(struct sk_buff *orig_skb,
> const struct sk_buff *ack_skb,
> struct skb_shared_hwtstamps *hwtstamps,
> struct sock *sk, int tstype)
> {
> if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
> unlikely(skb_shinfo(skb)->tx_flags & SKBTX_BPF))
> __skb_tstamp_tx_bpf(sk, orig_skb, hwtstamps, tstype);
>
> if (unlikely(!(skb_shinfo(skb)->tx_flags & ~SKBTX_BPF)))
This is not enough. I was wrong here. The test in skb_tstamp_is_set() is needed
when SKBTX_BPF is not set.
> return;
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print for bpf extension
2024-12-12 23:25 ` Martin KaFai Lau
2024-12-13 6:28 ` Martin KaFai Lau
@ 2024-12-13 15:13 ` Jason Xing
2024-12-13 23:15 ` Martin KaFai Lau
1 sibling, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-13 15:13 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Fri, Dec 13, 2024 at 7:26 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/7/24 9:37 AM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Passing the hwtstamp to bpf prog which can print.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > include/net/sock.h | 6 ++++--
> > net/core/skbuff.c | 17 +++++++++++++----
> > net/core/sock.c | 4 +++-
> > 3 files changed, 20 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index f88a00108a2f..9bc883573208 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -2921,9 +2921,11 @@ int sock_set_timestamping(struct sock *sk, int optname,
> >
> > void sock_enable_timestamps(struct sock *sk);
> > #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> > -void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
> > +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
> > + u32 nargs, u32 *args);
> > #else
> > -static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> > +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
> > + u32 nargs, u32 *args)
> > {
> > }
> > #endif
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 48b0c71e9522..182a44815630 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5539,8 +5539,12 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> > }
> > EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> >
> > -static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype)
> > +static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> > + struct skb_shared_hwtstamps *hwtstamps,
> > + int tstype)
> > {
> > + struct timespec64 tstamp;
> > + u32 args[2] = {0, 0};
> > int op;
> >
> > if (!sk)
> > @@ -5552,6 +5556,11 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
> > break;
> > case SCM_TSTAMP_SND:
> > op = BPF_SOCK_OPS_TS_SW_OPT_CB;
>
> > + if (hwtstamps) {
> > + tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
>
> Avoid this conversion which is likely not useful to the bpf prog. Directly pass
> hwtstamps->hwtstamp (in ns?) to the bpf prog. Put lower 32bits in args[0] and
> higher 32bits in args[1].
It makes sense.
>
> Also, how about adding a BPF_SOCK_OPS_TS_"HW"_OPT_CB for the "hwtstamps != NULL"
> case instead of reusing the BPF_SOCK_OPS_TS_"SW"_OPT_CB?
Good suggestion. Will do that.
>
> A more subtle thing for the hwtstamps case is, afaik the bpf prog will not be
> called. All drivers are still only testing SKBTX_HW_TSTAMP instead of testing
> (SKBTX_HW_TSTAMP | SKBTX_BPF).
Ah, I didn't realize that!!
>
> There are a lot of drivers to change though. A quick thought is to rename the
> existing SKBTX_HW_TSTAMP (e.g. __SKBTX_HW_TSTAMP = 1 << 0) and define
> SKBTX_HW_TSTAMP like:
>
> #define SKBTX_HW_TSTAMP (__SKBTX_HW_TSTAMP | SKBTX_BPF)
I will take it, thanks!
>
> Then change some of the existing skb_shinfo(skb)->tx_flags "setting" site to use
> __SKBTX_HW_TSTAMP instead. e.g. in __sock_tx_timestamp(). Not very pretty but
> may be still better than changing many drivers. May be there is a better way...
I agree with your approach. Changing so many drivers would be a headache.
>
> While talking about where to test the SKBTX_BPF bit, I wonder if the new
> skb_tstamp_is_set() is needed. For the non SKBTX_HW_TSTAMP case, the number of
> tx_flags testing sites should be limited, so should be easy to add the SKBTX_BPF
> bit test. e.g. at the __dev_queue_xmit, test "if
> (unlikely(skb_shinfo(skb)->tx_flags & (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))". Patch
> 6 has also tested the bpf specific bit at tcp_ack_tstamp() before calling the
> __skb_tstamp_tx().
>
> At the beginning of __skb_tstamp_tx(), do something like this:
>
> void __skb_tstamp_tx(struct sk_buff *orig_skb,
> const struct sk_buff *ack_skb,
> struct skb_shared_hwtstamps *hwtstamps,
> struct sock *sk, int tstype)
> {
> if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
> unlikely(skb_shinfo(skb)->tx_flags & SKBTX_BPF))
> __skb_tstamp_tx_bpf(sk, orig_skb, hwtstamps, tstype);
>
> if (unlikely(!(skb_shinfo(skb)->tx_flags & ~SKBTX_BPF)))
> return;
>
> Then the new skb_tstamp_tx_output() shuffle is probably not be needed also.
>
> > + args[0] = tstamp.tv_sec;
> > + args[1] = tstamp.tv_nsec;
> > + }
> > break;
> > case SCM_TSTAMP_ACK:
> > op = BPF_SOCK_OPS_TS_ACK_OPT_CB;
> > @@ -5560,7 +5569,7 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
> > return;
> > }
> >
> > - bpf_skops_tx_timestamping(sk, skb, op);
> > + bpf_skops_tx_timestamping(sk, skb, op, 2, args);
> > }
> >
> > static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > @@ -5651,7 +5660,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
> > if (unlikely(skb_tstamp_is_set(orig_skb, tstype, false)))
> > skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
> > if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
> > - __skb_tstamp_tx_bpf(sk, orig_skb, tstype);
> > + __skb_tstamp_tx_bpf(sk, orig_skb, hwtstamps, tstype);
> > }
> > EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
> >
> > @@ -5662,7 +5671,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
> >
> > skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk, tstype);
> > if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
> > - __skb_tstamp_tx_bpf(orig_skb->sk, orig_skb, tstype);
> > + __skb_tstamp_tx_bpf(orig_skb->sk, orig_skb, hwtstamps, tstype);
> > }
> > EXPORT_SYMBOL_GPL(skb_tstamp_tx);
> >
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 79cb5c74c76c..504939bafe0c 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -942,7 +942,8 @@ int sock_set_timestamping(struct sock *sk, int optname,
> > }
> >
> > #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_SYSCALL)
> > -void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> > +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op,
> > + u32 nargs, u32 *args)
>
> Directly pass hwtstamps->hwtstamp instead of args and nargs. Save a memcpy below.
>
> > {
> > struct bpf_sock_ops_kern sock_ops;
> >
> > @@ -952,6 +953,7 @@ void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
> > sock_ops.op = op;
> > sock_ops.is_fullsock = 1;
> > sock_ops.sk = sk;
> > + memcpy(sock_ops.args, args, nargs * sizeof(*args));
> > __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
> > }
> > #endif
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print for bpf extension
2024-12-13 15:13 ` Jason Xing
@ 2024-12-13 23:15 ` Martin KaFai Lau
2024-12-14 0:02 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-13 23:15 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/13/24 7:13 AM, Jason Xing wrote:
>>> -static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype)
>>> +static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
>>> + struct skb_shared_hwtstamps *hwtstamps,
>>> + int tstype)
>>> {
>>> + struct timespec64 tstamp;
>>> + u32 args[2] = {0, 0};
>>> int op;
>>>
>>> if (!sk)
>>> @@ -5552,6 +5556,11 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
>>> break;
>>> case SCM_TSTAMP_SND:
>>> op = BPF_SOCK_OPS_TS_SW_OPT_CB;
>>> + if (hwtstamps) {
>>> + tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
>> Avoid this conversion which is likely not useful to the bpf prog. Directly pass
>> hwtstamps->hwtstamp (in ns?) to the bpf prog. Put lower 32bits in args[0] and
>> higher 32bits in args[1].
> It makes sense.
When replying to the patch 2 thread, I noticed we may not even have to pass the
hwtstamps in args here.
Can "*skb_hwtstamps(skb) = *hwtstamps;" be done before calling the bpf prog?
Then the bpf prog can directly get it from skb_shinfo(skb)->hwtstamps.
It is like reading other fields in skb_shinfo(skb), e.g. the
skb_shinfo(skb)->tskey discussed in patch 10. The bpf prog will have a more
consistent experience in reading different fields of the skb_shinfo(skb).
skb_shinfo(skb)->hwtstamps is a more intuitive place to obtain the hwtstamp than
the broken up args[0] and args[1]. On top of that, there is also an older
"skb_hwtstamp" field in "struct bpf_sock_ops".
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print for bpf extension
2024-12-13 23:15 ` Martin KaFai Lau
@ 2024-12-14 0:02 ` Jason Xing
2024-12-14 0:17 ` Martin KaFai Lau
0 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-14 0:02 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Sat, Dec 14, 2024 at 7:15 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/13/24 7:13 AM, Jason Xing wrote:
> >>> -static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype)
> >>> +static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> >>> + struct skb_shared_hwtstamps *hwtstamps,
> >>> + int tstype)
> >>> {
> >>> + struct timespec64 tstamp;
> >>> + u32 args[2] = {0, 0};
> >>> int op;
> >>>
> >>> if (!sk)
> >>> @@ -5552,6 +5556,11 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
> >>> break;
> >>> case SCM_TSTAMP_SND:
> >>> op = BPF_SOCK_OPS_TS_SW_OPT_CB;
> >>> + if (hwtstamps) {
> >>> + tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
> >> Avoid this conversion which is likely not useful to the bpf prog. Directly pass
> >> hwtstamps->hwtstamp (in ns?) to the bpf prog. Put lower 32bits in args[0] and
> >> higher 32bits in args[1].
> > It makes sense.
>
> When replying the patch 2 thread, I noticed it may not even have to pass the
> hwtstamps in args here.
>
> Can "*skb_hwtstamps(skb) = *hwtstamps;" be done before calling the bpf prog?
> Then the bpf prog can directly get it from skb_shinfo(skb)->hwtstamps.
> It is like reading other fields in skb_shinfo(skb), e.g. the
> skb_shinfo(skb)->tskey discussed in patch 10. The bpf prog will have a more
> consistent experience in reading different fields of the skb_shinfo(skb).
> skb_shinfo(skb)->hwtstamps is a more intuitive place to obtain the hwtstamp than
> the broken up args[0] and args[1]. On top of that, there is also an older
> "skb_hwtstamp" field in "struct bpf_sock_ops".
Right, right, last night, fortunately, I also spotted it. Let the bpf
prog parse the shared info from skb then. A new callback for hwtstamp
is needed, I suppose.
^ permalink raw reply	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print for bpf extension
2024-12-14 0:02 ` Jason Xing
@ 2024-12-14 0:17 ` Martin KaFai Lau
2024-12-14 0:48 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-14 0:17 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/13/24 4:02 PM, Jason Xing wrote:
> On Sat, Dec 14, 2024 at 7:15 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 12/13/24 7:13 AM, Jason Xing wrote:
>>>>> -static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype)
>>>>> +static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
>>>>> + struct skb_shared_hwtstamps *hwtstamps,
>>>>> + int tstype)
>>>>> {
>>>>> + struct timespec64 tstamp;
>>>>> + u32 args[2] = {0, 0};
>>>>> int op;
>>>>>
>>>>> if (!sk)
>>>>> @@ -5552,6 +5556,11 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
>>>>> break;
>>>>> case SCM_TSTAMP_SND:
>>>>> op = BPF_SOCK_OPS_TS_SW_OPT_CB;
>>>>> + if (hwtstamps) {
>>>>> + tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
>>>> Avoid this conversion which is likely not useful to the bpf prog. Directly pass
>>>> hwtstamps->hwtstamp (in ns?) to the bpf prog. Put lower 32bits in args[0] and
>>>> higher 32bits in args[1].
>>> It makes sense.
>>
>> When replying the patch 2 thread, I noticed it may not even have to pass the
>> hwtstamps in args here.
>>
>> Can "*skb_hwtstamps(skb) = *hwtstamps;" be done before calling the bpf prog?
>> Then the bpf prog can directly get it from skb_shinfo(skb)->hwtstamps.
>> It is like reading other fields in skb_shinfo(skb), e.g. the
>> skb_shinfo(skb)->tskey discussed in patch 10. The bpf prog will have a more
>> consistent experience in reading different fields of the skb_shinfo(skb).
>> skb_shinfo(skb)->hwtstamps is a more intuitive place to obtain the hwtstamp than
>> the broken up args[0] and args[1]. On top of that, there is also an older
>> "skb_hwtstamp" field in "struct bpf_sock_ops".
>
> Right, right, last night, fortunately, I also spotted it. Let the bpf
> prog parse the shared info from skb then. A new callback for hwtstamp
> is needed, I suppose.
Why is a new callback needed? Can "*skb_hwtstamps(skb) = *hwtstamps;" not be done
in __skb_tstamp_tx_bpf?
^ permalink raw reply	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print for bpf extension
2024-12-14 0:17 ` Martin KaFai Lau
@ 2024-12-14 0:48 ` Jason Xing
0 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-14 0:48 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Sat, Dec 14, 2024 at 8:17 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/13/24 4:02 PM, Jason Xing wrote:
> > On Sat, Dec 14, 2024 at 7:15 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 12/13/24 7:13 AM, Jason Xing wrote:
> >>>>> -static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype)
> >>>>> +static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> >>>>> + struct skb_shared_hwtstamps *hwtstamps,
> >>>>> + int tstype)
> >>>>> {
> >>>>> + struct timespec64 tstamp;
> >>>>> + u32 args[2] = {0, 0};
> >>>>> int op;
> >>>>>
> >>>>> if (!sk)
> >>>>> @@ -5552,6 +5556,11 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb, int tstype
> >>>>> break;
> >>>>> case SCM_TSTAMP_SND:
> >>>>> op = BPF_SOCK_OPS_TS_SW_OPT_CB;
> >>>>> + if (hwtstamps) {
> >>>>> + tstamp = ktime_to_timespec64(hwtstamps->hwtstamp);
> >>>> Avoid this conversion which is likely not useful to the bpf prog. Directly pass
> >>>> hwtstamps->hwtstamp (in ns?) to the bpf prog. Put lower 32bits in args[0] and
> >>>> higher 32bits in args[1].
> >>> It makes sense.
> >>
> >> When replying the patch 2 thread, I noticed it may not even have to pass the
> >> hwtstamps in args here.
> >>
> >> Can "*skb_hwtstamps(skb) = *hwtstamps;" be done before calling the bpf prog?
> >> Then the bpf prog can directly get it from skb_shinfo(skb)->hwtstamps.
> >> It is like reading other fields in skb_shinfo(skb), e.g. the
> >> skb_shinfo(skb)->tskey discussed in patch 10. The bpf prog will have a more
> >> consistent experience in reading different fields of the skb_shinfo(skb).
> >> skb_shinfo(skb)->hwtstamps is a more intuitive place to obtain the hwtstamp than
> >> the broken up args[0] and args[1]. On top of that, there is also an older
> >> "skb_hwtstamp" field in "struct bpf_sock_ops".
> >
> > Right, right, last night, fortunately, I also spotted it. Let the bpf
> > prog parse the shared info from skb then. A new callback for hwtstamp
> > is needed, I suppose.
>
> Why a new callback is needed? "*skb_hwtstamps(skb) = *hwtstamps;" cannot be done
> in __skb_tstamp_tx_bpf?
Oh, I have no preference on this point. I will drop the new callback then.
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH net-next v4 08/11] net-timestamp: make TCP tx timestamp bpf extension work
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (6 preceding siblings ...)
2024-12-07 17:37 ` [PATCH net-next v4 07/11] net-timestamp: support hwtstamp print " Jason Xing
@ 2024-12-07 17:38 ` Jason Xing
2024-12-07 17:38 ` [PATCH net-next v4 09/11] net-timestamp: introduce cgroup lock to avoid affecting non-bpf cases Jason Xing
` (2 subsequent siblings)
10 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:38 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Finally make the whole small feature work. After this, users can
fully use the bpf prog to trace the tx path for TCP sockets.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
net/ipv4/tcp.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d704bda6c41..0a41006b10d1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -492,6 +492,15 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}
+
+ if (SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ tcb->txstamp_ack_bpf = 1;
+ shinfo->tx_flags |= SKBTX_BPF;
+ shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+ }
}
static bool tcp_stream_is_readable(struct sock *sk, int target)
--
2.37.3
^ permalink raw reply related	[flat|nested] 48+ messages in thread
* [PATCH net-next v4 09/11] net-timestamp: introduce cgroup lock to avoid affecting non-bpf cases
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (7 preceding siblings ...)
2024-12-07 17:38 ` [PATCH net-next v4 08/11] net-timestamp: make TCP tx timestamp bpf extension work Jason Xing
@ 2024-12-07 17:38 ` Jason Xing
2024-12-07 17:38 ` [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension Jason Xing
2024-12-07 17:38 ` [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature Jason Xing
10 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:38 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Introduce the cgroup_bpf_enabled(CGROUP_SOCK_OPS) check to avoid affecting
applications which are not using the bpf timestamping feature.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
net/core/skbuff.c | 6 ++++--
net/ipv4/tcp.c | 3 ++-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 182a44815630..7c59ef501c74 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5659,7 +5659,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
{
if (unlikely(skb_tstamp_is_set(orig_skb, tstype, false)))
skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
- if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
+ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+ unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
__skb_tstamp_tx_bpf(sk, orig_skb, hwtstamps, tstype);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
@@ -5670,7 +5671,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
int tstype = SCM_TSTAMP_SND;
skb_tstamp_tx_output(orig_skb, NULL, hwtstamps, orig_skb->sk, tstype);
- if (unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
+ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+ unlikely(skb_tstamp_is_set(orig_skb, tstype, true)))
__skb_tstamp_tx_bpf(orig_skb->sk, orig_skb, hwtstamps, tstype);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0a41006b10d1..3df802410ebf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -493,7 +493,8 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}
- if (SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb) {
+ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+ SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
--
2.37.3
^ permalink raw reply related	[flat|nested] 48+ messages in thread
* [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (8 preceding siblings ...)
2024-12-07 17:38 ` [PATCH net-next v4 09/11] net-timestamp: introduce cgroup lock to avoid affecting non-bpf cases Jason Xing
@ 2024-12-07 17:38 ` Jason Xing
2024-12-13 0:28 ` Martin KaFai Lau
2024-12-07 17:38 ` [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature Jason Xing
10 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:38 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
For now, there are phases where we are not able to fetch the right
seqno from skops->skb_data, because:
1) in __dev_queue_xmit(), skb->data doesn't point to the start of the
tcp header.
2) in tcp_ack_tstamp(), the skb doesn't carry the tcp header at all.
In the long run, we may add other trace points for the bpf extension.
The shinfo->tskey is always the same value for both the bpf and
non-bpf cases, so let's directly use shinfo->tskey for the TCP
protocol.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
net/core/skbuff.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7c59ef501c74..2e13643f791c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5544,7 +5544,7 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
int tstype)
{
struct timespec64 tstamp;
- u32 args[2] = {0, 0};
+ u32 args[3] = {0, 0, 0};
int op;
if (!sk)
@@ -5569,7 +5569,10 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
return;
}
- bpf_skops_tx_timestamping(sk, skb, op, 2, args);
+ if (sk_is_tcp(sk))
+ args[2] = skb_shinfo(skb)->tskey;
+
+ bpf_skops_tx_timestamping(sk, skb, op, 3, args);
}
static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
--
2.37.3
^ permalink raw reply related	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2024-12-07 17:38 ` [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension Jason Xing
@ 2024-12-13 0:28 ` Martin KaFai Lau
2024-12-13 15:44 ` Jason Xing
2025-01-08 4:21 ` Jason Xing
0 siblings, 2 replies; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-13 0:28 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/7/24 9:38 AM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> For now, there are three phases where we are not able to fetch
> the right seqno from the skops->skb_data, because:
> 1) in __dev_queue_xmit(), the skb->data doesn't point to the start
> offset in tcp header.
> 2) in tcp_ack_tstamp(), the skb doesn't have the tcp header.
>
> In the long run, we may add other trace points for bpf extension.
> And the shinfo->tskey is always the same value for both bpf and
> non-bpf cases. With that said, let's directly use shinfo->tskey
> for TCP protocol.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> net/core/skbuff.c | 7 +++++--
> 1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 7c59ef501c74..2e13643f791c 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5544,7 +5544,7 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> int tstype)
> {
> struct timespec64 tstamp;
> - u32 args[2] = {0, 0};
> + u32 args[3] = {0, 0, 0};
> int op;
>
> if (!sk)
> @@ -5569,7 +5569,10 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> return;
> }
>
> - bpf_skops_tx_timestamping(sk, skb, op, 2, args);
> + if (sk_is_tcp(sk))
> + args[2] = skb_shinfo(skb)->tskey;
Instead of only passing one info "skb_shinfo(skb)->tskey" of a skb, pass the
whole skb ptr to the bpf prog. Take a look at bpf_skops_init_skb. Lets start
with end_offset = 0 for now so that the bpf prog won't use it to read the
skb->data. It can be revisited later.
bpf_skops_init_skb(&sock_ops, skb, 0);
The bpf prog can use bpf_cast_to_kern_ctx() and bpf_core_cast() to get to the
skb_shinfo(skb). Take a look at the md_skb example in type_cast.c.
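To make this concrete, the kernel side only needs something like the following
(a sketch, assuming the series' bpf_skops_tx_timestamping() is reworked to
drop the args and take sk/skb/op only):

static void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb,
				      int op)
{
	struct bpf_sock_ops_kern sock_ops = {};

	sock_ops.op = op;
	sock_ops.sk = sk;
	/* end_offset = 0 for now: the prog cannot read skb->data yet */
	bpf_skops_init_skb(&sock_ops, skb, 0);
	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
}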
Then it needs to add a bpf_sock->op check to the existing
bpf_sock_ops_{load,store}_hdr_opt() helpers to ensure these helpers can only be
used by the BPF_SOCK_OPS_PARSE_HDR_OPT_CB, BPF_SOCK_OPS_HDR_OPT_LEN_CB, and
BPF_SOCK_OPS_WRITE_HDR_OPT_CB callback.
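The check itself can be a couple of lines, e.g. (sketch only; the exact errno
is an assumption):

	/* in bpf_sock_ops_load_hdr_opt() and bpf_sock_ops_store_hdr_opt(),
	 * before touching bpf_sock->skb
	 */
	if (bpf_sock->op != BPF_SOCK_OPS_PARSE_HDR_OPT_CB &&
	    bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB &&
	    bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
		return -EPERM;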
btw, how is the ack_skb used for the SCM_TSTAMP_ACK by the user space now?
^ permalink raw reply	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2024-12-13 0:28 ` Martin KaFai Lau
@ 2024-12-13 15:44 ` Jason Xing
2024-12-13 23:55 ` Martin KaFai Lau
2025-01-08 4:21 ` Jason Xing
1 sibling, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-13 15:44 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Fri, Dec 13, 2024 at 8:28 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/7/24 9:38 AM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > For now, there are three phases where we are not able to fetch
> > the right seqno from the skops->skb_data, because:
> > 1) in __dev_queue_xmit(), the skb->data doesn't point to the start
> > offset in tcp header.
> > 2) in tcp_ack_tstamp(), the skb doesn't have the tcp header.
> >
> > In the long run, we may add other trace points for bpf extension.
> > And the shinfo->tskey is always the same value for both bpf and
> > non-bpf cases. With that said, let's directly use shinfo->tskey
> > for TCP protocol.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > net/core/skbuff.c | 7 +++++--
> > 1 file changed, 5 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 7c59ef501c74..2e13643f791c 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5544,7 +5544,7 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> > int tstype)
> > {
> > struct timespec64 tstamp;
> > - u32 args[2] = {0, 0};
> > + u32 args[3] = {0, 0, 0};
> > int op;
> >
> > if (!sk)
> > @@ -5569,7 +5569,10 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> > return;
> > }
> >
> > - bpf_skops_tx_timestamping(sk, skb, op, 2, args);
> > + if (sk_is_tcp(sk))
> > + args[2] = skb_shinfo(skb)->tskey;
>
> Instead of only passing one info "skb_shinfo(skb)->tskey" of a skb, pass the
> whole skb ptr to the bpf prog. Take a look at bpf_skops_init_skb. Lets start
> with end_offset = 0 for now so that the bpf prog won't use it to read the
> skb->data. It can be revisited later.
>
> bpf_skops_init_skb(&sock_ops, skb, 0);
>
> The bpf prog can use bpf_cast_to_kern_ctx() and bpf_core_cast() to get to the
> skb_shinfo(skb). Take a look at the md_skb example in type_cast.c.
Sorry, I didn't give much thought to getting to the shinfo. That's
why I quickly gave up on using bpf_skops_init_skb() after I noticed the
seq of skb is always zero :(
I will test it tomorrow. Thanks.
>
> Then it needs to add a bpf_sock->op check to the existing
> bpf_sock_ops_{load,store}_hdr_opt() helpers to ensure these helpers can only be
> used by the BPF_SOCK_OPS_PARSE_HDR_OPT_CB, BPF_SOCK_OPS_HDR_OPT_LEN_CB, and
> BPF_SOCK_OPS_WRITE_HDR_OPT_CB callback.
Forgive me. I cannot see how the bpf_sock_ops_load_hdr_opt helper relates
to the current thread. Could you enlighten me?
>
> btw, how is the ack_skb used for the SCM_TSTAMP_ACK by the user space now?
To be honest, I hardly use the ack_skb[1] under this circumstance... I
think if someone offers a suggestion to use it, then we can support
it?
[1]
commit e7ed11ee945438b737e2ae2370e35591e16ec371
Author: Yousuk Seung <ysseung@google.com>
Date: Wed Jan 20 12:41:55 2021 -0800
tcp: add TTL to SCM_TIMESTAMPING_OPT_STATS
This patch adds TCP_NLA_TTL to SCM_TIMESTAMPING_OPT_STATS that exports
the time-to-live or hop limit of the latest incoming packet with
SCM_TSTAMP_ACK. The value exported may not be from the packet that acks
the sequence when incoming packets are aggregated. Exporting the
time-to-live or hop limit value of incoming packets helps to estimate
the hop count of the path of the flow that may change over time.
^ permalink raw reply	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2024-12-13 15:44 ` Jason Xing
@ 2024-12-13 23:55 ` Martin KaFai Lau
2024-12-14 0:09 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-13 23:55 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/13/24 7:44 AM, Jason Xing wrote:
>>> @@ -5569,7 +5569,10 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
>>> return;
>>> }
>>>
>>> - bpf_skops_tx_timestamping(sk, skb, op, 2, args);
>>> + if (sk_is_tcp(sk))
>>> + args[2] = skb_shinfo(skb)->tskey;
>> Instead of only passing one info "skb_shinfo(skb)->tskey" of a skb, pass the
>> whole skb ptr to the bpf prog. Take a look at bpf_skops_init_skb. Lets start
>> with end_offset = 0 for now so that the bpf prog won't use it to read the
>> skb->data. It can be revisited later.
>>
>> bpf_skops_init_skb(&sock_ops, skb, 0);
>>
>> The bpf prog can use bpf_cast_to_kern_ctx() and bpf_core_cast() to get to the
>> skb_shinfo(skb). Take a look at the md_skb example in type_cast.c.
> Sorry, I didn't give it much thought on getting to the shinfo. That's
> why I quickly gave up using bpf_skops_init_skb() after I noticed the
> seq of skb is always zero :(
>
> I will test it tomorrow. Thanks.
>
>> Then it needs to add a bpf_sock->op check to the existing
>> bpf_sock_ops_{load,store}_hdr_opt() helpers to ensure these helpers can only be
>> used by the BPF_SOCK_OPS_PARSE_HDR_OPT_CB, BPF_SOCK_OPS_HDR_OPT_LEN_CB, and
>> BPF_SOCK_OPS_WRITE_HDR_OPT_CB callback.
> Forgive me. I cannot see how the bpf_sock_ops_load_hdr_opt helper has
> something to do with the current thread? Could you enlighten me?
Sure. This is the same discussion as in patch 2, so it may be worth highlighting
something that I guess may be missing:
the fact that a bpf prog does not need to use a helper does not mean
the bpf prog is not allowed to call that helper, even when calling it is not safe.
The sockops prog running at the new timestamp hook does not need to call
bpf_setsockopt() but it does not mean the bpf prog is not allowed to call
bpf_setsockopt() without holding the sk_lock which is then broken.
The sockops timestamp prog does not need to use the
bpf_sock_ops_{load,store}_hdr_opt but it does not mean the bpf prog is not
allowed to call bpf_sock_ops_{load,store}_hdr_opt to change the skb which is
then also broken.
Now, skops->skb is not NULL only when the sockops prog is allowed to read/write
the skb.
With bpf_skops_init_skb(), skops->skb will not be NULL in the new timestamp
callback hook. bpf_sock_ops_{load,store}_hdr_opt() will be able to use the
skops->skb and it will be broken.
>
>> btw, how is the ack_skb used for the SCM_TSTAMP_ACK by the user space now?
> To be honest, I hardly use the ack_skb[1] under this circumstance... I
> think if someone offers a suggestion to use it, then we can support
> it?
Thanks for the pointer.
Yep, supporting it later is fine. I am curious because the ack_skb is used by
the user space timestamping now but not in your patch. I was asking to ensure
that we will be able to support it in the future if there is a need. We
should be able to reuse the skops->syn_skb to support that in the future.
>
> [1]
> commit e7ed11ee945438b737e2ae2370e35591e16ec371
> Author: Yousuk Seung<ysseung@google.com>
> Date: Wed Jan 20 12:41:55 2021 -0800
>
> tcp: add TTL to SCM_TIMESTAMPING_OPT_STATS
>
> This patch adds TCP_NLA_TTL to SCM_TIMESTAMPING_OPT_STATS that exports
> the time-to-live or hop limit of the latest incoming packet with
> SCM_TSTAMP_ACK. The value exported may not be from the packet that acks
> the sequence when incoming packets are aggregated. Exporting the
> time-to-live or hop limit value of incoming packets helps to estimate
> the hop count of the path of the flow that may change over time.
^ permalink raw reply	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2024-12-13 23:55 ` Martin KaFai Lau
@ 2024-12-14 0:09 ` Jason Xing
0 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-14 0:09 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Sat, Dec 14, 2024 at 7:55 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/13/24 7:44 AM, Jason Xing wrote:
> >>> @@ -5569,7 +5569,10 @@ static void __skb_tstamp_tx_bpf(struct sock *sk, struct sk_buff *skb,
> >>> return;
> >>> }
> >>>
> >>> - bpf_skops_tx_timestamping(sk, skb, op, 2, args);
> >>> + if (sk_is_tcp(sk))
> >>> + args[2] = skb_shinfo(skb)->tskey;
> >> Instead of only passing one info "skb_shinfo(skb)->tskey" of a skb, pass the
> >> whole skb ptr to the bpf prog. Take a look at bpf_skops_init_skb. Lets start
> >> with end_offset = 0 for now so that the bpf prog won't use it to read the
> >> skb->data. It can be revisited later.
> >>
> >> bpf_skops_init_skb(&sock_ops, skb, 0);
> >>
> >> The bpf prog can use bpf_cast_to_kern_ctx() and bpf_core_cast() to get to the
> >> skb_shinfo(skb). Take a look at the md_skb example in type_cast.c.
> > Sorry, I didn't give it much thought on getting to the shinfo. That's
> > why I quickly gave up using bpf_skops_init_skb() after I noticed the
> > seq of skb is always zero :(
> >
> > I will test it tomorrow. Thanks.
> >
> >> Then it needs to add a bpf_sock->op check to the existing
> >> bpf_sock_ops_{load,store}_hdr_opt() helpers to ensure these helpers can only be
> >> used by the BPF_SOCK_OPS_PARSE_HDR_OPT_CB, BPF_SOCK_OPS_HDR_OPT_LEN_CB, and
> >> BPF_SOCK_OPS_WRITE_HDR_OPT_CB callback.
> > Forgive me. I cannot see how the bpf_sock_ops_load_hdr_opt helper has
> > something to do with the current thread? Could you enlighten me?
>
> Sure. This is the same discussion as in patch 2, so may be worth to highlight
> something that I guess may be missing:
>
> a bpf prog does not need to use a helper does not mean:
> a bpf prog is not allowed to call a helper because it is not safe.
>
> The sockops prog running at the new timestamp hook does not need to call
> bpf_setsockopt() but it does not mean the bpf prog is not allowed to call
> bpf_setsockopt() without holding the sk_lock which is then broken.
>
> The sockops timestamp prog does not need to use the
> bpf_sock_ops_{load,store}_hdr_opt but it does not mean the bpf prog is not
> allowed to call bpf_sock_ops_{load,store}_hdr_opt to change the skb which is
> then also broken.
Ah, I see. Thanks for your explanation :)
>
> Now, skops->skb is not NULL only when the sockops prog is allowed to read/write
> the skb.
>
> With bpf_skops_init_skb(), skops->skb will not be NULL in the new timestamp
> callback hook. bpf_sock_ops_{load,store}_hdr_opt() will be able to use the
> skops->skb and it will be broken.
I will take care of it along the way.
>
> >
> >> btw, how is the ack_skb used for the SCM_TSTAMP_ACK by the user space now?
> > To be honest, I hardly use the ack_skb[1] under this circumstance... I
> > think if someone offers a suggestion to use it, then we can support
> > it?
>
> Thanks for the pointer.
>
> Yep, supporting it later is fine. I am curious because the ack_skb is used in
> the user space time stamping now but not in your patch. I was asking to ensure
> that we should be able to support it in the future if there is a need. We
> should be able to reuse the skops->syn_skb to support that in the future.
Agreed. I feel that I can handle this part after this series.
>
> >
> > [1]
> > commit e7ed11ee945438b737e2ae2370e35591e16ec371
> > Author: Yousuk Seung<ysseung@google.com>
> > Date: Wed Jan 20 12:41:55 2021 -0800
> >
> > tcp: add TTL to SCM_TIMESTAMPING_OPT_STATS
> >
> > This patch adds TCP_NLA_TTL to SCM_TIMESTAMPING_OPT_STATS that exports
> > the time-to-live or hop limit of the latest incoming packet with
> > SCM_TSTAMP_ACK. The value exported may not be from the packet that acks
> > the sequence when incoming packets are aggregated. Exporting the
> > time-to-live or hop limit value of incoming packets helps to estimate
> > the hop count of the path of the flow that may change over time.
>
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2024-12-13 0:28 ` Martin KaFai Lau
2024-12-13 15:44 ` Jason Xing
@ 2025-01-08 4:21 ` Jason Xing
2025-01-08 12:55 ` Jason Xing
2025-01-10 20:35 ` Martin KaFai Lau
1 sibling, 2 replies; 48+ messages in thread
From: Jason Xing @ 2025-01-08 4:21 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
Hi Martin,
> > - bpf_skops_tx_timestamping(sk, skb, op, 2, args);
> > + if (sk_is_tcp(sk))
> > + args[2] = skb_shinfo(skb)->tskey;
>
> Instead of only passing one info "skb_shinfo(skb)->tskey" of a skb, pass the
> whole skb ptr to the bpf prog. Take a look at bpf_skops_init_skb. Lets start
> with end_offset = 0 for now so that the bpf prog won't use it to read the
> skb->data. It can be revisited later.
>
> bpf_skops_init_skb(&sock_ops, skb, 0);
>
> The bpf prog can use bpf_cast_to_kern_ctx() and bpf_core_cast() to get to the
> skb_shinfo(skb). Take a look at the md_skb example in type_cast.c.
In recent days, I've been working on this part. It turns out to be
infeasible to pass "struct __sk_buff *skb" as the second parameter in
skops_sockopt() in patch [11/11]. I cannot find a way to acquire the
skb itself, sorry for that :(
IIUC, there are three approaches to fetch the tskey:
1. Like what I wrote in this patchset, passing the tskey to bpf prog
through calling __cgroup_bpf_run_filter_sock_ops() is simple and
enough.
2. Considering future usability, I feel I can add skb_head, skb_end
fields in sock_ops_convert_ctx_access().
3. Only adding a new field tskey like skb_hwtstamp in
sock_ops_convert_ctx_access()
If there is something wrong, please correct me. Thanks!
patch[11/11]: https://lore.kernel.org/all/20241207173803.90744-12-kerneljasonxing@gmail.com/
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2025-01-08 4:21 ` Jason Xing
@ 2025-01-08 12:55 ` Jason Xing
2025-01-10 20:35 ` Martin KaFai Lau
1 sibling, 0 replies; 48+ messages in thread
From: Jason Xing @ 2025-01-08 12:55 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Wed, Jan 8, 2025 at 12:21 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> Hi Martin,
>
> > > - bpf_skops_tx_timestamping(sk, skb, op, 2, args);
> > > + if (sk_is_tcp(sk))
> > > + args[2] = skb_shinfo(skb)->tskey;
> >
> > Instead of only passing one info "skb_shinfo(skb)->tskey" of a skb, pass the
> > whole skb ptr to the bpf prog. Take a look at bpf_skops_init_skb. Lets start
> > with end_offset = 0 for now so that the bpf prog won't use it to read the
> > skb->data. It can be revisited later.
> >
> > bpf_skops_init_skb(&sock_ops, skb, 0);
> >
> > The bpf prog can use bpf_cast_to_kern_ctx() and bpf_core_cast() to get to the
> > skb_shinfo(skb). Take a look at the md_skb example in type_cast.c.
>
> In recent days, I've been working on this part. It turns out to be
> infeasible to pass "struct __sk_buff *skb" as the second parameter in
> skops_sockopt() in patch [11/11]. I cannot find a way to acquire the
> skb itself, sorry for that :(
>
> IIUC, there are three approaches to fetch the tskey:
> 1. Like what I wrote in this patchset, passing the tskey to bpf prog
> through calling __cgroup_bpf_run_filter_sock_ops() is simple and
> enough.
> 2. Considering future usability, I feel I can add skb_head, skb_end
> fields in sock_ops_convert_ctx_access().
> 3. Only adding a new field tskey like skb_hwtstamp in
> sock_ops_convert_ctx_access()
After considering those approaches over and over again, I think
#3 is more suitable [1]. It is aligned with other usages such as
skb_hwtstamp. Normally, only temporary variables are passed through
__cgroup_bpf_run_filter_sock_ops(), so we don't need to worry about
breaking the skb because of safety issues. How about the draft below?
[1] diff patch
+ case offsetof(struct bpf_sock_ops, tskey): {
+ struct bpf_insn *jmp_on_null_skb;
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ /* Reserve one insn to test skb == NULL */
+ jmp_on_null_skb = insn++;
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
+ /* tskey is a u32, so use a 4-byte load */
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ tskey, 4,
+ target_size));
+ *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
+ insn - jmp_on_null_skb - 1);
+ break;
+ }
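For completeness, the uapi side of #3 would just be one more field next to
the existing skb_hwtstamp in struct bpf_sock_ops (sketch only, name to be
decided):

	/* include/uapi/linux/bpf.h, struct bpf_sock_ops */
	__u64 skb_hwtstamp;
	__u32 tskey;	/* skb_shinfo(skb)->tskey of the timestamped skb */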
^ permalink raw reply	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2025-01-08 4:21 ` Jason Xing
2025-01-08 12:55 ` Jason Xing
@ 2025-01-10 20:35 ` Martin KaFai Lau
2025-01-10 22:46 ` Jason Xing
1 sibling, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2025-01-10 20:35 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 1/7/25 8:21 PM, Jason Xing wrote:
> Hi Martin,
>
>>> - bpf_skops_tx_timestamping(sk, skb, op, 2, args);
>>> + if (sk_is_tcp(sk))
>>> + args[2] = skb_shinfo(skb)->tskey;
>>
>> Instead of only passing one info "skb_shinfo(skb)->tskey" of a skb, pass the
>> whole skb ptr to the bpf prog. Take a look at bpf_skops_init_skb. Lets start
>> with end_offset = 0 for now so that the bpf prog won't use it to read the
>> skb->data. It can be revisited later.
>>
>> bpf_skops_init_skb(&sock_ops, skb, 0);
>>
>> The bpf prog can use bpf_cast_to_kern_ctx() and bpf_core_cast() to get to the
>> skb_shinfo(skb). Take a look at the md_skb example in type_cast.c.
>
> In recent days, I've been working on this part. It turns out to be
> infeasible to pass "struct __sk_buff *skb" as the second parameter in
> skops_sockopt() in patch [11/11]. I cannot find a way to acquire the
> skb itself
I didn't mean to pass skb in sock_ops_kern->args[] or pass skb to the bpf prog
"SEC("sockops") skops_sockopt(struct bpf_sock_ops *skops, struct sk_buff *skb)".
The bpf prog can only take one ctx argument which is
"struct bpf_sock_ops *skops" here.
I meant to have the kernel initialize sock_ops_kern->skb by doing
"bpf_skops_init_skb(&sock_ops, skb, 0);" before running the bpf prog.
The bpf prog can read the skb by using bpf_cast_to_kern_ctx() and bpf_core_cast().
Something like the following. I directly changed the existing test_tcp_hdr_options.c.
It has not been converted to use vmlinux.h, so I need to redefine some parts of
sk_buff, skb_shared_info, and bpf_sock_ops_kern. Your new test should directly
include <vmlinux.h> and will not need to redefine them.
Untested code:
diff --git i/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c w/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
index 5f4e87ee949a..c98ebe71f6ba 100644
--- i/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
+++ w/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
@@ -12,8 +12,10 @@
#include <linux/types.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
+#include <bpf/bpf_core_read.h>
#define BPF_PROG_TEST_TCP_HDR_OPTIONS
#include "test_tcp_hdr_options.h"
+#include "bpf_kfuncs.h"
#ifndef sizeof_field
#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
@@ -348,9 +350,63 @@ static int current_mss_opt_len(struct bpf_sock_ops *skops)
return CG_OK;
}
+struct sk_buff {
+ unsigned int end;
+ unsigned char *head;
+} __attribute__((preserve_access_index));
+
+struct skb_shared_info {
+ __u8 flags;
+ __u8 meta_len;
+ __u8 nr_frags;
+ __u8 tx_flags;
+ unsigned short gso_size;
+ unsigned short gso_segs;
+ unsigned int gso_type;
+ __u32 tskey;
+} __attribute__((preserve_access_index));
+
+struct bpf_sock_ops_kern {
+ struct sock *sk;
+ union {
+ __u32 args[4];
+ __u32 reply;
+ __u32 replylong[4];
+ };
+ struct sk_buff *syn_skb;
+ struct sk_buff *skb;
+ void *skb_data_end;
+ __u8 op;
+ __u8 is_fullsock;
+ __u8 remaining_opt_len;
+ __u64 temp; /* temp and everything after is not
+ * initialized to 0 before calling
+ * the BPF program. New fields that
+ * should be initialized to 0 should
+ * be inserted before temp.
+ * temp is scratch storage used by
+ * sock_ops_convert_ctx_access
+ * as temporary storage of a register.
+ */
+} __attribute__((preserve_access_index));
+
static int handle_hdr_opt_len(struct bpf_sock_ops *skops)
{
__u8 tcp_flags = skops_tcp_flags(skops);
+ struct bpf_sock_ops_kern *skops_kern;
+ struct skb_shared_info *shared_info;
+ struct sk_buff *skb;
+
+ skops_kern = bpf_cast_to_kern_ctx(skops);
+ skb = skops_kern->skb;
+
+ if (skb) {
+ shared_info = bpf_core_cast(skb->head + skb->end, struct skb_shared_info);
+ /* printk as an example. don't do that in selftests. */
+ bpf_printk("tskey %u gso_size %u gso_segs %u gso_type %u flags %x\n",
+ shared_info->tskey, shared_info->gso_size,
+ shared_info->gso_segs, shared_info->gso_type, shared_info->flags);
+ }
if ((tcp_flags & TCPHDR_SYNACK) == TCPHDR_SYNACK)
return synack_opt_len(skops);
^ permalink raw reply related	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension
2025-01-10 20:35 ` Martin KaFai Lau
@ 2025-01-10 22:46 ` Jason Xing
0 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2025-01-10 22:46 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Sat, Jan 11, 2025 at 4:36 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 1/7/25 8:21 PM, Jason Xing wrote:
> > Hi Martin,
> >
> >>> - bpf_skops_tx_timestamping(sk, skb, op, 2, args);
> >>> + if (sk_is_tcp(sk))
> >>> + args[2] = skb_shinfo(skb)->tskey;
> >>
> >> Instead of only passing one info "skb_shinfo(skb)->tskey" of a skb, pass the
> >> whole skb ptr to the bpf prog. Take a look at bpf_skops_init_skb. Lets start
> >> with end_offset = 0 for now so that the bpf prog won't use it to read the
> >> skb->data. It can be revisited later.
> >>
> >> bpf_skops_init_skb(&sock_ops, skb, 0);
> >>
> >> The bpf prog can use bpf_cast_to_kern_ctx() and bpf_core_cast() to get to the
> >> skb_shinfo(skb). Take a look at the md_skb example in type_cast.c.
> >
> > In recent days, I've been working on this part. It turns out to be
> > infeasible to pass "struct __sk_buff *skb" as the second parameter in
> > skops_sockopt() in patch [11/11]. I cannot find a way to acquire the
> > skb itself
>
> I didn't mean to pass skb in sock_ops_kern->args[] or pass skb to the bpf prog
> "SEC("sockops") skops_sockopt(struct bpf_sock_ops *skops, struct sk_buff *skb)".
> The bpf prog can only take one ctx argument which is
> "struct bpf_sock_ops *skops" here.
>
> I meant to have kernel initializing the sock_ops_kern->skb by doing
> "bpf_skops_init_skb(&sock_ops, skb, 0);" before running the bpf prog.
>
> The bpf prog can read the skb by using bpf_cast_to_kern_ctx() and bpf_core_cast().
>
> Something like the following. I directly change the existing test_tcp_hdr_options.c.
> It has not been changed to use the vmlinux.h, so I need to redefine some parts of
> the sk_buff, skb_shared_info, and bpf_sock_ops_kern. Your new test should directly
> include <vmlinux.h> and no need to redefine them.
>
> Untested code:
>
> diff --git i/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c w/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
> index 5f4e87ee949a..c98ebe71f6ba 100644
> --- i/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
> +++ w/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
> @@ -12,8 +12,10 @@
> #include <linux/types.h>
> #include <bpf/bpf_helpers.h>
> #include <bpf/bpf_endian.h>
> +#include <bpf/bpf_core_read.h>
> #define BPF_PROG_TEST_TCP_HDR_OPTIONS
> #include "test_tcp_hdr_options.h"
> +#include "bpf_kfuncs.h"
>
> #ifndef sizeof_field
> #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
> @@ -348,9 +350,63 @@ static int current_mss_opt_len(struct bpf_sock_ops *skops)
> return CG_OK;
> }
>
> +struct sk_buff {
> + unsigned int end;
> + unsigned char *head;
> +} __attribute__((preserve_access_index));
> +
> +struct skb_shared_info {
> + __u8 flags;
> + __u8 meta_len;
> + __u8 nr_frags;
> + __u8 tx_flags;
> + unsigned short gso_size;
> + unsigned short gso_segs;
> + unsigned int gso_type;
> + __u32 tskey;
> +} __attribute__((preserve_access_index));
> +
> +struct bpf_sock_ops_kern {
> + struct sock *sk;
> + union {
> + __u32 args[4];
> + __u32 reply;
> + __u32 replylong[4];
> + };
> + struct sk_buff *syn_skb;
> + struct sk_buff *skb;
> + void *skb_data_end;
> + __u8 op;
> + __u8 is_fullsock;
> + __u8 remaining_opt_len;
> + __u64 temp; /* temp and everything after is not
> + * initialized to 0 before calling
> + * the BPF program. New fields that
> + * should be initialized to 0 should
> + * be inserted before temp.
> + * temp is scratch storage used by
> + * sock_ops_convert_ctx_access
> + * as temporary storage of a register.
> + */
> +} __attribute__((preserve_access_index));
> +
> static int handle_hdr_opt_len(struct bpf_sock_ops *skops)
> {
> __u8 tcp_flags = skops_tcp_flags(skops);
> + struct bpf_sock_ops_kern *skops_kern;
> + struct skb_shared_info *shared_info;
> + struct sk_buff *skb;
> +
> + skops_kern = bpf_cast_to_kern_ctx(skops);
Oh, I misunderstood the use of the bpf_cast_to_kern_ctx() function and
struggled to fetch the "struct bpf_sock_ops_kern".
Now I see. Thanks so much for the detailed code! I will try
this in a few hours.
> + skb = skops_kern->skb;
> +
> + if (skb) {
> + shared_info = bpf_core_cast(skb->head + skb->end, struct skb_shared_info);
> + /* printk as an example. don't do that in selftests. */
> + bpf_printk("tskey %u gso_size %u gso_segs %u gso_type %u flags %x\n",
> + shared_info->tskey, shared_info->gso_size,
> + shared_info->gso_segs, shared_info->gso_type, shared_info->flags);
> + }
>
> if ((tcp_flags & TCPHDR_SYNACK) == TCPHDR_SYNACK)
> return synack_opt_len(skops);
>
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature
2024-12-07 17:37 [PATCH net-next v4 00/11] net-timestamp: bpf extension to equip applications transparently Jason Xing
` (9 preceding siblings ...)
2024-12-07 17:38 ` [PATCH net-next v4 10/11] net-timestamp: export the tskey for TCP bpf extension Jason Xing
@ 2024-12-07 17:38 ` Jason Xing
2024-12-09 14:45 ` Willem de Bruijn
2024-12-13 1:14 ` Martin KaFai Lau
10 siblings, 2 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-07 17:38 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Only check whether we pass those three key points after enabling the
bpf extension for so_timestamping. At each point, we can choose
whether to print the current timestamp.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
.../bpf/prog_tests/so_timestamping.c | 97 +++++++++++++
.../selftests/bpf/progs/so_timestamping.c | 135 ++++++++++++++++++
2 files changed, 232 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/so_timestamping.c
create mode 100644 tools/testing/selftests/bpf/progs/so_timestamping.c
diff --git a/tools/testing/selftests/bpf/prog_tests/so_timestamping.c b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
new file mode 100644
index 000000000000..c5978444f9c8
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Tencent */
+
+#define _GNU_SOURCE
+#include <sched.h>
+#include <linux/socket.h>
+#include <linux/tls.h>
+#include <net/if.h>
+
+#include "test_progs.h"
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+
+#include "so_timestamping.skel.h"
+
+#define CG_NAME "/so-timestamping-test"
+
+static const char addr4_str[] = "127.0.0.1";
+static const char addr6_str[] = "::1";
+static struct so_timestamping *skel;
+static int cg_fd;
+
+static int create_netns(void)
+{
+ if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
+ return -1;
+
+ if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
+ return -1;
+
+ return 0;
+}
+
+static void test_tcp(int family)
+{
+ struct so_timestamping__bss *bss = skel->bss;
+ char buf[] = "testing testing";
+ int sfd = -1, cfd = -1;
+ int n;
+
+ memset(bss, 0, sizeof(*bss));
+
+ sfd = start_server(family, SOCK_STREAM,
+ family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
+ if (!ASSERT_GE(sfd, 0, "start_server"))
+ goto out;
+
+ cfd = connect_to_fd(sfd, 0);
+ if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {
+ close(sfd);
+ goto out;
+ }
+
+ n = write(cfd, buf, sizeof(buf));
+ if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
+ goto out;
+
+ ASSERT_EQ(bss->nr_active, 1, "nr_active");
+ ASSERT_EQ(bss->nr_sched, 1, "nr_sched");
+ ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw");
+ ASSERT_EQ(bss->nr_ack, 1, "nr_ack");
+
+out:
+ if (sfd >= 0)
+ close(sfd);
+ if (cfd >= 0)
+ close(cfd);
+}
+
+void test_so_timestamping(void)
+{
+ cg_fd = test__join_cgroup(CG_NAME);
+ if (cg_fd < 0)
+ return;
+
+ if (create_netns())
+ goto done;
+
+ skel = so_timestamping__open();
+ if (!ASSERT_OK_PTR(skel, "open skel"))
+ goto done;
+
+ if (!ASSERT_OK(so_timestamping__load(skel), "load skel"))
+ goto done;
+
+ skel->links.skops_sockopt =
+ bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
+ if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
+ goto done;
+
+ test_tcp(AF_INET6);
+ test_tcp(AF_INET);
+
+done:
+ so_timestamping__destroy(skel);
+ close(cg_fd);
+}
diff --git a/tools/testing/selftests/bpf/progs/so_timestamping.c b/tools/testing/selftests/bpf/progs/so_timestamping.c
new file mode 100644
index 000000000000..f64e94dbd70e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/so_timestamping.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Tencent */
+
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+#define SK_BPF_CB_FLAGS 1009
+#define SK_BPF_CB_TX_TIMESTAMPING 1
+
+int nr_active;
+int nr_passive;
+int nr_sched;
+int nr_txsw;
+int nr_ack;
+
+struct sockopt_test {
+ int opt;
+ int new;
+};
+
+static const struct sockopt_test sol_socket_tests[] = {
+ { .opt = SK_BPF_CB_FLAGS, .new = SK_BPF_CB_TX_TIMESTAMPING, },
+ { .opt = 0, },
+};
+
+struct loop_ctx {
+ void *ctx;
+ struct sock *sk;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, 1024);
+} hash_map SEC(".maps");
+
+static u64 delay_tolerance_nsec = 5000000;
+
+static int bpf_test_sockopt_int(void *ctx, struct sock *sk,
+ const struct sockopt_test *t,
+ int level)
+{
+ int new, opt;
+
+ opt = t->opt;
+ new = t->new;
+
+ if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
+ return 1;
+
+ return 0;
+}
+
+static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
+{
+ const struct sockopt_test *t;
+
+ if (i >= ARRAY_SIZE(sol_socket_tests))
+ return 1;
+
+ t = &sol_socket_tests[i];
+ if (!t->opt)
+ return 1;
+
+ return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
+}
+
+static int bpf_test_sockopt(void *ctx, struct sock *sk)
+{
+ struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
+ int n;
+
+ n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
+ if (n != ARRAY_SIZE(sol_socket_tests))
+ return -1;
+
+ return 0;
+}
+
+static bool bpf_test_delay(struct bpf_sock_ops *skops)
+{
+ u64 timestamp = bpf_ktime_get_ns();
+ u32 seq = skops->args[2];
+ u64 *value;
+
+ value = bpf_map_lookup_elem(&hash_map, &seq);
+ if (value && (timestamp - *value > delay_tolerance_nsec)) {
+ bpf_printk("time delay: %lu", timestamp - *value);
+ return false;
+ }
+
+ bpf_map_update_elem(&hash_map, &seq, &timestamp, BPF_ANY);
+ return true;
+}
+
+SEC("sockops")
+int skops_sockopt(struct bpf_sock_ops *skops)
+{
+ struct bpf_sock *bpf_sk = skops->sk;
+ struct sock *sk;
+
+ if (!bpf_sk)
+ return 1;
+
+ sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
+ if (!sk)
+ return 1;
+
+ switch (skops->op) {
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ nr_active += !bpf_test_sockopt(skops, sk);
+ break;
+ case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
+ if (bpf_test_delay(skops))
+ nr_sched += 1;
+ break;
+ case BPF_SOCK_OPS_TS_SW_OPT_CB:
+ if (bpf_test_delay(skops))
+ nr_txsw += 1;
+ break;
+ case BPF_SOCK_OPS_TS_ACK_OPT_CB:
+ if (bpf_test_delay(skops))
+ nr_ack += 1;
+ break;
+ }
+
+ return 1;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.37.3
^ permalink raw reply related	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature
2024-12-07 17:38 ` [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature Jason Xing
@ 2024-12-09 14:45 ` Willem de Bruijn
2024-12-09 14:58 ` Jason Xing
2024-12-13 1:14 ` Martin KaFai Lau
1 sibling, 1 reply; 48+ messages in thread
From: Willem de Bruijn @ 2024-12-09 14:45 UTC (permalink / raw)
To: Jason Xing, davem, edumazet, kuba, pabeni, dsahern,
willemdebruijn.kernel, willemb, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa
Cc: bpf, netdev, Jason Xing
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
in subject: s/so_timstamping/so_timestamping
> Only check if we pass those three key points after we enable the
> bpf extension for so_timestamping. During each point, we can choose
> whether to print the current timestamp.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> .../bpf/prog_tests/so_timestamping.c | 97 +++++++++++++
> .../selftests/bpf/progs/so_timestamping.c | 135 ++++++++++++++++++
> 2 files changed, 232 insertions(+)
> create mode 100644 tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> create mode 100644 tools/testing/selftests/bpf/progs/so_timestamping.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/so_timestamping.c b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> new file mode 100644
> index 000000000000..c5978444f9c8
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> @@ -0,0 +1,97 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2024 Tencent */
> +
> +#define _GNU_SOURCE
> +#include <sched.h>
> +#include <linux/socket.h>
> +#include <linux/tls.h>
> +#include <net/if.h>
> +
> +#include "test_progs.h"
> +#include "cgroup_helpers.h"
> +#include "network_helpers.h"
> +
> +#include "so_timestamping.skel.h"
> +
> +#define CG_NAME "/so-timestamping-test"
> +
> +static const char addr4_str[] = "127.0.0.1";
> +static const char addr6_str[] = "::1";
> +static struct so_timestamping *skel;
> +static int cg_fd;
> +
> +static int create_netns(void)
> +{
> + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
> + return -1;
> +
> + if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
> + return -1;
> +
> + return 0;
> +}
> +
> +static void test_tcp(int family)
> +{
> + struct so_timestamping__bss *bss = skel->bss;
> + char buf[] = "testing testing";
> + int sfd = -1, cfd = -1;
> + int n;
> +
> + memset(bss, 0, sizeof(*bss));
> +
> + sfd = start_server(family, SOCK_STREAM,
> + family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
> + if (!ASSERT_GE(sfd, 0, "start_server"))
> + goto out;
> +
> + cfd = connect_to_fd(sfd, 0);
> + if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {
> + close(sfd);
> + goto out;
> + }
> +
> + n = write(cfd, buf, sizeof(buf));
> + if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
> + goto out;
> +
> + ASSERT_EQ(bss->nr_active, 1, "nr_active");
> + ASSERT_EQ(bss->nr_sched, 1, "nr_sched");
> + ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw");
> + ASSERT_EQ(bss->nr_ack, 1, "nr_ack");
> +
> +out:
> + if (sfd >= 0)
> + close(sfd);
> + if (cfd >= 0)
> + close(cfd);
> +}
> +
> +void test_so_timestamping(void)
> +{
> + cg_fd = test__join_cgroup(CG_NAME);
> + if (cg_fd < 0)
> + return;
> +
> + if (create_netns())
> + goto done;
> +
> + skel = so_timestamping__open();
> + if (!ASSERT_OK_PTR(skel, "open skel"))
> + goto done;
> +
> + if (!ASSERT_OK(so_timestamping__load(skel), "load skel"))
> + goto done;
> +
> + skel->links.skops_sockopt =
> + bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
> + if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
> + goto done;
> +
> + test_tcp(AF_INET6);
> + test_tcp(AF_INET);
> +
> +done:
> + so_timestamping__destroy(skel);
> + close(cg_fd);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/so_timestamping.c b/tools/testing/selftests/bpf/progs/so_timestamping.c
> new file mode 100644
> index 000000000000..f64e94dbd70e
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/so_timestamping.c
> @@ -0,0 +1,135 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2024 Tencent */
> +
> +#include "vmlinux.h"
> +#include "bpf_tracing_net.h"
> +#include <bpf/bpf_core_read.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include "bpf_misc.h"
> +
> +#define SK_BPF_CB_FLAGS 1009
> +#define SK_BPF_CB_TX_TIMESTAMPING 1
> +
> +int nr_active;
> +int nr_passive;
> +int nr_sched;
> +int nr_txsw;
> +int nr_ack;
> +
> +struct sockopt_test {
> + int opt;
> + int new;
> +};
> +
> +static const struct sockopt_test sol_socket_tests[] = {
> + { .opt = SK_BPF_CB_FLAGS, .new = SK_BPF_CB_TX_TIMESTAMPING, },
> + { .opt = 0, },
> +};
> +
> +struct loop_ctx {
> + void *ctx;
> + struct sock *sk;
> +};
> +
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __type(key, u32);
> + __type(value, u64);
> + __uint(max_entries, 1024);
> +} hash_map SEC(".maps");
> +
> +static u64 delay_tolerance_nsec = 5000000;
> +
> +static int bpf_test_sockopt_int(void *ctx, struct sock *sk,
> + const struct sockopt_test *t,
> + int level)
> +{
> + int new, opt;
> +
> + opt = t->opt;
> + new = t->new;
> +
> + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
> + return 1;
> +
> + return 0;
> +}
> +
> +static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
> +{
> + const struct sockopt_test *t;
> +
> + if (i >= ARRAY_SIZE(sol_socket_tests))
> + return 1;
> +
> + t = &sol_socket_tests[i];
> + if (!t->opt)
> + return 1;
> +
> + return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
> +}
> +
> +static int bpf_test_sockopt(void *ctx, struct sock *sk)
> +{
> + struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
> + int n;
> +
> + n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
> + if (n != ARRAY_SIZE(sol_socket_tests))
> + return -1;
> +
> + return 0;
> +}
> +
> +static bool bpf_test_delay(struct bpf_sock_ops *skops)
> +{
> + u64 timestamp = bpf_ktime_get_ns();
> + u32 seq = skops->args[2];
> + u64 *value;
> +
> + value = bpf_map_lookup_elem(&hash_map, &seq);
> + if (value && (timestamp - *value > delay_tolerance_nsec)) {
> + bpf_printk("time delay: %lu", timestamp - *value);
> + return false;
> + }
> +
> + bpf_map_update_elem(&hash_map, &seq, &timestamp, BPF_ANY);
Maybe enforce that you expect value to be found for all cases except
the first (SCHED). I.e., that they share the same seq/tskey.
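Something like this sketch would encode that expectation (the expect_found
parameter is only for illustration):

static bool bpf_test_delay(struct bpf_sock_ops *skops, bool expect_found)
{
	u64 timestamp = bpf_ktime_get_ns();
	u32 seq = skops->args[2];
	u64 *value;

	value = bpf_map_lookup_elem(&hash_map, &seq);
	/* SCHED is the first callback for a given seq/tskey, so no entry may
	 * exist yet; SW and ACK must find the entry that SCHED stored.
	 */
	if (!!value != expect_found)
		return false;
	if (value && timestamp - *value > delay_tolerance_nsec) {
		bpf_printk("time delay: %lu", timestamp - *value);
		return false;
	}

	bpf_map_update_elem(&hash_map, &seq, &timestamp, BPF_ANY);
	return true;
}

with bpf_test_delay(skops, false) for SCHED and bpf_test_delay(skops, true)
for SW/ACK in skops_sockopt().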
> + return true;
> +}
> +
> +SEC("sockops")
> +int skops_sockopt(struct bpf_sock_ops *skops)
> +{
> + struct bpf_sock *bpf_sk = skops->sk;
> + struct sock *sk;
> +
> + if (!bpf_sk)
> + return 1;
> +
> + sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
> + if (!sk)
> + return 1;
> +
> + switch (skops->op) {
> + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
> + nr_active += !bpf_test_sockopt(skops, sk);
> + break;
> + case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
> + if (bpf_test_delay(skops))
> + nr_sched += 1;
> + break;
> + case BPF_SOCK_OPS_TS_SW_OPT_CB:
> + if (bpf_test_delay(skops))
> + nr_txsw += 1;
> + break;
> + case BPF_SOCK_OPS_TS_ACK_OPT_CB:
> + if (bpf_test_delay(skops))
> + nr_ack += 1;
> + break;
> + }
> +
> + return 1;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> --
> 2.37.3
>
^ permalink raw reply	[flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature
2024-12-09 14:45 ` Willem de Bruijn
@ 2024-12-09 14:58 ` Jason Xing
0 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-09 14:58 UTC (permalink / raw)
To: Willem de Bruijn
Cc: davem, edumazet, kuba, pabeni, dsahern, willemb, ast, daniel,
andrii, martin.lau, eddyz87, song, yonghong.song, john.fastabend,
kpsingh, sdf, haoluo, jolsa, bpf, netdev, Jason Xing
On Mon, Dec 9, 2024 at 10:45 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
>
> in subject: s/so_timstamping/so_timestamping
Will fix it soon.
>
> > Only check if we pass those three key points after we enable the
> > bpf extension for so_timestamping. During each point, we can choose
> > whether to print the current timestamp.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > .../bpf/prog_tests/so_timestamping.c | 97 +++++++++++++
> > .../selftests/bpf/progs/so_timestamping.c | 135 ++++++++++++++++++
> > 2 files changed, 232 insertions(+)
> > create mode 100644 tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> > create mode 100644 tools/testing/selftests/bpf/progs/so_timestamping.c
> >
> > diff --git a/tools/testing/selftests/bpf/prog_tests/so_timestamping.c b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> > new file mode 100644
> > index 000000000000..c5978444f9c8
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> > @@ -0,0 +1,97 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2024 Tencent */
> > +
> > +#define _GNU_SOURCE
> > +#include <sched.h>
> > +#include <linux/socket.h>
> > +#include <linux/tls.h>
> > +#include <net/if.h>
> > +
> > +#include "test_progs.h"
> > +#include "cgroup_helpers.h"
> > +#include "network_helpers.h"
> > +
> > +#include "so_timestamping.skel.h"
> > +
> > +#define CG_NAME "/so-timestamping-test"
> > +
> > +static const char addr4_str[] = "127.0.0.1";
> > +static const char addr6_str[] = "::1";
> > +static struct so_timestamping *skel;
> > +static int cg_fd;
> > +
> > +static int create_netns(void)
> > +{
> > + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
> > + return -1;
> > +
> > + if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
> > + return -1;
> > +
> > + return 0;
> > +}
> > +
> > +static void test_tcp(int family)
> > +{
> > + struct so_timestamping__bss *bss = skel->bss;
> > + char buf[] = "testing testing";
> > + int sfd = -1, cfd = -1;
> > + int n;
> > +
> > + memset(bss, 0, sizeof(*bss));
> > +
> > + sfd = start_server(family, SOCK_STREAM,
> > + family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
> > + if (!ASSERT_GE(sfd, 0, "start_server"))
> > + goto out;
> > +
> > + cfd = connect_to_fd(sfd, 0);
> > + if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {
> > + close(sfd);
> > + goto out;
> > + }
> > +
> > + n = write(cfd, buf, sizeof(buf));
> > + if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
> > + goto out;
> > +
> > + ASSERT_EQ(bss->nr_active, 1, "nr_active");
> > + ASSERT_EQ(bss->nr_sched, 1, "nr_sched");
> > + ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw");
> > + ASSERT_EQ(bss->nr_ack, 1, "nr_ack");
> > +
> > +out:
> > + if (sfd >= 0)
> > + close(sfd);
> > + if (cfd >= 0)
> > + close(cfd);
> > +}
> > +
> > +void test_so_timestamping(void)
> > +{
> > + cg_fd = test__join_cgroup(CG_NAME);
> > + if (cg_fd < 0)
> > + return;
> > +
> > + if (create_netns())
> > + goto done;
> > +
> > + skel = so_timestamping__open();
> > + if (!ASSERT_OK_PTR(skel, "open skel"))
> > + goto done;
> > +
> > + if (!ASSERT_OK(so_timestamping__load(skel), "load skel"))
> > + goto done;
> > +
> > + skel->links.skops_sockopt =
> > + bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
> > + if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
> > + goto done;
> > +
> > + test_tcp(AF_INET6);
> > + test_tcp(AF_INET);
> > +
> > +done:
> > + so_timestamping__destroy(skel);
> > + close(cg_fd);
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/so_timestamping.c b/tools/testing/selftests/bpf/progs/so_timestamping.c
> > new file mode 100644
> > index 000000000000..f64e94dbd70e
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/so_timestamping.c
> > @@ -0,0 +1,135 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2024 Tencent */
> > +
> > +#include "vmlinux.h"
> > +#include "bpf_tracing_net.h"
> > +#include <bpf/bpf_core_read.h>
> > +#include <bpf/bpf_helpers.h>
> > +#include <bpf/bpf_tracing.h>
> > +#include "bpf_misc.h"
> > +
> > +#define SK_BPF_CB_FLAGS 1009
> > +#define SK_BPF_CB_TX_TIMESTAMPING 1
> > +
> > +int nr_active;
> > +int nr_passive;
> > +int nr_sched;
> > +int nr_txsw;
> > +int nr_ack;
> > +
> > +struct sockopt_test {
> > + int opt;
> > + int new;
> > +};
> > +
> > +static const struct sockopt_test sol_socket_tests[] = {
> > + { .opt = SK_BPF_CB_FLAGS, .new = SK_BPF_CB_TX_TIMESTAMPING, },
> > + { .opt = 0, },
> > +};
> > +
> > +struct loop_ctx {
> > + void *ctx;
> > + struct sock *sk;
> > +};
> > +
> > +struct {
> > + __uint(type, BPF_MAP_TYPE_HASH);
> > + __type(key, u32);
> > + __type(value, u64);
> > + __uint(max_entries, 1024);
> > +} hash_map SEC(".maps");
> > +
> > +static u64 delay_tolerance_nsec = 5000000;
> > +
> > +static int bpf_test_sockopt_int(void *ctx, struct sock *sk,
> > + const struct sockopt_test *t,
> > + int level)
> > +{
> > + int new, opt;
> > +
> > + opt = t->opt;
> > + new = t->new;
> > +
> > + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
> > + return 1;
> > +
> > + return 0;
> > +}
> > +
> > +static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
> > +{
> > + const struct sockopt_test *t;
> > +
> > + if (i >= ARRAY_SIZE(sol_socket_tests))
> > + return 1;
> > +
> > + t = &sol_socket_tests[i];
> > + if (!t->opt)
> > + return 1;
> > +
> > + return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
> > +}
> > +
> > +static int bpf_test_sockopt(void *ctx, struct sock *sk)
> > +{
> > + struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
> > + int n;
> > +
> > + n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
> > + if (n != ARRAY_SIZE(sol_socket_tests))
> > + return -1;
> > +
> > + return 0;
> > +}
> > +
> > +static bool bpf_test_delay(struct bpf_sock_ops *skops)
> > +{
> > + u64 timestamp = bpf_ktime_get_ns();
> > + u32 seq = skops->args[2];
> > + u64 *value;
> > +
> > + value = bpf_map_lookup_elem(&hash_map, &seq);
> > + if (value && (timestamp - *value > delay_tolerance_nsec)) {
> > + bpf_printk("time delay: %lu", timestamp - *value);
> > + return false;
> > + }
> > +
> > + bpf_map_update_elem(&hash_map, &seq, &timestamp, BPF_ANY);
>
> Maybe enforce that you expect value to be found for all cases except
> the first (SCHED). I.e., that they share the same seq/tskey.
Good suggestion. Will do it :)
Thanks,
Jason
>
> > + return true;
> > +}
> > +
> > +SEC("sockops")
> > +int skops_sockopt(struct bpf_sock_ops *skops)
> > +{
> > + struct bpf_sock *bpf_sk = skops->sk;
> > + struct sock *sk;
> > +
> > + if (!bpf_sk)
> > + return 1;
> > +
> > + sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
> > + if (!sk)
> > + return 1;
> > +
> > + switch (skops->op) {
> > + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
> > + nr_active += !bpf_test_sockopt(skops, sk);
> > + break;
> > + case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
> > + if (bpf_test_delay(skops))
> > + nr_sched += 1;
> > + break;
> > + case BPF_SOCK_OPS_TS_SW_OPT_CB:
> > + if (bpf_test_delay(skops))
> > + nr_txsw += 1;
> > + break;
> > + case BPF_SOCK_OPS_TS_ACK_OPT_CB:
> > + if (bpf_test_delay(skops))
> > + nr_ack += 1;
> > + break;
> > + }
> > +
> > + return 1;
> > +}
> > +
> > +char _license[] SEC("license") = "GPL";
> > --
> > 2.37.3
> >
>
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature
2024-12-07 17:38 ` [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature Jason Xing
2024-12-09 14:45 ` Willem de Bruijn
@ 2024-12-13 1:14 ` Martin KaFai Lau
2024-12-13 16:02 ` Jason Xing
1 sibling, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-13 1:14 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/7/24 9:38 AM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Only check if we pass those three key points after we enable the
> bpf extension for so_timestamping. During each point, we can choose
> whether to print the current timestamp.
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> .../bpf/prog_tests/so_timestamping.c | 97 +++++++++++++
> .../selftests/bpf/progs/so_timestamping.c | 135 ++++++++++++++++++
> 2 files changed, 232 insertions(+)
> create mode 100644 tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> create mode 100644 tools/testing/selftests/bpf/progs/so_timestamping.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/so_timestamping.c b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> new file mode 100644
> index 000000000000..c5978444f9c8
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> @@ -0,0 +1,97 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2024 Tencent */
> +
> +#define _GNU_SOURCE
> +#include <sched.h>
> +#include <linux/socket.h>
> +#include <linux/tls.h>
> +#include <net/if.h>
> +
> +#include "test_progs.h"
> +#include "cgroup_helpers.h"
> +#include "network_helpers.h"
> +
> +#include "so_timestamping.skel.h"
> +
> +#define CG_NAME "/so-timestamping-test"
> +
> +static const char addr4_str[] = "127.0.0.1";
> +static const char addr6_str[] = "::1";
> +static struct so_timestamping *skel;
> +static int cg_fd;
> +
> +static int create_netns(void)
> +{
> + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
> + return -1;
> +
> + if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
> + return -1;
> +
> + return 0;
> +}
> +
> +static void test_tcp(int family)
> +{
> + struct so_timestamping__bss *bss = skel->bss;
> + char buf[] = "testing testing";
> + int sfd = -1, cfd = -1;
> + int n;
> +
> + memset(bss, 0, sizeof(*bss));
> +
> + sfd = start_server(family, SOCK_STREAM,
> + family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
> + if (!ASSERT_GE(sfd, 0, "start_server"))
> + goto out;
> +
> + cfd = connect_to_fd(sfd, 0);
> + if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {
> + close(sfd);
> + goto out;
> + }
> +
> + n = write(cfd, buf, sizeof(buf));
> + if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
> + goto out;
> +
> + ASSERT_EQ(bss->nr_active, 1, "nr_active");
> + ASSERT_EQ(bss->nr_sched, 1, "nr_sched");
> + ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw");
> + ASSERT_EQ(bss->nr_ack, 1, "nr_ack");
> +
> +out:
> + if (sfd >= 0)
> + close(sfd);
> + if (cfd >= 0)
> + close(cfd);
> +}
> +
> +void test_so_timestamping(void)
> +{
> + cg_fd = test__join_cgroup(CG_NAME);
> + if (cg_fd < 0)
> + return;
> +
> + if (create_netns())
> + goto done;
> +
> + skel = so_timestamping__open();
> + if (!ASSERT_OK_PTR(skel, "open skel"))
> + goto done;
> +
> + if (!ASSERT_OK(so_timestamping__load(skel), "load skel"))
> + goto done;
> +
> + skel->links.skops_sockopt =
> + bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
> + if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
> + goto done;
> +
> + test_tcp(AF_INET6);
> + test_tcp(AF_INET);
> +
> +done:
> + so_timestamping__destroy(skel);
> + close(cg_fd);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/so_timestamping.c b/tools/testing/selftests/bpf/progs/so_timestamping.c
> new file mode 100644
> index 000000000000..f64e94dbd70e
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/so_timestamping.c
> @@ -0,0 +1,135 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2024 Tencent */
> +
> +#include "vmlinux.h"
> +#include "bpf_tracing_net.h"
> +#include <bpf/bpf_core_read.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include "bpf_misc.h"
> +
> +#define SK_BPF_CB_FLAGS 1009
> +#define SK_BPF_CB_TX_TIMESTAMPING 1
> +
> +int nr_active;
> +int nr_passive;
> +int nr_sched;
> +int nr_txsw;
> +int nr_ack;
> +
> +struct sockopt_test {
> + int opt;
> + int new;
> +};
> +
> +static const struct sockopt_test sol_socket_tests[] = {
> + { .opt = SK_BPF_CB_FLAGS, .new = SK_BPF_CB_TX_TIMESTAMPING, },
> + { .opt = 0, },
> +};
> +
> +struct loop_ctx {
> + void *ctx;
> + struct sock *sk;
> +};
> +
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __type(key, u32);
> + __type(value, u64);
> + __uint(max_entries, 1024);
> +} hash_map SEC(".maps");
> +
> +static u64 delay_tolerance_nsec = 5000000;
If I count right, 5ms may not be a lot for the bpf CI and the test could become
flaky. Probably good enough to ensure the delay is larger than the previous one.
> +
> +static int bpf_test_sockopt_int(void *ctx, struct sock *sk,
> + const struct sockopt_test *t,
> + int level)
> +{
> + int new, opt;
> +
> + opt = t->opt;
> + new = t->new;
> +
> + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
> + return 1;
> +
> + return 0;
> +}
> +
> +static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
> +{
> + const struct sockopt_test *t;
> +
> + if (i >= ARRAY_SIZE(sol_socket_tests))
> + return 1;
> +
> + t = &sol_socket_tests[i];
> + if (!t->opt)
> + return 1;
> +
> + return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
> +}
> +
> +static int bpf_test_sockopt(void *ctx, struct sock *sk)
> +{
> + struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
> + int n;
> +
> + n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
> + if (n != ARRAY_SIZE(sol_socket_tests))
> + return -1;
> +
> + return 0;
> +}
> +
> +static bool bpf_test_delay(struct bpf_sock_ops *skops)
> +{
> + u64 timestamp = bpf_ktime_get_ns();
> + u32 seq = skops->args[2];
> + u64 *value;
> +
> + value = bpf_map_lookup_elem(&hash_map, &seq);
> + if (value && (timestamp - *value > delay_tolerance_nsec)) {
> + bpf_printk("time delay: %lu", timestamp - *value);
Please try not to printk in selftests. The bpf CI cannot interpret it
meaningfully or turn it into a PASS/FAIL signal.
> + return false;
> + }
> +
> + bpf_map_update_elem(&hash_map, &seq, &timestamp, BPF_ANY);
A nit.
*value = timestamp;
> + return true;
> +}
> +
> +SEC("sockops")
> +int skops_sockopt(struct bpf_sock_ops *skops)
> +{
> + struct bpf_sock *bpf_sk = skops->sk;
> + struct sock *sk;
> +
> + if (!bpf_sk)
> + return 1;
> +
> + sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
> + if (!sk)
> + return 1;
> +
> + switch (skops->op) {
> + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
> + nr_active += !bpf_test_sockopt(skops, sk);
> + break;
> + case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
> + if (bpf_test_delay(skops))
> + nr_sched += 1;
> + break;
> + case BPF_SOCK_OPS_TS_SW_OPT_CB:
> + if (bpf_test_delay(skops))
> + nr_txsw += 1;
> + break;
> + case BPF_SOCK_OPS_TS_ACK_OPT_CB:
> + if (bpf_test_delay(skops))
> + nr_ack += 1;
> + break;
The test is a good step forward. Thanks. Instead of one u64 as the map value, I
think it can be improved to make the test more realistic by recording the
individual delays, e.g. with the following map value:
struct delay_info {
u64 sendmsg_ns;
u32 sched_delay; /* SCHED_OPT_CB - sendmsg_ns */
u32 sw_snd_delay;
u32 ack_delay;
};
and I think a bpf callback during the sendmsg is still needed in the next respin.
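Roughly like this, untested and only to sketch the direction
(bpf_record_delay, the sendmsg hook and the field names are placeholders,
and the map value would have to change from u64 to the struct above):

static bool bpf_record_delay(struct bpf_sock_ops *skops)
{
	u64 now = bpf_ktime_get_ns();
	u32 tskey = skops->args[2];
	struct delay_info *info;

	info = bpf_map_lookup_elem(&hash_map, &tskey);
	if (!info || !info->sendmsg_ns)
		return false;

	/* delays are small enough in this test to fit in u32 */
	switch (skops->op) {
	case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
		info->sched_delay = now - info->sendmsg_ns;
		break;
	case BPF_SOCK_OPS_TS_SW_OPT_CB:
		info->sw_snd_delay = now - info->sendmsg_ns;
		break;
	case BPF_SOCK_OPS_TS_ACK_OPT_CB:
		info->ack_delay = now - info->sendmsg_ns;
		break;
	}

	return true;
}

where sendmsg_ns would be stored by the (not yet existing) sendmsg callback.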
> + }
> +
> + return 1;
> +}
> +
> +char _license[] SEC("license") = "GPL";
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature
2024-12-13 1:14 ` Martin KaFai Lau
@ 2024-12-13 16:02 ` Jason Xing
2024-12-14 0:14 ` Martin KaFai Lau
0 siblings, 1 reply; 48+ messages in thread
From: Jason Xing @ 2024-12-13 16:02 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Fri, Dec 13, 2024 at 9:14 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/7/24 9:38 AM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Only check if we pass those three key points after we enable the
> > bpf extension for so_timestamping. During each point, we can choose
> > whether to print the current timestamp.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > .../bpf/prog_tests/so_timestamping.c | 97 +++++++++++++
> > .../selftests/bpf/progs/so_timestamping.c | 135 ++++++++++++++++++
> > 2 files changed, 232 insertions(+)
> > create mode 100644 tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> > create mode 100644 tools/testing/selftests/bpf/progs/so_timestamping.c
> >
> > diff --git a/tools/testing/selftests/bpf/prog_tests/so_timestamping.c b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> > new file mode 100644
> > index 000000000000..c5978444f9c8
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> > @@ -0,0 +1,97 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2024 Tencent */
> > +
> > +#define _GNU_SOURCE
> > +#include <sched.h>
> > +#include <linux/socket.h>
> > +#include <linux/tls.h>
> > +#include <net/if.h>
> > +
> > +#include "test_progs.h"
> > +#include "cgroup_helpers.h"
> > +#include "network_helpers.h"
> > +
> > +#include "so_timestamping.skel.h"
> > +
> > +#define CG_NAME "/so-timestamping-test"
> > +
> > +static const char addr4_str[] = "127.0.0.1";
> > +static const char addr6_str[] = "::1";
> > +static struct so_timestamping *skel;
> > +static int cg_fd;
> > +
> > +static int create_netns(void)
> > +{
> > + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
> > + return -1;
> > +
> > + if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
> > + return -1;
> > +
> > + return 0;
> > +}
> > +
> > +static void test_tcp(int family)
> > +{
> > + struct so_timestamping__bss *bss = skel->bss;
> > + char buf[] = "testing testing";
> > + int sfd = -1, cfd = -1;
> > + int n;
> > +
> > + memset(bss, 0, sizeof(*bss));
> > +
> > + sfd = start_server(family, SOCK_STREAM,
> > + family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
> > + if (!ASSERT_GE(sfd, 0, "start_server"))
> > + goto out;
> > +
> > + cfd = connect_to_fd(sfd, 0);
> > + if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {
> > + close(sfd);
> > + goto out;
> > + }
> > +
> > + n = write(cfd, buf, sizeof(buf));
> > + if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
> > + goto out;
> > +
> > + ASSERT_EQ(bss->nr_active, 1, "nr_active");
> > + ASSERT_EQ(bss->nr_sched, 1, "nr_sched");
> > + ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw");
> > + ASSERT_EQ(bss->nr_ack, 1, "nr_ack");
> > +
> > +out:
> > + if (sfd >= 0)
> > + close(sfd);
> > + if (cfd >= 0)
> > + close(cfd);
> > +}
> > +
> > +void test_so_timestamping(void)
> > +{
> > + cg_fd = test__join_cgroup(CG_NAME);
> > + if (cg_fd < 0)
> > + return;
> > +
> > + if (create_netns())
> > + goto done;
> > +
> > + skel = so_timestamping__open();
> > + if (!ASSERT_OK_PTR(skel, "open skel"))
> > + goto done;
> > +
> > + if (!ASSERT_OK(so_timestamping__load(skel), "load skel"))
> > + goto done;
> > +
> > + skel->links.skops_sockopt =
> > + bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
> > + if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
> > + goto done;
> > +
> > + test_tcp(AF_INET6);
> > + test_tcp(AF_INET);
> > +
> > +done:
> > + so_timestamping__destroy(skel);
> > + close(cg_fd);
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/so_timestamping.c b/tools/testing/selftests/bpf/progs/so_timestamping.c
> > new file mode 100644
> > index 000000000000..f64e94dbd70e
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/so_timestamping.c
> > @@ -0,0 +1,135 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2024 Tencent */
> > +
> > +#include "vmlinux.h"
> > +#include "bpf_tracing_net.h"
> > +#include <bpf/bpf_core_read.h>
> > +#include <bpf/bpf_helpers.h>
> > +#include <bpf/bpf_tracing.h>
> > +#include "bpf_misc.h"
> > +
> > +#define SK_BPF_CB_FLAGS 1009
> > +#define SK_BPF_CB_TX_TIMESTAMPING 1
> > +
> > +int nr_active;
> > +int nr_passive;
> > +int nr_sched;
> > +int nr_txsw;
> > +int nr_ack;
> > +
> > +struct sockopt_test {
> > + int opt;
> > + int new;
> > +};
> > +
> > +static const struct sockopt_test sol_socket_tests[] = {
> > + { .opt = SK_BPF_CB_FLAGS, .new = SK_BPF_CB_TX_TIMESTAMPING, },
> > + { .opt = 0, },
> > +};
> > +
> > +struct loop_ctx {
> > + void *ctx;
> > + struct sock *sk;
> > +};
> > +
> > +struct {
> > + __uint(type, BPF_MAP_TYPE_HASH);
> > + __type(key, u32);
> > + __type(value, u64);
> > + __uint(max_entries, 1024);
> > +} hash_map SEC(".maps");
> > +
> > +static u64 delay_tolerance_nsec = 5000000;
>
> If I count right, 5ms may not a lot for the bpf CI and the test could become
> flaky. Probably good enough to ensure the delay is larger than the previous one.
You're right, initially I set 2ms, which made the test flaky. How about
20ms? We cannot ensure that each delta (calculated between two tx points)
is larger than the previous one.
>
> > +
> > +static int bpf_test_sockopt_int(void *ctx, struct sock *sk,
> > + const struct sockopt_test *t,
> > + int level)
> > +{
> > + int new, opt;
> > +
> > + opt = t->opt;
> > + new = t->new;
> > +
> > + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
> > + return 1;
> > +
> > + return 0;
> > +}
> > +
> > +static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
> > +{
> > + const struct sockopt_test *t;
> > +
> > + if (i >= ARRAY_SIZE(sol_socket_tests))
> > + return 1;
> > +
> > + t = &sol_socket_tests[i];
> > + if (!t->opt)
> > + return 1;
> > +
> > + return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
> > +}
> > +
> > +static int bpf_test_sockopt(void *ctx, struct sock *sk)
> > +{
> > + struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
> > + int n;
> > +
> > + n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
> > + if (n != ARRAY_SIZE(sol_socket_tests))
> > + return -1;
> > +
> > + return 0;
> > +}
> > +
> > +static bool bpf_test_delay(struct bpf_sock_ops *skops)
> > +{
> > + u64 timestamp = bpf_ktime_get_ns();
> > + u32 seq = skops->args[2];
> > + u64 *value;
> > +
> > + value = bpf_map_lookup_elem(&hash_map, &seq);
> > + if (value && (timestamp - *value > delay_tolerance_nsec)) {
> > + bpf_printk("time delay: %lu", timestamp - *value);
>
> Please try not to printk in selftests. The bpf CI cannot interpret it
> meaningfully and turn it into a PASS/FAIL signal.
All right.
>
> > + return false;
> > + }
> > +
> > + bpf_map_update_elem(&hash_map, &seq, &timestamp, BPF_ANY);
>
> A nit.
>
> *value = timestamp;
Will fix it.
>
> > + return true;
> > +}
> > +
> > +SEC("sockops")
> > +int skops_sockopt(struct bpf_sock_ops *skops)
> > +{
> > + struct bpf_sock *bpf_sk = skops->sk;
> > + struct sock *sk;
> > +
> > + if (!bpf_sk)
> > + return 1;
> > +
> > + sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
> > + if (!sk)
> > + return 1;
> > +
> > + switch (skops->op) {
> > + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
> > + nr_active += !bpf_test_sockopt(skops, sk);
> > + break;
> > + case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
> > + if (bpf_test_delay(skops))
> > + nr_sched += 1;
> > + break;
> > + case BPF_SOCK_OPS_TS_SW_OPT_CB:
> > + if (bpf_test_delay(skops))
> > + nr_txsw += 1;
> > + break;
> > + case BPF_SOCK_OPS_TS_ACK_OPT_CB:
> > + if (bpf_test_delay(skops))
> > + nr_ack += 1;
> > + break;
>
> The test is a good step forward. Thanks. Instead of one u64 as the map value, I
> think it can be improved to make the test more real to record the individual
> delay. e.g. the following map value:
>
> struct delay_info {
> u64 sendmsg_ns;
> u32 sched_delay; /* SCHED_OPT_CB - sendmsg_ns */
> u32 sw_snd_delay;
> u32 ack_delay;
> };
>
Good advice :)
> and I think a bpf callback during the sendmsg is still needed in the next respin.
Okay, I planned to introduce a new BPF_SOCK_OPS_TS_SENDMSG_OPT_CB
after this patchset gets merged. Since you've already asked, I will
surely follow :) Thanks.
>
> > + }
> > +
> > + return 1;
> > +}
> > +
> > +char _license[] SEC("license") = "GPL";
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature
2024-12-13 16:02 ` Jason Xing
@ 2024-12-14 0:14 ` Martin KaFai Lau
2024-12-14 0:45 ` Jason Xing
0 siblings, 1 reply; 48+ messages in thread
From: Martin KaFai Lau @ 2024-12-14 0:14 UTC (permalink / raw)
To: Jason Xing
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On 12/13/24 8:02 AM, Jason Xing wrote:
>>> +static u64 delay_tolerance_nsec = 5000000;
>>
>> If I count right, 5ms may not a lot for the bpf CI and the test could become
>> flaky. Probably good enough to ensure the delay is larger than the previous one.
>
> You're right, initially I set 2ms which make the test flaky. How about
> 20ms? We cannot ensure each delta (calculated between two tx points)
> is larger than the previous one.
or I was thinking the delay is always measured from sendmsg_ns.
Regardless of how the delay of a tx point is measured (always from
sendmsg_ns or from the previous tx point), it can also just check that the
measured delay is +ve or something like that instead of having a hard-coded
maximum delay here.
The following "struct delay_info" may not be the best. Feel free to adjust.
>> struct delay_info {
>> u64 sendmsg_ns;
>> u32 sched_delay; /* SCHED_OPT_CB - sendmsg_ns */
>> u32 sw_snd_delay;
>> u32 ack_delay;
>> };
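i.e. something in this direction, untested and only as a sketch (assuming
the map value becomes the struct above and sendmsg_ns has been filled in
first by the sendmsg callback):

	/* e.g. in the SW/ACK callbacks */
	u32 tskey = skops->args[2];
	struct delay_info *info;
	s64 delay;

	info = bpf_map_lookup_elem(&hash_map, &tskey);
	if (!info || !info->sendmsg_ns)
		return false;

	delay = bpf_ktime_get_ns() - info->sendmsg_ns;
	/* only require the delay to be positive, no absolute upper bound */
	if (delay <= 0)
		return false;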
^ permalink raw reply [flat|nested] 48+ messages in thread* Re: [PATCH net-next v4 11/11] bpf: add simple bpf tests in the tx path for so_timstamping feature
2024-12-14 0:14 ` Martin KaFai Lau
@ 2024-12-14 0:45 ` Jason Xing
0 siblings, 0 replies; 48+ messages in thread
From: Jason Xing @ 2024-12-14 0:45 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: davem, edumazet, kuba, pabeni, dsahern, willemdebruijn.kernel,
willemb, ast, daniel, andrii, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, bpf, netdev,
Jason Xing
On Sat, Dec 14, 2024 at 8:14 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/13/24 8:02 AM, Jason Xing wrote:
> >>> +static u64 delay_tolerance_nsec = 5000000;
> >>
> >> If I count right, 5ms may not a lot for the bpf CI and the test could become
> >> flaky. Probably good enough to ensure the delay is larger than the previous one.
> >
> > You're right, initially I set 2ms which make the test flaky. How about
> > 20ms? We cannot ensure each delta (calculated between two tx points)
> > is larger than the previous one.
>
> or I was thinking the delay is always measured from sendmsg_ns.
>
> Regardless, whatever way the delay of a tx point is measured from (always from
> sendmsg_ns or from the previous tx point), it can also just check the measured
> delay is +ve or something like that instead of having a hard coded maximum delay
> here.
That makes things simpler. Got it.
>
> The following "struct delay_info" may not be the best. Feel free to adjust.
Okay.
>
> >> struct delay_info {
> >> u64 sendmsg_ns;
> >> u32 sched_delay; /* SCHED_OPT_CB - sendmsg_ns */
> >> u32 sw_snd_delay;
> >> u32 ack_delay;
> >> };
>
^ permalink raw reply [flat|nested] 48+ messages in thread