* [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
2021-06-25 20:04 [PATCH bpf-next 0/8] bpf: Allow bpf tcp iter to do bpf_setsockopt Martin KaFai Lau
@ 2021-06-25 20:05 ` Martin KaFai Lau
2021-06-29 17:27 ` Yonghong Song
0 siblings, 1 reply; 8+ messages in thread
From: Martin KaFai Lau @ 2021-06-25 20:05 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Daniel Borkmann, Eric Dumazet, kernel-team,
Neal Cardwell, netdev, Yonghong Song, Yuchung Cheng
This patch does batching and lock_sock for the bpf tcp iter.
It does not affect the proc fs iteration.
With bpf-tcp-cc, new algo rollout happens more often. Instead of
restarting the application to pick up the new tcp-cc, the next patch
will allow bpf iter with CAP_NET_ADMIN to do setsockopt(TCP_CONGESTION).
This requires locking the sock.
Also, unlike the proc iteration (cat /proc/net/tcp[6]), the bpf iter
can inspect all fields of a tcp_sock. It will be useful to have a
consistent view on some of the fields (e.g. the ones reported in
tcp_get_info() that also acquires the sock lock).
Double lock: locking the bucket first and then locking the sock could
lead to deadlock. This patch takes a batching approach similar to
inet_diag. While holding the bucket lock, it batches a number of sockets
into an array first and then unlock the bucket. Before doing show(),
it then calls lock_sock_fast().
In a machine with ~400k connections, the maximum number of
sk in a bucket of the established hashtable is 7. 0.02% of
the established connections fall into this bucket size.
For the listen hash (port+addr lhash2), the bucket is usually also very
small, except for the SO_REUSEPORT use case in which userspace could
have one SO_REUSEPORT socket per thread.
While batching is used, it can also minimize the chance of missing a
sock in the setsockopt use case if the whole bucket is batched.
This patch will start with a batch array of INIT_BATCH_SZ (16) entries,
which will be enough for the most common cases. bpf_iter_tcp_batch()
will try to realloc to a larger array to handle the exceptional case
(e.g. the SO_REUSEPORT case in the lhash2).
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
net/ipv4/tcp_ipv4.c | 236 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 230 insertions(+), 6 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0d851289a89e..856144d33f52 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2687,6 +2687,15 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
}
#ifdef CONFIG_BPF_SYSCALL
+struct bpf_tcp_iter_state {
+ struct tcp_iter_state state;
+ unsigned int cur_sk;
+ unsigned int end_sk;
+ unsigned int max_sk;
+ struct sock **batch;
+ bool st_bucket_done;
+};
+
struct bpf_iter__tcp {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct sock_common *, sk_common);
@@ -2705,16 +2714,203 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
return bpf_iter_run_prog(prog, &ctx);
}
+static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
+{
+ while (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++]);
+}
+
+static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
+ unsigned int new_batch_sz)
+{
+ struct sock **new_batch;
+
+ new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);
+ if (!new_batch)
+ return -ENOMEM;
+
+ bpf_iter_tcp_put_batch(iter);
+ kvfree(iter->batch);
+ iter->batch = new_batch;
+ iter->max_sk = new_batch_sz;
+
+ return 0;
+}
+
+static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
+ struct sock *start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct inet_connection_sock *icsk;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(start_sk);
+ iter->batch[iter->end_sk++] = start_sk;
+
+ icsk = inet_csk(start_sk);
+ inet_lhash2_for_each_icsk_continue(icsk) {
+ sk = (struct sock *)icsk;
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++] = sk;
+ }
+ expected++;
+ }
+ }
+ spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
+
+ return expected;
+}
+
+static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
+ struct sock *start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct hlist_nulls_node *node;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(start_sk);
+ iter->batch[iter->end_sk++] = start_sk;
+
+ sk = sk_nulls_next(start_sk);
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++] = sk;
+ }
+ expected++;
+ }
+ }
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+
+ return expected;
+}
+
+static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int expected;
+ bool resized = false;
+ struct sock *sk;
+
+ /* The st->bucket is done. Directly advance to the next
+ * bucket instead of having the tcp_seek_last_pos() to skip
+ * one by one in the current bucket and eventually find out
+ * it has to advance to the next bucket.
+ */
+ if (iter->st_bucket_done) {
+ st->offset = 0;
+ st->bucket++;
+ if (st->state == TCP_SEQ_STATE_LISTENING &&
+ st->bucket > tcp_hashinfo.lhash2_mask) {
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ st->bucket = 0;
+ }
+ }
+
+again:
+ /* Get a new batch */
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+ iter->st_bucket_done = false;
+
+ sk = tcp_seek_last_pos(seq);
+ if (!sk)
+ return NULL; /* Done */
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ expected = bpf_iter_tcp_listening_batch(seq, sk);
+ else
+ expected = bpf_iter_tcp_established_batch(seq, sk);
+
+ if (iter->end_sk == expected) {
+ iter->st_bucket_done = true;
+ return sk;
+ }
+
+ if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
+ resized = true;
+ goto again;
+ }
+
+ return sk;
+}
+
+static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* bpf iter does not support lseek, so it always
+ * continue from where it was stop()-ped.
+ */
+ if (*pos)
+ return bpf_iter_tcp_batch(seq);
+
+ return SEQ_START_TOKEN;
+}
+
+static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk;
+
+ /* Whenever seq_next() is called, the iter->cur_sk is
+ * done with seq_show(), so advance to the next sk in
+ * the batch.
+ */
+ if (iter->cur_sk < iter->end_sk) {
+ /* Keeping st->num consistent in tcp_iter_state.
+ * bpf_iter_tcp does not use st->num.
+ * meta.seq_num is used instead.
+ */
+ st->num++;
+ /* Move st->offset to the next sk in the bucket such that
+ * the future start() will resume at st->offset in
+ * st->bucket. See tcp_seek_last_pos().
+ */
+ st->offset++;
+ sock_put(iter->batch[iter->cur_sk++]);
+ }
+
+ if (iter->cur_sk < iter->end_sk)
+ sk = iter->batch[iter->cur_sk];
+ else
+ sk = bpf_iter_tcp_batch(seq);
+
+ ++*pos;
+ /* Keeping st->last_pos consistent in tcp_iter_state.
+ * bpf iter does not do lseek, so st->last_pos always equals to *pos.
+ */
+ st->last_pos = *pos;
+ return sk;
+}
+
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
struct sock *sk = v;
+ bool slow;
uid_t uid;
+ int ret;
if (v == SEQ_START_TOKEN)
return 0;
+ if (sk_fullsock(sk))
+ slow = lock_sock_fast(sk);
+
+ if (unlikely(sk_unhashed(sk))) {
+ ret = SEQ_SKIP;
+ goto unlock;
+ }
+
if (sk->sk_state == TCP_TIME_WAIT) {
uid = 0;
} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
@@ -2728,11 +2924,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
- return tcp_prog_seq_show(prog, &meta, v, uid);
+ ret = tcp_prog_seq_show(prog, &meta, v, uid);
+
+unlock:
+ if (sk_fullsock(sk))
+ unlock_sock_fast(sk, slow);
+ return ret;
+
}
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
+ struct bpf_tcp_iter_state *iter = seq->private;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
@@ -2743,13 +2946,16 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
(void)tcp_prog_seq_show(prog, &meta, v, 0);
}
- tcp_seq_stop(seq, v);
+ if (iter->cur_sk < iter->end_sk) {
+ bpf_iter_tcp_put_batch(iter);
+ iter->st_bucket_done = false;
+ }
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
.show = bpf_iter_tcp_seq_show,
- .start = tcp_seq_start,
- .next = tcp_seq_next,
+ .start = bpf_iter_tcp_seq_start,
+ .next = bpf_iter_tcp_seq_next,
.stop = bpf_iter_tcp_seq_stop,
};
#endif
@@ -3017,21 +3223,39 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
struct sock_common *sk_common, uid_t uid)
+#define INIT_BATCH_SZ 16
+
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
- return bpf_iter_init_seq_net(priv_data, aux);
+ struct bpf_tcp_iter_state *iter = priv_data;
+ int err;
+
+ err = bpf_iter_init_seq_net(priv_data, aux);
+ if (err)
+ return err;
+
+ err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
+ if (err) {
+ bpf_iter_fini_seq_net(priv_data);
+ return err;
+ }
+
+ return 0;
}
static void bpf_iter_fini_tcp(void *priv_data)
{
+ struct bpf_tcp_iter_state *iter = priv_data;
+
bpf_iter_fini_seq_net(priv_data);
+ kvfree(iter->batch);
}
static const struct bpf_iter_seq_info tcp_seq_info = {
.seq_ops = &bpf_iter_tcp_seq_ops,
.init_seq_private = bpf_iter_init_tcp,
.fini_seq_private = bpf_iter_fini_tcp,
- .seq_priv_size = sizeof(struct tcp_iter_state),
+ .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
};
static struct bpf_iter_reg tcp_reg_info = {
--
2.30.2
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
@ 2021-06-26 5:21 kernel test robot
0 siblings, 0 replies; 8+ messages in thread
From: kernel test robot @ 2021-06-26 5:21 UTC (permalink / raw)
To: kbuild
[-- Attachment #1: Type: text/plain, Size: 7767 bytes --]
CC: kbuild-all(a)lists.01.org
In-Reply-To: <20210625200523.726854-1-kafai@fb.com>
References: <20210625200523.726854-1-kafai@fb.com>
TO: Martin KaFai Lau <kafai@fb.com>
TO: bpf(a)vger.kernel.org
CC: Alexei Starovoitov <ast@kernel.org>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Eric Dumazet <edumazet@google.com>
CC: kernel-team(a)fb.com
CC: Neal Cardwell <ncardwell@google.com>
CC: netdev(a)vger.kernel.org
CC: Yonghong Song <yhs@fb.com>
CC: Yuchung Cheng <ycheng@google.com>
Hi Martin,
I love your patch! Perhaps something to improve:
[auto build test WARNING on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Martin-KaFai-Lau/bpf-Allow-bpf-tcp-iter-to-do-bpf_setsockopt/20210626-040650
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
:::::: branch date: 9 hours ago
:::::: commit date: 9 hours ago
config: x86_64-randconfig-s031-20210622 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.3-341-g8af24329-dirty
# https://github.com/0day-ci/linux/commit/5f78445efda4708980f0d1cb4c59a35000205232
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Martin-KaFai-Lau/bpf-Allow-bpf-tcp-iter-to-do-bpf_setsockopt/20210626-040650
git checkout 5f78445efda4708980f0d1cb4c59a35000205232
# save the attached .config to linux build tree
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' W=1 ARCH=x86_64
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
sparse warnings: (new ones prefixed by >>)
net/ipv4/tcp_ipv4.c:3084:41: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const *data @@ got struct tcp_congestion_ops const [noderef] __rcu *tcp_congestion_control @@
net/ipv4/tcp_ipv4.c:3084:41: sparse: expected void const *data
net/ipv4/tcp_ipv4.c:3084:41: sparse: got struct tcp_congestion_ops const [noderef] __rcu *tcp_congestion_control
net/ipv4/tcp_ipv4.c:3193:45: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const *data @@ got struct tcp_congestion_ops const [noderef] __rcu *extern [addressable] [toplevel] tcp_congestion_control @@
net/ipv4/tcp_ipv4.c:3193:45: sparse: expected void const *data
net/ipv4/tcp_ipv4.c:3193:45: sparse: got struct tcp_congestion_ops const [noderef] __rcu *extern [addressable] [toplevel] tcp_congestion_control
net/ipv4/tcp_ipv4.c:3197:50: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct tcp_congestion_ops const [noderef] __rcu *tcp_congestion_control @@ got struct tcp_congestion_ops * @@
net/ipv4/tcp_ipv4.c:3197:50: sparse: expected struct tcp_congestion_ops const [noderef] __rcu *tcp_congestion_control
net/ipv4/tcp_ipv4.c:3197:50: sparse: got struct tcp_congestion_ops *
net/ipv4/tcp_ipv4.c:1619:25: sparse: sparse: context imbalance in 'tcp_v4_syn_recv_sock' - unexpected unlock
net/ipv4/tcp_ipv4.c:1893:17: sparse: sparse: context imbalance in 'tcp_add_backlog' - unexpected unlock
net/ipv4/tcp_ipv4.c:2125:21: sparse: sparse: context imbalance in 'tcp_v4_rcv' - different lock contexts for basic block
net/ipv4/tcp_ipv4.c:2294:13: sparse: sparse: context imbalance in 'listening_get_first' - wrong count at exit
net/ipv4/tcp_ipv4.c:2342:9: sparse: sparse: context imbalance in 'listening_get_next' - unexpected unlock
net/ipv4/tcp_ipv4.c:2373:13: sparse: sparse: context imbalance in 'established_get_first' - wrong count at exit
net/ipv4/tcp_ipv4.c:2414:40: sparse: sparse: context imbalance in 'established_get_next' - unexpected unlock
net/ipv4/tcp_ipv4.c:2545:36: sparse: sparse: context imbalance in 'tcp_seq_stop' - unexpected unlock
net/ipv4/tcp_ipv4.c:2763:20: sparse: sparse: context imbalance in 'bpf_iter_tcp_listening_batch' - unexpected unlock
net/ipv4/tcp_ipv4.c:2790:40: sparse: sparse: context imbalance in 'bpf_iter_tcp_established_batch' - unexpected unlock
>> net/ipv4/tcp_ipv4.c:2932:9: sparse: sparse: context imbalance in 'bpf_iter_tcp_seq_show' - different lock contexts for basic block
net/ipv4/tcp_ipv4.c:3085:41: sparse: sparse: dereference of noderef expression
net/ipv4/tcp_ipv4.c:3085:41: sparse: sparse: dereference of noderef expression
net/ipv4/tcp_ipv4.c:3194:45: sparse: sparse: dereference of noderef expression
net/ipv4/tcp_ipv4.c:3194:45: sparse: sparse: dereference of noderef expression
vim +/bpf_iter_tcp_seq_show +2932 net/ipv4/tcp_ipv4.c
5f78445efda470 Martin KaFai Lau 2021-06-25 2893
52d87d5f6418ba Yonghong Song 2020-06-23 2894 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
52d87d5f6418ba Yonghong Song 2020-06-23 2895 {
52d87d5f6418ba Yonghong Song 2020-06-23 2896 struct bpf_iter_meta meta;
52d87d5f6418ba Yonghong Song 2020-06-23 2897 struct bpf_prog *prog;
52d87d5f6418ba Yonghong Song 2020-06-23 2898 struct sock *sk = v;
5f78445efda470 Martin KaFai Lau 2021-06-25 2899 bool slow;
52d87d5f6418ba Yonghong Song 2020-06-23 2900 uid_t uid;
5f78445efda470 Martin KaFai Lau 2021-06-25 2901 int ret;
52d87d5f6418ba Yonghong Song 2020-06-23 2902
52d87d5f6418ba Yonghong Song 2020-06-23 2903 if (v == SEQ_START_TOKEN)
52d87d5f6418ba Yonghong Song 2020-06-23 2904 return 0;
52d87d5f6418ba Yonghong Song 2020-06-23 2905
5f78445efda470 Martin KaFai Lau 2021-06-25 2906 if (sk_fullsock(sk))
5f78445efda470 Martin KaFai Lau 2021-06-25 2907 slow = lock_sock_fast(sk);
5f78445efda470 Martin KaFai Lau 2021-06-25 2908
5f78445efda470 Martin KaFai Lau 2021-06-25 2909 if (unlikely(sk_unhashed(sk))) {
5f78445efda470 Martin KaFai Lau 2021-06-25 2910 ret = SEQ_SKIP;
5f78445efda470 Martin KaFai Lau 2021-06-25 2911 goto unlock;
5f78445efda470 Martin KaFai Lau 2021-06-25 2912 }
5f78445efda470 Martin KaFai Lau 2021-06-25 2913
52d87d5f6418ba Yonghong Song 2020-06-23 2914 if (sk->sk_state == TCP_TIME_WAIT) {
52d87d5f6418ba Yonghong Song 2020-06-23 2915 uid = 0;
52d87d5f6418ba Yonghong Song 2020-06-23 2916 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
52d87d5f6418ba Yonghong Song 2020-06-23 2917 const struct request_sock *req = v;
52d87d5f6418ba Yonghong Song 2020-06-23 2918
52d87d5f6418ba Yonghong Song 2020-06-23 2919 uid = from_kuid_munged(seq_user_ns(seq),
52d87d5f6418ba Yonghong Song 2020-06-23 2920 sock_i_uid(req->rsk_listener));
52d87d5f6418ba Yonghong Song 2020-06-23 2921 } else {
52d87d5f6418ba Yonghong Song 2020-06-23 2922 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
52d87d5f6418ba Yonghong Song 2020-06-23 2923 }
52d87d5f6418ba Yonghong Song 2020-06-23 2924
52d87d5f6418ba Yonghong Song 2020-06-23 2925 meta.seq = seq;
52d87d5f6418ba Yonghong Song 2020-06-23 2926 prog = bpf_iter_get_info(&meta, false);
5f78445efda470 Martin KaFai Lau 2021-06-25 2927 ret = tcp_prog_seq_show(prog, &meta, v, uid);
5f78445efda470 Martin KaFai Lau 2021-06-25 2928
5f78445efda470 Martin KaFai Lau 2021-06-25 2929 unlock:
5f78445efda470 Martin KaFai Lau 2021-06-25 2930 if (sk_fullsock(sk))
5f78445efda470 Martin KaFai Lau 2021-06-25 2931 unlock_sock_fast(sk, slow);
5f78445efda470 Martin KaFai Lau 2021-06-25 @2932 return ret;
5f78445efda470 Martin KaFai Lau 2021-06-25 2933
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org
[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 47145 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
@ 2021-06-26 14:42 kernel test robot
0 siblings, 0 replies; 8+ messages in thread
From: kernel test robot @ 2021-06-26 14:42 UTC (permalink / raw)
To: kbuild
[-- Attachment #1: Type: text/plain, Size: 4332 bytes --]
CC: kbuild-all(a)lists.01.org
In-Reply-To: <20210625200523.726854-1-kafai@fb.com>
References: <20210625200523.726854-1-kafai@fb.com>
TO: Martin KaFai Lau <kafai@fb.com>
TO: bpf(a)vger.kernel.org
CC: Alexei Starovoitov <ast@kernel.org>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Eric Dumazet <edumazet@google.com>
CC: kernel-team(a)fb.com
CC: Neal Cardwell <ncardwell@google.com>
CC: netdev(a)vger.kernel.org
CC: Yonghong Song <yhs@fb.com>
CC: Yuchung Cheng <ycheng@google.com>
Hi Martin,
I love your patch! Perhaps something to improve:
[auto build test WARNING on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Martin-KaFai-Lau/bpf-Allow-bpf-tcp-iter-to-do-bpf_setsockopt/20210626-040650
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
:::::: branch date: 19 hours ago
:::::: commit date: 19 hours ago
config: x86_64-randconfig-m001-20210625 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
smatch warnings:
net/ipv4/tcp_ipv4.c:2931 bpf_iter_tcp_seq_show() error: uninitialized symbol 'slow'.
vim +/slow +2931 net/ipv4/tcp_ipv4.c
5f78445efda470 Martin KaFai Lau 2021-06-25 2893
52d87d5f6418ba Yonghong Song 2020-06-23 2894 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
52d87d5f6418ba Yonghong Song 2020-06-23 2895 {
52d87d5f6418ba Yonghong Song 2020-06-23 2896 struct bpf_iter_meta meta;
52d87d5f6418ba Yonghong Song 2020-06-23 2897 struct bpf_prog *prog;
52d87d5f6418ba Yonghong Song 2020-06-23 2898 struct sock *sk = v;
5f78445efda470 Martin KaFai Lau 2021-06-25 2899 bool slow;
52d87d5f6418ba Yonghong Song 2020-06-23 2900 uid_t uid;
5f78445efda470 Martin KaFai Lau 2021-06-25 2901 int ret;
52d87d5f6418ba Yonghong Song 2020-06-23 2902
52d87d5f6418ba Yonghong Song 2020-06-23 2903 if (v == SEQ_START_TOKEN)
52d87d5f6418ba Yonghong Song 2020-06-23 2904 return 0;
52d87d5f6418ba Yonghong Song 2020-06-23 2905
5f78445efda470 Martin KaFai Lau 2021-06-25 2906 if (sk_fullsock(sk))
5f78445efda470 Martin KaFai Lau 2021-06-25 2907 slow = lock_sock_fast(sk);
5f78445efda470 Martin KaFai Lau 2021-06-25 2908
5f78445efda470 Martin KaFai Lau 2021-06-25 2909 if (unlikely(sk_unhashed(sk))) {
5f78445efda470 Martin KaFai Lau 2021-06-25 2910 ret = SEQ_SKIP;
5f78445efda470 Martin KaFai Lau 2021-06-25 2911 goto unlock;
5f78445efda470 Martin KaFai Lau 2021-06-25 2912 }
5f78445efda470 Martin KaFai Lau 2021-06-25 2913
52d87d5f6418ba Yonghong Song 2020-06-23 2914 if (sk->sk_state == TCP_TIME_WAIT) {
52d87d5f6418ba Yonghong Song 2020-06-23 2915 uid = 0;
52d87d5f6418ba Yonghong Song 2020-06-23 2916 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
52d87d5f6418ba Yonghong Song 2020-06-23 2917 const struct request_sock *req = v;
52d87d5f6418ba Yonghong Song 2020-06-23 2918
52d87d5f6418ba Yonghong Song 2020-06-23 2919 uid = from_kuid_munged(seq_user_ns(seq),
52d87d5f6418ba Yonghong Song 2020-06-23 2920 sock_i_uid(req->rsk_listener));
52d87d5f6418ba Yonghong Song 2020-06-23 2921 } else {
52d87d5f6418ba Yonghong Song 2020-06-23 2922 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
52d87d5f6418ba Yonghong Song 2020-06-23 2923 }
52d87d5f6418ba Yonghong Song 2020-06-23 2924
52d87d5f6418ba Yonghong Song 2020-06-23 2925 meta.seq = seq;
52d87d5f6418ba Yonghong Song 2020-06-23 2926 prog = bpf_iter_get_info(&meta, false);
5f78445efda470 Martin KaFai Lau 2021-06-25 2927 ret = tcp_prog_seq_show(prog, &meta, v, uid);
5f78445efda470 Martin KaFai Lau 2021-06-25 2928
5f78445efda470 Martin KaFai Lau 2021-06-25 2929 unlock:
5f78445efda470 Martin KaFai Lau 2021-06-25 2930 if (sk_fullsock(sk))
5f78445efda470 Martin KaFai Lau 2021-06-25 @2931 unlock_sock_fast(sk, slow);
5f78445efda470 Martin KaFai Lau 2021-06-25 2932 return ret;
5f78445efda470 Martin KaFai Lau 2021-06-25 2933
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org
[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 39072 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
2021-06-25 20:05 ` [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock Martin KaFai Lau
@ 2021-06-29 17:27 ` Yonghong Song
2021-06-29 17:44 ` Martin KaFai Lau
0 siblings, 1 reply; 8+ messages in thread
From: Yonghong Song @ 2021-06-29 17:27 UTC (permalink / raw)
To: Martin KaFai Lau, bpf
Cc: Alexei Starovoitov, Daniel Borkmann, Eric Dumazet, kernel-team,
Neal Cardwell, netdev, Yuchung Cheng
On 6/25/21 1:05 PM, Martin KaFai Lau wrote:
> This patch does batching and lock_sock for the bpf tcp iter.
> It does not affect the proc fs iteration.
>
> With bpf-tcp-cc, new algo rollout happens more often. Instead of
> restarting the application to pick up the new tcp-cc, the next patch
> will allow bpf iter with CAP_NET_ADMIN to do setsockopt(TCP_CONGESTION).
> This requires locking the sock.
>
> Also, unlike the proc iteration (cat /proc/net/tcp[6]), the bpf iter
> can inspect all fields of a tcp_sock. It will be useful to have a
> consistent view on some of the fields (e.g. the ones reported in
> tcp_get_info() that also acquires the sock lock).
>
> Double lock: locking the bucket first and then locking the sock could
> lead to deadlock. This patch takes a batching approach similar to
> inet_diag. While holding the bucket lock, it batches a number of sockets
> into an array first and then unlock the bucket. Before doing show(),
> it then calls lock_sock_fast().
>
> In a machine with ~400k connections, the maximum number of
> sk in a bucket of the established hashtable is 7. 0.02% of
> the established connections fall into this bucket size.
>
> For the listen hash (port+addr lhash2), the bucket is usually also very
> small, except for the SO_REUSEPORT use case in which userspace could
> have one SO_REUSEPORT socket per thread.
>
> While batching is used, it can also minimize the chance of missing a
> sock in the setsockopt use case if the whole bucket is batched.
> This patch will start with a batch array of INIT_BATCH_SZ (16) entries,
> which will be enough for the most common cases. bpf_iter_tcp_batch()
> will try to realloc to a larger array to handle the exceptional case
> (e.g. the SO_REUSEPORT case in the lhash2).
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
> net/ipv4/tcp_ipv4.c | 236 ++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 230 insertions(+), 6 deletions(-)
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 0d851289a89e..856144d33f52 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2687,6 +2687,15 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
> }
>
> #ifdef CONFIG_BPF_SYSCALL
> +struct bpf_tcp_iter_state {
> + struct tcp_iter_state state;
> + unsigned int cur_sk;
> + unsigned int end_sk;
> + unsigned int max_sk;
> + struct sock **batch;
> + bool st_bucket_done;
> +};
> +
> struct bpf_iter__tcp {
> __bpf_md_ptr(struct bpf_iter_meta *, meta);
> __bpf_md_ptr(struct sock_common *, sk_common);
> @@ -2705,16 +2714,203 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
> return bpf_iter_run_prog(prog, &ctx);
> }
>
> +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
> +{
> + while (iter->cur_sk < iter->end_sk)
> + sock_put(iter->batch[iter->cur_sk++]);
> +}
> +
> +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
> + unsigned int new_batch_sz)
> +{
> + struct sock **new_batch;
> +
> + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);
Since we return -ENOMEM below, should we have __GFP_NOWARN in kvmalloc
flags?
> + if (!new_batch)
> + return -ENOMEM;
> +
> + bpf_iter_tcp_put_batch(iter);
> + kvfree(iter->batch);
> + iter->batch = new_batch;
> + iter->max_sk = new_batch_sz;
> +
> + return 0;
> +}
> +
[...]
> +
> static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
> {
> struct bpf_iter_meta meta;
> struct bpf_prog *prog;
> struct sock *sk = v;
> + bool slow;
> uid_t uid;
> + int ret;
>
> if (v == SEQ_START_TOKEN)
> return 0;
>
> + if (sk_fullsock(sk))
> + slow = lock_sock_fast(sk);
> +
> + if (unlikely(sk_unhashed(sk))) {
> + ret = SEQ_SKIP;
> + goto unlock;
> + }
I am not a tcp expert. Maybe a dummy question.
Is it possible to do setsockopt() for listening socket?
What will happen if the listening sock is unhashed after the
above check?
> +
> if (sk->sk_state == TCP_TIME_WAIT) {
> uid = 0;
> } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
> @@ -2728,11 +2924,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
>
> meta.seq = seq;
> prog = bpf_iter_get_info(&meta, false);
> - return tcp_prog_seq_show(prog, &meta, v, uid);
> + ret = tcp_prog_seq_show(prog, &meta, v, uid);
> +
> +unlock:
> + if (sk_fullsock(sk))
> + unlock_sock_fast(sk, slow);
> + return ret;
> +
> }
>
> static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
> {
> + struct bpf_tcp_iter_state *iter = seq->private;
> struct bpf_iter_meta meta;
> struct bpf_prog *prog;
>
> @@ -2743,13 +2946,16 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
> (void)tcp_prog_seq_show(prog, &meta, v, 0);
> }
>
> - tcp_seq_stop(seq, v);
> + if (iter->cur_sk < iter->end_sk) {
> + bpf_iter_tcp_put_batch(iter);
> + iter->st_bucket_done = false;
> + }
> }
>
[...]
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
2021-06-29 17:27 ` Yonghong Song
@ 2021-06-29 17:44 ` Martin KaFai Lau
2021-06-29 17:57 ` Yonghong Song
0 siblings, 1 reply; 8+ messages in thread
From: Martin KaFai Lau @ 2021-06-29 17:44 UTC (permalink / raw)
To: Yonghong Song
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Eric Dumazet,
kernel-team, Neal Cardwell, netdev, Yuchung Cheng
On Tue, Jun 29, 2021 at 10:27:17AM -0700, Yonghong Song wrote:
[ ... ]
> > +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
> > + unsigned int new_batch_sz)
> > +{
> > + struct sock **new_batch;
> > +
> > + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);
>
> Since we return -ENOMEM below, should we have __GFP_NOWARN in kvmalloc
> flags?
will add in v2.
>
> > + if (!new_batch)
> > + return -ENOMEM;
> > +
> > + bpf_iter_tcp_put_batch(iter);
> > + kvfree(iter->batch);
> > + iter->batch = new_batch;
> > + iter->max_sk = new_batch_sz;
> > +
> > + return 0;
> > +}
> > +
> [...]
> > +
> > static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
> > {
> > struct bpf_iter_meta meta;
> > struct bpf_prog *prog;
> > struct sock *sk = v;
> > + bool slow;
> > uid_t uid;
> > + int ret;
> > if (v == SEQ_START_TOKEN)
> > return 0;
> > + if (sk_fullsock(sk))
> > + slow = lock_sock_fast(sk);
> > +
> > + if (unlikely(sk_unhashed(sk))) {
> > + ret = SEQ_SKIP;
> > + goto unlock;
> > + }
>
> I am not a tcp expert. Maybe a dummy question.
> Is it possible to do setsockopt() for listening socket?
> What will happen if the listening sock is unhashed after the
> above check?
It won't happen because the sk has been locked before doing the
unhashed check.
Thanks for the review.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
2021-06-29 17:44 ` Martin KaFai Lau
@ 2021-06-29 17:57 ` Yonghong Song
2021-06-29 18:06 ` Martin KaFai Lau
0 siblings, 1 reply; 8+ messages in thread
From: Yonghong Song @ 2021-06-29 17:57 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Eric Dumazet,
kernel-team, Neal Cardwell, netdev, Yuchung Cheng
On 6/29/21 10:44 AM, Martin KaFai Lau wrote:
> On Tue, Jun 29, 2021 at 10:27:17AM -0700, Yonghong Song wrote:
> [ ... ]
>
>>> +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
>>> + unsigned int new_batch_sz)
>>> +{
>>> + struct sock **new_batch;
>>> +
>>> + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);
>>
>> Since we return -ENOMEM below, should we have __GFP_NOWARN in kvmalloc
>> flags?
> will add in v2.
>
>>
>>> + if (!new_batch)
>>> + return -ENOMEM;
>>> +
>>> + bpf_iter_tcp_put_batch(iter);
>>> + kvfree(iter->batch);
>>> + iter->batch = new_batch;
>>> + iter->max_sk = new_batch_sz;
>>> +
>>> + return 0;
>>> +}
>>> +
>> [...]
>>> +
>>> static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
>>> {
>>> struct bpf_iter_meta meta;
>>> struct bpf_prog *prog;
>>> struct sock *sk = v;
>>> + bool slow;
>>> uid_t uid;
>>> + int ret;
>>> if (v == SEQ_START_TOKEN)
>>> return 0;
>>> + if (sk_fullsock(sk))
>>> + slow = lock_sock_fast(sk);
>>> +
>>> + if (unlikely(sk_unhashed(sk))) {
>>> + ret = SEQ_SKIP;
>>> + goto unlock;
>>> + }
>>
>> I am not a tcp expert. Maybe a dummy question.
>> Is it possible to do setsockopt() for listening socket?
>> What will happen if the listening sock is unhashed after the
>> above check?
> It won't happen because the sk has been locked before doing the
> unhashed check.
Ya, that is true. I guess I probably mean TCP_TIME_WAIT and
TCP_NEW_SYN_RECV sockets. We cannot do setsockopt() for
TCP_TIME_WAIT sockets since user space shouldn't be able
to access the socket any more.
But how about TCP_NEW_SYN_RECV sockets?
>
> Thanks for the review.
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
2021-06-29 17:57 ` Yonghong Song
@ 2021-06-29 18:06 ` Martin KaFai Lau
2021-06-29 18:55 ` Yonghong Song
0 siblings, 1 reply; 8+ messages in thread
From: Martin KaFai Lau @ 2021-06-29 18:06 UTC (permalink / raw)
To: Yonghong Song
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Eric Dumazet,
kernel-team, Neal Cardwell, netdev, Yuchung Cheng
On Tue, Jun 29, 2021 at 10:57:46AM -0700, Yonghong Song wrote:
>
>
> On 6/29/21 10:44 AM, Martin KaFai Lau wrote:
> > On Tue, Jun 29, 2021 at 10:27:17AM -0700, Yonghong Song wrote:
> > [ ... ]
> >
> > > > +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
> > > > + unsigned int new_batch_sz)
> > > > +{
> > > > + struct sock **new_batch;
> > > > +
> > > > + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);
> > >
> > > Since we return -ENOMEM below, should we have __GFP_NOWARN in kvmalloc
> > > flags?
> > will add in v2.
> >
> > >
> > > > + if (!new_batch)
> > > > + return -ENOMEM;
> > > > +
> > > > + bpf_iter_tcp_put_batch(iter);
> > > > + kvfree(iter->batch);
> > > > + iter->batch = new_batch;
> > > > + iter->max_sk = new_batch_sz;
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > [...]
> > > > +
> > > > static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
> > > > {
> > > > struct bpf_iter_meta meta;
> > > > struct bpf_prog *prog;
> > > > struct sock *sk = v;
> > > > + bool slow;
> > > > uid_t uid;
> > > > + int ret;
> > > > if (v == SEQ_START_TOKEN)
> > > > return 0;
> > > > + if (sk_fullsock(sk))
> > > > + slow = lock_sock_fast(sk);
> > > > +
> > > > + if (unlikely(sk_unhashed(sk))) {
> > > > + ret = SEQ_SKIP;
> > > > + goto unlock;
> > > > + }
> > >
> > > I am not a tcp expert. Maybe a dummy question.
> > > Is it possible to do setsockopt() for listening socket?
> > > What will happen if the listening sock is unhashed after the
> > > above check?
> > It won't happen because the sk has been locked before doing the
> > unhashed check.
>
> Ya, that is true. I guess I probably mean TCP_TIME_WAIT and
> TCP_NEW_SYN_RECV sockets. We cannot do setsockopt() for
> TCP_TIME_WAIT sockets since user space shouldn't be able
> to access the socket any more.
>
> But how about TCP_NEW_SYN_RECV sockets?
_bpf_setsockopt() will return -EINVAL for non fullsock.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
2021-06-29 18:06 ` Martin KaFai Lau
@ 2021-06-29 18:55 ` Yonghong Song
0 siblings, 0 replies; 8+ messages in thread
From: Yonghong Song @ 2021-06-29 18:55 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Eric Dumazet,
kernel-team, Neal Cardwell, netdev, Yuchung Cheng
On 6/29/21 11:06 AM, Martin KaFai Lau wrote:
> On Tue, Jun 29, 2021 at 10:57:46AM -0700, Yonghong Song wrote:
>>
>>
>> On 6/29/21 10:44 AM, Martin KaFai Lau wrote:
>>> On Tue, Jun 29, 2021 at 10:27:17AM -0700, Yonghong Song wrote:
>>> [ ... ]
>>>
>>>>> +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
>>>>> + unsigned int new_batch_sz)
>>>>> +{
>>>>> + struct sock **new_batch;
>>>>> +
>>>>> + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);
>>>>
>>>> Since we return -ENOMEM below, should we have __GFP_NOWARN in kvmalloc
>>>> flags?
>>> will add in v2.
>>>
>>>>
>>>>> + if (!new_batch)
>>>>> + return -ENOMEM;
>>>>> +
>>>>> + bpf_iter_tcp_put_batch(iter);
>>>>> + kvfree(iter->batch);
>>>>> + iter->batch = new_batch;
>>>>> + iter->max_sk = new_batch_sz;
>>>>> +
>>>>> + return 0;
>>>>> +}
>>>>> +
>>>> [...]
>>>>> +
>>>>> static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
>>>>> {
>>>>> struct bpf_iter_meta meta;
>>>>> struct bpf_prog *prog;
>>>>> struct sock *sk = v;
>>>>> + bool slow;
>>>>> uid_t uid;
>>>>> + int ret;
>>>>> if (v == SEQ_START_TOKEN)
>>>>> return 0;
>>>>> + if (sk_fullsock(sk))
>>>>> + slow = lock_sock_fast(sk);
>>>>> +
>>>>> + if (unlikely(sk_unhashed(sk))) {
>>>>> + ret = SEQ_SKIP;
>>>>> + goto unlock;
>>>>> + }
>>>>
>>>> I am not a tcp expert. Maybe a dummy question.
>>>> Is it possible to do setsockopt() for listening socket?
>>>> What will happen if the listening sock is unhashed after the
>>>> above check?
>>> It won't happen because the sk has been locked before doing the
>>> unhashed check.
>>
>> Ya, that is true. I guess I probably mean TCP_TIME_WAIT and
>> TCP_NEW_SYN_RECV sockets. We cannot do setsockopt() for
>> TCP_TIME_WAIT sockets since user space shouldn't be able
>> to access the socket any more.
>>
>> But how about TCP_NEW_SYN_RECV sockets?
> _bpf_setsockopt() will return -EINVAL for non fullsock.
That makes sense. I was wondering whether we could block calls to
bpf_setsockopt() for unsupported sockets outside the bpf program.
But indeed, letting the bpf program do the filtering in such cases
should be simpler.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2021-06-29 18:56 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-26 14:42 [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock kernel test robot
-- strict thread matches above, loose matches on Subject: below --
2021-06-26 5:21 kernel test robot
2021-06-25 20:04 [PATCH bpf-next 0/8] bpf: Allow bpf tcp iter to do bpf_setsockopt Martin KaFai Lau
2021-06-25 20:05 ` [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock Martin KaFai Lau
2021-06-29 17:27 ` Yonghong Song
2021-06-29 17:44 ` Martin KaFai Lau
2021-06-29 17:57 ` Yonghong Song
2021-06-29 18:06 ` Martin KaFai Lau
2021-06-29 18:55 ` Yonghong Song
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.