* [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue
@ 2023-11-21 11:22 Pengcheng Yang
2023-11-21 11:22 ` [PATCH bpf-next v2 1/3] skmsg: Support to get the data length in ingress_msg Pengcheng Yang
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Pengcheng Yang @ 2023-11-21 11:22 UTC (permalink / raw)
To: John Fastabend, Jakub Sitnicki, Eric Dumazet, Jakub Kicinski, bpf,
netdev
Cc: Pengcheng Yang
When using skmsg redirect, the msg is queued in psock->ingress_msg,
and the application calling SIOCINQ ioctl will return a readable
length of 0, and we cannot track the data length of ingress_msg with
the ss tool.
In this patch set, we added the data length in ingress_msg to the
SIOCINQ ioctl and the rx_queue of tcp_diag.
v2:
- Add READ_ONCE()/WRITE_ONCE() on accesses to psock->msg_len
- Mask out the increment msg_len where it's not needed
Pengcheng Yang (3):
skmsg: Support to get the data length in ingress_msg
tcp: Add the data length in skmsg to SIOCINQ ioctl
tcp_diag: Add the data length in skmsg to rx_queue
include/linux/skmsg.h | 26 ++++++++++++++++++++++++--
net/core/skmsg.c | 10 +++++++++-
net/ipv4/tcp.c | 3 ++-
net/ipv4/tcp_diag.c | 2 ++
4 files changed, 37 insertions(+), 4 deletions(-)
--
2.38.1
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH bpf-next v2 1/3] skmsg: Support to get the data length in ingress_msg
2023-11-21 11:22 [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Pengcheng Yang
@ 2023-11-21 11:22 ` Pengcheng Yang
2023-12-05 0:23 ` John Fastabend
2023-11-21 11:22 ` [PATCH bpf-next v2 2/3] tcp: Add the data length in skmsg to SIOCINQ ioctl Pengcheng Yang
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Pengcheng Yang @ 2023-11-21 11:22 UTC (permalink / raw)
To: John Fastabend, Jakub Sitnicki, Eric Dumazet, Jakub Kicinski, bpf,
netdev
Cc: Pengcheng Yang
Currently msg is queued in ingress_msg of the target psock
on ingress redirect, without incrementing rcv_nxt. The size
that user can read includes the data in receive_queue and
ingress_msg. So we introduce sk_msg_queue_len() helper to
get the data length in ingress_msg.
Note that the msg_len does not include the data length of
msg from receive_queue via SK_PASS, as they increment rcv_nxt
when received.
Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
---
include/linux/skmsg.h | 26 ++++++++++++++++++++++++--
net/core/skmsg.c | 10 +++++++++-
2 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c1637515a8a4..423a5c28c606 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -47,6 +47,7 @@ struct sk_msg {
u32 apply_bytes;
u32 cork_bytes;
u32 flags;
+ bool ingress_self;
struct sk_buff *skb;
struct sock *sk_redir;
struct sock *sk;
@@ -82,6 +83,7 @@ struct sk_psock {
u32 apply_bytes;
u32 cork_bytes;
u32 eval;
+ u32 msg_len;
bool redir_ingress; /* undefined if sk_redir is null */
struct sk_msg *cork;
struct sk_psock_progs progs;
@@ -311,9 +313,11 @@ static inline void sk_psock_queue_msg(struct sk_psock *psock,
struct sk_msg *msg)
{
spin_lock_bh(&psock->ingress_lock);
- if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
list_add_tail(&msg->list, &psock->ingress_msg);
- else {
+ if (!msg->ingress_self)
+ WRITE_ONCE(psock->msg_len, psock->msg_len + msg->sg.size);
+ } else {
sk_msg_free(psock->sk, msg);
kfree(msg);
}
@@ -368,6 +372,24 @@ static inline void kfree_sk_msg(struct sk_msg *msg)
kfree(msg);
}
+static inline void sk_msg_queue_consumed(struct sk_psock *psock, u32 len)
+{
+ WRITE_ONCE(psock->msg_len, psock->msg_len - len);
+}
+
+static inline u32 sk_msg_queue_len(const struct sock *sk)
+{
+ struct sk_psock *psock;
+ u32 len = 0;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock)
+ len = READ_ONCE(psock->msg_len);
+ rcu_read_unlock();
+ return len;
+}
+
static inline void sk_psock_report_error(struct sk_psock *psock, int err)
{
struct sock *sk = psock->sk;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 6c31eefbd777..f46732a8ddc2 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -415,7 +415,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
struct iov_iter *iter = &msg->msg_iter;
int peek = flags & MSG_PEEK;
struct sk_msg *msg_rx;
- int i, copied = 0;
+ int i, copied = 0, msg_copied = 0;
msg_rx = sk_psock_peek_msg(psock);
while (copied != len) {
@@ -441,6 +441,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
}
copied += copy;
+ if (!msg_rx->ingress_self)
+ msg_copied += copy;
if (likely(!peek)) {
sge->offset += copy;
sge->length -= copy;
@@ -481,6 +483,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
msg_rx = sk_psock_peek_msg(psock);
}
out:
+ if (likely(!peek) && msg_copied)
+ sk_msg_queue_consumed(psock, msg_copied);
return copied;
}
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
@@ -602,6 +606,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
if (unlikely(!msg))
return -EAGAIN;
+ msg->ingress_self = true;
skb_set_owner_r(skb, sk);
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
if (err < 0)
@@ -771,9 +776,12 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
list_del(&msg->list);
+ if (!msg->ingress_self)
+ sk_msg_queue_consumed(psock, msg->sg.size);
sk_msg_free(psock->sk, msg);
kfree(msg);
}
+ WARN_ON_ONCE(READ_ONCE(psock->msg_len) != 0);
}
static void __sk_psock_zap_ingress(struct sk_psock *psock)
--
2.38.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH bpf-next v2 2/3] tcp: Add the data length in skmsg to SIOCINQ ioctl
2023-11-21 11:22 [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Pengcheng Yang
2023-11-21 11:22 ` [PATCH bpf-next v2 1/3] skmsg: Support to get the data length in ingress_msg Pengcheng Yang
@ 2023-11-21 11:22 ` Pengcheng Yang
2023-11-21 11:22 ` [PATCH bpf-next v2 3/3] tcp_diag: Add the data length in skmsg to rx_queue Pengcheng Yang
2023-11-22 15:16 ` [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Daniel Borkmann
3 siblings, 0 replies; 9+ messages in thread
From: Pengcheng Yang @ 2023-11-21 11:22 UTC (permalink / raw)
To: John Fastabend, Jakub Sitnicki, Eric Dumazet, Jakub Kicinski, bpf,
netdev
Cc: Pengcheng Yang
SIOCINQ ioctl returns the number of unread bytes of the receive
queue but does not include the ingress_msg queue. With the
sk_msg redirect, an application may get a value 0 if it calls
SIOCINQ ioctl before recv() to determine the readable size.
Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
---
net/ipv4/tcp.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3d3a24f79573..04da0684c397 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -267,6 +267,7 @@
#include <linux/errqueue.h>
#include <linux/static_key.h>
#include <linux/btf.h>
+#include <linux/skmsg.h>
#include <net/icmp.h>
#include <net/inet_common.h>
@@ -613,7 +614,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg)
return -EINVAL;
slow = lock_sock_fast(sk);
- answ = tcp_inq(sk);
+ answ = tcp_inq(sk) + sk_msg_queue_len(sk);
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
--
2.38.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH bpf-next v2 3/3] tcp_diag: Add the data length in skmsg to rx_queue
2023-11-21 11:22 [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Pengcheng Yang
2023-11-21 11:22 ` [PATCH bpf-next v2 1/3] skmsg: Support to get the data length in ingress_msg Pengcheng Yang
2023-11-21 11:22 ` [PATCH bpf-next v2 2/3] tcp: Add the data length in skmsg to SIOCINQ ioctl Pengcheng Yang
@ 2023-11-21 11:22 ` Pengcheng Yang
2023-11-22 15:16 ` [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Daniel Borkmann
3 siblings, 0 replies; 9+ messages in thread
From: Pengcheng Yang @ 2023-11-21 11:22 UTC (permalink / raw)
To: John Fastabend, Jakub Sitnicki, Eric Dumazet, Jakub Kicinski, bpf,
netdev
Cc: Pengcheng Yang
This helps us track the data length in ingress_msg
when using sk_msg redirect.
Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
---
net/ipv4/tcp_diag.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 01b50fa79189..b22382820a4b 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -11,6 +11,7 @@
#include <linux/inet_diag.h>
#include <linux/tcp.h>
+#include <linux/skmsg.h>
#include <net/netlink.h>
#include <net/tcp.h>
@@ -28,6 +29,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) -
READ_ONCE(tp->copied_seq), 0);
+ r->idiag_rqueue += sk_msg_queue_len(sk);
r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;
}
if (info)
--
2.38.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue
2023-11-21 11:22 [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Pengcheng Yang
` (2 preceding siblings ...)
2023-11-21 11:22 ` [PATCH bpf-next v2 3/3] tcp_diag: Add the data length in skmsg to rx_queue Pengcheng Yang
@ 2023-11-22 15:16 ` Daniel Borkmann
2023-11-23 11:20 ` Pengcheng Yang
3 siblings, 1 reply; 9+ messages in thread
From: Daniel Borkmann @ 2023-11-22 15:16 UTC (permalink / raw)
To: Pengcheng Yang, John Fastabend, Jakub Sitnicki, Eric Dumazet,
Jakub Kicinski, bpf, netdev
On 11/21/23 12:22 PM, Pengcheng Yang wrote:
> When using skmsg redirect, the msg is queued in psock->ingress_msg,
> and the application calling SIOCINQ ioctl will return a readable
> length of 0, and we cannot track the data length of ingress_msg with
> the ss tool.
>
> In this patch set, we added the data length in ingress_msg to the
> SIOCINQ ioctl and the rx_queue of tcp_diag.
>
> v2:
> - Add READ_ONCE()/WRITE_ONCE() on accesses to psock->msg_len
> - Mask out the increment msg_len where it's not needed
Please double check BPF CI, this series might be breaking sockmap selftests :
https://github.com/kernel-patches/bpf/actions/runs/6922624338/job/18829650043
[...]
Notice: Success: 501/13458, Skipped: 57, Failed: 1
Error: #281 sockmap_basic
Error: #281/16 sockmap_basic/sockmap skb_verdict fionread
Error: #281/16 sockmap_basic/sockmap skb_verdict fionread
test_sockmap_skb_verdict_fionread:PASS:open_and_load 0 nsec
test_sockmap_skb_verdict_fionread:PASS:bpf_prog_attach 0 nsec
test_sockmap_skb_verdict_fionread:PASS:socket_loopback(s) 0 nsec
test_sockmap_skb_verdict_fionread:PASS:create_socket_pairs(s) 0 nsec
test_sockmap_skb_verdict_fionread:PASS:bpf_map_update_elem(c1) 0 nsec
test_sockmap_skb_verdict_fionread:PASS:xsend(p0) 0 nsec
test_sockmap_skb_verdict_fionread:PASS:ioctl(FIONREAD) error 0 nsec
test_sockmap_skb_verdict_fionread:FAIL:ioctl(FIONREAD) unexpected ioctl(FIONREAD): actual 512 != expected 256
test_sockmap_skb_verdict_fionread:PASS:recv_timeout(c0) 0 nsec
Error: #281/18 sockmap_basic/sockmap skb_verdict msg_f_peek
Error: #281/18 sockmap_basic/sockmap skb_verdict msg_f_peek
test_sockmap_skb_verdict_peek:PASS:open_and_load 0 nsec
test_sockmap_skb_verdict_peek:PASS:bpf_prog_attach 0 nsec
test_sockmap_skb_verdict_peek:PASS:socket_loopback(s) 0 nsec
test_sockmap_skb_verdict_peek:PASS:create_pairs(s) 0 nsec
test_sockmap_skb_verdict_peek:PASS:bpf_map_update_elem(c1) 0 nsec
test_sockmap_skb_verdict_peek:PASS:xsend(p1) 0 nsec
test_sockmap_skb_verdict_peek:PASS:recv(c1) 0 nsec
test_sockmap_skb_verdict_peek:PASS:ioctl(FIONREAD) error 0 nsec
test_sockmap_skb_verdict_peek:FAIL:after peek ioctl(FIONREAD) unexpected after peek ioctl(FIONREAD): actual 512 != expected 256
test_sockmap_skb_verdict_peek:PASS:recv(p0) 0 nsec
test_sockmap_skb_verdict_peek:PASS:ioctl(FIONREAD) error 0 nsec
test_sockmap_skb_verdict_peek:PASS:after read ioctl(FIONREAD) 0 nsec
Test Results:
bpftool: PASS
test_progs-no_alu32: FAIL (returned 1)
shutdown: CLEAN
Error: Process completed with exit code 1.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue
2023-11-22 15:16 ` [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Daniel Borkmann
@ 2023-11-23 11:20 ` Pengcheng Yang
2023-11-23 13:31 ` Daniel Borkmann
0 siblings, 1 reply; 9+ messages in thread
From: Pengcheng Yang @ 2023-11-23 11:20 UTC (permalink / raw)
To: 'Daniel Borkmann', 'John Fastabend',
'Jakub Sitnicki', 'Eric Dumazet',
'Jakub Kicinski', bpf, netdev
Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 11/21/23 12:22 PM, Pengcheng Yang wrote:
> > When using skmsg redirect, the msg is queued in psock->ingress_msg,
> > and the application calling SIOCINQ ioctl will return a readable
> > length of 0, and we cannot track the data length of ingress_msg with
> > the ss tool.
> >
> > In this patch set, we added the data length in ingress_msg to the
> > SIOCINQ ioctl and the rx_queue of tcp_diag.
> >
> > v2:
> > - Add READ_ONCE()/WRITE_ONCE() on accesses to psock->msg_len
> > - Mask out the increment msg_len where it's not needed
>
> Please double check BPF CI, this series might be breaking sockmap selftests :
>
> https://github.com/kernel-patches/bpf/actions/runs/6922624338/job/18829650043
>
Is this a misunderstanding?
The selftests failure above were run on patch set v1 4 days ago, and this patch v2
is the fix for this case.
> [...]
> Notice: Success: 501/13458, Skipped: 57, Failed: 1
> Error: #281 sockmap_basic
> Error: #281/16 sockmap_basic/sockmap skb_verdict fionread
> Error: #281/16 sockmap_basic/sockmap skb_verdict fionread
> test_sockmap_skb_verdict_fionread:PASS:open_and_load 0 nsec
> test_sockmap_skb_verdict_fionread:PASS:bpf_prog_attach 0 nsec
> test_sockmap_skb_verdict_fionread:PASS:socket_loopback(s) 0 nsec
> test_sockmap_skb_verdict_fionread:PASS:create_socket_pairs(s) 0 nsec
> test_sockmap_skb_verdict_fionread:PASS:bpf_map_update_elem(c1) 0 nsec
> test_sockmap_skb_verdict_fionread:PASS:xsend(p0) 0 nsec
> test_sockmap_skb_verdict_fionread:PASS:ioctl(FIONREAD) error 0 nsec
> test_sockmap_skb_verdict_fionread:FAIL:ioctl(FIONREAD) unexpected ioctl(FIONREAD): actual 512 != expected 256
> test_sockmap_skb_verdict_fionread:PASS:recv_timeout(c0) 0 nsec
> Error: #281/18 sockmap_basic/sockmap skb_verdict msg_f_peek
> Error: #281/18 sockmap_basic/sockmap skb_verdict msg_f_peek
> test_sockmap_skb_verdict_peek:PASS:open_and_load 0 nsec
> test_sockmap_skb_verdict_peek:PASS:bpf_prog_attach 0 nsec
> test_sockmap_skb_verdict_peek:PASS:socket_loopback(s) 0 nsec
> test_sockmap_skb_verdict_peek:PASS:create_pairs(s) 0 nsec
> test_sockmap_skb_verdict_peek:PASS:bpf_map_update_elem(c1) 0 nsec
> test_sockmap_skb_verdict_peek:PASS:xsend(p1) 0 nsec
> test_sockmap_skb_verdict_peek:PASS:recv(c1) 0 nsec
> test_sockmap_skb_verdict_peek:PASS:ioctl(FIONREAD) error 0 nsec
> test_sockmap_skb_verdict_peek:FAIL:after peek ioctl(FIONREAD) unexpected after peek ioctl(FIONREAD): actual 512 != expected 256
> test_sockmap_skb_verdict_peek:PASS:recv(p0) 0 nsec
> test_sockmap_skb_verdict_peek:PASS:ioctl(FIONREAD) error 0 nsec
> test_sockmap_skb_verdict_peek:PASS:after read ioctl(FIONREAD) 0 nsec
> Test Results:
> bpftool: PASS
> test_progs-no_alu32: FAIL (returned 1)
> shutdown: CLEAN
> Error: Process completed with exit code 1.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue
2023-11-23 11:20 ` Pengcheng Yang
@ 2023-11-23 13:31 ` Daniel Borkmann
0 siblings, 0 replies; 9+ messages in thread
From: Daniel Borkmann @ 2023-11-23 13:31 UTC (permalink / raw)
To: Pengcheng Yang, 'John Fastabend',
'Jakub Sitnicki', 'Eric Dumazet',
'Jakub Kicinski', bpf, netdev
On 11/23/23 12:20 PM, Pengcheng Yang wrote:
> Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 11/21/23 12:22 PM, Pengcheng Yang wrote:
>>> When using skmsg redirect, the msg is queued in psock->ingress_msg,
>>> and the application calling SIOCINQ ioctl will return a readable
>>> length of 0, and we cannot track the data length of ingress_msg with
>>> the ss tool.
>>>
>>> In this patch set, we added the data length in ingress_msg to the
>>> SIOCINQ ioctl and the rx_queue of tcp_diag.
>>>
>>> v2:
>>> - Add READ_ONCE()/WRITE_ONCE() on accesses to psock->msg_len
>>> - Mask out the increment msg_len where it's not needed
>>
>> Please double check BPF CI, this series might be breaking sockmap selftests :
>>
>> https://github.com/kernel-patches/bpf/actions/runs/6922624338/job/18829650043
>
> Is this a misunderstanding?
> The selftests failure above were run on patch set v1 4 days ago, and this patch v2
> is the fix for this case.
Indeed looks like there were two CI emails on v2 as well, one pointing to
failure (which was also linked from patchwork), and one to success, both
pointing to:
https://patchwork.kernel.org/project/netdevbpf/list/?series=802821&state=*
Let me rerun, meanwhile, I placed it back to 'under review'.
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [PATCH bpf-next v2 1/3] skmsg: Support to get the data length in ingress_msg
2023-11-21 11:22 ` [PATCH bpf-next v2 1/3] skmsg: Support to get the data length in ingress_msg Pengcheng Yang
@ 2023-12-05 0:23 ` John Fastabend
2023-12-08 11:17 ` Pengcheng Yang
0 siblings, 1 reply; 9+ messages in thread
From: John Fastabend @ 2023-12-05 0:23 UTC (permalink / raw)
To: Pengcheng Yang, John Fastabend, Jakub Sitnicki, Eric Dumazet,
Jakub Kicinski, bpf, netdev
Cc: Pengcheng Yang
Pengcheng Yang wrote:
> Currently msg is queued in ingress_msg of the target psock
> on ingress redirect, without increment rcv_nxt. The size
> that user can read includes the data in receive_queue and
> ingress_msg. So we introduce sk_msg_queue_len() helper to
> get the data length in ingress_msg.
>
> Note that the msg_len does not include the data length of
> msg from receive_queue via SK_PASS, as they increment rcv_nxt
> when received.
>
> Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
> ---
> include/linux/skmsg.h | 26 ++++++++++++++++++++++++--
> net/core/skmsg.c | 10 +++++++++-
> 2 files changed, 33 insertions(+), 3 deletions(-)
>
This has two writers under different locks this looks insufficient
to ensure correctness of the counter. Likely the consume can be
moved into where the dequeue_msg happens? But, then its not always
accurate which might break some applications doing buffer sizing.
An example of this would be nginx.
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index c1637515a8a4..423a5c28c606 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -47,6 +47,7 @@ struct sk_msg {
> u32 apply_bytes;
> u32 cork_bytes;
> u32 flags;
> + bool ingress_self;
> struct sk_buff *skb;
> struct sock *sk_redir;
> struct sock *sk;
> @@ -82,6 +83,7 @@ struct sk_psock {
> u32 apply_bytes;
> u32 cork_bytes;
> u32 eval;
> + u32 msg_len;
> bool redir_ingress; /* undefined if sk_redir is null */
> struct sk_msg *cork;
> struct sk_psock_progs progs;
> @@ -311,9 +313,11 @@ static inline void sk_psock_queue_msg(struct sk_psock *psock,
> struct sk_msg *msg)
> {
> spin_lock_bh(&psock->ingress_lock);
> - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
> + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
> list_add_tail(&msg->list, &psock->ingress_msg);
> - else {
> + if (!msg->ingress_self)
> + WRITE_ONCE(psock->msg_len, psock->msg_len + msg->sg.size);
First writer here can be from
sk_psock_backlog()
mutex_lock(psock->work_mutex)
...
sk_psock_handle_skb()
sk_psock_skb_ingress()
sk_psock_skb_ingress_enqueue()
sk_psock_queue_msg()
spin_lock_bh(psock->ingress_lock)
WRITE_ONCE(...)
spin_unlock_bh()
> + } else {
> sk_msg_free(psock->sk, msg);
> kfree(msg);
> }
> @@ -368,6 +372,24 @@ static inline void kfree_sk_msg(struct sk_msg *msg)
> kfree(msg);
> }
>
> +static inline void sk_msg_queue_consumed(struct sk_psock *psock, u32 len)
> +{
> + WRITE_ONCE(psock->msg_len, psock->msg_len - len);
> +}
> +
> +static inline u32 sk_msg_queue_len(const struct sock *sk)
> +{
> + struct sk_psock *psock;
> + u32 len = 0;
> +
> + rcu_read_lock();
> + psock = sk_psock(sk);
> + if (psock)
> + len = READ_ONCE(psock->msg_len);
> + rcu_read_unlock();
> + return len;
> +}
> +
> static inline void sk_psock_report_error(struct sk_psock *psock, int err)
> {
> struct sock *sk = psock->sk;
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 6c31eefbd777..f46732a8ddc2 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -415,7 +415,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
> struct iov_iter *iter = &msg->msg_iter;
> int peek = flags & MSG_PEEK;
> struct sk_msg *msg_rx;
> - int i, copied = 0;
> + int i, copied = 0, msg_copied = 0;
>
> msg_rx = sk_psock_peek_msg(psock);
> while (copied != len) {
> @@ -441,6 +441,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
> }
>
> copied += copy;
> + if (!msg_rx->ingress_self)
> + msg_copied += copy;
> if (likely(!peek)) {
> sge->offset += copy;
> sge->length -= copy;
> @@ -481,6 +483,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
> msg_rx = sk_psock_peek_msg(psock);
> }
> out:
> + if (likely(!peek) && msg_copied)
> + sk_msg_queue_consumed(psock, msg_copied);
Second writer,
tcp_bpf_recvmsg_parser()
lock_sock(sk)
sk_msg_recvmsg()
sk_psock_peek_msg()
spin_lock_bh(ingress_lock); <- lock held from first writer.
msg = list...
spin_unlock_bh()
sk_psock_dequeue_msg(psock)
spin_lock_bh(ingress_lock)
msg = .... <- should call queue_consumed here
spin_unlock_bh()
out:
if (likely(!peek) && msg_copied)
sk_msg_queue_consumed(psock, msg_copied); <- here no lock?
It looks like you could move the queue_consumed up into the dequeue_msg,
but then you have some issue on partial reads I think? Basically the
IOCTL might return more bytes than are actually in the ingress queue.
Also it will look strange if the ioctl is called twice once before a read
and again after a read and the byte count doesn't change.
Maybe needs ingress queue lock wrapped around this queue consumed and
leave it where it is? Couple ideas anyways, but I don't think its
correct as is.
> return copied;
> }
> EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
> @@ -602,6 +606,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
>
> if (unlikely(!msg))
> return -EAGAIN;
> + msg->ingress_self = true;
> skb_set_owner_r(skb, sk);
> err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
> if (err < 0)
> @@ -771,9 +776,12 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
>
purge doesn't use the ingress_lock because its cancelled and syncd the
backlog and proto handlers have been swapped back to original handlers
so there is no longer any way to get at the ingress queue from the socket
side either.
> list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
> list_del(&msg->list);
> + if (!msg->ingress_self)
> + sk_msg_queue_consumed(psock, msg->sg.size);
> sk_msg_free(psock->sk, msg);
> kfree(msg);
> }
> + WARN_ON_ONCE(READ_ONCE(psock->msg_len) != 0);
> }
>
> static void __sk_psock_zap_ingress(struct sk_psock *psock)
> --
> 2.38.1
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/3] skmsg: Support to get the data length in ingress_msg
2023-12-05 0:23 ` John Fastabend
@ 2023-12-08 11:17 ` Pengcheng Yang
0 siblings, 0 replies; 9+ messages in thread
From: Pengcheng Yang @ 2023-12-08 11:17 UTC (permalink / raw)
To: 'John Fastabend', 'Jakub Sitnicki',
'Eric Dumazet', 'Jakub Kicinski', bpf, netdev
John Fastabend <john.fastabend@gmail.com> wrote:
>
> Pengcheng Yang wrote:
> > Currently msg is queued in ingress_msg of the target psock
> > on ingress redirect, without increment rcv_nxt. The size
> > that user can read includes the data in receive_queue and
> > ingress_msg. So we introduce sk_msg_queue_len() helper to
> > get the data length in ingress_msg.
> >
> > Note that the msg_len does not include the data length of
> > msg from receive_queue via SK_PASS, as they increment rcv_nxt
> > when received.
> >
> > Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
> > ---
> > include/linux/skmsg.h | 26 ++++++++++++++++++++++++--
> > net/core/skmsg.c | 10 +++++++++-
> > 2 files changed, 33 insertions(+), 3 deletions(-)
> >
>
> This has two writers under different locks this looks insufficient
> to ensure correctness of the counter. Likely the consume can be
> moved into where the dequeue_msg happens? But, then its not always
> accurate which might break some applications doing buffer sizing.
> An example of this would be nginx.
>
> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> > index c1637515a8a4..423a5c28c606 100644
> > --- a/include/linux/skmsg.h
> > +++ b/include/linux/skmsg.h
> > @@ -47,6 +47,7 @@ struct sk_msg {
> > u32 apply_bytes;
> > u32 cork_bytes;
> > u32 flags;
> > + bool ingress_self;
> > struct sk_buff *skb;
> > struct sock *sk_redir;
> > struct sock *sk;
> > @@ -82,6 +83,7 @@ struct sk_psock {
> > u32 apply_bytes;
> > u32 cork_bytes;
> > u32 eval;
> > + u32 msg_len;
> > bool redir_ingress; /* undefined if sk_redir is null */
> > struct sk_msg *cork;
> > struct sk_psock_progs progs;
> > @@ -311,9 +313,11 @@ static inline void sk_psock_queue_msg(struct sk_psock *psock,
> > struct sk_msg *msg)
> > {
> > spin_lock_bh(&psock->ingress_lock);
> > - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
> > + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
> > list_add_tail(&msg->list, &psock->ingress_msg);
> > - else {
> > + if (!msg->ingress_self)
> > + WRITE_ONCE(psock->msg_len, psock->msg_len + msg->sg.size);
>
> First writer here can be from
>
> sk_psock_backlog()
> mutex_lock(psock->work_mutex)
> ...
> sk_psock_handle_skb()
> sk_psock_skb_ingress()
> sk_psock_skb_ingress_enqueue()
> sk_psock_queue_msg()
> spin_lock_bh(psock->ingress_lock)
> WRITE_ONCE(...)
> spin_unlock_bh()
>
> > + } else {
> > sk_msg_free(psock->sk, msg);
> > kfree(msg);
> > }
> > @@ -368,6 +372,24 @@ static inline void kfree_sk_msg(struct sk_msg *msg)
> > kfree(msg);
> > }
> >
> > +static inline void sk_msg_queue_consumed(struct sk_psock *psock, u32 len)
> > +{
> > + WRITE_ONCE(psock->msg_len, psock->msg_len - len);
> > +}
> > +
> > +static inline u32 sk_msg_queue_len(const struct sock *sk)
> > +{
> > + struct sk_psock *psock;
> > + u32 len = 0;
> > +
> > + rcu_read_lock();
> > + psock = sk_psock(sk);
> > + if (psock)
> > + len = READ_ONCE(psock->msg_len);
> > + rcu_read_unlock();
> > + return len;
> > +}
> > +
> > static inline void sk_psock_report_error(struct sk_psock *psock, int err)
> > {
> > struct sock *sk = psock->sk;
> > diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> > index 6c31eefbd777..f46732a8ddc2 100644
> > --- a/net/core/skmsg.c
> > +++ b/net/core/skmsg.c
> > @@ -415,7 +415,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
> > struct iov_iter *iter = &msg->msg_iter;
> > int peek = flags & MSG_PEEK;
> > struct sk_msg *msg_rx;
> > - int i, copied = 0;
> > + int i, copied = 0, msg_copied = 0;
> >
> > msg_rx = sk_psock_peek_msg(psock);
> > while (copied != len) {
> > @@ -441,6 +441,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
> > }
> >
> > copied += copy;
> > + if (!msg_rx->ingress_self)
> > + msg_copied += copy;
> > if (likely(!peek)) {
> > sge->offset += copy;
> > sge->length -= copy;
> > @@ -481,6 +483,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
> > msg_rx = sk_psock_peek_msg(psock);
> > }
> > out:
> > + if (likely(!peek) && msg_copied)
> > + sk_msg_queue_consumed(psock, msg_copied);
>
> Second writer,
>
> tcp_bpf_recvmsg_parser()
> lock_sock(sk)
> sk_msg_recvmsg()
> sk_psock_peek_msg()
> spin_lock_bh(ingress_lock); <- lock held from first writer.
> msg = list...
> spin_unlock_bh()
> sk_psock_dequeue_msg(psock)
> spin_lock_bh(ingress_lock)
> msg = .... <- should call queue_consumed here
> spin_unlock_bh()
>
> out:
> if (likely(!peek) && msg_copied)
> sk_msg_queue_consumed(psock, msg_copied); <- here no lock?
>
>
> It looks like you could move the queue_consumed up into the dequeue_msg,
> but then you have some issue on partial reads I think? Basically the
> IOCTL might return more bytes than are actually in the ingress queue.
> Also it will look strange if the ioctl is called twice once before a read
> and again after a read and the byte count doesn't change.
>
Thanks john for pointing this out.
Yes, I tried to move queue_consumed into dequeue_msg without
making major changes to sk_msg_recvmsg, but failed.
> Maybe needs ingress queue lock wrapped around this queue consumed and
> leave it where it is? Couple ideas anyways, but I don't think its
> correct as is.
And, is it acceptable to just put the ingress_lock around the queue_consumed in
sk_msg_recvmsg? Like the following:
static inline void sk_msg_queue_consumed(struct sk_psock *psock, u32 len)
{
+ spin_lock_bh(&psock->ingress_lock);
WRITE_ONCE(psock->msg_len, psock->msg_len - len);
+ spin_unlock_bh(&psock->ingress_lock);
}
static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
{
struct sk_msg *msg, *tmp;
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
list_del(&msg->list);
if (!msg->ingress_self)
~ WRITE_ONCE(psock->msg_len, psock->msg_len - msg->sg.size);
sk_msg_free(psock->sk, msg);
kfree(msg);
}
WARN_ON_ONCE(READ_ONCE(psock->msg_len) != 0);
}
>
> > return copied;
> > }
> > EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
> > @@ -602,6 +606,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
> >
> > if (unlikely(!msg))
> > return -EAGAIN;
> > + msg->ingress_self = true;
> > skb_set_owner_r(skb, sk);
> > err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
> > if (err < 0)
> > @@ -771,9 +776,12 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
> >
>
> purge doesn't use the ingress_lock because its cancelled and syncd the
> backlog and proto handlers have been swapped back to original handlers
> so there is no longer any way to get at the ingress queue from the socket
> side either.
>
> > list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
> > list_del(&msg->list);
> > + if (!msg->ingress_self)
> > + sk_msg_queue_consumed(psock, msg->sg.size);
> > sk_msg_free(psock->sk, msg);
> > kfree(msg);
> > }
> > + WARN_ON_ONCE(READ_ONCE(psock->msg_len) != 0);
> > }
> >
> > static void __sk_psock_zap_ingress(struct sk_psock *psock)
> > --
> > 2.38.1
> >
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2023-12-08 11:17 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-11-21 11:22 [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Pengcheng Yang
2023-11-21 11:22 ` [PATCH bpf-next v2 1/3] skmsg: Support to get the data length in ingress_msg Pengcheng Yang
2023-12-05 0:23 ` John Fastabend
2023-12-08 11:17 ` Pengcheng Yang
2023-11-21 11:22 ` [PATCH bpf-next v2 2/3] tcp: Add the data length in skmsg to SIOCINQ ioctl Pengcheng Yang
2023-11-21 11:22 ` [PATCH bpf-next v2 3/3] tcp_diag: Add the data length in skmsg to rx_queue Pengcheng Yang
2023-11-22 15:16 ` [PATCH bpf-next v2 0/3] skmsg: Add the data length in skmsg to SIOCINQ ioctl and rx_queue Daniel Borkmann
2023-11-23 11:20 ` Pengcheng Yang
2023-11-23 13:31 ` Daniel Borkmann
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).