* [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets
@ 2024-10-10 17:48 Eric Dumazet
2024-10-10 17:48 ` [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
` (5 more replies)
0 siblings, 6 replies; 24+ messages in thread
From: Eric Dumazet @ 2024-10-10 17:48 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Martin KaFai Lau, Kuniyuki Iwashima, Neal Cardwell, Brian Vazquez,
netdev, eric.dumazet, Eric Dumazet
Currently, TCP can set skb->sk for a variety of transmit packets.
However, packets sent on behalf of a TIME_WAIT sockets do not
have an attached socket.
Same issue for RST packets.
We want to change this, in order to increase eBPF program
capabilities.
This is slightly risky, because various layers could
be confused by TIME_WAIT sockets showing up in skb->sk.
v3: also changed sk_const_to_full_sk() to address CI splat in XFRM
Link: https://netdev-3.bots.linux.dev/vmksft-nf-dbg/results/804321/21-conntrack-vrf-sh/stderr
v2: audited all sk_to_full_sk() users and addressed Martin feedback.
Eric Dumazet (5):
net: add TIME_WAIT logic to sk_to_full_sk()
net_sched: sch_fq: prepare for TIME_WAIT sockets
net: add skb_set_owner_edemux() helper
ipv6: tcp: give socket pointer to control skbs
ipv4: tcp: give socket pointer to control skbs
include/linux/bpf-cgroup.h | 2 +-
include/net/inet_sock.h | 8 ++++++--
include/net/ip.h | 3 ++-
include/net/sock.h | 19 +++++++++++++++++++
net/core/filter.c | 6 +-----
net/core/sock.c | 9 +++------
net/ipv4/ip_output.c | 5 ++++-
net/ipv4/tcp_ipv4.c | 4 ++--
net/ipv4/tcp_output.c | 2 +-
net/ipv6/tcp_ipv6.c | 3 +++
net/sched/sch_fq.c | 3 ++-
11 files changed, 44 insertions(+), 20 deletions(-)
--
2.47.0.rc1.288.g06298d1525-goog
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-10 17:48 [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
@ 2024-10-10 17:48 ` Eric Dumazet
2024-10-11 23:20 ` Kuniyuki Iwashima
` (2 more replies)
2024-10-10 17:48 ` [PATCH v3 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
` (4 subsequent siblings)
5 siblings, 3 replies; 24+ messages in thread
From: Eric Dumazet @ 2024-10-10 17:48 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Martin KaFai Lau, Kuniyuki Iwashima, Neal Cardwell, Brian Vazquez,
netdev, eric.dumazet, Eric Dumazet
TCP will soon attach TIME_WAIT sockets to some ACK and RST.
Make sure sk_to_full_sk() detects this and does not return
a non full socket.
v3: also changed sk_const_to_full_sk()
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/linux/bpf-cgroup.h | 2 +-
include/net/inet_sock.h | 8 ++++++--
net/core/filter.c | 6 +-----
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ce91d9b2acb9f8991150ceead4475b130bead438..f0f219271daf4afea2666c4d09fd4d1a8091f844 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
int __ret = 0; \
if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
typeof(sk) __sk = sk_to_full_sk(sk); \
- if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
+ if (__sk && __sk == skb_to_full_sk(skb) && \
cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
__ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
CGROUP_INET_EGRESS); \
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f01dd273bea69d2eaf7a1d28274d7f980942b78a..56d8bc5593d3dfffd5f94cf7c6383948881917df 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
- if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
+ if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
sk = inet_reqsk(sk)->rsk_listener;
+ if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
+ sk = NULL;
#endif
return sk;
}
@@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk)
static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
{
#ifdef CONFIG_INET
- if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
+ if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
sk = ((const struct request_sock *)sk)->rsk_listener;
+ if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
+ sk = NULL;
#endif
return sk;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..202c1d386e19599e9fc6e0a0d4a95986ba6d0ea8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
- sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
/* Ensure there is no need to bump sk2 refcnt */
@@ -6826,8 +6824,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
- sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
/* Ensure there is no need to bump sk2 refcnt */
@@ -7276,7 +7272,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
sk = sk_to_full_sk(sk);
- if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+ if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
return (unsigned long)sk;
return (unsigned long)NULL;
--
2.47.0.rc1.288.g06298d1525-goog
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v3 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets
2024-10-10 17:48 [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
2024-10-10 17:48 ` [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
@ 2024-10-10 17:48 ` Eric Dumazet
2024-10-11 23:25 ` Kuniyuki Iwashima
2024-10-14 14:05 ` Brian Vazquez
2024-10-10 17:48 ` [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
` (3 subsequent siblings)
5 siblings, 2 replies; 24+ messages in thread
From: Eric Dumazet @ 2024-10-10 17:48 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Martin KaFai Lau, Kuniyuki Iwashima, Neal Cardwell, Brian Vazquez,
netdev, eric.dumazet, Eric Dumazet
TCP stack is not attaching skb to TIME_WAIT sockets yet,
but we would like to allow this in the future.
Add sk_listener_or_tw() helper to detect the three states
that FQ needs to take care.
Like NEW_SYN_RECV, TIME_WAIT are not full sockets and
do not contain sk->sk_pacing_status, sk->sk_pacing_rate.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sock.h | 10 ++++++++++
net/sched/sch_fq.c | 3 ++-
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index b32f1424ecc52e4a299a207c029192475c1b6a65..703ec6aef927337f7ca6798ff3c3970529af53f9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2800,6 +2800,16 @@ static inline bool sk_listener(const struct sock *sk)
return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}
+/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT
+ * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE)
+ * TCP RST and ACK can be attached to TIME_WAIT.
+ */
+static inline bool sk_listener_or_tw(const struct sock *sk)
+{
+ return (1 << READ_ONCE(sk->sk_state)) &
+ (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT);
+}
+
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
int type);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index aeabf45c9200c4aea75fb6c63986e37eddfea5f9..a97638bef6da5be8a84cc572bf2372551f4b7f96 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -362,8 +362,9 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
* 3) We do not want to rate limit them (eg SYNFLOOD attack),
* especially if the listener set SO_MAX_PACING_RATE
* 4) We pretend they are orphaned
+ * TCP can also associate TIME_WAIT sockets with RST or ACK packets.
*/
- if (!sk || sk_listener(sk)) {
+ if (!sk || sk_listener_or_tw(sk)) {
unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
/* By forcing low order bit to 1, we make sure to not
--
2.47.0.rc1.288.g06298d1525-goog
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper
2024-10-10 17:48 [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
2024-10-10 17:48 ` [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
2024-10-10 17:48 ` [PATCH v3 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
@ 2024-10-10 17:48 ` Eric Dumazet
2024-10-11 23:29 ` Kuniyuki Iwashima
2024-10-14 14:20 ` Brian Vazquez
2024-10-10 17:48 ` [PATCH v3 net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
` (2 subsequent siblings)
5 siblings, 2 replies; 24+ messages in thread
From: Eric Dumazet @ 2024-10-10 17:48 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Martin KaFai Lau, Kuniyuki Iwashima, Neal Cardwell, Brian Vazquez,
netdev, eric.dumazet, Eric Dumazet
This can be used to attach a socket to an skb,
taking a reference on sk->sk_refcnt.
This helper might be a NOP if sk->sk_refcnt is zero.
Use it from tcp_make_synack().
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sock.h | 9 +++++++++
net/core/sock.c | 9 +++------
net/ipv4/tcp_output.c | 2 +-
3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 703ec6aef927337f7ca6798ff3c3970529af53f9..e5bb64ad92c769f3edb8c2dc72cafb336837cabb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1758,6 +1758,15 @@ void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);
+
+static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ }
+}
#else
#define sock_edemux sock_efree
#endif
diff --git a/net/core/sock.c b/net/core/sock.c
index 083d438d8b6faff60e2e3cf1f982eb306a923cf7..f8c0d4eda888cf190b87fb42e94eef4fb950bf1f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2592,14 +2592,11 @@ void __sock_wfree(struct sk_buff *skb)
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
- skb->sk = sk;
#ifdef CONFIG_INET
- if (unlikely(!sk_fullsock(sk))) {
- skb->destructor = sock_edemux;
- sock_hold(sk);
- return;
- }
+ if (unlikely(!sk_fullsock(sk)))
+ return skb_set_owner_edemux(skb, sk);
#endif
+ skb->sk = sk;
skb->destructor = sock_wfree;
skb_set_hash_from_sk(skb, sk);
/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1251510f0e58da6b6403d2097b498f3e4cb6d255..4cf64ed13609fdcb72b3858ca9e20a1e65bd3d94 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3731,7 +3731,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
switch (synack_type) {
case TCP_SYNACK_NORMAL:
- skb_set_owner_w(skb, req_to_sk(req));
+ skb_set_owner_edemux(skb, req_to_sk(req));
break;
case TCP_SYNACK_COOKIE:
/* Under synflood, we do not attach skb to a socket,
--
2.47.0.rc1.288.g06298d1525-goog
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v3 net-next 4/5] ipv6: tcp: give socket pointer to control skbs
2024-10-10 17:48 [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
` (2 preceding siblings ...)
2024-10-10 17:48 ` [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
@ 2024-10-10 17:48 ` Eric Dumazet
2024-10-11 23:32 ` Kuniyuki Iwashima
2024-10-14 14:22 ` Brian Vazquez
2024-10-10 17:48 ` [PATCH v3 net-next 5/5] ipv4: " Eric Dumazet
2024-10-15 1:00 ` [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets patchwork-bot+netdevbpf
5 siblings, 2 replies; 24+ messages in thread
From: Eric Dumazet @ 2024-10-10 17:48 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Martin KaFai Lau, Kuniyuki Iwashima, Neal Cardwell, Brian Vazquez,
netdev, eric.dumazet, Eric Dumazet
tcp_v6_send_response() send orphaned 'control packets'.
These are RST packets and also ACK packets sent from TIME_WAIT.
Some eBPF programs would prefer to have a meaningful skb->sk
pointer as much as possible.
This means that TCP can now attach TIME_WAIT sockets to outgoing
skbs.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv6/tcp_ipv6.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7634c0be6acbdb67bb378cc81bdbf184552d2afc..597920061a3a061a878bf0f7a1b03ac4898918a9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -967,6 +967,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
}
if (sk) {
+ /* unconstify the socket only to attach it to buff with care. */
+ skb_set_owner_edemux(buff, (struct sock *)sk);
+
if (sk->sk_state == TCP_TIME_WAIT)
mark = inet_twsk(sk)->tw_mark;
else
--
2.47.0.rc1.288.g06298d1525-goog
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v3 net-next 5/5] ipv4: tcp: give socket pointer to control skbs
2024-10-10 17:48 [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
` (3 preceding siblings ...)
2024-10-10 17:48 ` [PATCH v3 net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
@ 2024-10-10 17:48 ` Eric Dumazet
2024-10-11 23:41 ` Kuniyuki Iwashima
2024-10-14 14:23 ` Brian Vazquez
2024-10-15 1:00 ` [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets patchwork-bot+netdevbpf
5 siblings, 2 replies; 24+ messages in thread
From: Eric Dumazet @ 2024-10-10 17:48 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Martin KaFai Lau, Kuniyuki Iwashima, Neal Cardwell, Brian Vazquez,
netdev, eric.dumazet, Eric Dumazet
ip_send_unicast_reply() send orphaned 'control packets'.
These are RST packets and also ACK packets sent from TIME_WAIT.
Some eBPF programs would prefer to have a meaningful skb->sk
pointer as much as possible.
This means that TCP can now attach TIME_WAIT sockets to outgoing
skbs.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/ip.h | 3 ++-
net/ipv4/ip_output.c | 5 ++++-
net/ipv4/tcp_ipv4.c | 4 ++--
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index bab084df15677543b7400bb2832c0e83988884cb..4be0a6a603b2b5d5cfddc045a7d49d0d77be9570 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -288,7 +288,8 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
}
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
+ struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e5c55a95063dd8340f9a014102408e859b4eb755..0065b1996c947078bea210c9abe5c80fa0e0ab4f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1596,7 +1596,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
* Generic function to send a packet as reply to another packet.
* Used to send some TCP resets/acks so far.
*/
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
+ struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
@@ -1662,6 +1663,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
+ if (orig_sk)
+ skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
if (transmit_time)
nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
if (txhash)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 985028434f644c399e51d12ba8d9c2c5740dc6e1..9d3dd101ea713b14e13afe662baa49d21b3b716c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -907,7 +907,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
ctl_sk->sk_mark = 0;
ctl_sk->sk_priority = 0;
}
- ip_send_unicast_reply(ctl_sk,
+ ip_send_unicast_reply(ctl_sk, sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len,
@@ -1021,7 +1021,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
transmit_time = tcp_transmit_time(sk);
- ip_send_unicast_reply(ctl_sk,
+ ip_send_unicast_reply(ctl_sk, sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len,
--
2.47.0.rc1.288.g06298d1525-goog
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-10 17:48 ` [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
@ 2024-10-11 23:20 ` Kuniyuki Iwashima
2024-10-12 3:32 ` Martin KaFai Lau
2024-10-14 14:01 ` Brian Vazquez
2 siblings, 0 replies; 24+ messages in thread
From: Kuniyuki Iwashima @ 2024-10-11 23:20 UTC (permalink / raw)
To: edumazet
Cc: brianvv, davem, eric.dumazet, kuba, kuniyu, martin.lau, ncardwell,
netdev, pabeni
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2024 17:48:13 +0000
> TCP will soon attach TIME_WAIT sockets to some ACK and RST.
>
> Make sure sk_to_full_sk() detects this and does not return
> a non full socket.
>
> v3: also changed sk_const_to_full_sk()
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets
2024-10-10 17:48 ` [PATCH v3 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
@ 2024-10-11 23:25 ` Kuniyuki Iwashima
2024-10-14 14:05 ` Brian Vazquez
1 sibling, 0 replies; 24+ messages in thread
From: Kuniyuki Iwashima @ 2024-10-11 23:25 UTC (permalink / raw)
To: edumazet
Cc: brianvv, davem, eric.dumazet, kuba, kuniyu, martin.lau, ncardwell,
netdev, pabeni
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2024 17:48:14 +0000
> TCP stack is not attaching skb to TIME_WAIT sockets yet,
> but we would like to allow this in the future.
>
> Add sk_listener_or_tw() helper to detect the three states
> that FQ needs to take care.
>
> Like NEW_SYN_RECV, TIME_WAIT are not full sockets and
> do not contain sk->sk_pacing_status, sk->sk_pacing_rate.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper
2024-10-10 17:48 ` [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
@ 2024-10-11 23:29 ` Kuniyuki Iwashima
2024-10-14 14:20 ` Brian Vazquez
1 sibling, 0 replies; 24+ messages in thread
From: Kuniyuki Iwashima @ 2024-10-11 23:29 UTC (permalink / raw)
To: edumazet
Cc: brianvv, davem, eric.dumazet, kuba, kuniyu, martin.lau, ncardwell,
netdev, pabeni
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2024 17:48:15 +0000
> This can be used to attach a socket to an skb,
> taking a reference on sk->sk_refcnt.
>
> This helper might be a NOP if sk->sk_refcnt is zero.
>
> Use it from tcp_make_synack().
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 4/5] ipv6: tcp: give socket pointer to control skbs
2024-10-10 17:48 ` [PATCH v3 net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
@ 2024-10-11 23:32 ` Kuniyuki Iwashima
2024-10-14 14:22 ` Brian Vazquez
1 sibling, 0 replies; 24+ messages in thread
From: Kuniyuki Iwashima @ 2024-10-11 23:32 UTC (permalink / raw)
To: edumazet
Cc: brianvv, davem, eric.dumazet, kuba, kuniyu, martin.lau, ncardwell,
netdev, pabeni
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2024 17:48:16 +0000
> tcp_v6_send_response() send orphaned 'control packets'.
>
> These are RST packets and also ACK packets sent from TIME_WAIT.
>
> Some eBPF programs would prefer to have a meaningful skb->sk
> pointer as much as possible.
>
> This means that TCP can now attach TIME_WAIT sockets to outgoing
> skbs.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 5/5] ipv4: tcp: give socket pointer to control skbs
2024-10-10 17:48 ` [PATCH v3 net-next 5/5] ipv4: " Eric Dumazet
@ 2024-10-11 23:41 ` Kuniyuki Iwashima
2024-10-14 14:23 ` Brian Vazquez
1 sibling, 0 replies; 24+ messages in thread
From: Kuniyuki Iwashima @ 2024-10-11 23:41 UTC (permalink / raw)
To: edumazet
Cc: brianvv, davem, eric.dumazet, kuba, kuniyu, martin.lau, ncardwell,
netdev, pabeni
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2024 17:48:17 +0000
> ip_send_unicast_reply() send orphaned 'control packets'.
>
> These are RST packets and also ACK packets sent from TIME_WAIT.
>
> Some eBPF programs would prefer to have a meaningful skb->sk
> pointer as much as possible.
>
> This means that TCP can now attach TIME_WAIT sockets to outgoing
> skbs.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-10 17:48 ` [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
2024-10-11 23:20 ` Kuniyuki Iwashima
@ 2024-10-12 3:32 ` Martin KaFai Lau
2024-10-14 14:01 ` Brian Vazquez
2 siblings, 0 replies; 24+ messages in thread
From: Martin KaFai Lau @ 2024-10-12 3:32 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, Brian Vazquez, netdev,
eric.dumazet
On 10/10/24 10:48 AM, Eric Dumazet wrote:
> TCP will soon attach TIME_WAIT sockets to some ACK and RST.
>
> Make sure sk_to_full_sk() detects this and does not return
> a non full socket.
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-10 17:48 ` [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
2024-10-11 23:20 ` Kuniyuki Iwashima
2024-10-12 3:32 ` Martin KaFai Lau
@ 2024-10-14 14:01 ` Brian Vazquez
2024-10-14 14:27 ` Eric Dumazet
2 siblings, 1 reply; 24+ messages in thread
From: Brian Vazquez @ 2024-10-14 14:01 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
Thanks Eric for the patch series! I left some comments inline
On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
>
> TCP will soon attach TIME_WAIT sockets to some ACK and RST.
>
> Make sure sk_to_full_sk() detects this and does not return
> a non full socket.
>
> v3: also changed sk_const_to_full_sk()
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> include/linux/bpf-cgroup.h | 2 +-
> include/net/inet_sock.h | 8 ++++++--
> net/core/filter.c | 6 +-----
> 3 files changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index ce91d9b2acb9f8991150ceead4475b130bead438..f0f219271daf4afea2666c4d09fd4d1a8091f844 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
> int __ret = 0; \
> if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
> typeof(sk) __sk = sk_to_full_sk(sk); \
> - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
> + if (__sk && __sk == skb_to_full_sk(skb) && \
> cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
> __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
> CGROUP_INET_EGRESS); \
> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> index f01dd273bea69d2eaf7a1d28274d7f980942b78a..56d8bc5593d3dfffd5f94cf7c6383948881917df 100644
> --- a/include/net/inet_sock.h
> +++ b/include/net/inet_sock.h
> @@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
> static inline struct sock *sk_to_full_sk(struct sock *sk)
> {
> #ifdef CONFIG_INET
> - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> sk = inet_reqsk(sk)->rsk_listener;
> + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> + sk = NULL;
> #endif
> return sk;
> }
> @@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk)
> static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
> {
> #ifdef CONFIG_INET
> - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> sk = ((const struct request_sock *)sk)->rsk_listener;
> + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> + sk = NULL;
> #endif
> return sk;
> }
> diff --git a/net/core/filter.c b/net/core/filter.c
> index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..202c1d386e19599e9fc6e0a0d4a95986ba6d0ea8 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
> * sock refcnt is decremented to prevent a request_sock leak.
> */
> - if (!sk_fullsock(sk2))
> - sk2 = NULL;
IIUC, we still want the condition above since sk_to_full_sk can return
the request socket in which case the helper should return NULL, so we
still need the refcnt decrement?
> if (sk2 != sk) {
> sock_gen_put(sk);
> /* Ensure there is no need to bump sk2 refcnt */
> @@ -6826,8 +6824,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
> * sock refcnt is decremented to prevent a request_sock leak.
> */
> - if (!sk_fullsock(sk2))
> - sk2 = NULL;
Same as above.
> if (sk2 != sk) {
> sock_gen_put(sk);
> /* Ensure there is no need to bump sk2 refcnt */
> @@ -7276,7 +7272,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
> {
> sk = sk_to_full_sk(sk);
>
> - if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
> + if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
> return (unsigned long)sk;
>
> return (unsigned long)NULL;
> --
> 2.47.0.rc1.288.g06298d1525-goog
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets
2024-10-10 17:48 ` [PATCH v3 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
2024-10-11 23:25 ` Kuniyuki Iwashima
@ 2024-10-14 14:05 ` Brian Vazquez
1 sibling, 0 replies; 24+ messages in thread
From: Brian Vazquez @ 2024-10-14 14:05 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
>
> TCP stack is not attaching skb to TIME_WAIT sockets yet,
> but we would like to allow this in the future.
>
> Add sk_listener_or_tw() helper to detect the three states
> that FQ needs to take care.
>
> Like NEW_SYN_RECV, TIME_WAIT are not full sockets and
> do not contain sk->sk_pacing_status, sk->sk_pacing_rate.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Brian Vazquez <brianvv@google.com>
> ---
> include/net/sock.h | 10 ++++++++++
> net/sched/sch_fq.c | 3 ++-
> 2 files changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index b32f1424ecc52e4a299a207c029192475c1b6a65..703ec6aef927337f7ca6798ff3c3970529af53f9 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2800,6 +2800,16 @@ static inline bool sk_listener(const struct sock *sk)
> return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
> }
>
> +/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT
> + * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE)
> + * TCP RST and ACK can be attached to TIME_WAIT.
> + */
> +static inline bool sk_listener_or_tw(const struct sock *sk)
> +{
> + return (1 << READ_ONCE(sk->sk_state)) &
> + (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT);
> +}
> +
> void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
> int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
> int type);
> diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
> index aeabf45c9200c4aea75fb6c63986e37eddfea5f9..a97638bef6da5be8a84cc572bf2372551f4b7f96 100644
> --- a/net/sched/sch_fq.c
> +++ b/net/sched/sch_fq.c
> @@ -362,8 +362,9 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
> * 3) We do not want to rate limit them (eg SYNFLOOD attack),
> * especially if the listener set SO_MAX_PACING_RATE
> * 4) We pretend they are orphaned
> + * TCP can also associate TIME_WAIT sockets with RST or ACK packets.
> */
> - if (!sk || sk_listener(sk)) {
> + if (!sk || sk_listener_or_tw(sk)) {
> unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
>
> /* By forcing low order bit to 1, we make sure to not
> --
> 2.47.0.rc1.288.g06298d1525-goog
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper
2024-10-10 17:48 ` [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
2024-10-11 23:29 ` Kuniyuki Iwashima
@ 2024-10-14 14:20 ` Brian Vazquez
2024-10-14 14:23 ` Eric Dumazet
1 sibling, 1 reply; 24+ messages in thread
From: Brian Vazquez @ 2024-10-14 14:20 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
>
> This can be used to attach a socket to an skb,
> taking a reference on sk->sk_refcnt.
>
> This helper might be a NOP if sk->sk_refcnt is zero.
>
> Use it from tcp_make_synack().
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> include/net/sock.h | 9 +++++++++
> net/core/sock.c | 9 +++------
> net/ipv4/tcp_output.c | 2 +-
> 3 files changed, 13 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 703ec6aef927337f7ca6798ff3c3970529af53f9..e5bb64ad92c769f3edb8c2dc72cafb336837cabb 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1758,6 +1758,15 @@ void sock_efree(struct sk_buff *skb);
> #ifdef CONFIG_INET
> void sock_edemux(struct sk_buff *skb);
> void sock_pfree(struct sk_buff *skb);
> +
> +static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
> +{
> + skb_orphan(skb);
Is this skb_orphan(skb) needed? IIUC skb_set_owner_w is doing
skb_orphan too? and then calling this helper, but we do need the
skb_orphan is needed when called from the synack.
Can skb_set_owner_w try to orphan an skb twice?
> + if (refcount_inc_not_zero(&sk->sk_refcnt)) {
> + skb->sk = sk;
> + skb->destructor = sock_edemux;
> + }
> +}
> #else
> #define sock_edemux sock_efree
> #endif
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 083d438d8b6faff60e2e3cf1f982eb306a923cf7..f8c0d4eda888cf190b87fb42e94eef4fb950bf1f 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2592,14 +2592,11 @@ void __sock_wfree(struct sk_buff *skb)
> void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
> {
> skb_orphan(skb);
> - skb->sk = sk;
> #ifdef CONFIG_INET
> - if (unlikely(!sk_fullsock(sk))) {
> - skb->destructor = sock_edemux;
> - sock_hold(sk);
> - return;
> - }
> + if (unlikely(!sk_fullsock(sk)))
> + return skb_set_owner_edemux(skb, sk);
> #endif
> + skb->sk = sk;
> skb->destructor = sock_wfree;
> skb_set_hash_from_sk(skb, sk);
> /*
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 1251510f0e58da6b6403d2097b498f3e4cb6d255..4cf64ed13609fdcb72b3858ca9e20a1e65bd3d94 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3731,7 +3731,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>
> switch (synack_type) {
> case TCP_SYNACK_NORMAL:
> - skb_set_owner_w(skb, req_to_sk(req));
> + skb_set_owner_edemux(skb, req_to_sk(req));
> break;
> case TCP_SYNACK_COOKIE:
> /* Under synflood, we do not attach skb to a socket,
> --
> 2.47.0.rc1.288.g06298d1525-goog
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 4/5] ipv6: tcp: give socket pointer to control skbs
2024-10-10 17:48 ` [PATCH v3 net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
2024-10-11 23:32 ` Kuniyuki Iwashima
@ 2024-10-14 14:22 ` Brian Vazquez
1 sibling, 0 replies; 24+ messages in thread
From: Brian Vazquez @ 2024-10-14 14:22 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
Thanks for this patch! This indeed helps to have more info within a
bpf program which is extremely useful!
On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
>
> tcp_v6_send_response() send orphaned 'control packets'.
>
> These are RST packets and also ACK packets sent from TIME_WAIT.
>
> Some eBPF programs would prefer to have a meaningful skb->sk
> pointer as much as possible.
>
> This means that TCP can now attach TIME_WAIT sockets to outgoing
> skbs.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Brian Vazquez <brianvv@google.com>
> ---
> net/ipv6/tcp_ipv6.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 7634c0be6acbdb67bb378cc81bdbf184552d2afc..597920061a3a061a878bf0f7a1b03ac4898918a9 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -967,6 +967,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
> }
>
> if (sk) {
> + /* unconstify the socket only to attach it to buff with care. */
> + skb_set_owner_edemux(buff, (struct sock *)sk);
> +
> if (sk->sk_state == TCP_TIME_WAIT)
> mark = inet_twsk(sk)->tw_mark;
> else
> --
> 2.47.0.rc1.288.g06298d1525-goog
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper
2024-10-14 14:20 ` Brian Vazquez
@ 2024-10-14 14:23 ` Eric Dumazet
2024-10-14 14:34 ` Brian Vazquez
0 siblings, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2024-10-14 14:23 UTC (permalink / raw)
To: Brian Vazquez
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
On Mon, Oct 14, 2024 at 4:20 PM Brian Vazquez <brianvv@google.com> wrote:
>
> On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > This can be used to attach a socket to an skb,
> > taking a reference on sk->sk_refcnt.
> >
> > This helper might be a NOP if sk->sk_refcnt is zero.
> >
> > Use it from tcp_make_synack().
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> > include/net/sock.h | 9 +++++++++
> > net/core/sock.c | 9 +++------
> > net/ipv4/tcp_output.c | 2 +-
> > 3 files changed, 13 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 703ec6aef927337f7ca6798ff3c3970529af53f9..e5bb64ad92c769f3edb8c2dc72cafb336837cabb 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -1758,6 +1758,15 @@ void sock_efree(struct sk_buff *skb);
> > #ifdef CONFIG_INET
> > void sock_edemux(struct sk_buff *skb);
> > void sock_pfree(struct sk_buff *skb);
> > +
> > +static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
> > +{
> > + skb_orphan(skb);
>
> Is this skb_orphan(skb) needed? IIUC skb_set_owner_w is doing
> skb_orphan too? and then calling this helper, but we do need the
> skb_orphan is needed when called from the synack.
>
> Can skb_set_owner_w try to orphan an skb twice?
>
skb_orphan(skb) does nothing if there is nothing to do.
It is common practice to include it every time we are about to change
skb->destructor.
Otherwise we would have to add a WARN() or something to prevent future leaks.
Better safe than sorry :)
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 5/5] ipv4: tcp: give socket pointer to control skbs
2024-10-10 17:48 ` [PATCH v3 net-next 5/5] ipv4: " Eric Dumazet
2024-10-11 23:41 ` Kuniyuki Iwashima
@ 2024-10-14 14:23 ` Brian Vazquez
1 sibling, 0 replies; 24+ messages in thread
From: Brian Vazquez @ 2024-10-14 14:23 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
>
> ip_send_unicast_reply() send orphaned 'control packets'.
>
> These are RST packets and also ACK packets sent from TIME_WAIT.
>
> Some eBPF programs would prefer to have a meaningful skb->sk
> pointer as much as possible.
>
> This means that TCP can now attach TIME_WAIT sockets to outgoing
> skbs.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Brian Vazquez <brianvv@google.com>
> ---
> include/net/ip.h | 3 ++-
> net/ipv4/ip_output.c | 5 ++++-
> net/ipv4/tcp_ipv4.c | 4 ++--
> 3 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/include/net/ip.h b/include/net/ip.h
> index bab084df15677543b7400bb2832c0e83988884cb..4be0a6a603b2b5d5cfddc045a7d49d0d77be9570 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -288,7 +288,8 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
> return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
> }
>
> -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
> +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
> + struct sk_buff *skb,
> const struct ip_options *sopt,
> __be32 daddr, __be32 saddr,
> const struct ip_reply_arg *arg,
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index e5c55a95063dd8340f9a014102408e859b4eb755..0065b1996c947078bea210c9abe5c80fa0e0ab4f 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -1596,7 +1596,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
> * Generic function to send a packet as reply to another packet.
> * Used to send some TCP resets/acks so far.
> */
> -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
> +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
> + struct sk_buff *skb,
> const struct ip_options *sopt,
> __be32 daddr, __be32 saddr,
> const struct ip_reply_arg *arg,
> @@ -1662,6 +1663,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
> arg->csumoffset) = csum_fold(csum_add(nskb->csum,
> arg->csum));
> nskb->ip_summed = CHECKSUM_NONE;
> + if (orig_sk)
> + skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
> if (transmit_time)
> nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
> if (txhash)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 985028434f644c399e51d12ba8d9c2c5740dc6e1..9d3dd101ea713b14e13afe662baa49d21b3b716c 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -907,7 +907,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
> ctl_sk->sk_mark = 0;
> ctl_sk->sk_priority = 0;
> }
> - ip_send_unicast_reply(ctl_sk,
> + ip_send_unicast_reply(ctl_sk, sk,
> skb, &TCP_SKB_CB(skb)->header.h4.opt,
> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
> &arg, arg.iov[0].iov_len,
> @@ -1021,7 +1021,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
> ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
> inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
> transmit_time = tcp_transmit_time(sk);
> - ip_send_unicast_reply(ctl_sk,
> + ip_send_unicast_reply(ctl_sk, sk,
> skb, &TCP_SKB_CB(skb)->header.h4.opt,
> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
> &arg, arg.iov[0].iov_len,
> --
> 2.47.0.rc1.288.g06298d1525-goog
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-14 14:01 ` Brian Vazquez
@ 2024-10-14 14:27 ` Eric Dumazet
2024-10-14 15:08 ` Brian Vazquez
[not found] ` <CAMzD94TytK5RfDvLKXfxR7nys=voptywE3_3zSFymXNCky0AsQ@mail.gmail.com>
0 siblings, 2 replies; 24+ messages in thread
From: Eric Dumazet @ 2024-10-14 14:27 UTC (permalink / raw)
To: Brian Vazquez
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
On Mon, Oct 14, 2024 at 4:01 PM Brian Vazquez <brianvv@google.com> wrote:
>
> Thanks Eric for the patch series! I left some comments inline
>
>
> On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > TCP will soon attach TIME_WAIT sockets to some ACK and RST.
> >
> > Make sure sk_to_full_sk() detects this and does not return
> > a non full socket.
> >
> > v3: also changed sk_const_to_full_sk()
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> > include/linux/bpf-cgroup.h | 2 +-
> > include/net/inet_sock.h | 8 ++++++--
> > net/core/filter.c | 6 +-----
> > 3 files changed, 8 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index ce91d9b2acb9f8991150ceead4475b130bead438..f0f219271daf4afea2666c4d09fd4d1a8091f844 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
> > int __ret = 0; \
> > if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
> > typeof(sk) __sk = sk_to_full_sk(sk); \
> > - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
> > + if (__sk && __sk == skb_to_full_sk(skb) && \
> > cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
> > __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
> > CGROUP_INET_EGRESS); \
> > diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> > index f01dd273bea69d2eaf7a1d28274d7f980942b78a..56d8bc5593d3dfffd5f94cf7c6383948881917df 100644
> > --- a/include/net/inet_sock.h
> > +++ b/include/net/inet_sock.h
> > @@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
> > static inline struct sock *sk_to_full_sk(struct sock *sk)
> > {
> > #ifdef CONFIG_INET
> > - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> > + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> > sk = inet_reqsk(sk)->rsk_listener;
> > + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> > + sk = NULL;
> > #endif
> > return sk;
> > }
> > @@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk)
> > static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
> > {
> > #ifdef CONFIG_INET
> > - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> > + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> > sk = ((const struct request_sock *)sk)->rsk_listener;
> > + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> > + sk = NULL;
> > #endif
> > return sk;
> > }
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..202c1d386e19599e9fc6e0a0d4a95986ba6d0ea8 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> > /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
> > * sock refcnt is decremented to prevent a request_sock leak.
> > */
> > - if (!sk_fullsock(sk2))
> > - sk2 = NULL;
>
> IIUC, we still want the condition above since sk_to_full_sk can return
> the request socket in which case the helper should return NULL, so we
> still need the refcnt decrement?
>
> > if (sk2 != sk) {
> > sock_gen_put(sk);
Note that we call sock_gen_put(sk) here, not sock_gen_put(sk2);
sk is not NULL here, so if sk2 is NULL, we will take this branch.
> > /* Ensure there is no need to bump sk2 refcnt */
> > @@ -6826,8 +6824,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> > /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
> > * sock refcnt is decremented to prevent a request_sock leak.
> > */
> > - if (!sk_fullsock(sk2))
> > - sk2 = NULL;
>
> Same as above.
Should be fine I think.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper
2024-10-14 14:23 ` Eric Dumazet
@ 2024-10-14 14:34 ` Brian Vazquez
0 siblings, 0 replies; 24+ messages in thread
From: Brian Vazquez @ 2024-10-14 14:34 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
Thanks a lot for the explanation, it makes sense.
Reviewed-by: Brian Vazquez <brianvv@google.com>
On Mon, Oct 14, 2024 at 10:23 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Oct 14, 2024 at 4:20 PM Brian Vazquez <brianvv@google.com> wrote:
> >
> > On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > This can be used to attach a socket to an skb,
> > > taking a reference on sk->sk_refcnt.
> > >
> > > This helper might be a NOP if sk->sk_refcnt is zero.
> > >
> > > Use it from tcp_make_synack().
> > >
> > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > > ---
> > > include/net/sock.h | 9 +++++++++
> > > net/core/sock.c | 9 +++------
> > > net/ipv4/tcp_output.c | 2 +-
> > > 3 files changed, 13 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > index 703ec6aef927337f7ca6798ff3c3970529af53f9..e5bb64ad92c769f3edb8c2dc72cafb336837cabb 100644
> > > --- a/include/net/sock.h
> > > +++ b/include/net/sock.h
> > > @@ -1758,6 +1758,15 @@ void sock_efree(struct sk_buff *skb);
> > > #ifdef CONFIG_INET
> > > void sock_edemux(struct sk_buff *skb);
> > > void sock_pfree(struct sk_buff *skb);
> > > +
> > > +static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
> > > +{
> > > + skb_orphan(skb);
> >
> > Is this skb_orphan(skb) needed? IIUC skb_set_owner_w is doing
> > skb_orphan too? and then calling this helper, but we do need the
> > skb_orphan is needed when called from the synack.
> >
> > Can skb_set_owner_w try to orphan an skb twice?
> >
>
> skb_orphan(skb) does nothing if there is nothing to do.
>
> It is common practice to include it every time we are about to change
> skb->destructor.
>
> Otherwise we would have to add a WARN() or something to prevent future leaks.
>
> Better safe than sorry :)
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-14 14:27 ` Eric Dumazet
@ 2024-10-14 15:08 ` Brian Vazquez
[not found] ` <CAMzD94TytK5RfDvLKXfxR7nys=voptywE3_3zSFymXNCky0AsQ@mail.gmail.com>
1 sibling, 0 replies; 24+ messages in thread
From: Brian Vazquez @ 2024-10-14 15:08 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
On Mon, Oct 14, 2024 at 10:28 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Oct 14, 2024 at 4:01 PM Brian Vazquez <brianvv@google.com> wrote:
> >
> > Thanks Eric for the patch series! I left some comments inline
> >
> >
> > On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > TCP will soon attach TIME_WAIT sockets to some ACK and RST.
> > >
> > > Make sure sk_to_full_sk() detects this and does not return
> > > a non full socket.
> > >
> > > v3: also changed sk_const_to_full_sk()
> > >
> > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > > ---
> > > include/linux/bpf-cgroup.h | 2 +-
> > > include/net/inet_sock.h | 8 ++++++--
> > > net/core/filter.c | 6 +-----
> > > 3 files changed, 8 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > > index ce91d9b2acb9f8991150ceead4475b130bead438..f0f219271daf4afea2666c4d09fd4d1a8091f844 100644
> > > --- a/include/linux/bpf-cgroup.h
> > > +++ b/include/linux/bpf-cgroup.h
> > > @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
> > > int __ret = 0; \
> > > if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
> > > typeof(sk) __sk = sk_to_full_sk(sk); \
> > > - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
> > > + if (__sk && __sk == skb_to_full_sk(skb) && \
> > > cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
> > > __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
> > > CGROUP_INET_EGRESS); \
> > > diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> > > index f01dd273bea69d2eaf7a1d28274d7f980942b78a..56d8bc5593d3dfffd5f94cf7c6383948881917df 100644
> > > --- a/include/net/inet_sock.h
> > > +++ b/include/net/inet_sock.h
> > > @@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
> > > static inline struct sock *sk_to_full_sk(struct sock *sk)
> > > {
> > > #ifdef CONFIG_INET
> > > - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> > > sk = inet_reqsk(sk)->rsk_listener;
> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> > > + sk = NULL;
> > > #endif
> > > return sk;
> > > }
> > > @@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk)
> > > static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
> > > {
> > > #ifdef CONFIG_INET
> > > - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> > > sk = ((const struct request_sock *)sk)->rsk_listener;
> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> > > + sk = NULL;
> > > #endif
> > > return sk;
> > > }
> > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..202c1d386e19599e9fc6e0a0d4a95986ba6d0ea8 100644
> > > --- a/net/core/filter.c
> > > +++ b/net/core/filter.c
> > > @@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> > > /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
> > > * sock refcnt is decremented to prevent a request_sock leak.
> > > */
> > > - if (!sk_fullsock(sk2))
> > > - sk2 = NULL;
> >
> > IIUC, we still want the condition above since sk_to_full_sk can return
> > the request socket in which case the helper should return NULL, so we
> > still need the refcnt decrement?
> >
> > > if (sk2 != sk) {
> > > sock_gen_put(sk);
>
> Note that we call sock_gen_put(sk) here, not sock_gen_put(sk2);
>
> sk is not NULL here, so if sk2 is NULL, we will take this branch.
IIUC __bpf_sk_lookup calls __bpf_skc_lookup which can return a request
listener socket and takes a refcnt, but __bpf_sk_lookup should only
return full_sk (no request nor time_wait).
I agree that after the change to sk_to_full_sk, for time_wait it will
return NULL, hence the condition is repetitive.
if (!sk_fullsock(sk2))
sk2 = NULL;
but sk_to_full_sk can still retrieve the listener: sk =
inet_reqsk(sk)->rsk_listener; in which case we would like to still use
if (!sk_fullsock(sk2))
sk2 = NULL;
to invalidate the request socket, decrement the refcount and sk = sk2
; // which makes sk == NULL?
I think removing that condition allows __bpf_sk_lookup to return the
req socket, which wasn't possible before?
>
>
> > > /* Ensure there is no need to bump sk2 refcnt */
> > > @@ -6826,8 +6824,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> > > /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
> > > * sock refcnt is decremented to prevent a request_sock leak.
> > > */
> > > - if (!sk_fullsock(sk2))
> > > - sk2 = NULL;
> >
> > Same as above.
>
> Should be fine I think.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
[not found] ` <CAMzD94TytK5RfDvLKXfxR7nys=voptywE3_3zSFymXNCky0AsQ@mail.gmail.com>
@ 2024-10-14 15:23 ` Eric Dumazet
2024-10-14 15:39 ` Brian Vazquez
0 siblings, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2024-10-14 15:23 UTC (permalink / raw)
To: Brian Vazquez
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
On Mon, Oct 14, 2024 at 5:03 PM Brian Vazquez <brianvv@google.com> wrote:
>
> On Mon, Oct 14, 2024 at 10:28 AM Eric Dumazet <edumazet@google.com> wrote:
>>
>> On Mon, Oct 14, 2024 at 4:01 PM Brian Vazquez <brianvv@google.com> wrote:
>> >
>> > Thanks Eric for the patch series! I left some comments inline
>> >
>> >
>> > On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
>> > >
>> > > TCP will soon attach TIME_WAIT sockets to some ACK and RST.
>> > >
>> > > Make sure sk_to_full_sk() detects this and does not return
>> > > a non full socket.
>> > >
>> > > v3: also changed sk_const_to_full_sk()
>> > >
>> > > Signed-off-by: Eric Dumazet <edumazet@google.com>
>> > > ---
>> > > include/linux/bpf-cgroup.h | 2 +-
>> > > include/net/inet_sock.h | 8 ++++++--
>> > > net/core/filter.c | 6 +-----
>> > > 3 files changed, 8 insertions(+), 8 deletions(-)
>> > >
>> > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
>> > > index ce91d9b2acb9f8991150ceead4475b130bead438..f0f219271daf4afea2666c4d09fd4d1a8091f844 100644
>> > > --- a/include/linux/bpf-cgroup.h
>> > > +++ b/include/linux/bpf-cgroup.h
>> > > @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
>> > > int __ret = 0; \
>> > > if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
>> > > typeof(sk) __sk = sk_to_full_sk(sk); \
>> > > - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
>> > > + if (__sk && __sk == skb_to_full_sk(skb) && \
>> > > cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
>> > > __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
>> > > CGROUP_INET_EGRESS); \
>> > > diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
>> > > index f01dd273bea69d2eaf7a1d28274d7f980942b78a..56d8bc5593d3dfffd5f94cf7c6383948881917df 100644
>> > > --- a/include/net/inet_sock.h
>> > > +++ b/include/net/inet_sock.h
>> > > @@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
>> > > static inline struct sock *sk_to_full_sk(struct sock *sk)
>> > > {
>> > > #ifdef CONFIG_INET
>> > > - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
>> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
>> > > sk = inet_reqsk(sk)->rsk_listener;
>> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
>> > > + sk = NULL;
>> > > #endif
>> > > return sk;
>> > > }
>> > > @@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk)
>> > > static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
>> > > {
>> > > #ifdef CONFIG_INET
>> > > - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
>> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
>> > > sk = ((const struct request_sock *)sk)->rsk_listener;
>> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
>> > > + sk = NULL;
>> > > #endif
>> > > return sk;
>> > > }
>> > > diff --git a/net/core/filter.c b/net/core/filter.c
>> > > index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..202c1d386e19599e9fc6e0a0d4a95986ba6d0ea8 100644
>> > > --- a/net/core/filter.c
>> > > +++ b/net/core/filter.c
>> > > @@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
>> > > /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
>> > > * sock refcnt is decremented to prevent a request_sock leak.
>> > > */
>> > > - if (!sk_fullsock(sk2))
>> > > - sk2 = NULL;
>> >
>> > IIUC, we still want the condition above since sk_to_full_sk can return
>> > the request socket in which case the helper should return NULL, so we
>> > still need the refcnt decrement?
>> >
>> > > if (sk2 != sk) {
>> > > sock_gen_put(sk);
>>
>> Note that we call sock_gen_put(sk) here, not sock_gen_put(sk2);
>>
>>
>> sk is not NULL here, so if sk2 is NULL, we will take this branch.
>
>
> IIUC __bpf_sk_lookup calls __bpf_skc_lookup which can return a request listener socket and takes a refcnt, but __bpf_sk_lookup should only return full_sk (no request nor time_wait).
>
> That's why the function tries to detect whether req or time_wait was retrieved by __bpf_skc_lookup and if so, we invalidate the return: sk = NULL, and decrement the refcnt. This is done by having sk2 and then comparing vs sk, and if sk2 is invalid because time_wait or listener, then we decrement sk (the original return from __bpf_skc_lookup, which took a refcnt)
>
> I agree that after the change to sk_to_full_sk, for time_wait it will return NULL, hence the condition is repetitive.
>
> if (!sk_fullsock(sk2))
> sk2 = NULL;
>
> but sk_to_full_sk can still retrieve the listener: sk = inet_reqsk(sk)->rsk_listener; in which case we would like to still use
> if (!sk_fullsock(sk2))
> sk2 = NULL;
>
> to invalidate the request socket, decrement the refcount and sk = sk2 ; // which makes sk == NULL?
>
> I think removing that condition allows __bpf_sk_lookup to return the req socket, which wasn't possible before?
It was not possible before, and not possible after :
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
sk = inet_reqsk(sk)->rsk_listener;
if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) // NEW CODE
sk = NULL; // NEW CODE
#endif
return sk;
}
if sk was a request socket, sk2 would be the listener.
sk2 being a listener means that sk_fullsock(sk2) is true.
if (!sk_fullsock(sk2))
sk2 = NULL;
So really this check was only meant for TIME_WAIT, and it is now done
directly from sk_to_full_sk()
Therefore we can delete this dead code.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-14 15:23 ` Eric Dumazet
@ 2024-10-14 15:39 ` Brian Vazquez
0 siblings, 0 replies; 24+ messages in thread
From: Brian Vazquez @ 2024-10-14 15:39 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet
On Mon, Oct 14, 2024 at 11:24 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Oct 14, 2024 at 5:03 PM Brian Vazquez <brianvv@google.com> wrote:
> >
> > On Mon, Oct 14, 2024 at 10:28 AM Eric Dumazet <edumazet@google.com> wrote:
> >>
> >> On Mon, Oct 14, 2024 at 4:01 PM Brian Vazquez <brianvv@google.com> wrote:
> >> >
> >> > Thanks Eric for the patch series! I left some comments inline
> >> >
> >> >
> >> > On Thu, Oct 10, 2024 at 1:48 PM Eric Dumazet <edumazet@google.com> wrote:
> >> > >
> >> > > TCP will soon attach TIME_WAIT sockets to some ACK and RST.
> >> > >
> >> > > Make sure sk_to_full_sk() detects this and does not return
> >> > > a non full socket.
> >> > >
> >> > > v3: also changed sk_const_to_full_sk()
> >> > >
> >> > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> >> > > ---
> >> > > include/linux/bpf-cgroup.h | 2 +-
> >> > > include/net/inet_sock.h | 8 ++++++--
> >> > > net/core/filter.c | 6 +-----
> >> > > 3 files changed, 8 insertions(+), 8 deletions(-)
> >> > >
> >> > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> >> > > index ce91d9b2acb9f8991150ceead4475b130bead438..f0f219271daf4afea2666c4d09fd4d1a8091f844 100644
> >> > > --- a/include/linux/bpf-cgroup.h
> >> > > +++ b/include/linux/bpf-cgroup.h
> >> > > @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
> >> > > int __ret = 0; \
> >> > > if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
> >> > > typeof(sk) __sk = sk_to_full_sk(sk); \
> >> > > - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
> >> > > + if (__sk && __sk == skb_to_full_sk(skb) && \
> >> > > cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
> >> > > __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
> >> > > CGROUP_INET_EGRESS); \
> >> > > diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> >> > > index f01dd273bea69d2eaf7a1d28274d7f980942b78a..56d8bc5593d3dfffd5f94cf7c6383948881917df 100644
> >> > > --- a/include/net/inet_sock.h
> >> > > +++ b/include/net/inet_sock.h
> >> > > @@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
> >> > > static inline struct sock *sk_to_full_sk(struct sock *sk)
> >> > > {
> >> > > #ifdef CONFIG_INET
> >> > > - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> >> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> >> > > sk = inet_reqsk(sk)->rsk_listener;
> >> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> >> > > + sk = NULL;
> >> > > #endif
> >> > > return sk;
> >> > > }
> >> > > @@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk)
> >> > > static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
> >> > > {
> >> > > #ifdef CONFIG_INET
> >> > > - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> >> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> >> > > sk = ((const struct request_sock *)sk)->rsk_listener;
> >> > > + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> >> > > + sk = NULL;
> >> > > #endif
> >> > > return sk;
> >> > > }
> >> > > diff --git a/net/core/filter.c b/net/core/filter.c
> >> > > index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..202c1d386e19599e9fc6e0a0d4a95986ba6d0ea8 100644
> >> > > --- a/net/core/filter.c
> >> > > +++ b/net/core/filter.c
> >> > > @@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> >> > > /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
> >> > > * sock refcnt is decremented to prevent a request_sock leak.
> >> > > */
> >> > > - if (!sk_fullsock(sk2))
> >> > > - sk2 = NULL;
> >> >
> >> > IIUC, we still want the condition above since sk_to_full_sk can return
> >> > the request socket in which case the helper should return NULL, so we
> >> > still need the refcnt decrement?
> >> >
> >> > > if (sk2 != sk) {
> >> > > sock_gen_put(sk);
> >>
> >> Note that we call sock_gen_put(sk) here, not sock_gen_put(sk2);
> >>
> >>
> >> sk is not NULL here, so if sk2 is NULL, we will take this branch.
> >
> >
> > IIUC __bpf_sk_lookup calls __bpf_skc_lookup which can return a request listener socket and takes a refcnt, but __bpf_sk_lookup should only return full_sk (no request nor time_wait).
> >
> > That's why the function tries to detect whether req or time_wait was retrieved by __bpf_skc_lookup and if so, we invalidate the return: sk = NULL, and decrement the refcnt. This is done by having sk2 and then comparing vs sk, and if sk2 is invalid because time_wait or listener, then we decrement sk (the original return from __bpf_skc_lookup, which took a refcnt)
> >
> > I agree that after the change to sk_to_full_sk, for time_wait it will return NULL, hence the condition is repetitive.
> >
> > if (!sk_fullsock(sk2))
> > sk2 = NULL;
> >
> > but sk_to_full_sk can still retrieve the listener: sk = inet_reqsk(sk)->rsk_listener; in which case we would like to still use
> > if (!sk_fullsock(sk2))
> > sk2 = NULL;
> >
> > to invalidate the request socket, decrement the refcount and sk = sk2 ; // which makes sk == NULL?
> >
> > I think removing that condition allows __bpf_sk_lookup to return the req socket, which wasn't possible before?
>
> It was not possible before, and not possible after :
>
> static inline struct sock *sk_to_full_sk(struct sock *sk)
> {
> #ifdef CONFIG_INET
> if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> sk = inet_reqsk(sk)->rsk_listener;
> if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) // NEW CODE
> sk = NULL; // NEW CODE
> #endif
> return sk;
> }
>
> if sk was a request socket, sk2 would be the listener.
This is the part that I missed, I got misled by the comment above the dead code.
Thanks for clarifying!
>
> sk2 being a listener means that sk_fullsock(sk2) is true.
>
> if (!sk_fullsock(sk2))
> sk2 = NULL;
>
> So really this check was only meant for TIME_WAIT, and it is now done
> directly from sk_to_full_sk()
>
> Therefore we can delete this dead code.
Reviewed-by: Brian Vazquez <brianvv@google.com>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets
2024-10-10 17:48 [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
` (4 preceding siblings ...)
2024-10-10 17:48 ` [PATCH v3 net-next 5/5] ipv4: " Eric Dumazet
@ 2024-10-15 1:00 ` patchwork-bot+netdevbpf
5 siblings, 0 replies; 24+ messages in thread
From: patchwork-bot+netdevbpf @ 2024-10-15 1:00 UTC (permalink / raw)
To: Eric Dumazet
Cc: davem, kuba, pabeni, martin.lau, kuniyu, ncardwell, brianvv,
netdev, eric.dumazet
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Thu, 10 Oct 2024 17:48:12 +0000 you wrote:
> Currently, TCP can set skb->sk for a variety of transmit packets.
>
> However, packets sent on behalf of a TIME_WAIT sockets do not
> have an attached socket.
>
> Same issue for RST packets.
>
> [...]
Here is the summary with links:
- [v3,net-next,1/5] net: add TIME_WAIT logic to sk_to_full_sk()
https://git.kernel.org/netdev/net-next/c/78e2baf3d96e
- [v3,net-next,2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets
https://git.kernel.org/netdev/net-next/c/bc43a3c83cad
- [v3,net-next,3/5] net: add skb_set_owner_edemux() helper
https://git.kernel.org/netdev/net-next/c/5ced52fa8f0d
- [v3,net-next,4/5] ipv6: tcp: give socket pointer to control skbs
https://git.kernel.org/netdev/net-next/c/507a96737d99
- [v3,net-next,5/5] ipv4: tcp: give socket pointer to control skbs
https://git.kernel.org/netdev/net-next/c/79636038d37e
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2024-10-15 1:00 UTC | newest]
Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-10-10 17:48 [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
2024-10-10 17:48 ` [PATCH v3 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
2024-10-11 23:20 ` Kuniyuki Iwashima
2024-10-12 3:32 ` Martin KaFai Lau
2024-10-14 14:01 ` Brian Vazquez
2024-10-14 14:27 ` Eric Dumazet
2024-10-14 15:08 ` Brian Vazquez
[not found] ` <CAMzD94TytK5RfDvLKXfxR7nys=voptywE3_3zSFymXNCky0AsQ@mail.gmail.com>
2024-10-14 15:23 ` Eric Dumazet
2024-10-14 15:39 ` Brian Vazquez
2024-10-10 17:48 ` [PATCH v3 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
2024-10-11 23:25 ` Kuniyuki Iwashima
2024-10-14 14:05 ` Brian Vazquez
2024-10-10 17:48 ` [PATCH v3 net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
2024-10-11 23:29 ` Kuniyuki Iwashima
2024-10-14 14:20 ` Brian Vazquez
2024-10-14 14:23 ` Eric Dumazet
2024-10-14 14:34 ` Brian Vazquez
2024-10-10 17:48 ` [PATCH v3 net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
2024-10-11 23:32 ` Kuniyuki Iwashima
2024-10-14 14:22 ` Brian Vazquez
2024-10-10 17:48 ` [PATCH v3 net-next 5/5] ipv4: " Eric Dumazet
2024-10-11 23:41 ` Kuniyuki Iwashima
2024-10-14 14:23 ` Brian Vazquez
2024-10-15 1:00 ` [PATCH v3 net-next 0/5] tcp: add skb->sk to more control packets patchwork-bot+netdevbpf
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).