* [PATCH v2 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-06 20:32 [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
@ 2024-10-06 20:32 ` Eric Dumazet
2024-10-06 20:32 ` [PATCH v2 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2024-10-06 20:32 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
TCP will soon attach TIME_WAIT sockets to some ACK and RST.
Make sure sk_to_full_sk() detects this and does not return
a non full socket.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/linux/bpf-cgroup.h | 2 +-
include/net/inet_sock.h | 4 +++-
net/core/filter.c | 6 +-----
3 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ce91d9b2acb9f8991150ceead4475b130bead438..f0f219271daf4afea2666c4d09fd4d1a8091f844 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
int __ret = 0; \
if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
typeof(sk) __sk = sk_to_full_sk(sk); \
- if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
+ if (__sk && __sk == skb_to_full_sk(skb) && \
cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
__ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
CGROUP_INET_EGRESS); \
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f01dd273bea69d2eaf7a1d28274d7f980942b78a..39c5a93b353ad9ac3047031d1718190d749f0530 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
- if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
+ if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
sk = inet_reqsk(sk)->rsk_listener;
+ if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
+ sk = NULL;
#endif
return sk;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..202c1d386e19599e9fc6e0a0d4a95986ba6d0ea8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
- sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
/* Ensure there is no need to bump sk2 refcnt */
@@ -6826,8 +6824,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
- sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
/* Ensure there is no need to bump sk2 refcnt */
@@ -7276,7 +7272,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
sk = sk_to_full_sk(sk);
- if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+ if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
return (unsigned long)sk;
return (unsigned long)NULL;
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets
2024-10-06 20:32 [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
2024-10-06 20:32 ` [PATCH v2 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
@ 2024-10-06 20:32 ` Eric Dumazet
2024-10-06 20:32 ` [PATCH v2 net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2024-10-06 20:32 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
TCP stack is not attaching skb to TIME_WAIT sockets yet,
but we would like to allow this in the future.
Add sk_listener_or_tw() helper to detect the three states
that FQ needs to take care.
Like NEW_SYN_RECV, TIME_WAIT are not full sockets and
do not contain sk->sk_pacing_status, sk->sk_pacing_rate.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sock.h | 10 ++++++++++
net/sched/sch_fq.c | 3 ++-
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index e282127092aba205c038046e1e8078cf2582c075..562bb47bf3d8a58f31576a66811ffb25dfed1a8b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2800,6 +2800,16 @@ static inline bool sk_listener(const struct sock *sk)
return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}
+/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT
+ * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE)
+ * TCP RST and ACK can be attached to TIME_WAIT.
+ */
+static inline bool sk_listener_or_tw(const struct sock *sk)
+{
+ return (1 << READ_ONCE(sk->sk_state)) &
+ (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT);
+}
+
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
int type);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index aeabf45c9200c4aea75fb6c63986e37eddfea5f9..a97638bef6da5be8a84cc572bf2372551f4b7f96 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -362,8 +362,9 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
* 3) We do not want to rate limit them (eg SYNFLOOD attack),
* especially if the listener set SO_MAX_PACING_RATE
* 4) We pretend they are orphaned
+ * TCP can also associate TIME_WAIT sockets with RST or ACK packets.
*/
- if (!sk || sk_listener(sk)) {
+ if (!sk || sk_listener_or_tw(sk)) {
unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
/* By forcing low order bit to 1, we make sure to not
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 net-next 3/5] net: add skb_set_owner_edemux() helper
2024-10-06 20:32 [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
2024-10-06 20:32 ` [PATCH v2 net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
2024-10-06 20:32 ` [PATCH v2 net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
@ 2024-10-06 20:32 ` Eric Dumazet
2024-10-06 20:32 ` [PATCH v2 net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2024-10-06 20:32 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
This can be used to attach a socket to an skb,
taking a reference on sk->sk_refcnt.
This helper might be a NOP if sk->sk_refcnt is zero.
Use it from tcp_make_synack().
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sock.h | 9 +++++++++
net/core/sock.c | 9 +++------
net/ipv4/tcp_output.c | 2 +-
3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 562bb47bf3d8a58f31576a66811ffb25dfed1a8b..eaa42e20449c83465fc378f5e66b2c11929708fd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1758,6 +1758,15 @@ void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);
+
+static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ }
+}
#else
#define sock_edemux sock_efree
#endif
diff --git a/net/core/sock.c b/net/core/sock.c
index 846f494a17cf9614bb96505eec743df17574c138..d540acc8b154052ca523853b67056ebb0097c68e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2594,14 +2594,11 @@ void __sock_wfree(struct sk_buff *skb)
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
- skb->sk = sk;
#ifdef CONFIG_INET
- if (unlikely(!sk_fullsock(sk))) {
- skb->destructor = sock_edemux;
- sock_hold(sk);
- return;
- }
+ if (unlikely(!sk_fullsock(sk)))
+ return skb_set_owner_edemux(skb, sk);
#endif
+ skb->sk = sk;
skb->destructor = sock_wfree;
skb_set_hash_from_sk(skb, sk);
/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 08772395690d13a0c3309a273543a51aa0dd3fdc..20d6adb919461849ebda30c30f018ae8c5d5861a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3731,7 +3731,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
switch (synack_type) {
case TCP_SYNACK_NORMAL:
- skb_set_owner_w(skb, req_to_sk(req));
+ skb_set_owner_edemux(skb, req_to_sk(req));
break;
case TCP_SYNACK_COOKIE:
/* Under synflood, we do not attach skb to a socket,
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 net-next 4/5] ipv6: tcp: give socket pointer to control skbs
2024-10-06 20:32 [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
` (2 preceding siblings ...)
2024-10-06 20:32 ` [PATCH v2 net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
@ 2024-10-06 20:32 ` Eric Dumazet
2024-10-06 20:32 ` [PATCH v2 net-next 5/5] ipv4: " Eric Dumazet
2024-10-07 23:28 ` [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets Jakub Kicinski
5 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2024-10-06 20:32 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
tcp_v6_send_response() send orphaned 'control packets'.
These are RST packets and also ACK packets sent from TIME_WAIT.
Some eBPF programs would prefer to have a meaningful skb->sk
pointer as much as possible.
This means that TCP can now attach TIME_WAIT sockets to outgoing
skbs.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv6/tcp_ipv6.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7634c0be6acbdb67bb378cc81bdbf184552d2afc..597920061a3a061a878bf0f7a1b03ac4898918a9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -967,6 +967,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
}
if (sk) {
+ /* unconstify the socket only to attach it to buff with care. */
+ skb_set_owner_edemux(buff, (struct sock *)sk);
+
if (sk->sk_state == TCP_TIME_WAIT)
mark = inet_twsk(sk)->tw_mark;
else
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 net-next 5/5] ipv4: tcp: give socket pointer to control skbs
2024-10-06 20:32 [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
` (3 preceding siblings ...)
2024-10-06 20:32 ` [PATCH v2 net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
@ 2024-10-06 20:32 ` Eric Dumazet
2024-10-07 23:28 ` [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets Jakub Kicinski
5 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2024-10-06 20:32 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
ip_send_unicast_reply() send orphaned 'control packets'.
These are RST packets and also ACK packets sent from TIME_WAIT.
Some eBPF programs would prefer to have a meaningful skb->sk
pointer as much as possible.
This means that TCP can now attach TIME_WAIT sockets to outgoing
skbs.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/ip.h | 3 ++-
net/ipv4/ip_output.c | 5 ++++-
net/ipv4/tcp_ipv4.c | 4 ++--
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index bab084df15677543b7400bb2832c0e83988884cb..4be0a6a603b2b5d5cfddc045a7d49d0d77be9570 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -288,7 +288,8 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
}
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
+ struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e5c55a95063dd8340f9a014102408e859b4eb755..0065b1996c947078bea210c9abe5c80fa0e0ab4f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1596,7 +1596,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
* Generic function to send a packet as reply to another packet.
* Used to send some TCP resets/acks so far.
*/
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
+ struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
@@ -1662,6 +1663,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
+ if (orig_sk)
+ skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
if (transmit_time)
nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
if (txhash)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 985028434f644c399e51d12ba8d9c2c5740dc6e1..9d3dd101ea713b14e13afe662baa49d21b3b716c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -907,7 +907,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
ctl_sk->sk_mark = 0;
ctl_sk->sk_priority = 0;
}
- ip_send_unicast_reply(ctl_sk,
+ ip_send_unicast_reply(ctl_sk, sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len,
@@ -1021,7 +1021,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
transmit_time = tcp_transmit_time(sk);
- ip_send_unicast_reply(ctl_sk,
+ ip_send_unicast_reply(ctl_sk, sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len,
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets
2024-10-06 20:32 [PATCH v2 net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
` (4 preceding siblings ...)
2024-10-06 20:32 ` [PATCH v2 net-next 5/5] ipv4: " Eric Dumazet
@ 2024-10-07 23:28 ` Jakub Kicinski
5 siblings, 0 replies; 7+ messages in thread
From: Jakub Kicinski @ 2024-10-07 23:28 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Paolo Abeni, Martin KaFai Lau, netdev,
eric.dumazet
On Sun, 6 Oct 2024 20:32:19 +0000 Eric Dumazet wrote:
> Currently, TCP can set skb->sk for a variety of transmit packets.
>
> However, packets sent on behalf of a TIME_WAIT sockets do not
> have an attached socket.
>
> Same issue for RST packets.
>
> We want to change this, in order to increase eBPF program
> capabilities.
>
> This is slightly risky, because various layers could
> be confused by TIME_WAIT sockets showing up in skb->sk.
>
> v2: audited all sk_to_full_sk() users and addressed Martin feedback.
I think this patch set is causing crashes like:
https://netdev-3.bots.linux.dev/vmksft-nf-dbg/results/804862/2-conntrack-vrf-sh/stderr
I haven't had the time to investigate in depth and before the next run
someone else posted a broken change, sigh.
^ permalink raw reply [flat|nested] 7+ messages in thread