* [PATCH net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-04 19:16 [PATCH net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
@ 2024-10-04 19:16 ` Eric Dumazet
2024-10-04 21:56 ` Eric Dumazet
2024-10-04 19:16 ` [PATCH net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
` (3 subsequent siblings)
4 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2024-10-04 19:16 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
TCP will soon attach TIME_WAIT sockets to some ACK and RST.
Make sure sk_to_full_sk() detects this and does not return
a non full socket.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/inet_sock.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 394c3b66065e20d34594d6e2a2010c55bb457810..cec093b78151b9a3b95ad4b3672a72b0aa9a8305 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -319,8 +319,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
- if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
+ if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
sk = inet_reqsk(sk)->rsk_listener;
+ if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
+ sk = NULL;
#endif
return sk;
}
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-04 19:16 ` [PATCH net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
@ 2024-10-04 21:56 ` Eric Dumazet
2024-10-05 0:36 ` Martin KaFai Lau
0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2024-10-04 21:56 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet
On Fri, Oct 4, 2024 at 9:16 PM Eric Dumazet <edumazet@google.com> wrote:
>
> TCP will soon attach TIME_WAIT sockets to some ACK and RST.
>
> Make sure sk_to_full_sk() detects this and does not return
> a non full socket.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> include/net/inet_sock.h | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> index 394c3b66065e20d34594d6e2a2010c55bb457810..cec093b78151b9a3b95ad4b3672a72b0aa9a8305 100644
> --- a/include/net/inet_sock.h
> +++ b/include/net/inet_sock.h
> @@ -319,8 +319,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
> static inline struct sock *sk_to_full_sk(struct sock *sk)
> {
> #ifdef CONFIG_INET
> - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> sk = inet_reqsk(sk)->rsk_listener;
> + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> + sk = NULL;
> #endif
> return sk;
> }
It appears some callers do not check if the return value could be NULL.
I will have to add in v2 :
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ce91d9b2acb9f8991150ceead4475b130bead438..c3ffb45489a6924c1bc80355e862e243ec195b01
100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
int __ret = 0; \
if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
typeof(sk) __sk = sk_to_full_sk(sk); \
- if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
+ if (__sk && sk_fullsock(__sk) && __sk ==
skb_to_full_sk(skb) && \
cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
__ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
CGROUP_INET_EGRESS); \
diff --git a/net/core/filter.c b/net/core/filter.c
index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..533025618b2c06efa31548708f21d9e0ccbdc68d
100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6778,7 +6778,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct
bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so
make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
+ if (sk2 && !sk_fullsock(sk2))
sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
@@ -6826,7 +6826,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct
bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so
make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
+ if (sk2 && !sk_fullsock(sk2))
sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
@@ -7276,7 +7276,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
sk = sk_to_full_sk(sk);
- if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+ if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
return (unsigned long)sk;
return (unsigned long)NULL;
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-04 21:56 ` Eric Dumazet
@ 2024-10-05 0:36 ` Martin KaFai Lau
2024-10-06 20:14 ` Eric Dumazet
0 siblings, 1 reply; 9+ messages in thread
From: Martin KaFai Lau @ 2024-10-05 0:36 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, eric.dumazet,
netdev
On 10/4/24 2:56 PM, Eric Dumazet wrote:
> On Fri, Oct 4, 2024 at 9:16 PM Eric Dumazet <edumazet@google.com> wrote:
>>
>> TCP will soon attach TIME_WAIT sockets to some ACK and RST.
>>
>> Make sure sk_to_full_sk() detects this and does not return
>> a non full socket.
>>
>> Signed-off-by: Eric Dumazet <edumazet@google.com>
>> ---
>> include/net/inet_sock.h | 4 +++-
>> 1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
>> index 394c3b66065e20d34594d6e2a2010c55bb457810..cec093b78151b9a3b95ad4b3672a72b0aa9a8305 100644
>> --- a/include/net/inet_sock.h
>> +++ b/include/net/inet_sock.h
>> @@ -319,8 +319,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
>> static inline struct sock *sk_to_full_sk(struct sock *sk)
>> {
>> #ifdef CONFIG_INET
>> - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
>> + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
>> sk = inet_reqsk(sk)->rsk_listener;
>> + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
>> + sk = NULL;
>> #endif
>> return sk;
>> }
>
> It appears some callers do not check if the return value could be NULL.
> I will have to add in v2 :
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index ce91d9b2acb9f8991150ceead4475b130bead438..c3ffb45489a6924c1bc80355e862e243ec195b01
> 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
> int __ret = 0; \
> if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
The above "&& sk" test can probably be removed after the "__sk &&" addition below.
> typeof(sk) __sk = sk_to_full_sk(sk); \
> - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
> + if (__sk && sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
sk_to_full_sk() includes the TCP_TIME_WAIT check now. I wonder if testing __sk
for NULL is good enough and the sk_fullsock(__sk) check can be removed also.
Thanks for working on this series. It is useful for the bpf prog.
> cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \
> __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
> CGROUP_INET_EGRESS); \
> diff --git a/net/core/filter.c b/net/core/filter.c
> index bd0d08bf76bb8de39ca2ca89cda99a97c9b0a034..533025618b2c06efa31548708f21d9e0ccbdc68d
> 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -6778,7 +6778,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct
> bpf_sock_tuple *tuple, u32 len,
> /* sk_to_full_sk() may return (sk)->rsk_listener, so
> make sure the original sk
> * sock refcnt is decremented to prevent a request_sock leak.
> */
> - if (!sk_fullsock(sk2))
> + if (sk2 && !sk_fullsock(sk2))
> sk2 = NULL;
> if (sk2 != sk) {
> sock_gen_put(sk);
> @@ -6826,7 +6826,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct
> bpf_sock_tuple *tuple, u32 len,
> /* sk_to_full_sk() may return (sk)->rsk_listener, so
> make sure the original sk
> * sock refcnt is decremented to prevent a request_sock leak.
> */
> - if (!sk_fullsock(sk2))
> + if (sk2 && !sk_fullsock(sk2))
> sk2 = NULL;
> if (sk2 != sk) {
> sock_gen_put(sk);
> @@ -7276,7 +7276,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
> {
> sk = sk_to_full_sk(sk);
>
> - if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
> + if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
> return (unsigned long)sk;
>
> return (unsigned long)NULL;
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk()
2024-10-05 0:36 ` Martin KaFai Lau
@ 2024-10-06 20:14 ` Eric Dumazet
0 siblings, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2024-10-06 20:14 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, eric.dumazet,
netdev
On Sat, Oct 5, 2024 at 2:37 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/4/24 2:56 PM, Eric Dumazet wrote:
> > On Fri, Oct 4, 2024 at 9:16 PM Eric Dumazet <edumazet@google.com> wrote:
> >>
> >> TCP will soon attach TIME_WAIT sockets to some ACK and RST.
> >>
> >> Make sure sk_to_full_sk() detects this and does not return
> >> a non full socket.
> >>
> >> Signed-off-by: Eric Dumazet <edumazet@google.com>
> >> ---
> >> include/net/inet_sock.h | 4 +++-
> >> 1 file changed, 3 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> >> index 394c3b66065e20d34594d6e2a2010c55bb457810..cec093b78151b9a3b95ad4b3672a72b0aa9a8305 100644
> >> --- a/include/net/inet_sock.h
> >> +++ b/include/net/inet_sock.h
> >> @@ -319,8 +319,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
> >> static inline struct sock *sk_to_full_sk(struct sock *sk)
> >> {
> >> #ifdef CONFIG_INET
> >> - if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
> >> + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
> >> sk = inet_reqsk(sk)->rsk_listener;
> >> + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
> >> + sk = NULL;
> >> #endif
> >> return sk;
> >> }
> >
> > It appears some callers do not check if the return value could be NULL.
> > I will have to add in v2 :
> >
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index ce91d9b2acb9f8991150ceead4475b130bead438..c3ffb45489a6924c1bc80355e862e243ec195b01
> > 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
> > int __ret = 0; \
> > if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \
>
> The above "&& sk" test can probably be removed after the "__sk &&" addition below.
Yes, I can do that.
>
> > typeof(sk) __sk = sk_to_full_sk(sk); \
> > - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
> > + if (__sk && sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \
>
> sk_to_full_sk() includes the TCP_TIME_WAIT check now. I wonder if testing __sk
> for NULL is good enough and the sk_fullsock(__sk) check can be removed also.
+2
>
> Thanks for working on this series. It is useful for the bpf prog.
Thanks.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets
2024-10-04 19:16 [PATCH net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
2024-10-04 19:16 ` [PATCH net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
@ 2024-10-04 19:16 ` Eric Dumazet
2024-10-04 19:16 ` [PATCH net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
` (2 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2024-10-04 19:16 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
TCP stack is not attaching skb to TIME_WAIT sockets yet,
but we would like to allow this in the future.
Add sk_listener_or_tw() helper to detect the three states
that FQ needs to take care.
Like NEW_SYN_RECV, TIME_WAIT are not full sockets and
do not contain sk->sk_pacing_status, sk->sk_pacing_rate.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sock.h | 10 ++++++++++
net/sched/sch_fq.c | 3 ++-
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index c58ca8dd561b7312ffc0836585c04d9fe917a124..c868b8b57e3489cd81efafa3856da09397059080 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2784,6 +2784,16 @@ static inline bool sk_listener(const struct sock *sk)
return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}
+/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT
+ * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE)
+ * TCP RST and ACK can be attached to TIME_WAIT.
+ */
+static inline bool sk_listener_or_tw(const struct sock *sk)
+{
+ return (1 << READ_ONCE(sk->sk_state)) &
+ (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT);
+}
+
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
int type);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 19a49af5a9e527ed0371a3bb96e0113755375eac..e10e94fa5f9bc17a086173abaacb57269bc506a8 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -361,8 +361,9 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
* 3) We do not want to rate limit them (eg SYNFLOOD attack),
* especially if the listener set SO_MAX_PACING_RATE
* 4) We pretend they are orphaned
+ * TCP can also associate TIME_WAIT sockets with RST or ACK packets.
*/
- if (!sk || sk_listener(sk)) {
+ if (!sk || sk_listener_or_tw(sk)) {
unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
/* By forcing low order bit to 1, we make sure to not
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH net-next 3/5] net: add skb_set_owner_edemux() helper
2024-10-04 19:16 [PATCH net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
2024-10-04 19:16 ` [PATCH net-next 1/5] net: add TIME_WAIT logic to sk_to_full_sk() Eric Dumazet
2024-10-04 19:16 ` [PATCH net-next 2/5] net_sched: sch_fq: prepare for TIME_WAIT sockets Eric Dumazet
@ 2024-10-04 19:16 ` Eric Dumazet
2024-10-04 19:16 ` [PATCH net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
2024-10-04 19:16 ` [PATCH net-next 5/5] ipv4: " Eric Dumazet
4 siblings, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2024-10-04 19:16 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
This can be used to attach a socket to an skb,
taking a reference on sk->sk_refcnt.
This helper might be a NOP if sk->sk_refcnt is zero.
Use it from tcp_make_synack().
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sock.h | 9 +++++++++
net/core/sock.c | 9 +++------
net/ipv4/tcp_output.c | 2 +-
3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index c868b8b57e3489cd81efafa3856da09397059080..5facbf33316ab0a424b320850198c7cd6bc1a642 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1752,6 +1752,15 @@ void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);
+
+static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ }
+}
#else
#define sock_edemux sock_efree
#endif
diff --git a/net/core/sock.c b/net/core/sock.c
index 039be95c40cf6fa429d33e0f42ee606188045992..0582306bb576291c43c9eb0700130b36ba44cc25 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2594,14 +2594,11 @@ void __sock_wfree(struct sk_buff *skb)
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
- skb->sk = sk;
#ifdef CONFIG_INET
- if (unlikely(!sk_fullsock(sk))) {
- skb->destructor = sock_edemux;
- sock_hold(sk);
- return;
- }
+ if (unlikely(!sk_fullsock(sk)))
+ return skb_set_owner_edemux(skb, sk);
#endif
+ skb->sk = sk;
skb->destructor = sock_wfree;
skb_set_hash_from_sk(skb, sk);
/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4fd746bd4d54f621601b20c3821e71370a4a615a..60c16fa4c9007006ff8ba49b695d227652a2eafb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3731,7 +3731,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
switch (synack_type) {
case TCP_SYNACK_NORMAL:
- skb_set_owner_w(skb, req_to_sk(req));
+ skb_set_owner_edemux(skb, req_to_sk(req));
break;
case TCP_SYNACK_COOKIE:
/* Under synflood, we do not attach skb to a socket,
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH net-next 4/5] ipv6: tcp: give socket pointer to control skbs
2024-10-04 19:16 [PATCH net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
` (2 preceding siblings ...)
2024-10-04 19:16 ` [PATCH net-next 3/5] net: add skb_set_owner_edemux() helper Eric Dumazet
@ 2024-10-04 19:16 ` Eric Dumazet
2024-10-04 19:16 ` [PATCH net-next 5/5] ipv4: " Eric Dumazet
4 siblings, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2024-10-04 19:16 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
tcp_v6_send_response() send orphaned 'control packets'.
These are RST packets and also ACK packets sent from TIME_WAIT.
Some eBPF programs would prefer to have a meaningful skb->sk
pointer as much as possible.
This means that TCP can now attach TIME_WAIT sockets to outgoing
skbs.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv6/tcp_ipv6.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d71ab4e1efe1c6598cf3d3e4334adf0881064ce9..9a15adb8dbcc1e518d43e4e9b563b5bb29ef9430 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -967,6 +967,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
}
if (sk) {
+ /* unconstify the socket only to attach it to buff with care. */
+ skb_set_owner_edemux(buff, (struct sock *)sk);
+
if (sk->sk_state == TCP_TIME_WAIT)
mark = inet_twsk(sk)->tw_mark;
else
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH net-next 5/5] ipv4: tcp: give socket pointer to control skbs
2024-10-04 19:16 [PATCH net-next 0/5] tcp: add skb->sk to more control packets Eric Dumazet
` (3 preceding siblings ...)
2024-10-04 19:16 ` [PATCH net-next 4/5] ipv6: tcp: give socket pointer to control skbs Eric Dumazet
@ 2024-10-04 19:16 ` Eric Dumazet
4 siblings, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2024-10-04 19:16 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni, Martin KaFai Lau
Cc: netdev, eric.dumazet, Eric Dumazet
ip_send_unicast_reply() send orphaned 'control packets'.
These are RST packets and also ACK packets sent from TIME_WAIT.
Some eBPF programs would prefer to have a meaningful skb->sk
pointer as much as possible.
This means that TCP can now attach TIME_WAIT sockets to outgoing
skbs.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/ip.h | 3 ++-
net/ipv4/ip_output.c | 5 ++++-
net/ipv4/tcp_ipv4.c | 4 ++--
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index bab084df15677543b7400bb2832c0e83988884cb..4be0a6a603b2b5d5cfddc045a7d49d0d77be9570 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -288,7 +288,8 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
}
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
+ struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 49811c9281d424bc4b43b8e9075da9fe724773cb..7d42be9e614d343cb9e4e25238c50a715a6f8ce2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1587,7 +1587,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
* Generic function to send a packet as reply to another packet.
* Used to send some TCP resets/acks so far.
*/
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
+ struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
@@ -1653,6 +1654,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
+ if (orig_sk)
+ skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
if (transmit_time)
nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
if (txhash)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5afe5e57c89b5c28dfada2bf2e01fa7b3afa61f0..8260aa9440f9e1e6fb50b4c01854acaccbbdec17 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -907,7 +907,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
ctl_sk->sk_mark = 0;
ctl_sk->sk_priority = 0;
}
- ip_send_unicast_reply(ctl_sk,
+ ip_send_unicast_reply(ctl_sk, sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len,
@@ -1021,7 +1021,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
transmit_time = tcp_transmit_time(sk);
- ip_send_unicast_reply(ctl_sk,
+ ip_send_unicast_reply(ctl_sk, sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len,
--
2.47.0.rc0.187.ge670bccf7e-goog
^ permalink raw reply related [flat|nested] 9+ messages in thread