netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v1] tcp: Add mark for TIMEWAIT sockets
@ 2018-05-10  5:21 Jon Maxwell
  2018-05-10  5:45 ` Eric Dumazet
  0 siblings, 1 reply; 3+ messages in thread
From: Jon Maxwell @ 2018-05-10  5:21 UTC (permalink / raw)
  To: davem; +Cc: kuznet, yoshfuji, netdev, linux-kernel, jmaxwell

This version has some suggestions by Eric Dumazet:

- Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP 
races. 
- Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark"
statement. 

Aidan McGurn from Openwave Mobility systems reported the following bug:

"Marked routing is broken on customer deployment. Its effects are large 
increase in Uplink retransmissions caused by the client never receiving 
the final ACK to their FINACK - this ACK misses the mark and routes out 
of the incorrect route."

Currently marks are added to sk_buffs for replies when the "fwmark_reflect" 
sysctl is enabled. But not for TW sockets that had sk->sk_mark set via 
setsockopt(SO_MARK..).  

Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the the 
original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark location. 
Then progate this so that the skb gets sent with the correct mark. Do the same 
for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so that
netfilter rules are still honored.

Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
---
 include/net/inet_timewait_sock.h |  1 +
 net/ipv4/ip_output.c             |  2 +-
 net/ipv4/tcp_ipv4.c              | 18 ++++++++++++++++--
 net/ipv4/tcp_minisocks.c         |  1 +
 net/ipv6/tcp_ipv6.c              |  7 ++++++-
 5 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c7be1ca8e562..659d8ed5a3bc 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -62,6 +62,7 @@ struct inet_timewait_sock {
 #define tw_dr			__tw_common.skc_tw_dr
 
 	int			tw_timeout;
+	__u32			tw_mark;
 	volatile unsigned char	tw_substate;
 	unsigned char		tw_rcv_wscale;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 95adb171f852..b5e21eb198d8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1561,7 +1561,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 		oif = skb->skb_iif;
 
 	flowi4_init_output(&fl4, oif,
-			   IP4_REPLY_MARK(net, skb->mark),
+			   IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
 			   RT_TOS(arg->tos),
 			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
 			   ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f70586b50838..fbee36579c83 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	struct sock *sk1 = NULL;
 #endif
 	struct net *net;
+	struct sock *ctl_sk;
 
 	/* Never send a reset in response to a reset. */
 	if (th->rst)
@@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.tos = ip_hdr(skb)->tos;
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
+	else if (sk && sk_fullsock(sk))
+		ctl_sk->sk_mark = sk->sk_mark;
+	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
+	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 	local_bh_enable();
@@ -759,6 +766,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	} rep;
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
+	struct sock *ctl_sk;
 
 	memset(&rep.th, 0, sizeof(struct tcphdr));
 	memset(&arg, 0, sizeof(arg));
@@ -809,11 +817,17 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.tos = tos;
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
+	else if (sk && sk_fullsock(sk))
+		ctl_sk->sk_mark = sk->sk_mark;
+	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
+	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	local_bh_enable();
 }
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 57b5468b5139..f867658b4b30 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		struct inet_sock *inet = inet_sk(sk);
 
 		tw->tw_transparent	= inet->transparent;
+		tw->tw_mark		= sk->sk_mark;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6d664d83cd16..9b1ecdc1722e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	unsigned int tot_len = sizeof(struct tcphdr);
 	struct dst_entry *dst;
 	__be32 *topt;
+	__u32 mark = 0;
 
 	if (tsecr)
 		tot_len += TCPOLEN_TSTAMP_ALIGNED;
@@ -871,7 +872,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		fl6.flowi6_oif = oif;
 	}
 
-	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		mark = inet_twsk(sk)->tw_mark;
+	else if (sk && sk_fullsock(sk))
+		mark = sk->sk_mark;
+	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
 	fl6.fl6_dport = t1->dest;
 	fl6.fl6_sport = t1->source;
 	fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH net-next v1] tcp: Add mark for TIMEWAIT sockets
  2018-05-10  5:21 [PATCH net-next v1] tcp: Add mark for TIMEWAIT sockets Jon Maxwell
@ 2018-05-10  5:45 ` Eric Dumazet
  2018-05-10  6:14   ` Jonathan Maxwell
  0 siblings, 1 reply; 3+ messages in thread
From: Eric Dumazet @ 2018-05-10  5:45 UTC (permalink / raw)
  To: Jon Maxwell, davem; +Cc: kuznet, yoshfuji, netdev, linux-kernel, jmaxwell



On 05/09/2018 10:21 PM, Jon Maxwell wrote:

...

>  	if (th->rst)
> @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	arg.tos = ip_hdr(skb)->tos;
>  	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
>  	local_bh_disable();
> -	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
> +	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
> +	if (sk && sk->sk_state == TCP_TIME_WAIT)
> +		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
> +	else if (sk && sk_fullsock(sk))

I do not believe we could have a non fullsock here ?

A request socket (SYN_RECV state) at this point is not expected. 


So you can factorize :

if (sk)
        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			  inet_twsk(sk)->tw_mark : sk->sk_mark;

(same remark for IPv6)


> +		ctl_sk->sk_mark = sk->sk_mark;
> +	ip_send_unicast_reply(ctl_sk,
>  			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
>  			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>  			      &arg, arg.iov[0].iov_len);

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH net-next v1] tcp: Add mark for TIMEWAIT sockets
  2018-05-10  5:45 ` Eric Dumazet
@ 2018-05-10  6:14   ` Jonathan Maxwell
  0 siblings, 0 replies; 3+ messages in thread
From: Jonathan Maxwell @ 2018-05-10  6:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, kuznet, yoshfuji, netdev, linux-kernel, Jon Maxwell

On Thu, May 10, 2018 at 3:45 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
> On 05/09/2018 10:21 PM, Jon Maxwell wrote:
>
> ...
>
>>       if (th->rst)
>> @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>>       arg.tos = ip_hdr(skb)->tos;
>>       arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
>>       local_bh_disable();
>> -     ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
>> +     ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
>> +     if (sk && sk->sk_state == TCP_TIME_WAIT)
>> +             ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
>> +     else if (sk && sk_fullsock(sk))
>
> I do not believe we could have a non fullsock here ?
>

Okay thanks I'll make these changes to v2.

> A request socket (SYN_RECV state) at this point is not expected.
>
>
> So you can factorize :
>
> if (sk)
>         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
>                           inet_twsk(sk)->tw_mark : sk->sk_mark;
>
> (same remark for IPv6)
>
>
>> +             ctl_sk->sk_mark = sk->sk_mark;
>> +     ip_send_unicast_reply(ctl_sk,
>>                             skb, &TCP_SKB_CB(skb)->header.h4.opt,
>>                             ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>>                             &arg, arg.iov[0].iov_len);
>
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2018-05-10  6:14 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-05-10  5:21 [PATCH net-next v1] tcp: Add mark for TIMEWAIT sockets Jon Maxwell
2018-05-10  5:45 ` Eric Dumazet
2018-05-10  6:14   ` Jonathan Maxwell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).