public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock
@ 2012-07-19 17:34 Eric Dumazet
  2012-07-19 17:36 ` David Miller
  2012-07-21  8:00 ` Hiroaki SHIMODA
  0 siblings, 2 replies; 7+ messages in thread
From: Eric Dumazet @ 2012-07-19 17:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Tom Herbert, Bill Sommerfeld

From: Eric Dumazet <edumazet@google.com>

tcp_v4_send_reset() and tcp_v4_send_ack() use a single socket
per network namespace.

This leads to bad behavior on multiqueue NICS, because many cpus
contend for the socket lock and once socket lock is acquired, extra
false sharing on various socket fields slow down the operations.

To better resist to attacks, we use a percpu socket. Each cpu can
run without contention, using appropriate memory (local node)

Additional features :

1) We also mirror the queue_mapping of the incoming skb, so that
answers use the same queue if possible.

2) Setting SOCK_USE_WRITE_QUEUE socket flag speedup sock_wfree()

3) We now limit the number of in-flight RST/ACK [1] packets
per cpu, instead of per namespace, and we honor the sysctl_wmem_default
limit dynamically. (Prior to this patch, sysctl_wmem_default value was
copied at boot time, so any further change would not affect tcp_sock
limit)


[1] These packets are only generated when no socket was matched for
the incoming packet.

Reported-by: Bill Sommerfeld <wsommerfeld@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
---
v2 : move unicast_sock out of ip_send_unicast_reply() body
     init sk_refcnt to 1, in case some driver get/put a reference on
socket.

 include/net/ip.h         |    2 -
 include/net/netns/ipv4.h |    1 
 net/ipv4/ip_output.c     |   50 +++++++++++++++++++++++--------------
 net/ipv4/tcp_ipv4.c      |    8 ++---
 4 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index ec5cfde..bd5e444 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -158,7 +158,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
 	return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
 }
 
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
 			   __be32 saddr, const struct ip_reply_arg *arg,
 			   unsigned int len);
 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2e089a9..d909c7f 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -38,7 +38,6 @@ struct netns_ipv4 {
 	struct sock		*fibnl;
 
 	struct sock		**icmp_sk;
-	struct sock		*tcp_sock;
 	struct inet_peer_base	*peers;
 	struct tcpm_hash_bucket	*tcp_metrics_hash;
 	unsigned int		tcp_metrics_hash_mask;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cc52679..c528f84 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1463,20 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 
 /*
  *	Generic function to send a packet as reply to another packet.
- *	Used to send TCP resets so far.
+ *	Used to send some TCP resets/acks so far.
  *
- *	Should run single threaded per socket because it uses the sock
- *     	structure to pass arguments.
+ *	Use a fake percpu inet socket to avoid false sharing and contention.
  */
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+	.sk = {
+		.__sk_common = {
+			.skc_refcnt = ATOMIC_INIT(1),
+		},
+		.sk_wmem_alloc	= ATOMIC_INIT(1),
+		.sk_allocation	= GFP_ATOMIC,
+		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
+	},
+	.pmtudisc = IP_PMTUDISC_WANT,
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
 			   __be32 saddr, const struct ip_reply_arg *arg,
 			   unsigned int len)
 {
-	struct inet_sock *inet = inet_sk(sk);
 	struct ip_options_data replyopts;
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
 	struct rtable *rt = skb_rtable(skb);
+	struct sk_buff *nskb;
+	struct sock *sk;
+	struct inet_sock *inet;
 
 	if (ip_options_echo(&replyopts.opt.opt, skb))
 		return;
@@ -1494,38 +1507,39 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
 
 	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
 			   RT_TOS(arg->tos),
-			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
+			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
 			   ip_reply_arg_flowi_flags(arg),
 			   daddr, saddr,
 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
-	rt = ip_route_output_key(sock_net(sk), &fl4);
+	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		return;
 
-	/* And let IP do all the hard work.
+	inet = &get_cpu_var(unicast_sock);
 
-	   This chunk is not reenterable, hence spinlock.
-	   Note that it uses the fact, that this function is called
-	   with locally disabled BH and that sk cannot be already spinlocked.
-	 */
-	bh_lock_sock(sk);
 	inet->tos = arg->tos;
+	sk = &inet->sk;
 	sk->sk_priority = skb->priority;
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
+	sock_net_set(sk, net);
+	__skb_queue_head_init(&sk->sk_write_queue);
+	sk->sk_sndbuf = sysctl_wmem_default;
 	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
 		       &ipc, &rt, MSG_DONTWAIT);
-	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+	nskb = skb_peek(&sk->sk_write_queue);
+	if (nskb) {
 		if (arg->csumoffset >= 0)
-			*((__sum16 *)skb_transport_header(skb) +
-			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
+			*((__sum16 *)skb_transport_header(nskb) +
+			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
 								arg->csum));
-		skb->ip_summed = CHECKSUM_NONE;
+		nskb->ip_summed = CHECKSUM_NONE;
+		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
 		ip_push_pending_frames(sk, &fl4);
 	}
 
-	bh_unlock_sock(sk);
+	put_cpu_var(unicast_sock);
 
 	ip_rt_put(rt);
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9caf5c..d7d2fa5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -688,7 +688,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 
 	net = dev_net(skb_dst(skb)->dev);
 	arg.tos = ip_hdr(skb)->tos;
-	ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -771,7 +771,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 	if (oif)
 		arg.bound_dev_if = oif;
 	arg.tos = tos;
-	ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -2624,13 +2624,11 @@ EXPORT_SYMBOL(tcp_prot);
 
 static int __net_init tcp_sk_init(struct net *net)
 {
-	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
-				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
+	return 0;
 }
 
 static void __net_exit tcp_sk_exit(struct net *net)
 {
-	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock
  2012-07-19 17:34 [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock Eric Dumazet
@ 2012-07-19 17:36 ` David Miller
  2012-07-19 17:49   ` Eric Dumazet
  2012-07-21  8:00 ` Hiroaki SHIMODA
  1 sibling, 1 reply; 7+ messages in thread
From: David Miller @ 2012-07-19 17:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, therbert, wsommerfeld

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 19 Jul 2012 19:34:03 +0200

> v2 : move unicast_sock out of ip_send_unicast_reply() body
>      init sk_refcnt to 1, in case some driver get/put a reference on
>      socket.

The compiler seems much happier with this, applied, thanks Eric :-)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock
  2012-07-19 17:36 ` David Miller
@ 2012-07-19 17:49   ` Eric Dumazet
  2012-07-19 17:51     ` David Miller
  0 siblings, 1 reply; 7+ messages in thread
From: Eric Dumazet @ 2012-07-19 17:49 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, therbert, wsommerfeld

On Thu, 2012-07-19 at 10:36 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 19 Jul 2012 19:34:03 +0200
> 
> > v2 : move unicast_sock out of ip_send_unicast_reply() body
> >      init sk_refcnt to 1, in case some driver get/put a reference on
> >      socket.
> 
> The compiler seems much happier with this, applied, thanks Eric :-)

Maybe I should install your compiler ;) What is the version you
currently use ?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock
  2012-07-19 17:49   ` Eric Dumazet
@ 2012-07-19 17:51     ` David Miller
  0 siblings, 0 replies; 7+ messages in thread
From: David Miller @ 2012-07-19 17:51 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, therbert, wsommerfeld

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 19 Jul 2012 19:49:57 +0200

> On Thu, 2012-07-19 at 10:36 -0700, David Miller wrote:
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Thu, 19 Jul 2012 19:34:03 +0200
>> 
>> > v2 : move unicast_sock out of ip_send_unicast_reply() body
>> >      init sk_refcnt to 1, in case some driver get/put a reference on
>> >      socket.
>> 
>> The compiler seems much happier with this, applied, thanks Eric :-)
> 
> Maybe I should install your compiler ;) What is the version you
> currently use ?

On x86-64 I use:

[davem@dokdo net-next]$ gcc --version
gcc (GCC) 4.7.0 20120507 (Red Hat 4.7.0-5)
Copyright (C) 2012 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

and on sparc64 I use gcc-4.7.1 vanilla.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock
  2012-07-19 17:34 [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock Eric Dumazet
  2012-07-19 17:36 ` David Miller
@ 2012-07-21  8:00 ` Hiroaki SHIMODA
  2012-07-21  8:28   ` Eric Dumazet
  1 sibling, 1 reply; 7+ messages in thread
From: Hiroaki SHIMODA @ 2012-07-21  8:00 UTC (permalink / raw)
  To: eric.dumazet; +Cc: davem, netdev, therbert, wsommerfeld

On Thu, 19 Jul 2012 19:34:03 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
> +	.sk = {
> +		.__sk_common = {
> +			.skc_refcnt = ATOMIC_INIT(1),
> +		},
> +		.sk_wmem_alloc	= ATOMIC_INIT(1),
> +		.sk_allocation	= GFP_ATOMIC,
> +		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
> +	},
> +	.pmtudisc = IP_PMTUDISC_WANT,
> +};

RST packet generated from unicast_sock have 0 TTL value.
I think ".uc_ttl = -1" is needed in above initialization.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock
  2012-07-21  8:00 ` Hiroaki SHIMODA
@ 2012-07-21  8:28   ` Eric Dumazet
  2012-07-22 19:08     ` David Miller
  0 siblings, 1 reply; 7+ messages in thread
From: Eric Dumazet @ 2012-07-21  8:28 UTC (permalink / raw)
  To: Hiroaki SHIMODA; +Cc: davem, netdev, therbert, wsommerfeld


On Sat, 2012-07-21 at 17:00 +0900, Hiroaki SHIMODA wrote:
> On Thu, 19 Jul 2012 19:34:03 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
> > +	.sk = {
> > +		.__sk_common = {
> > +			.skc_refcnt = ATOMIC_INIT(1),
> > +		},
> > +		.sk_wmem_alloc	= ATOMIC_INIT(1),
> > +		.sk_allocation	= GFP_ATOMIC,
> > +		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
> > +	},
> > +	.pmtudisc = IP_PMTUDISC_WANT,
> > +};
> 
> RST packet generated from unicast_sock have 0 TTL value.
> I think ".uc_ttl = -1" is needed in above initialization.

Good catch, thanks a lot !

[PATCH net-next] ipv4: tcp: set unicast_sock uc_ttl to -1

Set unicast_sock uc_ttl to -1 so that we select the right ttl,
instead of sending packets with a 0 ttl.

Bug added in commit be9f4a44e7d4 (ipv4: tcp: remove per net tcp_sock)

Signed-off-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/ip_output.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c528f84..665abbb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1476,7 +1476,8 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
 		.sk_allocation	= GFP_ATOMIC,
 		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
 	},
-	.pmtudisc = IP_PMTUDISC_WANT,
+	.pmtudisc	= IP_PMTUDISC_WANT,
+	.uc_ttl		= -1,
 };
 
 void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock
  2012-07-21  8:28   ` Eric Dumazet
@ 2012-07-22 19:08     ` David Miller
  0 siblings, 0 replies; 7+ messages in thread
From: David Miller @ 2012-07-22 19:08 UTC (permalink / raw)
  To: eric.dumazet; +Cc: shimoda.hiroaki, netdev, therbert, wsommerfeld

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 21 Jul 2012 10:28:51 +0200

> [PATCH net-next] ipv4: tcp: set unicast_sock uc_ttl to -1
> 
> Set unicast_sock uc_ttl to -1 so that we select the right ttl,
> instead of sending packets with a 0 ttl.
> 
> Bug added in commit be9f4a44e7d4 (ipv4: tcp: remove per net tcp_sock)
> 
> Signed-off-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

APplied, thanks everyone.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2012-07-22 19:08 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-07-19 17:34 [PATCH net-next v2] ipv4: tcp: remove per net tcp_sock Eric Dumazet
2012-07-19 17:36 ` David Miller
2012-07-19 17:49   ` Eric Dumazet
2012-07-19 17:51     ` David Miller
2012-07-21  8:00 ` Hiroaki SHIMODA
2012-07-21  8:28   ` Eric Dumazet
2012-07-22 19:08     ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox