Netdev List
 help / color / mirror / Atom feed
* [PATCH v3] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  4:49 UTC (permalink / raw)
  To: netdev


Input packet processing for local sockets involves two major demuxes.
One for the route and one for the socket.

But we can optimize this down to one demux for certain kinds of local
sockets.

Currently we only do this for established TCP sockets, but it could
at least in theory be expanded to other kinds of connections.

If a TCP socket is established then its identity is fully specified.

This means that whatever input route was used during the three-way
handshake must work equally well for the rest of the connection since
the keys will not change.

Once we move to established state, we cache the receive packet's input
route to use later.

Like the existing cached route in sk->sk_dst_cache used for output
packets, we have to check for route invalidations using dst->obsolete
and dst->ops->check().

Early demux occurs outside of a socket locked section, so when a route
invalidation occurs we defer the fixup of sk->sk_rx_dst until we are
actually inside of established state packet processing and thus have
the socket locked.

Signed-off-by: David S. Miller <davem@davemloft.net>
---

This is the final version I pushed into net-next, it's just respun
with the MAX_INET_PROTOS hash masking removed.

 include/net/inet_hashtables.h |    4 ++--
 include/net/protocol.h        |    1 +
 include/net/sock.h            |    2 ++
 include/net/tcp.h             |    1 +
 net/core/sock.c               |    5 +++++
 net/ipv4/af_inet.c            |   18 +++++++++-------
 net/ipv4/ip_input.c           |   39 ++++++++++++++++++++++------------
 net/ipv4/tcp_input.c          |   16 +++++++++++++-
 net/ipv4/tcp_ipv4.c           |   46 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_minisocks.c      |    2 ++
 10 files changed, 110 insertions(+), 24 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 808fc5f..54be028 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 					     const __be16 sport,
 					     const __be16 dport)
 {
-	struct sock *sk;
+	struct sock *sk = skb_steal_sock(skb);
 	const struct iphdr *iph = ip_hdr(skb);
 
-	if (unlikely(sk = skb_steal_sock(skb)))
+	if (sk)
 		return sk;
 	else
 		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
diff --git a/include/net/protocol.h b/include/net/protocol.h
index a1b1b53..967b926 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,6 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index 4a45216..87b424a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -319,6 +319,7 @@ struct sock {
 	unsigned long 		sk_flags;
 	struct dst_entry	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
+	struct dst_entry	*sk_rx_dst;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
@@ -1426,6 +1427,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
+extern void			sock_edemux(struct sk_buff *skb);
 
 extern int			sock_setsockopt(struct socket *sock, int level,
 						int op, char __user *optval,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9332f34..6660ffc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e5b71f..929bdcc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_rfree);
 
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
 
 int sock_i_uid(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 85a3b17..07a02f6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
 
 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -1518,14 +1519,15 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-	.handler =	tcp_v4_rcv,
-	.err_handler =	tcp_v4_err,
-	.gso_send_check = tcp_v4_gso_send_check,
-	.gso_segment =	tcp_tso_segment,
-	.gro_receive =	tcp4_gro_receive,
-	.gro_complete =	tcp4_gro_complete,
-	.no_policy =	1,
-	.netns_ok =	1,
+	.early_demux	=	tcp_v4_early_demux,
+	.handler	=	tcp_v4_rcv,
+	.err_handler	=	tcp_v4_err,
+	.gso_send_check	=	tcp_v4_gso_send_check,
+	.gso_segment	=	tcp_tso_segment,
+	.gro_receive	=	tcp4_gro_receive,
+	.gro_complete	=	tcp4_gro_complete,
+	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c4fe1d2..93b092c 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -323,19 +323,32 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
-		if (unlikely(err)) {
-			if (err == -EHOSTUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INADDRERRORS);
-			else if (err == -ENETUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INNOROUTES);
-			else if (err == -EXDEV)
-				NET_INC_STATS_BH(dev_net(skb->dev),
-						 LINUX_MIB_IPRPFILTER);
-			goto drop;
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+		int err;
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[protocol]);
+		err = -ENOENT;
+		if (ipprot && ipprot->early_demux)
+			err = ipprot->early_demux(skb);
+		rcu_read_unlock();
+
+		if (err) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb->dev);
+			if (unlikely(err)) {
+				if (err == -EHOSTUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INADDRERRORS);
+				else if (err == -ENETUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INNOROUTES);
+				else if (err == -EXDEV)
+					NET_INC_STATS_BH(dev_net(skb->dev),
+							 LINUX_MIB_IPRPFILTER);
+				goto drop;
+			}
 		}
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8..8416f8a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	int res;
 
+	if (sk->sk_rx_dst) {
+		struct dst_entry *dst = sk->sk_rx_dst;
+		if (unlikely(dst->obsolete)) {
+			if (dst->ops->check(dst, 0) == NULL) {
+				dst_release(dst);
+				sk->sk_rx_dst = NULL;
+			}
+		}
+	}
+	if (unlikely(sk->sk_rx_dst == NULL))
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 	/*
 	 *	Header prediction.
 	 *	The code loosely follows the one in the famous
@@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
-	if (skb != NULL)
+	if (skb != NULL) {
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
 		security_inet_conn_established(sk, skb);
+	}
 
 	/* Make sure socket is routed, for correct metrics.  */
 	icsk->icsk_af_ops->rebuild_header(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca1..13857df 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,6 +1671,52 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct sock *sk;
+	int err;
+
+	err = -ENOENT;
+	if (skb->pkt_type != PACKET_HOST)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+		goto out_err;
+
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+		goto out_err;
+
+	sk = __inet_lookup_established(net, &tcp_hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, th->dest,
+				       skb->dev->ifindex);
+	if (sk) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+		if (sk->sk_state != TCP_TIME_WAIT) {
+			struct dst_entry *dst = sk->sk_rx_dst;
+			if (dst)
+				dst = dst_check(dst, 0);
+			if (dst) {
+				skb_dst_set_noref(skb, dst);
+				err = 0;
+			}
+		}
+	}
+
+out_err:
+	return err;
+}
+
 /*
  *	From tcp_input.c
  */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb01531..72b7c63 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
+		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
 		 * copy any s_data_payload stored at the original socket.
-- 
1.7.10

^ permalink raw reply related

* Re: linux-next: build failure after merge of the net-next tree
From: David Miller @ 2012-06-20  4:47 UTC (permalink / raw)
  To: bhupesh.sharma
  Cc: sfr, netdev, linux-next, linux-kernel, federico.vaga,
	giancarlo.asnaghi, wg, mkl
In-Reply-To: <D5ECB3C7A6F99444980976A8C6D896384FAA275CBE@EAPEX1MAIL1.st.com>

From: Bhupesh SHARMA <bhupesh.sharma@st.com>
Date: Wed, 20 Jun 2012 12:45:46 +0800

> So, would adding a check in Kconfig for HAVE_CLK be a proper
> solution?  But that will limit the compilation of this driver to
> only platforms which are ARM based.
> 
> One may need to support this driver on x86 like platforms also..

Then x86 will need to provide clock operations, or there needs to
be dummy ones for such platforms.

This isn't rocket science.

^ permalink raw reply

* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  4:46 UTC (permalink / raw)
  To: shemminger; +Cc: netdev
In-Reply-To: <20120619193549.13bcffa7@s6510.linuxnetplumber.net>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 19 Jun 2012 19:35:49 -0700

> Any benchmark numbers?

Measuring the path from ip_rcv_finish() to where we lock the socket in
tcp_v4_rcv(), on a SPARC-T3, with a pre-warmed routing cache:

Both sk and RT lookup:	~4200 cycles
Optimized early demux:	~2800 cycles

These numbers can be decreased further: since we're already
looking at the TCP header, we can pre-cook the TCP control block in the
SKB and skip much of the stuff that tcp_v4_rcv() does, as we've done
it already in the early demux code.

> I think the number of ref count operations per packet is going to be
> the next line in the sand.

There is only one, for the socket.  We haven't taken a reference on the
route for years.

^ permalink raw reply

* RE: linux-next: build failure after merge of the net-next tree
From: Bhupesh SHARMA @ 2012-06-20  4:45 UTC (permalink / raw)
  To: David Miller
  Cc: sfr@canb.auug.org.au, netdev@vger.kernel.org,
	linux-next@vger.kernel.org, linux-kernel@vger.kernel.org,
	federico.vaga@gmail.com, Giancarlo ASNAGHI, wg@grandegger.com,
	mkl@pengutronix.de
In-Reply-To: <20120619.213703.513466167861199217.davem@davemloft.net>

Hi David,

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Wednesday, June 20, 2012 10:07 AM
> To: Bhupesh SHARMA
> Cc: sfr@canb.auug.org.au; netdev@vger.kernel.org; linux-
> next@vger.kernel.org; linux-kernel@vger.kernel.org;
> federico.vaga@gmail.com; Giancarlo ASNAGHI; wg@grandegger.com;
> mkl@pengutronix.de
> Subject: Re: linux-next: build failure after merge of the net-next tree
> 
> From: Bhupesh SHARMA <bhupesh.sharma@st.com>
> Date: Wed, 20 Jun 2012 12:27:11 +0800
> 
> > clk_get/clk_put* variants are usually used by ARM platforms.
> > Protecting their calls under macro 'CONFIG_HAVE_CLK' should solve the
> problem.
> 
> No, we don't pepper foo.c files with crappy ifdefs.

So, would adding a check in Kconfig for HAVE_CLK be a proper solution?
But that will limit the compilation of this driver to only platforms which are ARM based.

One may need to support this driver on x86 like platforms also..

Regards,
Bhupesh

^ permalink raw reply

* Re: linux-next: build failure after merge of the net-next tree
From: David Miller @ 2012-06-20  4:37 UTC (permalink / raw)
  To: bhupesh.sharma
  Cc: sfr, netdev, linux-next, linux-kernel, federico.vaga,
	giancarlo.asnaghi, wg, mkl
In-Reply-To: <D5ECB3C7A6F99444980976A8C6D896384FAA275CA8@EAPEX1MAIL1.st.com>

From: Bhupesh SHARMA <bhupesh.sharma@st.com>
Date: Wed, 20 Jun 2012 12:27:11 +0800

> clk_get/clk_put* variants are usually used by ARM platforms.
> Protecting their calls under macro 'CONFIG_HAVE_CLK' should solve the problem.

No, we don't pepper foo.c files with crappy ifdefs.

^ permalink raw reply

* RE: linux-next: build failure after merge of the net-next tree
From: Bhupesh SHARMA @ 2012-06-20  4:27 UTC (permalink / raw)
  To: Stephen Rothwell, David Miller, netdev@vger.kernel.org
  Cc: linux-next@vger.kernel.org, linux-kernel@vger.kernel.org,
	Federico Vaga, Giancarlo ASNAGHI, Wolfgang Grandegger,
	Marc Kleine-Budde
In-Reply-To: <20120620133348.a9ff31a72ffd11818f574fe3@canb.auug.org.au>

Hi,

> -----Original Message-----
> From: Stephen Rothwell [mailto:sfr@canb.auug.org.au]
> Sent: Wednesday, June 20, 2012 9:04 AM
> To: David Miller; netdev@vger.kernel.org
> Cc: linux-next@vger.kernel.org; linux-kernel@vger.kernel.org; Federico
> Vaga; Giancarlo ASNAGHI; Wolfgang Grandegger; Bhupesh SHARMA; Marc
> Kleine-Budde
> Subject: linux-next: build failure after merge of the net-next tree
> 
> Hi all,
> 
> After merging the net-next tree, today's linux-next build (x86_64
> allmodconfig) failed like this:
> 
> ERROR: "clk_get_rate" [drivers/net/can/c_can/c_can_pci.ko] undefined!
> ERROR: "clk_get" [drivers/net/can/c_can/c_can_pci.ko] undefined!
> ERROR: "clk_put" [drivers/net/can/c_can/c_can_pci.ko] undefined!
> 
> Caused by commit 5b92da0443c2 ("c_can_pci: generic module for
> C_CAN/D_CAN on PCI").  Maybe a missing Kconfig dependency/select?
> 
> I have used the net-next tree from next-20120619 for today.
> --

clk_get/clk_put* variants are usually used by ARM platforms.
Protecting their calls under macro 'CONFIG_HAVE_CLK' should solve the problem.

See [1] for how it is done in c_can_platform.c

Could you possibly add these checks and send a patch for the same?
It should be fairly simple.

[1] http://lxr.linux.no/linux+v3.4.3/drivers/net/can/c_can/c_can_platform.c#L68

Regards,
Bhupesh

^ permalink raw reply

* Re: linux-next: build failure after merge of the net-next tree
From: David Miller @ 2012-06-20  4:24 UTC (permalink / raw)
  To: sfr
  Cc: netdev, linux-next, linux-kernel, federico.vaga,
	giancarlo.asnaghi, wg, bhupesh.sharma, mkl
In-Reply-To: <20120620133348.a9ff31a72ffd11818f574fe3@canb.auug.org.au>

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 20 Jun 2012 13:33:48 +1000

> After merging the net-next tree, today's linux-next build (x86_64
> allmodconfig) failed like this:
> 
> ERROR: "clk_get_rate" [drivers/net/can/c_can/c_can_pci.ko] undefined!
> ERROR: "clk_get" [drivers/net/can/c_can/c_can_pci.ko] undefined!
> ERROR: "clk_put" [drivers/net/can/c_can/c_can_pci.ko] undefined!
> 
> Caused by commit 5b92da0443c2 ("c_can_pci: generic module for C_CAN/D_CAN
> on PCI").  Maybe a missing Kconfig dependency/select?
> 
> I have used the net-next tree from next-20120619 for today.

Known problem:

http://marc.info/?l=linux-netdev&m=134014347620836&w=2

^ permalink raw reply

* linux-next: build failure after merge of the net-next tree
From: Stephen Rothwell @ 2012-06-20  3:33 UTC (permalink / raw)
  To: David Miller, netdev
  Cc: linux-next, linux-kernel, Federico Vaga, Giancarlo Asnaghi,
	Wolfgang Grandegger, Bhupesh Sharma, Marc Kleine-Budde

[-- Attachment #1: Type: text/plain, Size: 583 bytes --]

Hi all,

After merging the net-next tree, today's linux-next build (x86_64
allmodconfig) failed like this:

ERROR: "clk_get_rate" [drivers/net/can/c_can/c_can_pci.ko] undefined!
ERROR: "clk_get" [drivers/net/can/c_can/c_can_pci.ko] undefined!
ERROR: "clk_put" [drivers/net/can/c_can/c_can_pci.ko] undefined!

Caused by commit 5b92da0443c2 ("c_can_pci: generic module for C_CAN/D_CAN
on PCI").  Maybe a missing Kconfig dependency/select?

I have used the net-next tree from next-20120619 for today.
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* linux-next: manual merge of the net-next tree with the net tree
From: Stephen Rothwell @ 2012-06-20  3:18 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: linux-next, linux-kernel, Thomas Graf

[-- Attachment #1: Type: text/plain, Size: 1553 bytes --]

Hi all,

Today's linux-next merge of the net-next tree got a conflict in
net/ipv6/route.c between commit d189634ecab9 ("ipv6: Move ipv6 proc file
registration to end of init order") from the net tree and commit
c3426b47190d ("inet: Initialize per-netns inetpeer roots in net/ipv
{4,6}/route.c") from the net-next tree.

I fixed it up (I think - see below) and can carry the fix as necessary.
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

diff --cc net/ipv6/route.c
index becb048,e649cd7..0000000
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@@ -3007,11 -2929,31 +2938,36 @@@ static struct pernet_operations ip6_rou
  	.exit = ip6_route_net_exit,
  };
  
 +static struct pernet_operations ip6_route_net_late_ops = {
 +	.init = ip6_route_net_init_late,
 +	.exit = ip6_route_net_exit_late,
 +};
 +
+ static int __net_init ipv6_inetpeer_init(struct net *net)
+ {
+ 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+ 
+ 	if (!bp)
+ 		return -ENOMEM;
+ 	inet_peer_base_init(bp);
+ 	net->ipv6.peers = bp;
+ 	return 0;
+ }
+ 
+ static void __net_exit ipv6_inetpeer_exit(struct net *net)
+ {
+ 	struct inet_peer_base *bp = net->ipv6.peers;
+ 
+ 	net->ipv6.peers = NULL;
+ 	inetpeer_invalidate_tree(bp);
+ 	kfree(bp);
+ }
+ 
+ static struct pernet_operations ipv6_inetpeer_ops = {
+ 	.init	=	ipv6_inetpeer_init,
+ 	.exit	=	ipv6_inetpeer_exit,
+ };
+ 
  static struct notifier_block ip6_route_dev_notifier = {
  	.notifier_call = ip6_route_dev_notify,
  	.priority = 0,

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Stephen Hemminger @ 2012-06-20  2:35 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120619.163911.2094057156011157978.davem@davemloft.net>

On Tue, 19 Jun 2012 16:39:11 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> 
> Input packet processing for local sockets involves two major demuxes.
> One for the route and one for the socket.
> 
> But we can optimize this down to one demux for certain kinds of local
> sockets.
> 
> Currently we only do this for established TCP sockets, but it could
> at least in theory be expanded to other kinds of connections.
> 
> If a TCP socket is established then its identity is fully specified.
> 
> This means that whatever input route was used during the three-way
> handshake must work equally well for the rest of the connection since
> the keys will not change.
> 
> Once we move to established state, we cache the receive packet's input
> route to use later.
> 
> Like the existing cached route in sk->sk_dst_cache used for output
> packets, we have to check for route invalidations using dst->obsolete
> and dst->ops->check().
> 
> Early demux occurs outside of a socket locked section, so when a route
> invalidation occurs we defer the fixup of sk->sk_rx_dst until we are
> actually inside of established state packet processing and thus have
> the socket locked.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
> 
> Changes since v1:
> 
> 1) Remove unlikely() from __inet_lookup_skb()
> 
> 2) Check for cached route invalidations.
> 
> 3) Hook up RX dst when outgoing connection moved to established too,
>    previously it was only handling incoming connections.
> 
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index 808fc5f..54be028 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
>  					     const __be16 sport,
>  					     const __be16 dport)
>  {
> -	struct sock *sk;
> +	struct sock *sk = skb_steal_sock(skb);
>  	const struct iphdr *iph = ip_hdr(skb);
>  
> -	if (unlikely(sk = skb_steal_sock(skb)))
> +	if (sk)
>  		return sk;
>  	else
>  		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
> diff --git a/include/net/protocol.h b/include/net/protocol.h
> index 875f489..6c47bf8 100644
> --- a/include/net/protocol.h
> +++ b/include/net/protocol.h
> @@ -34,6 +34,7 @@
>  
>  /* This is used to register protocols. */
>  struct net_protocol {
> +	int			(*early_demux)(struct sk_buff *skb);
>  	int			(*handler)(struct sk_buff *skb);
>  	void			(*err_handler)(struct sk_buff *skb, u32 info);
>  	int			(*gso_send_check)(struct sk_buff *skb);
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 4a45216..87b424a 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -319,6 +319,7 @@ struct sock {
>  	unsigned long 		sk_flags;
>  	struct dst_entry	*sk_dst_cache;
>  	spinlock_t		sk_dst_lock;
> +	struct dst_entry	*sk_rx_dst;
>  	atomic_t		sk_wmem_alloc;
>  	atomic_t		sk_omem_alloc;
>  	int			sk_sndbuf;
> @@ -1426,6 +1427,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
>  					      gfp_t priority);
>  extern void			sock_wfree(struct sk_buff *skb);
>  extern void			sock_rfree(struct sk_buff *skb);
> +extern void			sock_edemux(struct sk_buff *skb);
>  
>  extern int			sock_setsockopt(struct socket *sock, int level,
>  						int op, char __user *optval,
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 9332f34..6660ffc 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
>  
>  extern void tcp_shutdown (struct sock *sk, int how);
>  
> +extern int tcp_v4_early_demux(struct sk_buff *skb);
>  extern int tcp_v4_rcv(struct sk_buff *skb);
>  
>  extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 9e5b71f..929bdcc 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
>  }
>  EXPORT_SYMBOL(sock_rfree);
>  
> +void sock_edemux(struct sk_buff *skb)
> +{
> +	sock_put(skb->sk);
> +}
> +EXPORT_SYMBOL(sock_edemux);
>  
>  int sock_i_uid(struct sock *sk)
>  {
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index e4e8e00..a2bd2d2 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
>  
>  	kfree(rcu_dereference_protected(inet->inet_opt, 1));
>  	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
> +	dst_release(sk->sk_rx_dst);
>  	sk_refcnt_debug_dec(sk);
>  }
>  EXPORT_SYMBOL(inet_sock_destruct);
> @@ -1520,14 +1521,15 @@ static const struct net_protocol igmp_protocol = {
>  #endif
>  
>  static const struct net_protocol tcp_protocol = {
> -	.handler =	tcp_v4_rcv,
> -	.err_handler =	tcp_v4_err,
> -	.gso_send_check = tcp_v4_gso_send_check,
> -	.gso_segment =	tcp_tso_segment,
> -	.gro_receive =	tcp4_gro_receive,
> -	.gro_complete =	tcp4_gro_complete,
> -	.no_policy =	1,
> -	.netns_ok =	1,
> +	.early_demux	=	tcp_v4_early_demux,
> +	.handler	=	tcp_v4_rcv,
> +	.err_handler	=	tcp_v4_err,
> +	.gso_send_check	=	tcp_v4_gso_send_check,
> +	.gso_segment	=	tcp_tso_segment,
> +	.gro_receive	=	tcp4_gro_receive,
> +	.gro_complete	=	tcp4_gro_complete,
> +	.no_policy	=	1,
> +	.netns_ok	=	1,
>  };
>  
>  static const struct net_protocol udp_protocol = {
> diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
> index 8590144..cb883e1 100644
> --- a/net/ipv4/ip_input.c
> +++ b/net/ipv4/ip_input.c
> @@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
>  	 *	how the packet travels inside Linux networking.
>  	 */
>  	if (skb_dst(skb) == NULL) {
> -		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> -					       iph->tos, skb->dev);
> -		if (unlikely(err)) {
> -			if (err == -EHOSTUNREACH)
> -				IP_INC_STATS_BH(dev_net(skb->dev),
> -						IPSTATS_MIB_INADDRERRORS);
> -			else if (err == -ENETUNREACH)
> -				IP_INC_STATS_BH(dev_net(skb->dev),
> -						IPSTATS_MIB_INNOROUTES);
> -			else if (err == -EXDEV)
> -				NET_INC_STATS_BH(dev_net(skb->dev),
> -						 LINUX_MIB_IPRPFILTER);
> -			goto drop;
> +		const struct net_protocol *ipprot;
> +		int protocol = iph->protocol;
> +		int hash, err;
> +
> +		hash = protocol & (MAX_INET_PROTOS - 1);
> +
> +		rcu_read_lock();
> +		ipprot = rcu_dereference(inet_protos[hash]);
> +		err = -ENOENT;
> +		if (ipprot && ipprot->early_demux)
> +			err = ipprot->early_demux(skb);
> +		rcu_read_unlock();
> +
> +		if (err) {
> +			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> +						   iph->tos, skb->dev);
> +			if (unlikely(err)) {
> +				if (err == -EHOSTUNREACH)
> +					IP_INC_STATS_BH(dev_net(skb->dev),
> +							IPSTATS_MIB_INADDRERRORS);
> +				else if (err == -ENETUNREACH)
> +					IP_INC_STATS_BH(dev_net(skb->dev),
> +							IPSTATS_MIB_INNOROUTES);
> +				else if (err == -EXDEV)
> +					NET_INC_STATS_BH(dev_net(skb->dev),
> +							 LINUX_MIB_IPRPFILTER);
> +				goto drop;
> +			}
>  		}
>  	}
>  
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index b224eb8..8416f8a 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
>  	struct tcp_sock *tp = tcp_sk(sk);
>  	int res;
>  
> +	if (sk->sk_rx_dst) {
> +		struct dst_entry *dst = sk->sk_rx_dst;
> +		if (unlikely(dst->obsolete)) {
> +			if (dst->ops->check(dst, 0) == NULL) {
> +				dst_release(dst);
> +				sk->sk_rx_dst = NULL;
> +			}
> +		}
> +	}
> +	if (unlikely(sk->sk_rx_dst == NULL))
> +		sk->sk_rx_dst = dst_clone(skb_dst(skb));
> +
>  	/*
>  	 *	Header prediction.
>  	 *	The code loosely follows the one in the famous
> @@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
>  
>  	tcp_set_state(sk, TCP_ESTABLISHED);
>  
> -	if (skb != NULL)
> +	if (skb != NULL) {
> +		sk->sk_rx_dst = dst_clone(skb_dst(skb));
>  		security_inet_conn_established(sk, skb);
> +	}
>  
>  	/* Make sure socket is routed, for correct metrics.  */
>  	icsk->icsk_af_ops->rebuild_header(sk);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index fda2ca1..13857df 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1671,6 +1671,52 @@ csum_err:
>  }
>  EXPORT_SYMBOL(tcp_v4_do_rcv);
>  
> +int tcp_v4_early_demux(struct sk_buff *skb)
> +{
> +	struct net *net = dev_net(skb->dev);
> +	const struct iphdr *iph;
> +	const struct tcphdr *th;
> +	struct sock *sk;
> +	int err;
> +
> +	err = -ENOENT;
> +	if (skb->pkt_type != PACKET_HOST)
> +		goto out_err;
> +
> +	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
> +		goto out_err;
> +
> +	iph = ip_hdr(skb);
> +	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
> +
> +	if (th->doff < sizeof(struct tcphdr) / 4)
> +		goto out_err;
> +
> +	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
> +		goto out_err;
> +
> +	sk = __inet_lookup_established(net, &tcp_hashinfo,
> +				       iph->saddr, th->source,
> +				       iph->daddr, th->dest,
> +				       skb->dev->ifindex);
> +	if (sk) {
> +		skb->sk = sk;
> +		skb->destructor = sock_edemux;
> +		if (sk->sk_state != TCP_TIME_WAIT) {
> +			struct dst_entry *dst = sk->sk_rx_dst;
> +			if (dst)
> +				dst = dst_check(dst, 0);
> +			if (dst) {
> +				skb_dst_set_noref(skb, dst);
> +				err = 0;
> +			}
> +		}
> +	}
> +
> +out_err:
> +	return err;
> +}
> +
>  /*
>   *	From tcp_input.c
>   */
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index cb01531..72b7c63 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
>  		struct tcp_sock *oldtp = tcp_sk(sk);
>  		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
>  
> +		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
> +
>  		/* TCP Cookie Transactions require space for the cookie pair,
>  		 * as it differs for each connection.  There is no need to
>  		 * copy any s_data_payload stored at the original socket.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Any benchmark numbers?

I think the number of ref count operations per packet is going to be
the next line in the sand.

^ permalink raw reply

* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  2:02 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev
In-Reply-To: <20120619.180527.65196005824751590.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Tue, 19 Jun 2012 18:05:27 -0700 (PDT)

> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Wed, 20 Jun 2012 02:03:26 +0100
> 
>> On Tue, 2012-06-19 at 17:54 -0700, David Miller wrote:
>>> The hash is perfect, what's the big deal?
>> 
>> It obscures what we're really doing and relying on.
> 
> If it matters to you, patches are always welcome :-)

Nevermind, I just committed the following to net-next:

--------------------
inet: Sanitize inet{,6} protocol demux.

Don't pretend that inet_protos[] and inet6_protos[] are hashes, they
are just straight arrays.  Remove all unnecessary hash masking.

Document MAX_INET_PROTOS.

Use RAW_HTABLE_SIZE when appropriate.

Reported-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h |    7 +++++--
 net/ipv4/af_inet.c     |   26 ++++++++++++--------------
 net/ipv4/icmp.c        |    9 ++++-----
 net/ipv4/ip_input.c    |    5 ++---
 net/ipv4/protocol.c    |    8 +++-----
 net/ipv6/icmp.c        |    7 ++-----
 net/ipv6/ip6_input.c   |    9 +++------
 net/ipv6/protocol.c    |    8 +++-----
 net/ipv6/raw.c         |    4 ++--
 9 files changed, 36 insertions(+), 47 deletions(-)

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 875f489..a1b1b53 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -29,8 +29,11 @@
 #include <linux/ipv6.h>
 #endif
 
-#define MAX_INET_PROTOS	256		/* Must be a power of 2		*/
-
+/* This is one larger than the largest protocol value that can be
+ * found in an ipv4 or ipv6 header.  Since in both cases the protocol
+ * value is presented in a __u8, this is defined to be 256.
+ */
+#define MAX_INET_PROTOS		256
 
 /* This is used to register protocols. */
 struct net_protocol {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4e8e00..85a3b17 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -242,20 +242,18 @@ void build_ehash_secret(void)
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
-static inline int inet_netns_ok(struct net *net, int protocol)
+static inline int inet_netns_ok(struct net *net, __u8 protocol)
 {
-	int hash;
 	const struct net_protocol *ipprot;
 
 	if (net_eq(net, &init_net))
 		return 1;
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
-	ipprot = rcu_dereference(inet_protos[hash]);
-
-	if (ipprot == NULL)
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot == NULL) {
 		/* raw IP is OK */
 		return 1;
+	}
 	return ipprot->netns_ok;
 }
 
@@ -1216,8 +1214,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 
 static int inet_gso_send_check(struct sk_buff *skb)
 {
-	const struct iphdr *iph;
 	const struct net_protocol *ops;
+	const struct iphdr *iph;
 	int proto;
 	int ihl;
 	int err = -EINVAL;
@@ -1236,7 +1234,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
 	__skb_pull(skb, ihl);
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 	err = -EPROTONOSUPPORT;
 
 	rcu_read_lock();
@@ -1253,8 +1251,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	struct iphdr *iph;
 	const struct net_protocol *ops;
+	struct iphdr *iph;
 	int proto;
 	int ihl;
 	int id;
@@ -1286,7 +1284,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
 	id = ntohs(iph->id);
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
 	rcu_read_lock();
@@ -1340,7 +1338,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 
 	rcu_read_lock();
 	ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1396,11 @@ out:
 
 static int inet_gro_complete(struct sk_buff *skb)
 {
-	const struct net_protocol *ops;
+	__be16 newlen = htons(skb->len - skb_network_offset(skb));
 	struct iphdr *iph = ip_hdr(skb);
-	int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	const struct net_protocol *ops;
+	int proto = iph->protocol;
 	int err = -ENOSYS;
-	__be16 newlen = htons(skb->len - skb_network_offset(skb));
 
 	csum_replace2(&iph->check, iph->tot_len, newlen);
 	iph->tot_len = newlen;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e1caa1a..49a74cc 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -637,12 +637,12 @@ EXPORT_SYMBOL(icmp_send);
 
 static void icmp_unreach(struct sk_buff *skb)
 {
+	const struct net_protocol *ipprot;
 	const struct iphdr *iph;
 	struct icmphdr *icmph;
-	int hash, protocol;
-	const struct net_protocol *ipprot;
-	u32 info = 0;
 	struct net *net;
+	u32 info = 0;
+	int protocol;
 
 	net = dev_net(skb_dst(skb)->dev);
 
@@ -731,9 +731,8 @@ static void icmp_unreach(struct sk_buff *skb)
 	 */
 	raw_icmp_error(skb, protocol, info);
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
 	rcu_read_lock();
-	ipprot = rcu_dereference(inet_protos[hash]);
+	ipprot = rcu_dereference(inet_protos[protocol]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, info);
 	rcu_read_unlock();
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144..c4fe1d2 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
 	rcu_read_lock();
 	{
 		int protocol = ip_hdr(skb)->protocol;
-		int hash, raw;
 		const struct net_protocol *ipprot;
+		int raw;
 
 	resubmit:
 		raw = raw_local_deliver(skb, protocol);
 
-		hash = protocol & (MAX_INET_PROTOS - 1);
-		ipprot = rcu_dereference(inet_protos[hash]);
+		ipprot = rcu_dereference(inet_protos[protocol]);
 		if (ipprot != NULL) {
 			int ret;
 
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01..8918eff 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash = protocol & (MAX_INET_PROTOS - 1);
-
-	return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
 			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet_add_protocol);
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol);
 
 int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+	int ret;
 
-	ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
+	ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
 		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 5247d5c..c7da142 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -600,9 +600,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 {
 	const struct inet6_protocol *ipprot;
 	int inner_offset;
-	int hash;
-	u8 nexthdr;
 	__be16 frag_off;
+	u8 nexthdr;
 
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		return;
@@ -629,10 +628,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 	   --ANK (980726)
 	 */
 
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
-
 	rcu_read_lock();
-	ipprot = rcu_dereference(inet6_protos[hash]);
+	ipprot = rcu_dereference(inet6_protos[nexthdr]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
 	rcu_read_unlock();
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 21a15df..5ab923e 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -168,13 +168,12 @@ drop:
 
 static int ip6_input_finish(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	const struct inet6_protocol *ipprot;
+	struct inet6_dev *idev;
 	unsigned int nhoff;
 	int nexthdr;
 	bool raw;
-	u8 hash;
-	struct inet6_dev *idev;
-	struct net *net = dev_net(skb_dst(skb)->dev);
 
 	/*
 	 *	Parse extension headers
@@ -189,9 +188,7 @@ resubmit:
 	nexthdr = skb_network_header(skb)[nhoff];
 
 	raw = raw6_local_deliver(skb, nexthdr);
-
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
-	if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
+	if ((ipprot = rcu_dereference(inet6_protos[nexthdr])) != NULL) {
 		int ret;
 
 		if (ipprot->flags & INET6_PROTO_FINAL) {
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 9a7978f..053082d 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -29,9 +29,7 @@ const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
 
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
-	int hash = protocol & (MAX_INET_PROTOS - 1);
-
-	return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+	return !cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
 			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet6_add_protocol);
@@ -42,9 +40,9 @@ EXPORT_SYMBOL(inet6_add_protocol);
 
 int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
-	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+	int ret;
 
-	ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+	ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
 		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 43b0042..b5c1dcb 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -165,7 +165,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 	saddr = &ipv6_hdr(skb)->saddr;
 	daddr = saddr + 1;
 
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
+	hash = nexthdr & (RAW_HTABLE_SIZE - 1);
 
 	read_lock(&raw_v6_hashinfo.lock);
 	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
@@ -229,7 +229,7 @@ bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
 {
 	struct sock *raw_sk;
 
-	raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]);
+	raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]);
 	if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
 		raw_sk = NULL;
 
-- 
1.7.10

^ permalink raw reply related

* Re: [PATCH] net: Update netdev_alloc_frag to work more efficiently with TCP and GRO
From: Alexander Duyck @ 2012-06-20  1:49 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: netdev, davem, jeffrey.t.kirsher, Eric Dumazet
In-Reply-To: <20120620004306.17814.58369.stgit@gitlad.jf.intel.com>

On 6/19/2012 5:43 PM, Alexander Duyck wrote:
> This patch is meant to help improve system performance when
> netdev_alloc_frag is used in scenarios in which buffers are short lived.
> This is accomplished by allowing the page offset to be reset in the event
> that the page count is 1.  I also reordered the direction in which we give
> out sections of the page so that we start at the end of the page and end at
> the start.  The main motivation being that I preferred to have offset
> represent the amount of page remaining to be used.
>
> My primary test case was using ixgbe in combination with TCP.  With this
> patch applied I saw CPU utilization drop from 3.4% to 3.0% for a single
> thread of netperf receiving a TCP stream via ixgbe.
>
> I also tested several scenarios in which the page reuse would not be
> possible such as UDP flows and routing.  In both of these scenarios I saw
> no noticeable performance degradation compared to the kernel without this
> patch.
>
> Cc: Eric Dumazet<edumazet@google.com>
> Signed-off-by: Alexander Duyck<alexander.h.duyck@intel.com>
> ---
>
>   net/core/skbuff.c |   15 +++++++++++----
>   1 files changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 5b21522..eb3853c 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -317,15 +317,22 @@ void *netdev_alloc_frag(unsigned int fragsz)
>   	if (unlikely(!nc->page)) {
>   refill:
>   		nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD);
> -		nc->offset = 0;
>   	}
>   	if (likely(nc->page)) {
> -		if (nc->offset + fragsz>  PAGE_SIZE) {
> +		unsigned int offset = PAGE_SIZE;
> +
> +		if (page_count(nc->page) != 1)
> +			offset = nc->offset;
> +
> +		if (offset<  fragsz) {
>   			put_page(nc->page);
>   			goto refill;
>   		}
> -		data = page_address(nc->page) + nc->offset;
> -		nc->offset += fragsz;
> +
> +		offset -= fragsz;
> +		nc->offset = offset;
> +
> +		data = page_address(nc->page) + offset;
>   		get_page(nc->page);
>   	}
>   	local_irq_restore(flags);
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
It looks like I forgot to add "--auto" to the command line when I sent 
this out via stg mail so I am just adding Eric to the CC list on this 
reply.  Sorry for the extra noise.

Thanks,

Alex

^ permalink raw reply

* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  1:05 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev
In-Reply-To: <1340154206.6871.90.camel@deadeye.wl.decadent.org.uk>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 20 Jun 2012 02:03:26 +0100

> On Tue, 2012-06-19 at 17:54 -0700, David Miller wrote:
>> The hash is perfect, what's the big deal?
> 
> It obscures what we're really doing and relying on.

If it matters to you, patches are always welcome :-)

^ permalink raw reply

* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Ben Hutchings @ 2012-06-20  1:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120619.175435.1705108432673390966.davem@davemloft.net>

On Tue, 2012-06-19 at 17:54 -0700, David Miller wrote:
> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Wed, 20 Jun 2012 01:21:06 +0100
> 
> > This works only because MAX_INET_PROTOS is defined as 256, so that hash
> > == protocol.  If we were ever to change MAX_INET_PROTOS then we would
> > need to add a whole lot of protocol checks, but this isn't particularly
> > obvious!  Perhaps it would be better to remove the 'hashing' altogether?
> 
> We never have changed the value and we never will.

Well that's what I expected.

> The hash is perfect, what's the big deal?

It obscures what we're really doing and relying on.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-20  0:54 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev
In-Reply-To: <1340151666.6871.81.camel@deadeye.wl.decadent.org.uk>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 20 Jun 2012 01:21:06 +0100

> This works only because MAX_INET_PROTOS is defined as 256, so that hash
> == protocol.  If we were ever to change MAX_INET_PROTOS then we would
> need to add a whole lot of protocol checks, but this isn't particularly
> obvious!  Perhaps it would be better to remove the 'hashing' altogether?

We never have changed the value and we never will.  The hash is
perfect, what's the big deal?

^ permalink raw reply

* [PATCH] net: Update netdev_alloc_frag to work more efficiently with TCP and GRO
From: Alexander Duyck @ 2012-06-20  0:43 UTC (permalink / raw)
  To: netdev; +Cc: davem, jeffrey.t.kirsher

This patch is meant to help improve system performance when
netdev_alloc_frag is used in scenarios in which buffers are short lived.
This is accomplished by allowing the page offset to be reset in the event
that the page count is 1.  I also reordered the direction in which we give
out sections of the page so that we start at the end of the page and end at
the start.  The main motivation is that I preferred to have offset
represent the amount of page remaining to be used.

My primary test case was using ixgbe in combination with TCP.  With this
patch applied I saw CPU utilization drop from 3.4% to 3.0% for a single
thread of netperf receiving a TCP stream via ixgbe.

I also tested several scenarios in which the page reuse would not be
possible such as UDP flows and routing.  In both of these scenarios I saw
no noticeable performance degradation compared to the kernel without this
patch.

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---

 net/core/skbuff.c |   15 +++++++++++----
 1 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5b21522..eb3853c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -317,15 +317,22 @@ void *netdev_alloc_frag(unsigned int fragsz)
 	if (unlikely(!nc->page)) {
 refill:
 		nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD);
-		nc->offset = 0;
 	}
 	if (likely(nc->page)) {
-		if (nc->offset + fragsz > PAGE_SIZE) {
+		unsigned int offset = PAGE_SIZE;
+
+		if (page_count(nc->page) != 1)
+			offset = nc->offset;
+
+		if (offset < fragsz) {
 			put_page(nc->page);
 			goto refill;
 		}
-		data = page_address(nc->page) + nc->offset;
-		nc->offset += fragsz;
+
+		offset -= fragsz;
+		nc->offset = offset;
+
+		data = page_address(nc->page) + offset;
 		get_page(nc->page);
 	}
 	local_irq_restore(flags);

^ permalink raw reply related

* Re: [PATCH v2] ipv4: Early TCP socket demux.
From: Ben Hutchings @ 2012-06-20  0:21 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120619.163911.2094057156011157978.davem@davemloft.net>

On Tue, 2012-06-19 at 16:39 -0700, David Miller wrote:
> Input packet processing for local sockets involves two major demuxes.
> One for the route and one for the socket.
> 
> But we can optimize this down to one demux for certain kinds of local
> sockets.
[...]
> --- a/net/ipv4/ip_input.c
> +++ b/net/ipv4/ip_input.c
> @@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
>  	 *	how the packet travels inside Linux networking.
>  	 */
>  	if (skb_dst(skb) == NULL) {
> -		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> -					       iph->tos, skb->dev);
> -		if (unlikely(err)) {
> -			if (err == -EHOSTUNREACH)
> -				IP_INC_STATS_BH(dev_net(skb->dev),
> -						IPSTATS_MIB_INADDRERRORS);
> -			else if (err == -ENETUNREACH)
> -				IP_INC_STATS_BH(dev_net(skb->dev),
> -						IPSTATS_MIB_INNOROUTES);
> -			else if (err == -EXDEV)
> -				NET_INC_STATS_BH(dev_net(skb->dev),
> -						 LINUX_MIB_IPRPFILTER);
> -			goto drop;
> +		const struct net_protocol *ipprot;
> +		int protocol = iph->protocol;
> +		int hash, err;
> +
> +		hash = protocol & (MAX_INET_PROTOS - 1);
[...]

This 'hashing' threw me when I read v1, because nowhere do we actually
check that the protocol (as opposed to hash) matches that for the
selected ipprot.  (And this also turns out to be true for the current
receive path.)

This works only because MAX_INET_PROTOS is defined as 256, so that hash
== protocol.  If we were ever to change MAX_INET_PROTOS then we would
need to add a whole lot of protocol checks, but this isn't particularly
obvious!  Perhaps it would be better to remove the 'hashing' altogether?

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* [PATCH v2] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-19 23:39 UTC (permalink / raw)
  To: netdev


Input packet processing for local sockets involves two major demuxes.
One for the route and one for the socket.

But we can optimize this down to one demux for certain kinds of local
sockets.

Currently we only do this for established TCP sockets, but it could
at least in theory be expanded to other kinds of connections.

If a TCP socket is established then its identity is fully specified.

This means that whatever input route was used during the three-way
handshake must work equally well for the rest of the connection since
the keys will not change.

Once we move to established state, we cache the receive packet's input
route to use later.

Like the existing cached route in sk->sk_dst_cache used for output
packets, we have to check for route invalidations using dst->obsolete
and dst->ops->check().

Early demux occurs outside of a socket locked section, so when a route
invalidation occurs we defer the fixup of sk->sk_rx_dst until we are
actually inside of established state packet processing and thus have
the socket locked.

Signed-off-by: David S. Miller <davem@davemloft.net>
---

Changes since v1:

1) Remove unlikely() from __inet_lookup_skb()

2) Check for cached route invalidations.

3) Hook up RX dst when outgoing connection moved to established too,
   previously it was only handling incoming connections.

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 808fc5f..54be028 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 					     const __be16 sport,
 					     const __be16 dport)
 {
-	struct sock *sk;
+	struct sock *sk = skb_steal_sock(skb);
 	const struct iphdr *iph = ip_hdr(skb);
 
-	if (unlikely(sk = skb_steal_sock(skb)))
+	if (sk)
 		return sk;
 	else
 		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 875f489..6c47bf8 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -34,6 +34,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index 4a45216..87b424a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -319,6 +319,7 @@ struct sock {
 	unsigned long 		sk_flags;
 	struct dst_entry	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
+	struct dst_entry	*sk_rx_dst;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
@@ -1426,6 +1427,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
+extern void			sock_edemux(struct sk_buff *skb);
 
 extern int			sock_setsockopt(struct socket *sock, int level,
 						int op, char __user *optval,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9332f34..6660ffc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e5b71f..929bdcc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_rfree);
 
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
 
 int sock_i_uid(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4e8e00..a2bd2d2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
 
 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -1520,14 +1521,15 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-	.handler =	tcp_v4_rcv,
-	.err_handler =	tcp_v4_err,
-	.gso_send_check = tcp_v4_gso_send_check,
-	.gso_segment =	tcp_tso_segment,
-	.gro_receive =	tcp4_gro_receive,
-	.gro_complete =	tcp4_gro_complete,
-	.no_policy =	1,
-	.netns_ok =	1,
+	.early_demux	=	tcp_v4_early_demux,
+	.handler	=	tcp_v4_rcv,
+	.err_handler	=	tcp_v4_err,
+	.gso_send_check	=	tcp_v4_gso_send_check,
+	.gso_segment	=	tcp_tso_segment,
+	.gro_receive	=	tcp4_gro_receive,
+	.gro_complete	=	tcp4_gro_complete,
+	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144..cb883e1 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
-		if (unlikely(err)) {
-			if (err == -EHOSTUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INADDRERRORS);
-			else if (err == -ENETUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INNOROUTES);
-			else if (err == -EXDEV)
-				NET_INC_STATS_BH(dev_net(skb->dev),
-						 LINUX_MIB_IPRPFILTER);
-			goto drop;
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+		int hash, err;
+
+		hash = protocol & (MAX_INET_PROTOS - 1);
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[hash]);
+		err = -ENOENT;
+		if (ipprot && ipprot->early_demux)
+			err = ipprot->early_demux(skb);
+		rcu_read_unlock();
+
+		if (err) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb->dev);
+			if (unlikely(err)) {
+				if (err == -EHOSTUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INADDRERRORS);
+				else if (err == -ENETUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INNOROUTES);
+				else if (err == -EXDEV)
+					NET_INC_STATS_BH(dev_net(skb->dev),
+							 LINUX_MIB_IPRPFILTER);
+				goto drop;
+			}
 		}
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8..8416f8a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	int res;
 
+	if (sk->sk_rx_dst) {
+		struct dst_entry *dst = sk->sk_rx_dst;
+		if (unlikely(dst->obsolete)) {
+			if (dst->ops->check(dst, 0) == NULL) {
+				dst_release(dst);
+				sk->sk_rx_dst = NULL;
+			}
+		}
+	}
+	if (unlikely(sk->sk_rx_dst == NULL))
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 	/*
 	 *	Header prediction.
 	 *	The code loosely follows the one in the famous
@@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
-	if (skb != NULL)
+	if (skb != NULL) {
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
 		security_inet_conn_established(sk, skb);
+	}
 
 	/* Make sure socket is routed, for correct metrics.  */
 	icsk->icsk_af_ops->rebuild_header(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca1..13857df 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,6 +1671,52 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct sock *sk;
+	int err;
+
+	err = -ENOENT;
+	if (skb->pkt_type != PACKET_HOST)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+		goto out_err;
+
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+		goto out_err;
+
+	sk = __inet_lookup_established(net, &tcp_hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, th->dest,
+				       skb->dev->ifindex);
+	if (sk) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+		if (sk->sk_state != TCP_TIME_WAIT) {
+			struct dst_entry *dst = sk->sk_rx_dst;
+			if (dst)
+				dst = dst_check(dst, 0);
+			if (dst) {
+				skb_dst_set_noref(skb, dst);
+				err = 0;
+			}
+		}
+	}
+
+out_err:
+	return err;
+}
+
 /*
  *	From tcp_input.c
  */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb01531..72b7c63 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
+		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
 		 * copy any s_data_payload stored at the original socket.

^ permalink raw reply related

* Re: [PATCH net-next] ixgbe: simplify padding and length checks (v2)
From: Jeff Kirsher @ 2012-06-19 23:30 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Alexander Duyck, Bruce Allan, Carolyn Wyborny, Don Skidmore,
	Greg Rose, Peter P Waskiewicz Jr, David S. Miller, e1000-devel,
	netdev
In-Reply-To: <20120618163111.4e46493b@nehalam.linuxnetplumber.net>

[-- Attachment #1: Type: text/plain, Size: 548 bytes --]

On Mon, 2012-06-18 at 16:31 -0700, Stephen Hemminger wrote:
> The check for length <= 0 is bogus because length is unsigned, and
> network
> stack never sends zero length packets (unless it is totally broken).
> 
> The check for really small packets can be optimized (using unlikely)
> and calling skb_pad directly.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> 

I just realized I had not responded to this updated patch.

Thanks Stephen, I have dropped your previous 2 patch series and added
this patch to my queue.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* RE: [RFC net-next 11/14] Fix emulex/benet
From: Ajit.Khaparde @ 2012-06-19 22:55 UTC (permalink / raw)
  To: yuvalmin, netdev, davem; +Cc: eilong, Sathya.Perla, subbu.seetharaman
In-Reply-To: <1340118848-30978-12-git-send-email-yuvalmin@broadcom.com>

> From: Yuval Mintz [yuvalmin@broadcom.com]
> Sent: Tuesday, June 19, 2012 10:14 AM
> To: netdev@vger.kernel.org; davem@davemloft.net
> Cc: eilong@broadcom.com; Yuval Mintz; Perla, Sathya; Seetharaman, Subramanian; Khaparde, Ajit
> Subject: [RFC net-next 11/14] Fix emulex/benet

> Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
> Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

> Cc: Sathya Perla <sathya.perla@emulex.com>
> Cc: Subbu Seetharaman <subbu.seetharaman@emulex.com>
> Cc: Ajit Khaparde <ajit.khaparde@emulex.com>
> ---
>  drivers/net/ethernet/emulex/benet/be_main.c |    8 +++++---
>  1 files changed, 5 insertions(+), 3 deletions(-)


We will get back to you on this later in the day.

Thanks
-Ajit

^ permalink raw reply

* Re: [PATCH net-next 0/5] qmi_wwan fixes for 3.6
From: David Miller @ 2012-06-19 22:04 UTC (permalink / raw)
  To: oneukum-l3A5Bk7waGM
  Cc: bjorn-yOkvZcmFvRU, netdev-u79uwXL29TY76Z2rM5mHXA,
	dcbw-H+wXaHxf7aLQT0dZR+AlfA, linux-usb-u79uwXL29TY76Z2rM5mHXA,
	ajb-5+cxppFmGx6/3pe1ocb+s/XRex20P6io
In-Reply-To: <201206191253.17413.oneukum-l3A5Bk7waGM@public.gmane.org>

From: Oliver Neukum <oneukum-l3A5Bk7waGM@public.gmane.org>
Date: Tue, 19 Jun 2012 12:53:17 +0200

> Am Dienstag, 19. Juni 2012, 12:41:58 schrieb Bjørn Mork:
>> The two first patches prepare the driver for the new probing
>> model introduced by patch #3, which is the main change in this
>> set.  A RFC version of this was posted to linux-usb 29 May 2012 
>> for discussion, as it also implicitly affects the cdc-wdm driver, 
>> without any comments so far.
> 
> Looking good.

All applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [patch net-next 00/19] team: couple of patches
From: David Miller @ 2012-06-19 22:01 UTC (permalink / raw)
  To: jpirko; +Cc: netdev, eric.dumazet, jbrouer
In-Reply-To: <1340121261-2966-1-git-send-email-jpirko@redhat.com>

From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 19 Jun 2012 17:54:02 +0200

> The main impact is this patchset provides an implementation of userspace driven
> TX loadbalancing for team driver.
> 
> Also a couple of typos are fixed, along with some minor issues.

Looks good, pushed, thanks Jiri.

^ permalink raw reply

* Re: pull-request: can-next 2012-06-19
From: David Miller @ 2012-06-19 22:01 UTC (permalink / raw)
  To: mkl; +Cc: netdev, linux-can
In-Reply-To: <20120619.145907.521642979191972656.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Tue, 19 Jun 2012 14:59:07 -0700 (PDT)

> From: David Miller <davem@davemloft.net>
> Date: Tue, 19 Jun 2012 14:48:45 -0700 (PDT)
> 
>> From: Marc Kleine-Budde <mkl@pengutronix.de>
>> Date: Tue, 19 Jun 2012 22:01:56 +0200
>> 
>>> here is our second pull request for net-next. In this series Federico
>>> Vaga adds a pci driver for c_can/d_can hardware using the existing
>>> generic c_can driver. The remaining 6 patches are by Oliver Hartkopp.
>>> He adds CANFD support to the CAN stack while keeping binary
>>> compatibility for existing applications. CANFD is an extension to the
>>> existing CAN standard, it allows longer CAN frames and/or higher data
>>> rates. There's no real hardware available yet, but this series adds
>>> CANFD support to the vcan driver.
>> 
>> Pulled, thanks.
> 
> I have to unpull, the new driver results in undefined symbols.
> 
> ERROR: "clk_get_rate" [drivers/net/can/c_can/c_can_pci.ko] undefined!
> ERROR: "clk_get" [drivers/net/can/c_can/c_can_pci.ko] undefined!
> ERROR: "clk_put" [drivers/net/can/c_can/c_can_pci.ko] undefined!

Actually I can't un-pull since I already pushed it out to my
tree.

Send me a fix for this bug, fast.

Thanks.

^ permalink raw reply

* Re: pull-request: can-next 2012-06-19
From: David Miller @ 2012-06-19 21:59 UTC (permalink / raw)
  To: mkl; +Cc: netdev, linux-can
In-Reply-To: <20120619.144845.390201286413906004.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Tue, 19 Jun 2012 14:48:45 -0700 (PDT)

> From: Marc Kleine-Budde <mkl@pengutronix.de>
> Date: Tue, 19 Jun 2012 22:01:56 +0200
> 
>> here is our second pull request for net-next. In this series Federico
>> Vaga adds a pci driver for c_can/d_can hardware using the existing
>> generic c_can driver. The remaining 6 patches are by Oliver Hartkopp.
>> He adds CANFD support to the CAN stack while keeping binary
>> compatibility for existing applications. CANFD is an extension to the
>> existing CAN standard, it allows longer CAN frames and/or higher data
>> rates. There's no real hardware available yet, but this series adds
>> CANFD support to the vcan driver.
> 
> Pulled, thanks.

I have to unpull, the new driver results in undefined symbols.

ERROR: "clk_get_rate" [drivers/net/can/c_can/c_can_pci.ko] undefined!
ERROR: "clk_get" [drivers/net/can/c_can/c_can_pci.ko] undefined!
ERROR: "clk_put" [drivers/net/can/c_can/c_can_pci.ko] undefined!

^ permalink raw reply

* Re: [PATCH net] batman-adv: fix skb->data assignment
From: David Miller @ 2012-06-19 21:49 UTC (permalink / raw)
  To: ordex; +Cc: netdev, b.a.t.m.a.n, stable
In-Reply-To: <1340133999-26838-1-git-send-email-ordex@autistici.org>

From: Antonio Quartulli <ordex@autistici.org>
Date: Tue, 19 Jun 2012 21:26:39 +0200

> skb_linearize(skb) possibly rearranges the skb internal data and then changes
> the skb->data pointer value. For this reason any other pointer in the code that
> was assigned skb->data before invoking skb_linearize(skb) must be re-assigned.
> 
> In the current tt_query message handling code this is not done and therefore, in
> case of skb linearization, the pointer used to handle the packet header ends up
> pointing to freed memory.
> 
> This bug was introduced by a73105b8d4c765d9ebfb664d0a66802127d8e4c7
> (batman-adv: improved client announcement mechanism)
> 
> Signed-off-by: Antonio Quartulli <ordex@autistici.org>
> Cc: <stable@vger.kernel.org>

Applied.

Submit things properly in the future so you don't give me unnecessary
merge hassles like this again.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox