Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-19  2:40 UTC (permalink / raw)
  To: netdev


You know you want it.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 875f489..6c47bf8 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -34,6 +34,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index 4a45216..87b424a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -319,6 +319,7 @@ struct sock {
 	unsigned long 		sk_flags;
 	struct dst_entry	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
+	struct dst_entry	*sk_rx_dst;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
@@ -1426,6 +1427,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
+extern void			sock_edemux(struct sk_buff *skb);
 
 extern int			sock_setsockopt(struct socket *sock, int level,
 						int op, char __user *optval,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9332f34..6660ffc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e5b71f..929bdcc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_rfree);
 
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
 
 int sock_i_uid(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4e8e00..a2bd2d2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
 
 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -1520,14 +1521,15 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-	.handler =	tcp_v4_rcv,
-	.err_handler =	tcp_v4_err,
-	.gso_send_check = tcp_v4_gso_send_check,
-	.gso_segment =	tcp_tso_segment,
-	.gro_receive =	tcp4_gro_receive,
-	.gro_complete =	tcp4_gro_complete,
-	.no_policy =	1,
-	.netns_ok =	1,
+	.early_demux	=	tcp_v4_early_demux,
+	.handler	=	tcp_v4_rcv,
+	.err_handler	=	tcp_v4_err,
+	.gso_send_check	=	tcp_v4_gso_send_check,
+	.gso_segment	=	tcp_tso_segment,
+	.gro_receive	=	tcp4_gro_receive,
+	.gro_complete	=	tcp4_gro_complete,
+	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144..cb883e1 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
-		if (unlikely(err)) {
-			if (err == -EHOSTUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INADDRERRORS);
-			else if (err == -ENETUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INNOROUTES);
-			else if (err == -EXDEV)
-				NET_INC_STATS_BH(dev_net(skb->dev),
-						 LINUX_MIB_IPRPFILTER);
-			goto drop;
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+		int hash, err;
+
+		hash = protocol & (MAX_INET_PROTOS - 1);
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[hash]);
+		err = -ENOENT;
+		if (ipprot && ipprot->early_demux)
+			err = ipprot->early_demux(skb);
+		rcu_read_unlock();
+
+		if (err) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb->dev);
+			if (unlikely(err)) {
+				if (err == -EHOSTUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INADDRERRORS);
+				else if (err == -ENETUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INNOROUTES);
+				else if (err == -EXDEV)
+					NET_INC_STATS_BH(dev_net(skb->dev),
+							 LINUX_MIB_IPRPFILTER);
+				goto drop;
+			}
 		}
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca1..bd90181 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,6 +1671,50 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct sock *sk;
+	int err;
+
+	err = -ENOENT;
+	if (skb->pkt_type != PACKET_HOST)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+		goto out_err;
+
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+		goto out_err;
+
+	sk = __inet_lookup_established(net, &tcp_hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, th->dest,
+				       skb->dev->ifindex);
+	if (sk) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+		if (sk->sk_state != TCP_TIME_WAIT) {
+			struct dst_entry *dst = sk->sk_rx_dst;
+			if (dst) {
+				skb_dst_set_noref(skb, dst);
+				err = 0;
+			}
+		}
+	}
+
+out_err:
+	return err;
+}
+
 /*
  *	From tcp_input.c
  */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb01531..72b7c63 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
+		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
 		 * copy any s_data_payload stored at the original socket.

^ permalink raw reply related

* Re: [RFC] TCP:  Support configurable delayed-ack parameters.
From: Ben Greear @ 2012-06-19  2:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Daniel Baluta
In-Reply-To: <20120618182716.5f8fb72f@nehalam.linuxnetplumber.net>

On 06/18/2012 06:27 PM, Stephen Hemminger wrote:
> On Mon, 18 Jun 2012 17:52:43 -0700
> greearb@candelatech.com wrote:
>
>> From: Ben Greear<greearb@candelatech.com>
>>
>> RFC2581 ($4.2) specifies when an ACK should be generated as follows:
>>
>> " .. an ACK SHOULD be generated for at least every second
>>    full-sized segment, and MUST be generated within 500 ms
>>    of the arrival of the first unacknowledged packet.
>> "
>>
>> We export the number of segments and the timeout limits
>> specified above, so that a user can tune them according
>> to their needs.
>>
>> Specifically:
>> 	* /proc/sys/net/ipv4/tcp_default_delack_segs, represents
>> 	the threshold for the number of segments.
>> 	* /proc/sys/net/ipv4/tcp_default_delack_min, specifies
>> 	the minimum timeout value
>> 	* /proc/sys/net/ipv4/tcp_default_delack_max, specifies
>> 	the maximum timeout value.
>>
>> In addition, new TCP socket options are added to allow
>> per-socket configuration:
>>
>> TCP_DELACK_SEGS
>> TCP_DELACK_MIN
>> TCP_DELACK_MAX
>>
>> In order to keep a multiply out of the hot path, the segs * mss
>> computation is recalculated and cached whenever segs or mss changes.
>>
>> Signed-off-by: Daniel Baluta<dbaluta@ixiacom.com>
>> Signed-off-by: Ben Greear<greearb@candelatech.com>
>
> What is the justification (other than standard) for making this
> tunable. Why would you want to do this? Why shouldn't the stack be adjusting
> it for you (based on other heuristics)? Or is this just for testing interoperation
> with TCP stacks that have wonky ACK policies. There are already too many TCP tunable
> parameters for general usage.

tcp over wifi performance sucks, and tuning it to delay acks by 10 or 20 segments
gives a decent performance boost.

It is beyond me to write something that auto-tunes this, but even if someone
did, it's virtually guaranteed that someone somewhere will get better results
by tuning their application directly.

I honestly didn't even read the RFC section in question..just stole the
description text from the original patch by Daniel Baluta.

Thanks,
Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* [PATCH 0/4] netfilter updates for net-next (batch 3)
From: pablo @ 2012-06-19  3:16 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

The following patchset provides fixes for issues that were recently introduced
by my new cthelper infrastructure. They have been spotted by Randy Dunlap,
Andrew Morton and Dan Carpenter.

The patches provide:

* compilation fixes if CONFIG_NF_CONNTRACK is disabled: I moved all the
  conntrack code from nfnetlink_queue.c to nfnetlink_queue_ct.c to avoid
  peppering the entire code with lots of ifdefs. I needed to rename
  nfnetlink_queue.c to nfnetlink_queue_core.c to get it working with the
  Makefile tweaks I've added.

* fix NULL pointer dereference via ctnetlink while trying to change the helper
  for an existing conntrack entry. I don't find any reasonable use case for
  changing the helper from one to another in run-time. Thus, now ctnetlink
  returns -EOPNOTSUPP for this operation.

* fix possible out-of-bounds zeroing of the conntrack extension area caused
  by the automatic helper assignment routine.

You can pull these changes from:

git://1984.lsi.us.es/nf-next master

Thanks!

Pablo Neira Ayuso (4):
  netfilter: ctnetlink: fix NULL dereference while trying to change helper
  netfilter: nf_ct_helper: disable automatic helper re-assignment of different type
  netfilter: fix compilation of the nfnl_cthelper if NF_CONNTRACK is unset
  netfilter: nfnetlink_queue: fix compilation with NF_CONNTRACK disabled

 include/net/netfilter/nfnetlink_queue.h            |   43 +++++++++
 net/netfilter/Kconfig                              |   29 ++++--
 net/netfilter/Makefile                             |    4 +-
 net/netfilter/nf_conntrack_helper.c                |    8 +-
 net/netfilter/nf_conntrack_netlink.c               |   24 ++---
 .../{nfnetlink_queue.c => nfnetlink_queue_core.c}  |   49 ++--------
 net/netfilter/nfnetlink_queue_ct.c                 |   97 ++++++++++++++++++++
 7 files changed, 187 insertions(+), 67 deletions(-)
 create mode 100644 include/net/netfilter/nfnetlink_queue.h
 rename net/netfilter/{nfnetlink_queue.c => nfnetlink_queue_core.c} (95%)
 create mode 100644 net/netfilter/nfnetlink_queue_ct.c

-- 
1.7.10

^ permalink raw reply

* [PATCH 2/4] netfilter: nf_ct_helper: disable automatic helper re-assignment of different type
From: pablo @ 2012-06-19  3:16 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340075789-6196-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch modifies __nf_ct_try_assign_helper in a way that invalidates support
for the following scenario:

1) attach helper A for the first time when the conntrack is created
2) attach a new (different) helper B due to changes to the reply tuple caused by NAT

eg. port redirection from TCP/21 to TCP/5060 with both FTP and SIP helpers
loaded, which seems to be a quite unorthodox scenario.

I can provide a more elaborate patch to support this scenario, but explicit
helper attachment provides a better solution for this since the user can now
attach the helpers consistently, without relying on the automatic helper
lookup magic.

This patch fixes a possible out of bound zeroing of the conntrack helper
extension if the helper B uses more memory for its private data than
helper A.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_helper.c |    8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 2918ec2..c4bc637 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -229,7 +229,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 			goto out;
 		}
 	} else {
-		memset(help->data, 0, helper->data_len);
+		/* We only allow helper re-assignment of the same sort since
+		 * we cannot reallocate the helper extension area.
+		 */
+		if (help->helper != helper) {
+			RCU_INIT_POINTER(help->helper, NULL);
+			goto out;
+		}
 	}

 	rcu_assign_pointer(help->helper, helper);
-- 
1.7.10

^ permalink raw reply related

* [PATCH 4/4] netfilter: nfnetlink_queue: fix compilation with NF_CONNTRACK disabled
From: pablo @ 2012-06-19  3:16 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340075789-6196-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

In "9cb0176 netfilter: add glue code to integrate nfnetlink_queue and ctnetlink"
the compilation with NF_CONNTRACK disabled is broken. This patch fixes this
issue.

I have moved the conntrack part into nfnetlink_queue_ct.c to avoid
peppering the entire nfnetlink_queue.c code with ifdefs.

I also needed to rename nfnetlink_queue.c to nfnetlink_queue_core.c
to update the net/netfilter/Makefile to support conditional compilation
of the conntrack integration.

This patch also adds CONFIG_NETFILTER_NETLINK_QUEUE_CT in case you want to
explicitly disable the integration between nf_conntrack and nfnetlink_queue.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nfnetlink_queue.h            |   43 +++++++++
 net/netfilter/Kconfig                              |    9 ++
 net/netfilter/Makefile                             |    2 +
 net/netfilter/nf_conntrack_netlink.c               |   11 +--
 .../{nfnetlink_queue.c => nfnetlink_queue_core.c}  |   49 ++--------
 net/netfilter/nfnetlink_queue_ct.c                 |   97 ++++++++++++++++++++
 6 files changed, 164 insertions(+), 47 deletions(-)
 create mode 100644 include/net/netfilter/nfnetlink_queue.h
 rename net/netfilter/{nfnetlink_queue.c => nfnetlink_queue_core.c} (95%)
 create mode 100644 net/netfilter/nfnetlink_queue_ct.c

diff --git a/include/net/netfilter/nfnetlink_queue.h b/include/net/netfilter/nfnetlink_queue.h
new file mode 100644
index 0000000..9f8095c
--- /dev/null
+++ b/include/net/netfilter/nfnetlink_queue.h
@@ -0,0 +1,43 @@
+#ifndef _NET_NFNL_QUEUE_H_
+#define _NET_NFNL_QUEUE_H_
+
+#include <linux/netfilter/nf_conntrack_common.h>
+
+struct nf_conn;
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size,
+			     enum ip_conntrack_info *ctinfo);
+struct nf_conn *nfqnl_ct_parse(const struct sk_buff *skb,
+			       const struct nlattr *attr,
+			       enum ip_conntrack_info *ctinfo);
+int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
+		 enum ip_conntrack_info ctinfo);
+void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+			 enum ip_conntrack_info ctinfo, int diff);
+#else
+inline struct nf_conn *
+nfqnl_ct_get(struct sk_buff *entskb, size_t *size, enum ip_conntrack_info *ctinfo)
+{
+	return NULL;
+}
+
+inline struct nf_conn *nfqnl_ct_parse(const struct sk_buff *skb,
+				      const struct nlattr *attr,
+				      enum ip_conntrack_info *ctinfo)
+{
+	return NULL;
+}
+
+inline int
+nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	return 0;
+}
+
+inline void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+				enum ip_conntrack_info ctinfo, int diff)
+{
+}
+#endif /* NF_CONNTRACK */
+#endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f1a52ba..c19b214 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -340,6 +340,7 @@ config NF_CT_NETLINK_HELPER
 	select NETFILTER_NETLINK
 	depends on NF_CT_NETLINK
 	depends on NETFILTER_NETLINK_QUEUE
+	depends on NETFILTER_NETLINK_QUEUE_CT
 	depends on NETFILTER_ADVANCED
 	help
 	  This option enables the user-space connection tracking helpers
@@ -347,6 +348,14 @@ config NF_CT_NETLINK_HELPER
 
 	  If unsure, say `N'.
 
+config NETFILTER_NETLINK_QUEUE_CT
+        bool "NFQUEUE integration with Connection Tracking"
+        default n
+        depends on NETFILTER_NETLINK_QUEUE
+	help
+	  If this option is enabled, NFQUEUE can include Connection Tracking
+	  information together with the packet that is enqueued via NFNETLINK.
+
 endif # NF_CONNTRACK
 
 # transparent proxy support
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 7cc2019..1c5160f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -9,6 +9,8 @@ obj-$(CONFIG_NETFILTER) = netfilter.o
 
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
+nfnetlink_queue-y := nfnetlink_queue_core.o
+nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o
 obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
 obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 76271a1..31d1d8f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1627,8 +1627,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	return err;
 }
 
-#if defined(CONFIG_NETFILTER_NETLINK_QUEUE) ||	\
-    defined(CONFIG_NETFILTER_NETLINK_QUEUE_MODULE)
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
 static size_t
 ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
 {
@@ -1762,7 +1761,7 @@ static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
 	.seq_adjust	= nf_nat_tcp_seq_adjust,
 #endif
 };
-#endif /* CONFIG_NETFILTER_NETLINK_QUEUE */
+#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
 
 /***********************************************************************
  * EXPECT
@@ -2568,8 +2567,7 @@ static int __init ctnetlink_init(void)
 		pr_err("ctnetlink_init: cannot register pernet operations\n");
 		goto err_unreg_exp_subsys;
 	}
-#if defined(CONFIG_NETFILTER_NETLINK_QUEUE) ||	\
-    defined(CONFIG_NETFILTER_NETLINK_QUEUE_MODULE)
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
 	/* setup interaction between nf_queue and nf_conntrack_netlink. */
 	RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook);
 #endif
@@ -2590,8 +2588,7 @@ static void __exit ctnetlink_exit(void)
 	unregister_pernet_subsys(&ctnetlink_net_ops);
 	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
 	nfnetlink_subsys_unregister(&ctnl_subsys);
-#if defined(CONFIG_NETFILTER_NETLINK_QUEUE) ||	\
-    defined(CONFIG_NETFILTER_NETLINK_QUEUE_MODULE)
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
 	RCU_INIT_POINTER(nfq_ct_hook, NULL);
 #endif
 }
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue_core.c
similarity index 95%
rename from net/netfilter/nfnetlink_queue.c
rename to net/netfilter/nfnetlink_queue_core.c
index ff82c79..d36b95e 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -30,7 +30,7 @@
 #include <linux/list.h>
 #include <net/sock.h>
 #include <net/netfilter/nf_queue.h>
-#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nfnetlink_queue.h>
 
 #include <linux/atomic.h>
 
@@ -234,7 +234,6 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	struct sk_buff *entskb = entry->skb;
 	struct net_device *indev;
 	struct net_device *outdev;
-	struct nfq_ct_hook *nfq_ct;
 	struct nf_conn *ct = NULL;
 	enum ip_conntrack_info uninitialized_var(ctinfo);
 
@@ -270,17 +269,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 		break;
 	}
 
-	/* rcu_read_lock()ed by __nf_queue already. */
-	nfq_ct = rcu_dereference(nfq_ct_hook);
-	if (nfq_ct != NULL && (queue->flags & NFQA_CFG_F_CONNTRACK)) {
-		ct = nf_ct_get(entskb, &ctinfo);
-		if (ct) {
-			if (!nf_ct_is_untracked(ct))
-				size += nfq_ct->build_size(ct);
-			else
-				ct = NULL;
-		}
-	}
+	if (queue->flags & NFQA_CFG_F_CONNTRACK)
+		ct = nfqnl_ct_get(entskb, &size, &ctinfo);
 
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb)
@@ -404,23 +394,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 			BUG();
 	}
 
-	if (ct) {
-		struct nlattr *nest_parms;
-		u_int32_t tmp;
-
-		nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED);
-		if (!nest_parms)
-			goto nla_put_failure;
-
-		if (nfq_ct->build(skb, ct) < 0)
-			goto nla_put_failure;
-
-		nla_nest_end(skb, nest_parms);
-
-		tmp = ctinfo;
-		if (nla_put_u32(skb, NFQA_CT_INFO, htonl(ctinfo)))
-			goto nla_put_failure;
-	}
+	if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+		goto nla_put_failure;
 
 	nlh->nlmsg_len = skb->tail - old_tail;
 	return skb;
@@ -764,7 +739,6 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 	struct nfqnl_instance *queue;
 	unsigned int verdict;
 	struct nf_queue_entry *entry;
-	struct nfq_ct_hook *nfq_ct;
 	enum ip_conntrack_info uninitialized_var(ctinfo);
 	struct nf_conn *ct = NULL;
 
@@ -786,13 +760,8 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 		return -ENOENT;
 
 	rcu_read_lock();
-	nfq_ct = rcu_dereference(nfq_ct_hook);
-	if (nfq_ct != NULL &&
-	    (queue->flags & NFQA_CFG_F_CONNTRACK) && nfqa[NFQA_CT]) {
-		ct = nf_ct_get(entry->skb, &ctinfo);
-		if (ct && !nf_ct_is_untracked(ct))
-			nfq_ct->parse(nfqa[NFQA_CT], ct);
-	}
+	if (nfqa[NFQA_CT] && (queue->flags & NFQA_CFG_F_CONNTRACK))
+		ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
 
 	if (nfqa[NFQA_PAYLOAD]) {
 		u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
@@ -802,8 +771,8 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 				 payload_len, entry, diff) < 0)
 			verdict = NF_DROP;
 
-		if (ct && (ct->status & IPS_NAT_MASK) && diff)
-			nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
+		if (ct)
+			nfqnl_ct_seq_adjust(skb, ct, ctinfo, diff);
 	}
 	rcu_read_unlock();
 
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
new file mode 100644
index 0000000..68ef550
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -0,0 +1,97 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <net/netfilter/nf_conntrack.h>
+
+struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size,
+			     enum ip_conntrack_info *ctinfo)
+{
+	struct nfq_ct_hook *nfq_ct;
+	struct nf_conn *ct;
+
+	/* rcu_read_lock()ed by __nf_queue already. */
+	nfq_ct = rcu_dereference(nfq_ct_hook);
+	if (nfq_ct == NULL)
+		return NULL;
+
+	ct = nf_ct_get(entskb, ctinfo);
+	if (ct) {
+		if (!nf_ct_is_untracked(ct))
+			*size += nfq_ct->build_size(ct);
+		else
+			ct = NULL;
+	}
+	return ct;
+}
+
+struct nf_conn *
+nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr,
+	       enum ip_conntrack_info *ctinfo)
+{
+	struct nfq_ct_hook *nfq_ct;
+	struct nf_conn *ct;
+
+	/* rcu_read_lock()ed by nfqnl_recv_verdict already. */
+	nfq_ct = rcu_dereference(nfq_ct_hook);
+	if (nfq_ct == NULL)
+		return NULL;
+
+	ct = nf_ct_get(skb, ctinfo);
+	if (ct && !nf_ct_is_untracked(ct))
+		nfq_ct->parse(attr, ct);
+
+	return ct;
+}
+
+int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
+		 enum ip_conntrack_info ctinfo)
+{
+	struct nfq_ct_hook *nfq_ct;
+	struct nlattr *nest_parms;
+	u_int32_t tmp;
+
+	nfq_ct = rcu_dereference(nfq_ct_hook);
+	if (nfq_ct == NULL)
+		return 0;
+
+	nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED);
+	if (!nest_parms)
+		goto nla_put_failure;
+
+	if (nfq_ct->build(skb, ct) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest_parms);
+
+	tmp = ctinfo;
+	if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+			 enum ip_conntrack_info ctinfo, int diff)
+{
+	struct nfq_ct_hook *nfq_ct;
+
+	nfq_ct = rcu_dereference(nfq_ct_hook);
+	if (nfq_ct == NULL)
+		return;
+
+	if ((ct->status & IPS_NAT_MASK) && diff)
+		nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
+}
-- 
1.7.10


^ permalink raw reply related

* [PATCH 1/4] netfilter: ctnetlink: fix NULL dereference while trying to change helper
From: pablo @ 2012-06-19  3:16 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340075789-6196-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

The patch 1afc56794e03: "netfilter: nf_ct_helper: implement variable
length helper private data" from Jun 7, 2012, leads to the following
Smatch complaint:

net/netfilter/nf_conntrack_netlink.c:1231 ctnetlink_change_helper()
         error: we previously assumed 'help->helper' could be null (see line 1228)

This NULL dereference can be triggered with the following sequence:

1) attach the helper for the first time when the conntrack is created.
2) remove the helper module or detach the helper from the conntrack
   via ctnetlink.
3) attach a helper again (the same or a different one, it does not matter)
   to that existing conntrack via ctnetlink.

This patch fixes the problem by removing the use case that allows you
to re-assign a helper to a conntrack entry via ctnetlink, since
I cannot find any practical use for it.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c |   13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index ae156df..76271a1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1224,19 +1224,12 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
 			if (helper->from_nlattr && helpinfo)
 				helper->from_nlattr(helpinfo, ct);
 			return 0;
-		}
-		if (help->helper)
+		} else
 			return -EBUSY;
-		/* need to zero data of old helper */
-		memset(help->data, 0, help->helper->data_len);
-	} else {
-		/* we cannot set a helper for an existing conntrack */
-		return -EOPNOTSUPP;
 	}
 
-	rcu_assign_pointer(help->helper, helper);
-
-	return 0;
+	/* we cannot set a helper for an existing conntrack */
+	return -EOPNOTSUPP;
 }
 
 static inline int
-- 
1.7.10

^ permalink raw reply related

* [PATCH 3/4] netfilter: fix compilation of the nfnl_cthelper if NF_CONNTRACK is unset
From: pablo @ 2012-06-19  3:16 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340075789-6196-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch fixes the compilation of net/netfilter/nfnetlink_cthelper.c
if CONFIG_NF_CONNTRACK is not set.

This patch also moves the definition of the cthelper infrastructure to
the scope of NF_CONNTRACK things.

I have also renamed NETFILTER_NETLINK_CTHELPER to NF_CT_NETLINK_HELPER,
to use a name similar to other nf_conntrack_netlink extensions. Better to do
it now, while this has only been in David's tree for two days.

Two new dependencies have been added:

* NF_CT_NETLINK
* NETFILTER_NETLINK_QUEUE

Since this infrastructure requires both ctnetlink and nfqueue.

Reported-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig  |   20 ++++++++++++--------
 net/netfilter/Makefile |    2 +-
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index aae6c62..f1a52ba 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -12,14 +12,6 @@ tristate "Netfilter NFACCT over NFNETLINK interface"
 	  If this option is enabled, the kernel will include support
 	  for extended accounting via NFNETLINK.
 
-config NETFILTER_NETLINK_CTHELPER
-tristate "Netfilter CTHELPER over NFNETLINK interface"
-	depends on NETFILTER_ADVANCED
-	select NETFILTER_NETLINK
-	help
-	  If this option is enabled, the kernel will include support
-	  for user-space connection tracking helpers via NFNETLINK.
-
 config NETFILTER_NETLINK_QUEUE
 	tristate "Netfilter NFQUEUE over NFNETLINK interface"
 	depends on NETFILTER_ADVANCED
@@ -343,6 +335,18 @@ config NF_CT_NETLINK_TIMEOUT
 
 	  If unsure, say `N'.
 
+config NF_CT_NETLINK_HELPER
+	tristate 'Connection tracking helpers in user-space via Netlink'
+	select NETFILTER_NETLINK
+	depends on NF_CT_NETLINK
+	depends on NETFILTER_NETLINK_QUEUE
+	depends on NETFILTER_ADVANCED
+	help
+	  This option enables the user-space connection tracking helpers
+	  infrastructure.
+
+	  If unsure, say `N'.
+
 endif # NF_CONNTRACK
 
 # transparent proxy support
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 2f3bc0f..7cc2019 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -9,7 +9,6 @@ obj-$(CONFIG_NETFILTER) = netfilter.o
 
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
-obj-$(CONFIG_NETFILTER_NETLINK_CTHELPER) += nfnetlink_cthelper.o
 obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
 obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
 
@@ -25,6 +24,7 @@ obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
 # netlink interface for nf_conntrack
 obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
 obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o
+obj-$(CONFIG_NF_CT_NETLINK_HELPER) += nfnetlink_cthelper.o
 
 # connection tracking helpers
 nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o
-- 
1.7.10

^ permalink raw reply related

* Re: linux-next: Tree for Jun 18 (netfilter nfconntrack)
From: Pablo Neira Ayuso @ 2012-06-19  3:19 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Stephen Rothwell, linux-next, LKML, netdev, netfilter-devel,
	coreteam
In-Reply-To: <4FDF65F6.6010002@xenotime.net>

On Mon, Jun 18, 2012 at 10:31:34AM -0700, Randy Dunlap wrote:
> On 06/17/2012 11:53 PM, Stephen Rothwell wrote:
> 
> > Hi all,
> > 
> > Changes since 20120615:
> 
> 
> 
> on i386 or x86_64:
> 
> # CONFIG_NF_CONNTRACK is not set
> 
>   CC [M]  net/netfilter/nfnetlink_cthelper.o
> In file included from include/net/netfilter/nf_conntrack_helper.h:12:0,
>                  from net/netfilter/nfnetlink_cthelper.c:23:
> include/net/netfilter/nf_conntrack.h:77:22: error: field 'ct_general' has incomplete type
> include/net/netfilter/nf_conntrack.h: In function 'nf_ct_get':
> include/net/netfilter/nf_conntrack.h:157:30: error: 'const struct sk_buff' has no member named 'nfct'
> include/net/netfilter/nf_conntrack.h: In function 'nf_ct_put':
> include/net/netfilter/nf_conntrack.h:164:2: error: implicit declaration of function 'nf_conntrack_put'
> make[3]: *** [net/netfilter/nfnetlink_cthelper.o] Error 1

I've sent a patch to David to solve this:

 netfilter: fix compilation of the nfnl_cthelper if NF_CONNTRACK is unset

It seems to resolve the issue for me here.

Thanks for the report.

^ permalink raw reply

* Re: [PATCH 0/4] netfilter updates for net-next (batch 3)
From: David Miller @ 2012-06-19  3:28 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <1340075789-6196-1-git-send-email-pablo@netfilter.org>

From: pablo@netfilter.org
Date: Tue, 19 Jun 2012 05:16:25 +0200

> The patches provide:
> 
> * compilation fixes if CONFIG_NF_CONNTRACK is disabled: I moved all the
>   conntrack code from nfnetlink_queue.c to nfnetlink_queue_ct.c to avoid
>   peppering the entire code with lots of ifdefs. I needed to rename
>   nfnetlink_queue.c to nfnetlink_queue_core.c to get it working with the
>   Makefile tweaks I've added.
> 
> * fix NULL pointer dereference via ctnetlink while trying to change the helper
>   for an existing conntrack entry. I don't find any reasonable use case for
>   changing the helper from one to another in run-time. Thus, now ctnetlink
>   returns -EOPNOTSUPP for this operation.
> 
> * fix possible out-of-bound zeroing of the conntrack extension area due to
>   the helper automatic assignation routine.
> 
> You can pull these changes from:
> 
> git://1984.lsi.us.es/nf-next master

Pulled, thanks.

^ permalink raw reply

* Re: pull request: batman-adv 2012-06-18
From: David Miller @ 2012-06-19  3:28 UTC (permalink / raw)
  To: ordex; +Cc: netdev, b.a.t.m.a.n
In-Reply-To: <1340051963-14836-1-git-send-email-ordex@autistici.org>

From: Antonio Quartulli <ordex@autistici.org>
Date: Mon, 18 Jun 2012 22:39:04 +0200

> Hello David,
> 	here is our first set of changes intended for net-next/linux-3.6.
> 
> Patch 2 fixes a major bug in the TranslationTable code where the old value of
> skb->data is used for memory access even if the data was relocated.
> Patches 4, 10, 11, 13, 14 are endianness-related cleanups that we wrote thanks
> to Al Viro's advice and help.
> Thanks to Martin Hundebøll batman-adv now supports the ethtool API and can
> export several counters that are specific to our module (patch 5).
> Then patch 16 improves the routing protocol API by making part of the
> TranslationTable code routing agnostic.
> The rest are minor fixes and other cleanups.

Pulled, thanks.

^ permalink raw reply

* Re: [PATCH 0/4] netfilter updates for net-next (batch 3)
From: Pablo Neira Ayuso @ 2012-06-19  3:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340075789-6196-1-git-send-email-pablo@netfilter.org>

[-- Attachment #1: Type: text/plain, Size: 333 bytes --]

On Tue, Jun 19, 2012 at 05:16:25AM +0200, pablo@netfilter.org wrote:
[...]
> You can pull these changes from:
> 
> git://1984.lsi.us.es/nf-next master

Please, also take the small patch attached after this 4 patch series. It
fixes one linking issue.

Sorry, I'll take more care next time to test compilation options more
extensively.

[-- Attachment #2: 0001-netfilter-fix-missing-symbols-if-CONFIG_NETFILTER_NE.patch --]
[-- Type: text/x-diff, Size: 1358 bytes --]

>From af6b248c22759fb7448668bbe495f1cbe0a9109d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Jun 2012 05:25:46 +0200
Subject: [PATCH] netfilter: fix missing symbols if
 CONFIG_NETFILTER_NETLINK_QUEUE_CT unset

ERROR: "nfqnl_ct_parse" [net/netfilter/nfnetlink_queue.ko] undefined!
ERROR: "nfqnl_ct_seq_adjust" [net/netfilter/nfnetlink_queue.ko] undefined!
ERROR: "nfqnl_ct_put" [net/netfilter/nfnetlink_queue.ko] undefined!
ERROR: "nfqnl_ct_get" [net/netfilter/nfnetlink_queue.ko] undefined!

We have to use CONFIG_NETFILTER_NETLINK_QUEUE_CT in
include/net/netfilter/nfnetlink_queue.h, not CONFIG_NF_CONNTRACK.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nfnetlink_queue.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/netfilter/nfnetlink_queue.h b/include/net/netfilter/nfnetlink_queue.h
index 9f8095c..86267a5 100644
--- a/include/net/netfilter/nfnetlink_queue.h
+++ b/include/net/netfilter/nfnetlink_queue.h
@@ -5,7 +5,7 @@
 
 struct nf_conn;
 
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
 struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size,
 			     enum ip_conntrack_info *ctinfo);
 struct nf_conn *nfqnl_ct_parse(const struct sk_buff *skb,
-- 
1.7.10


^ permalink raw reply related

* Re: [PATCH] ipv4: Early TCP socket demux.
From: Stephen Hemminger @ 2012-06-19  4:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120618.194016.2282814982594761206.davem@davemloft.net>

> 
> You know you want it.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>

David, I understand it, Eric understands it, and maybe one or
two others. But on the principle of what is "good for the goose
is good for the gander", you really need to provide a reasonable
change log entry. Just because you are the network maintainer
doesn't mean you get to skip all the documented rules about submitting
patches.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-19  4:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120618.194016.2282814982594761206.davem@davemloft.net>

On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
> You know you want it.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>

Yeah, very good idea David ;)

needs some polishing of course.

This reminds me of the idea of having a separate dst per tcp socket, to remove
the dst refcnt contention as well.

^ permalink raw reply

* Re: [PATCH 0/4] netfilter updates for net-next (batch 3)
From: David Miller @ 2012-06-19  4:09 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <20120619033745.GA31405@1984>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Jun 2012 05:37:45 +0200

> On Tue, Jun 19, 2012 at 05:16:25AM +0200, pablo@netfilter.org wrote:
> [...]
>> You can pull these changes from:
>> 
>> git://1984.lsi.us.es/nf-next master
> 
> Please, also take the small patch attached after this 4 patch series. It
> fixes one linking issue.
> 
> Sorry, I'll take more care next time to test compilation options more
> extensively.

Done.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-19  4:10 UTC (permalink / raw)
  To: stephen.hemminger; +Cc: netdev
In-Reply-To: <a22edf13-1bf5-49fb-8ebe-05054f68c129@tahiti.vyatta.com>

From: Stephen Hemminger <stephen.hemminger@vyatta.com>
Date: Mon, 18 Jun 2012 21:03:26 -0700 (PDT)

> David, I understand it, Eric understands it, and maybe one or
> two others. But on the principle of what is "good for the goose
> is good for the gander", you really need to provide a reasonable
> change log entry. Just because you are the network maintainer
> doesn't mean you get to skip all the documented rules about submitting
> patches.

This wasn't going to be the final commit log entry.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: Changli Gao @ 2012-06-19  4:13 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev
In-Reply-To: <a22edf13-1bf5-49fb-8ebe-05054f68c129@tahiti.vyatta.com>

On Tue, Jun 19, 2012 at 12:03 PM, Stephen Hemminger
<stephen.hemminger@vyatta.com> wrote:
>
>>
>> You know you want it.
>>
>> Signed-off-by: David S. Miller <davem@davemloft.net>
>
> David, I understand it, Eric understands it, and maybe one or
> two others. But on the principle of what is "good for the goose
> is good for the gander", you really need to provide a reasonable
> change log entry. Just because you are the network maintainer
> doesn't mean you get to skip all the documented rules about submitting
> patches.

Agree. Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-19  4:15 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1340078846.7491.2127.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Jun 2012 06:07:26 +0200

> On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
>> You know you want it.
>> 
>> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> Yeah, very good idea David ;)
> 
> needs some polishing of course.

Such as?  IPv6 support?

> This reminds me of the idea of having a separate dst per tcp socket, to remove
> the dst refcnt contention as well.

I'm leery of this.

We're going to move towards having dst entries more strongly shared
as we move to remove the routing cache.

In fact I have another short-term change planned that adjusts which
keys the routing cache uses based upon what kinds of keys are actually
active in the current FIB rule configuration.

I think we want to encourage sharing and make the route footprint
smaller rather than expanding its size artificially on even the
socket level.

If you really care about this refcount problem, then it's another
reason to never orphan socket sourced packets.  Then we wouldn't need
to ever refcount the route just to send a packet, we'd just use the
implicit reference held by the socket instead.  Socket route releasing
would be held back by the presence of any packets in the socket send
queue.  If we have to reset the dst mid-lifetime due to route flushes,
we'd need to use a specific packet as a sequence point.

That to me sounds like a more reasonable approach than just making
more and more routes.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-19  4:16 UTC (permalink / raw)
  To: xiaosuo; +Cc: stephen.hemminger, netdev
In-Reply-To: <CABa6K_E+EzeJQF_m4aoq1WAYUwWQxd8_=k7HT0XZ6P5EpBR+4g@mail.gmail.com>

From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 19 Jun 2012 12:13:48 +0800

> On Tue, Jun 19, 2012 at 12:03 PM, Stephen Hemminger
> <stephen.hemminger@vyatta.com> wrote:
>>
>>>
>>> You know you want it.
>>>
>>> Signed-off-by: David S. Miller <davem@davemloft.net>
>>
>> David, I understand it, Eric understands it, and maybe one or
>> two others. But on the principle of what is "good for the goose
>> is good for the gander", you really need to provide a reasonable
>> change log entry. Just because you are the network maintainer
>> doesn't mean you get to skip all the documented rules about submitting
>> patches.
> 
> Agree. Thanks.

That's the last time I try to be even slightly humorous on this
list.

Thanks for killing the fun Stephen.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-19  4:23 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120618.211539.105938285016510975.davem@davemloft.net>

On Mon, 2012-06-18 at 21:15 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 19 Jun 2012 06:07:26 +0200
> 
> > On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
> >> You know you want it.
> >> 
> >> Signed-off-by: David S. Miller <davem@davemloft.net>
> > 
> > Yeah, very good idea David ;)
> > 
> > needs some polishing of course.
> 
> Such as?  IPv6 support?
> 

I was referring to socket leak in :

+       if (sk) {
+               skb->sk = sk;
+               skb->destructor = sock_edemux;
+               if (sk->sk_state != TCP_TIME_WAIT) {
+                       struct dst_entry *dst = sk->sk_rx_dst;
+                       if (dst) {
+                               skb_dst_set_noref(skb, dst);
+                               err = 0;
+                       }
+               }
+       }

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-19  4:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1340079813.7491.2164.camel@edumazet-glaptop>

On Tue, 2012-06-19 at 06:23 +0200, Eric Dumazet wrote:
> On Mon, 2012-06-18 at 21:15 -0700, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Tue, 19 Jun 2012 06:07:26 +0200
> > 
> > > On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
> > >> You know you want it.
> > >> 
> > >> Signed-off-by: David S. Miller <davem@davemloft.net>
> > > 
> > > Yeah, very good idea David ;)
> > > 
> > > needs some polishing of course.
> > 
> > Such as?  IPv6 support?
> > 
> 
> I was referring to socket leak in :
> 
> +       if (sk) {
> +               skb->sk = sk;
> +               skb->destructor = sock_edemux;
> +               if (sk->sk_state != TCP_TIME_WAIT) {
> +                       struct dst_entry *dst = sk->sk_rx_dst;
> +                       if (dst) {
> +                               skb_dst_set_noref(skb, dst);
> +                               err = 0;
> +                       }
> +               }
> +       }
> 
> 

It's not a leak, but seems strange to keep it around if we don't use it
yet.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-19  4:26 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1340079813.7491.2164.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Jun 2012 06:23:33 +0200

> I was referring to socket leak in :
> 
> +       if (sk) {
> +               skb->sk = sk;
> +               skb->destructor = sock_edemux;
> +               if (sk->sk_state != TCP_TIME_WAIT) {
> +                       struct dst_entry *dst = sk->sk_rx_dst;
> +                       if (dst) {
> +                               skb_dst_set_noref(skb, dst);
> +                               err = 0;
> +                       }
> +               }
> +       }
> 

I see no leak.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: David Miller @ 2012-06-19  4:27 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1340079938.7491.2172.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Jun 2012 06:25:38 +0200

> On Tue, 2012-06-19 at 06:23 +0200, Eric Dumazet wrote:
>> On Mon, 2012-06-18 at 21:15 -0700, David Miller wrote:
>> > From: Eric Dumazet <eric.dumazet@gmail.com>
>> > Date: Tue, 19 Jun 2012 06:07:26 +0200
>> > 
>> > > On Mon, 2012-06-18 at 19:40 -0700, David Miller wrote:
>> > >> You know you want it.
>> > >> 
>> > >> Signed-off-by: David S. Miller <davem@davemloft.net>
>> > > 
>> > > Yeah, very good idea David ;)
>> > > 
>> > > needs some polishing of course.
>> > 
>> > Such as?  IPv6 support?
>> > 
>> 
>> I was referring to socket leak in :
>> 
>> +       if (sk) {
>> +               skb->sk = sk;
>> +               skb->destructor = sock_edemux;
>> +               if (sk->sk_state != TCP_TIME_WAIT) {
>> +                       struct dst_entry *dst = sk->sk_rx_dst;
>> +                       if (dst) {
>> +                               skb_dst_set_noref(skb, dst);
>> +                               err = 0;
>> +                       }
>> +               }
>> +       }
>> 
>> 
> 
> It's not a leak, but seems strange to keep it around if we don't use it
> yet.

How are we not using it?  We use the cached SKB socket no matter what
happens.

Look at how inet hash lookup works.

The error tells the caller solely whether a route lookup is still
necessary.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-19  4:31 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120618.211539.105938285016510975.davem@davemloft.net>

On Mon, 2012-06-18 at 21:15 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>

> > This reminds me of the idea of having a separate dst per tcp socket, to remove
> > the dst refcnt contention as well.
> 
> I'm leery of this.
> 
> We're going to move towards having dst entries more strongly shared
> as we move to remove the routing cache.
> 
> In fact I have another short-term change planned that adjusts which
> keys the routing cache uses based upon what kinds of keys are actually
> active in the current FIB rule configuration.
> 
> I think we want to encourage sharing and make the route footprint
> smaller rather than expanding its size artificially on even the
> socket level.
> 
> If you really care about this refcount problem, then it's another
> reason to never orphan socket sourced packets.  Then we wouldn't need
> to ever refcount the route just to send a packet, we'd just use the
> implicit reference held by the socket instead.  Socket route releasing
> would be held back by the presence of any packets in the socket send
> queue.  If we have to reset the dst mid-lifetime due to route flushes,
> we'd need to use a specific packet as a sequence point.
> 
> That to me sounds like a more reasonable approach than just making
> more and more routes.

We already don't touch dst refcnt on TCP xmit path, unless packet is
parked in Qdisc queue...

But with BQL this is becoming less effective.

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: Eric Dumazet @ 2012-06-19  4:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120618.212721.2148947025741866390.davem@davemloft.net>

On Mon, 2012-06-18 at 21:27 -0700, David Miller wrote:

> How are we not using it?  We use the cached SKB socket no matter what
> happens.
> 
> Look at how inet hash lookup works.
> 
> The error tells the caller solely whether a route lookup is still
> necessary.

OK, remove the unlikely() in __inet_lookup_skb() so that it's obvious we
have this skb_steal_sock() thing :)

^ permalink raw reply

* Re: [PATCH] ipv4: Early TCP socket demux.
From: Changli Gao @ 2012-06-19  4:43 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Tom Herbert
In-Reply-To: <20120618.194016.2282814982594761206.davem@davemloft.net>

On Tue, Jun 19, 2012 at 10:40 AM, David Miller <davem@davemloft.net> wrote:
> @@ -324,19 +324,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
>         *      how the packet travels inside Linux networking.
>         */
>        if (skb_dst(skb) == NULL) {
> -               int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> -                                              iph->tos, skb->dev);
> -               if (unlikely(err)) {
> -                       if (err == -EHOSTUNREACH)
> -                               IP_INC_STATS_BH(dev_net(skb->dev),
> -                                               IPSTATS_MIB_INADDRERRORS);
> -                       else if (err == -ENETUNREACH)
> -                               IP_INC_STATS_BH(dev_net(skb->dev),
> -                                               IPSTATS_MIB_INNOROUTES);
> -                       else if (err == -EXDEV)
> -                               NET_INC_STATS_BH(dev_net(skb->dev),
> -                                                LINUX_MIB_IPRPFILTER);
> -                       goto drop;
> +               const struct net_protocol *ipprot;
> +               int protocol = iph->protocol;
> +               int hash, err;
> +
> +               hash = protocol & (MAX_INET_PROTOS - 1);
> +
> +               rcu_read_lock();
> +               ipprot = rcu_dereference(inet_protos[hash]);
> +               err = -ENOENT;
> +               if (ipprot && ipprot->early_demux)
> +                       err = ipprot->early_demux(skb);

I am afraid that this lookup will hurt the performance of the
forwarding path. A knob?

If this approach is acceptable, maybe we can use sockets to do finer RFS.

Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox