Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 74/79] netfilter: reduce NF_VERDICT_MASK to 0xff
From: kaber @ 2011-01-19 19:15 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Florian Westphal <fw@strlen.de>

NF_VERDICT_MASK is currently 0xffff. This is because the upper
16 bits are used to store errno (for NF_DROP) or the queue number
(NF_QUEUE verdict).

As there are up to 0xffff different queues available, there is no more
room to store additional flags.

At the moment there are only 6 different verdicts, i.e. we can reduce
NF_VERDICT_MASK to 0xff to allow storing additional flags in the 0xff00 space.

NF_VERDICT_BITS would then be reduced to 8, but because the value is
exported to userspace, this might cause breakage; e.g.:

e.g. 'queuenr = (1 << NF_VERDICT_BITS) | NF_QUEUE'  would now break.

Thus, remove NF_VERDICT_BITS usage in the kernel and move the old value
to the 'userspace compat' section.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter.h |   20 +++++++++++++++-----
 net/netfilter/core.c      |    4 ++--
 net/netfilter/nf_queue.c  |    2 +-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 0ab7ca7..78b73cc 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -24,16 +24,19 @@
 #define NF_MAX_VERDICT NF_STOP
 
 /* we overload the higher bits for encoding auxiliary data such as the queue
- * number. Not nice, but better than additional function arguments. */
-#define NF_VERDICT_MASK 0x0000ffff
-#define NF_VERDICT_BITS 16
+ * number or errno values. Not nice, but better than additional function
+ * arguments. */
+#define NF_VERDICT_MASK 0x000000ff
+
+/* extra verdict flags have mask 0x0000ff00 */
 
+/* queue number (NF_QUEUE) or errno (NF_DROP) */
 #define NF_VERDICT_QMASK 0xffff0000
 #define NF_VERDICT_QBITS 16
 
-#define NF_QUEUE_NR(x) ((((x) << NF_VERDICT_BITS) & NF_VERDICT_QMASK) | NF_QUEUE)
+#define NF_QUEUE_NR(x) ((((x) << 16) & NF_VERDICT_QMASK) | NF_QUEUE)
 
-#define NF_DROP_ERR(x) (((-x) << NF_VERDICT_BITS) | NF_DROP)
+#define NF_DROP_ERR(x) (((-x) << 16) | NF_DROP)
 
 /* only for userspace compatibility */
 #ifndef __KERNEL__
@@ -41,6 +44,9 @@
    <= 0x2000 is used for protocol-flags. */
 #define NFC_UNKNOWN 0x4000
 #define NFC_ALTERED 0x8000
+
+/* NF_VERDICT_BITS should be 8 now, but userspace might break if this changes */
+#define NF_VERDICT_BITS 16
 #endif
 
 enum nf_inet_hooks {
@@ -72,6 +78,10 @@ union nf_inet_addr {
 
 #ifdef __KERNEL__
 #ifdef CONFIG_NETFILTER
+static inline int NF_DROP_GETERR(int verdict)
+{
+	return -(verdict >> NF_VERDICT_QBITS);
+}
 
 static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
 				   const union nf_inet_addr *a2)
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0c5b796..4d88e45 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -175,12 +175,12 @@ next_hook:
 		ret = 1;
 	} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
 		kfree_skb(skb);
-		ret = -(verdict >> NF_VERDICT_BITS);
+		ret = NF_DROP_GETERR(verdict);
 		if (ret == 0)
 			ret = -EPERM;
 	} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
 		ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
-			       verdict >> NF_VERDICT_BITS);
+			       verdict >> NF_VERDICT_QBITS);
 		if (ret < 0) {
 			if (ret == -ECANCELED)
 				goto next_hook;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5c4b730..ce1150d4a 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -299,7 +299,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 	case NF_QUEUE:
 		err = __nf_queue(skb, elem, entry->pf, entry->hook,
 				 entry->indev, entry->outdev, entry->okfn,
-				 verdict >> NF_VERDICT_BITS);
+				 verdict >> NF_VERDICT_QBITS);
 		if (err < 0) {
 			if (err == -ECANCELED)
 				goto next_hook;
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 75/79] netfilter: allow NFQUEUE bypass if no listener is available
From: kaber @ 2011-01-19 19:15 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Florian Westphal <fw@strlen.de>

If an skb is to be NF_QUEUE'd, but no program has opened the queue, the
packet is dropped.

This adds a v2 target revision of xt_NFQUEUE that allows packets to
continue through the ruleset instead.

Because the actual queueing happens outside of the target context, the
'bypass' flag has to be communicated back to the netfilter core.

Unfortunately the only choice to do this without adding a new function
argument is to use the target function return value (i.e. the verdict).

In the NF_QUEUE case, the upper 16bit already contain the queue number
to use.  The previous patch reduced NF_VERDICT_MASK to 0xff, i.e.
we now have extra room for a new flag.

If a hook issued a NF_QUEUE verdict, then the netfilter core will
continue packet processing if the queueing hook
returns -ESRCH (== "this queue does not exist") and the new
NF_VERDICT_FLAG_QUEUE_BYPASS flag is set in the verdict value.

Note: If the queue exists, but userspace does not consume packets fast
enough, the skb will still be dropped.

Signed-off-by: Florian Westphal <fwestphal@astaro.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter.h            |    1 +
 include/linux/netfilter/xt_NFQUEUE.h |    6 ++++++
 net/netfilter/core.c                 |    3 +++
 net/netfilter/nf_queue.c             |    7 ++++++-
 net/netfilter/xt_NFQUEUE.c           |   28 +++++++++++++++++++++++++---
 5 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 78b73cc..eeec00a 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -29,6 +29,7 @@
 #define NF_VERDICT_MASK 0x000000ff
 
 /* extra verdict flags have mask 0x0000ff00 */
+#define NF_VERDICT_FLAG_QUEUE_BYPASS	0x00008000
 
 /* queue number (NF_QUEUE) or errno (NF_DROP) */
 #define NF_VERDICT_QMASK 0xffff0000
diff --git a/include/linux/netfilter/xt_NFQUEUE.h b/include/linux/netfilter/xt_NFQUEUE.h
index 2584f4a..9eafdbb 100644
--- a/include/linux/netfilter/xt_NFQUEUE.h
+++ b/include/linux/netfilter/xt_NFQUEUE.h
@@ -20,4 +20,10 @@ struct xt_NFQ_info_v1 {
 	__u16 queues_total;
 };
 
+struct xt_NFQ_info_v2 {
+	__u16 queuenum;
+	__u16 queues_total;
+	__u16 bypass;
+};
+
 #endif /* _XT_NFQ_TARGET_H */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 4d88e45..1e00bf7 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -184,6 +184,9 @@ next_hook:
 		if (ret < 0) {
 			if (ret == -ECANCELED)
 				goto next_hook;
+			if (ret == -ESRCH &&
+			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+				goto next_hook;
 			kfree_skb(skb);
 		}
 		ret = 0;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index ce1150d4a..5ab22e2 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -138,8 +138,10 @@ static int __nf_queue(struct sk_buff *skb,
 	rcu_read_lock();
 
 	qh = rcu_dereference(queue_handler[pf]);
-	if (!qh)
+	if (!qh) {
+		status = -ESRCH;
 		goto err_unlock;
+	}
 
 	afinfo = nf_get_afinfo(pf);
 	if (!afinfo)
@@ -303,6 +305,9 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 		if (err < 0) {
 			if (err == -ECANCELED)
 				goto next_hook;
+			if (err == -ESRCH &&
+			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+				goto next_hook;
 			kfree_skb(skb);
 		}
 		break;
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 3962770..d4f4b5d 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -83,9 +83,20 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	return NF_QUEUE_NR(queue);
 }
 
-static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
+static unsigned int
+nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	const struct xt_NFQ_info_v1 *info = par->targinfo;
+	const struct xt_NFQ_info_v2 *info = par->targinfo;
+	unsigned int ret = nfqueue_tg_v1(skb, par);
+
+	if (info->bypass)
+		ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+	return ret;
+}
+
+static int nfqueue_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_NFQ_info_v2 *info = par->targinfo;
 	u32 maxid;
 
 	if (unlikely(!rnd_inited)) {
@@ -102,6 +113,8 @@ static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
 		       info->queues_total, maxid);
 		return -ERANGE;
 	}
+	if (par->target->revision == 2 && info->bypass > 1)
+		return -EINVAL;
 	return 0;
 }
 
@@ -117,11 +130,20 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
 		.name		= "NFQUEUE",
 		.revision	= 1,
 		.family		= NFPROTO_UNSPEC,
-		.checkentry	= nfqueue_tg_v1_check,
+		.checkentry	= nfqueue_tg_check,
 		.target		= nfqueue_tg_v1,
 		.targetsize	= sizeof(struct xt_NFQ_info_v1),
 		.me		= THIS_MODULE,
 	},
+	{
+		.name		= "NFQUEUE",
+		.revision	= 2,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= nfqueue_tg_check,
+		.target		= nfqueue_tg_v2,
+		.targetsize	= sizeof(struct xt_NFQ_info_v2),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init nfqueue_tg_init(void)
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 76/79] netfilter: ipt_CLUSTERIP: remove "no conntrack!"
From: kaber @ 2011-01-19 19:15 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

When a packet is meant to be handled by another node of the cluster,
silently drop it instead of flooding kernel log.

Note : INVALID packets are also dropped without notice.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c |    7 +------
 1 files changed, 1 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a48..403ca57 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	 * that the ->target() function isn't called after ->destroy() */
 
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct == NULL) {
-		pr_info("no conntrack!\n");
-			/* FIXME: need to drop invalid ones, since replies
-			 * to outgoing connections of other nodes will be
-			 * marked as INVALID */
+	if (ct == NULL)
 		return NF_DROP;
-	}
 
 	/* special case: ICMP error handling. conntrack distinguishes between
 	 * error messages (RELATED) and information requests (see below) */
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 78/79] netfilter: nf_conntrack_tstamp: add flow-based timestamp extension
From: kaber @ 2011-01-19 19:15 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch adds flow-based timestamping for conntracks. This
conntrack extension is disabled by default. Basically, we use
two 64-bits variables to store the creation timestamp once the
conntrack has been confirmed and the other to store the deletion
time. This extension is disabled by default, to enable it, you
have to:

echo 1 > /proc/sys/net/netfilter/nf_conntrack_timestamp

This patch allows to save memory for user-space flow-based
loogers such as ulogd2. In short, ulogd2 does not need to
keep a hashtable with the conntrack in user-space to know
when they were created and destroyed, instead we use the
kernel timestamp. If we want to have a sane IPFIX implementation
in user-space, this nanosecs resolution timestamps are also
useful. Other custom user-space applications can benefit from
this via libnetfilter_conntrack.

This patch modifies the /proc output to display the delta time
in seconds since the flow start. You can also obtain the
flow-start date by means of the conntrack-tools.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nfnetlink_conntrack.h  |    9 ++
 include/net/netfilter/nf_conntrack_extend.h    |    4 +
 include/net/netfilter/nf_conntrack_timestamp.h |   53 +++++++++++
 include/net/netns/conntrack.h                  |    2 +
 net/netfilter/Kconfig                          |   11 ++
 net/netfilter/Makefile                         |    1 +
 net/netfilter/nf_conntrack_core.c              |   26 +++++
 net/netfilter/nf_conntrack_netlink.c           |   46 +++++++++-
 net/netfilter/nf_conntrack_standalone.c        |   41 ++++++++
 net/netfilter/nf_conntrack_timestamp.c         |  120 ++++++++++++++++++++++++
 10 files changed, 312 insertions(+), 1 deletions(-)
 create mode 100644 include/net/netfilter/nf_conntrack_timestamp.h
 create mode 100644 net/netfilter/nf_conntrack_timestamp.c

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 19711e3..debf1ae 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -42,6 +42,7 @@ enum ctattr_type {
 	CTA_SECMARK,		/* obsolete */
 	CTA_ZONE,
 	CTA_SECCTX,
+	CTA_TIMESTAMP,
 	__CTA_MAX
 };
 #define CTA_MAX (__CTA_MAX - 1)
@@ -127,6 +128,14 @@ enum ctattr_counters {
 };
 #define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1)
 
+enum ctattr_tstamp {
+	CTA_TIMESTAMP_UNSPEC,
+	CTA_TIMESTAMP_START,
+	CTA_TIMESTAMP_STOP,
+	__CTA_TIMESTAMP_MAX
+};
+#define CTA_TIMESTAMP_MAX (__CTA_TIMESTAMP_MAX - 1)
+
 enum ctattr_nat {
 	CTA_NAT_UNSPEC,
 	CTA_NAT_MINIP,
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 1a9f96d..2dcf317 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -17,6 +17,9 @@ enum nf_ct_ext_id {
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	NF_CT_EXT_ZONE,
 #endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+	NF_CT_EXT_TSTAMP,
+#endif
 	NF_CT_EXT_NUM,
 };
 
@@ -25,6 +28,7 @@ enum nf_ct_ext_id {
 #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
 #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
 #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone
+#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp
 
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {
diff --git a/include/net/netfilter/nf_conntrack_timestamp.h b/include/net/netfilter/nf_conntrack_timestamp.h
new file mode 100644
index 0000000..f17dcb6
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_timestamp.h
@@ -0,0 +1,53 @@
+#ifndef _NF_CONNTRACK_TSTAMP_H
+#define _NF_CONNTRACK_TSTAMP_H
+
+#include <net/net_namespace.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+struct nf_conn_tstamp {
+	u_int64_t start;
+	u_int64_t stop;
+};
+
+static inline
+struct nf_conn_tstamp *nf_conn_tstamp_find(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+	return nf_ct_ext_find(ct, NF_CT_EXT_TSTAMP);
+#else
+	return NULL;
+#endif
+}
+
+static inline
+struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+	struct net *net = nf_ct_net(ct);
+
+	if (!net->ct.sysctl_tstamp)
+		return NULL;
+
+	return nf_ct_ext_add(ct, NF_CT_EXT_TSTAMP, gfp);
+#else
+	return NULL;
+#endif
+};
+
+static inline bool nf_ct_tstamp_enabled(struct net *net)
+{
+	return net->ct.sysctl_tstamp != 0;
+}
+
+static inline void nf_ct_set_tstamp(struct net *net, bool enable)
+{
+	net->ct.sysctl_tstamp = enable;
+}
+
+extern int nf_conntrack_tstamp_init(struct net *net);
+extern void nf_conntrack_tstamp_fini(struct net *net);
+
+#endif /* _NF_CONNTRACK_TSTAMP_H */
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 5cf8a8c..341eb08 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -21,11 +21,13 @@ struct netns_ct {
 	int			sysctl_events;
 	unsigned int		sysctl_events_retry_timeout;
 	int			sysctl_acct;
+	int			sysctl_tstamp;
 	int			sysctl_checksum;
 	unsigned int		sysctl_log_invalid; /* Log invalid packets */
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*sysctl_header;
 	struct ctl_table_header	*acct_sysctl_header;
+	struct ctl_table_header	*tstamp_sysctl_header;
 	struct ctl_table_header	*event_sysctl_header;
 #endif
 	char			*slabname;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 939b504..faf7412 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -85,6 +85,17 @@ config NF_CONNTRACK_EVENTS
 
 	  If unsure, say `N'.
 
+config NF_CONNTRACK_TIMESTAMP
+	bool  'Connection tracking timestamping'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option enables support for connection tracking timestamping.
+	  This allows you to store the flow start-time and to obtain
+	  the flow-stop time (once it has been destroyed) via Connection
+	  tracking events.
+
+	  If unsure, say `N'.
+
 config NF_CT_PROTO_DCCP
 	tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)'
 	depends on EXPERIMENTAL
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 2c2628d..9ae6878 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,6 +1,7 @@
 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
 
 nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f47ac67..1909311 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -43,6 +43,7 @@
 #include <net/netfilter/nf_conntrack_acct.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
 
@@ -282,6 +283,11 @@ EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
 static void death_by_timeout(unsigned long ul_conntrack)
 {
 	struct nf_conn *ct = (void *)ul_conntrack;
+	struct nf_conn_tstamp *tstamp;
+
+	tstamp = nf_conn_tstamp_find(ct);
+	if (tstamp && tstamp->stop == 0)
+		tstamp->stop = ktime_to_ns(ktime_get_real());
 
 	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
 	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
@@ -419,6 +425,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	struct nf_conn_help *help;
+	struct nf_conn_tstamp *tstamp;
 	struct hlist_nulls_node *n;
 	enum ip_conntrack_info ctinfo;
 	struct net *net;
@@ -488,6 +495,14 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	atomic_inc(&ct->ct_general.use);
 	ct->status |= IPS_CONFIRMED;
 
+	/* set conntrack timestamp, if enabled. */
+	tstamp = nf_conn_tstamp_find(ct);
+	if (tstamp) {
+		if (skb->tstamp.tv64 == 0)
+			__net_timestamp((struct sk_buff *)skb);
+
+		tstamp->start = ktime_to_ns(skb->tstamp);
+	}
 	/* Since the lookup is lockless, hash insertion must be done after
 	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
 	 * guarantee that no other CPU can find the conntrack before the above
@@ -746,6 +761,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	}
 
 	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
 
 	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
 	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
@@ -1186,6 +1202,11 @@ struct __nf_ct_flush_report {
 static int kill_report(struct nf_conn *i, void *data)
 {
 	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
+	struct nf_conn_tstamp *tstamp;
+
+	tstamp = nf_conn_tstamp_find(i);
+	if (tstamp && tstamp->stop == 0)
+		tstamp->stop = ktime_to_ns(ktime_get_real());
 
 	/* If we fail to deliver the event, death_by_timeout() will retry */
 	if (nf_conntrack_event_report(IPCT_DESTROY, i,
@@ -1497,6 +1518,9 @@ static int nf_conntrack_init_net(struct net *net)
 	ret = nf_conntrack_acct_init(net);
 	if (ret < 0)
 		goto err_acct;
+	ret = nf_conntrack_tstamp_init(net);
+	if (ret < 0)
+		goto err_tstamp;
 	ret = nf_conntrack_ecache_init(net);
 	if (ret < 0)
 		goto err_ecache;
@@ -1504,6 +1528,8 @@ static int nf_conntrack_init_net(struct net *net)
 	return 0;
 
 err_ecache:
+	nf_conntrack_tstamp_fini(net);
+err_tstamp:
 	nf_conntrack_acct_fini(net);
 err_acct:
 	nf_conntrack_expect_fini(net);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9eabaa6..715d56c 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -42,6 +42,7 @@
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_acct.h>
 #include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
 #ifdef CONFIG_NF_NAT_NEEDED
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_protocol.h>
@@ -230,6 +231,33 @@ nla_put_failure:
 	return -1;
 }
 
+static int
+ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	struct nlattr *nest_count;
+	const struct nf_conn_tstamp *tstamp;
+
+	tstamp = nf_conn_tstamp_find(ct);
+	if (!tstamp)
+		return 0;
+
+	nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
+	if (!nest_count)
+		goto nla_put_failure;
+
+	NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start));
+	if (tstamp->stop != 0) {
+		NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP,
+			     cpu_to_be64(tstamp->stop));
+	}
+	nla_nest_end(skb, nest_count);
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 #ifdef CONFIG_NF_CONNTRACK_MARK
 static inline int
 ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
@@ -404,6 +432,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 	    ctnetlink_dump_timeout(skb, ct) < 0 ||
 	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
 	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+	    ctnetlink_dump_timestamp(skb, ct) < 0 ||
 	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_mark(skb, ct) < 0 ||
@@ -471,6 +500,18 @@ ctnetlink_secctx_size(const struct nf_conn *ct)
 }
 
 static inline size_t
+ctnetlink_timestamp_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+	if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+		return 0;
+	return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+#else
+	return 0;
+#endif
+}
+
+static inline size_t
 ctnetlink_nlmsg_size(const struct nf_conn *ct)
 {
 	return NLMSG_ALIGN(sizeof(struct nfgenmsg))
@@ -481,6 +522,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
 	       + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
 	       + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
 	       + ctnetlink_counters_size(ct)
+	       + ctnetlink_timestamp_size(ct)
 	       + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
 	       + nla_total_size(0) /* CTA_PROTOINFO */
 	       + nla_total_size(0) /* CTA_HELP */
@@ -571,7 +613,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 
 	if (events & (1 << IPCT_DESTROY)) {
 		if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
-		    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+		    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+		    ctnetlink_dump_timestamp(skb, ct) < 0)
 			goto nla_put_failure;
 	} else {
 		if (ctnetlink_dump_timeout(skb, ct) < 0)
@@ -1360,6 +1403,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
 	}
 
 	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
 	nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
 	/* we must add conntrack extensions before confirmation. */
 	ct->status |= IPS_CONFIRMED;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 8257bf6..69107fd 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -29,6 +29,7 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_acct.h>
 #include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
 #include <linux/rculist_nulls.h>
 
 MODULE_LICENSE("GPL");
@@ -46,6 +47,7 @@ EXPORT_SYMBOL_GPL(print_tuple);
 struct ct_iter_state {
 	struct seq_net_private p;
 	unsigned int bucket;
+	u_int64_t time_now;
 };
 
 static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
@@ -96,6 +98,9 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
 static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
 {
+	struct ct_iter_state *st = seq->private;
+
+	st->time_now = ktime_to_ns(ktime_get_real());
 	rcu_read_lock();
 	return ct_get_idx(seq, *pos);
 }
@@ -135,6 +140,39 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
 }
 #endif
 
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+static u_int64_t ct_delta_time(u_int64_t time_now, const struct nf_conn *ct)
+{
+	struct nf_conn_tstamp *tstamp;
+
+	tstamp = nf_conn_tstamp_find(ct);
+	if (tstamp) {
+		u_int64_t delta_time = time_now - tstamp->start;
+		return delta_time > 0 ? div_s64(delta_time, NSEC_PER_SEC) : 0;
+	}
+	return -1;
+}
+
+static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+	struct ct_iter_state *st = s->private;
+	u_int64_t delta_time;
+
+	delta_time = ct_delta_time(st->time_now, ct);
+	if (delta_time < 0)
+		return 0;
+
+	return seq_printf(s, "delta-time=%llu ",
+			  (unsigned long long)delta_time);
+}
+#else
+static inline int
+ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+	return 0;
+}
+#endif
+
 /* return 0 on success, 1 in case of error */
 static int ct_seq_show(struct seq_file *s, void *v)
 {
@@ -203,6 +241,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		goto release;
 #endif
 
+	if (ct_show_delta_time(s, ct))
+		goto release;
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
 		goto release;
 
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
new file mode 100644
index 0000000..af7dd31
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -0,0 +1,120 @@
+/*
+ * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+
+static int nf_ct_tstamp __read_mostly;
+
+module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
+MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table tstamp_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_timestamp",
+		.data		= &init_net.ct.sysctl_tstamp,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type tstamp_extend __read_mostly = {
+	.len	= sizeof(struct nf_conn_tstamp),
+	.align	= __alignof__(struct nf_conn_tstamp),
+	.id	= NF_CT_EXT_TSTAMP,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	table[0].data = &net->ct.sysctl_tstamp;
+
+	net->ct.tstamp_sysctl_header = register_net_sysctl_table(net,
+			nf_net_netfilter_sysctl_path, table);
+	if (!net->ct.tstamp_sysctl_header) {
+		printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
+		goto out_register;
+	}
+	return 0;
+
+out_register:
+	kfree(table);
+out:
+	return -ENOMEM;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->ct.tstamp_sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
+	kfree(table);
+}
+#else
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_tstamp_init(struct net *net)
+{
+	int ret;
+
+	net->ct.sysctl_tstamp = nf_ct_tstamp;
+
+	if (net_eq(net, &init_net)) {
+		ret = nf_ct_extend_register(&tstamp_extend);
+		if (ret < 0) {
+			printk(KERN_ERR "nf_ct_tstamp: Unable to register "
+					"extension\n");
+			goto out_extend_register;
+		}
+	}
+
+	ret = nf_conntrack_tstamp_init_sysctl(net);
+	if (ret < 0)
+		goto out_sysctl;
+
+	return 0;
+
+out_sysctl:
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&tstamp_extend);
+out_extend_register:
+	return ret;
+}
+
+void nf_conntrack_tstamp_fini(struct net *net)
+{
+	nf_conntrack_tstamp_fini_sysctl(net);
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&tstamp_extend);
+}
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 79/79] netfilter: nf_conntrack: fix lifetime display for disabled connections
From: kaber @ 2011-01-19 19:15 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

When no tstamp extension exists, ct_delta_time() returns -1, which is
then assigned to an u64 and tested for negative values to decide
whether to display the lifetime. This obviously doesn't work, use
a s64 and merge the two minor functions into one.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_standalone.c |   29 ++++++++++++-----------------
 1 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 69107fd..0ae1428 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -141,29 +141,24 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
 #endif
 
 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
-static u_int64_t ct_delta_time(u_int64_t time_now, const struct nf_conn *ct)
+static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
 {
+	struct ct_iter_state *st = s->private;
 	struct nf_conn_tstamp *tstamp;
+	s64 delta_time;
 
 	tstamp = nf_conn_tstamp_find(ct);
 	if (tstamp) {
-		u_int64_t delta_time = time_now - tstamp->start;
-		return delta_time > 0 ? div_s64(delta_time, NSEC_PER_SEC) : 0;
+		delta_time = st->time_now - tstamp->start;
+		if (delta_time > 0)
+			delta_time = div_s64(delta_time, NSEC_PER_SEC);
+		else
+			delta_time = 0;
+
+		return seq_printf(s, "delta-time=%llu ",
+				  (unsigned long long)delta_time);
 	}
-	return -1;
-}
-
-static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
-{
-	struct ct_iter_state *st = s->private;
-	u_int64_t delta_time;
-
-	delta_time = ct_delta_time(st->time_now, ct);
-	if (delta_time < 0)
-		return 0;
-
-	return seq_printf(s, "delta-time=%llu ",
-			  (unsigned long long)delta_time);
+	return 0;
 }
 #else
 static inline int
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 01/79] netfilter: nf_conntrack: don't always initialize ct->proto
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

ct->proto is big(60 bytes) due to structure ip_ct_tcp, and we don't need
to initialize the whole for all the other protocols. This patch moves
proto to the end of structure nf_conn, and pushes the initialization down
to the individual protocols.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netfilter/nf_conntrack.h    |    6 +++---
 net/netfilter/nf_conntrack_core.c       |    3 ++-
 net/netfilter/nf_conntrack_netlink.c    |    1 +
 net/netfilter/nf_conntrack_proto_dccp.c |    3 +++
 net/netfilter/nf_conntrack_proto_sctp.c |    1 +
 net/netfilter/nf_conntrack_proto_tcp.c  |   14 +++-----------
 6 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index caf17db..abfff1e 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -116,14 +116,14 @@ struct nf_conn {
 	u_int32_t secmark;
 #endif
 
-	/* Storage reserved for other modules: */
-	union nf_conntrack_proto proto;
-
 	/* Extensions */
 	struct nf_ct_ext *ext;
 #ifdef CONFIG_NET_NS
 	struct net *ct_net;
 #endif
+
+	/* Storage reserved for other modules, must be the last member */
+	union nf_conntrack_proto proto;
 };
 
 static inline struct nf_conn *
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 27a5ea6..0ba7d48 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -651,7 +651,8 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
 	 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
 	 */
 	memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
-	       sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
+	       offsetof(struct nf_conn, proto) -
+	       offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
 	spin_lock_init(&ct->lock);
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index b729ace..7f59be8 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1375,6 +1375,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
 	}
 #endif
 
+	memset(&ct->proto, 0, sizeof(ct->proto));
 	if (cda[CTA_PROTOINFO]) {
 		err = ctnetlink_change_protoinfo(ct, cda);
 		if (err < 0)
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 5292560..9ae57c5 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -452,6 +452,9 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
 	ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
 	ct->proto.dccp.state = CT_DCCP_NONE;
+	ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
+	ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
+	ct->proto.dccp.handshake_seq = 0;
 	return true;
 
 out_invalid:
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index c6049c2..6f4ee70 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -413,6 +413,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	    test_bit(SCTP_CID_COOKIE_ACK, map))
 		return false;
 
+	memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
 	new_state = SCTP_CONNTRACK_MAX;
 	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
 		/* Don't need lock here: this conntrack not in circulation yet */
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 3fb2b73..6f38d0e 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1066,9 +1066,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	BUG_ON(th == NULL);
 
 	/* Don't need lock here: this conntrack not in circulation yet */
-	new_state
-		= tcp_conntracks[0][get_conntrack_index(th)]
-		[TCP_CONNTRACK_NONE];
+	new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
 
 	/* Invalid: delete conntrack */
 	if (new_state >= TCP_CONNTRACK_MAX) {
@@ -1077,6 +1075,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	}
 
 	if (new_state == TCP_CONNTRACK_SYN_SENT) {
+		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
 		/* SYN packet */
 		ct->proto.tcp.seen[0].td_end =
 			segment_seq_plus_len(ntohl(th->seq), skb->len,
@@ -1088,11 +1087,11 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 			ct->proto.tcp.seen[0].td_end;
 
 		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
-		ct->proto.tcp.seen[1].flags = 0;
 	} else if (nf_ct_tcp_loose == 0) {
 		/* Don't try to pick up connections. */
 		return false;
 	} else {
+		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
 		/*
 		 * We are in the middle of a connection,
 		 * its history is lost for us.
@@ -1107,7 +1106,6 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 		ct->proto.tcp.seen[0].td_maxend =
 			ct->proto.tcp.seen[0].td_end +
 			ct->proto.tcp.seen[0].td_maxwin;
-		ct->proto.tcp.seen[0].td_scale = 0;
 
 		/* We assume SACK and liberal window checking to handle
 		 * window scaling */
@@ -1116,13 +1114,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 					      IP_CT_TCP_FLAG_BE_LIBERAL;
 	}
 
-	ct->proto.tcp.seen[1].td_end = 0;
-	ct->proto.tcp.seen[1].td_maxend = 0;
-	ct->proto.tcp.seen[1].td_maxwin = 0;
-	ct->proto.tcp.seen[1].td_scale = 0;
-
 	/* tcp_packet will set them */
-	ct->proto.tcp.state = TCP_CONNTRACK_NONE;
 	ct->proto.tcp.last_index = TCP_NONE_SET;
 
 	pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 00/79] netfilter: netfilter update
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev

following is a first batch of netfilter patches for 2.6.39, containing:

- cleanups and removal of atomic operations from Changli Gao

- RCU annotations, removal of atomic operations and cleanups from Eric

- a new queue bypass option for nfnetlink_queue from Florian Westphal

- arp_tables support for xt_CLASSIFY from Frederic Leroy

- a new backup protocol for IPVS from and Schillstrom

- IPVS netns support from Hans Schillstrom

- type and header cleanups from Jan

- missing module aliases from Jan

- a new SNMP conntrack helper from Jiri Olsa

- optional timestamping for conntrack entries to simplify userspace IPFIX,
  implementations, from Pablo

- a new AUDIT target for logging packets through the audit framework,
  from Thomas

- Improvements for the IPVS persistence engine from Simon

ipset from Jozsef will follow soon, once I'm done reviewing it.

Please pull from

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git master

Thanks!

^ permalink raw reply

* [PATCH 04/79] netfilter: ct_extend: fix the wrong alloc_size
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

In function update_alloc_size(), sizeof(struct nf_ct_ext) is added twice
wrongly.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_extend.c |    5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index bd82450..920f924 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -144,9 +144,8 @@ static void update_alloc_size(struct nf_ct_ext_type *type)
 		if (!t1)
 			continue;
 
-		t1->alloc_size = sizeof(struct nf_ct_ext)
-				 + ALIGN(sizeof(struct nf_ct_ext), t1->align)
-				 + t1->len;
+		t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
+				 t1->len;
 		for (j = 0; j < NF_CT_EXT_NUM; j++) {
 			t2 = nf_ct_ext_types[j];
 			if (t2 == NULL || t2 == t1 ||
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 09/79] netfilter: xt_CLASSIFY: add ARP support, allow CLASSIFY target on any table
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=UTF-8, Size: 2543 bytes --]

From: Frédéric Leroy <fredo@starox.org>

Signed-off-by: Frédéric Leroy <fredo@starox.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_CLASSIFY.c |   36 ++++++++++++++++++++++++------------
 1 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index c2c0e4a..af9c4da 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -19,12 +19,14 @@
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_CLASSIFY.h>
+#include <linux/netfilter_arp.h>
 
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Xtables: Qdisc classification");
 MODULE_ALIAS("ipt_CLASSIFY");
 MODULE_ALIAS("ip6t_CLASSIFY");
+MODULE_ALIAS("arpt_CLASSIFY");
 
 static unsigned int
 classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
@@ -35,26 +37,36 @@ classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static struct xt_target classify_tg_reg __read_mostly = {
-	.name       = "CLASSIFY",
-	.revision   = 0,
-	.family     = NFPROTO_UNSPEC,
-	.table      = "mangle",
-	.hooks      = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
-		      (1 << NF_INET_POST_ROUTING),
-	.target     = classify_tg,
-	.targetsize = sizeof(struct xt_classify_target_info),
-	.me         = THIS_MODULE,
+static struct xt_target classify_tg_reg[] __read_mostly = {
+	{
+		.name       = "CLASSIFY",
+		.revision   = 0,
+		.family     = NFPROTO_UNSPEC,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+		              (1 << NF_INET_POST_ROUTING),
+		.target     = classify_tg,
+		.targetsize = sizeof(struct xt_classify_target_info),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "CLASSIFY",
+		.revision   = 0,
+		.family     = NFPROTO_ARP,
+		.hooks      = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD),
+		.target     = classify_tg,
+		.targetsize = sizeof(struct xt_classify_target_info),
+		.me         = THIS_MODULE,
+	},
 };
 
 static int __init classify_tg_init(void)
 {
-	return xt_register_target(&classify_tg_reg);
+	return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
 }
 
 static void __exit classify_tg_exit(void)
 {
-	xt_unregister_target(&classify_tg_reg);
+	xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
 }
 
 module_init(classify_tg_init);
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 11/79] netfilter: nf_ct_frag6_sysctl_table is static
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3a3f129..eb9f1c0 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -73,7 +73,7 @@ static struct inet_frags nf_frags;
 static struct netns_frags nf_init_frags;
 
 #ifdef CONFIG_SYSCTL
-struct ctl_table nf_ct_frag6_sysctl_table[] = {
+static struct ctl_table nf_ct_frag6_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_frag6_timeout",
 		.data		= &nf_init_frags.timeout,
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 12/79] netfilter: add __rcu annotations
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

Use helpers to reduce number of sparse warnings
(CONFIG_SPARSE_RCU_POINTER=y)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 .../netfilter/nf_conntrack_l3proto_ipv4_compat.c   |   17 +++++++++++------
 net/ipv4/netfilter/nf_nat_core.c                   |    5 ++++-
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 37f8adb..ab9c05c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_acct.h>
+#include <linux/rculist_nulls.h>
 
 struct ct_iter_state {
 	struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 	for (st->bucket = 0;
 	     st->bucket < net->ct.htable_size;
 	     st->bucket++) {
-		n = rcu_dereference(net->ct.hash[st->bucket].first);
+		n = rcu_dereference(
+			hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
 		if (!is_a_nulls(n))
 			return n;
 	}
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
 	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 
-	head = rcu_dereference(head->next);
+	head = rcu_dereference(hlist_nulls_next_rcu(head));
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
 			if (++st->bucket >= net->ct.htable_size)
 				return NULL;
 		}
-		head = rcu_dereference(net->ct.hash[st->bucket].first);
+		head = rcu_dereference(
+			hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
 	}
 	return head;
 }
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
 	struct hlist_node *n;
 
 	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
-		n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+		n = rcu_dereference(
+			hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
 		if (n)
 			return n;
 	}
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
 	struct net *net = seq_file_net(seq);
 	struct ct_expect_iter_state *st = seq->private;
 
-	head = rcu_dereference(head->next);
+	head = rcu_dereference(hlist_next_rcu(head));
 	while (head == NULL) {
 		if (++st->bucket >= nf_ct_expect_hsize)
 			return NULL;
-		head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+		head = rcu_dereference(
+			hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
 	}
 	return head;
 }
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index ab877ac..eb55835 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -502,7 +502,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
 	int ret = 0;
 
 	spin_lock_bh(&nf_nat_lock);
-	if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
+	if (rcu_dereference_protected(
+			nf_nat_protos[proto->protonum],
+			lockdep_is_held(&nf_nat_lock)
+			) != &nf_nat_unknown_protocol) {
 		ret = -EBUSY;
 		goto out;
 	}
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 06/79] netfilter: nf_nat: don't use atomic bit operation
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

As we own the conntrack and the others can't see it until we confirm it,
we don't need to use atomic bit operation on ct->status.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netfilter/nf_nat_core.h |    4 ++--
 net/ipv4/netfilter/nf_nat_core.c    |    4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h
index 33602ab..5aec85c 100644
--- a/include/net/netfilter/nf_nat_core.h
+++ b/include/net/netfilter/nf_nat_core.h
@@ -21,9 +21,9 @@ static inline int nf_nat_initialized(struct nf_conn *ct,
 				     enum nf_nat_manip_type manip)
 {
 	if (manip == IP_NAT_MANIP_SRC)
-		return test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
+		return ct->status & IPS_SRC_NAT_DONE_BIT;
 	else
-		return test_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
+		return ct->status & IPS_DST_NAT_DONE_BIT;
 }
 
 struct nlattr;
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787c..ab877ac 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -323,9 +323,9 @@ nf_nat_setup_info(struct nf_conn *ct,
 
 	/* It's done. */
 	if (maniptype == IP_NAT_MANIP_DST)
-		set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
+		ct->status |= IPS_DST_NAT_DONE_BIT;
 	else
-		set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
+		ct->status |= IPS_SRC_NAT_DONE_BIT;
 
 	return NF_ACCEPT;
 }
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 08/79] netfilter: nf_nat: define nat_pptp_info as needed
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netfilter/nf_nat.h |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index e966092..aff80b1 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -56,7 +56,9 @@ struct nf_nat_multi_range_compat {
 /* per conntrack: nat application helper private data */
 union nf_conntrack_nat_help {
 	/* insert nat helper private data here */
+#if defined(CONFIG_NF_NAT_PPTP) || defined(CONFIG_NF_NAT_PPTP_MODULE)
 	struct nf_nat_pptp nat_pptp_info;
+#endif
 };
 
 struct nf_conn;
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 13/79] netfilter: nf_nat_amanda: rename a variable
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

Avoid a sparse warning about 'ret' variable shadowing

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_amanda.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f..703f366f 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
 
 	/* Try to get same port: if not, try to change it. */
 	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
-		int ret;
+		int res;
 
 		exp->tuple.dst.u.tcp.port = htons(port);
-		ret = nf_ct_expect_related(exp);
-		if (ret == 0)
+		res = nf_ct_expect_related(exp);
+		if (res == 0)
 			break;
-		else if (ret != -EBUSY) {
+		else if (res != -EBUSY) {
 			port = 0;
 			break;
 		}
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 07/79] netfilter: ct_extend: define NF_CT_EXT_* as needed
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

Less IDs make nf_ct_ext smaller.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netfilter/nf_conntrack_ecache.h |    8 ++++++++
 include/net/netfilter/nf_conntrack_extend.h |    6 ++++++
 include/net/netfilter/nf_nat.h              |    4 ++++
 3 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 96ba5f7..f596b60 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -23,12 +23,17 @@ struct nf_conntrack_ecache {
 static inline struct nf_conntrack_ecache *
 nf_ct_ecache_find(const struct nf_conn *ct)
 {
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
 	return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE);
+#else
+	return NULL;
+#endif
 }
 
 static inline struct nf_conntrack_ecache *
 nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
 {
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
 	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_ecache *e;
 
@@ -45,6 +50,9 @@ nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
 		e->expmask = expmask;
 	}
 	return e;
+#else
+	return NULL;
+#endif
 };
 
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 0772d29..1a9f96d 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -7,10 +7,16 @@
 
 enum nf_ct_ext_id {
 	NF_CT_EXT_HELPER,
+#if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE)
 	NF_CT_EXT_NAT,
+#endif
 	NF_CT_EXT_ACCT,
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
 	NF_CT_EXT_ECACHE,
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
 	NF_CT_EXT_ZONE,
+#endif
 	NF_CT_EXT_NUM,
 };
 
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index f5f09f0..e966092 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -84,7 +84,11 @@ extern int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
 
 static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct)
 {
+#if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE)
 	return nf_ct_ext_find(ct, NF_CT_EXT_NAT);
+#else
+	return NULL;
+#endif
 }
 
 #else  /* !__KERNEL__: iptables wants this to compile. */
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 19/79] IPVS: buffer argument to ip_vs_process_message() should not be const
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Simon Horman <horms@verge.net.au>

It is assigned to a non-const variable and its contents are modified.

Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sync.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 72b3d88..3897d6b 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -303,7 +303,7 @@ ip_vs_conn_fill_param_sync(int af, int protocol,
  *      Process received multicast message and create the corresponding
  *      ip_vs_conn entries.
  */
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
+static void ip_vs_process_message(char *buffer, const size_t buflen)
 {
 	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
 	struct ip_vs_sync_conn *s;
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 18/79] IPVS: Remove useless { } block from ip_vs_process_message()
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Simon Horman <horms@verge.net.au>

Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sync.c |   24 +++++++++++-------------
 1 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index a4dccbc..72b3d88 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -381,20 +381,18 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 			}
 		}
 
-		{
-			if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
-					      (union nf_inet_addr *)&s->caddr,
-					      s->cport,
-					      (union nf_inet_addr *)&s->vaddr,
-					      s->vport, &param)) {
-				pr_err("ip_vs_conn_fill_param_sync failed");
-				return;
-			}
-			if (!(flags & IP_VS_CONN_F_TEMPLATE))
-				cp = ip_vs_conn_in_get(&param);
-			else
-				cp = ip_vs_ct_in_get(&param);
+		if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
+					       (union nf_inet_addr *)&s->caddr,
+					       s->cport,
+					       (union nf_inet_addr *)&s->vaddr,
+					       s->vport, &param)) {
+			pr_err("ip_vs_conn_fill_param_sync failed");
+			return;
 		}
+		if (!(flags & IP_VS_CONN_F_TEMPLATE))
+			cp = ip_vs_conn_in_get(&param);
+		else
+			cp = ip_vs_ct_in_get(&param);
 		if (!cp) {
 			/*
 			 * Find the appropriate destination for the connection.
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 24/79] IPVS: Backup, Prepare for transferring firewall marks (fwmark) to the backup daemon.
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

One struct will have fwmark added:
 * ip_vs_conn

ip_vs_conn_new() and ip_vs_find_dest()
will have an extra param - fwmark
The effects of that, is in this patch.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |    6 ++++--
 net/netfilter/ipvs/ip_vs_conn.c |    5 +++--
 net/netfilter/ipvs/ip_vs_core.c |    8 ++++----
 net/netfilter/ipvs/ip_vs_ctl.c  |    4 ++--
 net/netfilter/ipvs/ip_vs_ftp.c  |    5 +++--
 net/netfilter/ipvs/ip_vs_sync.c |    4 ++--
 6 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index d5a32e4..890f01c 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -382,6 +382,7 @@ struct ip_vs_conn {
 	union nf_inet_addr       vaddr;          /* virtual address */
 	union nf_inet_addr       daddr;          /* destination address */
 	volatile __u32           flags;          /* status flags */
+	__u32                    fwmark;         /* Fire wall mark from skb */
 	__be16                   cport;
 	__be16                   vport;
 	__be16                   dport;
@@ -720,7 +721,7 @@ extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
 struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p,
 				  const union nf_inet_addr *daddr,
 				  __be16 dport, unsigned flags,
-				  struct ip_vs_dest *dest);
+				  struct ip_vs_dest *dest, __u32 fwmark);
 extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
 
 extern const char * ip_vs_state_name(__u16 proto, int state);
@@ -901,7 +902,8 @@ extern int ip_vs_control_init(void);
 extern void ip_vs_control_cleanup(void);
 extern struct ip_vs_dest *
 ip_vs_find_dest(int af, const union nf_inet_addr *daddr, __be16 dport,
-		const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol);
+		const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol,
+		__u32 fwmark);
 extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 
 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 7615f9e..66e4662 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -613,7 +613,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	if ((cp) && (!cp->dest)) {
 		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
 				       &cp->vaddr, cp->vport,
-				       cp->protocol);
+				       cp->protocol, cp->fwmark);
 		ip_vs_bind_dest(cp, dest);
 		return dest;
 	} else
@@ -803,7 +803,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 struct ip_vs_conn *
 ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
-	       struct ip_vs_dest *dest)
+	       struct ip_vs_dest *dest, __u32 fwmark)
 {
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
@@ -827,6 +827,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 			&cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
+	cp->fwmark         = fwmark;
 	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
 		ip_vs_pe_get(p->pe);
 		cp->pe = p->pe;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b4e51e9..e2bb3cd 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -293,7 +293,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		 * and thus param.pe_data will be destroyed
 		 * when the template expires */
 		ct = ip_vs_conn_new(&param, &dest->addr, dport,
-				    IP_VS_CONN_F_TEMPLATE, dest);
+				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
 		if (ct == NULL) {
 			kfree(param.pe_data);
 			return NULL;
@@ -319,7 +319,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	 */
 	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
 			      &iph.daddr, ports[1], &param);
-	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
+	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
 		return NULL;
@@ -423,7 +423,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 				      pptr[0], &iph.daddr, pptr[1], &p);
 		cp = ip_vs_conn_new(&p, &dest->addr,
 				    dest->port ? dest->port : pptr[1],
-				    flags, dest);
+				    flags, dest, skb->mark);
 		if (!cp)
 			return NULL;
 	}
@@ -489,7 +489,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 					      &iph.daddr, pptr[1], &p);
 			cp = ip_vs_conn_new(&p, &daddr, 0,
 					    IP_VS_CONN_F_BYPASS | flags,
-					    NULL);
+					    NULL, skb->mark);
 			if (!cp)
 				return NF_DROP;
 		}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 3e92558..a5bd002 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -657,12 +657,12 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
 				   __be16 dport,
 				   const union nf_inet_addr *vaddr,
-				   __be16 vport, __u16 protocol)
+				   __be16 vport, __u16 protocol, __u32 fwmark)
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_service *svc;
 
-	svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
+	svc = ip_vs_service_get(af, fwmark, protocol, vaddr, vport);
 	if (!svc)
 		return NULL;
 	dest = ip_vs_lookup_dest(svc, daddr, dport);
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 7545500..84aef65 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -208,7 +208,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 			n_cp = ip_vs_conn_new(&p, &from, port,
 					      IP_VS_CONN_F_NO_CPORT |
 					      IP_VS_CONN_F_NFCT,
-					      cp->dest);
+					      cp->dest, skb->mark);
 			if (!n_cp)
 				return 0;
 
@@ -365,7 +365,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		if (!n_cp) {
 			n_cp = ip_vs_conn_new(&p, &cp->daddr,
 					      htons(ntohs(cp->dport)-1),
-					      IP_VS_CONN_F_NFCT, cp->dest);
+					      IP_VS_CONN_F_NFCT, cp->dest,
+					      skb->mark);
 			if (!n_cp)
 				return 0;
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3897d6b..47eed67 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -404,7 +404,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 					       s->dport,
 					       (union nf_inet_addr *)&s->vaddr,
 					       s->vport,
-					       s->protocol);
+					       s->protocol, 0);
 			/*  Set the approprite ativity flag */
 			if (s->protocol == IPPROTO_TCP) {
 				if (state != IP_VS_TCP_S_ESTABLISHED)
@@ -419,7 +419,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 			}
 			cp = ip_vs_conn_new(&param,
 					    (union nf_inet_addr *)&s->daddr,
-					    s->dport, flags, dest);
+					    s->dport, flags, dest, 0);
 			if (dest)
 				atomic_dec(&dest->refcnt);
 			if (!cp) {
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 23/79] netfilter: nf_conntrack: one less atomic op in nf_ct_expect_insert()
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

Instead of doing atomic_inc(&exp->use) twice,
call atomic_add(2, &exp->use);

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_expect.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index bbb2140..774f32b 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -323,7 +323,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	const struct nf_conntrack_expect_policy *p;
 	unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
 
-	atomic_inc(&exp->use);
+	/* two references : one for hash insert, one for the timer */
+	atomic_add(2, &exp->use);
 
 	if (master_help) {
 		hlist_add_head(&exp->lnode, &master_help->expectations);
@@ -345,7 +346,6 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	}
 	add_timer(&exp->timeout);
 
-	atomic_inc(&exp->use);
 	NF_CT_STAT_INC(net, expect_create);
 }
 
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 21/79] ipvs: remove shadow rt variable
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

Remove a sparse warning about rt variable.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 10bd39c..50b131c 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -188,7 +188,6 @@ __ip_vs_reroute_locally(struct sk_buff *skb)
 			},
 			.mark = skb->mark,
 		};
-		struct rtable *rt;
 
 		if (ip_route_output_key(net, &rt, &fl))
 			return 0;
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 22/79] ipvs: allow transmit of GRO aggregated skbs
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Simon Horman <horms@verge.net.au>

Attempt at allowing LVS to transmit skbs of greater than MTU length that
have been aggregated by GRO and can thus be deaggregated by GSO.

Cc: Julian Anastasov <ja@ssi.bg>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c |   25 +++++++++++++++----------
 1 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 50b131c..fb2a445 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -407,7 +407,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
-	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+	    !skb_is_gso(skb)) {
 		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -460,7 +461,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
-	if (skb->len > mtu) {
+	if (skb->len > mtu && !skb_is_gso(skb)) {
 		if (!skb->dev) {
 			struct net *net = dev_net(skb_dst(skb)->dev);
 
@@ -560,7 +561,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
-	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+	    !skb_is_gso(skb)) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 		IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
 				 "ip_vs_nat_xmit(): frag needed for");
@@ -675,7 +677,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
-	if (skb->len > mtu) {
+	if (skb->len > mtu && !skb_is_gso(skb)) {
 		if (!skb->dev) {
 			struct net *net = dev_net(skb_dst(skb)->dev);
 
@@ -790,8 +792,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	df |= (old_iph->frag_off & htons(IP_DF));
 
-	if ((old_iph->frag_off & htons(IP_DF))
-	    && mtu < ntohs(old_iph->tot_len)) {
+	if ((old_iph->frag_off & htons(IP_DF) &&
+	    mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error_put;
@@ -903,7 +905,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_dst(skb))
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
-	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
+	    !skb_is_gso(skb)) {
 		if (!skb->dev) {
 			struct net *net = dev_net(skb_dst(skb)->dev);
 
@@ -1008,7 +1011,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
-	if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
+	if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
+	    !skb_is_gso(skb)) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 		ip_rt_put(rt);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -1175,7 +1179,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
-	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
+	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
+	    !skb_is_gso(skb)) {
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error_put;
@@ -1289,7 +1294,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
-	if (skb->len > mtu) {
+	if (skb->len > mtu && !skb_is_gso(skb)) {
 		if (!skb->dev) {
 			struct net *net = dev_net(skb_dst(skb)->dev);
 
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 27/79] IPVS: Handle Scheduling errors.
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

If ip_vs_conn_fill_param_persist return an error to ip_vs_sched_persist,
this error must propagate as ignored=-1 to ip_vs_schedule().
Errors from ip_vs_conn_new() in ip_vs_sched_persist() and ip_vs_schedule()
should also return *ignored=-1;

This patch just relies on the fact that ignored is 1 before calling
ip_vs_sched_persist().

Sent from Julian:
  "The new case when ip_vs_conn_fill_param_persist fails
   should set *ignored = -1, so that we can use NF_DROP,
   see below. *ignored = -1 should be also used for ip_vs_conn_new
   failure in ip_vs_sched_persist() and ip_vs_schedule().
   The new negative value should be handled in tcp,udp,sctp"

"To summarize:

- *ignored = 1:
      protocol tried to schedule (eg. on SYN), found svc but the
      svc/scheduler decides that this packet should be accepted with
      NF_ACCEPT because it must not be scheduled.

- *ignored = 0:
      scheduler can not find destination, so try bypass or
      return ICMP and then NF_DROP (ip_vs_leave).

- *ignored = -1:
      scheduler tried to schedule but fatal error occurred, eg.
      ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
      failure such as missing Call-ID, ENOMEM on skb_linearize
      or pe_data. In this case we should return NF_DROP without
      any attempts to send ICMP with ip_vs_leave."

More or less all ideas and input to this patch is work from
Julian Anastasov

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c       |   56 +++++++++++++++++++++++---------
 net/netfilter/ipvs/ip_vs_proto_sctp.c |   11 +++++--
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |   10 +++++-
 net/netfilter/ipvs/ip_vs_proto_udp.c  |   10 +++++-
 4 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 9acdd79..3445da6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -177,7 +177,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 	return pp->state_transition(cp, direction, skb, pp);
 }
 
-static inline void
+static inline int
 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 			      struct sk_buff *skb, int protocol,
 			      const union nf_inet_addr *caddr, __be16 cport,
@@ -187,7 +187,9 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
 	p->pe = svc->pe;
 	if (p->pe && p->pe->fill_param)
-		p->pe->fill_param(p, skb);
+		return p->pe->fill_param(p, skb);
+
+	return 0;
 }
 
 /*
@@ -200,7 +202,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
 		    struct sk_buff *skb,
-		    __be16 src_port, __be16 dst_port)
+		    __be16 src_port, __be16 dst_port, int *ignored)
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
@@ -268,20 +270,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				vaddr = &fwmark;
 			}
 		}
-		ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
-					      vaddr, vport, &param);
+		/* return *ignored = -1 so NF_DROP can be used */
+		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+						  vaddr, vport, &param) < 0) {
+			*ignored = -1;
+			return NULL;
+		}
 	}
 
 	/* Check if a template already exists */
 	ct = ip_vs_ct_in_get(&param);
 	if (!ct || !ip_vs_check_template(ct)) {
-		/* No template found or the dest of the connection
+		/*
+		 * No template found or the dest of the connection
 		 * template is not available.
+		 * return *ignored=0 i.e. ICMP and NF_DROP
 		 */
 		dest = svc->scheduler->schedule(svc, skb);
 		if (!dest) {
 			IP_VS_DBG(1, "p-schedule: no dest found.\n");
 			kfree(param.pe_data);
+			*ignored = 0;
 			return NULL;
 		}
 
@@ -296,6 +305,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
 		if (ct == NULL) {
 			kfree(param.pe_data);
+			*ignored = -1;
 			return NULL;
 		}
 
@@ -323,6 +333,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
+		*ignored = -1;
 		return NULL;
 	}
 
@@ -342,6 +353,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  It selects a server according to the virtual service, and
  *  creates a connection entry.
  *  Protocols supported: TCP, UDP
+ *
+ *  Usage of *ignored
+ *
+ * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
+ *       svc/scheduler decides that this packet should be accepted with
+ *       NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 :   scheduler can not find destination, so try bypass or
+ *       return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 :  scheduler tried to schedule but fatal error occurred, eg.
+ *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ *       failure such as missing Call-ID, ENOMEM on skb_linearize
+ *       or pe_data. In this case we should return NF_DROP without
+ *       any attempts to send ICMP with ip_vs_leave.
  */
 struct ip_vs_conn *
 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
@@ -372,11 +398,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	}
 
 	/*
-	 * Do not schedule replies from local real server. It is risky
-	 * for fwmark services but mostly for persistent services.
+	 *    Do not schedule replies from local real server.
 	 */
 	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
-	    (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
 	    (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
 		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
 			      "Not scheduling reply for existing connection");
@@ -387,10 +411,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	/*
 	 *    Persistent service
 	 */
-	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
-		*ignored = 0;
-		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1]);
-	}
+	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
+
+	*ignored = 0;
 
 	/*
 	 *    Non-persistent service
@@ -403,8 +427,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 		return NULL;
 	}
 
-	*ignored = 0;
-
 	dest = svc->scheduler->schedule(svc, skb);
 	if (dest == NULL) {
 		IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -425,8 +447,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 		cp = ip_vs_conn_new(&p, &dest->addr,
 				    dest->port ? dest->port : pptr[1],
 				    flags, dest, skb->mark);
-		if (!cp)
+		if (!cp) {
+			*ignored = -1;
 			return NULL;
+		}
 	}
 
 	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1ea96bc..a315159 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -47,13 +47,18 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * incoming connection, and create a connection entry.
 		 */
 		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
-		if (!*cpp && !ignored) {
-			*verdict = ip_vs_leave(svc, skb, pp);
+		if (!*cpp && ignored <= 0) {
+			if (!ignored)
+				*verdict = ip_vs_leave(svc, skb, pp);
+			else {
+				ip_vs_service_put(svc);
+				*verdict = NF_DROP;
+			}
 			return 0;
 		}
 		ip_vs_service_put(svc);
 	}
-
+	/* NF_ACCEPT */
 	return 1;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index f6c5200..1cdab12 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -64,12 +64,18 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * incoming connection, and create a connection entry.
 		 */
 		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
-		if (!*cpp && !ignored) {
-			*verdict = ip_vs_leave(svc, skb, pp);
+		if (!*cpp && ignored <= 0) {
+			if (!ignored)
+				*verdict = ip_vs_leave(svc, skb, pp);
+			else {
+				ip_vs_service_put(svc);
+				*verdict = NF_DROP;
+			}
 			return 0;
 		}
 		ip_vs_service_put(svc);
 	}
+	/* NF_ACCEPT */
 	return 1;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 9d106a0..cd398de 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -63,12 +63,18 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * incoming connection, and create a connection entry.
 		 */
 		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
-		if (!*cpp && !ignored) {
-			*verdict = ip_vs_leave(svc, skb, pp);
+		if (!*cpp && ignored <= 0) {
+			if (!ignored)
+				*verdict = ip_vs_leave(svc, skb, pp);
+			else {
+				ip_vs_service_put(svc);
+				*verdict = NF_DROP;
+			}
 			return 0;
 		}
 		ip_vs_service_put(svc);
 	}
+	/* NF_ACCEPT */
 	return 1;
 }
 
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 29/79] IPVS: Backup, Adding Version 1 receive capability
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

Functionality improvements
 * flags  changed from 16 to 32 bits
 * fwmark added (32 bits)
 * timeout in sec. added (32 bits)
 * pe data added (Variable length)
 * IPv6 capabilities (3x16 bytes for addr.)
 * Version and type in every conn msg.

ip_vs_process_message() now handles Version 1 messages
and will call ip_vs_process_message_v0() for version 0 messages.

ip_vs_proc_conn() is common for both version, and handles the update of
connection hash.

ip_vs_conn_fill_param_sync()    - Version 1 messages only
ip_vs_conn_fill_param_sync_v0() - Version 0 messages only

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/linux/ip_vs.h           |    8 +
 include/net/ip_vs.h             |    1 +
 net/netfilter/ipvs/ip_vs_pe.c   |    5 +-
 net/netfilter/ipvs/ip_vs_sync.c |  549 ++++++++++++++++++++++++++++++---------
 4 files changed, 440 insertions(+), 123 deletions(-)

diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index 5f43a3b..4deb383 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -89,6 +89,14 @@
 #define IP_VS_CONN_F_TEMPLATE	0x1000		/* template, not connection */
 #define IP_VS_CONN_F_ONE_PACKET	0x2000		/* forward only one packet */
 
+#define IP_VS_CONN_F_BACKUP_MASK (IP_VS_CONN_F_FWD_MASK | \
+				  IP_VS_CONN_F_NOOUTPUT | \
+				  IP_VS_CONN_F_INACTIVE | \
+				  IP_VS_CONN_F_SEQ_MASK | \
+				  IP_VS_CONN_F_NO_CPORT | \
+				  IP_VS_CONN_F_TEMPLATE \
+				 )
+
 /* Flags that are not sent to backup server start from bit 16 */
 #define IP_VS_CONN_F_NFCT	(1 << 16)	/* use netfilter conntrack */
 
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 890f01c..4069484 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -817,6 +817,7 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc);
 int register_ip_vs_pe(struct ip_vs_pe *pe);
 int unregister_ip_vs_pe(struct ip_vs_pe *pe);
 struct ip_vs_pe *ip_vs_pe_getbyname(const char *name);
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name);
 
 static inline void ip_vs_pe_get(const struct ip_vs_pe *pe)
 {
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index e99f920..5cf859c 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -29,12 +29,11 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc)
 }
 
 /* Get pe in the pe list by name */
-static struct ip_vs_pe *
-__ip_vs_pe_getbyname(const char *pe_name)
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
 {
 	struct ip_vs_pe *pe;
 
-	IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+	IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
 		  pe_name);
 
 	spin_lock_bh(&ip_vs_pe_lock);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 566482f..e071508 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -35,6 +35,8 @@
 #include <linux/wait.h>
 #include <linux/kernel.h>
 
+#include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */
+
 #include <net/ip.h>
 #include <net/sock.h>
 
@@ -286,6 +288,16 @@ static struct sockaddr_in mcast_addr = {
 	.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
 };
 
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+	ho->init_seq       = get_unaligned_be32(&no->init_seq);
+	ho->delta          = get_unaligned_be32(&no->delta);
+	ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
 
 static inline struct ip_vs_sync_buff *sb_dequeue(void)
 {
@@ -418,59 +430,186 @@ void ip_vs_sync_conn(const struct ip_vs_conn *cp)
 		ip_vs_sync_conn(cp->control);
 }
 
+/*
+ *  fill_param used by version 1
+ */
 static inline int
-ip_vs_conn_fill_param_sync(int af, int protocol,
-			   const union nf_inet_addr *caddr, __be16 cport,
-			   const union nf_inet_addr *vaddr, __be16 vport,
-			   struct ip_vs_conn_param *p)
+ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc,
+			   struct ip_vs_conn_param *p,
+			   __u8 *pe_data, unsigned int pe_data_len,
+			   __u8 *pe_name, unsigned int pe_name_len)
 {
-	/* XXX: Need to take into account persistence engine */
-	ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_conn_fill_param(af, sc->v6.protocol,
+				      (const union nf_inet_addr *)&sc->v6.caddr,
+				      sc->v6.cport,
+				      (const union nf_inet_addr *)&sc->v6.vaddr,
+				      sc->v6.vport, p);
+	else
+#endif
+		ip_vs_conn_fill_param(af, sc->v4.protocol,
+				      (const union nf_inet_addr *)&sc->v4.caddr,
+				      sc->v4.cport,
+				      (const union nf_inet_addr *)&sc->v4.vaddr,
+				      sc->v4.vport, p);
+	/* Handle pe data */
+	if (pe_data_len) {
+		if (pe_name_len) {
+			char buff[IP_VS_PENAME_MAXLEN+1];
+
+			memcpy(buff, pe_name, pe_name_len);
+			buff[pe_name_len]=0;
+			p->pe = __ip_vs_pe_getbyname(buff);
+			if (!p->pe) {
+				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", buff);
+				return 1;
+			}
+		} else {
+			IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+			return 1;
+		}
+
+		p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC);
+		if (!p->pe_data) {
+			if (p->pe->module)
+				module_put(p->pe->module);
+			return -ENOMEM;
+		}
+		memcpy(p->pe_data, pe_data, pe_data_len);
+		p->pe_data_len = pe_data_len;
+	}
 	return 0;
 }
 
 /*
- *      Process received multicast message and create the corresponding
- *      ip_vs_conn entries.
+ *  Connection Add / Update.
+ *  Common for version 0 and 1 reception of backup sync_conns.
+ *  Param: ...
+ *         timeout is in sec.
+ */
+static void ip_vs_proc_conn(struct ip_vs_conn_param *param,  unsigned flags,
+			    unsigned state, unsigned protocol, unsigned type,
+			    const union nf_inet_addr *daddr, __be16 dport,
+			    unsigned long timeout, __u32 fwmark,
+			    struct ip_vs_sync_conn_options *opt,
+			    struct ip_vs_protocol *pp)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *cp;
+
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE))
+		cp = ip_vs_conn_in_get(param);
+	else
+		cp = ip_vs_ct_in_get(param);
+
+	if (cp && param->pe_data) 	/* Free pe_data */
+		kfree(param->pe_data);
+	if (!cp) {
+		/*
+		 * Find the appropriate destination for the connection.
+		 * If it is not found the connection will remain unbound
+		 * but still handled.
+		 */
+		dest = ip_vs_find_dest(type, daddr, dport, param->vaddr,
+				       param->vport, protocol, fwmark);
+
+		/*  Set the approprite ativity flag */
+		if (protocol == IPPROTO_TCP) {
+			if (state != IP_VS_TCP_S_ESTABLISHED)
+				flags |= IP_VS_CONN_F_INACTIVE;
+			else
+				flags &= ~IP_VS_CONN_F_INACTIVE;
+		} else if (protocol == IPPROTO_SCTP) {
+			if (state != IP_VS_SCTP_S_ESTABLISHED)
+				flags |= IP_VS_CONN_F_INACTIVE;
+			else
+				flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+		if (dest)
+			atomic_dec(&dest->refcnt);
+		if (!cp) {
+			if (param->pe_data)
+				kfree(param->pe_data);
+			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+			return;
+		}
+	} else if (!cp->dest) {
+		dest = ip_vs_try_bind_dest(cp);
+		if (dest)
+			atomic_dec(&dest->refcnt);
+	} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+		(cp->state != state)) {
+		/* update active/inactive flag for the connection */
+		dest = cp->dest;
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			(state != IP_VS_TCP_S_ESTABLISHED)) {
+			atomic_dec(&dest->activeconns);
+			atomic_inc(&dest->inactconns);
+			cp->flags |= IP_VS_CONN_F_INACTIVE;
+		} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			(state == IP_VS_TCP_S_ESTABLISHED)) {
+			atomic_inc(&dest->activeconns);
+			atomic_dec(&dest->inactconns);
+			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+	} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
+		(cp->state != state)) {
+		dest = cp->dest;
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+		(state != IP_VS_SCTP_S_ESTABLISHED)) {
+			atomic_dec(&dest->activeconns);
+			atomic_inc(&dest->inactconns);
+			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+	}
+
+	if (opt)
+		memcpy(&cp->in_seq, opt, sizeof(*opt));
+	atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+	cp->state = state;
+	cp->old_state = cp->state;
+	/*
+	 * For Ver 0 messages style
+	 *  - Not possible to recover the right timeout for templates
+	 *  - can not find the right fwmark
+	 *    virtual service. If needed, we can do it for
+	 *    non-fwmark persistent services.
+	 * Ver 1 messages style.
+	 *  - No problem.
+	 */
+	if (timeout) {
+		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+		cp->timeout = timeout*HZ;
+	} else if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
+		cp->timeout = pp->timeout_table[state];
+	else
+		cp->timeout = (3*60*HZ);
+	ip_vs_conn_put(cp);
+}
+
+/*
+ *  Process received multicast message for Version 0
  */
-static void ip_vs_process_message(char *buffer, const size_t buflen)
+static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 {
 	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
 	struct ip_vs_sync_conn_v0 *s;
 	struct ip_vs_sync_conn_options *opt;
-	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
-	struct ip_vs_dest *dest;
 	struct ip_vs_conn_param param;
 	char *p;
 	int i;
 
-	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
-		IP_VS_ERR_RL("sync message header too short\n");
-		return;
-	}
-
-	/* Convert size back to host byte order */
-	m->size = ntohs(m->size);
-
-	if (buflen != m->size) {
-		IP_VS_ERR_RL("bogus sync message size\n");
-		return;
-	}
-
-	/* SyncID sanity check */
-	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
-		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
-			  m->syncid);
-		return;
-	}
-
 	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
 	for (i=0; i<m->nr_conns; i++) {
 		unsigned flags, state;
 
 		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
-			IP_VS_ERR_RL("bogus conn in sync message\n");
+			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
 			return;
 		}
 		s = (struct ip_vs_sync_conn_v0 *) p;
@@ -480,7 +619,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 			opt = (struct ip_vs_sync_conn_options *)&s[1];
 			p += FULL_CONN_SIZE;
 			if (p > buffer+buflen) {
-				IP_VS_ERR_RL("bogus conn options in sync message\n");
+				IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
 				return;
 			}
 		} else {
@@ -492,12 +631,12 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
 			pp = ip_vs_proto_get(s->protocol);
 			if (!pp) {
-				IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
+				IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
 					s->protocol);
 				continue;
 			}
 			if (state >= pp->num_states) {
-				IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
+				IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
 					pp->name, state);
 				continue;
 			}
@@ -505,103 +644,273 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 			/* protocol in templates is not used for state/timeout */
 			pp = NULL;
 			if (state > 0) {
-				IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
+				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
 					state);
 				state = 0;
 			}
 		}
 
-		if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
-					       (union nf_inet_addr *)&s->caddr,
-					       s->cport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport, &param)) {
-			pr_err("ip_vs_conn_fill_param_sync failed");
-			return;
+		ip_vs_conn_fill_param(AF_INET, s->protocol,
+				      (const union nf_inet_addr *)&s->caddr,
+				      s->cport,
+				      (const union nf_inet_addr *)&s->vaddr,
+				      s->vport, &param);
+
+		/* Send timeout as Zero */
+		ip_vs_proc_conn(&param, flags, state, s->protocol, AF_INET,
+				(union nf_inet_addr *)&s->daddr, s->dport,
+				0, 0, opt, pp);
+	}
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+				    __u32 *opt_flags,
+				    struct ip_vs_sync_conn_options *opt)
+{
+	struct ip_vs_sync_conn_options *topt;
+
+	topt = (struct ip_vs_sync_conn_options *)p;
+
+	if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+		IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+		return -EINVAL;
+	}
+	if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+		IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+		return -EINVAL;
+	}
+	ntoh_seq(&topt->in_seq, &opt->in_seq);
+	ntoh_seq(&topt->out_seq, &opt->out_seq);
+	*opt_flags |= IPVS_OPT_F_SEQ_DATA;
+	return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+			  __u8 **data, unsigned int maxlen,
+			  __u32 *opt_flags, __u32 flag)
+{
+	if (plen > maxlen) {
+		IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+		return -EINVAL;
+	}
+	if (*opt_flags & flag) {
+		IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+		return -EINVAL;
+	}
+	*data_len = plen;
+	*data = p;
+	*opt_flags |= flag;
+	return 0;
+}
+/*
+ *   Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end)
+{
+	struct ip_vs_sync_conn_options opt;
+	union  ip_vs_sync_conn *s;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn_param param;
+	__u32 flags;
+	unsigned int af, state, pe_data_len=0, pe_name_len=0;
+	__u8 *pe_data=NULL, *pe_name=NULL;
+	__u32 opt_flags=0;
+	int retc=0;
+
+	s = (union ip_vs_sync_conn *) p;
+
+	if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+		af = AF_INET6;
+		p += sizeof(struct ip_vs_sync_v6);
+#else
+		IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+		retc = 10;
+		goto out;
+#endif
+	} else if (!s->v4.type) {
+		af = AF_INET;
+		p += sizeof(struct ip_vs_sync_v4);
+	} else {
+		return -10;
+	}
+	if (p > msg_end)
+		return -20;
+
+	/* Process optional params check Type & Len. */
+	while (p < msg_end) {
+		int ptype;
+		int plen;
+
+		if (p+2 > msg_end)
+			return -30;
+		ptype = *(p++);
+		plen  = *(p++);
+
+		if (!plen || ((p + plen) > msg_end))
+			return -40;
+		/* Handle seq option  p = param data */
+		switch (ptype & ~IPVS_OPT_F_PARAM) {
+		case IPVS_OPT_SEQ_DATA:
+			if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+				return -50;
+			break;
+
+		case IPVS_OPT_PE_DATA:
+			if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+					   IP_VS_PEDATA_MAXLEN, &opt_flags,
+					   IPVS_OPT_F_PE_DATA))
+				return -60;
+			break;
+
+		case IPVS_OPT_PE_NAME:
+			if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+					   IP_VS_PENAME_MAXLEN, &opt_flags,
+					   IPVS_OPT_F_PE_NAME))
+				return -70;
+			break;
+
+		default:
+			/* Param data mandatory ? */
+			if (!(ptype & IPVS_OPT_F_PARAM)) {
+				IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+					  ptype & ~IPVS_OPT_F_PARAM);
+				retc = 20;
+				goto out;
+			}
 		}
-		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(&param);
-		else
-			cp = ip_vs_ct_in_get(&param);
-		if (!cp) {
-			/*
-			 * Find the appropriate destination for the connection.
-			 * If it is not found the connection will remain unbound
-			 * but still handled.
-			 */
-			dest = ip_vs_find_dest(AF_INET,
-					       (union nf_inet_addr *)&s->daddr,
-					       s->dport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport,
-					       s->protocol, 0);
-			/*  Set the approprite ativity flag */
-			if (s->protocol == IPPROTO_TCP) {
-				if (state != IP_VS_TCP_S_ESTABLISHED)
-					flags |= IP_VS_CONN_F_INACTIVE;
-				else
-					flags &= ~IP_VS_CONN_F_INACTIVE;
-			} else if (s->protocol == IPPROTO_SCTP) {
-				if (state != IP_VS_SCTP_S_ESTABLISHED)
-					flags |= IP_VS_CONN_F_INACTIVE;
-				else
-					flags &= ~IP_VS_CONN_F_INACTIVE;
+		p += plen;  /* Next option */
+	}
+
+	/* Get flags and Mask off unsupported */
+	flags  = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+	flags |= IP_VS_CONN_F_SYNC;
+	state = ntohs(s->v4.state);
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+		pp = ip_vs_proto_get(s->v4.protocol);
+		if (!pp) {
+			IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+				s->v4.protocol);
+			retc = 30;
+			goto out;
+		}
+		if (state >= pp->num_states) {
+			IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+				pp->name, state);
+			retc = 40;
+			goto out;
+		}
+	} else {
+		/* protocol in templates is not used for state/timeout */
+		pp = NULL;
+		if (state > 0) {
+			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+				state);
+			state = 0;
+		}
+	}
+	if (ip_vs_conn_fill_param_sync(af, s, &param,
+					pe_data, pe_data_len,
+					pe_name, pe_name_len)) {
+		retc = 50;
+		goto out;
+	}
+	/* If only IPv4, just silent skip IPv6 */
+	if (af == AF_INET)
+		ip_vs_proc_conn(&param, flags, state, s->v4.protocol, af,
+				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+				ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL),
+				pp);
+#ifdef CONFIG_IP_VS_IPV6
+	else
+		ip_vs_proc_conn(&param, flags, state, s->v6.protocol, af,
+				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+				ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL),
+				pp);
+#endif
+	return 0;
+	/* Error exit */
+out:
+	IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+	return retc;
+
+}
+/*
+ *      Process received multicast message and create the corresponding
+ *      ip_vs_conn entries.
+ *      Handles Version 0 & 1
+ */
+static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
+{
+	struct ip_vs_sync_mesg_v2 *m2 = (struct ip_vs_sync_mesg_v2 *)buffer;
+	__u8 *p, *msg_end;
+	unsigned int i, nr_conns;
+
+	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
+		IP_VS_DBG(2, "BACKUP, message header too short\n");
+		return;
+	}
+	/* Convert size back to host byte order */
+	m2->size = ntohs(m2->size);
+
+	if (buflen != m2->size) {
+		IP_VS_DBG(2, "BACKUP, bogus message size\n");
+		return;
+	}
+	/* SyncID sanity check */
+	if (ip_vs_backup_syncid != 0 && m2->syncid != ip_vs_backup_syncid) {
+		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+		return;
+	}
+	/* Handle version 1  message */
+	if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+	    && (m2->spare == 0)) {
+
+		msg_end = buffer + sizeof(struct ip_vs_sync_mesg_v2);
+		nr_conns = m2->nr_conns;
+
+		for (i=0; i<nr_conns; i++) {
+			union ip_vs_sync_conn *s;
+			unsigned size;
+			int retc;
+
+			p = msg_end;
+			if (p + sizeof(s->v4) > buffer+buflen) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+				return;
 			}
-			cp = ip_vs_conn_new(&param,
-					    (union nf_inet_addr *)&s->daddr,
-					    s->dport, flags, dest, 0);
-			if (dest)
-				atomic_dec(&dest->refcnt);
-			if (!cp) {
-				pr_err("ip_vs_conn_new failed\n");
+			s = (union ip_vs_sync_conn *)p;
+			size = ntohs(s->v4.ver_size) & SVER_MASK;
+			msg_end = p + size;
+			/* Basic sanity checks */
+			if (msg_end  > buffer+buflen) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
 				return;
 			}
-		} else if (!cp->dest) {
-			dest = ip_vs_try_bind_dest(cp);
-			if (dest)
-				atomic_dec(&dest->refcnt);
-		} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
-			   (cp->state != state)) {
-			/* update active/inactive flag for the connection */
-			dest = cp->dest;
-			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-				(state != IP_VS_TCP_S_ESTABLISHED)) {
-				atomic_dec(&dest->activeconns);
-				atomic_inc(&dest->inactconns);
-				cp->flags |= IP_VS_CONN_F_INACTIVE;
-			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
-				(state == IP_VS_TCP_S_ESTABLISHED)) {
-				atomic_inc(&dest->activeconns);
-				atomic_dec(&dest->inactconns);
-				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+					      ntohs(s->v4.ver_size) >> SVER_SHIFT);
+				return;
 			}
-		} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
-			   (cp->state != state)) {
-			dest = cp->dest;
-			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-			     (state != IP_VS_SCTP_S_ESTABLISHED)) {
-			    atomic_dec(&dest->activeconns);
-			    atomic_inc(&dest->inactconns);
-			    cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			/* Process a single sync_conn */
+			if ((retc=ip_vs_proc_sync_conn(p, msg_end)) < 0) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+					     retc);
+				return;
 			}
+			/* Make sure we have 32 bit alignment */
+			msg_end = p + ((size + 3) & ~3);
 		}
-
-		if (opt)
-			memcpy(&cp->in_seq, opt, sizeof(*opt));
-		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
-		cp->state = state;
-		cp->old_state = cp->state;
-		/*
-		 * We can not recover the right timeout for templates
-		 * in all cases, we can not find the right fwmark
-		 * virtual service. If needed, we can do it for
-		 * non-fwmark persistent services.
-		 */
-		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
-			cp->timeout = pp->timeout_table[state];
-		else
-			cp->timeout = (3*60*HZ);
-		ip_vs_conn_put(cp);
+	} else {
+		/* Old type of message */
+		ip_vs_process_message_v0(buffer, buflen);
+		return;
 	}
 }
 
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 30/79] IPVS: Backup, Change sending to Version 1 format
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

Enable sending and removal of version 0 sending
Affected functions,

ip_vs_sync_buff_create()
ip_vs_sync_conn()

ip_vs_core.c removal of IPv4 check.

*v5
 Just check cp->pe_data_len in ip_vs_sync_conn
 Check if padding needed before adding a new sync_conn
 to the buffer, i.e. avoid sending padding at the end.

*v4
 moved sanity check and pe_name_len after sloop.
 use cp->pe instead of cp->dest->svc->pe
 real length in each sync_conn, not padded length
 however total size of a sync_msg includes padding.

*v3
 Sending ip_vs_sync_conn_options in network order.
 Sending Templates for ONE_PACKET conn.
 Renaming of ip_vs_sync_mesg to ip_vs_sync_mesg_v0

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |    2 +-
 net/netfilter/ipvs/ip_vs_core.c |   13 ++-
 net/netfilter/ipvs/ip_vs_sync.c |  189 ++++++++++++++++++++++++++++++---------
 3 files changed, 156 insertions(+), 48 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4069484..a715f3d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -919,7 +919,7 @@ extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
 extern int stop_sync_thread(int state);
-extern void ip_vs_sync_conn(const struct ip_vs_conn *cp);
+extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
 
 
 /*
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 3445da6..5287771 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1560,9 +1560,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	 *
 	 * Sync connection if it is about to close to
 	 * encorage the standby servers to update the connections timeout
+	 *
+	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
 	 */
-	pkts = atomic_add_return(1, &cp->in_pkts);
-	if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		pkts = sysctl_ip_vs_sync_threshold[0];
+	else
+		pkts = atomic_add_return(1, &cp->in_pkts);
+
+	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    cp->protocol == IPPROTO_SCTP) {
 		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
 			(pkts % sysctl_ip_vs_sync_threshold[1]
@@ -1577,8 +1583,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 
 	/* Keep this block last: TCP and others with pp->num_states <= 1 */
-	else if (af == AF_INET &&
-	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	else if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
 	      (pkts % sysctl_ip_vs_sync_threshold[1]
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index e071508..df5abf0 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -226,7 +226,7 @@ struct ip_vs_sync_thread_data {
 #define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
 
 /* Version 0 header */
-struct ip_vs_sync_mesg {
+struct ip_vs_sync_mesg_v0 {
 	__u8                    nr_conns;
 	__u8                    syncid;
 	__u16                   size;
@@ -235,7 +235,7 @@ struct ip_vs_sync_mesg {
 };
 
 /* Version 1 header */
-struct ip_vs_sync_mesg_v2 {
+struct ip_vs_sync_mesg {
 	__u8			reserved;	/* must be zero */
 	__u8			syncid;
 	__u16			size;
@@ -299,6 +299,17 @@ static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
 	ho->previous_delta = get_unaligned_be32(&no->previous_delta);
 }
 
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+	put_unaligned_be32(ho->init_seq, &no->init_seq);
+	put_unaligned_be32(ho->delta, &no->delta);
+	put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
+
 static inline struct ip_vs_sync_buff *sb_dequeue(void)
 {
 	struct ip_vs_sync_buff *sb;
@@ -317,6 +328,9 @@ static inline struct ip_vs_sync_buff *sb_dequeue(void)
 	return sb;
 }
 
+/*
+ * Create a new sync buffer for Version 1 proto.
+ */
 static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
 {
 	struct ip_vs_sync_buff *sb;
@@ -328,11 +342,15 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
 		kfree(sb);
 		return NULL;
 	}
-	sb->mesg->nr_conns = 0;
+	sb->mesg->reserved = 0;  /* old nr_conns i.e. must be zeo now */
+	sb->mesg->version = SYNC_PROTO_VER;
 	sb->mesg->syncid = ip_vs_master_syncid;
-	sb->mesg->size = 4;
-	sb->head = (unsigned char *)sb->mesg + 4;
+	sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
+	sb->mesg->nr_conns = 0;
+	sb->mesg->spare = 0;
+	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
 	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+
 	sb->firstuse = jiffies;
 	return sb;
 }
@@ -373,18 +391,60 @@ get_curr_sync_buff(unsigned long time)
 	return sb;
 }
 
-
 /*
  *      Add an ip_vs_conn information into the current sync_buff.
  *      Called by ip_vs_in.
+ *      Sending Version 1 messages
  */
-void ip_vs_sync_conn(const struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct ip_vs_conn *cp)
 {
 	struct ip_vs_sync_mesg *m;
-	struct ip_vs_sync_conn_v0 *s;
-	int len;
+	union ip_vs_sync_conn *s;
+	__u8 *p;
+	unsigned int len, pe_name_len, pad;
+
+	/* Do not sync ONE PACKET */
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		goto control;
+sloop:
+	/* Sanity checks */
+	pe_name_len = 0;
+	if (cp->pe_data_len) {
+		if (!cp->pe_data || !cp->dest) {
+			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+			return;
+		}
+		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+	}
 
 	spin_lock(&curr_sb_lock);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		len = sizeof(struct ip_vs_sync_v6);
+	else
+#endif
+		len = sizeof(struct ip_vs_sync_v4);
+
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+		len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+	if (cp->pe_data_len)
+		len += cp->pe_data_len + 2;	/* + Param hdr field */
+	if (pe_name_len)
+		len += pe_name_len + 2;
+
+	/* check if there is a space for this one  */
+	pad = 0;
+	if (curr_sb) {
+		pad = (4 - (size_t)curr_sb->head) & 3;
+		if (curr_sb->head + len + pad > curr_sb->end) {
+			sb_queue_tail(curr_sb);
+			curr_sb = NULL;
+			pad = 0;
+		}
+	}
+
 	if (!curr_sb) {
 		if (!(curr_sb=ip_vs_sync_buff_create())) {
 			spin_unlock(&curr_sb_lock);
@@ -393,41 +453,84 @@ void ip_vs_sync_conn(const struct ip_vs_conn *cp)
 		}
 	}
 
-	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
-		SIMPLE_CONN_SIZE;
 	m = curr_sb->mesg;
-	s = (struct ip_vs_sync_conn_v0 *)curr_sb->head;
-
-	/* copy members */
-	s->protocol = cp->protocol;
-	s->cport = cp->cport;
-	s->vport = cp->vport;
-	s->dport = cp->dport;
-	s->caddr = cp->caddr.ip;
-	s->vaddr = cp->vaddr.ip;
-	s->daddr = cp->daddr.ip;
-	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
-	s->state = htons(cp->state);
-	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
-		struct ip_vs_sync_conn_options *opt =
-			(struct ip_vs_sync_conn_options *)&s[1];
-		memcpy(opt, &cp->in_seq, sizeof(*opt));
-	}
-
+	p = curr_sb->head;
+	curr_sb->head += pad + len;
+	m->size += pad + len;
+	/* Add ev. padding from prev. sync_conn */
+	while (pad--)
+		*(p++) = 0;
+
+	s = (union ip_vs_sync_conn *)p;
+
+	/* Set message type  & copy members */
+	s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */
+	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+	s->v4.state = htons(cp->state);
+	s->v4.protocol = cp->protocol;
+	s->v4.cport = cp->cport;
+	s->v4.vport = cp->vport;
+	s->v4.dport = cp->dport;
+	s->v4.fwmark = htonl(cp->fwmark);
+	s->v4.timeout = htonl(cp->timeout / HZ);
 	m->nr_conns++;
-	m->size += len;
-	curr_sb->head += len;
 
-	/* check if there is a space for next one */
-	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
-		sb_queue_tail(curr_sb);
-		curr_sb = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6) {
+		p += sizeof(struct ip_vs_sync_v6);
+		ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
+		ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
+		ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
+	} else
+#endif
+	{
+		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */
+		s->v4.caddr = cp->caddr.ip;
+		s->v4.vaddr = cp->vaddr.ip;
+		s->v4.daddr = cp->daddr.ip;
+	}
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+		*(p++) = IPVS_OPT_SEQ_DATA;
+		*(p++) = sizeof(struct ip_vs_sync_conn_options);
+		hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
+		p += sizeof(struct ip_vs_seq);
+		hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
+		p += sizeof(struct ip_vs_seq);
 	}
+	/* Handle pe data */
+	if (cp->pe_data_len && cp->pe_data) {
+		*(p++) = IPVS_OPT_PE_DATA;
+		*(p++) = cp->pe_data_len;
+		memcpy(p, cp->pe_data, cp->pe_data_len);
+		p += cp->pe_data_len;
+		if (pe_name_len) {
+			/* Add PE_NAME */
+			*(p++) = IPVS_OPT_PE_NAME;
+			*(p++) = pe_name_len;
+			memcpy(p, cp->pe->name, pe_name_len);
+			p += pe_name_len;
+		}
+	}
+
 	spin_unlock(&curr_sb_lock);
 
+control:
 	/* synchronize its controller if it has */
-	if (cp->control)
-		ip_vs_sync_conn(cp->control);
+	cp = cp->control;
+	if (!cp)
+		return;
+	/*
+	 * Reduce sync rate for templates
+	 * i.e only increment in_pkts for Templates.
+	 */
+	if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+		int pkts = atomic_add_return(1, &cp->in_pkts);
+
+		if (pkts % sysctl_ip_vs_sync_threshold[1] != 1)
+			return;
+	}
+	goto sloop;
 }
 
 /*
@@ -596,7 +699,7 @@ static void ip_vs_proc_conn(struct ip_vs_conn_param *param,  unsigned flags,
  */
 static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 {
-	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
+	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
 	struct ip_vs_sync_conn_v0 *s;
 	struct ip_vs_sync_conn_options *opt;
 	struct ip_vs_protocol *pp;
@@ -604,7 +707,7 @@ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 	char *p;
 	int i;
 
-	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
 	for (i=0; i<m->nr_conns; i++) {
 		unsigned flags, state;
 
@@ -848,11 +951,11 @@ out:
  */
 static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
 {
-	struct ip_vs_sync_mesg_v2 *m2 = (struct ip_vs_sync_mesg_v2 *)buffer;
+	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
 	__u8 *p, *msg_end;
-	unsigned int i, nr_conns;
+	int i, nr_conns;
 
-	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
+	if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
 		IP_VS_DBG(2, "BACKUP, message header too short\n");
 		return;
 	}
@@ -872,7 +975,7 @@ static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
 	if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
 	    && (m2->spare == 0)) {
 
-		msg_end = buffer + sizeof(struct ip_vs_sync_mesg_v2);
+		msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
 		nr_conns = m2->nr_conns;
 
 		for (i=0; i<nr_conns; i++) {
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 32/79] netfilter: xtables: use guarded types
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Jan Engelhardt <jengelh@medozas.de>

We are supposed to use the kernel's own types in userspace exports.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/xt_CT.h          |   10 +++++-----
 include/linux/netfilter/xt_TCPOPTSTRIP.h |    2 +-
 include/linux/netfilter/xt_TPROXY.h      |    8 ++++----
 include/linux/netfilter/xt_cluster.h     |    8 ++++----
 include/linux/netfilter/xt_quota.h       |    6 +++---
 include/linux/netfilter/xt_time.h        |   14 +++++++-------
 include/linux/netfilter/xt_u32.h         |   16 ++++++++--------
 7 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/include/linux/netfilter/xt_CT.h b/include/linux/netfilter/xt_CT.h
index 1b56410..fbf4c56 100644
--- a/include/linux/netfilter/xt_CT.h
+++ b/include/linux/netfilter/xt_CT.h
@@ -4,11 +4,11 @@
 #define XT_CT_NOTRACK	0x1
 
 struct xt_ct_target_info {
-	u_int16_t	flags;
-	u_int16_t	zone;
-	u_int32_t	ct_events;
-	u_int32_t	exp_events;
-	char		helper[16];
+	__u16 flags;
+	__u16 zone;
+	__u32 ct_events;
+	__u32 exp_events;
+	char helper[16];
 
 	/* Used internally by the kernel */
 	struct nf_conn	*ct __attribute__((aligned(8)));
diff --git a/include/linux/netfilter/xt_TCPOPTSTRIP.h b/include/linux/netfilter/xt_TCPOPTSTRIP.h
index 2db5432..342ef14 100644
--- a/include/linux/netfilter/xt_TCPOPTSTRIP.h
+++ b/include/linux/netfilter/xt_TCPOPTSTRIP.h
@@ -7,7 +7,7 @@
 	(((1U << (idx & 31)) & bmap[(idx) >> 5]) != 0)
 
 struct xt_tcpoptstrip_target_info {
-	u_int32_t strip_bmap[8];
+	__u32 strip_bmap[8];
 };
 
 #endif /* _XT_TCPOPTSTRIP_H */
diff --git a/include/linux/netfilter/xt_TPROXY.h b/include/linux/netfilter/xt_TPROXY.h
index 3f3d693..8097e0b 100644
--- a/include/linux/netfilter/xt_TPROXY.h
+++ b/include/linux/netfilter/xt_TPROXY.h
@@ -5,15 +5,15 @@
  * redirection. We can get rid of that whenever we get support for
  * mutliple targets in the same rule. */
 struct xt_tproxy_target_info {
-	u_int32_t mark_mask;
-	u_int32_t mark_value;
+	__u32 mark_mask;
+	__u32 mark_value;
 	__be32 laddr;
 	__be16 lport;
 };
 
 struct xt_tproxy_target_info_v1 {
-	u_int32_t mark_mask;
-	u_int32_t mark_value;
+	__u32 mark_mask;
+	__u32 mark_value;
 	union nf_inet_addr laddr;
 	__be16 lport;
 };
diff --git a/include/linux/netfilter/xt_cluster.h b/include/linux/netfilter/xt_cluster.h
index 8866826..66cfa3c 100644
--- a/include/linux/netfilter/xt_cluster.h
+++ b/include/linux/netfilter/xt_cluster.h
@@ -6,10 +6,10 @@ enum xt_cluster_flags {
 };
 
 struct xt_cluster_match_info {
-	u_int32_t		total_nodes;
-	u_int32_t		node_mask;
-	u_int32_t		hash_seed;
-	u_int32_t		flags;
+	__u32 total_nodes;
+	__u32 node_mask;
+	__u32 hash_seed;
+	__u32 flags;
 };
 
 #define XT_CLUSTER_NODES_MAX	32
diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h
index b0d28c6..8bda65f 100644
--- a/include/linux/netfilter/xt_quota.h
+++ b/include/linux/netfilter/xt_quota.h
@@ -9,9 +9,9 @@ enum xt_quota_flags {
 struct xt_quota_priv;
 
 struct xt_quota_info {
-	u_int32_t		flags;
-	u_int32_t		pad;
-	aligned_u64		quota;
+	__u32 flags;
+	__u32 pad;
+	aligned_u64 quota;
 
 	/* Used internally by the kernel */
 	struct xt_quota_priv	*master;
diff --git a/include/linux/netfilter/xt_time.h b/include/linux/netfilter/xt_time.h
index 14b6df4..b8bd456 100644
--- a/include/linux/netfilter/xt_time.h
+++ b/include/linux/netfilter/xt_time.h
@@ -2,13 +2,13 @@
 #define _XT_TIME_H 1
 
 struct xt_time_info {
-	u_int32_t date_start;
-	u_int32_t date_stop;
-	u_int32_t daytime_start;
-	u_int32_t daytime_stop;
-	u_int32_t monthdays_match;
-	u_int8_t weekdays_match;
-	u_int8_t flags;
+	__u32 date_start;
+	__u32 date_stop;
+	__u32 daytime_start;
+	__u32 daytime_stop;
+	__u32 monthdays_match;
+	__u8 weekdays_match;
+	__u8 flags;
 };
 
 enum {
diff --git a/include/linux/netfilter/xt_u32.h b/include/linux/netfilter/xt_u32.h
index 9947f56..e8c3d87 100644
--- a/include/linux/netfilter/xt_u32.h
+++ b/include/linux/netfilter/xt_u32.h
@@ -9,13 +9,13 @@ enum xt_u32_ops {
 };
 
 struct xt_u32_location_element {
-	u_int32_t number;
-	u_int8_t nextop;
+	__u32 number;
+	__u8 nextop;
 };
 
 struct xt_u32_value_element {
-	u_int32_t min;
-	u_int32_t max;
+	__u32 min;
+	__u32 max;
 };
 
 /*
@@ -27,14 +27,14 @@ struct xt_u32_value_element {
 struct xt_u32_test {
 	struct xt_u32_location_element location[XT_U32_MAXSIZE+1];
 	struct xt_u32_value_element value[XT_U32_MAXSIZE+1];
-	u_int8_t nnums;
-	u_int8_t nvalues;
+	__u8 nnums;
+	__u8 nvalues;
 };
 
 struct xt_u32 {
 	struct xt_u32_test tests[XT_U32_MAXSIZE+1];
-	u_int8_t ntests;
-	u_int8_t invert;
+	__u8 ntests;
+	__u8 invert;
 };
 
 #endif /* _XT_U32_H */
-- 
1.7.2.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox