Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCHv3] extensions: libxt_CHECKSUM extension
From: Michael S. Tsirkin @ 2010-07-11 15:14 UTC (permalink / raw)
  To: Patrick McHardy, David S. Miller, Jan Engelhardt, Randy Dunlap,
	netfilter-devel
In-Reply-To: <20100711150623.GA7420@redhat.com>

This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.

You can use this target to compute and fill in the checksum in
a packet that lacks a checksum.  This is particularly useful,
if you need to work around old applications such as dhcp clients,
that do not work well with checksum offloads, but don't want to disable
checksum offload in your device.

The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html

Typical expected use (helps old dhclient binary running in a VM):
iptables -A POSTROUTING -t mangle -p udp --dport bootpc \
        -j CHECKSUM --checksum-fill

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Correction in the documentation. Sorry about the noise.

Changes from v2:
	updated man file
Changes from v1:
	switched from ipt to xt

 extensions/libxt_CHECKSUM.c   |   99 +++++++++++++++++++++++++++++++++++++++++
 extensions/libxt_CHECKSUM.man |    8 +++
 2 files changed, 107 insertions(+), 0 deletions(-)
 create mode 100644 extensions/libxt_CHECKSUM.c
 create mode 100644 extensions/libxt_CHECKSUM.man

diff --git a/extensions/libxt_CHECKSUM.c b/extensions/libxt_CHECKSUM.c
new file mode 100644
index 0000000..00fbd8f
--- /dev/null
+++ b/extensions/libxt_CHECKSUM.c
@@ -0,0 +1,99 @@
+/* Shared library add-on to xtables for CHECKSUM
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 by Red Hat, Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is distributed under the terms of GNU GPL v2, 1991
+ *
+ * libxt_CHECKSUM.c borrowed some bits from libipt_ECN.c
+ *
+ * $Id$
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include <xtables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+static void CHECKSUM_help(void)
+{
+	printf(
+"CHECKSUM target options\n"
+"  --checksum-fill			Fill in packet checksum.\n");
+}
+
+static const struct option CHECKSUM_opts[] = {
+	{ "checksum-fill", 0, NULL, 'F' },
+	{ .name = NULL }
+};
+
+static int CHECKSUM_parse(int c, char **argv, int invert, unsigned int *flags,
+                     const void *entry, struct xt_entry_target **target)
+{
+	struct xt_CHECKSUM_info *einfo
+		= (struct xt_CHECKSUM_info *)(*target)->data;
+
+	switch (c) {
+	case 'F':
+		if (*flags)
+			xtables_error(PARAMETER_PROBLEM,
+			        "CHECKSUM target: Only use --checksum-fill ONCE!");
+		einfo->operation = XT_CHECKSUM_OP_FILL;
+		*flags |= XT_CHECKSUM_OP_FILL;
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static void CHECKSUM_check(unsigned int flags)
+{
+	if (!flags)
+		xtables_error(PARAMETER_PROBLEM,
+		           "CHECKSUM target: Parameter --checksum-fill is required");
+}
+
+static void CHECKSUM_print(const void *ip, const struct xt_entry_target *target,
+                      int numeric)
+{
+	const struct xt_CHECKSUM_info *einfo =
+		(const struct xt_CHECKSUM_info *)target->data;
+
+	printf("CHECKSUM ");
+
+	if (einfo->operation & XT_CHECKSUM_OP_FILL)
+		printf("fill ");
+}
+
+static void CHECKSUM_save(const void *ip, const struct xt_entry_target *target)
+{
+	const struct xt_CHECKSUM_info *einfo =
+		(const struct xt_CHECKSUM_info *)target->data;
+
+	if (einfo->operation & XT_CHECKSUM_OP_FILL)
+		printf("--checksum-fill ");
+}
+
+static struct xtables_target checksum_tg_reg = {
+	.name		= "CHECKSUM",
+	.version	= XTABLES_VERSION,
+	.family		= NFPROTO_UNSPEC,
+	.size		= XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+	.userspacesize	= XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+	.help		= CHECKSUM_help,
+	.parse		= CHECKSUM_parse,
+	.final_check	= CHECKSUM_check,
+	.print		= CHECKSUM_print,
+	.save		= CHECKSUM_save,
+	.extra_opts	= CHECKSUM_opts,
+};
+
+void _init(void)
+{
+	xtables_register_target(&checksum_tg_reg);
+}
diff --git a/extensions/libxt_CHECKSUM.man b/extensions/libxt_CHECKSUM.man
new file mode 100644
index 0000000..92ae700
--- /dev/null
+++ b/extensions/libxt_CHECKSUM.man
@@ -0,0 +1,8 @@
+This target allows to selectively work around broken/old applications.
+It can only be used in the mangle table.
+.TP
+\fB\-\-checksum\-fill\fP
+Compute and fill in the checksum in a packet that lacks a checksum.
+This is particularly useful, if you need to work around old applications
+such as dhcp clients, that do not work well with checksum offloads,
+but don't want to disable checksum offload in your device.
-- 
1.7.2.rc0.14.g41c1c

^ permalink raw reply related

* [PATCHv2] extensions: libxt_CHECKSUM extension
From: Michael S. Tsirkin @ 2010-07-11 15:08 UTC (permalink / raw)
  To: Patrick McHardy, David S. Miller, Jan Engelhardt, Randy Dunlap,
	netfilter-devel
In-Reply-To: <20100711150623.GA7420@redhat.com>

This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.

You can use this target to compute and fill in the checksum in
a packet that lacks a checksum.  This is particularly useful,
if you need to work around old applications such as dhcp clients,
that do not work well with checksum offloads, but don't want to disable
checksum offload in your device.

The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Changes from v1:
	switched from ipt to xt target

 extensions/libipt_CHECKSUM.man |    8 +++
 extensions/libxt_CHECKSUM.c    |   99 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 0 deletions(-)
 create mode 100644 extensions/libipt_CHECKSUM.man
 create mode 100644 extensions/libxt_CHECKSUM.c

diff --git a/extensions/libipt_CHECKSUM.man b/extensions/libipt_CHECKSUM.man
new file mode 100644
index 0000000..4f83335
--- /dev/null
+++ b/extensions/libipt_CHECKSUM.man
@@ -0,0 +1,8 @@
+This target allows to selectively work around broken/old applications.
+It can only be used in the mangle table.
+.TP
+\fB\-\-checksum\-fill\fP
+Compute and fill in the checksum in a packet that lacks a checksum.
+This is particularly useful, if you need to work around old applications
+such as dhcp clients, that do not work well with checksum offloads,
+but don't want to disable checksum offload in your device.
diff --git a/extensions/libxt_CHECKSUM.c b/extensions/libxt_CHECKSUM.c
new file mode 100644
index 0000000..00fbd8f
--- /dev/null
+++ b/extensions/libxt_CHECKSUM.c
@@ -0,0 +1,99 @@
+/* Shared library add-on to xtables for CHECKSUM
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 by Red Hat, Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is distributed under the terms of GNU GPL v2, 1991
+ *
+ * libxt_CHECKSUM.c borrowed some bits from libipt_ECN.c
+ *
+ * $Id$
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include <xtables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+static void CHECKSUM_help(void)
+{
+	printf(
+"CHECKSUM target options\n"
+"  --checksum-fill			Fill in packet checksum.\n");
+}
+
+static const struct option CHECKSUM_opts[] = {
+	{ "checksum-fill", 0, NULL, 'F' },
+	{ .name = NULL }
+};
+
+static int CHECKSUM_parse(int c, char **argv, int invert, unsigned int *flags,
+                     const void *entry, struct xt_entry_target **target)
+{
+	struct xt_CHECKSUM_info *einfo
+		= (struct xt_CHECKSUM_info *)(*target)->data;
+
+	switch (c) {
+	case 'F':
+		if (*flags)
+			xtables_error(PARAMETER_PROBLEM,
+			        "CHECKSUM target: Only use --checksum-fill ONCE!");
+		einfo->operation = XT_CHECKSUM_OP_FILL;
+		*flags |= XT_CHECKSUM_OP_FILL;
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static void CHECKSUM_check(unsigned int flags)
+{
+	if (!flags)
+		xtables_error(PARAMETER_PROBLEM,
+		           "CHECKSUM target: Parameter --checksum-fill is required");
+}
+
+static void CHECKSUM_print(const void *ip, const struct xt_entry_target *target,
+                      int numeric)
+{
+	const struct xt_CHECKSUM_info *einfo =
+		(const struct xt_CHECKSUM_info *)target->data;
+
+	printf("CHECKSUM ");
+
+	if (einfo->operation & XT_CHECKSUM_OP_FILL)
+		printf("fill ");
+}
+
+static void CHECKSUM_save(const void *ip, const struct xt_entry_target *target)
+{
+	const struct xt_CHECKSUM_info *einfo =
+		(const struct xt_CHECKSUM_info *)target->data;
+
+	if (einfo->operation & XT_CHECKSUM_OP_FILL)
+		printf("--checksum-fill ");
+}
+
+static struct xtables_target checksum_tg_reg = {
+	.name		= "CHECKSUM",
+	.version	= XTABLES_VERSION,
+	.family		= NFPROTO_UNSPEC,
+	.size		= XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+	.userspacesize	= XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+	.help		= CHECKSUM_help,
+	.parse		= CHECKSUM_parse,
+	.final_check	= CHECKSUM_check,
+	.print		= CHECKSUM_print,
+	.save		= CHECKSUM_save,
+	.extra_opts	= CHECKSUM_opts,
+};
+
+void _init(void)
+{
+	xtables_register_target(&checksum_tg_reg);
+}

^ permalink raw reply related

* [PATCHv2] netfilter: add CHECKSUM target
From: Michael S. Tsirkin @ 2010-07-11 15:06 UTC (permalink / raw)
  To: Patrick McHardy, Michael S. Tsirkin, David S. Miller,
	Jan Engelhardt, Randy Dunlap

This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.

You can use this target to compute and fill in the checksum in
a packet that lacks a checksum.  This is particularly useful,
if you need to work around old applications such as dhcp clients,
that do not work well with checksum offloads, but don't want to
disable checksum offload in your device.

The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html

Typical expected use (helps old dhclient binary running in a VM):
iptables -A POSTROUTING -t mangle -p udp --dport bootpc \
	-j CHECKSUM --checksum-fill

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Changes from v1:
	moved from ipt to xt
	get rid of any ipv4 dependencies
	coding style tweaks

 include/linux/netfilter/xt_CHECKSUM.h |   18 ++++++++
 net/netfilter/Kconfig                 |   17 +++++++-
 net/netfilter/Makefile                |    1 +
 net/netfilter/xt_CHECKSUM.c           |   72 +++++++++++++++++++++++++++++++++
 4 files changed, 107 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/netfilter/xt_CHECKSUM.h
 create mode 100644 net/netfilter/xt_CHECKSUM.c

diff --git a/include/linux/netfilter/xt_CHECKSUM.h b/include/linux/netfilter/xt_CHECKSUM.h
new file mode 100644
index 0000000..56afe57
--- /dev/null
+++ b/include/linux/netfilter/xt_CHECKSUM.h
@@ -0,0 +1,18 @@
+/* Header file for iptables ipt_CHECKSUM target
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 Red Hat Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This software is distributed under GNU GPL v2, 1991
+*/
+#ifndef _IPT_CHECKSUM_TARGET_H
+#define _IPT_CHECKSUM_TARGET_H
+
+#define XT_CHECKSUM_OP_FILL	0x01	/* fill in checksum in IP header */
+
+struct xt_CHECKSUM_info {
+	u_int8_t operation;	/* bitset of operations */
+};
+
+#endif /* _IPT_CHECKSUM_TARGET_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8593a77..1cf4852 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -294,7 +294,7 @@ endif # NF_CONNTRACK
 config NETFILTER_TPROXY
 	tristate "Transparent proxying support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
-	depends on IP_NF_MANGLE
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
 	depends on NETFILTER_ADVANCED
 	help
 	  This option enables transparent proxying support, that is,
@@ -347,6 +347,21 @@ config NETFILTER_XT_CONNMARK
 
 comment "Xtables targets"
 
+config NETFILTER_XT_TARGET_CHECKSUM
+	tristate "CHECKSUM target support"
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+	  table.  
+
+	  You can use this target to compute and fill in the checksum in
+	  a packet that lacks a checksum.  This is particularly useful,
+	  if you need to work around old applications such as dhcp clients,
+	  that do not work well with checksum offloads, but don't want to disable
+	  checksum offload in your device.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_CLASSIFY
 	tristate '"CLASSIFY" target support'
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 14e3a8f..8eb541d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
 
 # targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
new file mode 100644
index 0000000..0fee1a7
--- /dev/null
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -0,0 +1,72 @@
+/* iptables module for the packet checksum mangling
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * (C) 2010 Red Hat, Inc.
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
+MODULE_DESCRIPTION("Xtables: checksum modification");
+MODULE_ALIAS("ipt_CHECKSUM");
+MODULE_ALIAS("ip6t_CHECKSUM");
+
+static unsigned int
+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		skb_checksum_help(skb);
+
+	return XT_CONTINUE;
+}
+
+static int checksum_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_CHECKSUM_info *einfo = par->targinfo;
+
+	if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
+		pr_info("unsupported CHECKSUM operation %x\n", einfo->operation);
+		return -EINVAL;
+	}
+	if (!einfo->operation) {
+		pr_info("no CHECKSUM operation enabled\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_target checksum_tg_reg __read_mostly = {
+	.name		= "CHECKSUM",
+	.family		= NFPROTO_UNSPEC,
+	.target		= checksum_tg,
+	.targetsize	= sizeof(struct xt_CHECKSUM_info),
+	.table		= "mangle",
+	.checkentry	= checksum_tg_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init checksum_tg_init(void)
+{
+	return xt_register_target(&checksum_tg_reg);
+}
+
+static void __exit checksum_tg_exit(void)
+{
+	xt_unregister_target(&checksum_tg_reg);
+}
+
+module_init(checksum_tg_init);
+module_exit(checksum_tg_exit);
-- 
1.7.2.rc0.14.g41c1c

^ permalink raw reply related

* Re: [PATCH] netfilter: add CHECKSUM target
From: Michael S. Tsirkin @ 2010-07-11 13:19 UTC (permalink / raw)
  To: Changli Gao
  Cc: Jan Engelhardt, Patrick McHardy, David S. Miller,
	Alexey Kuznetsov, Pekka Savola (ipv6), James Morris,
	Hideaki YOSHIFUJI, linux-kernel, netfilter-devel, netfilter,
	coreteam, netdev, herbert.xu, kvm
In-Reply-To: <AANLkTikamhUUuoQ8EjwPn9gB4zHZkeDuzpiqd4XcN3lx@mail.gmail.com>

On Sun, Jul 11, 2010 at 08:45:01PM +0800, Changli Gao wrote:
> On Sun, Jul 11, 2010 at 8:27 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Fri, Jul 09, 2010 at 11:56:26AM +0200, Jan Engelhardt wrote:
> >>
> >> On Friday 2010-07-09 00:29, Michael S. Tsirkin wrote:
> >> >
> >> > include/linux/netfilter_ipv4/ipt_CHECKSUM.h |   18 +++++++
> >> > net/ipv4/netfilter/Kconfig                  |   16 ++++++
> >> > net/ipv4/netfilter/Makefile                 |    1 +
> >> > net/ipv4/netfilter/ipt_CHECKSUM.c           |   72 +++++++++++++++++++++++++++
> >>
> >> New modules should use xt.
> >
> > I tried moving the module to xt_CHECKSUM but now
> >        iptables -A POSTROUTING -t mangle -p udp --dport 68 -j CHECKSUM --checksum-fill
> > does not to load the module.
> > It seems that xt_request_find_target in x_tables uses the prefix 'ip' for a module.
> > What am I missing?
> >
> 
> You should add MODULE_ALIAS clause, i.e.
> 
> MODULE_ALIAS("ipt_CHECKSUM");

This worked, thanks!

> -- 
> Regards,
> Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH] netfilter: add CHECKSUM target
From: Michael S. Tsirkin @ 2010-07-11 13:19 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Patrick McHardy, David S. Miller, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	linux-kernel, netfilter-devel, netfilter, coreteam, netdev,
	herbert.xu, kvm
In-Reply-To: <alpine.LSU.2.01.1007111445320.6015@obet.zrqbmnf.qr>

On Sun, Jul 11, 2010 at 02:45:49PM +0200, Jan Engelhardt wrote:
> 
> On Sunday 2010-07-11 14:27, Michael S. Tsirkin wrote:
> >> >
> >> > include/linux/netfilter_ipv4/ipt_CHECKSUM.h |   18 +++++++
> >> > net/ipv4/netfilter/Kconfig                  |   16 ++++++
> >> > net/ipv4/netfilter/Makefile                 |    1 +
> >> > net/ipv4/netfilter/ipt_CHECKSUM.c           |   72 +++++++++++++++++++++++++++
> >> 
> >> New modules should use xt.
> >
> >I tried moving the module to xt_CHECKSUM but now
> >	iptables -A POSTROUTING -t mangle -p udp --dport 68 -j CHECKSUM --checksum-fill
> >does not to load the module.
> >It seems that xt_request_find_target in x_tables uses the prefix 'ip' for a module.
> >What am I missing?
> 
> MODULE_ALIAS like the others have.

Worked for me, thanks!

^ permalink raw reply

* RE: [PATCH] bnx2x: add support for receive hashing
From: Vladislav Zolotarov @ 2010-07-11 13:16 UTC (permalink / raw)
  To: Vladislav Zolotarov, David Miller, therbert@google.com
  Cc: netdev@vger.kernel.org
In-Reply-To: <8628FE4E7912BF47A96AE7DD7BAC0AADDDE64704DC@SJEXCHCCR02.corp.ad.broadcom.com>

Dave, it's obvious that there a demand for a new HW/FW configuration from our side - "rx hash enable" which is currently tightly coupled with the RSS capability. As long as RSS flow in our FW includes a few more things apart from just creating an RSS hash and as long as there are flows (even hypothetical) that demand the RSS hash regardless the RSS itself we started to work on separation of these two features from FW perspective. This of course will demand a new FW version but once we have it we'll be able to be more specific in HW configuration and have a cleaner code.

Technically, our FW may provide the Rx HASH always and in a current driver configuration this is what it does.
I wonder if the driver always provides the HW RX HASH in the skb->rxhash regardless the value of NETIF_F_RXHASH bit in a netdev->features will it cause any harm? If not we can get rid of two extra conditionals in the bnx2x_rx_int().

Thanks,
vlad

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> Behalf Of Vladislav Zolotarov
> Sent: Sunday, July 11, 2010 1:02 PM
> To: David Miller; therbert@google.com
> Cc: netdev@vger.kernel.org
> Subject: RE: [PATCH] bnx2x: add support for receive hashing
> 
> 
> 
> > -----Original Message-----
> > From: David Miller [mailto:davem@davemloft.net]
> > Sent: Sunday, July 11, 2010 5:13 AM
> > To: therbert@google.com
> > Cc: Vladislav Zolotarov; netdev@vger.kernel.org
> > Subject: Re: [PATCH] bnx2x: add support for receive hashing
> >
> > From: Tom Herbert <therbert@google.com>
> > Date: Wed, 7 Jul 2010 12:17:31 -0700
> >
> > > It is to enable the device to provide the RSS hash in RX descriptor.
> > > The hash severs two purposes now, it's used internally in the device
> > > to perform RSS table lookup and also value in RX descriptor.  The
> > > latter does not require multi-queue.  Strictly speaking, on a single
> > > processor system without multqueue, it would be true that enabling the
> > > RX hash on bnx2x is currently superfluous (notwithstanding some other
> > > use of the hash might be implemented).
> >
> > We intend to use the card provided RSS hash to optimize GSO
> > flow comparisons at some point.
> 
> Could u explain what did u mean here, pls.? GSO is a Tx flow while RSS hash
> is generated for the ingress traffic.
> 
> >
> > There are other possible uses as well.
> 
> Could u elaborate, pls?
> 
> >
> > Therefore even in a single RX queue configuration, the driver
> > should provide the hash if it can.
> 
> Providing an RSS hash adds an additional work to our FW and HW. We would like
> to understand why do we need to pay this penalty. And surely we don't want to
> pay it for a possible use. We'd prefer to pay it when it's really needed.
> 
> Thanks,
> vlad
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



^ permalink raw reply

* Re: Splice status
From: Changli Gao @ 2010-07-11 13:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jens Axboe, Ofer Heifetz, netdev@vger.kernel.org
In-Reply-To: <1278388580.2466.305.camel@edumazet-laptop>

On Tue, Jul 6, 2010 at 11:56 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 06 juillet 2010 à 10:01 +0800, Changli Gao a écrit :
>>
>> If we don't drain the pipe before calling splice(2), the data spliced
>> from pipe maybe not be what we expect. Then data corruption occurs.
>>
>
> This is not true. A pipe is a pipe is a buffer. You dont need it to be
> empty when using it. Nowhere in documentation its stated.

Do you mean splice(2) empties the pipe buffer before using it as an
output buffer? If not, the pipe draining is needed to avoid data
corruption.

>
> However, a single skb can fill a pipe, even if "its empty"
>

Yea. Because tcp_splice_read() doesn't know if the __tcp_splice_read
returns due to pipe fulling.

>
>> >
>> > splice(sock, pipe) can block if caller dont use appropriate "non
>> > blocking pipe' splice() mode, even if pipe is empty before a splice()
>> > call.
>>
>> I don't think it is expected. The code of sys_recvfile is much like
>> the sendfile(2) implementation in kernel. If sys_recvfile may block
>> without non_block flag, sendfile(2) may block too.
>
> Then it would be a bug. You might fix it easily.

It seems reasonable. I'll fix it.

>
> Using splice() correctly (ie, not blocking on sock->pipe) should work
> too.
>
> Again, you can block on splice(sock, pipe), iff you have a second thread
> doing the opposite (pipe->file) in parallel to unblock you. But samba
> recvfile algo is using a single thread.
>




-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH] netfilter: add CHECKSUM target
From: Jan Engelhardt @ 2010-07-11 12:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Patrick McHardy, David S. Miller, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	linux-kernel, netfilter-devel, netfilter, coreteam, netdev,
	herbert.xu, kvm
In-Reply-To: <20100711122708.GA3573@redhat.com>


On Sunday 2010-07-11 14:27, Michael S. Tsirkin wrote:
>> >
>> > include/linux/netfilter_ipv4/ipt_CHECKSUM.h |   18 +++++++
>> > net/ipv4/netfilter/Kconfig                  |   16 ++++++
>> > net/ipv4/netfilter/Makefile                 |    1 +
>> > net/ipv4/netfilter/ipt_CHECKSUM.c           |   72 +++++++++++++++++++++++++++
>> 
>> New modules should use xt.
>
>I tried moving the module to xt_CHECKSUM but now
>	iptables -A POSTROUTING -t mangle -p udp --dport 68 -j CHECKSUM --checksum-fill
>does not to load the module.
>It seems that xt_request_find_target in x_tables uses the prefix 'ip' for a module.
>What am I missing?

MODULE_ALIAS like the others have.

^ permalink raw reply

* Re: [PATCH] netfilter: add CHECKSUM target
From: Changli Gao @ 2010-07-11 12:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jan Engelhardt, Patrick McHardy, David S. Miller,
	Alexey Kuznetsov, Pekka Savola (ipv6), James Morris,
	Hideaki YOSHIFUJI, linux-kernel, netfilter-devel, netfilter,
	coreteam, netdev, herbert.xu, kvm
In-Reply-To: <20100711122708.GA3573@redhat.com>

On Sun, Jul 11, 2010 at 8:27 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Fri, Jul 09, 2010 at 11:56:26AM +0200, Jan Engelhardt wrote:
>>
>> On Friday 2010-07-09 00:29, Michael S. Tsirkin wrote:
>> >
>> > include/linux/netfilter_ipv4/ipt_CHECKSUM.h |   18 +++++++
>> > net/ipv4/netfilter/Kconfig                  |   16 ++++++
>> > net/ipv4/netfilter/Makefile                 |    1 +
>> > net/ipv4/netfilter/ipt_CHECKSUM.c           |   72 +++++++++++++++++++++++++++
>>
>> New modules should use xt.
>
> I tried moving the module to xt_CHECKSUM but now
>        iptables -A POSTROUTING -t mangle -p udp --dport 68 -j CHECKSUM --checksum-fill
> does not to load the module.
> It seems that xt_request_find_target in x_tables uses the prefix 'ip' for a module.
> What am I missing?
>

You should add MODULE_ALIAS clause, i.e.

MODULE_ALIAS("ipt_CHECKSUM");

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH] netfilter: add CHECKSUM target
From: Michael S. Tsirkin @ 2010-07-11 12:27 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Patrick McHardy, David S. Miller, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	linux-kernel, netfilter-devel, netfilter, coreteam, netdev,
	herbert.xu, kvm
In-Reply-To: <alpine.LSU.2.01.1007091154120.7235@obet.zrqbmnf.qr>

On Fri, Jul 09, 2010 at 11:56:26AM +0200, Jan Engelhardt wrote:
> 
> On Friday 2010-07-09 00:29, Michael S. Tsirkin wrote:
> >
> > include/linux/netfilter_ipv4/ipt_CHECKSUM.h |   18 +++++++
> > net/ipv4/netfilter/Kconfig                  |   16 ++++++
> > net/ipv4/netfilter/Makefile                 |    1 +
> > net/ipv4/netfilter/ipt_CHECKSUM.c           |   72 +++++++++++++++++++++++++++
> 
> New modules should use xt.

I tried moving the module to xt_CHECKSUM but now
	iptables -A POSTROUTING -t mangle -p udp --dport 68 -j CHECKSUM --checksum-fill
does not to load the module.
It seems that xt_request_find_target in x_tables uses the prefix 'ip' for a module.
What am I missing?

> >+static unsigned int
> >+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
> >+{
> >+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
> >+		skb_checksum_help(skb);
> >+	}
> 
> - {}

^ permalink raw reply

* Re: [PATCH] netfilter: add CHECKSUM target
From: Michael S. Tsirkin @ 2010-07-11 10:47 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Alexey Kuznetsov, Pekka Savola (ipv6),
	James Morris, Hideaki YOSHIFUJI, linux-kernel, netfilter-devel,
	netfilter, coreteam, netdev, herbert.xu, kvm
In-Reply-To: <4C373D90.8070000@trash.net>

On Fri, Jul 09, 2010 at 05:17:36PM +0200, Patrick McHardy wrote:
> Am 09.07.2010 00:29, schrieb Michael S. Tsirkin:
> > This adds a `CHECKSUM' target, which can be used in the iptables mangle
> > table.
> > 
> > You can use this target to compute and fill in the checksum in
> > an IP packet that lacks a checksum.  This is particularly useful,
> > if you need to work around old applications such as dhcp clients,
> > that do not work well with checksum offloads, but don't want to
> > disable checksum offload in your device.
> > 
> > The problem happens in the field with virtualized applications.
> > For reference, see Red Hat bz 605555, as well as
> > http://www.spinics.net/lists/kvm/msg37660.html
> > 
> > Typical expected use (helps old dhclient binary running in a VM):
> > iptables -A POSTROUTING -t mangle -p udp --dport 68 -j CHECKSUM
> > --checksum-fill
> 
> I'm not sure this is something we want to merge upstream and
> support indefinitely. Dave suggested this as a temporary
> out-of-tree workaround until the majority of guest dhcp clients
> are fixed. Has anything changed that makes this course of
> action impractical?

If I understand what Dave said correctly, it's up to you ...

The arguments for putting this upstream are:

Given the track record, I wouldn't hope for quick fix in the majority of
guest dhcp clients, unfortunately :(.  We are talking years here.
Even after that, one of the uses of virtualization is
to keep old guests running. So yes, I think we'll
keep using work-arounds for this for a very long time.

Further, since we have to add the module and we have to teach management
to program it, it will be much less painful for everyone
involved if we can put the code upstream, rather than forking
management code.

-- 
MST

^ permalink raw reply

* Re: [PATCH] netfilter: add CHECKSUM target
From: Herbert Xu @ 2010-07-11 10:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Patrick McHardy, David S. Miller, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	linux-kernel, netfilter-devel, netfilter, coreteam, netdev, kvm
In-Reply-To: <20100711092117.GA16502@redhat.com>

On Sun, Jul 11, 2010 at 12:21:17PM +0300, Michael S. Tsirkin wrote:
>
> > I'd think that this target would be protocol-agnostic, no?
> 
> Meaning it should go into net/netfilter/? Will do.

Right, and you should get rid of any IPv4 dependencies.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* RE: [PATCH] bnx2x: add support for receive hashing
From: Vladislav Zolotarov @ 2010-07-11 10:02 UTC (permalink / raw)
  To: David Miller, therbert@google.com; +Cc: netdev@vger.kernel.org
In-Reply-To: <20100710.191242.70196353.davem@davemloft.net>



> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Sunday, July 11, 2010 5:13 AM
> To: therbert@google.com
> Cc: Vladislav Zolotarov; netdev@vger.kernel.org
> Subject: Re: [PATCH] bnx2x: add support for receive hashing
> 
> From: Tom Herbert <therbert@google.com>
> Date: Wed, 7 Jul 2010 12:17:31 -0700
> 
> > It is to enable the device to provide the RSS hash in RX descriptor.
> > The hash severs two purposes now, it's used internally in the device
> > to perform RSS table lookup and also value in RX descriptor.  The
> > latter does not require multi-queue.  Strictly speaking, on a single
> > processor system without multqueue, it would be true that enabling the
> > RX hash on bnx2x is currently superfluous (notwithstanding some other
> > use of the hash might be implemented).
> 
> We intend to use the card provided RSS hash to optimize GSO
> flow comparisons at some point.

Could u explain what did u mean here, pls.? GSO is a Tx flow while RSS hash is generated for the ingress traffic.

> 
> There are other possible uses as well.

Could u elaborate, pls?

> 
> Therefore even in a single RX queue configuration, the driver
> should provide the hash if it can.

Providing an RSS hash adds an additional work to our FW and HW. We would like to understand why do we need to pay this penalty. And surely we don't want to pay it for a possible use. We'd prefer to pay it when it's really needed.

Thanks,
vlad



^ permalink raw reply

* Re: [PATCH] netfilter: add CHECKSUM target
From: Michael S. Tsirkin @ 2010-07-11  9:21 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Patrick McHardy, David S. Miller, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	linux-kernel, netfilter-devel, netfilter, coreteam, netdev, kvm
In-Reply-To: <20100709071814.GA27964@gondor.apana.org.au>

On Fri, Jul 09, 2010 at 03:18:14PM +0800, Herbert Xu wrote:
> On Fri, Jul 09, 2010 at 01:29:13AM +0300, Michael S. Tsirkin wrote:
> > This adds a `CHECKSUM' target, which can be used in the iptables mangle
> > table.
> > 
> > You can use this target to compute and fill in the checksum in
> > an IP packet that lacks a checksum.  This is particularly useful,
> > if you need to work around old applications such as dhcp clients,
> > that do not work well with checksum offloads, but don't want to
> > disable checksum offload in your device.
> > 
> > The problem happens in the field with virtualized applications.
> > For reference, see Red Hat bz 605555, as well as
> > http://www.spinics.net/lists/kvm/msg37660.html
> > 
> > Typical expected use (helps old dhclient binary running in a VM):
> > iptables -A POSTROUTING -t mangle -p udp --dport 68 -j CHECKSUM
> > --checksum-fill
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> I'd think that this target would be protocol-agnostic, no?

Meaning it should go into net/netfilter/? Will do.

> Cheers,
> -- 
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [patch v2.6 4/4] libxt_ipvs: user-space lib for netfilter matcher xt_ipvs
From: horms, Simon Horman @ 2010-07-11  9:03 UTC (permalink / raw)
  To: lvs-devel, netdev, linux-kernel, netfilter, netfilter-devel
  Cc: Malcolm Turnbull, Wensong Zhang, Julius Volz, Patrick McHardy,
	David S. Miller, Hannes Eder
In-Reply-To: <20100711090342.035149543@vergenet.net>

[-- Attachment #1: libxt_ipvs-user-space-lib-for-netfilter-matcher-xt_ipvs.patch --]
[-- Type: text/plain, Size: 13930 bytes --]

From:	Hannes Eder <heder@google.com>

The user-space library for the netfilter matcher xt_ipvs.

[ trivial up-port by Simon Horman <horms@verge.net.au> ]
Signed-off-by: Hannes Eder <heder@google.com>
Acked-by: Simon Horman <horms@verge.net.au>

 configure.ac                      |   10 -
 extensions/libxt_ipvs.c           |  365 +++++++++++++++++++++++++++++++++++++
 extensions/libxt_ipvs.man         |   24 ++
 include/linux/netfilter/xt_ipvs.h |   25 +++
 4 files changed, 422 insertions(+), 2 deletions(-)
 create mode 100644 extensions/libxt_ipvs.c
 create mode 100644 extensions/libxt_ipvs.man
 create mode 100644 include/linux/netfilter/xt_ipvs.h

v2.1, v2.3
Trivial up-port

v2.2
No change

Index: iptables/configure.ac
===================================================================
--- iptables.orig/configure.ac	2010-07-04 20:21:07.000000000 +0900
+++ iptables/configure.ac	2010-07-04 20:23:30.000000000 +0900
@@ -52,12 +52,18 @@ AC_ARG_WITH([pkgconfigdir], AS_HELP_STRI
 	[Path to the pkgconfig directory [[LIBDIR/pkgconfig]]]),
 	[pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
 
-AC_CHECK_HEADER([linux/dccp.h])
-
 blacklist_modules="";
+
+AC_CHECK_HEADER([linux/dccp.h])
 if test "$ac_cv_header_linux_dccp_h" != "yes"; then
 	blacklist_modules="$blacklist_modules dccp";
 fi;
+
+AC_CHECK_HEADER([linux/ip_vs.h])
+if test "$ac_cv_header_linux_ip_vs_h" != "yes"; then
+	blacklist_modules="$blacklist_modules ipvs";
+fi;
+
 AC_SUBST([blacklist_modules])
 
 AM_CONDITIONAL([ENABLE_STATIC], [test "$enable_static" = "yes"])
Index: iptables/extensions/libxt_ipvs.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ iptables/extensions/libxt_ipvs.c	2010-07-04 20:24:01.000000000 +0900
@@ -0,0 +1,365 @@
+/*
+ * Shared library add-on to iptables to add IPVS matching.
+ *
+ * Detailed doc is in the kernel module source net/netfilter/xt_ipvs.c
+ *
+ * Author: Hannes Eder <heder@google.com>
+ */
+#include <sys/types.h>
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <xtables.h>
+#include <linux/ip_vs.h>
+#include <linux/netfilter/xt_ipvs.h>
+
+static const struct option ipvs_mt_opts[] = {
+	{ .name = "ipvs",     .has_arg = false, .val = '0' },
+	{ .name = "vproto",   .has_arg = true,  .val = '1' },
+	{ .name = "vaddr",    .has_arg = true,  .val = '2' },
+	{ .name = "vport",    .has_arg = true,  .val = '3' },
+	{ .name = "vdir",     .has_arg = true,  .val = '4' },
+	{ .name = "vmethod",  .has_arg = true,  .val = '5' },
+	{ .name = "vportctl", .has_arg = true,  .val = '6' },
+	{ .name = NULL }
+};
+
+static void ipvs_mt_help(void)
+{
+	printf(
+"IPVS match options:\n"
+"[!] --ipvs                      packet belongs to an IPVS connection\n"
+"\n"
+"Any of the following options implies --ipvs (even negated)\n"
+"[!] --vproto protocol           VIP protocol to match; by number or name,\n"
+"                                e.g. \"tcp\"\n"
+"[!] --vaddr address[/mask]      VIP address to match\n"
+"[!] --vport port                VIP port to match; by number or name,\n"
+"                                e.g. \"http\"\n"
+"    --vdir {ORIGINAL|REPLY}     flow direction of packet\n"
+"[!] --vmethod {GATE|IPIP|MASQ}  IPVS forwarding method used\n"
+"[!] --vportctl port             VIP port of the controlling connection to\n"
+"                                match, e.g. 21 for FTP\n"
+		);
+}
+
+static void ipvs_mt_parse_addr_and_mask(const char *arg,
+					union nf_inet_addr *address,
+					union nf_inet_addr *mask,
+					unsigned int family)
+{
+	struct in_addr *addr = NULL;
+	struct in6_addr *addr6 = NULL;
+	unsigned int naddrs = 0;
+
+	if (family == NFPROTO_IPV4) {
+		xtables_ipparse_any(arg, &addr, &mask->in, &naddrs);
+		if (naddrs > 1)
+			xtables_error(PARAMETER_PROBLEM,
+				      "multiple IP addresses not allowed");
+		if (naddrs == 1)
+			memcpy(&address->in, addr, sizeof(*addr));
+	} else if (family == NFPROTO_IPV6) {
+		xtables_ip6parse_any(arg, &addr6, &mask->in6, &naddrs);
+		if (naddrs > 1)
+			xtables_error(PARAMETER_PROBLEM,
+				      "multiple IP addresses not allowed");
+		if (naddrs == 1)
+			memcpy(&address->in6, addr6, sizeof(*addr6));
+	} else {
+		/* Hu? */
+		assert(false);
+	}
+}
+
+/* Function which parses command options; returns true if it ate an option */
+static int ipvs_mt_parse(int c, char **argv, int invert, unsigned int *flags,
+			 const void *entry, struct xt_entry_match **match,
+			 unsigned int family)
+{
+	struct xt_ipvs_mtinfo *data = (void *)(*match)->data;
+	char *p = NULL;
+	u_int8_t op = 0;
+
+	if ('0' <= c && c <= '6') {
+		static const int ops[] = {
+			XT_IPVS_IPVS_PROPERTY,
+			XT_IPVS_PROTO,
+			XT_IPVS_VADDR,
+			XT_IPVS_VPORT,
+			XT_IPVS_DIR,
+			XT_IPVS_METHOD,
+			XT_IPVS_VPORTCTL
+		};
+		op = ops[c - '0'];
+	} else
+		return 0;
+
+	if (*flags & op & XT_IPVS_ONCE_MASK)
+		goto multiple_use;
+
+	switch (c) {
+	case '0': /* --ipvs */
+		/* Nothing to do here. */
+		break;
+
+	case '1': /* --vproto */
+		/* Canonicalize into lower case */
+		for (p = optarg; *p != '\0'; ++p)
+			*p = tolower(*p);
+
+		data->l4proto = xtables_parse_protocol(optarg);
+		break;
+
+	case '2': /* --vaddr */
+		ipvs_mt_parse_addr_and_mask(optarg, &data->vaddr,
+					    &data->vmask, family);
+		break;
+
+	case '3': /* --vport */
+		data->vport = htons(xtables_parse_port(optarg, "tcp"));
+		break;
+
+	case '4': /* --vdir */
+		xtables_param_act(XTF_NO_INVERT, "ipvs", "--vdir", invert);
+		if (strcasecmp(optarg, "ORIGINAL") == 0) {
+			data->bitmask |= XT_IPVS_DIR;
+			data->invert   &= ~XT_IPVS_DIR;
+		} else if (strcasecmp(optarg, "REPLY") == 0) {
+			data->bitmask |= XT_IPVS_DIR;
+			data->invert  |= XT_IPVS_DIR;
+		} else {
+			xtables_param_act(XTF_BAD_VALUE,
+					  "ipvs", "--vdir", optarg);
+		}
+		break;
+
+	case '5': /* --vmethod */
+		if (strcasecmp(optarg, "GATE") == 0)
+			data->fwd_method = IP_VS_CONN_F_DROUTE;
+		else if (strcasecmp(optarg, "IPIP") == 0)
+			data->fwd_method = IP_VS_CONN_F_TUNNEL;
+		else if (strcasecmp(optarg, "MASQ") == 0)
+			data->fwd_method = IP_VS_CONN_F_MASQ;
+		else
+			xtables_param_act(XTF_BAD_VALUE,
+					  "ipvs", "--vmethod", optarg);
+		break;
+
+	case '6': /* --vportctl */
+		data->vportctl = htons(xtables_parse_port(optarg, "tcp"));
+		break;
+
+	default:
+		/* Hu? How did we come here? */
+		assert(false);
+		return 0;
+	}
+
+	if (op & XT_IPVS_ONCE_MASK) {
+		if (data->invert & XT_IPVS_IPVS_PROPERTY)
+			xtables_error(PARAMETER_PROBLEM,
+				      "! --ipvs cannot be together with"
+				      " other options");
+		data->bitmask |= XT_IPVS_IPVS_PROPERTY;
+	}
+
+	data->bitmask |= op;
+	if (invert)
+		data->invert |= op;
+	*flags |= op;
+	return 1;
+
+multiple_use:
+	xtables_error(PARAMETER_PROBLEM,
+		      "multiple use of the same IPVS option is not allowed");
+}
+
+static int ipvs_mt4_parse(int c, char **argv, int invert, unsigned int *flags,
+			  const void *entry, struct xt_entry_match **match)
+{
+	return ipvs_mt_parse(c, argv, invert, flags, entry, match,
+			     NFPROTO_IPV4);
+}
+
+static int ipvs_mt6_parse(int c, char **argv, int invert, unsigned int *flags,
+			  const void *entry, struct xt_entry_match **match)
+{
+	return ipvs_mt_parse(c, argv, invert, flags, entry, match,
+			     NFPROTO_IPV6);
+}
+
+static void ipvs_mt_check(unsigned int flags)
+{
+	if (flags == 0)
+		xtables_error(PARAMETER_PROBLEM,
+			      "IPVS: At least one option is required");
+}
+
+/* Shamelessly copied from libxt_conntrack.c */
+static void ipvs_mt_dump_addr(const union nf_inet_addr *addr,
+			      const union nf_inet_addr *mask,
+			      unsigned int family, bool numeric)
+{
+	char buf[BUFSIZ];
+
+	if (family == NFPROTO_IPV4) {
+		if (!numeric && addr->ip == 0) {
+			printf("anywhere ");
+			return;
+		}
+		if (numeric)
+			strcpy(buf, xtables_ipaddr_to_numeric(&addr->in));
+		else
+			strcpy(buf, xtables_ipaddr_to_anyname(&addr->in));
+		strcat(buf, xtables_ipmask_to_numeric(&mask->in));
+		printf("%s ", buf);
+	} else if (family == NFPROTO_IPV6) {
+		if (!numeric && addr->ip6[0] == 0 && addr->ip6[1] == 0 &&
+		    addr->ip6[2] == 0 && addr->ip6[3] == 0) {
+			printf("anywhere ");
+			return;
+		}
+		if (numeric)
+			strcpy(buf, xtables_ip6addr_to_numeric(&addr->in6));
+		else
+			strcpy(buf, xtables_ip6addr_to_anyname(&addr->in6));
+		strcat(buf, xtables_ip6mask_to_numeric(&mask->in6));
+		printf("%s ", buf);
+	}
+}
+
+static void ipvs_mt_dump(const void *ip, const struct xt_ipvs_mtinfo *data,
+			 unsigned int family, bool numeric, const char *prefix)
+{
+	if (data->bitmask == XT_IPVS_IPVS_PROPERTY) {
+		if (data->invert & XT_IPVS_IPVS_PROPERTY)
+			printf("! ");
+		printf("%sipvs ", prefix);
+	}
+
+	if (data->bitmask & XT_IPVS_PROTO) {
+		if (data->invert & XT_IPVS_PROTO)
+			printf("! ");
+		printf("%sproto %u ", prefix, data->l4proto);
+	}
+
+	if (data->bitmask & XT_IPVS_VADDR) {
+		if (data->invert & XT_IPVS_VADDR)
+			printf("! ");
+
+		printf("%svaddr ", prefix);
+		ipvs_mt_dump_addr(&data->vaddr, &data->vmask, family, numeric);
+	}
+
+	if (data->bitmask & XT_IPVS_VPORT) {
+		if (data->invert & XT_IPVS_VPORT)
+			printf("! ");
+
+		printf("%svport %u ", prefix, ntohs(data->vport));
+	}
+
+	if (data->bitmask & XT_IPVS_DIR) {
+		if (data->invert & XT_IPVS_DIR)
+			printf("%svdir REPLY ", prefix);
+		else
+			printf("%svdir ORIGINAL ", prefix);
+	}
+
+	if (data->bitmask & XT_IPVS_METHOD) {
+		if (data->invert & XT_IPVS_METHOD)
+			printf("! ");
+
+		printf("%svmethod ", prefix);
+		switch (data->fwd_method) {
+		case IP_VS_CONN_F_DROUTE:
+			printf("GATE ");
+			break;
+		case IP_VS_CONN_F_TUNNEL:
+			printf("IPIP ");
+			break;
+		case IP_VS_CONN_F_MASQ:
+			printf("MASQ ");
+			break;
+		default:
+			/* Hu? */
+			printf("UNKNOWN ");
+			break;
+		}
+	}
+
+	if (data->bitmask & XT_IPVS_VPORTCTL) {
+		if (data->invert & XT_IPVS_VPORTCTL)
+			printf("! ");
+
+		printf("%svportctl %u ", prefix, ntohs(data->vportctl));
+	}
+}
+
+static void ipvs_mt4_print(const void *ip, const struct xt_entry_match *match,
+			   int numeric)
+{
+	const struct xt_ipvs_mtinfo *data = (const void *)match->data;
+	ipvs_mt_dump(ip, data, NFPROTO_IPV4, numeric, "");
+}
+
+static void ipvs_mt6_print(const void *ip, const struct xt_entry_match *match,
+			   int numeric)
+{
+	const struct xt_ipvs_mtinfo *data = (const void *)match->data;
+	ipvs_mt_dump(ip, data, NFPROTO_IPV6, numeric, "");
+}
+
+static void ipvs_mt4_save(const void *ip, const struct xt_entry_match *match)
+{
+	const struct xt_ipvs_mtinfo *data = (const void *)match->data;
+	ipvs_mt_dump(ip, data, NFPROTO_IPV4, true, "--");
+}
+
+static void ipvs_mt6_save(const void *ip, const struct xt_entry_match *match)
+{
+	const struct xt_ipvs_mtinfo *data = (const void *)match->data;
+	ipvs_mt_dump(ip, data, NFPROTO_IPV6, true, "--");
+}
+
+static struct xtables_match ipvs_matches_reg[] = {
+	{
+		.version       = XTABLES_VERSION,
+		.name          = "ipvs",
+		.revision      = 0,
+		.family        = NFPROTO_IPV4,
+		.size          = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+		.userspacesize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+		.help          = ipvs_mt_help,
+		.parse         = ipvs_mt4_parse,
+		.final_check   = ipvs_mt_check,
+		.print         = ipvs_mt4_print,
+		.save          = ipvs_mt4_save,
+		.extra_opts    = ipvs_mt_opts,
+	},
+	{
+		.version       = XTABLES_VERSION,
+		.name          = "ipvs",
+		.revision      = 0,
+		.family        = NFPROTO_IPV6,
+		.size          = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+		.userspacesize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+		.help          = ipvs_mt_help,
+		.parse         = ipvs_mt6_parse,
+		.final_check   = ipvs_mt_check,
+		.print         = ipvs_mt6_print,
+		.save          = ipvs_mt6_save,
+		.extra_opts    = ipvs_mt_opts,
+	},
+};
+
+void _init(void)
+{
+	xtables_register_matches(ipvs_matches_reg,
+				 ARRAY_SIZE(ipvs_matches_reg));
+}
Index: iptables/extensions/libxt_ipvs.man
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ iptables/extensions/libxt_ipvs.man	2010-07-04 20:23:30.000000000 +0900
@@ -0,0 +1,24 @@
+Match IPVS connection properties.
+.TP
+[\fB!\fR] \fB\-\-ipvs\fP
+packet belongs to an IPVS connection
+.TP
+Any of the following options implies \-\-ipvs (even negated)
+.TP
+[\fB!\fR] \fB\-\-vproto\fP \fIprotocol\fP
+VIP protocol to match; by number or name, e.g. "tcp"
+.TP
+[\fB!\fR] \fB\-\-vaddr\fP \fIaddress\fP[\fB/\fP\fImask\fP]
+VIP address to match
+.TP
+[\fB!\fR] \fB\-\-vport\fP \fIport\fP
+VIP port to match; by number or name, e.g. "http"
+.TP
+\fB\-\-vdir\fP {\fBORIGINAL\fP|\fBREPLY\fP}
+flow direction of packet
+.TP
+[\fB!\fR] \fB\-\-vmethod\fP {\fBGATE\fP|\fBIPIP\fP|\fBMASQ\fP}
+IPVS forwarding method used
+.TP
+[\fB!\fR] \fB\-\-vportctl\fP \fIport\fP
+VIP port of the controlling connection to match, e.g. 21 for FTP
Index: iptables/include/linux/netfilter/xt_ipvs.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ iptables/include/linux/netfilter/xt_ipvs.h	2010-07-04 20:23:30.000000000 +0900
@@ -0,0 +1,25 @@
+#ifndef _XT_IPVS_H
+#define _XT_IPVS_H 1
+
+#define XT_IPVS_IPVS_PROPERTY	(1 << 0) /* all other options imply this one */
+#define XT_IPVS_PROTO		(1 << 1)
+#define XT_IPVS_VADDR		(1 << 2)
+#define XT_IPVS_VPORT		(1 << 3)
+#define XT_IPVS_DIR		(1 << 4)
+#define XT_IPVS_METHOD		(1 << 5)
+#define XT_IPVS_VPORTCTL	(1 << 6)
+#define XT_IPVS_MASK		((1 << 7) - 1)
+#define XT_IPVS_ONCE_MASK	(XT_IPVS_MASK & ~XT_IPVS_IPVS_PROPERTY)
+
+struct xt_ipvs_mtinfo {
+	union nf_inet_addr	vaddr, vmask;
+	__be16			vport;
+	__u16			l4proto;
+	__u16			fwd_method;
+	__be16			vportctl;
+
+	__u8			invert;
+	__u8			bitmask;
+};
+
+#endif /* _XT_IPVS_H */


^ permalink raw reply

* [patch v2.6 3/4] IPVS: make FTP work with full NAT support
From: horms, Simon Horman @ 2010-07-11  9:03 UTC (permalink / raw)
  To: lvs-devel, netdev, linux-kernel, netfilter, netfilter-devel
  Cc: Malcolm Turnbull, Wensong Zhang, Julius Volz, Patrick McHardy,
	David S. Miller, Hannes Eder
In-Reply-To: <20100711090342.035149543@vergenet.net>

[-- Attachment #1: IPVS-make-FTP-work-with-full-NAT-support.patch --]
[-- Type: text/plain, Size: 12645 bytes --]

From:	Hannes Eder <heder@google.com>

Use nf_conntrack/nf_nat code to do the packet mangling and the TCP
sequence adjusting.  The function 'ip_vs_skb_replace' is now dead
code, so it is removed.

To SNAT FTP, use something like:

% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
> --vport 21 -j SNAT --to-source 192.168.10.10

and for the data connections in passive mode:

% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
> --vportctl 21 -j SNAT --to-source 192.168.10.10

using '-m state --state RELATED' would also works.

Make sure the kernel modules ip_vs_ftp, nf_conntrack_ftp, and
nf_nat_ftp are loaded.

[ up-port and minor fixes by Simon Horman <horms@verge.net.au> ]
Signed-off-by: Hannes Eder <heder@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>

--- 

 include/net/ip_vs.h             |    2 
 net/netfilter/ipvs/Kconfig      |    2 
 net/netfilter/ipvs/ip_vs_app.c  |   43 ---------
 net/netfilter/ipvs/ip_vs_core.c |    1 
 net/netfilter/ipvs/ip_vs_ftp.c  |  174 ++++++++++++++++++++++++++++++++++++---
 5 files changed, 164 insertions(+), 58 deletions(-)

v2.6
* pointer arguments for %pI4

v2.5
* Use nf_ct_is_untracked(ct) instead of nf_ct_is_untracked(),
  the latter is blatantly incorrect
* Return 0 (and thus drop the packet) if mangling wasn't attempted

v2.4
As suggested by Patrick McHardy
* Use nf_conntrack_untracked() instead of &nf_conntrack_untracked
* Fix ip_vs_ftp_out logic
  - Don't call nf_nat_mangle_tcp_packet() unless ct is valid and tracked
  - Only call ip_vs_expect_relatedi() if  nf_nat_mangle_tcp_packet()
    succeeds
  - Note that packets are dropped if mangling fails
Other
* Drop unrelated cosmetic change to sizing of buf in ip_vs_ftp_out()

v2.3
* Up-port
* Drop buf_len = snprintf() change - its a separate, cosmetic, fix
As suggested by Patrick McHardy
* Use %pI4 instead of NIPQUAD

v2.2
* No change

v2.1
* Up-port

Index: nf-next-2.6/include/net/ip_vs.h
===================================================================
--- nf-next-2.6.orig/include/net/ip_vs.h	2010-07-11 17:30:19.000000000 +0900
+++ nf-next-2.6/include/net/ip_vs.h	2010-07-11 17:33:33.000000000 +0900
@@ -736,8 +736,6 @@ extern void ip_vs_app_inc_put(struct ip_
 
 extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
 extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
-extern int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
-			     char *o_buf, int o_len, char *n_buf, int n_len);
 extern int ip_vs_app_init(void);
 extern void ip_vs_app_cleanup(void);
 
Index: nf-next-2.6/net/netfilter/ipvs/Kconfig
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/Kconfig	2010-07-11 17:33:06.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/Kconfig	2010-07-11 17:33:33.000000000 +0900
@@ -235,7 +235,7 @@ comment 'IPVS application helper'
 
 config	IP_VS_FTP
   	tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP
+        depends on IP_VS_PROTO_TCP && NF_NAT
 	---help---
 	  FTP is a protocol that transfers IP address and/or port number in
 	  the payload. In the virtual server via Network Address Translation,
Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_app.c
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_app.c	2010-07-11 17:30:19.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/ip_vs_app.c	2010-07-11 17:33:33.000000000 +0900
@@ -569,49 +569,6 @@ static const struct file_operations ip_v
 };
 #endif
 
-
-/*
- *	Replace a segment of data with a new segment
- */
-int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
-		      char *o_buf, int o_len, char *n_buf, int n_len)
-{
-	int diff;
-	int o_offset;
-	int o_left;
-
-	EnterFunction(9);
-
-	diff = n_len - o_len;
-	o_offset = o_buf - (char *)skb->data;
-	/* The length of left data after o_buf+o_len in the skb data */
-	o_left = skb->len - (o_offset + o_len);
-
-	if (diff <= 0) {
-		memmove(o_buf + n_len, o_buf + o_len, o_left);
-		memcpy(o_buf, n_buf, n_len);
-		skb_trim(skb, skb->len + diff);
-	} else if (diff <= skb_tailroom(skb)) {
-		skb_put(skb, diff);
-		memmove(o_buf + n_len, o_buf + o_len, o_left);
-		memcpy(o_buf, n_buf, n_len);
-	} else {
-		if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
-			return -ENOMEM;
-		skb_put(skb, diff);
-		memmove(skb->data + o_offset + n_len,
-			skb->data + o_offset + o_len, o_left);
-		skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
-	}
-
-	/* must update the iph total length here */
-	ip_hdr(skb)->tot_len = htons(skb->len);
-
-	LeaveFunction(9);
-	return 0;
-}
-
-
 int __init ip_vs_app_init(void)
 {
 	/* we will replace it with proc_net_ipvs_create() soon */
Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_core.c
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_core.c	2010-07-11 17:33:06.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/ip_vs_core.c	2010-07-11 17:33:33.000000000 +0900
@@ -54,7 +54,6 @@
 
 EXPORT_SYMBOL(register_ip_vs_scheduler);
 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
-EXPORT_SYMBOL(ip_vs_skb_replace);
 EXPORT_SYMBOL(ip_vs_proto_name);
 EXPORT_SYMBOL(ip_vs_conn_new);
 EXPORT_SYMBOL(ip_vs_conn_in_get);
Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_ftp.c
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_ftp.c	2010-07-11 17:30:19.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/ip_vs_ftp.c	2010-07-11 18:01:58.000000000 +0900
@@ -20,6 +20,17 @@
  *
  * Author:	Wouter Gadeyne
  *
+ *
+ * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
+ * http://www.ssi.bg/~ja/nfct/:
+ *
+ * ip_vs_nfct.c:	Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2008
+ * Julian Anastasov
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -32,6 +43,9 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
 #include <linux/gfp.h>
 #include <net/protocol.h>
 #include <net/tcp.h>
@@ -43,6 +57,16 @@
 #define SERVER_STRING "227 Entering Passive Mode ("
 #define CLIENT_STRING "PORT "
 
+#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
+			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+			(T)->dst.protonum
+
+#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
+			&((C)->vaddr.ip), ntohs((C)->vport), \
+			&((C)->daddr.ip), ntohs((C)->dport), \
+			(C)->protocol, (C)->state
 
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -123,6 +147,119 @@ static int ip_vs_ftp_get_addrport(char *
 	return 1;
 }
 
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void
+ip_vs_expect_callback(struct nf_conn *ct,
+		      struct nf_conntrack_expect *exp)
+{
+	struct nf_conntrack_tuple *orig, new_reply;
+	struct ip_vs_conn *cp;
+
+	if (exp->tuple.src.l3num != PF_INET)
+		return;
+
+	/*
+	 * We assume that no NF locks are held before this callback.
+	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+	 * expectations even if they use wildcard values, now we provide the
+	 * actual values from the newly created original conntrack direction.
+	 * The conntrack is confirmed when packet reaches IPVS hooks.
+	 */
+
+	/* RS->CLIENT */
+	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
+				&orig->src.u3, orig->src.u.tcp.port,
+				&orig->dst.u3, orig->dst.u.tcp.port);
+	if (cp) {
+		/* Change reply CLIENT->RS to CLIENT->VS */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.dst.u3 = cp->vaddr;
+		new_reply.dst.u.tcp.port = cp->vport;
+		IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+			  ", inout cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	/* CLIENT->VS */
+	cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
+			       &orig->src.u3, orig->src.u.tcp.port,
+			       &orig->dst.u3, orig->dst.u.tcp.port);
+	if (cp) {
+		/* Change reply VS->CLIENT to RS->CLIENT */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.src.u3 = cp->daddr;
+		new_reply.src.u.tcp.port = cp->dport;
+		IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+		  " - unknown expect\n",
+		  __func__, ct, ct->status, ARG_TUPLE(orig));
+	return;
+
+alter:
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+		nf_conntrack_alter_reply(ct, &new_reply);
+	ip_vs_conn_put(cp);
+	return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ */
+static void
+ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+		     struct ip_vs_conn *cp, u_int8_t proto,
+		     const __be16 *port, int from_rs)
+{
+	struct nf_conntrack_expect *exp;
+
+	BUG_ON(!ct || ct == &nf_conntrack_untracked);
+
+	exp = nf_ct_expect_alloc(ct);
+	if (!exp)
+		return;
+
+	if (from_rs)
+		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+				  nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
+				  proto, port, &cp->cport);
+	else
+		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+				  nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
+				  proto, port, &cp->vport);
+
+	exp->expectfn = ip_vs_expect_callback;
+
+	IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
+		  __func__, ct, ARG_TUPLE(&exp->tuple));
+	nf_ct_expect_related(exp);
+	nf_ct_expect_put(exp);
+}
 
 /*
  * Look at outgoing ftp packets to catch the response to a PASV command
@@ -149,7 +286,9 @@ static int ip_vs_ftp_out(struct ip_vs_ap
 	struct ip_vs_conn *n_cp;
 	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
 	unsigned buf_len;
-	int ret;
+	int ret = 0;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -219,19 +358,26 @@ static int ip_vs_ftp_out(struct ip_vs_ap
 
 		buf_len = strlen(buf);
 
+		ct = nf_ct_get(skb, &ctinfo);
+		if (ct && !nf_ct_is_untracked(ct)) {
+			/* If mangling fails this function will return 0
+			 * which will cause the packet to be dropped.
+			 * Mangling can only fail under memory pressure,
+			 * hopefully it will succeed on the retransmitted
+			 * packet.
+			 */
+			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						       start-data, end-start,
+						       buf, buf_len);
+			if (ret)
+				ip_vs_expect_related(skb, ct, n_cp,
+						     IPPROTO_TCP, NULL, 0);
+		}
+
 		/*
-		 * Calculate required delta-offset to keep TCP happy
+		 * Not setting 'diff' is intentional, otherwise the sequence
+		 * would be adjusted twice.
 		 */
-		*diff = buf_len - (end-start);
-
-		if (*diff == 0) {
-			/* simply replace it with new passive address */
-			memcpy(start, buf, buf_len);
-			ret = 1;
-		} else {
-			ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
-					  end-start, buf, buf_len);
-		}
 
 		cp->app_data = NULL;
 		ip_vs_tcp_conn_listen(n_cp);
@@ -263,6 +409,7 @@ static int ip_vs_ftp_in(struct ip_vs_app
 	union nf_inet_addr to;
 	__be16 port;
 	struct ip_vs_conn *n_cp;
+	struct nf_conn *ct;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -349,6 +496,11 @@ static int ip_vs_ftp_in(struct ip_vs_app
 		ip_vs_control_add(n_cp, cp);
 	}
 
+	ct = (struct nf_conn *)skb->nfct;
+	if (ct && ct != &nf_conntrack_untracked)
+		ip_vs_expect_related(skb, ct, n_cp,
+				     IPPROTO_TCP, &n_cp->dport, 1);
+
 	/*
 	 *	Move tunnel to listen state
 	 */

^ permalink raw reply

* [patch v2.6 2/4] IPVS: make friends with nf_conntrack
From: horms, Simon Horman @ 2010-07-11  9:03 UTC (permalink / raw)
  To: lvs-devel, netdev, linux-kernel, netfilter, netfilter-devel
  Cc: Malcolm Turnbull, Wensong Zhang, Julius Volz, Patrick McHardy,
	David S. Miller, Hannes Eder
In-Reply-To: <20100711090342.035149543@vergenet.net>

[-- Attachment #1: IPVS-make-friends-with-nf_conntrack.patch --]
[-- Type: text/plain, Size: 5649 bytes --]

From:	Hannes Eder <heder@google.com>

Update the nf_conntrack tuple in reply direction, as we will see
traffic from the real server (RIP) to the client (CIP).  Once this is
done we can use netfilters SNAT in POSTROUTING, especially with
xt_ipvs, to do source NAT, e.g.:

% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 --vport 80 \
> -j SNAT --to-source 192.168.10.10

[ minor fixes by Simon Horman <horms@verge.net.au> ]
Signed-off-by: Hannes Eder <heder@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>

--- 
 net/netfilter/ipvs/Kconfig      |    2 +-
 net/netfilter/ipvs/ip_vs_core.c |   36 ------------------------------------
 net/netfilter/ipvs/ip_vs_xmit.c |   29 +++++++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 37 deletions(-)

v2.4
As per advice from Patrick McHardy
* Use nf_conntrack_untracked() instead of &nf_conntrack_untracked

v2.1, v2.2, v2.3
No change

Index: nf-next-2.6/net/netfilter/ipvs/Kconfig
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/Kconfig	2010-07-07 13:24:31.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/Kconfig	2010-07-07 13:38:23.000000000 +0900
@@ -3,7 +3,7 @@
 #
 menuconfig IP_VS
 	tristate "IP virtual server support"
-	depends on NET && INET && NETFILTER
+	depends on NET && INET && NETFILTER && NF_CONNTRACK
 	---help---
 	  IP Virtual Server support will let you build a high-performance
 	  virtual server based on cluster of two or more real servers. This
Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_core.c
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_core.c	2010-07-07 13:23:37.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/ip_vs_core.c	2010-07-07 13:38:23.000000000 +0900
@@ -536,26 +536,6 @@ int ip_vs_leave(struct ip_vs_service *sv
 	return NF_DROP;
 }
 
-
-/*
- *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- *      chain, and is used for VS/NAT.
- *      It detects packets for VS/NAT connections and sends the packets
- *      immediately. This can avoid that iptable_nat mangles the packets
- *      for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
-				       struct sk_buff *skb,
-				       const struct net_device *in,
-				       const struct net_device *out,
-				       int (*okfn)(struct sk_buff *))
-{
-	if (!skb->ipvs_property)
-		return NF_ACCEPT;
-	/* The packet was sent from IPVS, exit this chain */
-	return NF_STOP;
-}
-
 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 {
 	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -1499,14 +1479,6 @@ static struct nf_hook_ops ip_vs_ops[] __
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
-	{
-		.hook		= ip_vs_post_routing,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP_PRI_NAT_SRC-1,
-	},
 #ifdef CONFIG_IP_VS_IPV6
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
@@ -1535,14 +1507,6 @@ static struct nf_hook_ops ip_vs_ops[] __
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
-	{
-		.hook		= ip_vs_post_routing,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET6,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP6_PRI_NAT_SRC-1,
-	},
 #endif
 };
 
Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_xmit.c
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_xmit.c	2010-07-07 13:23:37.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/ip_vs_xmit.c	2010-07-07 13:42:22.000000000 +0900
@@ -28,6 +28,7 @@
 #include <net/ip6_route.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
 #include <linux/netfilter_ipv4.h>
 
 #include <net/ip_vs.h>
@@ -348,6 +349,30 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb
 }
 #endif
 
+static void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
+	struct nf_conntrack_tuple new_tuple;
+
+	if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
+		return;
+
+	/*
+	 * The connection is not yet in the hashtable, so we update it.
+	 * CIP->VIP will remain the same, so leave the tuple in
+	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+	 * real-server we will see RIP->DIP.
+	 */
+	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	new_tuple.src.u3 = cp->daddr;
+	/*
+	 * This will also take care of UDP and other protocols.
+	 */
+	new_tuple.src.u.tcp.port = cp->dport;
+	nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
 /*
  *      NAT transmitter (only for outside-to-inside nat forwarding)
  *      Not used for related ICMP
@@ -403,6 +428,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
+	ip_vs_update_conntrack(skb, cp);
+
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
 	   MTU problem. */
@@ -479,6 +506,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
+	ip_vs_update_conntrack(skb, cp);
+
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
 	   MTU problem. */


^ permalink raw reply

* [patch v2.6 1/4] netfilter: xt_ipvs (netfilter matcher for IPVS)
From: horms, Simon Horman @ 2010-07-11  9:03 UTC (permalink / raw)
  To: lvs-devel, netdev, linux-kernel, netfilter, netfilter-devel
  Cc: Malcolm Turnbull, Wensong Zhang, Julius Volz, Patrick McHardy,
	David S. Miller, Hannes Eder
In-Reply-To: <20100711090342.035149543@vergenet.net>

[-- Attachment #1: netfilter-xt_ipvs-netfilter-matcher-for-IPVS.patch --]
[-- Type: text/plain, Size: 8873 bytes --]

From:	Hannes Eder <heder@google.com>

This implements the kernel-space side of the netfilter matcher xt_ipvs.

[ minor fixes by Simon Horman <horms@verge.net.au> ]
Signed-off-by: Hannes Eder <heder@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>

--- 

 include/linux/netfilter/xt_ipvs.h |   25 ++++
 net/netfilter/Kconfig             |   10 +
 net/netfilter/Makefile            |    1 
 net/netfilter/ipvs/ip_vs_proto.c  |    1 
 net/netfilter/xt_ipvs.c           |  189 +++++++++++++++++++++++++++++++++++++
 5 files changed, 226 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/netfilter/xt_ipvs.h
 create mode 100644 net/netfilter/xt_ipvs.c

v2.5
* Use nf_ct_is_untracked(ct) instead of ct == nf_ct_is_untracked(ct),
  the later is blatantly incorrect.

v2.4
As per advice from Patrick McHardy
* Reduce size of l4proto and fwd_method members of struct xt_ipvs_mtinf
  from __u16 to __u8
* Use nf_conntrack_untracked() instead of &nf_conntrack_untracked

v2.3
As per advice from Patrick McHardy
* Don't define a value for _XT_IPVS_H in xt_ipvs.h
* Depend on NF_CONNTRACK
* Update to new API
  - ipvs_mt_check() should return an int rather than a bool
  - Change type of ipvs_mt()'s par parameter from
    struct xt_action_param to struct xt_match_param
  - Make ipvs_mt()'s par parameter non-const

v2.1, v2.2
No Change
Index: nf-next-2.6/include/linux/netfilter/xt_ipvs.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ nf-next-2.6/include/linux/netfilter/xt_ipvs.h	2010-07-10 11:49:23.000000000 +0900
@@ -0,0 +1,25 @@
+#ifndef _XT_IPVS_H
+#define _XT_IPVS_H
+
+#define XT_IPVS_IPVS_PROPERTY	(1 << 0) /* all other options imply this one */
+#define XT_IPVS_PROTO		(1 << 1)
+#define XT_IPVS_VADDR		(1 << 2)
+#define XT_IPVS_VPORT		(1 << 3)
+#define XT_IPVS_DIR		(1 << 4)
+#define XT_IPVS_METHOD		(1 << 5)
+#define XT_IPVS_VPORTCTL	(1 << 6)
+#define XT_IPVS_MASK		((1 << 7) - 1)
+#define XT_IPVS_ONCE_MASK	(XT_IPVS_MASK & ~XT_IPVS_IPVS_PROPERTY)
+
+struct xt_ipvs_mtinfo {
+	union nf_inet_addr	vaddr, vmask;
+	__be16			vport;
+	__u8			l4proto;
+	__u8			fwd_method;
+	__be16			vportctl;
+
+	__u8			invert;
+	__u8			bitmask;
+};
+
+#endif /* _XT_IPVS_H */
Index: nf-next-2.6/net/netfilter/Kconfig
===================================================================
--- nf-next-2.6.orig/net/netfilter/Kconfig	2010-07-10 11:48:54.000000000 +0900
+++ nf-next-2.6/net/netfilter/Kconfig	2010-07-10 11:49:23.000000000 +0900
@@ -726,6 +726,16 @@ config NETFILTER_XT_MATCH_IPRANGE
 
 	If unsure, say M.
 
+config NETFILTER_XT_MATCH_IPVS
+	tristate '"ipvs" match support'
+	depends on IP_VS
+	depends on NETFILTER_ADVANCED
+	depends on NF_CONNTRACK
+	help
+	  This option allows you to match against IPVS properties of a packet.
+
+	  If unsure, say N.
+
 config NETFILTER_XT_MATCH_LENGTH
 	tristate '"length" match support'
 	depends on NETFILTER_ADVANCED
Index: nf-next-2.6/net/netfilter/Makefile
===================================================================
--- nf-next-2.6.orig/net/netfilter/Makefile	2010-07-10 11:48:54.000000000 +0900
+++ nf-next-2.6/net/netfilter/Makefile	2010-07-10 11:49:23.000000000 +0900
@@ -76,6 +76,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMI
 obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_proto.c
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_proto.c	2010-07-10 11:48:54.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/ip_vs_proto.c	2010-07-10 11:49:23.000000000 +0900
@@ -98,6 +98,7 @@ struct ip_vs_protocol * ip_vs_proto_get(
 
 	return NULL;
 }
+EXPORT_SYMBOL(ip_vs_proto_get);
 
 
 /*
Index: nf-next-2.6/net/netfilter/xt_ipvs.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ nf-next-2.6/net/netfilter/xt_ipvs.c	2010-07-10 11:54:50.000000000 +0900
@@ -0,0 +1,189 @@
+/*
+ *	xt_ipvs - kernel module to match IPVS connection properties
+ *
+ *	Author: Hannes Eder <heder@google.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#endif
+#include <linux/ip_vs.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_ipvs.h>
+#include <net/netfilter/nf_conntrack.h>
+
+#include <net/ip_vs.h>
+
+MODULE_AUTHOR("Hannes Eder <heder@google.com>");
+MODULE_DESCRIPTION("Xtables: match IPVS connection properties");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ipvs");
+MODULE_ALIAS("ip6t_ipvs");
+
+/* borrowed from xt_conntrack */
+static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr,
+			    const union nf_inet_addr *uaddr,
+			    const union nf_inet_addr *umask,
+			    unsigned int l3proto)
+{
+	if (l3proto == NFPROTO_IPV4)
+		return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0;
+#ifdef CONFIG_IP_VS_IPV6
+	else if (l3proto == NFPROTO_IPV6)
+		return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6,
+		       &uaddr->in6) == 0;
+#endif
+	else
+		return false;
+}
+
+static bool
+ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_ipvs_mtinfo *data = par->matchinfo;
+	/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
+	const u_int8_t family = par->family;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn *cp;
+	bool match = true;
+
+	if (data->bitmask == XT_IPVS_IPVS_PROPERTY) {
+		match = skb->ipvs_property ^
+			!!(data->invert & XT_IPVS_IPVS_PROPERTY);
+		goto out;
+	}
+
+	/* other flags than XT_IPVS_IPVS_PROPERTY are set */
+	if (!skb->ipvs_property) {
+		match = false;
+		goto out;
+	}
+
+	ip_vs_fill_iphdr(family, skb_network_header(skb), &iph);
+
+	if (data->bitmask & XT_IPVS_PROTO)
+		if ((iph.protocol == data->l4proto) ^
+		    !(data->invert & XT_IPVS_PROTO)) {
+			match = false;
+			goto out;
+		}
+
+	pp = ip_vs_proto_get(iph.protocol);
+	if (unlikely(!pp)) {
+		match = false;
+		goto out;
+	}
+
+	/*
+	 * Check if the packet belongs to an existing entry
+	 */
+	cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */);
+	if (unlikely(cp == NULL)) {
+		match = false;
+		goto out;
+	}
+
+	/*
+	 * We found a connection, i.e. ct != 0, make sure to call
+	 * __ip_vs_conn_put before returning.  In our case jump to out_put_con.
+	 */
+
+	if (data->bitmask & XT_IPVS_VPORT)
+		if ((cp->vport == data->vport) ^
+		    !(data->invert & XT_IPVS_VPORT)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_VPORTCTL)
+		if ((cp->control != NULL &&
+		     cp->control->vport == data->vportctl) ^
+		    !(data->invert & XT_IPVS_VPORTCTL)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_DIR) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct == NULL || nf_ct_is_untracked(ct)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+		if ((ctinfo >= IP_CT_IS_REPLY) ^
+		    !!(data->invert & XT_IPVS_DIR)) {
+			match = false;
+			goto out_put_cp;
+		}
+	}
+
+	if (data->bitmask & XT_IPVS_METHOD)
+		if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^
+		    !(data->invert & XT_IPVS_METHOD)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_VADDR) {
+		if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr,
+				    &data->vmask, family) ^
+		    !(data->invert & XT_IPVS_VADDR)) {
+			match = false;
+			goto out_put_cp;
+		}
+	}
+
+out_put_cp:
+	__ip_vs_conn_put(cp);
+out:
+	pr_debug("match=%d\n", match);
+	return match;
+}
+
+static int ipvs_mt_check(const struct xt_mtchk_param *par)
+{
+	if (par->family != NFPROTO_IPV4
+#ifdef CONFIG_IP_VS_IPV6
+	    && par->family != NFPROTO_IPV6
+#endif
+		) {
+		pr_info("protocol family %u not supported\n", par->family);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match xt_ipvs_mt_reg __read_mostly = {
+	.name       = "ipvs",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = ipvs_mt,
+	.checkentry = ipvs_mt_check,
+	.matchsize  = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+	.me         = THIS_MODULE,
+};
+
+static int __init ipvs_mt_init(void)
+{
+	return xt_register_match(&xt_ipvs_mt_reg);
+}
+
+static void __exit ipvs_mt_exit(void)
+{
+	xt_unregister_match(&xt_ipvs_mt_reg);
+}
+
+module_init(ipvs_mt_init);
+module_exit(ipvs_mt_exit);

^ permalink raw reply

* [patch v2.6 0/4] IPVS full NAT support + netfilter 'ipvs' match support
From: horms, Simon Horman @ 2010-07-11  9:03 UTC (permalink / raw)
  To: lvs-devel, netdev, linux-kernel, netfilter, netfilter-devel
  Cc: Malcolm Turnbull, Wensong Zhang, Julius Volz, Patrick McHardy,
	David S. Miller, Hannes Eder

This is a repost of a patch-series posted by Hannes Eder last September.
This is v2 of the patch series and I don't see any outstanding objections to
it in the mailing list archives.

Series v2.6 fixes the arguments to  of %pI4

Series v2.5 fixes some problems introduced in v2.4.

Series v2.4 addresses all of the concerns that Patrick McHardy raised
witht the v2.3 series.

Malcolm Turnbull has offered to test this code so I'd like to get
a Reviewed-by from him before the code gets merged. In other words,
at this stage these patches are for review not merging.

The original cover-email from Hannes follows.
The diffstat output has been updated to reflect minor up-porting by me.

From:	Hannes Eder <heder@google.com>

The following series implements full NAT support for IPVS.  The
approach is via a minimal change to IPVS (make friends with
nf_conntrack) and adding a netfilter matcher, kernel- and user-space
part, i.e. xt_ipvs and libxt_ipvs.

Example usage:

% ipvsadm -A -t 192.168.100.30:80 -s rr
% ipvsadm -a -t 192.168.100.30:80 -r 192.168.10.20:80 -m
# ...

# Source NAT for VIP 192.168.100.30:80
% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
> --vport 80 -j SNAT --to-source 192.168.10.10

or SNAT-ing only a specific real server:

% iptables -t nat -A POSTROUTING --dst 192.168.11.20 \
> -m ipvs --vaddr 192.168.100.30/32 -j SNAT --to-source 192.168.10.10


First of all, thanks for all the feedback.  This is the changelog for v2:

- Make ip_vs_ftp work again.  Setup nf_conntrack expectations for
  related data connections (based on Julian's patch see
  http://www.ssi.bg/~ja/nfct/) and let nf_conntrack/nf_nat do the
  packet mangling and the TCP sequence adjusting.

  This change rises the question how to deal with ip_vs_sync?  Does it
  work together with conntrackd?  Wild idea: what about getting rid of
  ip_vs_sync and piggy packing all on nf_conntrack and use conntrackd?

  Any comments on this?

- xt_ipvs: add new rule '--vportctl port' to match the VIP port of the
  controlling connection, e.g. port 21 for FTP.  Can be used to match
  a related data connection for FTP:

  # SNAT FTP control connection
  % iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
  > --vport 21 -j SNAT --to-source 192.168.10.10
  
  # SNAT FTP passive data connection
  % iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
  > --vportctl 21 -j SNAT --to-source 192.168.10.10

- xt_ipvs: use 'par->family' instead of 'skb->protocol'

- xt_ipvs: add ipvs_mt_check and restrict to NFPROTO_IPV4 and NFPROTO_IPV6

- Call nf_conntrack_alter_reply(), so helper lookup is performed based
  on the changed tuple.

Changes to the linux kernel
(nf-next-2.6, "bridge: add per bridge device controls for invoking iptables")

Hannes Eder (3):
      netfilter: xt_ipvs (netfilter matcher for IPVS)
      IPVS: make friends with nf_conntrack
      IPVS: make FTP work with full NAT support


 include/linux/netfilter/xt_ipvs.h |   25 +++++
 include/net/ip_vs.h              |    2 
 net/netfilter/Kconfig            |   10 ++
 net/netfilter/Makefile           |    1 
 net/netfilter/ipvs/Kconfig       |    4 
 net/netfilter/ipvs/ip_vs_app.c   |   43 ---------
 net/netfilter/ipvs/ip_vs_core.c  |   37 --------
 net/netfilter/ipvs/ip_vs_ftp.c   |  174 +++++++++++++++++++++++++++++++++++---
 net/netfilter/ipvs/ip_vs_proto.c |    1 
 net/netfilter/ipvs/ip_vs_xmit.c  |   29 ++++++
 net/netfilter/xt_ipvs.c           |  189 +++++++++++++++++++++++++++++++++++++
 11 files changed, 420 insertions(+), 95 deletions(-)
 create mode 100644 include/linux/netfilter/xt_ipvs.h
 create mode 100644 net/netfilter/xt_ipvs.c


Changes to iptables
(iptables.git, "xt_quota: also document negation")

Hannes Eder (1):
      libxt_ipvs: user-space lib for netfilter matcher xt_ipvs

 configure.ac                      |   10 1
 extensions/libxt_ipvs.c           |  365 +++++++++++++++++++++++++++++++++++++
 extensions/libxt_ipvs.man         |   24 ++
 include/linux/netfilter/xt_ipvs.h |   25 +++
 4 files changed, 422 insertions(+), 2 deletions(-)
 create mode 100644 extensions/libxt_ipvs.c
 create mode 100644 extensions/libxt_ipvs.man
 create mode 100644 include/linux/netfilter/xt_ipvs.h


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [patch v2.5 3/4] IPVS: make FTP work with full NAT support
From: Simon Horman @ 2010-07-11  9:01 UTC (permalink / raw)
  To: lvs-devel, netdev, linux-kernel, netfilter, netfilter-devel
  Cc: Malcolm Turnbull, Wensong Zhang, Julius Volz, Patrick McHardy,
	David S. Miller, Hannes Eder
In-Reply-To: <20100710030222.732009020@vergenet.net>

On Sat, Jul 10, 2010 at 12:01:00PM +0900, Simon Horman wrote:
> From:	Hannes Eder <heder@google.com>
> 
> Use nf_conntrack/nf_nat code to do the packet mangling and the TCP
> sequence adjusting.  The function 'ip_vs_skb_replace' is now dead
> code, so it is removed.
> 
> To SNAT FTP, use something like:
> 
> % iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
> > --vport 21 -j SNAT --to-source 192.168.10.10
> 
> and for the data connections in passive mode:
> 
> % iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
> > --vportctl 21 -j SNAT --to-source 192.168.10.10
> 
> using '-m state --state RELATED' would also works.
> 
> Make sure the kernel modules ip_vs_ftp, nf_conntrack_ftp, and
> nf_nat_ftp are loaded.

[snip]

> Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_ftp.c
> ===================================================================
> --- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_ftp.c	2010-07-10 11:48:54.000000000 +0900
> +++ nf-next-2.6/net/netfilter/ipvs/ip_vs_ftp.c	2010-07-10 11:59:19.000000000 +0900

[snip]

> @@ -43,6 +57,16 @@
>  #define SERVER_STRING "227 Entering Passive Mode ("
>  #define CLIENT_STRING "PORT "
>  
> +#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
> +#define ARG_TUPLE(T)	(T)->src.u3.ip, ntohs((T)->src.u.all), \
> +			(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
> +			(T)->dst.protonum
> +
> +#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
> +#define ARG_CONN(C)	(C)->caddr, ntohs((C)->cport), \
> +			(C)->vaddr, ntohs((C)->vport), \
> +			(C)->daddr, ntohs((C)->dport), \
> +			(C)->protocol, (C)->state
>  
>  /*
>   * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper

The argument to the %pI4 needs to be a pointer
so (T)->src.u3.ip should be &(T)->src.u3.ip and
(C)->caddr should be &(C)->caddr.ip.

I'm not sure how this slipped through the cracks so far.
I will repost.

^ permalink raw reply

* [PATCH net-next-2.6] net: sock_free() optimizations
From: Eric Dumazet @ 2010-07-11  8:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Avoid two extra instructions in sock_free(), to reload
skb->truesize and skb->sk

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/core/sock.c b/net/core/sock.c
index fef2434..363bc26 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1339,9 +1339,10 @@ EXPORT_SYMBOL(sock_wfree);
 void sock_rfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
+	unsigned int len = skb->truesize;
 
-	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
-	sk_mem_uncharge(skb->sk, skb->truesize);
+	atomic_sub(len, &sk->sk_rmem_alloc);
+	sk_mem_uncharge(sk, len);
 }
 EXPORT_SYMBOL(sock_rfree);
 



^ permalink raw reply related

* Re: [PATCH] tproxy: nf_tproxy_assign_sock() can handle tw sockets
From: Eric Dumazet @ 2010-07-11  8:36 UTC (permalink / raw)
  To: David Miller; +Cc: felipewd, kaber, linux-kernel, netdev
In-Reply-To: <20100710.123011.260095467.davem@davemloft.net>

Le samedi 10 juillet 2010 à 12:30 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Sat, 10 Jul 2010 08:17:29 +0200
> 
> > Strange thing with your crash report is CR2 value, with unexpected value
> > of 000000000b388000 while RAX value is dce8dce85d415d41
> > 
> > Faulting instruction is :
> > 
> > 48 83 b8 b0 00 00 00 00   cmpq   $0x0,0xb0(%rax)
> > 
> > So I would have expected CR2 being RAX+0xb0, but its not.
> 
> It could be corruption from elsewhere.  Those last four hex
> digits (0x5d415d41) are "]A]A" in ascii, but that could just
> be coincidence.
> 

x86 being litle endian, string is "A]A]" followed by another "XYXY"
pattern (non ASCII chars : 0xE8, 0xDC, 0xE8, 0xDC, "èÜèÜ" in ISO8859)

^ permalink raw reply

* Re: [PATCH] tproxy: nf_tproxy_assign_sock() can handle tw sockets
From: Eric Dumazet @ 2010-07-11  8:02 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Felipe W Damasio, David Miller, Patrick McHardy, linux-kernel,
	netdev
In-Reply-To: <4C395459.6080407@redhat.com>

Le dimanche 11 juillet 2010 à 08:19 +0300, Avi Kivity a écrit :
> On 07/10/2010 09:17 AM, Eric Dumazet wrote:
> >
> > Strange thing with your crash report is CR2 value, with unexpected value
> > of 000000000b388000 while RAX value is dce8dce85d415d41
> >
> > Faulting instruction is :
> >
> > 48 83 b8 b0 00 00 00 00   cmpq   $0x0,0xb0(%rax)
> >
> > So I would have expected CR2 being RAX+0xb0, but its not.
> >    
> 
> Nothing strange about it.  You only get page faults and valid cr2 for 
> canonical addresses (17 high order bits all equal).  In this case 
> rax+0xb0 is not a canonical address, so you got a general protection 
> fault instead, with cr2 unchanged.
> 

OK, thanks Avi for this information, as I was not aware of this.

So something overwrote sk->sk_prot pointer (or skb->sk pointer) with
some data.

tcp sockets are allocated from a dedicated kmem_cache (because of
SLAB_DESTROY_RCU attribute). Their sk->sk_prot should never change in
normal operation, since underlying memory cannot be reused by another
object type in kernel. It should be NULL or &tcp_prot

Felipe, please describe your configuration as much as possible.
It might be a driver bug with with special kind of network frames.

lsmod
lspci -v
ethtool -k eth0
ethtool -k eth1 (if applicable)

^ permalink raw reply

* Re: [PATCH] tproxy: nf_tproxy_assign_sock() can handle tw sockets
From: Felipe W Damasio @ 2010-07-11  7:13 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, kaber, linux-kernel, netdev
In-Reply-To: <AANLkTikqNLp5HN0Puu00_bbJEQCUNCCtSsOFxuxfX5g0@mail.gmail.com>

2010/7/11 Felipe W Damasio <felipewd@gmail.com>:
>   The production machine has 8GB of RAM:

  I'm sorry, this is not right. The production machine has 16GB of RAM.

  Don't know if that matters regarding those proc parameters, though.

  Cheers,

Felipe Damasio

^ permalink raw reply

* Re: [PATCH] tproxy: nf_tproxy_assign_sock() can handle tw sockets
From: Felipe W Damasio @ 2010-07-11  7:11 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, kaber, linux-kernel, netdev
In-Reply-To: <AANLkTilQbM8F27a5SLZLMzndLYfz-eRhzyxP18KCFz2K@mail.gmail.com>

Hi,

2010/7/11 Felipe W Damasio <felipewd@gmail.com>:
> What do you mean "from elsewhere"? You mean elsewhere on the network code?
>
> Since the function that had the problem was tcp_recvmsg and we're
> talking about a squid process, we're either talking about a typical
> webserver-objet response, or about about an incorrect/faulty http
> request from the user.

I'm trying to understand the network code to see if I can help track
this down...

So I looked at the code of tcp_recvmsg, and I saw that the function
that calls __kfree_skb is sk_eat_skb, which is called in tcp_recvmsg:

                if (tcp_hdr(skb)->fin)
                        goto found_fin_ok;
                if (!(flags & MSG_PEEK)) {
                        sk_eat_skb(sk, skb, copied_early);
                        copied_early = 0;
                }
                continue;

        found_fin_ok:
                /* Process the FIN. */
                ++*seq;
                if (!(flags & MSG_PEEK)) {
                        sk_eat_skb(sk, skb, copied_early);
                        copied_early = 0;
                }
                break;


  Now, I'm no kernel programmer, but I saw the "Process the FIN" thing
on the code. I'm tuning these proc parameters....maybe some of these
are triggering the bug?

   The production machine has 8GB of RAM:


echo 1 >  /proc/sys/net/ipv4/ip_forward
echo 0 > /proc/sys/net/ipv4/conf/lo/rp_filter
echo 1 > /proc/sys/net/ipv4/ip_nonlocal_bind
echo 1 >  /proc/sys/net/ipv4/tcp_low_latency

echo 0 > /proc/sys/net/ipv4/conf/eth2/rp_filter
echo 0 > /proc/sys/net/ipv4/conf/eth1/rp_filter
echo 0 > /proc/sys/net/ipv4/conf/eth0/rp_filter
echo 0 > /proc/sys/net/ipv4/conf/br0/rp_filter

echo 1 > /proc/sys/net/ipv4/conf/all/forwarding
echo 1 > /proc/sys/net/ipv4/conf/all/send_redirects
echo 1 > /proc/sys/net/ipv4/conf/eth0/send_redirects

echo 16384 > /proc/sys/net/ipv4/neigh/default/gc_thresh1
echo 32768 > /proc/sys/net/ipv4/neigh/default/gc_thresh2
echo 65535 > /proc/sys/net/ipv4/neigh/default/gc_thresh3

echo 1024 65535 > /proc/sys/net/ipv4/ip_local_port_range

echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
echo 0 > /proc/sys/net/ipv4/tcp_ecn
echo 1 > /proc/sys/net/ipv4/tcp_low_latency
echo 100000 > /proc/sys/net/core/netdev_max_backlog
echo 409600  > /proc/sys/net/ipv4/tcp_max_syn_backlog


echo 3 > /proc/sys/net/ipv4/tcp_fin_timeout
echo 15 > /proc/sys/net/ipv4/tcp_keepalive_intvl
echo 3 > /proc/sys/net/ipv4/tcp_keepalive_probes
echo 65536 > /proc/sys/vm/min_free_kbytes
echo "262144 1024000 4194304" > /proc/sys/net/ipv4/tcp_rmem
echo "262144 1024000 4194304" > /proc/sys/net/ipv4/tcp_wmem
echo "1024000" > /proc/sys/net/core/rmem_max
echo "1024000" > /proc/sys/net/core/wmem_max
echo "512000" > /proc/sys/net/core/rmem_default
echo "512000" > /proc/sys/net/core/wmem_default
echo "524288" > /proc/sys/net/ipv4/netfilter/ip_conntrack_max

echo "2" > /proc/sys/net/ipv4/tcp_synack_retries
echo "2" > /proc/sys/net/ipv4/tcp_syn_retries
echo "262144" > /proc/sys/net/ipv4/tcp_max_orphans
echo "262144" > /proc/sys/net/core/somaxconn

  Maybe these can help?

  Cheers,

Felipe Damasio

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox