Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 1/2] phy/marvell: add 88e1121 interface mode support
From: Cyril Chemparathy @ 2010-08-02 19:44 UTC (permalink / raw)
  To: netdev; +Cc: Cyril Chemparathy
In-Reply-To: <1280778294-2993-1-git-send-email-cyril@ti.com>

This patch adds support for RGMII RX/TX delay configuration on marvell 88e1121
and derivatives.  With this patch, PHY_INTERFACE_MODE_RGMII_*ID modes are now
supported on these devices.

Signed-off-by: Cyril Chemparathy <cyril@ti.com>
---
 drivers/net/phy/marvell.c |   35 ++++++++++++++++++++++++++++++++---
 1 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 78b74e8..b1413ae 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -69,6 +69,12 @@
 #define MII_M1111_COPPER		0
 #define MII_M1111_FIBER			1
 
+#define MII_88E1121_PHY_MSCR_PAGE	2
+#define MII_88E1121_PHY_MSCR_REG	21
+#define MII_88E1121_PHY_MSCR_RX_DELAY	BIT(5)
+#define MII_88E1121_PHY_MSCR_TX_DELAY	BIT(4)
+#define MII_88E1121_PHY_MSCR_DELAY_MASK	(~(0x3 << 4))
+
 #define MII_88E1121_PHY_LED_CTRL	16
 #define MII_88E1121_PHY_LED_PAGE	3
 #define MII_88E1121_PHY_LED_DEF		0x0030
@@ -180,7 +186,30 @@ static int marvell_config_aneg(struct phy_device *phydev)
 
 static int m88e1121_config_aneg(struct phy_device *phydev)
 {
-	int err, temp;
+	int err, oldpage, mscr;
+
+	oldpage = phy_read(phydev, MII_88E1121_PHY_PAGE);
+
+	err = phy_write(phydev, MII_88E1121_PHY_PAGE,
+			MII_88E1121_PHY_MSCR_PAGE);
+	if (err < 0)
+		return err;
+	mscr = phy_read(phydev, MII_88E1121_PHY_MSCR_REG) &
+		MII_88E1121_PHY_MSCR_DELAY_MASK;
+
+	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID)
+		mscr |= (MII_88E1121_PHY_MSCR_RX_DELAY |
+			 MII_88E1121_PHY_MSCR_TX_DELAY);
+	else if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID)
+		mscr |= MII_88E1121_PHY_MSCR_RX_DELAY;
+	else if (phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID)
+		mscr |= MII_88E1121_PHY_MSCR_TX_DELAY;
+
+	err = phy_write(phydev, MII_88E1121_PHY_MSCR_REG, mscr);
+	if (err < 0)
+		return err;
+
+	phy_write(phydev, MII_88E1121_PHY_PAGE, oldpage);
 
 	err = phy_write(phydev, MII_BMCR, BMCR_RESET);
 	if (err < 0)
@@ -191,11 +220,11 @@ static int m88e1121_config_aneg(struct phy_device *phydev)
 	if (err < 0)
 		return err;
 
-	temp = phy_read(phydev, MII_88E1121_PHY_PAGE);
+	oldpage = phy_read(phydev, MII_88E1121_PHY_PAGE);
 
 	phy_write(phydev, MII_88E1121_PHY_PAGE, MII_88E1121_PHY_LED_PAGE);
 	phy_write(phydev, MII_88E1121_PHY_LED_CTRL, MII_88E1121_PHY_LED_DEF);
-	phy_write(phydev, MII_88E1121_PHY_PAGE, temp);
+	phy_write(phydev, MII_88E1121_PHY_PAGE, oldpage);
 
 	err = genphy_config_aneg(phydev);
 
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 0/2] Minor extensions to marvell phy driver
From: Cyril Chemparathy @ 2010-08-02 19:44 UTC (permalink / raw)
  To: netdev; +Cc: Cyril Chemparathy

This patch series adds a couple of minor extensions to the marvell phy driver.
The first patch in the series allows for RGMII TX and RX delay configuration
via interface mode.  The second patch adds support for a new device (88ec048).

Cyril Chemparathy (2):
  phy/marvell: add 88e1121 interface mode support
  phy/marvell: add 88ec048 support

 drivers/net/phy/marvell.c |   76 +++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 73 insertions(+), 3 deletions(-)

^ permalink raw reply

* [PATCH 2/2] phy/marvell: add 88ec048 support
From: Cyril Chemparathy @ 2010-08-02 19:44 UTC (permalink / raw)
  To: netdev; +Cc: Cyril Chemparathy
In-Reply-To: <1280778294-2993-1-git-send-email-cyril@ti.com>

Marvell 88ec048 is a derivative of its 88e1121r device.  From the programmer's
perspective, the one major difference is an additional control bit in Page 2
Register 16, which controls the padding of odd nibble preambles.

This patch adds support for this new device, while inheriting as much code as
possible from the existing 88e1121r implementation.

Signed-off-by: Cyril Chemparathy <cyril@ti.com>
---
 drivers/net/phy/marvell.c |   41 +++++++++++++++++++++++++++++++++++++++++
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index b1413ae..0887218 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -75,6 +75,9 @@
 #define MII_88E1121_PHY_MSCR_TX_DELAY	BIT(4)
 #define MII_88E1121_PHY_MSCR_DELAY_MASK	(~(0x3 << 4))
 
+#define MII_88EC048_PHY_MSCR1_REG	16
+#define MII_88EC048_PHY_MSCR1_PAD_ODD	BIT(6)
+
 #define MII_88E1121_PHY_LED_CTRL	16
 #define MII_88E1121_PHY_LED_PAGE	3
 #define MII_88E1121_PHY_LED_DEF		0x0030
@@ -231,6 +234,31 @@ static int m88e1121_config_aneg(struct phy_device *phydev)
 	return err;
 }
 
+static int m88ec048_config_aneg(struct phy_device *phydev)
+{
+	int err, oldpage, mscr;
+
+	oldpage = phy_read(phydev, MII_88E1121_PHY_PAGE);
+
+	err = phy_write(phydev, MII_88E1121_PHY_PAGE,
+			MII_88E1121_PHY_MSCR_PAGE);
+	if (err < 0)
+		return err;
+
+	mscr = phy_read(phydev, MII_88EC048_PHY_MSCR1_REG);
+	mscr |= MII_88EC048_PHY_MSCR1_PAD_ODD;
+
+	err = phy_write(phydev, MII_88E1121_PHY_MSCR_REG, mscr);
+	if (err < 0)
+		return err;
+
+	err = phy_write(phydev, MII_88E1121_PHY_PAGE, oldpage);
+	if (err < 0)
+		return err;
+
+	return m88e1121_config_aneg(phydev);
+}
+
 static int m88e1111_config_init(struct phy_device *phydev)
 {
 	int err;
@@ -622,6 +650,19 @@ static struct phy_driver marvell_drivers[] = {
 		.driver = { .owner = THIS_MODULE },
 	},
 	{
+		.phy_id = 0x01410e90,
+		.phy_id_mask = 0xfffffff0,
+		.name = "Marvell 88EC048",
+		.features = PHY_GBIT_FEATURES,
+		.flags = PHY_HAS_INTERRUPT,
+		.config_aneg = &m88ec048_config_aneg,
+		.read_status = &marvell_read_status,
+		.ack_interrupt = &marvell_ack_interrupt,
+		.config_intr = &marvell_config_intr,
+		.did_interrupt = &m88e1121_did_interrupt,
+		.driver = { .owner = THIS_MODULE },
+	},
+	{
 		.phy_id = 0x01410cd0,
 		.phy_id_mask = 0xfffffff0,
 		.name = "Marvell 88E1145",
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCH 01/11] pcmcia: use pcmica_{read,write}_config_byte
From: Dominik Brodowski @ 2010-08-02 19:52 UTC (permalink / raw)
  To: Komuro; +Cc: Michael Buesch, netdev, linux-pcmcia, linux-wireless,
	linux-serial
In-Reply-To: <15238373.192661280750366920.komurojun-mbn@nifty.com>

Hey,

On Mon, Aug 02, 2010 at 08:59:26PM +0900, Komuro wrote:
> >--- a/drivers/net/pcmcia/xirc2ps_cs.c
> >+++ b/drivers/net/pcmcia/xirc2ps_cs.c
> 
> 
> >+	if (err)
> > 	    goto config_error;
> >-	reg.Action = CS_WRITE;
> >-	reg.Offset = CISREG_IOBASE_1;
> >-	reg.Value = (link->io.BasePort2 >> 8) & 0xff;
> >-	if ((err = pcmcia_access_configuration_register(link, &reg)))
> >+
> >+	err = pcmcia_write_config_byte(link, CISREG_IOBASE_1,
> >+				link->io.BasePort2 & 0xff);
> 
> It should be
> 
> 	err = pcmcia_write_config_byte(link, CISREG_IOBASE_1,
> 				(link->io.BasePort2 >> 8) & 0xff);
> 

Fixed, thanks.

Best,
	Dominik

^ permalink raw reply

* [PATCH 01/28] netfilter: nf_conntrack_reasm: add fast path for in-order fragments
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

As fragments are sent in order by most OSes, such as Windows, Darwin and
FreeBSD, new fragments most likely belong at the end of the inet_frag_queue.
In the fast path, we check whether the skb at the tail of the inet_frag_queue
is the prev we expect.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 9254008..098a050 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -269,6 +269,11 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 	 * in the chain of fragments so far.  We must know where to put
 	 * this fragment, right?
 	 */
+	prev = fq->q.fragments_tail;
+	if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) {
+		next = NULL;
+		goto found;
+	}
 	prev = NULL;
 	for (next = fq->q.fragments; next != NULL; next = next->next) {
 		if (NFCT_FRAG6_CB(next)->offset >= offset)
@@ -276,6 +281,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 		prev = next;
 	}
 
+found:
 	/* We found where to put this one.  Check for overlap with
 	 * preceding fragment, and, if needed, align things so that
 	 * any overlaps are eliminated.
@@ -341,6 +347,8 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 
 	/* Insert this fragment in the chain of fragments. */
 	skb->next = next;
+	if (!next)
+		fq->q.fragments_tail = skb;
 	if (prev)
 		prev->next = skb;
 	else
@@ -464,6 +472,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 					  head->csum);
 
 	fq->q.fragments = NULL;
+	fq->q.fragments_tail = NULL;
 
 	/* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
 	fp = skb_shinfo(head)->frag_list;
-- 
1.7.1


^ permalink raw reply related

* [PATCH 04/28] ipvs: Kconfig cleanup
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Michal Marek <mmarek@suse.cz>

IP_VS_PROTO_AH_ESP should be set iff either of IP_VS_PROTO_{AH,ESP} is
selected. Express this with standard kconfig syntax.

Signed-off-by: Michal Marek <mmarek@suse.cz>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/Kconfig |    5 +----
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 712ccad..d80b41a 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -87,19 +87,16 @@ config	IP_VS_PROTO_UDP
 	  protocol. Say Y if unsure.
 
 config	IP_VS_PROTO_AH_ESP
-	bool
-	depends on UNDEFINED
+	def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
 
 config	IP_VS_PROTO_ESP
 	bool "ESP load balancing support"
-	select IP_VS_PROTO_AH_ESP
 	---help---
 	  This option enables support for load balancing ESP (Encapsulation
 	  Security Payload) transport protocol. Say Y if unsure.
 
 config	IP_VS_PROTO_AH
 	bool "AH load balancing support"
-	select IP_VS_PROTO_AH_ESP
 	---help---
 	  This option enables support for load balancing AH (Authentication
 	  Header) transport protocol. Say Y if unsure.
-- 
1.7.1


^ permalink raw reply related

* [PATCH 06/28] netfilter: xt_TPROXY: the length of lines should be within 80
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

According to Documentation/CodingStyle, lines should be no longer than 80
columns.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_TPROXY.c |    6 ++++--
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index e1a0ded..c61294d 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -37,8 +37,10 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		return NF_DROP;
 
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-				   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
-				   hp->source, tgi->lport ? tgi->lport : hp->dest,
+				   iph->saddr,
+				   tgi->laddr ? tgi->laddr : iph->daddr,
+				   hp->source,
+				   tgi->lport ? tgi->lport : hp->dest,
 				   par->in, true);
 
 	/* NOTE: assign_sock consumes our sk reference */
-- 
1.7.1


^ permalink raw reply related

* [PATCH 08/28] netfilter: nf_ct_tcp: fix flow recovery with TCP window tracking enabled
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch adds the missing bits to support the recovery of TCP flows
without disabling window tracking (aka be_liberal). To ensure a
successful recovery, we have to inject the window scale factor via
ctnetlink.

This patch has been tested with a development snapshot of conntrackd
and the new clause `TCPWindowTracking', which allows performing strict
TCP window tracking recovery across fail-overs.

With this patch, we don't update the receiver's window until it has been
initialized. We require this to perform a successful recovery. Jozsef
confirmed in a private email that this spotted a real issue, since that
should not happen.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_proto_tcp.c |   10 +++++++++-
 1 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 802dbff..c4c885d 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -585,8 +585,16 @@ static bool tcp_in_window(const struct nf_conn *ct,
 			 * Let's try to use the data from the packet.
 			 */
 			sender->td_end = end;
+			win <<= sender->td_scale;
 			sender->td_maxwin = (win == 0 ? 1 : win);
 			sender->td_maxend = end + sender->td_maxwin;
+			/*
+			 * We haven't seen traffic in the other direction yet
+			 * but we have to tweak window tracking to pass III
+			 * and IV until that happens.
+			 */
+			if (receiver->td_maxwin == 0)
+				receiver->td_end = receiver->td_maxend = sack;
 		}
 	} else if (((state->state == TCP_CONNTRACK_SYN_SENT
 		     && dir == IP_CT_DIR_ORIGINAL)
@@ -680,7 +688,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 		/*
 		 * Update receiver data.
 		 */
-		if (after(end, sender->td_maxend))
+		if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
 			receiver->td_maxwin += end - sender->td_maxend;
 		if (after(sack + win, receiver->td_maxend - 1)) {
 			receiver->td_maxend = sack + win;
-- 
1.7.1


^ permalink raw reply related

* [PATCH 10/28] netfilter: correct CHECKSUM header and export it
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Michael S. Tsirkin <mst@redhat.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/Kbuild        |    1 +
 include/linux/netfilter/xt_CHECKSUM.h |    8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index bb103f4..b93b64d 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -3,6 +3,7 @@ header-y += nf_conntrack_tuple_common.h
 header-y += nfnetlink_conntrack.h
 header-y += nfnetlink_log.h
 header-y += nfnetlink_queue.h
+header-y += xt_CHECKSUM.h
 header-y += xt_CLASSIFY.h
 header-y += xt_CONNMARK.h
 header-y += xt_CONNSECMARK.h
diff --git a/include/linux/netfilter/xt_CHECKSUM.h b/include/linux/netfilter/xt_CHECKSUM.h
index 3b4fb77..9a2e466 100644
--- a/include/linux/netfilter/xt_CHECKSUM.h
+++ b/include/linux/netfilter/xt_CHECKSUM.h
@@ -6,8 +6,10 @@
  *
  * This software is distributed under GNU GPL v2, 1991
 */
-#ifndef _IPT_CHECKSUM_TARGET_H
-#define _IPT_CHECKSUM_TARGET_H
+#ifndef _XT_CHECKSUM_TARGET_H
+#define _XT_CHECKSUM_TARGET_H
+
+#include <linux/types.h>
 
 #define XT_CHECKSUM_OP_FILL	0x01	/* fill in checksum in IP header */
 
@@ -15,4 +17,4 @@ struct xt_CHECKSUM_info {
 	__u8 operation;	/* bitset of operations */
 };
 
-#endif /* _IPT_CHECKSUM_TARGET_H */
+#endif /* _XT_CHECKSUM_TARGET_H */
-- 
1.7.1


^ permalink raw reply related

* [PATCH 12/28] IPVS: make friends with nf_conntrack
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Hannes Eder <heder@google.com>

Update the nf_conntrack tuple in reply direction, as we will see
traffic from the real server (RIP) to the client (CIP).  Once this is
done we can use netfilter's SNAT in POSTROUTING, especially with
xt_ipvs, to do source NAT, e.g.:

% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 --vport 80 \
		  -j SNAT --to-source 192.168.10.10

[ minor fixes by Simon Horman <horms@verge.net.au> ]
Signed-off-by: Hannes Eder <heder@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/Kconfig      |    2 +-
 net/netfilter/ipvs/ip_vs_core.c |   36 ------------------------------------
 net/netfilter/ipvs/ip_vs_xmit.c |   29 +++++++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index d80b41a..3662444 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -3,7 +3,7 @@
 #
 menuconfig IP_VS
 	tristate "IP virtual server support"
-	depends on NET && INET && NETFILTER
+	depends on NET && INET && NETFILTER && NF_CONNTRACK
 	---help---
 	  IP Virtual Server support will let you build a high-performance
 	  virtual server based on cluster of two or more real servers. This
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 50907d8..58f82df 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -536,26 +536,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	return NF_DROP;
 }
 
-
-/*
- *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- *      chain, and is used for VS/NAT.
- *      It detects packets for VS/NAT connections and sends the packets
- *      immediately. This can avoid that iptable_nat mangles the packets
- *      for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
-				       struct sk_buff *skb,
-				       const struct net_device *in,
-				       const struct net_device *out,
-				       int (*okfn)(struct sk_buff *))
-{
-	if (!skb->ipvs_property)
-		return NF_ACCEPT;
-	/* The packet was sent from IPVS, exit this chain */
-	return NF_STOP;
-}
-
 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 {
 	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -1499,14 +1479,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
-	{
-		.hook		= ip_vs_post_routing,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP_PRI_NAT_SRC-1,
-	},
 #ifdef CONFIG_IP_VS_IPV6
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
@@ -1535,14 +1507,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
-	{
-		.hook		= ip_vs_post_routing,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET6,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP6_PRI_NAT_SRC-1,
-	},
 #endif
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 02b078e..21e1a5e 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -28,6 +28,7 @@
 #include <net/ip6_route.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
 #include <linux/netfilter_ipv4.h>
 
 #include <net/ip_vs.h>
@@ -348,6 +349,30 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 }
 #endif
 
+static void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
+	struct nf_conntrack_tuple new_tuple;
+
+	if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
+		return;
+
+	/*
+	 * The connection is not yet in the hashtable, so we update it.
+	 * CIP->VIP will remain the same, so leave the tuple in
+	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+	 * real-server we will see RIP->DIP.
+	 */
+	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	new_tuple.src.u3 = cp->daddr;
+	/*
+	 * This will also take care of UDP and other protocols.
+	 */
+	new_tuple.src.u.tcp.port = cp->dport;
+	nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
 /*
  *      NAT transmitter (only for outside-to-inside nat forwarding)
  *      Not used for related ICMP
@@ -403,6 +428,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
+	ip_vs_update_conntrack(skb, cp);
+
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
 	   MTU problem. */
@@ -479,6 +506,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
+	ip_vs_update_conntrack(skb, cp);
+
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
 	   MTU problem. */
-- 
1.7.1


^ permalink raw reply related

* [PATCH 13/28] IPVS: make FTP work with full NAT support
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Hannes Eder <heder@google.com>

Use nf_conntrack/nf_nat code to do the packet mangling and the TCP
sequence adjusting.  The function 'ip_vs_skb_replace' is now dead
code, so it is removed.

To SNAT FTP, use something like:

% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
    --vport 21 -j SNAT --to-source 192.168.10.10
and for the data connections in passive mode:

% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
    --vportctl 21 -j SNAT --to-source 192.168.10.10
Using '-m state --state RELATED' would also work.

Make sure the kernel modules ip_vs_ftp, nf_conntrack_ftp, and
nf_nat_ftp are loaded.

[ up-port and minor fixes by Simon Horman <horms@verge.net.au> ]
Signed-off-by: Hannes Eder <heder@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/ip_vs.h             |    2 -
 net/netfilter/ipvs/Kconfig      |    2 +-
 net/netfilter/ipvs/ip_vs_app.c  |   43 ----------
 net/netfilter/ipvs/ip_vs_core.c |    1 -
 net/netfilter/ipvs/ip_vs_ftp.c  |  176 ++++++++++++++++++++++++++++++++++++---
 5 files changed, 165 insertions(+), 59 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index fe82b1e..1f9e511 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -736,8 +736,6 @@ extern void ip_vs_app_inc_put(struct ip_vs_app *inc);
 
 extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
 extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
-extern int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
-			     char *o_buf, int o_len, char *n_buf, int n_len);
 extern int ip_vs_app_init(void);
 extern void ip_vs_app_cleanup(void);
 
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 3662444..be10f65 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -235,7 +235,7 @@ comment 'IPVS application helper'
 
 config	IP_VS_FTP
   	tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP
+        depends on IP_VS_PROTO_TCP && NF_NAT
 	---help---
 	  FTP is a protocol that transfers IP address and/or port number in
 	  the payload. In the virtual server via Network Address Translation,
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 1cb0e83..e76f87f 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -569,49 +569,6 @@ static const struct file_operations ip_vs_app_fops = {
 };
 #endif
 
-
-/*
- *	Replace a segment of data with a new segment
- */
-int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
-		      char *o_buf, int o_len, char *n_buf, int n_len)
-{
-	int diff;
-	int o_offset;
-	int o_left;
-
-	EnterFunction(9);
-
-	diff = n_len - o_len;
-	o_offset = o_buf - (char *)skb->data;
-	/* The length of left data after o_buf+o_len in the skb data */
-	o_left = skb->len - (o_offset + o_len);
-
-	if (diff <= 0) {
-		memmove(o_buf + n_len, o_buf + o_len, o_left);
-		memcpy(o_buf, n_buf, n_len);
-		skb_trim(skb, skb->len + diff);
-	} else if (diff <= skb_tailroom(skb)) {
-		skb_put(skb, diff);
-		memmove(o_buf + n_len, o_buf + o_len, o_left);
-		memcpy(o_buf, n_buf, n_len);
-	} else {
-		if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
-			return -ENOMEM;
-		skb_put(skb, diff);
-		memmove(skb->data + o_offset + n_len,
-			skb->data + o_offset + o_len, o_left);
-		skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
-	}
-
-	/* must update the iph total length here */
-	ip_hdr(skb)->tot_len = htons(skb->len);
-
-	LeaveFunction(9);
-	return 0;
-}
-
-
 int __init ip_vs_app_init(void)
 {
 	/* we will replace it with proc_net_ipvs_create() soon */
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 58f82df..4f8ddba 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -54,7 +54,6 @@
 
 EXPORT_SYMBOL(register_ip_vs_scheduler);
 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
-EXPORT_SYMBOL(ip_vs_skb_replace);
 EXPORT_SYMBOL(ip_vs_proto_name);
 EXPORT_SYMBOL(ip_vs_conn_new);
 EXPORT_SYMBOL(ip_vs_conn_in_get);
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 2ae747a..f228a17 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -20,6 +20,17 @@
  *
  * Author:	Wouter Gadeyne
  *
+ *
+ * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
+ * http://www.ssi.bg/~ja/nfct/:
+ *
+ * ip_vs_nfct.c:	Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2008
+ * Julian Anastasov
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -32,6 +43,9 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
 #include <linux/gfp.h>
 #include <net/protocol.h>
 #include <net/tcp.h>
@@ -43,6 +57,16 @@
 #define SERVER_STRING "227 Entering Passive Mode ("
 #define CLIENT_STRING "PORT "
 
+#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
+			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+			(T)->dst.protonum
+
+#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
+			&((C)->vaddr.ip), ntohs((C)->vport), \
+			&((C)->daddr.ip), ntohs((C)->dport), \
+			(C)->protocol, (C)->state
 
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -123,6 +147,119 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 	return 1;
 }
 
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void
+ip_vs_expect_callback(struct nf_conn *ct,
+		      struct nf_conntrack_expect *exp)
+{
+	struct nf_conntrack_tuple *orig, new_reply;
+	struct ip_vs_conn *cp;
+
+	if (exp->tuple.src.l3num != PF_INET)
+		return;
+
+	/*
+	 * We assume that no NF locks are held before this callback.
+	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+	 * expectations even if they use wildcard values, now we provide the
+	 * actual values from the newly created original conntrack direction.
+	 * The conntrack is confirmed when packet reaches IPVS hooks.
+	 */
+
+	/* RS->CLIENT */
+	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
+				&orig->src.u3, orig->src.u.tcp.port,
+				&orig->dst.u3, orig->dst.u.tcp.port);
+	if (cp) {
+		/* Change reply CLIENT->RS to CLIENT->VS */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.dst.u3 = cp->vaddr;
+		new_reply.dst.u.tcp.port = cp->vport;
+		IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+			  ", inout cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	/* CLIENT->VS */
+	cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
+			       &orig->src.u3, orig->src.u.tcp.port,
+			       &orig->dst.u3, orig->dst.u.tcp.port);
+	if (cp) {
+		/* Change reply VS->CLIENT to RS->CLIENT */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.src.u3 = cp->daddr;
+		new_reply.src.u.tcp.port = cp->dport;
+		IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+		  " - unknown expect\n",
+		  __func__, ct, ct->status, ARG_TUPLE(orig));
+	return;
+
+alter:
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+		nf_conntrack_alter_reply(ct, &new_reply);
+	ip_vs_conn_put(cp);
+	return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ */
+static void
+ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+		     struct ip_vs_conn *cp, u_int8_t proto,
+		     const __be16 *port, int from_rs)
+{
+	struct nf_conntrack_expect *exp;
+
+	BUG_ON(!ct || ct == &nf_conntrack_untracked);
+
+	exp = nf_ct_expect_alloc(ct);
+	if (!exp)
+		return;
+
+	if (from_rs)
+		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+				  nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
+				  proto, port, &cp->cport);
+	else
+		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+				  nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
+				  proto, port, &cp->vport);
+
+	exp->expectfn = ip_vs_expect_callback;
+
+	IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
+		  __func__, ct, ARG_TUPLE(&exp->tuple));
+	nf_ct_expect_related(exp);
+	nf_ct_expect_put(exp);
+}
 
 /*
  * Look at outgoing ftp packets to catch the response to a PASV command
@@ -149,7 +286,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	struct ip_vs_conn *n_cp;
 	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
 	unsigned buf_len;
-	int ret;
+	int ret = 0;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -219,19 +358,26 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 
 		buf_len = strlen(buf);
 
+		ct = nf_ct_get(skb, &ctinfo);
+		if (ct && !nf_ct_is_untracked(ct)) {
+			/* If mangling fails this function will return 0
+			 * which will cause the packet to be dropped.
+			 * Mangling can only fail under memory pressure,
+			 * hopefully it will succeed on the retransmitted
+			 * packet.
+			 */
+			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						       start-data, end-start,
+						       buf, buf_len);
+			if (ret)
+				ip_vs_expect_related(skb, ct, n_cp,
+						     IPPROTO_TCP, NULL, 0);
+		}
+
 		/*
-		 * Calculate required delta-offset to keep TCP happy
+		 * Not setting 'diff' is intentional, otherwise the sequence
+		 * would be adjusted twice.
 		 */
-		*diff = buf_len - (end-start);
-
-		if (*diff == 0) {
-			/* simply replace it with new passive address */
-			memcpy(start, buf, buf_len);
-			ret = 1;
-		} else {
-			ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
-					  end-start, buf, buf_len);
-		}
 
 		cp->app_data = NULL;
 		ip_vs_tcp_conn_listen(n_cp);
@@ -263,6 +409,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	union nf_inet_addr to;
 	__be16 port;
 	struct ip_vs_conn *n_cp;
+	struct nf_conn *ct;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -349,6 +496,11 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		ip_vs_control_add(n_cp, cp);
 	}
 
+	ct = (struct nf_conn *)skb->nfct;
+	if (ct && ct != &nf_conntrack_untracked)
+		ip_vs_expect_related(skb, ct, n_cp,
+				     IPPROTO_TCP, &n_cp->dport, 1);
+
 	/*
 	 *	Move tunnel to listen state
 	 */
-- 
1.7.1


^ permalink raw reply related

* [PATCH 15/28] netfilter: nf_nat_core: merge the same lines
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

proto->unique_tuple() is called at the end anyway if the preceding checks
fail. This patch tests the negated condition
!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) instead, to avoid duplicating the
proto->unique_tuple() call.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_core.c |    9 ++-------
 1 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c7719b2..037a3a6 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -261,14 +261,9 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	rcu_read_lock();
 	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
 
-	/* Change protocol info to have some randomization */
-	if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
-		proto->unique_tuple(tuple, range, maniptype, ct);
-		goto out;
-	}
-
 	/* Only bother mapping if it's not already in range and unique */
-	if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
+	if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
+	    (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
 	     proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
 	    !nf_nat_used_tuple(tuple, ct))
 		goto out;
-- 
1.7.1


^ permalink raw reply related

* [PATCH 19/28] netfilter: ip6tables: use skb->len for accounting
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

ipv6_hdr(skb)->payload_len is ZERO and can't be used for accounting, if
the payload is a Jumbo Payload specified in RFC2675.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/ip6_tables.c |    4 +---
 1 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index dc41d6d..33113c1 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -387,9 +387,7 @@ ip6t_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters,
-			    ntohs(ipv6_hdr(skb)->payload_len) +
-			    sizeof(struct ipv6hdr), 1);
+		ADD_COUNTER(e->counters, skb->len, 1);
 
 		t = ip6t_get_target_c(e);
 		IP_NF_ASSERT(t->u.kernel.target);
-- 
1.7.1


^ permalink raw reply related

* [PATCH 20/28] netfilter: iptables: use skb->len for accounting
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

Use skb->len for accounting as xt_quota does.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ip_tables.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b38c118..3c584a6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -364,7 +364,7 @@ ipt_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+		ADD_COUNTER(e->counters, skb->len, 1);
 
 		t = ipt_get_target(e);
 		IP_NF_ASSERT(t->u.kernel.target);
-- 
1.7.1


^ permalink raw reply related

* [PATCH 21/28] netfilter: {ip,ip6,arp}_tables: dont block bottom half more than necessary
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

We currently disable BH for the whole duration of get_counters()

On machines with a lot of cpus and large tables, this might be too long.

We can disable preemption during the whole function, and disable BH only
while fetching counters for the current cpu.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/arp_tables.c |   10 ++++++----
 net/ipv4/netfilter/ip_tables.c  |   10 ++++++----
 net/ipv6/netfilter/ip6_tables.c |   10 ++++++----
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c868dd5..6bccba3 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -710,7 +710,7 @@ static void get_counters(const struct xt_table_info *t,
 	struct arpt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu;
+	unsigned int curcpu = get_cpu();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
@@ -720,14 +720,16 @@ static void get_counters(const struct xt_table_info *t,
 	 * if new softirq were to run and call ipt_do_table
 	 */
 	local_bh_disable();
-	curcpu = smp_processor_id();
-
 	i = 0;
 	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
 		SET_COUNTER(counters[i], iter->counters.bcnt,
 			    iter->counters.pcnt);
 		++i;
 	}
+	local_bh_enable();
+	/* Processing counters from other cpus, we can let bottom half enabled,
+	 * (preemption is disabled)
+	 */
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
@@ -741,7 +743,7 @@ static void get_counters(const struct xt_table_info *t,
 		}
 		xt_info_wrunlock(cpu);
 	}
-	local_bh_enable();
+	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 3c584a6..c439721 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -884,7 +884,7 @@ get_counters(const struct xt_table_info *t,
 	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu;
+	unsigned int curcpu = get_cpu();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
@@ -894,14 +894,16 @@ get_counters(const struct xt_table_info *t,
 	 * if new softirq were to run and call ipt_do_table
 	 */
 	local_bh_disable();
-	curcpu = smp_processor_id();
-
 	i = 0;
 	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
 		SET_COUNTER(counters[i], iter->counters.bcnt,
 			    iter->counters.pcnt);
 		++i;
 	}
+	local_bh_enable();
+	/* Processing counters from other cpus, we can let bottom half enabled,
+	 * (preemption is disabled)
+	 */
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
@@ -915,7 +917,7 @@ get_counters(const struct xt_table_info *t,
 		}
 		xt_info_wrunlock(cpu);
 	}
-	local_bh_enable();
+	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 33113c1..5359ef4 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -897,7 +897,7 @@ get_counters(const struct xt_table_info *t,
 	struct ip6t_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu;
+	unsigned int curcpu = get_cpu();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
@@ -907,14 +907,16 @@ get_counters(const struct xt_table_info *t,
 	 * if new softirq were to run and call ipt_do_table
 	 */
 	local_bh_disable();
-	curcpu = smp_processor_id();
-
 	i = 0;
 	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
 		SET_COUNTER(counters[i], iter->counters.bcnt,
 			    iter->counters.pcnt);
 		++i;
 	}
+	local_bh_enable();
+	/* Processing counters from other cpus, we can let bottom half enabled,
+	 * (preemption is disabled)
+	 */
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
@@ -928,7 +930,7 @@ get_counters(const struct xt_table_info *t,
 		}
 		xt_info_wrunlock(cpu);
 	}
-	local_bh_enable();
+	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
-- 
1.7.1


^ permalink raw reply related

* [PATCH 22/28] netfilter: nf_conntrack_extend: introduce __nf_ct_ext_exist()
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

Some users of nf_ct_ext_exist() know ct->ext isn't NULL. For these users, the
check for ct->ext isn't necessary, so the function __nf_ct_ext_exist() can be
used instead.

The type of the return value of nf_ct_ext_exist() is changed to bool.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netfilter/nf_conntrack_extend.h |    9 +++++++--
 net/netfilter/nf_conntrack_extend.c         |   22 ++++++++++++----------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 32d15bd..0772d29 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -28,9 +28,14 @@ struct nf_ct_ext {
 	char data[0];
 };
 
-static inline int nf_ct_ext_exist(const struct nf_conn *ct, u8 id)
+static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id)
 {
-	return (ct->ext && ct->ext->offset[id]);
+	return !!ext->offset[id];
+}
+
+static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id)
+{
+	return (ct->ext && __nf_ct_ext_exist(ct->ext, id));
 }
 
 static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id)
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index fdc8fb4..7dcf7a4 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -23,9 +23,10 @@ void __nf_ct_ext_destroy(struct nf_conn *ct)
 {
 	unsigned int i;
 	struct nf_ct_ext_type *t;
+	struct nf_ct_ext *ext = ct->ext;
 
 	for (i = 0; i < NF_CT_EXT_NUM; i++) {
-		if (!nf_ct_ext_exist(ct, i))
+		if (!__nf_ct_ext_exist(ext, i))
 			continue;
 
 		rcu_read_lock();
@@ -73,44 +74,45 @@ static void __nf_ct_ext_free_rcu(struct rcu_head *head)
 
 void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
 {
-	struct nf_ct_ext *new;
+	struct nf_ct_ext *old, *new;
 	int i, newlen, newoff;
 	struct nf_ct_ext_type *t;
 
 	/* Conntrack must not be confirmed to avoid races on reallocation. */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 
-	if (!ct->ext)
+	old = ct->ext;
+	if (!old)
 		return nf_ct_ext_create(&ct->ext, id, gfp);
 
-	if (nf_ct_ext_exist(ct, id))
+	if (__nf_ct_ext_exist(old, id))
 		return NULL;
 
 	rcu_read_lock();
 	t = rcu_dereference(nf_ct_ext_types[id]);
 	BUG_ON(t == NULL);
 
-	newoff = ALIGN(ct->ext->len, t->align);
+	newoff = ALIGN(old->len, t->align);
 	newlen = newoff + t->len;
 	rcu_read_unlock();
 
-	new = __krealloc(ct->ext, newlen, gfp);
+	new = __krealloc(old, newlen, gfp);
 	if (!new)
 		return NULL;
 
-	if (new != ct->ext) {
+	if (new != old) {
 		for (i = 0; i < NF_CT_EXT_NUM; i++) {
-			if (!nf_ct_ext_exist(ct, i))
+			if (!__nf_ct_ext_exist(old, i))
 				continue;
 
 			rcu_read_lock();
 			t = rcu_dereference(nf_ct_ext_types[i]);
 			if (t && t->move)
 				t->move((void *)new + new->offset[i],
-					(void *)ct->ext + ct->ext->offset[i]);
+					(void *)old + old->offset[i]);
 			rcu_read_unlock();
 		}
-		call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu);
+		call_rcu(&old->rcu, __nf_ct_ext_free_rcu);
 		ct->ext = new;
 	}
 
-- 
1.7.1


^ permalink raw reply related

* [PATCH 23/28] ipvs: remove EXPERIMENTAL tag
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Simon Horman <horms@verge.net.au>

IPVS was merged into the kernel quite a long time ago and
has been seeing wide-spread production use for even longer.

It seems appropriate for it to be no longer tagged as EXPERIMENTAL

Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/Kconfig |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index be10f65..46a77d5 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -26,7 +26,7 @@ if IP_VS
 
 config	IP_VS_IPV6
 	bool "IPv6 support for IPVS"
-	depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
+	depends on IPV6 = y || IP_VS = IPV6
 	---help---
 	  Add IPv6 support to IPVS. This is incomplete and might be dangerous.
 
-- 
1.7.1


^ permalink raw reply related

* [PATCH 26/28] netfilter: nf_nat: make unique_tuple return void
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

The only user of unique_tuple(), get_unique_tuple(), doesn't care about the
return value of unique_tuple(), so make unique_tuple() return void (nothing).

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netfilter/nf_nat_protocol.h   |    8 ++++----
 net/ipv4/netfilter/nf_nat_proto_common.c  |    8 ++++----
 net/ipv4/netfilter/nf_nat_proto_dccp.c    |    6 +++---
 net/ipv4/netfilter/nf_nat_proto_gre.c     |    8 ++++----
 net/ipv4/netfilter/nf_nat_proto_icmp.c    |    6 +++---
 net/ipv4/netfilter/nf_nat_proto_sctp.c    |    6 +++---
 net/ipv4/netfilter/nf_nat_proto_tcp.c     |    5 ++---
 net/ipv4/netfilter/nf_nat_proto_udp.c     |    5 ++---
 net/ipv4/netfilter/nf_nat_proto_udplite.c |    6 +++---
 net/ipv4/netfilter/nf_nat_proto_unknown.c |    4 ++--
 10 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/include/net/netfilter/nf_nat_protocol.h b/include/net/netfilter/nf_nat_protocol.h
index c398017..df17bac 100644
--- a/include/net/netfilter/nf_nat_protocol.h
+++ b/include/net/netfilter/nf_nat_protocol.h
@@ -27,9 +27,9 @@ struct nf_nat_protocol {
 
 	/* Alter the per-proto part of the tuple (depending on
 	   maniptype), to give a unique tuple in the given range if
-	   possible; return false if not.  Per-protocol part of tuple
-	   is initialized to the incoming packet. */
-	bool (*unique_tuple)(struct nf_conntrack_tuple *tuple,
+	   possible.  Per-protocol part of tuple is initialized to the
+	   incoming packet. */
+	void (*unique_tuple)(struct nf_conntrack_tuple *tuple,
 			     const struct nf_nat_range *range,
 			     enum nf_nat_manip_type maniptype,
 			     const struct nf_conn *ct);
@@ -63,7 +63,7 @@ extern bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
 				  const union nf_conntrack_man_proto *min,
 				  const union nf_conntrack_man_proto *max);
 
-extern bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+extern void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 				      const struct nf_nat_range *range,
 				      enum nf_nat_manip_type maniptype,
 				      const struct nf_conn *ct,
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index 6c4f11f..2844a03 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -34,7 +34,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
 }
 EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
 
-bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 			       const struct nf_nat_range *range,
 			       enum nf_nat_manip_type maniptype,
 			       const struct nf_conn *ct,
@@ -53,7 +53,7 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
 		/* If it's dst rewrite, can't change port */
 		if (maniptype == IP_NAT_MANIP_DST)
-			return false;
+			return;
 
 		if (ntohs(*portptr) < 1024) {
 			/* Loose convention: >> 512 is credential passing */
@@ -87,9 +87,9 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 			continue;
 		if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
 			*rover = off;
-		return true;
+		return;
 	}
-	return false;
+	return;
 }
 EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
 
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
index 22485ce..570faf2 100644
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -22,14 +22,14 @@
 
 static u_int16_t dccp_port_rover;
 
-static bool
+static void
 dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &dccp_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &dccp_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index d7e8920..89933ab 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -37,7 +37,7 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
 MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
 
 /* generate unique tuple ... */
-static bool
+static void
 gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_nat_range *range,
 		 enum nf_nat_manip_type maniptype,
@@ -50,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 	/* If there is no master conntrack we are not PPTP,
 	   do not change tuples */
 	if (!ct->master)
-		return false;
+		return;
 
 	if (maniptype == IP_NAT_MANIP_SRC)
 		keyptr = &tuple->src.u.gre.key;
@@ -71,11 +71,11 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 	for (i = 0; i < range_size; i++, key++) {
 		*keyptr = htons(min + key % range_size);
 		if (!nf_nat_used_tuple(tuple, ct))
-			return true;
+			return;
 	}
 
 	pr_debug("%p: no NAT mapping\n", ct);
-	return false;
+	return;
 }
 
 /* manipulate a GRE packet according to maniptype */
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 19a8b0b..97003fe 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
 	       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
 }
 
-static bool
+static void
 icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype,
@@ -46,9 +46,9 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
 					     (id % range_size));
 		if (!nf_nat_used_tuple(tuple, ct))
-			return true;
+			return;
 	}
-	return false;
+	return;
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 3fc598e..756331d 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -16,14 +16,14 @@
 
 static u_int16_t nf_sctp_port_rover;
 
-static bool
+static void
 sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &nf_sctp_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &nf_sctp_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 399e2cf..aa460a5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -20,14 +20,13 @@
 
 static u_int16_t tcp_port_rover;
 
-static bool
+static void
 tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_nat_range *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &tcp_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index 9e61c79..dfe65c7 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -19,14 +19,13 @@
 
 static u_int16_t udp_port_rover;
 
-static bool
+static void
 udp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_nat_range *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &udp_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
index 440a229..3cc8c8a 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -18,14 +18,14 @@
 
 static u_int16_t udplite_port_rover;
 
-static bool
+static void
 udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
 		     const struct nf_nat_range *range,
 		     enum nf_nat_manip_type maniptype,
 		     const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &udplite_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &udplite_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
index 14381c6..a50f2bc 100644
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -26,14 +26,14 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
 	return true;
 }
 
-static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
+static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
 				 const struct nf_nat_range *range,
 				 enum nf_nat_manip_type maniptype,
 				 const struct nf_conn *ct)
 {
 	/* Sorry: we can't help you; if it's not unique, we can't frob
 	   anything. */
-	return false;
+	return;
 }
 
 static bool
-- 
1.7.1


^ permalink raw reply related

* [PATCH 27/28] netfilter: nf_nat: don't check if the tuple is unique when there isn't any other choice
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

The tuple got from unique_tuple() doesn't need to be really unique, so the
check for the unique tuple isn't necessary, when there isn't any other
choice. Eliminating the unnecessary nf_nat_used_tuple() can save some CPU
cycles too.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_proto_common.c |    4 ++--
 net/ipv4/netfilter/nf_nat_proto_gre.c    |    4 ++--
 net/ipv4/netfilter/nf_nat_proto_icmp.c   |    4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index 2844a03..3e61faf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -81,9 +81,9 @@ void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 	else
 		off = *rover;
 
-	for (i = 0; i < range_size; i++, off++) {
+	for (i = 0; ; ++off) {
 		*portptr = htons(min + off % range_size);
-		if (nf_nat_used_tuple(tuple, ct))
+		if (++i != range_size && nf_nat_used_tuple(tuple, ct))
 			continue;
 		if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
 			*rover = off;
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 89933ab..bc8d83a 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -68,9 +68,9 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 
 	pr_debug("min = %u, range_size = %u\n", min, range_size);
 
-	for (i = 0; i < range_size; i++, key++) {
+	for (i = 0; ; ++key) {
 		*keyptr = htons(min + key % range_size);
-		if (!nf_nat_used_tuple(tuple, ct))
+		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
 			return;
 	}
 
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 97003fe..5744c3e 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -42,10 +42,10 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
 		range_size = 0xFFFF;
 
-	for (i = 0; i < range_size; i++, id++) {
+	for (i = 0; ; ++id) {
 		tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
 					     (id % range_size));
-		if (!nf_nat_used_tuple(tuple, ct))
+		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
 			return;
 	}
 	return;
-- 
1.7.1


^ permalink raw reply related

* [PATCH 02/28] netfilter: ipt_REJECT: postpone the checksum calculation.
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Changli Gao <xiaosuo@gmail.com>

Postpone the checksum calculation so that, if the output NIC supports checksum
offloading, we can utilize it. And even when the output NIC doesn't support
checksum offloading, if we mangle this packet we don't have to update the
checksum, as the checksum calculation occurs later.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_REJECT.c |   10 +++++-----
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index f5f4a88..3d0e064 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -95,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	}
 
 	tcph->rst	= 1;
-	tcph->check	= tcp_v4_check(sizeof(struct tcphdr),
-				       niph->saddr, niph->daddr,
-				       csum_partial(tcph,
-						    sizeof(struct tcphdr), 0));
+	tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+				    niph->daddr, 0);
+	nskb->ip_summed = CHECKSUM_PARTIAL;
+	nskb->csum_start = (unsigned char *)tcph - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
 
 	addr_type = RTN_UNSPEC;
 	if (hook != NF_INET_FORWARD
@@ -115,7 +116,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 		goto free_nskb;
 
 	niph->ttl	= dst_metric(skb_dst(nskb), RTAX_HOPLIMIT);
-	nskb->ip_summed = CHECKSUM_NONE;
 
 	/* "Never happens" */
 	if (nskb->len > dst_mtu(skb_dst(nskb)))
-- 
1.7.1


^ permalink raw reply related

* [PATCH 03/28] netfilter: ipt_REJECT: avoid touching dst ref
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

We can avoid a pair of atomic ops in ipt_REJECT send_reset()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_REJECT.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 3d0e064..b254daf 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -110,7 +110,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 		addr_type = RTN_LOCAL;
 
 	/* ip_route_me_harder expects skb->dst to be set */
-	skb_dst_set(nskb, dst_clone(skb_dst(oldskb)));
+	skb_dst_set_noref(nskb, skb_dst(oldskb));
 
 	if (ip_route_me_harder(nskb, addr_type))
 		goto free_nskb;
-- 
1.7.1


^ permalink raw reply related

* [PATCH 00/28] netfilter: netfilter update
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev

The following is a final netfilter update for net-next, containing:

- fast path reassembly optimization for in-order fragments, from Changli Gao

- ipt_REJECT checksum offloading, from Changli Gao

- use of skb_dst_set_noref in ipt_REJECT, from Eric

- IPVS protocol handler invocation fixes, from Xiaoyu Du

- TCP connection tracking improvements for state synchronization, from Pablo

- a new 'CHECKSUM' target for working around broken virtualized applications,
  from Michael S. Tsirkin

- patches to make IPVS work with netfilter SNAT, from Hannes Eder

- a new 'cpu' match for distributing connections in a cache friendly manner,
  from Eric

- a fix for the quota match to dump the initial quota instead of the remaining
  one to userspace, from Changli Gao

- patches to use skb->len for accounting instead of the values contained in
  the header in order to fix jumbo frame handling, from Changli Gao

- reduction of the time *tables runs with BHs disabled when summing up the
  counters, from Eric

- IPVS code consolidation, from Simon

Please pull from:

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git master

Thanks!


^ permalink raw reply

* [PATCH 07/28] nfnetlink_log: do not expose NFULNL_COPY_DISABLED to user-space
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch moves the NFULNL_COPY_DISABLED definition from
linux/netfilter/nfnetlink_log.h to net/netfilter/nfnetlink_log.h
since this copy mode is only for internal use.

I have also changed the value from 0x03 to 0xff. Thus, we avoid
a gap from user-space that may confuse users if we add new
copy modes in the future.

This change was introduced in:
http://www.spinics.net/lists/netfilter-devel/msg13535.html

Since this change is not included in any stable Linux kernel,
I think it's safe to make this change now. Anyway, this copy
mode does not make any sense from user-space, so this patch
should not break any existing setup.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nfnetlink_log.h |    2 +-
 include/net/netfilter/nfnetlink_log.h   |    2 ++
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h
index 1d0b84a..ea9b8d3 100644
--- a/include/linux/netfilter/nfnetlink_log.h
+++ b/include/linux/netfilter/nfnetlink_log.h
@@ -89,7 +89,7 @@ enum nfulnl_attr_config {
 #define NFULNL_COPY_NONE	0x00
 #define NFULNL_COPY_META	0x01
 #define NFULNL_COPY_PACKET	0x02
-#define NFULNL_COPY_DISABLED	0x03
+/* 0xff is reserved, don't use it for new copy modes. */
 
 #define NFULNL_CFG_F_SEQ	0x0001
 #define NFULNL_CFG_F_SEQ_GLOBAL	0x0002
diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h
index b0569ff..e2dec42 100644
--- a/include/net/netfilter/nfnetlink_log.h
+++ b/include/net/netfilter/nfnetlink_log.h
@@ -10,5 +10,7 @@ nfulnl_log_packet(u_int8_t pf,
 		  const struct nf_loginfo *li_user,
 		  const char *prefix);
 
+#define NFULNL_COPY_DISABLED    0xff
+
 #endif /* _KER_NFNETLINK_LOG_H */
 
-- 
1.7.1


^ permalink raw reply related

* [PATCH 05/28] ipvs: lvs sctp protocol handler is incorrectly invoked ip_vs_app_pkt_out
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Xiaoyu Du <tingsrain@gmail.com>

The lvs sctp protocol handler incorrectly invokes ip_vs_app_pkt_out
in sctp_dnat_handler. Since there are no sctp helpers at present, it
does the same thing as ip_vs_app_pkt_in.

Signed-off-by: Xiaoyu Du <tingsrain@gmail.com>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/ip_vs_proto_sctp.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index c9a3f7a..db55759 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -173,7 +173,7 @@ sctp_dnat_handler(struct sk_buff *skb,
 			return 0;
 
 		/* Call application helper if needed */
-		if (!ip_vs_app_pkt_out(cp, skb))
+		if (!ip_vs_app_pkt_in(cp, skb))
 			return 0;
 	}
 
-- 
1.7.1


^ permalink raw reply related

* [PATCH 09/28] netfilter: add CHECKSUM target
From: kaber @ 2010-08-02 19:57 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1280779065-9333-1-git-send-email-kaber@trash.net>

From: Michael S. Tsirkin <mst@redhat.com>

This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.

You can use this target to compute and fill in the checksum in
a packet that lacks a checksum.  This is particularly useful
if you need to work around old applications, such as dhcp clients,
that do not work well with checksum offloads, but you don't want to
disable checksum offload in your device.

The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html

Typical expected use (helps old dhclient binary running in a VM):
iptables -A POSTROUTING -t mangle -p udp --dport bootpc \
	-j CHECKSUM --checksum-fill

Includes fixes by Jan Engelhardt <jengelh@medozas.de>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/xt_CHECKSUM.h |   18 ++++++++
 net/netfilter/Kconfig                 |   16 +++++++
 net/netfilter/Makefile                |    1 +
 net/netfilter/xt_CHECKSUM.c           |   70 +++++++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/netfilter/xt_CHECKSUM.h
 create mode 100644 net/netfilter/xt_CHECKSUM.c

diff --git a/include/linux/netfilter/xt_CHECKSUM.h b/include/linux/netfilter/xt_CHECKSUM.h
new file mode 100644
index 0000000..3b4fb77
--- /dev/null
+++ b/include/linux/netfilter/xt_CHECKSUM.h
@@ -0,0 +1,18 @@
+/* Header file for iptables ipt_CHECKSUM target
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 Red Hat Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This software is distributed under GNU GPL v2, 1991
+*/
+#ifndef _IPT_CHECKSUM_TARGET_H
+#define _IPT_CHECKSUM_TARGET_H
+
+#define XT_CHECKSUM_OP_FILL	0x01	/* fill in checksum in IP header */
+
+struct xt_CHECKSUM_info {
+	__u8 operation;	/* bitset of operations */
+};
+
+#endif /* _IPT_CHECKSUM_TARGET_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index aa2f106..5fb8efa 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -326,6 +326,22 @@ config NETFILTER_XT_CONNMARK
 
 comment "Xtables targets"
 
+config NETFILTER_XT_TARGET_CHECKSUM
+	tristate "CHECKSUM target support"
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+	  table.
+
+	  You can use this target to compute and fill in the checksum in
+	  a packet that lacks a checksum.  This is particularly useful,
+	  if you need to work around old applications such as dhcp clients,
+	  that do not work well with checksum offloads, but don't want to disable
+	  checksum offload in your device.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_CLASSIFY
 	tristate '"CLASSIFY" target support'
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index e28420a..36ef8e6 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
 
 # targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
new file mode 100644
index 0000000..0f642ef
--- /dev/null
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -0,0 +1,70 @@
+/* iptables module for the packet checksum mangling
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * (C) 2010 Red Hat, Inc.
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
+MODULE_DESCRIPTION("Xtables: checksum modification");
+MODULE_ALIAS("ipt_CHECKSUM");
+MODULE_ALIAS("ip6t_CHECKSUM");
+
+static unsigned int
+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		skb_checksum_help(skb);
+
+	return XT_CONTINUE;
+}
+
+static int checksum_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_CHECKSUM_info *einfo = par->targinfo;
+
+	if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
+		pr_info("unsupported CHECKSUM operation %x\n", einfo->operation);
+		return -EINVAL;
+	}
+	if (!einfo->operation) {
+		pr_info("no CHECKSUM operation enabled\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_target checksum_tg_reg __read_mostly = {
+	.name		= "CHECKSUM",
+	.family		= NFPROTO_UNSPEC,
+	.target		= checksum_tg,
+	.targetsize	= sizeof(struct xt_CHECKSUM_info),
+	.table		= "mangle",
+	.checkentry	= checksum_tg_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init checksum_tg_init(void)
+{
+	return xt_register_target(&checksum_tg_reg);
+}
+
+static void __exit checksum_tg_exit(void)
+{
+	xt_unregister_target(&checksum_tg_reg);
+}
+
+module_init(checksum_tg_init);
+module_exit(checksum_tg_exit);
-- 
1.7.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox